def run():
    """Compute bag-of-words and word2vec features for every article whose
    text extraction succeeded, and store them back into the articles table."""
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    article_cursor = conn.cursor()
    update_cursor = conn.cursor()
    # Fetch only the count first so progress can be reported accurately.
    article_cursor.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    article_count, = article_cursor.fetchone()
    # Stream rows through the cursor to avoid loading all articles into memory.
    article_cursor.execute(
        "SELECT id, tokens FROM articles WHERE text_extraction_status='Success'")
    crawling_progress = StatusVisualization(article_count, update_every=1000)
    with Pool(8, initializer=init_worker) as pool:
        results = pool.imap_unordered(extract_features, article_cursor)
        for status, article_id, compressed_bow, compressed_w2v in results:
            if status == 'Success':
                update_cursor.execute(
                    "UPDATE articles SET bow_2048=%s, w2v_2048=%s, feature_extraction_status='Success' WHERE id=%s",
                    [compressed_bow, compressed_w2v, article_id])
            else:
                # Record the failure reason instead of the features.
                update_cursor.execute(
                    "UPDATE articles SET feature_extraction_status=%s WHERE id=%s",
                    [status, article_id])
            crawling_progress.inc()
    conn.commit()
def run(db, user):
    """Download every video whose crawling_status is 'Not Crawled' and record
    the resulting status per video.

    We use a Pool (of workers; the task is I/O-bound anyway) to download in
    parallel.

    Fix: the pool is now managed by a context manager, so it is cleaned up
    even when we bail out early via sys.exit(1) on a Twitter rate limit —
    previously the pool was only closed/joined on the normal exit path.
    """
    conn = psycopg2.connect(database=db, user=user)
    c = conn.cursor()
    c.execute(
        """SELECT id, platform FROM videos WHERE crawling_status='Not Crawled'"""
    )
    videos = c.fetchall()
    # Shuffle so repeated runs don't always hit the same hosts first.
    shuffle(videos)
    print(len(videos))
    crawling_progress = StatusVisualization(len(videos), update_every=100)
    with Pool(16) as pool:
        for video in pool.imap_unordered(download_video, videos):
            if video["crawling_status"] == "Player Config: 429":
                print("Twitter rate limit hit. Try again in 15 minutes")
                sys.exit(1)
            # dict_mogrifying_string builds the SET clause with %(key)s
            # placeholders; the actual values are bound by execute().
            query = (
                "UPDATE videos SET %s" % postgres_helper.dict_mogrifying_string(video)) + " WHERE id=%(id)s"
            c.execute(query, video)
            conn.commit()
            crawling_progress.inc(by=1)
        pool.close()
        pool.join()
def run(db, user):
    """Crawl every article not yet crawled successfully, update its status,
    and persist any videos discovered in the article."""
    conn = psycopg2.connect(database=db, user=user)
    c = conn.cursor()
    # Only crawl articles that have not yet been crawled
    c.execute("SELECT source_url, source_name FROM articles WHERE crawling_status<>'Success'")
    articles = c.fetchall()
    crawling_progress = StatusVisualization(len(articles), update_every=10000)
    # parallel crawling and parsing to speed things up
    with Pool(32) as pool:  # 16 seems to be around optimum
        results = pool.imap_unordered(crawl_article, enumerate(articles), chunksize=100)
        for index, status, videos in results:
            source_url = articles[index][0]
            source_name = articles[index][1]
            # Update article crawling status
            c.execute("UPDATE articles SET crawling_status=%s WHERE source_url=%s",
                      [status, source_url])
            if status == 'Success':
                # Successfully crawled: bump the per-source article counter,
                # creating the sources row on first sight.
                c.execute(
                    "INSERT INTO sources (source_name) VALUES (%s) ON CONFLICT (source_name) DO UPDATE SET article_count = sources.article_count + 1",
                    [source_name])
                # Save all the found videos to the database
                for platform, video_id in videos:
                    # videos table ends up containing every distinct video
                    c.execute("INSERT INTO videos (platform, id) VALUES (%s,%s) ON CONFLICT DO NOTHING",
                              [platform, video_id])
                    c.execute(
                        """INSERT INTO article_videos (source_url, source_name, platform, video_id) VALUES (%s, %s, %s, %s)""",
                        [source_url, source_name, platform, video_id])
            conn.commit()
            crawling_progress.inc(1)
def run():
    """Compute I3D RGB embeddings for every video that already has ResNet
    features but no I3D features yet."""
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    video_cursor = conn.cursor()
    update_cursor = conn.cursor()
    video_cursor.execute(
        "SELECT id, platform FROM videos WHERE i3d_rgb_status<>'Success' AND resnet_status='Success'")
    videos = video_cursor.fetchall()
    crawling_progress = StatusVisualization(len(videos), update_every=100)
    # 4 works best. Too many and each worker doesn't have the GPU memory it needs
    with Pool(4, initializer=init_worker) as pool:
        results = pool.imap_unordered(process, videos, chunksize=10)
        for status, video_id, platform, compressed_feature in results:
            if status == 'Success':
                # Insert embedding and update the classification status
                update_cursor.execute(
                    "UPDATE videos SET i3d_rgb_status = 'Success', i3d_rgb_1024 = %s WHERE id=%s AND platform=%s",
                    [compressed_feature, video_id, platform])
            else:
                # Failure: store the reason so the video isn't retried blindly.
                update_cursor.execute(
                    "UPDATE videos SET i3d_rgb_status = %s WHERE id=%s AND platform=%s",
                    [status, video_id, platform])
            conn.commit()
            crawling_progress.inc()
def run():
    """(Re)create the topics table and classify every extracted article,
    storing its topic weights.

    Security fix: the per-article INSERT previously interpolated the
    source_url and the float weights directly into the SQL string, so any
    quote character in a URL broke the statement (and allowed injection).
    Values are now passed as query parameters; only column names — built
    from trusted integer topic ids — are formatted into the query text.
    """
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    c = conn.cursor()
    # Reset the topics table
    c.execute("DROP TABLE IF EXISTS topics")
    num_topics = 10
    query = "CREATE TABLE topics (source_url TEXT PRIMARY KEY, "
    query += ",".join("topic_%d FLOAT DEFAULT 0" % index for index in range(num_topics))
    query += ")"
    c.execute(query)
    conn.commit()
    c.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    article_count, = c.fetchone()
    crawling_progress = StatusVisualization(article_count, update_every=1000)
    articles_cursor = conn.cursor()
    articles_cursor.execute(
        "SELECT source_url, text FROM articles WHERE text_extraction_status='Success'")
    # parallel classification
    with Pool(8) as pool:  # 16 seems to be around optimum
        for source_url, topics in pool.imap_unordered(classify, articles_cursor, chunksize=100):
            # topics is a sequence of (topic_id, weight) pairs.
            columns = ",".join("topic_%d" % topic_id for topic_id, _ in topics)
            placeholders = ",".join(["%s"] * (len(topics) + 1))
            query = "INSERT INTO topics (source_url, %s) VALUES (%s)" % (columns, placeholders)
            c.execute(query, [source_url] + [weight for _, weight in topics])
            conn.commit()
            crawling_progress.inc(1)
def run():
    """Aggregate per-platform video statistics for every source and write
    them onto the sources table.

    Security fix: the source name is now bound as a query parameter instead
    of being quoted into the UPDATE string — names containing a quote broke
    the statement before. The SET clause still comes from
    postgres_helper.dict_set_string over our own numeric features.
    """
    conn = psycopg2.connect(database="gdelt_social_video", user="******")
    c = conn.cursor()
    # NOTE(review): result unused — presumably a sanity check that the
    # sources table exists before doing any work; confirm before removing.
    c.execute("SELECT * FROM sources LIMIT 1")
    # We're only interested in hosts that had any video etc. in them
    c.execute('SELECT source_name FROM sources')
    sources = c.fetchall()
    progress = StatusVisualization(total_count=len(sources), update_every=1000)
    for source in sources:
        source = source[0]
        features = dict()
        # article_count is already computed in when the db is populated
        for platform in ["twitter", "youtube", "facebook"]:
            # Get all videos from that source, of that platform:
            c.execute('SELECT video_url FROM article_videos WHERE source_name=%s AND platform=%s',
                      [source, platform])
            videos = c.fetchall()
            # Get the count of each article
            c.execute('SELECT Count(1) FROM article_videos WHERE source_name=%s AND platform=%s GROUP BY source_url',
                      [source, platform])
            video_counts = c.fetchall()
            # -1 marks "no articles with videos" (std of an empty set is undefined).
            features[platform + "_std_dev"] = np.std(video_counts) if len(video_counts) > 0 else -1
            features[platform + "_count"] = len(video_counts)
            features[platform + "_sum"] = len(videos)
            features[platform + "_sum_distinct"] = len(set(videos))
        # %% escapes the placeholder so it survives the outer % formatting.
        query = "UPDATE sources SET %s WHERE source_name=%%s" % postgres_helper.dict_set_string(features)
        c.execute(query, [source])
        conn.commit()
        progress.inc()
def run(year, month):
    """Download all GDELT master-file entries of the interesting collections
    for the given year/month into the local data directories.

    Bug fix: the early-exit check used month + 1 without wrapping, so for
    month=12 it looked for files starting with e.g. "201813" — which never
    occur — and the loop scanned the entire master file instead of stopping
    at January of the following year.
    """
    # Make sure the data directories for the interesting collections exist.
    for collection in INTERESTING_COLLECTIONS:
        path = "%s/external/%s/" % (os.environ["DATA_PATH"], collection)
        if not os.path.exists(path):
            os.makedirs(path)
    current_prefix = "%d%02d" % (year, month)
    # First file of the following month, wrapping December into January.
    next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)
    next_prefix = "%d%02d" % (next_year, next_month)
    with open(os.environ["DATA_PATH"] + "/external/masterfilelist.txt") as master_file_list:
        urls = list()
        malformed_lines = 0
        relevant_lines = 0
        for line in master_file_list:
            # Example line: 134072 f1c7a45aa0292b0aee2bc5b674841096 http://data.gdeltproject.org/gdeltv2/20180731191500.export.CSV.zip
            # But some files are missing, then the master file just contains http://data.gdeltproject.org/gdeltv2/
            try:
                url = line.rstrip("\n").split(" ")[2]
                # Casing is inconsistent in the data source, we don't want that
                file_name = url.split("/")[-1].lower()
                # Correct time?
                if file_name.startswith(current_prefix):
                    relevant_lines += 1
                    # One of the collections we're interested in?
                    collection = file_name.split(".")[-3]
                    if collection in INTERESTING_COLLECTIONS:
                        file_path = "%s/external/%s/%s" % (
                            os.environ["DATA_PATH"], collection, file_name)
                        urls.append((url, file_path))
                elif file_name.startswith(next_prefix):
                    # We're done. (the dates are in order in the master file)
                    break
            except Exception:
                # Some lines just contain http://data.gdeltproject.org/gdeltv2/
                malformed_lines += 1
        print("\nDone reading the master file.\n")
        print("%d relevant files,\n %d malformed,\n %d already downloaded,\n %d now downloading...\n" \
              % (relevant_lines, malformed_lines, relevant_lines - len(urls), len(urls)))
        crawling_progress = StatusVisualization(len(urls), update_every=50)
        with Pool(16) as pool:
            for _ in pool.imap_unordered(retrieve_and_save, urls):
                crawling_progress.inc()
def run():
    """Tokenize every successfully extracted article in parallel and store
    the token string back on the articles table."""
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    article_cursor = conn.cursor()
    update_cursor = conn.cursor()
    # Get the count.
    article_cursor.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    (article_count,) = article_cursor.fetchone()
    # avoid loading all articles into memory.
    article_cursor.execute(
        "SELECT id, text FROM articles WHERE text_extraction_status='Success'")
    # Parallel tokenization, since it takes by far the most time
    crawling_progress = StatusVisualization(article_count, update_every=1000)
    with Pool(8) as pool:
        tokenized = pool.imap_unordered(tokenize_parallel, article_cursor, chunksize=100)
        for article_id, tokens_string in tokenized:
            update_cursor.execute("UPDATE articles SET tokens=%s WHERE id=%s",
                                  [tokens_string, article_id])
            crawling_progress.inc()
    conn.commit()
def run():
    """Run YOLOv3 object detection on roughly one frame per second of every
    not-yet-analyzed facebook video and store the detections.

    Bug fix: the cv2.VideoCapture was never released, leaking a native file
    handle / decoder state for every processed video; it is now released in
    a finally block.
    """
    MODEL = "yolov3"  # Postfix -tiny
    net, meta = darknet_wrapper.initialize_classifier(
        config="cfg/%s.cfg" % MODEL,
        weights="weights/%s.weights" % MODEL,
        data="cfg/coco.data")
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    c = conn.cursor()
    # Just classifying facebook videos for now
    c.execute(
        "SELECT id, platform FROM videos WHERE object_detection_yolo_status<>'Success' AND platform = 'facebook'")
    videos = c.fetchall()
    print("%d videos left to analyze" % len(videos))
    crawling_progress = StatusVisualization(len(videos), update_every=10)
    for video_id, platform in videos:
        # Extract every 30th frame (~one per second assuming 30 fps) to temp files.
        images = []
        cap = cv2.VideoCapture(video_helper.get_path(platform, video_id))
        try:
            count = 0
            while True:
                success, image = cap.read()
                if not success:
                    # Reached the end of the video
                    break
                if count % 30 == 0:
                    path = tempfile.gettempdir() + "/%05d.jpg" % count
                    cv2.imwrite(path, image)
                    images.append(path)
                count += 1
        finally:
            cap.release()
        for index, image in enumerate(images):
            try:
                result = darknet_wrapper.detect(net, meta, image)
                for entity in result:
                    # format is (class, probability, (x, y, width, height)) ANCHORED IN THE CENTER!
                    (label, probability, (x, y, width, height)) = entity
                    # x, y, height and width are not saved for now.
                    c.execute(
                        "INSERT INTO object_detection_yolo(id,platform,second,class,probability) VALUES (%s,%s,%s,%s,%s)",
                        [video_id, platform, index, str(label, "utf-8"), probability])
                    conn.commit()
            except Exception as e:
                # Best-effort: a failure on one frame shouldn't abort the video.
                print(e)
        # Update the classification status
        c.execute(
            "UPDATE videos SET object_detection_yolo_status = 'Success' WHERE id=%s AND platform=%s",
            [video_id, platform])
        conn.commit()
        crawling_progress.inc()
def tokenize_parallel(article):
    # article is a (source_url, text) row from the DB cursor; returns the
    # url together with the result of tokenize(text).
    source_url, text = article
    return source_url, tokenize(text)


# Parallel tokenization, since it takes by far the most time
# NOTE(review): `article_count`, the cursor `c`, and `start` are defined
# earlier in the file, outside this fragment — confirm against the full file.
crawling_progress = StatusVisualization(article_count, update_every=1000)
articles = list()
token_count = 0
with Pool(8) as pool:
    for source_url, tokens in pool.imap_unordered(tokenize_parallel, c, chunksize=100):
        articles.append((source_url, tokens))
        token_count += len(tokens)
        crawling_progress.inc()
# articles = [(source_url, tokenize.tokenize(text)) for (source_url, text) in c]
# The rest is not parallelized.
print("Extracting %d tokens took %.2f seconds" % (token_count, time.time() - start))
start = time.time()
# Count how often each distinct token occurs across all articles.
token_frequency = defaultdict(int)
for source_url, tokens in articles:
    for token in tokens:
        token_frequency[token] += 1
print("Counting frequencies of %d distinct tokens took %.2f seconds" % (len(token_frequency), time.time() - start))
start = time.time()
import os

import psycopg2

from src import util
from src.data.articles import article as article_helper
from src.data.articles.boilerpipe import BoilerPipeArticleExtractor
from src.visualization.console import StatusVisualization

# Bug fix: `os` is used below (os.environ, os.path.join) but was never
# imported, so this module raised NameError on import.
articles_base_path = os.environ["DATA_PATH"] + "/raw/articles/"

if __name__ == "__main__":
    # Extract plain text from every article whose extraction hasn't been
    # attempted yet and store the outcome on the articles table.
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    c = conn.cursor()
    c.execute("SELECT source_url FROM articles WHERE text_extraction_status = 'Not Tried'")
    extractor = BoilerPipeArticleExtractor()
    article_urls = list(c)
    crawling_progress = StatusVisualization(len(article_urls), update_every=100)
    for source_url, in article_urls:
        article_path, article_file = article_helper.get_article_html_filepath(source_url)
        html = util.load_gzip_text(os.path.join(article_path, article_file))
        try:
            text = extractor.get_text(html)
            # Save it to the DB
            c.execute("UPDATE articles SET text=%s, text_extraction_status=%s WHERE source_url=%s",
                      [text, "Success", source_url])
            conn.commit()
        except Exception as e:
            # Store the exception type as the extraction status.
            c.execute("UPDATE articles SET text_extraction_status=%s WHERE source_url=%s",
                      [type(e).__name__, source_url])
        crawling_progress.inc(by=1)
    # Bug fix: failure statuses written after the last successful commit were
    # never persisted; commit once more at the end.
    conn.commit()