# imports assumed by the functions below (hypothetical layout: parse_config,
# run_threaded, split_between_dates and the LIMIT_FETCH / SLEEP_BETWEEN_LOOPS
# constants are expected to come from src.utils)
import sys
import time
from concurrent.futures import ThreadPoolExecutor

from loguru import logger  # logger.success() is a loguru-specific level

from src.utils import *
from src.dbmongo import get_db


def main():
    logger.success("Starting")
    config = parse_config("config.json")
    print(config)

    # load database
    db = get_db(config)
    logger.info("%sMB in db" % db.get_db_mb())

    # insert the seed people into the db
    for s in config["seed"]:
        db.insert_person(s)

    # fix entities with a bad label
    db.update_entities_to(config["actually_orgs"], "ORG")

    threads = [
        run_threaded(explore_news, config),
        run_threaded(explore_people, config),
        run_threaded(explore_entities, config),
        run_threaded(next_round, config),
    ]

    # wait for next_round to complete; polling instead of join() keeps ctrl+c working
    while threads[-1].is_alive():
        time.sleep(1)

    logger.success("Round execution finished, stopping")
    sys.exit()
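# `run_threaded` presumably lives in src.utils and is not shown here. A minimal
# sketch of the assumed behaviour (hypothetical name and implementation): start
# a daemon thread running fn(config) and return the Thread, so main() can poll
# is_alive(). Daemon threads matter because the explore_* loops run forever and
# would otherwise keep the process alive after sys.exit().
import threading

def run_threaded_sketch(fn, config):
    t = threading.Thread(target=fn, args=(config,), daemon=True, name=fn.__name__)
    t.start()
    return t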
def explore_news(config):
    # iterate over all unexplored news articles in the database
    db = get_db(config)
    while True:  # db.has_unexplored_news():
        logger.warning("New explore news loop")
        with ThreadPoolExecutor() as pool:
            # sequential version:
            # for n in db.get_unexplored_news().limit(LIMIT_FETCH):
            #     explore_news_piece(db, n)
            # note: worker exceptions are dropped here, see the sketch below
            pool.map(lambda n: explore_news_piece(db, n),
                     db.get_unexplored_news().limit(LIMIT_FETCH))
        # sleep waiting for other threads to do their part
        time.sleep(SLEEP_BETWEEN_LOOPS)
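# Executor.map() returns a lazy iterator and only re-raises a worker's
# exception when that iterator is consumed; since explore_news (and the similar
# loops below) discard the result, failures inside the workers vanish silently.
# A hypothetical variant that surfaces them:
from concurrent.futures import ThreadPoolExecutor

def map_and_log_errors(fn, items):
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(fn, item) for item in items]
    # leaving the with-block waits for all futures to finish
    for f in futures:
        if f.exception() is not None:
            logger.error("worker failed: %s" % f.exception())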
def explore_entities(config):
    # iterate over all unexplored news and extract their entities
    db = get_db(config)
    while True:  # db.has_unexplored_news_entities():
        logger.warning("New explore entities loop")
        # sequential version:
        # for n in db.get_unexplored_news_entities().limit(LIMIT_FETCH):
        #     extract_entities(db, n)
        with ThreadPoolExecutor() as pool:
            pool.map(lambda n: extract_entities(db, n),
                     db.get_unexplored_news_entities().limit(LIMIT_FETCH))
        # sleep waiting for other threads to do their part
        time.sleep(SLEEP_BETWEEN_LOOPS)
def next_round(config):
    db = get_db(config)
    rounds = 0
    while rounds < config["rounds"]:
        while db.has_unexplored_people() or db.has_unexplored_news() \
                or db.has_unexplored_news_entities():
            time.sleep(10)
        # all threads completed: log whether anything is still left to process
        logger.success("db.has_unexplored_people(): %s" % db.has_unexplored_people())
        logger.success("db.has_unexplored_news(): %s" % db.has_unexplored_news())
        logger.success("db.has_unexplored_news_entities(): %s" % db.has_unexplored_news_entities())
        # the current collection round is over -> add new people,
        # unless this was the last round
        if rounds + 1 == config["rounds"]:
            break
        extract_next_round_people(config, db)
        rounds += 1
    logger.success("NEXT_ROUND exited")
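# The quiescence check above can race: all three has_unexplored_* calls may
# briefly return False while a worker thread is still inside pool.map and about
# to insert new documents. A hypothetical stricter check would require the
# database to stay drained for two consecutive polls:
def is_drained_sketch(db, grace=10):
    for _ in range(2):
        if db.has_unexplored_people() or db.has_unexplored_news() \
                or db.has_unexplored_news_entities():
            return False
        time.sleep(grace)
    return True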
def explore_people(config):
    # iterate over all people and search arquivo.pt for them
    db = get_db(config)
    intervals = split_between_dates(config["from"], config["to"])
    while True:  # db.has_unexplored_people():
        logger.warning("New explore people loop")
        try:
            for person in db.get_unexplored_people().limit(LIMIT_FETCH):
                # sequential version:
                # for site in config["websites"]:
                #     explore_person(config, db, person, site, intervals)
                with ThreadPoolExecutor() as pool:
                    pool.map(
                        lambda site: explore_person(config, db, person, site, intervals),
                        config["websites"],
                    )
                # mark the person as processed once all sites were queried
                person["processed"] = True
                db.upsert_person(person)
        except Exception as e:
            logger.error("A rare exception [%s] on explore_people" % e)
        # sleep waiting for other threads to do their part
        time.sleep(SLEEP_BETWEEN_LOOPS)
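# `split_between_dates` also presumably comes from src.utils. A hypothetical
# sketch of the assumed behaviour: split the configured [from, to] range into
# one interval per year, a common granularity for arquivo.pt queries (the real
# helper may use a different interval size or date format):
from datetime import date

def split_between_dates_sketch(start_year, end_year):
    return [(date(y, 1, 1), date(y, 12, 31))
            for y in range(int(start_year), int(end_year) + 1)]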
import sys

sys.path = ['.', '..', '../src'] + sys.path

from src.utils import *
from src.dbmongo import DbMongo, get_db

config = parse_config("config.json")
db = get_db(config)
print("%sMB in db" % db.get_db_mb())

status = {
    "unprocessed_people": db['people'].count_documents({"processed": {"$exists": False}}),
    "unprocessed_news": db['news'].count_documents({"processed": {"$exists": False}}),
    "unprocessed_news_entities": db['news'].count_documents({
        "processed_entities": {"$exists": False},
        "valid": {"$exists": False},
    }),
    "most_mentioned": list(db.get_most_mentioned([], min_len=0)),
}
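# The status dict above is built but never displayed; a minimal way to inspect
# it (assuming the counts are plain ints and the "most_mentioned" rows are
# JSON-serialisable, with default=str as a fallback for anything that is not):
import json

print(json.dumps(status, indent=2, default=str))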