Example #1
def main():
    logger.success("Starting")
    config = parse_config("config.json")
    print(config)

    # load database
    db = get_db(config)
    logger.info("%sMB in db" % db.get_db_mb())

    # insert seed into db
    for s in config["seed"]:
        db.insert_person(s)
    # fix entities with bad label
    db.update_entities_to(config["actually_orgs"], "ORG")

    threads = [
        run_threaded(explore_news, config),
        run_threaded(explore_people, config),
        run_threaded(explore_entities, config),
        run_threaded(next_round, config)
    ]
    while threads[-1].is_alive():  # wait for next_round to complete
        time.sleep(1)  # poll instead of join() so Ctrl+C still works
    logger.success("Round execution finished, stopping")
    exit()
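
run_threaded is not defined in these excerpts. A minimal sketch of what it presumably does, assuming it simply wraps threading.Thread with daemon=True (which would explain why main() only polls the last thread and then calls exit() while the other loops are still running); the helper name and signature come from the call sites above, everything else is an assumption:

import threading

def run_threaded(target, config):
    # Hypothetical sketch: run `target(config)` in a daemon thread and return
    # the Thread object so main() can poll is_alive() on it.
    t = threading.Thread(target=target, args=(config,), daemon=True)
    t.start()
    return t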
Example #2
def explore_news(config):
    # iterate all unexplored news articles in the database
    db = get_db(config)
    while True:  # db.has_unexplored_news():
        logger.warning("New explore news loop")
        with ThreadPoolExecutor() as pool:
            # for n in db.get_unexplored_news().limit(LIMIT_FETCH):
            #     explore_news_piece(db, n)
            pool.map(lambda n: explore_news_piece(db, n),
                     db.get_unexplored_news().limit(LIMIT_FETCH))
        time.sleep(SLEEP_BETWEEN_LOOPS)  # sleep waiting for other threads to do their part
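
One caveat about the pool.map call above: Executor.map returns a lazy iterator, and because its results are never consumed, an exception raised inside explore_news_piece is silently dropped (the with block still waits for every task to finish). If errors need to surface, a sketch using submit()/as_completed() from the same concurrent.futures module would do it; only names already used above (explore_news_piece, db, LIMIT_FETCH, logger) are assumed:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as pool:
    futures = [pool.submit(explore_news_piece, db, n)
               for n in db.get_unexplored_news().limit(LIMIT_FETCH)]
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception from the worker
        except Exception as e:
            logger.error("explore_news_piece failed: %s" % e)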
Example #3
def explore_entities(config):
    # iterate all unexplored news and extract their entities
    db = get_db(config)
    while True:  # db.has_unexplored_news_entities():
        logger.warning("New explore entities loop")
        # for n in db.get_unexplored_news_entities().limit(LIMIT_FETCH):
        #     extract_entities(db, n)
        with ThreadPoolExecutor() as pool:
            pool.map(lambda n: extract_entities(db, n),
                     db.get_unexplored_news_entities().limit(500))
        time.sleep(SLEEP_BETWEEN_LOOPS)  # sleep waiting for other threads to do their part
Example #4
def next_round(config):
    db = get_db(config)
    rounds = 0
    while rounds < config["rounds"]:
        while db.has_unexplored_people() or db.has_unexplored_news() or db.has_unexplored_news_entities():
            time.sleep(10)
        # all threads completed: check if there are still things to process:
        logger.success("db.has_unexplored_people(): %s" % db.has_unexplored_people())
        logger.success("db.has_unexplored_news(): %s" % db.has_unexplored_news())
        logger.success("db.has_unexplored_news_entities(): %s" % db.has_unexplored_news_entities())
        # the current collection round is over
        if rounds + 1 == config["rounds"]:
            break  # last round: nothing left to seed
        # otherwise seed new people for the next round
        extract_next_round_people(config, db)
        rounds += 1
    logger.success("NEXT_ROUND exited")
Example #5
def explore_people(config):
    # iterate all people and search arquivo.pt for them
    db = get_db(config)
    intervals = split_between_dates(config["from"], config["to"])
    while True:  # db.has_unexplored_people():
        logger.warning("New explore people loop")
        try:
            for person in db.get_unexplored_people().limit(LIMIT_FETCH):
                # for site in config["websites"]:
                #     explore_person(config, db, person, site, intervals)
                with ThreadPoolExecutor() as pool:
                    pool.map(
                        lambda site: explore_person(config, db, person, site, intervals),
                        config["websites"]
                    )
                person["processed"] = True
                db.upsert_person(person)
        except Exception as e:
            logger.error("A rare exception [%s] on explore_people" % e)
        time.sleep(SLEEP_BETWEEN_LOOPS) # sleep waiting for other threads to do their part
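
split_between_dates is not shown either; from its use it presumably splits the configured "from"/"to" range into smaller intervals so each arquivo.pt query covers a bounded period. A minimal sketch under that assumption, treating the bounds as years (the real helper's granularity and date format may differ):

def split_between_dates(start_year, end_year):
    # Hypothetical sketch: one (from, to) pair per year in the range.
    return [(year, year + 1) for year in range(int(start_year), int(end_year))]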
Example #6
import sys
sys.path = ['.', '..', '../src'] + sys.path

from src.utils import *
from src.dbmongo import DbMongo, get_db

config = parse_config("config.json")
db = get_db(config)
print("%sMB in db" % db.get_db_mb())

status = {
    "unprocessed_people": db['people'].count_documents({"processed": {"$exists": False}}),
    "unprocessed_news": db['news'].count_documents({"processed": {"$exists": False}}),
    "unprocessed_news_entities": db['news'].count_documents({
        "processed_entities": {"$exists": False},
        "valid": {"$exists": False}
    }),
    "most_mentioned": list(db.get_most_mentioned([], min_len=0))
}
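
For a quick look at the collected status, the dict can simply be dumped; default=str is only there to guard against non-JSON-serializable values (e.g. ObjectId) that the most_mentioned documents might contain:

import json

print(json.dumps(status, indent=2, default=str))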