Exemplo n.º 1
0
def monitor_progress(num_files):
    """Watch the CoreNLP log file and draw a progress bar in the terminal.

    Repeatedly re-reads ``corenlp_log.txt`` under ``config.DATA`` and looks
    at its last line.  Each time that line is an ``Annotating file``
    message naming a file different from the previous one, the progress
    bar is advanced by one tick.

    Returns when the progress bar reports that it is done, or right away
    (after printing an error) if the log file could not be opened in three
    attempts.

    Args:
        num_files: Value handed to ``ProgressBar``; presumably the number
            of files CoreNLP is expected to annotate -- confirm against
            the ``ProgressBar`` definition.
    """
    from time import sleep

    pbar = ProgressBar(num_files)

    # Try three times to open the file, pausing after each failure.
    for _ in range(3):
        try:
            f = open(os.path.join(config.DATA, 'corenlp_log.txt'))
            break
        except IOError:
            sleep(4)
    else:
        # Every attempt failed.  Report it once and stop here instead of
        # falling through to the polling loop with no file handle bound.
        print("ERROR: Unable to find corenlp_log.txt")
        return

    try:
        fname = ''
        while True:
            f.seek(0)  # Refresh log.
            lines = f.readlines()
            if not lines:
                # Nothing has been written to the log yet; wait and retry.
                sleep(1)
                continue
            line = lines[-1]

            if line.strip().startswith('Annotating file'):
                # Once we find the right line, start the pbar
                if not pbar.has_started():
                    print("Sending files to StanfordCoreNLP...")
                    pbar.start()

                # Ensure corenlp is working on a new file: the name is the
                # last path component, up to the first space after it.
                new_fname = line.split('/')[-1].split(' ')[0]
                if pbar.has_started() and new_fname != fname:
                    fname = new_fname
                    pbar.tick()

            if pbar.is_done():
                # Stop the thread
                return
            sleep(.1)
    finally:
        # Release the log file handle however the loop ends.
        f.close()
Exemplo n.º 2
0
def batch_process(file_dict, dbpath, memory):
    """Parse the files in TEMP with CoreNLP and store articles and triplets.

    A daemon thread running ``monitor_progress`` is started first, then
    ``corenlp.batch_parse`` is invoked on ``TEMP``.  For each parse
    returned, the matching entry of ``file_dict`` (looked up by the parse's
    ``'file_name'``) is saved as an article, and every triplet produced by
    ``process_parsed`` is tagged with the article's path and publication
    date and saved as well.

    Whether or not processing succeeds, every file found under ``TEMP`` is
    deleted afterwards.

    Args:
        file_dict: Mapping from file name to an article dict that has at
            least ``'path'`` and ``'pub_date'`` keys.
        dbpath: Passed through to ``database.save_article`` and
            ``database.save_triplet``.
        memory: Passed through to ``corenlp.batch_parse``.
    """
    from threading import Thread
    try:
        # Progress of the parsing step is reported from a daemon thread.
        watcher = Thread(
            target=monitor_progress,
            kwargs=dict(num_files=len(file_dict)),
        )
        watcher.daemon = True
        watcher.start()

        print("Starting corenlp. Wait a few moments.")
        here = os.path.dirname(os.path.realpath(__file__))
        corenlp_path = os.path.join(here, "stanford-corenlp-full-2013-11-12")
        log_path = os.path.join(TEMP, 'corenlp_log.txt')
        parses = corenlp.batch_parse(
            TEMP, log_path, memory=memory, corenlp_path=corenlp_path)

        # Second pass: pull triplets out of each parse and persist them.
        pbar = ProgressBar(len(file_dict))
        last_seen = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print("Extracting triplets...")
                pbar.start()

            article_dict = file_dict[parse_dict['file_name']]
            database.save_article(article_dict, dbpath)

            # Coreference resolution and triplet extraction.
            triplets = process_parsed(parse_dict)
            if len(triplets) > 0:
                for item in triplets:
                    item['article_path'] = article_dict['path']
                    item['pub_date'] = article_dict['pub_date']
                    database.save_triplet(item, dbpath)

            # Advance the bar only when the parse belongs to a new file.
            current = parse_dict['file_name']
            if current != last_seen:
                last_seen = current
                pbar.tick()
    finally:
        # Clear out the temp files regardless of how processing ended.
        for folder, _subdirs, names in os.walk(TEMP):
            for name in names:
                os.remove(os.path.join(folder, name))
Exemplo n.º 3
0
def batch_process(directory):
    """Preprocess a directory, parse it with CoreNLP and store the results.

    ``preprocess_dir(directory)`` supplies a mapping from file name to
    article dict.  A daemon thread running ``monitor_progress`` is started,
    then ``corenlp.batch_parse`` is invoked on ``config.TEMP``.  For each
    parse returned, the matching article is saved, and every triplet
    produced by ``process_parsed`` is tagged with the article's path and
    publication date and saved as well.

    Whether or not processing succeeds, every file found under
    ``config.TEMP`` is deleted afterwards.

    Args:
        directory: Passed to ``preprocess_dir``.
    """
    from threading import Thread
    try:
        file_dict = preprocess_dir(directory)

        # Progress of the parsing step is reported from a daemon thread.
        watcher = Thread(
            target=monitor_progress,
            kwargs=dict(num_files=len(file_dict)),
        )
        watcher.daemon = True
        watcher.start()
        print("Starting corenlp. Wait a few moments.")
        parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Second pass: pull triplets out of each parse and persist them.
        pbar = ProgressBar(len(file_dict))
        last_seen = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print("Extracting triplets...")
                pbar.start()

            article_dict = file_dict[parse_dict['file_name']]
            database.save_article(article_dict)

            # Coreference resolution and triplet extraction.
            triplets = process_parsed(parse_dict)
            if len(triplets) > 0:
                for item in triplets:
                    item['article_path'] = article_dict['path']
                    item['pub_date'] = article_dict['pub_date']
                    database.save_triplet(item)

            # Advance the bar only when the parse belongs to a new file.
            current = parse_dict['file_name']
            if current != last_seen:
                last_seen = current
                pbar.tick()
    finally:
        # Clear out the temp files regardless of how processing ended.
        for folder, _subdirs, names in os.walk(config.TEMP):
            for name in names:
                os.remove(os.path.join(folder, name))