def monitor_progress(num_files):
    """Watch the CoreNLP log file and draw a progress bar in the terminal.

    Intended to run as a thread target: it polls ``corenlp_log.txt`` under
    ``config.DATA`` and ticks a ``ProgressBar`` each time the last log line
    names a new file being annotated.

    Args:
        num_files: Number of files expected; passed to ``ProgressBar``.

    Returns:
        None. Returns once the progress bar reports it is done, or
        immediately (after printing an error) when the log file could not
        be opened after three attempts.
    """
    from time import sleep

    pbar = ProgressBar(num_files)
    log_path = os.path.join(config.DATA, 'corenlp_log.txt')

    # Try three times to open the file; the log may not exist yet when this
    # starts running.
    f = None
    for _ in range(3):
        try:
            f = open(log_path)
            break
        except IOError:
            sleep(4)

    if f is None:
        # Without a log there is nothing to watch. Bail out instead of
        # failing later with a NameError on the unbound file handle.
        print("ERROR: Unable to find corenlp_log.txt")
        return

    fname = ''
    try:
        while True:
            f.seek(0)  # Refresh log.
            lines = f.readlines()
            if not lines:
                # Log exists but nothing has been written to it yet.
                sleep(1)
                continue

            line = lines[-1]
            if line.strip().startswith('Annotating file'):
                # Once we find the right line, start the pbar
                if not pbar.has_started():
                    print("Sending files to StanfordCoreNLP...")
                    pbar.start()

                # Ensure corenlp is working on a new file: the name is the
                # last path component, up to the first space.
                new_fname = line.split('/')[-1].split(' ')[0]
                if pbar.has_started() and new_fname != fname:
                    fname = new_fname
                    pbar.tick()

            if pbar.is_done():
                # Stop the thread
                return
            sleep(.1)
    finally:
        # Release the log handle however the loop exits.
        f.close()
def batch_process(file_dict, dbpath, memory):
    """Run CoreNLP over the temp directory and store articles and triplets.

    Starts a daemon thread running ``monitor_progress``, batch-parses the
    files in ``TEMP``, then saves each parsed article and its extracted
    triplets to the database. Every file under ``TEMP`` is removed when the
    function exits, whether it succeeded or raised.

    Args:
        file_dict: Mapping from a parse's ``'file_name'`` to an article dict
            providing at least ``'path'`` and ``'pub_date'``.
        dbpath: Passed through to ``database.save_article`` and
            ``database.save_triplet``.
        memory: Passed through to ``corenlp.batch_parse``.
    """
    from threading import Thread

    try:
        # Progress watcher for the parsing phase; daemonized so it never
        # keeps the process alive on its own.
        watcher = Thread(
            target=monitor_progress,
            kwargs=dict(num_files=len(file_dict)),
        )
        watcher.daemon = True
        watcher.start()

        print("Starting corenlp. Wait a few moments.")
        here = os.path.dirname(os.path.realpath(__file__))
        corenlp_home = os.path.join(here, "stanford-corenlp-full-2013-11-12")
        corenlp_log = os.path.join(TEMP, 'corenlp_log.txt')
        parsed_files = corenlp.batch_parse(
            TEMP, corenlp_log, memory=memory, corenlp_path=corenlp_home
        )

        # Second phase: triplet extraction, with its own progress bar.
        progress = ProgressBar(len(file_dict))
        previous_name = ''
        for parsed in parsed_files:
            if not progress.has_started():
                print("Extracting triplets...")
                progress.start()

            article = file_dict[parsed['file_name']]

            # Store the article itself first.
            database.save_article(article, dbpath)

            # Coreference resolution and triplet extraction.
            extracted = process_parsed(parsed)

            # Attach article metadata to each triplet and store it.
            if len(extracted) > 0:
                for item in extracted:
                    item['article_path'] = article['path']
                    item['pub_date'] = article['pub_date']
                    database.save_triplet(item, dbpath)

            # Advance the bar only when a different file shows up.
            if parsed['file_name'] != previous_name:
                previous_name = parsed['file_name']
                progress.tick()
    finally:
        # Clear out every temp file, on success or failure.
        for folder, _subdirs, names in os.walk(TEMP):
            for name in names:
                os.remove(os.path.join(folder, name))
def batch_process(directory):
    """Preprocess a directory, parse it with CoreNLP, and store the results.

    Builds the file dictionary with ``preprocess_dir``, starts a daemon
    thread running ``monitor_progress``, batch-parses ``config.TEMP``, then
    saves each parsed article and its extracted triplets to the database.
    Every file under ``config.TEMP`` is removed when the function exits,
    whether it succeeded or raised.

    Args:
        directory: Passed to ``preprocess_dir``; the result is indexed by
            each parse's ``'file_name'`` to obtain an article dict providing
            at least ``'path'`` and ``'pub_date'``.
    """
    from threading import Thread

    try:
        articles_by_name = preprocess_dir(directory)

        # Progress watcher for the parsing phase; daemonized so it never
        # keeps the process alive on its own.
        watcher = Thread(
            target=monitor_progress,
            kwargs=dict(num_files=len(articles_by_name)),
        )
        watcher.daemon = True
        watcher.start()

        print("Starting corenlp. Wait a few moments.")
        parsed_files = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Second phase: triplet extraction, with its own progress bar.
        progress = ProgressBar(len(articles_by_name))
        previous_name = ''
        for parsed in parsed_files:
            if not progress.has_started():
                print("Extracting triplets...")
                progress.start()

            article = articles_by_name[parsed['file_name']]

            # Store the article itself first.
            database.save_article(article)

            # Coreference resolution and triplet extraction.
            extracted = process_parsed(parsed)

            # Attach article metadata to each triplet and store it.
            if len(extracted) > 0:
                for item in extracted:
                    item['article_path'] = article['path']
                    item['pub_date'] = article['pub_date']
                    database.save_triplet(item)

            # Advance the bar only when a different file shows up.
            if parsed['file_name'] != previous_name:
                previous_name = parsed['file_name']
                progress.tick()
    finally:
        # Clear out every temp file, on success or failure.
        for folder, _subdirs, names in os.walk(config.TEMP):
            for name in names:
                os.remove(os.path.join(folder, name))