# Variant with explicit arguments: the caller supplies the preprocessed
# file_dict, the db path, and the CoreNLP memory limit.
def batch_process(file_dict, dbpath, memory):
    """Parses, resolves corefs, and extracts triplets from files in a directory.
    """
    from threading import Thread
    try:
        # Parse files with progress bar
        t = Thread(target=monitor_progress,
                   kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()

        print "Starting corenlp. Wait a few moments."
        this_dir = os.path.dirname(os.path.realpath(__file__))
        corenlp_path = os.path.join(this_dir, "stanford-corenlp-full-2013-11-12")
        log_path = os.path.join(TEMP, 'corenlp_log.txt')
        parses = corenlp.batch_parse(TEMP, log_path, memory=memory,
                                     corenlp_path=corenlp_path)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]
            # Add article to db
            database.save_article(article_dict, dbpath)
            # Resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)
            # Save each triplet to db
            for triplet in triplets:
                triplet['article_path'] = article_dict['path']
                triplet['pub_date'] = article_dict['pub_date']
                database.save_triplet(triplet, dbpath)
            # Tick the progress bar once per parsed file
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:
        # Remove temp files
        for root, dirs, fnames in os.walk(TEMP):
            for fname in fnames:
                os.remove(os.path.join(root, fname))
# Config-driven variant: preprocesses the directory itself and reads paths
# and the memory limit from config.
def batch_process(directory):
    """Parses, resolves corefs, and extracts triplets from files in a directory.
    """
    from threading import Thread
    try:
        file_dict = preprocess_dir(directory)

        # Parse files with progress bar
        t = Thread(target=monitor_progress,
                   kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()

        print "Starting corenlp. Wait a few moments."
        parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]
            # Add article to db
            database.save_article(article_dict)
            # Resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)
            # Save each triplet to db
            for triplet in triplets:
                triplet['article_path'] = article_dict['path']
                triplet['pub_date'] = article_dict['pub_date']
                database.save_triplet(triplet)
            # Tick the progress bar once per parsed file
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:
        # Remove temp files
        for root, dirs, fnames in os.walk(config.TEMP):
            for fname in fnames:
                os.remove(os.path.join(root, fname))
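
# Usage sketch (a hypothetical driver, not code from the repo): one full
# parse/extract pass over a directory of scraped articles. The directory
# name is illustrative; the db path, temp dir, and memory limit all come
# from config in this variant.
def example_batch_run():
    batch_process('scraped_articles/')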
def monitor_progress(num_files):
    """Watches a log file for changes and draws a progress bar in the terminal.
    """
    from time import sleep

    pbar = ProgressBar(num_files)
    # Try three times to open the file; give up if it never appears.
    for x in range(3):
        try:
            f = open(os.path.join(config.DATA, 'corenlp_log.txt'))
            break
        except IOError:
            sleep(4)
    else:
        print "ERROR: Unable to find corenlp_log.txt"
        return

    fname = ''
    while True:
        f.seek(0)  # Refresh log.
        try:
            line = f.readlines()[-1]
        except IndexError:
            # Log is still empty
            sleep(1)
            continue
        if line and line.strip().startswith('Annotating file'):
            # Once we find the right line, start the pbar
            if not pbar.has_started():
                print "Sending files to StanfordCoreNLP..."
                pbar.start()
            # Only tick when corenlp has moved on to a new file
            new_fname = line.split('/')[-1].split(' ')[0]
            if pbar.has_started() and new_fname != fname:
                fname = new_fname
                pbar.tick()
        if pbar.is_done():
            # Stop the thread
            f.close()
            return
        sleep(.1)
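
# The ProgressBar class used throughout isn't shown here. A minimal stand-in
# consistent with how it is called (start/tick/has_started/is_done) might
# look like the sketch below; the real implementation presumably draws a
# fancier bar in the terminal. The class name is hypothetical.
import sys

class ProgressBarSketch(object):
    def __init__(self, total):
        self.total = total
        self.count = 0
        self.started = False

    def start(self):
        self.started = True

    def has_started(self):
        return self.started

    def tick(self):
        # Redraw a simple [done/total] counter in place.
        self.count += 1
        sys.stdout.write('\r[%d/%d]' % (self.count, self.total))
        sys.stdout.flush()

    def is_done(self):
        return self.count >= self.total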
def make_graph(date_range=False, giant=False, show_pbar=True):
    """Builds a directed igraph graph from the reliable triplets in the db,
    optionally restricted to a pub_date range.
    """
    conn = sqlite3.connect(config.DB)
    if date_range is False:
        query = """SELECT article_path, subject, predicate, obj, sentence,
                   sentiment, pub_date, subj_named, obj_named
                   FROM triplets WHERE is_reliable = 1"""
        triplets = conn.execute(query).fetchall()
    else:
        query = """SELECT article_path, subject, predicate, obj, sentence,
                   sentiment, pub_date, subj_named, obj_named
                   FROM triplets WHERE is_reliable = 1
                   AND pub_date >= ? AND pub_date < ?"""
        params = (date_range[0], date_range[1])
        triplets = conn.execute(query, params).fetchall()

    G = ig.Graph(directed=True)
    if show_pbar:
        pbar = ProgressBar(len(triplets))
        pbar.start()

    # Generate graph
    for article_path, subject, pred, obj, sentence, sentiment, pub_date, \
            subj_named, obj_named in triplets:
        # Skip self-loops
        if subject.lower() != obj.lower():
            add_igraph_vertex(subject, subj_named, G)
            add_igraph_vertex(obj, obj_named, G)
            add_igraph_edge(
                article_path, subject, pred, obj, sentence, sentiment, G)
        if show_pbar:
            pbar.tick()
    conn.close()

    if giant:
        return G.clusters(mode=ig.WEAK).giant()
    else:
        return G
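
# Usage sketch: build the giant weakly connected component for one year of
# triplets. The date strings below are illustrative -- whatever format
# pub_date is stored in is what belongs here.
def example_graph():
    g = make_graph(date_range=('2013-01-01', '2014-01-01'), giant=True)
    print g.summary()
    return g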
def set_reliable(frequency_threshold, weight_threshold, unnamed_threshold):
    """Clears and re-sets the is_reliable flag on triplets that pass the
    given thresholds.
    """
    clear_reliable()
    query = "UPDATE triplets SET is_reliable = 1 WHERE ROWID = ?"
    conn = sqlite3.connect(config.DB, isolation_level=None)
    reliable = get_reliable(
        frequency_threshold, weight_threshold, unnamed_threshold)

    print "Setting reliable..."
    pbar = ProgressBar(len(reliable))
    pbar.start()
    for rowid in reliable:
        conn.execute(query, (rowid,))
        pbar.tick()
    conn.commit()
    conn.close()
    print "Done!"
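
# Usage sketch: the threshold values below are illustrative, not defaults
# from the repo. get_reliable() decides which ROWIDs pass; set_reliable()
# only writes the flags.
def example_reflag():
    set_reliable(frequency_threshold=2, weight_threshold=2,
                 unnamed_threshold=1)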
def scrape(self, pause=(30, 60)):
    """Scraper's main loop.

    Pulls a news website's search result page via format url and extracts
    article links with extract_links(). Then it loops through those
    extracted links, pulls out the article with extract_article() and
    metadata with extract_metadata(), and yields the results in a
    dictionary.

    Arguments:
        pause (tuple): Program will pause for a random number of seconds
            between pause[0] and pause[1].
        pause (None): Program will not pause at all. NOT RECOMMENDED!

    Yields:
        dict -- a dictionary containing data returned by extract_metadata()
        and the article content under the 'content' key.
    """
    # Start the progress bar
    pbar = ProgressBar(self._num_articles)
    pbar.start()
    count = 0
    # Loop through search result pages.
    for i in itertools.count(self._start, self._step):
        # Stop if we have the desired number of articles.
        if count >= self._num_articles:
            break
        url = self._format_url.format(i)
        # Log search results page turn
        self.logger.info("Extracting search results from {}".format(url))
        # Begin scraping
        try:
            # Extract search result URLs
            raw_search_results = requests.get(url, headers=self.headers)
            search_results = self.extract_links(raw_search_results.text, url)
            # Add referer to headers to look like a real boy
            self.headers['Referer'] = url
            # Walk through search results
            for link in search_results:
                # Stop if we have the desired number of articles.
                if count >= self._num_articles:
                    break
                self.logger.info("Extracting article from {}".format(link))
                # Download article
                raw_article = requests.get(link, headers=self.headers)
                # Extract article / remove boilerplate
                content = self.extract_article(raw_article.text, link)
                # Extract various metadata
                article = self.extract_metadata(raw_article.text, link)
                # Add article content to metadata dictionary.
                article['content'] = content
                if pause:
                    sleep(randint(pause[0], pause[1]))
                count += 1
                pbar.tick()
                yield article
        except ParseError as e:
            # Log error, then continue with the next page
            self.logger.error(str(e))
            # Update counter and progress bar
            count += 1
            pbar.tick()
        except Exception:
            # Log error, then re-raise to exit
            self.logger.error('Error occurred while in scrape()',
                              exc_info=True)
            raise
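
# Usage sketch: MyTimesScraper is a hypothetical subclass that implements
# extract_links(), extract_article(), and extract_metadata() for one site;
# the constructor argument is illustrative. Only the 'content' key is
# guaranteed by scrape()'s docstring above.
def example_scrape():
    scraper = MyTimesScraper(num_articles=100)
    for article in scraper.scrape(pause=(30, 60)):
        print "Scraped %d characters of content" % len(article['content'])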