def parse_unparsed(session, crawlers):
    """Tries to parse every Rawurl left in STATE_UNPARSED with any of the given crawlers."""
    session.flush()
    unparsed_rawurl_count = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).count()
    if unparsed_rawurl_count == 0:
        print('nothing to parse')
        return
    print('Trying to parse ' + str(unparsed_rawurl_count) + ' unparsed urls...')

    parse_success = 0
    parse_failed = 0
    before = time.time()
    rawurl = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).first()
    rawurl_index = 0
    while rawurl is not None:
        foundparser = False
        for _, parser in crawlers.items():
            try:
                parser.parse_rawurl(rawurl)
                parse_success += 1
                foundparser = True
                session.commit()
                break
            except InvalidArticleException:
                continue
        if not foundparser:
            # no crawler accepted the url; mark it so it is not picked up again
            rawurl.state = Rawurl.STATE_INVALID
            parse_failed += 1
        rawurl_index += 1
        if rawurl_index % 10 == 0:
            percent, etl = statistics_get_percent_estimated_time(before, rawurl_index, unparsed_rawurl_count)
            print('\r %0.2f%% | %d urls parsed, %d urls failed | ETL: %s'
                  % (percent, parse_success, parse_failed, etl), end='')
        rawurl = session.query(Rawurl).filter_by(state=Rawurl.STATE_UNPARSED).first()
    session.commit()
    print('')
    print('from %d urls, %d failed to parse (%d%%)'
          % (unparsed_rawurl_count, parse_failed, round(100 * parse_failed / unparsed_rawurl_count)))
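# --- illustrative sketch (not the project's implementation) -------------------
# The routines in this module report progress through
# statistics_get_percent_estimated_time, which is defined elsewhere in the
# project. The sketch below only shows the assumed contract: given a start
# timestamp, the number of items done and the total, return
# (percent complete, estimated-time-left as a display string).
# It assumes `import time` at module level, which the functions above already need.
def statistics_get_percent_estimated_time_sketch(time_before, done, total):
    percent = 100.0 * done / total if total else 100.0
    elapsed = time.time() - time_before
    remaining = elapsed * (total - done) / done if done else 0.0
    minutes, seconds = divmod(int(remaining), 60)
    hours, minutes = divmod(minutes, 60)
    return percent, '%d:%02d:%02d' % (hours, minutes, seconds)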
def process_articles(session):
    """Processes (analyzes) articles for each day."""
    print("Fetching words...", end="")
    words = get_all_words_in_db(session)
    print("done.")

    print("Fetching dates...", end="")
    dates = get_articles_dates(session)
    print("done.")

    max_index = len(dates)
    time_before = time.time()
    statistics = {
        "day": 0,
        "maxday": len(dates),
        "timebefore": time.time(),
        "articles_processed": 0,
        "new_words": 0,
        "new_occurences": 0,
    }

    print("Processing articles...")
    for date in dates:
        process_daily_articles(session, date, words, statistics)
        statistics["day"] += 1
        percent, etl = statistics_get_percent_estimated_time(
            statistics["timebefore"], statistics["day"], statistics["maxday"]
        )
        print("\r%0.2f%% | ETL: %s | %s" % (percent, etl, str(date)), end="")
        session.commit()

    print("\nPerforming final commit...", end="")
    session.commit()
    print("done.\n")

    print("Articles processed: %d" % statistics["articles_processed"])
    print("New words added: %d" % statistics["new_words"])
    print("Occurrences added: %d" % statistics["new_occurences"])
    print("Dates spanned: %d" % max_index)
    seconds = round(time.time() - time_before)
    print("Time taken: %s" % seconds_to_string(seconds))
    print("Articles per second: %0.2f" % (statistics["articles_processed"] / seconds))
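# --- illustrative sketch (not the project's implementation) -------------------
# process_articles() formats the total runtime with seconds_to_string, which is
# defined elsewhere in the project. The sketch below shows the assumed
# behaviour: turn a raw second count into a short human-readable duration.
def seconds_to_string_sketch(seconds):
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return '%dh %dm %ds' % (hours, minutes, secs)
    if minutes:
        return '%dm %ds' % (minutes, secs)
    return '%ds' % secs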
def process_daily_articles(session, article_date, all_words, statistics):
    daily_articles = session.query(Article).filter_by(state=0).filter_by(date=article_date)
    verbose = False

    # this query is often empty, yet often the most expensive one,
    # because it triggers a large autoflush
    occurences = session.query(Occurence).filter_by(date=article_date)
    if verbose:
        print("\nnumber of preloaded occurrences: %d" % occurences.count())

    # maps word text to its Occurence, separately for each source
    all_occurences = defaultdict(dict)
    for occ in occurences:
        all_occurences[occ.source][occ.word.text] = occ

    # statistics
    cnt_of_new_words = 0
    cnt_of_new_occurences = 0
    cnt_of_existing_occurences = 0
    cnt_of_existing_words = 0
    articles_processed = 0

    words_dict = defaultdict(Counter)
    if verbose:
        print("Number of articles for date %s is %d" % (str(article_date), daily_articles.count()))

    start = time.time()
    for article in daily_articles:
        words_dict[article.source].update(re.split(r"\W+", article.content))
        # we cannot commit until this state change is reflected in the occurrences
        article.state = 1
        articles_processed += 1

    count_sources = len(words_dict)
    index_source = 0
    for source in words_dict:
        iterator = words_dict[source].items()
        index_word = 0
        for word_text, frequency in iterator:
            # add the word if it is not known yet
            if word_text in all_words:
                cnt_of_existing_words += 1
            else:
                new_word = Word(word_text, strip_accents(word_text).lower())
                cnt_of_new_words += 1
                session.add(new_word)
                all_words[word_text] = new_word

            # add or update the occurrence
            all_occurence_current = all_occurences[source]
            if word_text in all_occurence_current:
                all_occurence_current[word_text].count += frequency
                all_occurence_current[word_text].article_count += 1
                cnt_of_existing_occurences += 1
            else:
                new_occurence = Occurence(all_words[word_text], frequency, article_date, source, 1)
                # the following line can be omitted
                all_occurence_current[word_text] = new_occurence
                session.add(new_occurence)
                cnt_of_new_occurences += 1

            index_word += 1
            if index_word % 100 == 0:
                session.flush()
                precise_progress = (
                    statistics["day"]
                    + (index_source + index_word / len(words_dict[source])) / count_sources
                )
                percent, etl = statistics_get_percent_estimated_time(
                    statistics["timebefore"], precise_progress, statistics["maxday"]
                )
                print("\r%0.2f%% | ETL: %s | %s" % (percent, etl, str(article_date)), end="")
        index_source += 1

    # session.commit()
    statistics["articles_processed"] += articles_processed
    statistics["new_words"] += cnt_of_new_words
    statistics["new_occurences"] += cnt_of_new_occurences

    stop = time.time()
    if verbose:
        print("number of processed articles %d" % articles_processed)
        print("number of new words %d" % cnt_of_new_words)
        print("number of existing words %d" % cnt_of_existing_words)
        print("number of new occurrences %d" % cnt_of_new_occurences)
        print("number of existing occurrences %d" % cnt_of_existing_occurences)
        print("processing one article took: %0.2f s" % round((stop - start) / articles_processed, 2))
        print("")
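# --- illustrative sketch (not the project's implementation) -------------------
# process_daily_articles() stores a normalized form of every word via
# strip_accents(word_text).lower(); strip_accents itself is defined elsewhere in
# the project. A common way to get that behaviour (drop diacritics, e.g.
# "článok" -> "clanok") is Unicode decomposition, sketched here under that assumption.
import unicodedata

def strip_accents_sketch(text):
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))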
def crawl(self, start=1, maxindex=None, settings=None):
    """Crawls provided links for articles, then downloads and parses them.

    Prints statistics.
    """
    count_links_already_in_db = 0
    count_links_new_found = 0
    count_links_new_added = 0
    count_links_download_error = 0
    count_links_parse_error = 0
    count_lists_download_error = 0
    count_lists_checked = 0

    # repair settings
    if settings is None:
        settings = {}
    if 'stop_on_oldpage' not in settings:
        settings['stop_on_oldpage'] = False
    if 'stop_on_lastpage' not in settings:
        settings['stop_on_lastpage'] = False

    # pageurls = self.get_urls_to_search_for_links()
    pageurls = self.get_url_iterator(start=start, maxindex=maxindex)
    page_index = 0
    time_before = time.time()
    last_page_warning = False

    for pageurl in pageurls:
        count_lists_checked += 1
        try:
            # print('\nopening %s \n' % pageurl)
            pagehtml = my_urlopen(pageurl)
            last_page_warning = False
        except (PageNotFoundException, FailedAttemptsException, ConnectionResetError):
            print('\ncould not open article list %s' % pageurl, file=sys.stderr)
            count_lists_download_error += 1
            if last_page_warning:
                pageurls.stop()
                print('\nStopping because of two 404s in a row\n')
                last_page_warning = False
                continue
            last_page_warning = True
            continue

        soup = BeautifulSoup(pagehtml)

        # parse for links
        links = self.get_links_from_soup(soup, pageurl)

        if settings['stop_on_lastpage'] and self.stopping_criterion(soup):
            pageurls.stop()
            print('\nStopping because individual criterion fired (no nextpage link)\n')

        link_index = 1
        link_count = len(links)
        localnewlinks = 0
        for link in links:  # approx. 10 links
            if not self.link_exists_in_db(link):
                count_links_new_found += 1
                localnewlinks += 1

                # download article (Exception catches anything else that goes wrong)
                try:
                    htmlcontent = self.download_link(link)
                except (PageNotFoundException, FailedAttemptsException, ConnectionResetError, Exception):
                    print('\nnot able to download article %s\n' % link, file=sys.stderr)
                    count_links_download_error += 1
                    continue

                # add link
                rawurl = self.store_link(link, htmlcontent)

                # parse page
                try:
                    self.parse_rawurl(rawurl)
                except InvalidArticleException:
                    rawurl.state = Rawurl.STATE_INVALID
                    print('\ninvalid article %s' % rawurl.url, file=sys.stderr)
                    count_links_parse_error += 1

                count_links_new_added += 1
            else:
                count_links_already_in_db += 1

            link_index += 1
            percent, etl = statistics_get_percent_estimated_time(
                time_before, page_index + link_index / link_count, len(pageurls))
            # print('\r%0.2f%% | ETL: %s' % (percent, etl), end='')
            print('\r%0.2f%% | ETL: %s | new: %d | old: %d | pix: %d'
                  % (percent, etl, count_links_new_added, count_links_already_in_db, page_index), end='')

        if settings['stop_on_oldpage'] and localnewlinks == 0:
            pageurls.stop()
            print('\nStopping because no new links were found on the last page\n')
            # print('stopping topic, no new articles')

        self.session.commit()
        page_index += 1
        if page_index % 100 == 0:
            print('\nwhoa! %s | %s!' % (strftime("%H:%M:%S"), pageurl))

        percent, etl = statistics_get_percent_estimated_time(time_before, page_index, len(pageurls))
        print('\r%0.2f%% | ETL: %s | new: %d | old: %d | pix: %d'
              % (percent, etl, count_links_new_added, count_links_already_in_db, page_index), end='')

    print('')
    print('Checked %d article lists (originally: %d, %d failed to download)'
          % (count_lists_checked, len(pageurls), count_lists_download_error))
    print('Found %d links' % (count_links_already_in_db + count_links_new_found))
    print('Article lists failed to download: %d' % count_lists_download_error)
    print('-Already in db: %d' % count_links_already_in_db)
    print('-New links found: %d' % count_links_new_found)
    print('--Failed to download: %d' % count_links_download_error)
    print('--Downloaded and stored, but unable to parse as article: %d' % count_links_parse_error)
    print('--Downloaded, stored & parsed successfully: %d' % count_links_new_added)
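# --- illustrative usage sketch (assumed driver code, not part of the module) ----
# Typical pipeline: crawl every source for new article urls, retry parsing of
# anything still unparsed, then run the per-day word/occurrence analysis.
# `make_session` and `build_crawlers` are hypothetical factory helpers; the real
# project wires up the SQLAlchemy session and the crawler instances itself.
if __name__ == '__main__':
    session = make_session()              # hypothetical session factory
    crawlers = build_crawlers(session)    # hypothetical {name: crawler} dict
    for crawler in crawlers.values():
        # stop_on_oldpage makes crawl() quit once a whole article list holds only known urls
        crawler.crawl(start=1, settings={'stop_on_oldpage': True})
    parse_unparsed(session, crawlers)
    process_articles(session)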