def next(self):
    '''(Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    while True:
        if len(self.visit_queue) <= 0:
            raise StopIteration
        current_url = self.visit_queue.pop()
        if self._should_skip():
            logging.info(u"skipping {0} randomly".format(current_url))
            continue
        logging.info(u"visiting {0}".format(current_url))
        # use newspaper to download and parse the article
        article = ExplorerArticle(current_url)
        article.download()
        # get urls from the article
        for link in article.get_links():
            url = urljoin(current_url, link.href, False)
            if self.url_in_filter(url, self.filters):
                logging.info(u"skipping {0} because it matches a filter".format(url))
                continue
            try:
                parsed_url = urlparse(url)
                parsed_as_list = list(parsed_url)
                if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                    logging.info(u"skipping url with invalid scheme: {0}".format(url))
                    continue
                # blank the fragment (index 5), then canonicalize
                parsed_as_list[5] = ''
                url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
            except Exception as e:
                logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                continue
            # stay on the crawled site's domain
            if not parsed_url.netloc.endswith(self.domain):
                continue
            if url in self.visited_urls:
                continue
            self.visit_queue.appendleft(url)
            self.visited_urls.add(url)
            logging.info(u"added {0} to the visit queue".format(url))
        self.pages_visited += 1
        return article
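# The subtlest step in next() above is the url normalization: the fragment
# (index 5 of urlparse's 6-tuple) is blanked and the remaining parts are
# canonicalized through urlnorm before the duplicate check. Below is a
# minimal sketch of that step in isolation; the normalize_url helper name
# is ours, not part of the crawler.
from urlparse import urlparse, urlunparse  # urllib.parse on Python 3
import urlnorm

def normalize_url(url):
    '''Return a normalized http(s) url with its fragment removed,
    or None if the url is unusable. Illustrative helper.'''
    parsed = urlparse(url)
    if parsed.scheme not in (u"http", u"https"):
        return None
    parts = list(parsed)
    parts[5] = ''  # index 5 is the fragment
    try:
        return urlunparse(urlnorm.norm_tuple(*parts))
    except Exception:
        return None  # treat malformed urls as unusable, as the crawler does

# e.g. normalize_url(u"HTTP://Example.com/a/../b#frag") should yield
# u"http://example.com/b"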
def parse_articles_per_site(db_keywords, source_sites, twitter_accounts_explorer, site):
    logging.info("Started multiprocessing of Site: %s", site.name)
    # Set up per-site logging
    setup_logging(site.name)
    article_count = 0
    newspaper_articles = []
    crawlersource_articles = []
    logging.info("Site: %s Type: %i" % (site.name, site.mode))
    # 0 = newspaper, 1 = crawler, 2 = both
    if site.mode == 0 or site.mode == 2:
        logging.disable(logging.ERROR)
        newspaper_source = newspaper.build(site.url,
                                           memoize_articles=False,
                                           keep_article_html=True,
                                           fetch_images=False,
                                           number_threads=1)
        logging.disable(logging.NOTSET)
        newspaper_articles = newspaper_source.articles
        article_count += newspaper_source.size()
        logging.info("populated {0} articles using newspaper".format(article_count))
    if site.mode == 1 or site.mode == 2:
        crawlersource_articles = Crawler.Crawler(site)
        article_count += crawlersource_articles.probabilistic_n
        logging.debug("expecting {0} from plan b crawler".format(
            crawlersource_articles.probabilistic_n))
    article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles).__iter__()
    processed = 0
    filters = site.referringsitefilter_set.all()
    while True:
        try:
            try:
                article = article_iterator.next()
            except StopIteration:
                break
            # all the iteration bookkeeping sits at the top because this
            # loop uses continue extensively
            processed += 1
            if url_in_filter(article.url, filters):
                logging.info("{0} matches a filter, skipping".format(article.url))
                continue
            print("%s (Article|%s) %i/%i \r" % (str(timezone.localtime(timezone.now()))[:-13],
                                                site.name, processed, article_count))
            logging.info("Processing %s" % article.url)
            url = article.url
            # strip the "www." prefix so urls compare consistently
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            logging.debug("ExplorerArticle created")
            # Try to download and extract the useful data
            if not article.is_downloaded:
                if not article.download():
                    logging.warning("article skipped because download failed")
                    continue
            url = article.canonical_url
            if not article.is_parsed:
                if not article.preliminary_parse():
                    logging.warning("article skipped because parse failed")
                    continue
            logging.debug("Article parsed")
            logging.debug(u"Title: {0}".format(repr(article.title)))
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keywords from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within the article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            # [] converts to False, so this skips articles with no match at all
            if (not keywords) and (not sources[0]) and (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            article.newspaper_parse()
            text = article._newspaper_text
            # Rerun get_keywords with the text parsed by newspaper
            keywords = get_keywords(article, db_keywords)
            if (not keywords) and (not sources[0]) and (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            # load selectors from the db
            # each parameter is a namedtuple of "css" and "regex"
            title = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=0)) or article.title
            authors = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=1))
            if authors:
                authors = [authors]
            else:
                authors = article.authors
            pub_date = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=2))
            if pub_date:
                pub_date = dateutil.parser.parse(pub_date)
            else:
                pub_date = get_pub_date(article)
            mod_date = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=3))
            language = article.language
            date_now = timezone.localtime(timezone.now())
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # The article is new to the database, so add it
                db_article = Article(title=title, url=url, domain=site.url,
                                     date_added=date_now,
                                     date_last_seen=date_now,
                                     date_published=pub_date,
                                     date_modified=mod_date,
                                     language=language,
                                     text=text)
                db_article.save()
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     anchor_text=source[2], matched=True,
                                                     local=(source[1] in site.url))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     anchor_text=source[2], matched=False,
                                                     local=(source[1] in site.url))
            else:
                logging.info("Modifying existing Article in the DB")
                # The article already exists, so update every field except
                # date_added
                db_article = db_article_list[0]
                db_article.title = title
                db_article.url = url
                db_article.domain = site.url
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_last_seen = date_now
                db_article.date_published = pub_date
                db_article.date_modified = mod_date
                db_article.language = language
                db_article.text = text
                db_article.save()
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         anchor_text=source[2], matched=True,
                                                         local=(source[1] in site.url))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         anchor_text=source[2], matched=False,
                                                         local=(source[1] in site.url))
            warc_creator.enqueue_article(url)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            logging.exception("Unhandled exception while crawling: " + str(e))
    # logged both before and after the logging handler is reset
    logging.info("Finished Site: %s" % site.name)
    setup_logging(increment=False)
    logging.info("Finished Site: %s" % site.name)
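# Every selector lookup above follows the same pattern: prefer the
# site-specific CSS selector, fall back to the auto-extracted value. A small
# sketch of that pattern as a standalone helper; extract_field and the
# field-index names are ours, for illustration only (the indices mirror the
# field=0..3 filters above).
TITLE, AUTHORS, PUB_DATE, MOD_DATE = range(4)

def extract_field(article, site, field, fallback=None):
    '''Return the value of the site's CSS selector for `field`,
    or `fallback` when no selector matches. Illustrative helper.'''
    selectors = site.referringsitecssselector_set.filter(field=field)
    return article.evaluate_css_selectors(selectors) or fallback

# usage, assuming the same article/site objects as above:
# title = extract_field(article, site, TITLE, fallback=article.title)
# pub_date = extract_field(article, site, PUB_DATE) or get_pub_date(article)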
def next(self):
    '''(Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
        try:
            current_level = 0
            while True:
                if self.limit > 0 and self.visited_count > self.limit:
                    raise StopIteration('Limit reached: {:d}'.format(self.limit))
                # if(self.pages_visited > self.probabilistic_n):
                #     raise StopIteration
                # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                # row = self.cursor.fetchone()
                # if(row):
                #     row_id = row[0]
                #     current_url = row[1]
                #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                # else:
                #     raise StopIteration
                # if(self._should_skip()):
                #     logging.info(u"skipping {0} randomly".format(current_url))
                #     continue
                try:
                    if self.site.is_shallow:
                        # shallow crawls track (url, level) pairs
                        current = self.to_visit.get_nowait()
                        current_url = current[0]
                        current_level = current[1]
                        logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                    else:
                        current_url = self.to_visit.get_nowait()
                except Empty:
                    # queue exhausted: mark the site shallow, reseed the crawl
                    # from the root url, and reset the on-disk ignore filter
                    self.site.is_shallow = True  # TODO: this shallow reset is marked for deletion
                    self.to_visit.put((self.site.url, str(0)))
                    self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                    ignore_filter_file.close()
                    os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                    logging.info("stopped iteration")
                    logging.info(u"{0}".format(self.site.url))
                    # sentinel exception: tells the caller the crawl was reset
                    raise ZeroDivisionError

                logging.info(u"visiting {0}".format(current_url))
                self.visited_count += 1
                # use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()
                if self.site.is_shallow and int(current_level) > self.level:
                    continue
                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        # blank the fragment (index 5), then canonicalize
                        parsed_as_list[5] = ''
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if not parsed_url.netloc.endswith(self.domain):
                        continue
                    # If the url has already been added to the ignore list, skip it
                    if url in self.ignore_filter:
                        continue
                    # Ignore subscribe links (common across many domains) unless
                    # "subscribe" only appears as part of a hyphenated slug
                    if u"subscribe" in url and not (u"-subscribe" in url or u"subscribe-" in url):
                        continue
                    # Append the url to the to_visit queue, with its level when
                    # crawling shallowly
                    if self.site.is_shallow:
                        self.to_visit.put((url, str(int(current_level) + 1)))
                        logging.info(u"added {0} to to_visit at level {1}".format(
                            url, str(int(current_level) + 1)))
                    else:
                        self.to_visit.put(url)
                        logging.info(u"added {0} to to_visit".format(url))
                    # Record the url so duplicates are skipped, both in memory
                    # and on disk
                    self.ignore_filter.add(url)
                    ignore_filter_file.write(url.encode('utf8') + "\n")
                # Update the queue
                self.to_visit.task_done()
                return article
        except StopIteration:
            raise
        except Exception:
            raise  # propagate everything else unchanged
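# This crawler pairs an in-memory ScalableBloomFilter with an append-only
# text file so the set of seen urls can survive restarts. A minimal sketch
# of how such a filter could be rebuilt on startup; the load_ignore_filter
# helper is ours, the pybloom import is assumed (the crawler's own import
# is not shown), and we assume the one-url-per-line file format written above.
import os
from pybloom import ScalableBloomFilter  # assumed source of ScalableBloomFilter

def load_ignore_filter(path):
    '''Rebuild the bloom filter from the append-only url log, if present.
    Illustrative helper, not part of the crawler.'''
    bloom = ScalableBloomFilter(initial_capacity=10000000, error_rate=0.00001)
    if os.path.exists(path):
        with open(path, 'r') as f:
            for line in f:
                url = line.strip()
                if url:
                    bloom.add(url.decode('utf8'))
    return bloom

# usage:
# self.ignore_filter = load_ignore_filter(
#     '../ignore_filter/' + self.site.name + '_ignore_file.txt')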
def parse_articles(referring_sites, db_keywords, source_sites, twitter_accounts_explorer):
    """
    (list of dict, list of str, list of str, list of str) -> None
    Downloads each article from every referring site, extracts it, and
    compares it against the foreign sites and keywords provided. Articles
    that match are stored in the Django database.

    Keyword arguments:
    referring_sites           -- list of site dicts with "name", "url",
                                 "type" and "filter" keys
    db_keywords               -- list of keywords
    source_sites              -- list of foreign sites
    twitter_accounts_explorer -- list of twitter accounts
    """
    added, updated, failed, no_match = 0, 0, 0, 0
    # for each article on each site, download and parse the important data
    for site in referring_sites:
        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type: %i" % (site['name'], site['type']))
        # 0 = newspaper, 1 = crawler, 2 = both
        if site["type"] == 0 or site["type"] == 2:
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                               memoize_articles=False,
                                               keep_article_html=True,
                                               fetch_images=False,
                                               language='en',
                                               number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if site["type"] == 1 or site["type"] == 2:
            crawlersource_articles = Crawler.Crawler(site["url"], site["filter"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(
                crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles)
        processed = 0
        for article in article_iterator:
            # all the iteration bookkeeping sits at the top because this
            # loop uses continue extensively
            processed += 1
            # Check for any new command on the communication stream
            check_command()
            if url_in_filter(article.url, site["filter"]):
                logging.info("{0} matches a filter, skipping".format(article.url))
                continue
            print("%s (Article|%s) %i/%i \r" % (str(timezone.localtime(timezone.now()))[:-13],
                                                site["name"], processed, article_count))
            logging.info("Processing %s" % article.url)
            url = article.url
            # strip the "www." prefix so urls compare consistently
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            # Try to download and extract the useful data
            if not article.is_downloaded:
                if not article.download():
                    logging.warning("article skipped because download failed")
                    continue
            article.preliminary_parse()
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keywords from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within the article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            # [] converts to False
            if (not keywords) or (not sources[0]) or (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            article.newspaper_parse()
            authors = article.authors
            pub_date = get_pub_date(article)
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # The article is new to the database, so add it
                db_article = Article(title=article.title, url=url,
                                     domain=site["url"],
                                     date_added=timezone.localtime(timezone.now()),
                                     date_published=pub_date)
                db_article.save()
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     matched=True,
                                                     local=(source[1] in site["url"]))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     matched=False,
                                                     local=(source[1] in site["url"]))
                added += 1
            else:
                logging.info("Modifying existing Article in the DB")
                # The article already exists, so update every field except
                # date_added
                db_article = db_article_list[0]
                db_article.title = article.title
                db_article.url = url
                db_article.domain = site["url"]
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_published = pub_date
                db_article.save()
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         matched=True,
                                                         local=(source[1] in site["url"]))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         matched=False,
                                                         local=(source[1] in site["url"]))
            warc_creator.create_article_warc(url)
        logging.info("Finished Site: %s" % site['name'])
        print("%s (Article|%s) %i/%i " % (str(timezone.localtime(timezone.now()))[:-13],
                                          site["name"], processed, article_count))
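# Both parsers above splice the "www." prefix out of urls with fixed offsets
# (url[:7] + url[11:]). A sketch of the same normalization as a reusable
# helper; the strip_www name is ours, and it uses startswith rather than a
# substring test, which also guards against "www." appearing mid-url.
def strip_www(url):
    '''Remove a leading "www." from an http(s) url so that
    http://www.example.com and http://example.com compare equal.'''
    if url.startswith('http://www.'):
        return 'http://' + url[len('http://www.'):]
    if url.startswith('https://www.'):
        return 'https://' + url[len('https://www.'):]
    return url

# strip_www('http://www.example.com/a')  -> 'http://example.com/a'
# strip_www('https://example.com/a')     -> 'https://example.com/a' (unchanged)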
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'Frontend')))

import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'Frontend.settings'

# For Models connecting with the Django Database
from explorer.models import *
from ExplorerArticle import ExplorerArticle

if __name__ == "__main__":
    site_id = raw_input("site id: ")
    django.setup()
    site = ReferringSite.objects.get(pk=site_id)
    url = raw_input("url: ")
    article = ExplorerArticle(url)
    article.download()
    article.preliminary_parse()
    article.newspaper_parse()
    # group the site's CSS selectors by the field they extract
    fields = {}
    for css in site.referringsitecssselector_set.all():
        if css.field_choice not in fields:
            fields[css.field_choice] = []
        fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex})
    if not fields:
        print "no fields"
    for key, value in fields.iteritems():
        print "field \"{0}\"".format(key)
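# The selector-grouping loop above is the usual group-by-key pattern;
# collections.defaultdict expresses it a bit more directly. A sketch using
# the same attribute names as above:
from collections import defaultdict

fields = defaultdict(list)
for css in site.referringsitecssselector_set.all():
    fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex})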