Example #1
def parse_articles_per_site(db_keywords, source_sites, twitter_accounts_explorer, site):

    logging.info("Started multiprocessing of Site: %s", site.name)
    #Setup logging for this site
    setup_logging(site.name)

    article_count = 0
    newspaper_articles = []
    crawlersource_articles = []
    logging.info("Site: %s Type:%i"%(site.name, site.mode))
    #0 = newspaper, 1 = crawler, 2 = both

    if(site.mode == 0 or site.mode == 2):
        logging.disable(logging.ERROR)
        newspaper_source = newspaper.build(site.url,
                                         memoize_articles=False,
                                         keep_article_html=True,
                                         fetch_images=False,
                                         number_threads=1)
        logging.disable(logging.NOTSET)
        newspaper_articles = newspaper_source.articles
        article_count += newspaper_source.size()
        logging.info("populated {0} articles using newspaper".format(article_count))
    if(site.mode == 1 or site.mode == 2):
        crawlersource_articles = Crawler.Crawler(site)
        article_count += crawlersource_articles.probabilistic_n
        logging.debug("expecting {0} from plan b crawler".format(crawlersource_articles.probabilistic_n))
    article_iterator = iter(itertools.chain(newspaper_articles, crawlersource_articles))
    processed = 0

    filters = site.referringsitefilter_set.all()
    while True:
        try:
            try:
                article = next(article_iterator)
            except StopIteration:
                break
            # Iteration is done manually at the top because `continue` is used extensively in this loop.
            processed += 1

            if url_in_filter(article.url, filters):
                logging.info("Matches with filter, skipping the {0}".format(article.url))
                continue

            print(
                "%s (Article|%s) %i/%i          \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site.name, processed, article_count))
            logging.info("Processing %s"%article.url)

            url = article.url
            # Normalize the URL by stripping the "www." prefix
            if url.startswith('http://www.'):
                url = url[:7] + url[11:]
            elif url.startswith('https://www.'):
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            logging.debug("ExplorerArticle Created")
            # Try to download and extract the useful data
            if(not article.is_downloaded):
                if(not article.download()):
                    logging.warning("article skipped because download failed")
                    continue
            url = article.canonical_url

            if (not article.is_parsed):
                if (not article.preliminary_parse()):
                    logging.warning("article skipped because parse failed")
                    continue

            logging.debug("Article Parsed")
            
            logging.debug(u"Title: {0}".format(repr(article.title)))
            if not article.title:
                logging.info("article missing title, skipping")
                continue

            if not article.text:
                logging.info("article missing text, skipping")
                continue

            # Regex the keyword from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))

            if((not keywords) and (not sources[0]) and (not twitter_accounts[0])):  # empty lists are falsy
                logging.debug("skipping article because it's not a match")
                continue

            article.newspaper_parse()
            text = article._newspaper_text
            # Rerun the get_keywords with text parsed by newspaper.
            keywords = get_keywords(article, db_keywords)

            if((not keywords) and (not sources[0]) and (not twitter_accounts[0])):  # empty lists are falsy
                logging.debug("skipping article because it's not a match")
                continue
                
            logging.info("match found")

            #load selectors from db!
            #parameter is a namedtuple of "css" and "regex"
            title = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=0)) or article.title
            authors = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=1))
            if(authors):
                authors = [authors]
            else:
                authors = article.authors
            pub_date = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=2))
            if(pub_date):
                pub_date = dateutil.parser.parse(pub_date)
            else:
                pub_date = get_pub_date(article)
            mod_date = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=3))

            language = article.language

            date_now = timezone.localtime(timezone.now())

            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # If the db_article is new to the database,
                # add it to the database
                db_article = Article(title=title, url=url,
                                     domain=site.url,
                                     date_added=date_now,
                                     date_last_seen=date_now,
                                     date_published=pub_date,
                                     date_modified=mod_date,
                                     language=language,
                                     text=text)
                db_article.save()
                # save() assigns the primary key, so the related sets below
                # can be created directly without re-fetching the row.

                for key in keywords:
                    db_article.keyword_set.create(name=key)

                for author in authors:
                    db_article.author_set.create(name=author)

                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account, matched=True)

                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account, matched=False)

                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     anchor_text=source[2],
                                                     matched=True,
                                                     local=(source[1] in site.url))

                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     anchor_text=source[2],
                                                     matched=False,
                                                     local=(source[1] in site.url))

            else:
                logging.info("Modifying existing Article in the DB")
                # If the db_article already exists,
                # update all fields except date_added
                db_article = db_article_list[0]
                db_article.title = title
                db_article.url = url
                db_article.domain = site.url
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_last_seen = date_now
                db_article.date_published = pub_date
                db_article.date_modified = mod_date
                db_article.language = language
                db_article.text = text
                db_article.save()

                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)

                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)

                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=True)

                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=False)

                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         anchor_text=source[2],
                                                         matched=True,
                                                         local=(source[1] in site.url))

                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         anchor_text=source[2],
                                                         matched=False,
                                                         local=(source[1] in site.url))

            warc_creator.enqueue_article(url)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            logging.exception("Unhandled exception while crawling: %s", e)

    logging.info("Finished Site: %s"%site.name)
    setup_logging(increment=False)
    logging.info("Finished Site: %s"%site.name)
Example #2
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..',
                                             'Frontend')))
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'Frontend.settings'
# For Models connecting with the Django Database
from explorer.models import *
from ExplorerArticle import ExplorerArticle

if __name__ == "__main__":
    site_id = input("site id: ")
    django.setup()
    site = ReferringSite.objects.get(pk=site_id)
    url = input("url: ")
    article = ExplorerArticle(url)
    article.download()
    article.preliminary_parse()
    article.newspaper_parse()

    fields = {}
    for css in site.referringsitecssselector_set.all():
        if css.field_choice not in fields:
            fields[css.field_choice] = []

        fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex})
    if not fields:
        print("no fields")

    for key, value in fields.items():
        print("field \"{0}\"".format(key))
        print(article.evaluate_css_selectors(value))
        print()
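
For context, evaluate_css_selectors belongs to ExplorerArticle and is not shown above. A minimal sketch of the idea, assuming the article keeps an lxml.html tree in self._html_tree and each selector is a dict with a CSS 'pattern' plus an optional 'regex' used to post-filter the matched text (this matches the dicts built in Example #2; the attribute name _html_tree is an assumption):

import re

def evaluate_css_selectors(self, selectors):
    # Sketch only: self._html_tree is assumed to be an lxml.html element tree.
    for selector in selectors:
        for element in self._html_tree.cssselect(selector['pattern']):
            text = element.text_content().strip()
            if not selector['regex']:
                return text
            match = re.search(selector['regex'], text)
            if match:
                return match.group(0)
    # Fall through to None, which lets callers chain a fallback,
    # e.g. `... or article.title` as in Example #1.
    return None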