示例#1
0
文件: ranker.py 项目: gogasca/news_ml
def process_articles(campaign, limit=100):
    """Rank the available news articles and store the rank of the top ones.

    Articles come from get_and_rank_news_articles(), are ordered with
    sort_articles() and the first ``limit`` of them are written through
    DbHelper.update_ranked_post with a 1-based rank order.

    :param campaign: Accepted for interface compatibility; not used here.
    :param limit: Maximum number of ranked articles to store. Must be an
        integer greater than or equal to 1.
    :return: None
    :raises ValueError: If ``limit`` is not an integer or is lower than 1.
    """
    # Validate first so an invalid limit fails before any ranking work. The
    # previous check used ``and`` and could never reject a bad integer.
    if not isinstance(limit, int) or limit < 1:
        raise ValueError('Invalid limit')
    ranked_articles = get_and_rank_news_articles()
    sorted_articles = sort_articles(ranked_articles)
    # Pre-set the counter so the final log line works with zero articles.
    processed = 0
    for processed, article in enumerate(sorted_articles[:limit], 1):
        logging.info(article)
        DbHelper.update_ranked_post(news_id=article.news_id,
                                    rank_score=article.score,
                                    rank_order=processed)
    logging.info('Process %d articles', processed)
示例#2
0
    def terminate(self, status=1):
        """Finish the campaign: send the email report when reporting is
        enabled, then write the end timestamp and status to the database.

        :param status: Integer status stored in the campaign row.
        :return: None
        """
        if self.send_report:
            logging.info('Sending Email report')
            report = self.report
            report.email_recipients = self.email_recipients
            report.send()

        reference = self.campaign_reference
        if not reference:
            # Without a campaign reference there is no row to update.
            return

        # NOTE(review): the statement is built by string interpolation;
        # confirm the campaign reference can never be user-controlled.
        statement = ("UPDATE campaign SET campaign_end='%s', status=%d "
                     "WHERE campaign.reference='%s'")
        DbHelper.update_database(
            statement % (settings.DBNOW, status, reference))
示例#3
0
def person_top(request):
    """Return each person name with its mention count, highest count first.

    :param request: Incoming request object; not used by the query.
    :return: Records returned by DbHelper.get_multiple_records.
    """
    query = ("SELECT COUNT(id) AS mentions,name FROM persons GROUP BY\n"
             "    name ORDER BY mentions DESC;")
    return DbHelper.get_multiple_records(query)
示例#4
0
def get_articles_by_date(date):
    """Fetch the news records for ``date`` using the clustering query.

    :param date: Value interpolated into settings.CLUSTERING_QUERY_GET_NEWS.
    :return: Records returned by DbHelper.get_multiple_records.
    """
    logging.info('Looking news for %s', date)
    query = settings.CLUSTERING_QUERY_GET_NEWS % date
    found = DbHelper.get_multiple_records(query)
    logging.info('Total news found: %s', len(found))
    return found
示例#5
0
def person_list(limit=10):
    """Return the most mentioned valid persons.

    :param limit: Maximum number of rows to return. Values that cannot be
        converted to an integer, or that are lower than 1, fall back to 10.
    :return: Records returned by DbHelper.get_multiple_records.
    """
    # The limit is interpolated into the SQL text, so force it to a plain
    # integer first; this keeps arbitrary strings out of the statement.
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        limit = 10
    if limit < 1:
        limit = 10
    sqlquery = """SELECT COUNT(id) AS mentions,name FROM persons WHERE
    valid=True GROUP BY name ORDER BY mentions DESC LIMIT %s;""" % limit
    return DbHelper.get_multiple_records(sqlquery)
示例#6
0
def user_list(limit=10):
    """Return the most recently created API users.

    :param limit: Maximum number of rows to return. Values that cannot be
        converted to an integer, or that are lower than 1, fall back to 10.
    :return: Records returned by DbHelper.get_multiple_records.
    """
    # The limit is interpolated into the SQL text, so force it to a plain
    # integer first; this keeps arbitrary strings out of the statement.
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        limit = 10
    if limit < 1:
        limit = 10
    sqlquery = """SELECT id, username, created FROM api_users GROUP BY id,
    username, created ORDER BY created DESC LIMIT %s;""" % limit
    return DbHelper.get_multiple_records(sqlquery)
示例#7
0
def translate_article(campaign_instance, article, new_article, news_id):
    """Translate the article title and store it for newly inserted articles.

    :param campaign_instance: Campaign providing ``translation_lang``.
    :param article: Article whose ``title`` is translated; ``url`` is logged.
    :param new_article: Truthy when the article was just inserted. The
        database record is only updated in that case.
    :param news_id: Id passed to update_db_query for the record to update.
    :return: The value returned by translate_content (presumably falsy when
        nothing could be translated).
    """
    # Perform translation using Google Translate API
    log.info('Translating...%r', article.url)
    translated_text = translate_content(article.title,
                                        campaign_instance.translation_lang)
    if not translated_text:
        # Used to be reported as "already exists", which hid failed
        # translations of brand new articles.
        log.warning('Translated text is empty, skipping DB update')
    elif not new_article:
        log.warning('Article already exists, skipping DB update')
    else:
        # Single quotes are doubled before the text is handed to
        # update_db_query (presumably embedded in a SQL string literal).
        sql_query = update_db_query(translated_text.replace("'", "''"),
                                    settings.DEFAULT_LANGUAGE, news_id)
        DbHelper.update_database(sql_query)
    return translated_text
示例#8
0
文件: ranker.py 项目: gogasca/news_ml
def get_articles_by_date(date):
    """Fetch the news records for ``date`` using the ranking query.

    :param date: Value interpolated into
        settings.RANKING_QUERY_GET_NEWS_BY_DATE.
    :return: Non-empty result of DbHelper.get_multiple_records (documented
        by the original author as a list of RankedArticle).
    :raises ValueError: When the query returns nothing.
    """
    logging.info('Looking news for %s', date)
    query = settings.RANKING_QUERY_GET_NEWS_BY_DATE % date
    records = DbHelper.get_multiple_records(query)
    if not records:
        raise ValueError('No news found')
    logging.info('Total news found: %s', len(records))
    return records
示例#9
0
def person_filter(request):
    """Look up the mention count of the person named in the request.

    :param request: Object exposing ``args.get``; the ``name`` argument is
        read from it.
    :return: Records returned by DbHelper.get_multiple_records, or None when
        no name longer than one character was supplied.
    """
    requested_name = request.args.get('name')
    if not requested_name or len(requested_name) <= 1:
        return None

    # NOTE(review): request input is embedded in the SQL text and only single
    # quotes are escaped; prefer bound parameters if DbHelper supports them.
    escaped_name = requested_name.replace("'", "''")
    query = ("SELECT COUNT(id) AS mentions,name FROM persons WHERE\n"
             "        name='%s' AND valid=True GROUP BY name ORDER BY "
             "mentions DESC;") % escaped_name
    return DbHelper.get_multiple_records(query)
示例#10
0
    def get_articles(self, date='latest'):
        """Read posts from database into ``self.articles``.

        :param date: Date string, or 'latest' to use the value returned by
            DbHelper.get_record(settings.CLUSTERING_QUERY_DATE).
        :return: ``self.articles``. It is left unchanged when the date does
            not start with a YYYY-M(M)-DD pattern.
        """

        # Get latest date for News inserted in Database.
        if date == 'latest':
            date = DbHelper.get_record(settings.CLUSTERING_QUERY_DATE)
            logging.info('Using latest date. The latest date found was: %s',
                         date)
        logging.info('Using date: %s', date)
        # Raw string: '\d' inside a normal literal is an invalid escape.
        if re.match(r'\d{4}-\d{1,2}-\d{2}', date):
            self.articles = get_articles_by_date(date)
            logging.info('Found %d articles.', len(self.articles))
        else:
            # Make the silent no-op visible to whoever reads the logs.
            logging.warning('Date %r has an unexpected format, articles '
                            'were not reloaded', date)
        return self.articles
示例#11
0
def process_entities(article, news_id, db_update=True):
    """Run NLP entity analysis on an article and optionally persist results.

    :param article: Article exposing ``title``, ``description`` and ``url``.
    :param news_id: Id of the stored news record; tags are only associated
        with it when both this id and the inserted tag id are truthy.
    :param db_update: When truthy, tags, persons and organizations are
        written through DbHelper.
    :return: dict with ``persons`` and ``organizations`` keys, or None when
        no entities were found.
    """
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = settings.CREDENTIALS
    log.info('process_entities() Processing content for : %s using NLP',
             article.url)
    # The repr of title and description is what gets analyzed.
    text = '%r %r' % (article.title, article.description)
    entities = nlp.analyze_entities(text)
    num_of_entities = len(entities) if entities else 0
    if num_of_entities < 1:
        log.error('No NLP entities found')
        return None

    log.info('Processing %d entities: %s', num_of_entities, article.url)

    # Extract tags and associate them with original article.
    log.info('Processing article tags: %s', article.url)
    tags = nlp_utils.extract_tags(entities)
    log.info('Found %d tags', len(tags))

    log.info('Processing persons in tags: %s', article.url)
    persons = nlp_utils.extract_entity(entities, 'PERSON')
    log.info('Found %d persons', len(persons))

    log.info('Processing organizations in tags: %s', article.url)
    organizations = nlp_utils.extract_entity(entities, 'ORGANIZATION')
    log.info('Found %d organizations', len(organizations))

    result = {'persons': persons, 'organizations': organizations}
    if not db_update:
        return result

    log.info('Updating database...')
    for tag in tags:
        tag_id = DbHelper.insert_tag(tag_name=tag)
        if news_id and tag_id:
            DbHelper.associate_tag_news(news_id, tag_id)
    for person in persons:
        DbHelper.insert_person(person)
    for organization in organizations:
        DbHelper.insert_company(organization)
    return result
示例#12
0
def process_articles(articles, news_provider, campaign_instance):
    """Store, analyze and publish the articles retrieved from News API.

    For each article: skip it when it has no description; when its URL is
    not yet in the database, run sentiment analysis and insert it;
    optionally extract entities, translate the title, add the article to the
    email report and queue a tweet. The report and the tweets are sent once
    after all articles have been handled.

    :param articles: dict whose values are the article objects to process.
    :param news_provider: Provider name used in the report subject.
    :param campaign_instance: Campaign configuration (reporting, translation
        and Twitter switches, recipients, reference).
    :return: None
    """
    tweets = []
    num_of_articles = len(articles)
    campaign_instance.set_articles(num_of_articles)
    report = Report.get_report(subject='News ML | %s' % news_provider)

    log.info('Analyzing %d articles...', num_of_articles)
    if num_of_articles < 1:
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        log.error('No titles found')
        return
    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)

    # Attach recipients to the report.
    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients

    for article in articles.values():
        # Reset per-article state so values from a previous iteration are
        # never reused for a different article, and so ``entities`` is always
        # bound even when entity processing is disabled.
        new_article = False
        news_id = None
        entities = None
        translated_text = None
        if not article.description:
            log.error('Not description found in article: %s', article.url)
            continue
        if settings.EXTRACT_TWITTER_IMAGE:  # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)
        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r',
                     article.title, article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content or article.description)
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    author=article.author,
                    description=article.description,
                    content=article.content,
                    url=article.url,
                    url_to_image=article.url_to_image,
                    source_id=article.source_id,
                    source=article.source,
                    campaign=campaign_instance.reference,
                    published_at=article.published_at,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
            except (ValueError, UnicodeDecodeError) as exception:
                # NOTE(review): processing deliberately continues here with
                # news_id=None, as before; confirm downstream DB updates
                # tolerate a missing id.
                log.exception(exception)
            new_article = True
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, True)
        else:
            log.warning('Article %r already exists ', article.url)
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, False)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            # Guard against a None/empty translation before calling len().
            if translated_text and len(translated_text) > 1:
                log.info('Adding translated content to report.')
                article.title = translated_text
            else:
                log.error('Translated text is empty.')

        if campaign_instance.send_report:
            # Only send today articles in Report.
            today = datetime.now().date()
            published_at = datetime.strptime(article.published_at[:10],
                                             '%Y-%m-%d').date()
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        # Handle Twitter
        if campaign_instance.twitter:
            # Fall back to the article title when no translation is
            # available, instead of tweeting the text "None".
            tweet_text = article.title
            if campaign_instance.translation_enable and translated_text:
                tweet_text = translated_text
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text, entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending report via email...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)

    log.info('Extraction completed')
0
def launch(campaign_instance=None):
    """
    The logic is as follows:
        1. Extract Articles from <Provider: (Techmeme, Techcrunch)> web page
        2. For each article extract title, short url and content.
        3. Translate Article information
        4. Use Google NLP to extract meaningful keywords from content
        5. Insert record in database.
        6. Send report.

    :param campaign_instance: Campaign configuration. Its attributes are
        read unconditionally, so a real instance is required despite the
        None default.
    :return: None
    """
    tweets = []
    report = Report.get_report(subject=settings.TECHMEME_REPORT)

    log.info('Translation enabled: %s', campaign_instance.translation_enable)
    log.info('Email reporting enabled: %s', campaign_instance.send_report)
    log.info('Twitter enabled: %s', campaign_instance.twitter)
    log.info('Twitter image extraction: %s', settings.EXTRACT_TWITTER_IMAGE)
    log.info('Twitter add hash tags: %s', settings.TWITTER_ADD_HASHTAGS)

    articles = extract_articles(settings.TECHMEME_URL)
    num_of_articles = len(articles) if articles else 0

    log.info('Retrieving %d articles...', num_of_articles)
    # Only slice when there is something to slice: extract_articles may
    # return nothing, and calling .items() on that would crash.
    if num_of_articles and campaign_instance.limit > 0:
        log.warning('Limit is defined. Skipping other news')
        articles = dict(
            itertools.islice(articles.items(), campaign_instance.limit))
        num_of_articles = len(articles)
    if num_of_articles < 1:
        log.error('No articles found')
        if campaign_instance.send_report:
            log.warning('Skipping report via email...')
        return
    log.info('Processing %d articles...', num_of_articles)
    campaign_instance.set_articles(num_of_articles)

    # Attach recipients to the report.
    if campaign_instance.send_report:
        report.email_recipients = campaign_instance.email_recipients
    for article in articles.values():
        # Reset per-article state so values from a previous iteration are
        # never reused for a different article.
        new_article = False
        news_id = None
        entities = None
        translated_text = None
        if not article.title:
            log.warning('No title found. Article won\'t be inserted')
            continue
        if settings.EXTRACT_TWITTER_IMAGE:  # meta:twitter:image
            article.twitter_image = twitter_utils.get_twitter_element(
                article.url, 'twitter:image')
        log.info('Article: %s, [%s], [%s]', article.title, article.url,
                 article.twitter_image)
        if not DbHelper.record_exists(article.url):
            log.info('New Article retrieved: %r, %r',
                     article.title, article.url)
            try:
                log.info('Processing sentiment analysis')
                score, magnitude = nlp_utils.get_sentiment_scores(
                    article.content)
                source = url_extract.get_domain(article.url) or ''
                log.info('Insert article into Database')
                news_id = DbHelper.insert_news(
                    title=article.title,
                    content=article.content,
                    url=article.url,
                    provider=settings.TECHMEME,
                    source=source.upper(),
                    source_id=source,
                    campaign=campaign_instance.reference,
                    score=score,
                    magnitude=magnitude,
                    sentiment=nlp_utils.get_sentiment(score))
                if not news_id:
                    log.error('Unable to insert record %s', article.url)
                    continue
            except (ValueError, UnicodeDecodeError) as e:
                # NOTE(review): processing deliberately continues here with
                # news_id=None, as before; confirm downstream DB updates
                # tolerate a missing id.
                log.exception(e)
            new_article = True
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, True)
        else:
            log.warning('Article already exists.')
            if settings.PROCESS_ENTITIES:
                entities = common_utils.process_entities(
                    article, news_id, False)

        if campaign_instance.translation_enable:
            translated_text = translate_utils.translate_article(
                campaign_instance, article, new_article, news_id)
            if translated_text:
                log.info('Adding translated content to report.')
                article.title = translated_text

        if campaign_instance.send_report:
            # Only send articles created 'today' in Report.
            today = datetime.now().date()
            published_at = datetime.strptime(article.published_at,
                                             '%y%m%d').date()
            if settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Publishing all dates articles')
            log.info('Today: %s Report date: %s. ', today, published_at)
            if today == published_at or settings.REPORT_ALL_DATES_ARTICLES:
                log.info('Adding article information to Report: %s %s',
                         article.title, article.url)
                report.add_content(article.url, article.title,
                                   article.twitter_image)
            else:
                log.warning(
                    'Article published date is not today (%s), '
                    'skipping article from Report', published_at)

        if campaign_instance.twitter:
            # Fall back to the article title when no translation is
            # available, instead of tweeting the text "None".
            tweet_text = article.title
            if campaign_instance.translation_enable and translated_text:
                tweet_text = translated_text
            if settings.TWITTER_ADD_HASHTAGS:
                # TODO (gogasca) Find Twitter handlers
                tweet_text = twitter_utils.add_hash_tags(tweet_text, entities)
            tweets.append('{} {}'.format(tweet_text, article.url))

    if campaign_instance.send_report:
        log.info('Sending email notification...')
        report.send()

    if campaign_instance.twitter:
        log.info('Sending Tweets...')
        twitter_utils.send_tweets(tweets, campaign_instance.twitter_delay)

    log.info('Extraction completed')
0
def main():
    """Entry point: invoke DbHelper.test_connection()."""
    DbHelper.test_connection()