def distill_article_interactions(session, user, data):
    """

        extracts info from user_activity_data

    :param session:
    :param event:
    :param value:
    :param user:
    """

    event = data['event']
    value = data['value']
    article_id = int(data['article_id'])

    log(f'event is: {event}')

    if "UMR - OPEN ARTICLE" in event:
        article_opened(session, article_id, user)
    elif "UMR - LIKE ARTICLE" in event:
        article_liked(session, article_id, user, True)
    elif "UMR - UNLIKE ARTICLE" in event:
        article_liked(session, article_id, user, False)
    elif "UMR - USER FEEDBACK" in event:
        article_feedback(session, article_id, user, value)
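# Usage sketch (illustrative, not part of the original module): shows the
# shape of the `data` payload consumed above; the keys match the ones read
# in distill_article_interactions, the concrete values are hypothetical.
def _example_distill_call(session, user):
    data = {
        "event": "UMR - OPEN ARTICLE",
        "value": "",
        "article_id": "42",  # arrives as a string and is parsed with int()
    }
    distill_article_interactions(session, user, data)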
def article_liked(session, article_id, user, like_value):
    from zeeguu.core.emailer.user_activity import send_notification_article_feedback
    article = Article.query.filter_by(id=article_id).one()
    ua = UserArticle.find(user, article)
    ua.liked = like_value
    session.add(ua)
    session.commit()
    log(f"{ua}")
    send_notification_article_feedback('Liked', user, article.title,
                                       article.url.as_string(), article.id)
def article_search_for_user(user, count, search_terms):
    try:
        return elastic_article_search_for_user(user, count, search_terms)
    except elasticsearch.exceptions.ConnectionError:
        # Elasticsearch is unreachable; log the traceback and fall back
        # to the DB-based mixed search.
        log(ES_DOWN_MESSAGE)
        log(traceback.format_exc())

    return mixed_article_search_for_user(user, count, search_terms)
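# Usage sketch (illustrative): the Elasticsearch-backed search is tried first,
# and the mixed search is used as a fallback when ES is unreachable.
# The count and search terms below are hypothetical example values.
def _example_article_search(user):
    return article_search_for_user(user, 20, "climate change")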
def article_opened(session, article_id, user):
    article = Article.query.filter_by(id=article_id).one()
    ua = UserArticle.find(user, article)
    if not ua:
        ua = UserArticle.find_or_create(session,
                                        user,
                                        article,
                                        opened=datetime.now())
    # Refresh the opened timestamp even if the user-article entry already existed.
    ua.opened = datetime.now()
    session.add(ua)
    session.commit()
    log(f"{ua}")
def retrieve_articles_from_all_feeds():
    all_feeds = RSSFeed.query.all()
    all_feeds_count = len(all_feeds)
    for counter, feed in enumerate(all_feeds, start=1):
        try:
            msg = f"*** >>>>>>>>> {feed.title} ({counter}/{all_feeds_count}) <<<<<<<<<< "
            log("")
            log(msg)

            download_from_feed(feed, zeeguu.core.db.session)

        except Exception:
            traceback.print_exc()
def download_from_feed(feed: RSSFeed,
                       session,
                       limit=1000,
                       save_in_elastic=True):
    """

    Session is needed because this saves stuff to the DB.


    last_crawled_time is useful because otherwise there would be a lot of time
    wasted trying to retrieve the same articles, especially the ones which
    can't be retrieved, so they won't be cached.


    """

    downloaded = 0
    skipped_due_to_low_quality = 0
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"LAST CRAWLED::: {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items(last_retrieval_time_from_DB)
    except Exception as e:
        log(f"Failed to download feed ({e})")
        from sentry_sdk import capture_exception

        capture_exception(e)
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        feed_item_timestamp = feed_item["published_datetime"]

        if _date_in_the_future(feed_item_timestamp):
            log("Article from the future!")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                feed_item_timestamp > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = feed_item_timestamp

        if (feed.last_crawled_time is None
                or last_retrieval_time_seen_this_crawl > feed.last_crawled_time):
            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
            log(f"+updated feed's last crawled time to {last_retrieval_time_seen_this_crawl}")

        session.add(feed)
        session.commit()

        try:
            new_article = download_feed_item(session, feed, feed_item)
            downloaded += 1
        except SkippedForTooOld:
            log("- Article too old")
            continue
        except SkippedForLowQuality as e:
            log(f" - Low quality: {e.reason}")
            skipped_due_to_low_quality += 1
            continue
        except SkippedAlreadyInDB:
            skipped_already_in_db += 1
            log(" - Already in DB")
            continue

        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)

            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

        # Save the new article to Elasticsearch as well.
        # Everything should be stored both in SQL and in Elasticsearch,
        # since Elasticsearch is not treated as persistent storage.
        try:
            if save_in_elastic:
                if new_article:
                    es = Elasticsearch(ES_CONN_STRING)
                    doc = document_from_article(new_article, session)
                    res = es.index(index=ES_ZINDEX,
                                   id=new_article.id,
                                   body=doc)
                    print("elastic res: " + res["result"])
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)

            log("***OOPS***: ElasticSearch seems down?")
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

    log(f"*** Downloaded: {downloaded} From: {feed.title}")
    log(f"*** Low Quality: {skipped_due_to_low_quality}")
    log(f"*** Already in DB: {skipped_already_in_db}")
    log(f"*** ")
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:

        url = _url_after_redirects(feed_item["url"])
        log(url)

    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item["title"]

    published_datetime = feed_item["published_datetime"]

    try:
        art = model.Article.find(url)
    except Exception as e:
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(e)}"
        )

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item["summary"]
        # However, the summary is not always usable as-is: there have been
        # cases where it is just malformed HTML, so we extract the text:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create the new article and save it to the DB
        new_article = zeeguu.core.model.Article(
            Url.find_or_create(session, url),
            title,
            ", ".join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language,
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        from sentry_sdk import capture_exception

        capture_exception(e)

        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
            )
        session.rollback()

    return new_article
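# Usage sketch (illustrative): download_feed_item expects a feed_item dict with
# at least the 'url', 'title', 'summary' and 'published_datetime' keys read
# above; the values below are placeholders.
def _example_download_one_item(session, feed):
    feed_item = {
        "url": "https://example.com/some-article",
        "title": "Example title",
        "summary": "Example summary",
        "published_datetime": datetime.now(),
    }
    try:
        return download_feed_item(session, feed, feed_item)
    except SkippedAlreadyInDB:
        log(" - already in the DB")
    except SkippedForLowQuality as e:
        log(f" - low quality: {e.reason}")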