Exemplo n.º 1
0
    def find_url_in_extra_data(self):
        """
        Look for the url associated with this event inside ``extra_data``.

        DB structure is a mess: there is no convention for where the url
        associated with an event is stored. Thus the JSON payload in
        ``self.extra_data`` is probed for the known keys, in priority order:
        ``articleURL`` first, then ``url``.

        NOTE: This can be solved by creating a new column called url and
        writing the url only there.

        returns: the result of ``Url.extract_canonical_url`` for the url that
                 was found, or None when no url could be extracted
        """
        raw = self.extra_data
        if not raw or raw in ("{}", "null"):
            # The extra_data field is empty
            return None

        try:
            extra_event_data = json.loads(raw)

            # Sometimes the payload is an int (or another non-object value);
            # only a JSON object can carry one of the url keys.
            if not isinstance(extra_event_data, dict):
                return None

            for key in ("articleURL", "url"):
                if key in extra_event_data:
                    return Url.extract_canonical_url(extra_event_data[key])

            return None  # There is no url

        except Exception:
            # Some json strings are truncated and therefore cannot be parsed.
            # The lookup is best-effort, so any ordinary failure means "no url",
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            return None
Exemplo n.º 2
0
    def find_or_create(cls, session, user: User, _url, _title: str, _language):
        """
        Return the object stored for (user, url), creating it when missing.

        The url is resolved through Url.find_or_create and the language
        through Language.find. When no object exists yet, a new one is
        added to the session and committed; the created object is incomplete.

        If creating fails, the exception is reported to Sentry, the session
        is rolled back and the lookup is repeated.

        :param session: DB session used for add / commit / rollback
        :param user: user the object belongs to
        :param _url: url passed to Url.find_or_create
        :param _title: title passed to Url.find_or_create and the constructor
        :param _language: value passed to Language.find
        :return: the existing or the newly created object
        """

        found_language = Language.find(_language)
        resolved_url = Url.find_or_create(session, _url, _title)

        def lookup():
            # Exactly one row is expected for a (user, url) pair.
            return cls.query.filter_by(user=user, url=resolved_url).one()

        try:
            return lookup()
        except NoResultFound:
            try:
                created = cls(user, resolved_url, _title, found_language)
                session.add(created)
                session.commit()
                return created
            except Exception as error:
                from sentry_sdk import capture_exception
                capture_exception(error)
                print("seems we avoided a race condition")
                session.rollback()
                return lookup()
Exemplo n.º 3
0
    def test_find_or_create_works(self):
        """Url.find_or_create returns a Url carrying the requested title."""
        address = self.url_rule.url.as_string()
        expected_title = self.url_rule.url.title

        found = Url.find_or_create(session, address, expected_title)

        self.assertEqual(found.title, expected_title)
Exemplo n.º 4
0
    def delete(cls, session, user, _url):
        """
        Delete the object stored for (user, _url) and commit.

        Best-effort: any failure (unknown url, no single matching row,
        failing commit) is reported to Sentry instead of being raised.

        :param session: DB session used for delete and commit
        :param user: owner of the object to remove
        :param _url: url passed to Url.find
        """
        try:
            target_url = Url.find(_url)
            matches = cls.query.filter_by(user=user, url=target_url)
            session.delete(matches.one())
            session.commit()
        except Exception as error:
            from sentry_sdk import capture_exception
            capture_exception(error)
Exemplo n.º 5
0
    def _localized_topic_keyword_in_url(self, topic: str, localized: str,
                                        keyword: str, url: str):
        """
        Assert that a localized topic with `keyword` matches an article at `url`.

        Builds a Topic and a LocalizedTopic for the user's learned language,
        sets its keywords, points a rule-generated article at `url` and
        asserts that the localized topic matches that article.
        """
        topic_obj = Topic(topic)
        localized_topic = LocalizedTopic(
            topic_obj, self.user.learned_language, localized
        )
        localized_topic.keywords = keyword

        article = ArticleRule().article
        article.url = Url.find_or_create(self.db.session, url)

        assert localized_topic.matches_article(article)
Exemplo n.º 6
0
    def delete(cls, session, user, _url):
        """
        Delete the object stored for (user, _url) and commit.

        Best-effort: any failure (unknown url, no single matching row,
        failing commit) is printed instead of being raised.

        :param session: DB session used for delete and commit
        :param user: owner of the object to remove
        :param _url: url passed to Url.find
        """
        try:
            target_url = Url.find(_url)
            matches = cls.query.filter_by(user=user, url=target_url)
            session.delete(matches.one())
            session.commit()
        except Exception as error:
            print(error)
Exemplo n.º 7
0
    def test_domain_plus_path_must_be_unique(self):
        """Adding a second Url for an already stored address must fail."""

        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title
        _domain = DomainName.get_domain(_url)

        with self.assertRaises(Exception) as context:
            domain = DomainName.find(_domain)
            url = Url(_url, _title, domain)
            session.add(url)
            session.commit()

        # Each marker needs its own membership test: the previous
        # `'Duplicate entry' or 'IntegrityError' in ...` parsed as
        # `'Duplicate entry' or (...)`, which is always truthy.
        error_text = str(context.exception)
        self.assertTrue('Duplicate entry' in error_text
                        or 'IntegrityError' in error_text)
Exemplo n.º 8
0
    def __init__(self):
        """
        Create and save two feeds.

        `rss_feed` (also available as `feed`) comes from
        `_create_model_object`; `feed1` is found or created for the Spiegel
        RSS url with language 'de'.
        """
        super().__init__()

        self.rss_feed = self.feed = self._create_model_object()
        self.save(self.rss_feed)

        german = Language.find_or_create('de')
        spiegel_url = Url.find_or_create(self.db.session, url_spiegel_rss)

        self.feed1 = RSSFeed.find_or_create(
            self.db.session,
            spiegel_url,
            "",
            "",
            icon_name_spiegel,
            language=german,
        )
        self.save(self.feed1)
Exemplo n.º 9
0
def upload_articles(cohort_id):
    """
    uploads articles for a cohort with input from a POST request

    The request body must be a JSON list of objects, each with the keys
    "title", "authors", "content", "summary" and "language_code". Every
    article is stored and mapped to the cohort; everything is committed
    once, after the whole list has been processed.

    Aborts with 401 when has_permission_for_cohort(cohort_id) is falsy and
    with 400 when the body is not valid JSON, is not a list of objects, or
    an article misses one of the required keys.

    returns: "OK" on success
    """
    if not has_permission_for_cohort(cohort_id):
        flask.abort(401)
    try:
        articles = json.loads(request.data)
        if not isinstance(articles, list):
            raise ValueError("expected a JSON list of articles")

        for article_data in articles:
            if not isinstance(article_data, dict):
                raise ValueError("each article must be a JSON object")

            url = Url("userarticle/{}".format(uuid.uuid4().hex))
            title = article_data["title"]
            authors = article_data["authors"]
            content = article_data["content"]
            summary = article_data["summary"]
            published_time = datetime.now()
            language_code = article_data["language_code"]
            language = Language.find(language_code)

            new_article = Article(
                url,
                title,
                authors,
                content,
                summary,
                published_time,
                None,  # rss feed
                language,
            )

            db.session.add(new_article)
            db.session.flush()
            db.session.refresh(new_article)

            cohort = Cohort.find(cohort_id)
            new_cohort_article_map = CohortArticleMap(cohort, new_article)

            db.session.add(new_cohort_article_map)
        db.session.commit()
        return "OK"
    except (ValueError, KeyError):
        # A malformed payload is the client's fault: discard the articles
        # flushed so far and answer 400. flask.abort raises, so nothing
        # can run after it.
        db.session.rollback()
        flask.abort(400)
Exemplo n.º 10
0
    def test_try_to_get_race_condition(self):
        """
        Url.find_or_create still returns the right Url after concurrent calls.

        The worker count is 0 on purpose: multithreaded connections freeze
        on mysqldb, so the threaded part is only meant to be enabled and
        killed manually for now.
        """
        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title

        def threaded_create_url():
            _ = Url.find_or_create(session, _url, _title)

        workers = []

        for _ in range(0):  # raise manually to try to provoke the race
            worker = Thread(target=threaded_create_url, args=())
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()

        found = Url.find_or_create(session, _url, _title)
        self.assertEqual(found.title, _title)
Exemplo n.º 11
0
def upload_articles(cohort_id):
    '''
        uploads articles for a cohort with input from a POST request

        The request body must be a JSON list of objects, each with the keys
        'title', 'authors', 'content', 'summary' and 'language_code'. Every
        article is stored and mapped to the cohort; everything is committed
        once, after the whole list has been processed.

        Aborts with 401 when has_permission_for_cohort(cohort_id) is falsy
        and with 400 when the body is not valid JSON, is not a list of
        objects, or an article misses one of the required keys.

        returns: 'OK' on success
    '''
    if not has_permission_for_cohort(cohort_id):
        flask.abort(401)
    try:
        articles = json.loads(request.data)
        if not isinstance(articles, list):
            raise ValueError('expected a JSON list of articles')

        for article_data in articles:
            if not isinstance(article_data, dict):
                raise ValueError('each article must be a JSON object')

            url = Url('userarticle/{}'.format(uuid.uuid4().hex))
            title = article_data['title']
            authors = article_data['authors']
            content = article_data['content']
            summary = article_data['summary']
            published_time = datetime.now()
            language_code = article_data['language_code']
            language = Language.find(language_code)

            new_article = Article(
                url,
                title,
                authors,
                content,
                summary,
                published_time,
                None,  # rss feed
                language)

            db.session.add(new_article)
            db.session.flush()
            db.session.refresh(new_article)

            cohort = Cohort.find(cohort_id)
            new_cohort_article_map = CohortArticleMap(cohort, new_article)

            db.session.add(new_cohort_article_map)
        db.session.commit()
        return 'OK'
    except (ValueError, KeyError):
        # A malformed payload is the client's fault: discard the articles
        # flushed so far and answer 400. flask.abort raises, so nothing
        # can run after it.
        db.session.rollback()
        flask.abort(400)
Exemplo n.º 12
0
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
        Download the new articles of one feed and save them to the DB.

        Session is needed because this saves stuff to the DB.

        last_crawled_time is useful because otherwise there would be a lot of time
        wasted trying to retrieve the same articles, especially the ones which
        can't be retrieved, so they won't be cached.

        :param feed: feed to crawl; its last_crawled_time is read, and updated
                     whenever a new article has been stored
        :param session: DB session used for adds, commits and rollbacks
        :param limit: stop once this many articles have been downloaded
        :return: None; progress and the final counters are reported via log()
    """
    log(feed.title)

    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items()
    except Exception:
        log("Failed to connect to feed")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # `url` is not assigned when the call above fails (it would be
            # unbound on the first item and stale afterwards), so report
            # the address taken from the feed item itself.
            log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(feed_item['published'],
                                                  SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            log(f"can't get time from {url}: {feed_item['published']}")
            continue

        if _date_in_the_future(this_article_time):
            log("article from the future...")
            continue

        # Everything older than the previous crawl has been seen already.
        if last_retrieval_time_from_DB:

            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        log(url)

        try:
            art = model.Article.find(url)
        except Exception as e:
            log(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}"
                )
            continue

        # Remember the newest publication time seen in this crawl.
        if (not last_retrieval_time_seen_this_crawl) or (
                this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
            log("- already in db")
        else:
            try:

                art = newspaper.Article(url)
                art.download()
                art.parse()
                log("- succesfully parsed")

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(
                    art, skipped_due_to_low_quality)
                if quality_article:
                    # Unused here, but kept: a failing import is handled
                    # by the outer except below, as before.
                    from zeeguu_core.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                    try:
                        # Create new article and save it to DB
                        new_article = zeeguu_core.model.Article(
                            Url.find_or_create(session, url), title,
                            ', '.join(art.authors), cleaned_up_text, summary,
                            this_article_time, feed, feed.language)
                        session.add(new_article)
                        session.commit()
                        downloaded += 1

                        add_topics(new_article, session)
                        log("- added topics")
                        add_searches(title, url, new_article, session)
                        log("- added keywords")
                        session.commit()

                        if last_retrieval_time_seen_this_crawl:
                            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
                        # NOTE(review): the feed is only added here; it is
                        # persisted by whichever commit on this session
                        # comes next - confirm the caller commits.
                        session.add(feed)

                    except Exception as e:
                        log(f'Something went wrong when creating article and attaching words/topics: {e}'
                            )
                        log("rolling back the session... ")
                        session.rollback()

            except Exception as e:
                log(f"Failed to create zeeguu.Article from {url}\n{str(type(e))}")

    log(f'  Skipped due to time: {skipped} ')
    log(f'  Downloaded: {downloaded}')
    log(f'  Low Quality: {skipped_due_to_low_quality}')
    log(f'  Already in DB: {skipped_already_in_db}')
Exemplo n.º 13
0
def download_feed_item(session,
                       feed,
                       feed_item):
    """
    Download one feed item and store it as a new article.

    :param session: DB session used for add, commit and rollback
    :param feed: feed the item belongs to; passed to the new article
                 together with feed.language
    :param feed_item: mapping read for the keys 'url', 'title', 'summary'
                      and 'published_datetime'
    :return: the committed article, or None when creating or storing it
             failed and the session was rolled back
    :raises SkippedAlreadyInDB: an article for the resolved url exists
    :raises SkippedForLowQuality: sufficient_quality rejected the article
    :raises Exception: the url could not be resolved or Article.find failed
    """
    new_article = None

    try:

        url = _url_after_redirects(feed_item['url'])
        log(url)

    except requests.exceptions.TooManyRedirects as e:
        raise Exception("- Too many redirects") from e
    except Exception as e:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}") from e

    title = feed_item['title']
    summary = feed_item['summary']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        raise Exception(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}") from e

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        # Create new article and save it to DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality:
        # Not a failure: let the caller account for the skipped item.
        raise

    except Exception as e:
        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()
        # The article was rolled back; do not hand the caller an object
        # that looks like a stored article.
        new_article = None

    return new_article
Exemplo n.º 14
0
# Load the feed once so that its title and description can be offered as
# defaults below. NOTE: this rebinds `test_feed` from the callable to its result.
test_feed = test_feed(_feed_url)

# Ask for each property; an empty answer falls back to the feed's own value.
name_prompt = f"Feed name (Enter for: {test_feed.title}):  "
feed_name = input(name_prompt) or test_feed.title
print(f"= {feed_name}")

icon_prompt = "Icon name to be found in resources folder (e.g. 20min.png):  "
icon_name = input(icon_prompt)
print(f"= {icon_name}")

description_prompt = f"Description (Enter for: {test_feed.description}): "
description = input(description_prompt) or test_feed.description
print(f"= {description}")

language_prompt = "Language code (e.g. en): "
_language = input(language_prompt)
print(f"= {_language}")

# Resolve the url and language, then find or create the feed itself.
feed_url = Url.find_or_create(zeeguu_core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(
    zeeguu_core.db.session,
    feed_url,
    feed_name,
    description,
    icon_name=icon_name,
    language=language,
)

# Echo what is now stored.
print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
Exemplo n.º 15
0
 def threaded_create_url():
     """Thread target: find or create the Url for the enclosing _url / _title."""
     _ = Url.find_or_create(session, _url, _title)
Exemplo n.º 16
0
#!/usr/bin/env python

from zeeguu_core.model import RSSFeed, Url, Language
import zeeguu_core

# Base address under which the feed icon files are served.
RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu_core.db.session

# Every feed whose title equals the given name gets a new title and icon.
all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title != name:
        continue

    print("Updating ... " + name)

    # An empty answer keeps the current title.
    feed.title = input(f"Title ({feed.title}): ") or feed.title
    print(f"new title is: {feed.title}")

    _image_url = input("Icon file: ")
    feed.image_url = Url.find_or_create(session, RESOURCES_FOLDER + _image_url)
    print("new image url: " + feed.image_url.as_string())

    session.add(feed)
    session.commit()

Exemplo n.º 17
0
# Ask for each property; an empty answer falls back to the feed's own value.
name_prompt = f"Feed name (Enter for: {test_feed.title}):  "
feed_name = input(name_prompt) or test_feed.title
print(f"= {feed_name}")

icon_prompt = "Icon name to be found in resources folder (e.g. 20min.png):  "
icon_name = input(icon_prompt)
print(f"= {icon_name}")

description_prompt = f"Description (Enter for: {test_feed.description}): "
description = input(description_prompt) or test_feed.description
print(f"= {description}")

language_prompt = "Language code (e.g. en): "
_language = input(language_prompt)
print(f"= {_language}")

# Resolve the url and language, then find or create the feed itself.
feed_url = Url.find_or_create(zeeguu_core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(
    zeeguu_core.db.session,
    feed_url,
    feed_name,
    description,
    icon_name=icon_name,
    language=language,
)

# Echo what is now stored.
print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
Exemplo n.º 18
0
def download_feed_item(session, feed, feed_item):
    """
    Download one feed item and store it as a new article.

    :param session: DB session used for add, commit and rollback
    :param feed: feed the item belongs to; passed to the new article
                 together with feed.language
    :param feed_item: mapping read for the keys 'url', 'title', 'summary'
                      and 'published_datetime'
    :return: the committed article, or None when creating or storing it
             failed and the session was rolled back
    :raises SkippedAlreadyInDB: an article for the resolved url exists
    :raises SkippedForLowQuality: sufficient_quality rejected the article
    :raises Exception: the url could not be resolved or Article.find failed
    """
    new_article = None

    try:

        url = _url_after_redirects(feed_item['url'])
        log(url)

    except requests.exceptions.TooManyRedirects as e:
        raise Exception("- Too many redirects") from e
    except Exception as e:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}") from e

    title = feed_item['title']

    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}"
        ) from e

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item['summary']
        # however, this is not so easy... there have been cases where
        # the summary is just malformed HTML... thus we try to extract
        # the text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create new article and save it to DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url), title, ', '.join(art.authors),
            cleaned_up_text, summary, published_datetime, feed, feed.language)
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality:
        # Not a failure: let the caller account for the skipped item.
        raise

    except Exception as e:
        from sentry_sdk import capture_exception
        capture_exception(e)

        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
            )
        session.rollback()
        # The article was rolled back; do not hand the caller an object
        # that looks like a stored article.
        new_article = None

    return new_article