示例#1
0
    def find_url_in_extra_data(self):
        """
        DB structure is a mess!
        There is no convention where the url associated with an event is.
        Thu we need to look for it in different places

        NOTE: This can be solved by creating a new column called url and write the url only there

        returns: url if found or None otherwise
        """

        if self.extra_data and self.extra_data != "{}" and self.extra_data != "null":
            try:
                extra_event_data = json.loads(self.extra_data)

                if "articleURL" in extra_event_data:
                    url = extra_event_data["articleURL"]
                elif "url" in extra_event_data:
                    url = extra_event_data["url"]
                else:  # There is no url
                    return None
                return Url.extract_canonical_url(url)

            except:  # Some json strings are truncated and some other times extra_event_data is an int
                # therefore cannot be parsed correctly and throw an exception
                return None
        else:  # The extra_data field is empty
            return None
    def find_or_create(cls, session, user: User, _url, _title: str, _language):
        """

            create a new object and add it to the db if it's not already there
            otherwise retrieve the existing object and update

            in case of creation, the created object is incomplete

\        """

        language = Language.find(_language)
        url = Url.find_or_create(session, _url, _title)

        try:
            return cls.query.filter_by(user=user, url=url).one()
        except NoResultFound:
            try:
                new = cls(user, url, _title, language)
                session.add(new)
                session.commit()
                return new
            except Exception as e:
                from sentry_sdk import capture_exception
                capture_exception(e)
                print("seems we avoided a race condition")
                session.rollback()
                return cls.query.filter_by(user=user, url=url).one()
示例#3
0
    def test_find_or_create_works(self):

        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title

        url = Url.find_or_create(session, _url, _title)

        self.assertEqual(url.title, _title)
    def delete(cls, session, user, _url):

        try:
            url = Url.find(_url)
            item = cls.query.filter_by(user=user, url=url).one()
            session.delete(item)
            session.commit()
        except Exception as e:
            from sentry_sdk import capture_exception
            capture_exception(e)
    def _localized_topic_keyword_in_url(self, topic: str, localized: str,
                                        keyword: str, url: str):
        topic = Topic(topic)
        localized_topic = LocalizedTopic(topic, self.user.learned_language,
                                         localized)
        localized_topic.keywords = keyword

        article = ArticleRule().article
        url = Url.find_or_create(self.db.session, url)
        article.url = url

        assert localized_topic.matches_article(article)
示例#6
0
    def __init__(self):
        super().__init__()

        self.rss_feed = self._create_model_object()
        self.feed = self.rss_feed
        self.save(self.rss_feed)

        lang1 = Language.find_or_create('de')
        url = Url.find_or_create(self.db.session, url_spiegel_rss)

        self.feed1 = RSSFeed.find_or_create(self.db.session, url, "", "", icon_name_spiegel,
                                            language=lang1)
        self.save(self.feed1)
示例#7
0
    def test_domain_plus_path_must_be_unique(self):

        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title
        _domain = DomainName.get_domain(_url)

        with self.assertRaises(Exception) as context:
            domain = DomainName.find(_domain)
            url = Url(_url, _title, domain)
            session.add(url)
            session.commit()

        self.assertTrue('Duplicate entry'
                        or 'IntegrityError' in str(context.exception))
def upload_articles(cohort_id):
    """
    uploads articles for a cohort with input from a POST request
    """
    check_permission_for_cohort(cohort_id)

    try:
        for article_data in json.loads(request.data):
            url = Url("userarticle/{}".format(uuid.uuid4().hex))
            title = article_data["title"]
            authors = article_data["authors"]
            content = article_data["content"]
            summary = article_data["summary"]
            published_time = datetime.now()
            language_code = article_data["language_code"]
            language = Language.find(language_code)

            new_article = Article(
                url,
                title,
                authors,
                content,
                summary,
                published_time,
                None,  # rss feed
                language,
            )

            db.session.add(new_article)
            db.session.flush()
            db.session.refresh(new_article)

            cohort = Cohort.find(cohort_id)
            now = datetime.now()
            new_cohort_article_map = CohortArticleMap(cohort, new_article, now)

            db.session.add(new_cohort_article_map)
        db.session.commit()
        return "OK"
    except ValueError:
        flask.abort(400)
        return "ValueError"
示例#9
0
    def test_try_to_get_race_condition(self):

        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title

        def threaded_create_url():
            url = Url.find_or_create(session, _url, _title)

        threads = []

        for i in range(0):  # multithreaded connections freeze on mysqldb.
            # so this is here to be tested manually and killed for now...
            t = Thread(target=threaded_create_url, args=())
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        url = Url.find_or_create(session, _url, _title)
        self.assertEqual(url.title, _title)
示例#10
0
 def threaded_create_url():
     url = Url.find_or_create(session, _url, _title)
示例#11
0
feed_name = input(
    f"Feed name (Enter for: {test_feed.title}):  ") or test_feed.title
print(f'= {feed_name}')

icon_name = input(
    "Icon name to be found in resources folder (e.g. 20min.png):  ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): '
                    ) or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu.core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.core.db.session,
                                  feed_url,
                                  feed_name,
                                  description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:

        url = _url_after_redirects(feed_item["url"])
        log(url)

    except requests.exceptions.TooManyRedirects:
        raise Exception(f"- Too many redirects")
    except Exception:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item["title"]

    published_datetime = feed_item["published_datetime"]

    try:
        art = model.Article.find(url)
    except:
        import sys

        ex = sys.exc_info()[0]
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}"
        )

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item["summary"]
        # however, this is not so easy... there have been cases where
        # the summary is just malformed HTML... thus we try to extract
        # the text:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

            # Create new article and save it to DB
        new_article = zeeguu.core.model.Article(
            Url.find_or_create(session, url),
            title,
            ", ".join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language,
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        from sentry_sdk import capture_exception

        capture_exception(e)

        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
            )
        session.rollback()

    return new_article
示例#13
0
#!/usr/bin/env python

from zeeguu.core.model import RSSFeed, Url, Language
import zeeguu.core

RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu.core.db.session

all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title == name:
        print("Updating ... " + name)
        feed.title = input(f'Title ({feed.title}): ') or feed.title
        print(f'new title is: {feed.title}')
        _image_url = input('Icon file: ')
        feed.image_url = Url.find_or_create(session,
                                            RESOURCES_FOLDER + _image_url)
        print('new image url: ' + feed.image_url.as_string())
        session.add(feed)
        session.commit()