def find_url_in_extra_data(self):
        """
            Look for the url associated with this event inside ``extra_data``.

            DB structure is a mess: there is no convention for where the url
            associated with an event is stored, so it has to be looked for in
            different places. The 'articleURL' key is tried first, then 'url'.

            NOTE: This can be solved by creating a new column called url and
            writing the url only there.

            returns: the result of Url.extract_canonical_url for the url that
            was found, or None when there is no usable url
        """

        # An empty field, an empty JSON object and a JSON null carry no url.
        if not self.extra_data or self.extra_data in ('{}', 'null'):
            return None

        try:
            extra_event_data = json.loads(self.extra_data)

            # Sometimes the payload decodes to an int (or some other non-dict
            # value); such payloads cannot hold a url under a key.
            if not isinstance(extra_event_data, dict):
                return None

            if 'articleURL' in extra_event_data:
                url = extra_event_data['articleURL']
            elif 'url' in extra_event_data:
                url = extra_event_data['url']
            else:  # There is no url
                return None

            return Url.extract_canonical_url(url)

        except Exception:
            # Some json strings are truncated and cannot be parsed. This stays
            # a best-effort lookup, but unlike a bare ``except`` it no longer
            # swallows KeyboardInterrupt / SystemExit.
            return None
Exemplo n.º 2
0
    def find_or_create(cls, session, user: User, _url, _title: str, _language):
        """
            Return the object that links ``user`` to ``_url``.

            The language and the Url are resolved first. When an object for
            (user, url) already exists it is returned as is; otherwise a new
            one is created, added to ``session`` and committed.

            In case of creation, the created object is incomplete.

            If creating the object fails for any reason, the session is rolled
            back and the lookup is repeated (presumably another writer
            inserted the same row in the meantime).
        """

        language = Language.find(_language)
        url = Url.find_or_create(session, _url, _title)

        def existing():
            # Exactly one row is expected per (user, url) pair.
            return cls.query.filter_by(user=user, url=url).one()

        try:
            return existing()
        except NoResultFound:
            try:
                created = cls(user, url, _title, language)
                session.add(created)
                session.commit()
            except Exception:
                print("seems we avoided a race condition")
                session.rollback()
                return existing()
            else:
                return created
Exemplo n.º 3
0
def add_bookmark(user, original_language, original_word, translation_language,
                 translation_word, date, the_context, the_url, the_url_title):
    """
    Create a bookmark linking a word to its translation and commit it.

    A Url and a Text are built from the url/title/context arguments, a
    UserWord is built for both the original and the translated word, and
    everything is added to zeeguu.db.session together with the new Bookmark.
    After the commit, add_probability_to_existing_words_of_user is called
    for the user, the bookmark and the original language.
    """

    def user_word_for(word, language):
        # The rank is looked up only when the lowercased word is a known
        # RankedWord in that language; otherwise the UserWord gets no rank.
        lowered = word.lower()
        if RankedWord.exists(lowered, language):
            return UserWord(word, language,
                            UserWord.find_rank(lowered, language))
        return UserWord(word, language, None)

    url = Url.find(the_url, the_url_title)
    text = Text(the_context, translation_language, url)

    origin = user_word_for(original_word, original_language)
    translated = user_word_for(translation_word, translation_language)

    for entity in (url, text, origin, translated):
        zeeguu.db.session.add(entity)

    bookmark = Bookmark(origin, translated, user, text, date)
    zeeguu.db.session.add(bookmark)
    zeeguu.db.session.commit()

    add_probability_to_existing_words_of_user(user, bookmark,
                                              original_language)
Exemplo n.º 4
0
def add_bookmark(
        user, original_language, original_word, translation_language,
        translation_word, date, the_context, the_url, the_url_title):
    """
    Store a new bookmark (word + translation + context) for ``user``.

    Builds the Url, the Text and one UserWord per side of the translation,
    adds them and the resulting Bookmark to zeeguu.db.session, commits, and
    finally calls add_probability_to_existing_words_of_user.
    """
    url = Url.find(the_url, the_url_title)
    text = Text(the_context, translation_language, url)

    # A rank is only available for words that exist as a RankedWord.
    original_rank = None
    if RankedWord.exists(original_word.lower(), original_language):
        original_rank = UserWord.find_rank(original_word.lower(),
                                           original_language)
    origin = UserWord(original_word, original_language, original_rank)

    translation_rank = None
    if RankedWord.exists(translation_word.lower(), translation_language):
        translation_rank = UserWord.find_rank(translation_word.lower(),
                                              translation_language)
    translated = UserWord(translation_word, translation_language,
                          translation_rank)

    pending = [url, text, origin, translated]
    for item in pending:
        zeeguu.db.session.add(item)

    new_bookmark = Bookmark(origin, translated, user, text, date)
    zeeguu.db.session.add(new_bookmark)

    zeeguu.db.session.commit()
    add_probability_to_existing_words_of_user(user, new_bookmark,
                                              original_language)
Exemplo n.º 5
0
    def test_find_or_create_works(self):
        """Url.find_or_create returns a Url whose title matches the one passed in."""
        rule_url = self.url_rule.url
        address = rule_url.as_string()
        expected_title = rule_url.title

        found = Url.find_or_create(session, address, expected_title)

        self.assertEqual(found.title, expected_title)
Exemplo n.º 6
0
    def delete(cls, session, user, _url):
        """
            Delete the object that links ``user`` to ``_url`` and commit.

            Best effort: any exception raised while finding the Url, finding
            the object, deleting it or committing is printed and otherwise
            ignored. Nothing is returned.
        """
        try:
            target_url = Url.find(_url)
            session.delete(
                cls.query.filter_by(user=user, url=target_url).one())
            session.commit()
        except Exception as error:
            print(error)
Exemplo n.º 7
0
    def __init__(self):
        """
            Create and save three feeds: the object returned by
            _create_model_object (exposed as both ``rss_feed`` and ``feed``)
            and two feeds built from the *_OF_FEED_ONE / *_OF_FEED_TWO
            constants (``feed1`` and ``feed2``).
        """
        super().__init__()

        model_object = self._create_model_object()
        self.rss_feed = model_object
        self.feed = model_object
        self.save(model_object)

        def build_feed(language_code, feed_address, image_address):
            # Language first, then the feed url, then the image url -- the
            # same lookup order for both fixed feeds.
            language = Language.find_or_create(language_code)
            feed_url = Url.find_or_create(self.db.session, feed_address)
            icon_url = Url.find_or_create(self.db.session, image_address)
            return RSSFeed.find_or_create(self.db.session, feed_url, "", "",
                                          image_url=icon_url,
                                          language=language)

        self.feed1 = build_feed(LANG_OF_FEED_ONE, URL_OF_FEED_ONE,
                                IMG_URL_OF_FEED_ONE)
        self.save(self.feed1)

        self.feed2 = build_feed(LANG_OF_FEED_TWO, URL_OF_FEED_TWO,
                                IMG_URL_OF_FEED_TWO)
        self.save(self.feed2)
Exemplo n.º 8
0
    def test_domain_plus_path_must_be_unique(self):
        """
            Adding a second Url with the same domain and path as the rule's
            url must fail when the session is committed.
        """

        _url = self.url_rule.url.as_string()
        _title = self.url_rule.url.title
        _domain = DomainName.get_domain(_url)

        with self.assertRaises(Exception) as context:
            domain = DomainName.find(_domain)
            url = Url(_url, _title, domain)
            session.add(url)
            session.commit()

        # Each substring has to be tested against the message on its own:
        # "'Duplicate entry' or (... in ...)" always evaluates to the truthy
        # literal, which made the original assertion pass unconditionally.
        error_text = str(context.exception)
        self.assertTrue('Duplicate entry' in error_text
                        or 'IntegrityError' in error_text)
def download_from_starrred_article(starArticle: StarredArticle, session):
    """
        Download the article behind a starred article and store it in the DB.

        Session is needed because this saves stuff to the DB.

        Nothing is downloaded when model.Article.find already knows the url.
        Otherwise the article is fetched with
        watchmen.article_parser.get_article; texts with fewer than 10
        space-separated words, or fewer than Article.MINIMUM_WORD_COUNT, are
        only logged. Anything longer is saved as a new model.Article attached
        to RSSFeed.query.first() and to the starred article's language.

        Failures while fetching or saving are logged, not raised.

        :param starArticle: the starred article whose url is downloaded
        :param session: DB session used to add and commit the new article
        :return: None
    """
    url = str(starArticle.url)
    findart = model.Article.find(url)
    if findart:
        print(f"Already in the DB: {findart}")
        return

    try:
        art = watchmen.article_parser.get_article(url)
        title = art.title
        summary = art.summary

        word_count = len(art.text.split(" "))

        if word_count < 10:
            zeeguu.log_n_print(
                f" {LOG_CONTEXT}: Can't find text for: {url}")
        elif word_count < Article.MINIMUM_WORD_COUNT:
            zeeguu.log_n_print(
                f" {LOG_CONTEXT}: Skipped. Less than {Article.MINIMUM_WORD_COUNT} words of text. {url}"
            )
        else:
            # NOTE(review): this name is not used below; the import is kept
            # in case it is needed for its side effects -- confirm.
            from zeeguu.language.difficulty_estimator_factory import DifficultyEstimatorFactory

            # Create new article and save it to DB
            new_article = model.Article(Url.find_or_create(session, url),
                                        title, ', '.join(art.authors),
                                        art.text, summary, datetime.now(),
                                        RSSFeed.query.first(),
                                        starArticle.language)

            session.add(new_article)
            session.commit()
            zeeguu.log_n_print(f" {LOG_CONTEXT}: Added: {new_article}")
    except Exception:
        # Best-effort download: log and carry on. Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt / SystemExit
        # stop the process.
        import sys
        ex = sys.exc_info()
        zeeguu.log_n_print(
            f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(ex)}"
        )
Exemplo n.º 10
0
def bookmark_with_context(from_lang_code, term, to_lang_code, translation):
    """
    The preferred way of a user saving a word/translation/context to his
    profile.

    The url, the context and the optional title (empty when absent) are read
    from the submitted form; the bookmark is created for flask.g.user.

    :param from_lang_code: code of the language the term is in
    :param term: the word to bookmark (passed through decode_word)
    :param to_lang_code: code of the language of the translation
    :param translation: the translation (passed through decode_word)
    :return: the id of the new bookmark, as a string
    """
    form = flask.request.form

    bookmarked_url_title = form['title'] if 'title' in form else ''
    bookmarked_url = form['url']
    context = form['context']

    url = Url.find(bookmarked_url, bookmarked_url_title)

    from_lang = Language.find(from_lang_code)
    to_lang = Language.find(to_lang_code)

    # Decode both words before any UserWord lookup takes place.
    decoded_term = decode_word(term)
    decoded_translation = decode_word(translation)
    origin_word = UserWord.find(decoded_term, from_lang)
    translated_word = UserWord.find(decoded_translation, to_lang)

    # search = Search.query.filter_by(
    #     user=flask.g.user, user_word=user_word, language=to_lang
    # ).order_by(Search.id.desc()).first()

    # The text entity has to exist before the bookmark that refers to it.
    new_text = Text(context, from_lang, url)
    bookmark = Bookmark(origin_word, translated_word, flask.g.user, new_text,
                        datetime.datetime.now())
    zeeguu.db.session.add(bookmark)

    bookmark.calculate_probabilities_after_adding_a_bookmark(
        flask.g.user, bookmark.origin.language)

    return str(bookmark.id)
Exemplo n.º 11
0
    def test_try_to_get_race_condition(self):
        """
            Call Url.find_or_create for the same url from several threads and
            then once more from this thread, checking the title of the result.

            The number of threads is currently zero, so only the final call
            actually runs.
        """
        address = self.url_rule.url.as_string()
        expected_title = self.url_rule.url.title

        def threaded_create_url():
            Url.find_or_create(session, address, expected_title)

        # multithreaded connections freeze on mysqldb.
        # so this is here to be tested manually and killed for now...
        workers = []
        for _ in range(0):
            worker = Thread(target=threaded_create_url, args=())
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()

        found = Url.find_or_create(session, address, expected_title)
        self.assertEqual(found.title, expected_title)
Exemplo n.º 12
0
def bookmark_with_context(from_lang_code, term, to_lang_code, translation):
    """
    The preferred way of a user saving a word/translation/context to his
    profile.

    Reads 'url', 'context' and (optionally) 'title' from the request form and
    stores a Bookmark for flask.g.user.

    :param from_lang_code: language code of ``term``
    :param term: word being bookmarked; decoded with decode_word
    :param to_lang_code: language code of ``translation``
    :param translation: translation of the term; decoded with decode_word
    :return: str() of the id of the created bookmark
    """

    # The title is optional in the form; default to an empty one.
    bookmarked_url_title = ''
    if 'title' in flask.request.form:
        bookmarked_url_title = flask.request.form['title']

    bookmarked_url = flask.request.form['url']
    context = flask.request.form['context']

    url = Url.find(bookmarked_url, bookmarked_url_title)

    from_lang = Language.find(from_lang_code)
    to_lang = Language.find(to_lang_code)

    word = decode_word(term)
    translation_word = decode_word(translation)
    user_word = UserWord.find(word, from_lang)
    translated_user_word = UserWord.find(translation_word, to_lang)

    # search = Search.query.filter_by(
    #     user=flask.g.user, user_word=user_word, language=to_lang
    # ).order_by(Search.id.desc()).first()

    # create the text entity first
    new_text = Text(context, from_lang, url)
    bookmark = Bookmark(
        user_word,
        translated_user_word,
        flask.g.user,
        new_text,
        datetime.datetime.now(),
    )
    zeeguu.db.session.add(bookmark)
    bookmark.calculate_probabilities_after_adding_a_bookmark(
        flask.g.user,
        bookmark.origin.language,
    )
    return str(bookmark.id)
Exemplo n.º 13
0
 def threaded_create_url():
     """Thread target: find or create the Url for the enclosing scope's _url/_title."""
     Url.find_or_create(session, _url, _title)
Exemplo n.º 14
0
# Interactive script fragment: ask for the details of a feed and store it as
# an RSSFeed. `_feed_url` and the callable `test_feed` are defined outside
# this fragment.

# NOTE: this rebinds `test_feed` from the callable to the value it returned.
test_feed = test_feed(_feed_url)

# Pressing Enter keeps the title taken from the feed itself.
feed_name = input(
    f"Feed name (Enter for: {test_feed.title}):  ") or test_feed.title
print(f'= {feed_name}')

# No default here: an empty answer is used as is when building the icon url.
icon_name = input(
    "Icon name to be found in resources folder (e.g. 20min.png):  ")
print(f'= {icon_name}')

# Pressing Enter keeps the description taken from the feed itself.
description = input(f'Description (Enter for: {test_feed.description}): '
                    ) or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

# Resolve (or create) the DB objects the feed refers to.
icon_url = Url.find_or_create(zeeguu.db.session, RESOURCES_FOLDER + icon_name)
feed_url = Url.find_or_create(zeeguu.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.db.session, feed_url, feed_name,
                                  description, icon_url, language)

# Echo what is now stored on the feed.
print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
print(rss_feed.image_url.as_string())
Exemplo n.º 15
0
#!/usr/bin/env python

from zeeguu.model import RSSFeed, Url, Language
import zeeguu

# Base url under which the feed icons are served.
RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu.db.session

# Interactively update the title and the icon of every feed whose title
# equals the name entered above.
all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title != name:
        continue

    print("Updating ... " + name)

    # Pressing Enter keeps the current title.
    feed.title = input(f'Title ({feed.title}): ') or feed.title
    print(f'new title is: {feed.title}')

    _image_url = input('Icon file: ')
    if _image_url:
        feed.image_url = Url.find_or_create(session,
                                            RESOURCES_FOLDER + _image_url)
        print('new image url: ' + feed.image_url.as_string())
    else:
        # A blank answer used to replace the icon with the bare resources
        # folder url; keep the existing icon instead, like the title prompt.
        print('image url left unchanged')

    session.add(feed)
    session.commit()

Exemplo n.º 16
0
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
        Crawl ``feed`` and save the articles that are not in the DB yet.

        Session is needed because this saves stuff to the DB.

        last_crawled_time is useful because otherwise there would be a lot of time
        wasted trying to retrieve the same articles, especially the ones which
        can't be retrieved, so they won't be cached.

        Items are skipped when their url redirects too often, when their
        publication time cannot be parsed, when they are older than the
        feed's last_crawled_time, when they are already in the DB, or when
        sufficient_quality rejects them. Failures on a single item are
        logged and do not stop the crawl.

        :param feed: the RSSFeed whose feed_items() are crawled
        :param session: DB session used to save the articles and the feed
        :param limit: maximum number of articles to download in this call
        :return: None
    """
    zeeguu.log(feed)
    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        zeeguu.log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    for feed_item in feed.feed_items():

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # `url` is not assigned when the call above fails (it would be
            # unbound on the first item and stale afterwards), so log the
            # url of the item itself.
            zeeguu.log(f"Too many redirects for: {feed_item['url']}")
            continue

        # Read the raw value first so the handler below can log it without
        # indexing the item again (which could raise a second time).
        published = None
        try:
            published = feed_item['published']
            this_article_time = datetime.strptime(published, SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            zeeguu.log(f"can't get time from {url}: {published}")
            continue

        if last_retrieval_time_from_DB:

            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        art = model.Article.find(url)

        # Track the newest publication time seen, to become the feed's
        # last_crawled_time once the crawl is over.
        if (not last_retrieval_time_seen_this_crawl) or (this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
        else:
            try:

                art = newspaper.Article(url)
                art.download()
                art.parse()

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(art, skipped_due_to_low_quality)
                if quality_article:
                    # NOTE(review): this name is not used below; the import
                    # is kept in case it is needed for its side effects.
                    from zeeguu.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                    # Create new article and save it to DB
                    new_article = zeeguu.model.Article(
                        Url.find_or_create(session, url),
                        title,
                        ', '.join(art.authors),
                        cleaned_up_text,
                        summary,
                        this_article_time,
                        feed,
                        feed.language
                    )
                    session.add(new_article)
                    session.commit()
                    downloaded += 1

                    add_topics(new_article, session)
                    add_searches(title, url, new_article, session)

                    try:
                        session.commit()
                    except Exception as e:
                        zeeguu.log(f'{LOG_CONTEXT}: Something went wrong when committing words/topic to article: {e}')
                        # A session whose commit failed has to be rolled back
                        # before it can be used for the next item.
                        session.rollback()

            except Exception as e:
                # Same text as before: only the exception class is logged.
                zeeguu.log(f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(type(e))}")
                # The failure may come from add/commit above; roll back so
                # the remaining items do not hit a broken session.
                session.rollback()

    zeeguu.log(f'  Skipped due to time: {skipped} ')
    zeeguu.log(f'  Downloaded: {downloaded}')
    zeeguu.log(f'  Low Quality: {skipped_due_to_low_quality}')
    zeeguu.log(f'  Already in DB: {skipped_already_in_db}')

    if last_retrieval_time_seen_this_crawl:
        feed.last_crawled_time = last_retrieval_time_seen_this_crawl
    session.add(feed)
    session.commit()