def find_url_in_extra_data(self):
    """
    DB structure is a mess! There is no convention for where the url
    associated with an event is stored, so we need to look for it in
    several places.

    NOTE: This could be solved by creating a new column called url
    and writing the url only there.

    :return: the url if found, None otherwise
    """
    if self.extra_data and self.extra_data != '{}' and self.extra_data != 'null':
        try:
            extra_event_data = json.loads(self.extra_data)

            if 'articleURL' in extra_event_data:
                url = extra_event_data['articleURL']
            elif 'url' in extra_event_data:
                url = extra_event_data['url']
            else:
                # There is no url
                return None
            return Url.extract_canonical_url(url)

        except (TypeError, ValueError):
            # Some json strings are truncated, and other times extra_event_data
            # is an int; either way it cannot be parsed and raises an exception
            return None
    else:
        # The extra_data field is empty
        return None

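# The lookup above can be exercised in isolation. Below is a minimal,
# self-contained sketch of the same "try several keys, tolerate malformed
# JSON" idea; extract_url_from_extra_data is a hypothetical stand-in for
# the method above, not part of Zeeguu.

import json


def extract_url_from_extra_data(extra_data):
    """Return the url hiding in an extra_data blob, or None."""
    if not extra_data or extra_data in ('{}', 'null'):
        return None
    try:
        data = json.loads(extra_data)
    except ValueError:
        # truncated or otherwise malformed json
        return None
    if not isinstance(data, dict):
        # sometimes extra_data is just a number
        return None
    # the url may hide under either of two historical keys
    return data.get('articleURL') or data.get('url')


assert extract_url_from_extra_data('{"url": "http://a.com/x"}') == 'http://a.com/x'
assert extract_url_from_extra_data('{"truncated') is None
assert extract_url_from_extra_data('42') is None
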
def find_or_create(cls, session, user: User, _url, _title: str, _language):
    """
    create a new object and add it to the db if it's not already there,
    otherwise retrieve the existing object and update it

    in case of creation, the created object is incomplete
    """
    language = Language.find(_language)
    url = Url.find_or_create(session, _url, _title)

    try:
        return cls.query.filter_by(user=user, url=url).one()
    except NoResultFound:
        try:
            new = cls(user, url, _title, language)
            session.add(new)
            session.commit()
            return new
        except Exception:
            print("seems we avoided a race condition")
            session.rollback()
            return cls.query.filter_by(user=user, url=url).one()

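# A minimal, self-contained sketch of the find-or-create pattern used above,
# written against plain SQLAlchemy so it can run standalone. The `Tag` model
# and the sqlite engine are illustrative assumptions, not part of Zeeguu; the
# point is the recovery path: if another transaction inserts the row between
# our check and our commit, the rollback + re-query returns the winner's row.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Tag(Base):  # hypothetical model, for illustration only
    __tablename__ = 'tag'
    id = Column(Integer, primary_key=True)
    name = Column(String(64), unique=True)


def find_or_create_tag(session, name):
    existing = session.query(Tag).filter_by(name=name).one_or_none()
    if existing:
        return existing
    try:
        tag = Tag(name=name)
        session.add(tag)
        session.commit()
        return tag
    except IntegrityError:
        # somebody else won the race; fetch the row they created
        session.rollback()
        return session.query(Tag).filter_by(name=name).one()


if __name__ == '__main__':
    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    with Session(engine) as s:
        assert find_or_create_tag(s, 'news').id == find_or_create_tag(s, 'news').id
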
def add_bookmark(user, original_language, original_word, translation_language,
                 translation_word, date, the_context, the_url, the_url_title):
    url = Url.find(the_url, the_url_title)
    text = Text(the_context, translation_language, url)

    if RankedWord.exists(original_word.lower(), original_language):
        rank1 = UserWord.find_rank(original_word.lower(), original_language)
        w1 = UserWord(original_word, original_language, rank1)
    else:
        w1 = UserWord(original_word, original_language, None)

    if RankedWord.exists(translation_word.lower(), translation_language):
        rank2 = UserWord.find_rank(translation_word.lower(), translation_language)
        w2 = UserWord(translation_word, translation_language, rank2)
    else:
        w2 = UserWord(translation_word, translation_language, None)

    zeeguu.db.session.add(url)
    zeeguu.db.session.add(text)
    zeeguu.db.session.add(w1)
    zeeguu.db.session.add(w2)

    t1 = Bookmark(w1, w2, user, text, date)
    zeeguu.db.session.add(t1)
    zeeguu.db.session.commit()

    add_probability_to_existing_words_of_user(user, t1, original_language)

def test_find_or_create_works(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    url = Url.find_or_create(session, _url, _title)
    self.assertEqual(url.title, _title)

def delete(cls, session, user, _url):
    try:
        url = Url.find(_url)
        item = cls.query.filter_by(user=user, url=url).one()
        session.delete(item)
        session.commit()
    except Exception as e:
        print(e)

def __init__(self):
    super().__init__()

    self.rss_feed = self._create_model_object()
    self.feed = self.rss_feed
    self.save(self.rss_feed)

    lang1 = Language.find_or_create(LANG_OF_FEED_ONE)
    url = Url.find_or_create(self.db.session, URL_OF_FEED_ONE)
    image_url = Url.find_or_create(self.db.session, IMG_URL_OF_FEED_ONE)
    self.feed1 = RSSFeed.find_or_create(self.db.session, url, "", "",
                                        image_url=image_url, language=lang1)
    self.save(self.feed1)

    lang2 = Language.find_or_create(LANG_OF_FEED_TWO)
    url2 = Url.find_or_create(self.db.session, URL_OF_FEED_TWO)
    image_url2 = Url.find_or_create(self.db.session, IMG_URL_OF_FEED_TWO)
    self.feed2 = RSSFeed.find_or_create(self.db.session, url2, "", "",
                                        image_url=image_url2, language=lang2)
    self.save(self.feed2)

def test_domain_plus_path_must_be_unique(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title
    _domain = DomainName.get_domain(_url)

    with self.assertRaises(Exception) as context:
        domain = DomainName.find(_domain)
        url = Url(_url, _title, domain)
        session.add(url)
        session.commit()

    self.assertTrue('Duplicate entry' in str(context.exception)
                    or 'IntegrityError' in str(context.exception))

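# What the test above relies on is a UNIQUE constraint spanning the url's
# domain and path columns, so a second insert of the same combination is
# rejected by the database. A minimal sqlite3 illustration of that failure
# mode (the table here is a stand-in, not Zeeguu's actual schema):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE url (domain TEXT, path TEXT, UNIQUE (domain, path))')
conn.execute("INSERT INTO url VALUES ('example.com', '/article/1')")
try:
    conn.execute("INSERT INTO url VALUES ('example.com', '/article/1')")
except sqlite3.IntegrityError as e:
    print(f'duplicate rejected: {e}')  # UNIQUE constraint failed: url.domain, url.path
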
def download_from_starred_article(starArticle: StarredArticle, session):
    """
    Session is needed because this saves stuff to the DB.
    """
    url = str(starArticle.url)

    found = model.Article.find(url)

    if found:
        print(f"Already in the DB: {found}")
    else:
        try:
            art = watchmen.article_parser.get_article(url)

            title = art.title
            summary = art.summary
            word_count = len(art.text.split(" "))

            if word_count < 10:
                zeeguu.log_n_print(f" {LOG_CONTEXT}: Can't find text for: {url}")
            elif word_count < Article.MINIMUM_WORD_COUNT:
                zeeguu.log_n_print(
                    f" {LOG_CONTEXT}: Skipped. Less than "
                    f"{Article.MINIMUM_WORD_COUNT} words of text. {url}")
            else:
                # Create a new article and save it to the DB
                new_article = model.Article(
                    Url.find_or_create(session, url),
                    title,
                    ', '.join(art.authors),
                    art.text,
                    summary,
                    datetime.now(),
                    RSSFeed.query.first(),
                    starArticle.language)
                session.add(new_article)
                session.commit()
                zeeguu.log_n_print(f" {LOG_CONTEXT}: Added: {new_article}")
        except Exception:
            import sys
            ex = sys.exc_info()
            zeeguu.log_n_print(
                f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(ex)}")

def bookmark_with_context(from_lang_code, term, to_lang_code, translation):
    """
    The preferred way of a user saving a word/translation/context to his profile.

    :param from_lang_code:
    :param term:
    :param to_lang_code:
    :param translation:
    :return:
    """
    if 'title' in flask.request.form:
        bookmarked_url_title = flask.request.form['title']
    else:
        bookmarked_url_title = ''

    bookmarked_url = flask.request.form['url']
    context = flask.request.form['context']

    url = Url.find(bookmarked_url, bookmarked_url_title)

    from_lang = Language.find(from_lang_code)
    to_lang = Language.find(to_lang_code)

    word = decode_word(term)
    translation_word = decode_word(translation)

    user_word = UserWord.find(word, from_lang)
    translation = UserWord.find(translation_word, to_lang)

    # search = Search.query.filter_by(
    #     user=flask.g.user, user_word=user_word, language=to_lang
    # ).order_by(Search.id.desc()).first()

    # create the text entity first
    new_text = Text(context, from_lang, url)
    bookmark = Bookmark(user_word, translation, flask.g.user, new_text,
                        datetime.datetime.now())
    zeeguu.db.session.add(bookmark)

    bookmark.calculate_probabilities_after_adding_a_bookmark(
        flask.g.user, bookmark.origin.language)

    return str(bookmark.id)

def test_try_to_get_race_condition(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    def threaded_create_url():
        url = Url.find_or_create(session, _url, _title)

    threads = []

    for i in range(0):
        # multithreaded connections freeze on mysqldb.
        # so this is here to be tested manually and killed for now...
        t = Thread(target=threaded_create_url, args=())
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    url = Url.find_or_create(session, _url, _title)
    self.assertEqual(url.title, _title)

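# To actually exercise the race, the loop above needs a real thread count;
# it is left at range(0) because multithreaded connections freeze on
# mysqldb. A sketch of the manual variant, assuming the same session and
# the threaded_create_url helper from the test above (the thread count is
# an arbitrary choice):

def stress_find_or_create(n_threads=8):
    threads = [Thread(target=threaded_create_url) for _ in range(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
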
feed_data = test_feed(_feed_url)

feed_name = input(f"Feed name (Enter for: {feed_data.title}): ") or feed_data.title
print(f'= {feed_name}')

icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {feed_data.description}): ') or feed_data.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

icon_url = Url.find_or_create(zeeguu.db.session, RESOURCES_FOLDER + icon_name)
feed_url = Url.find_or_create(zeeguu.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.db.session,
                                  feed_url, feed_name, description,
                                  icon_url, language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
print(rss_feed.image_url.as_string())

#!/usr/bin/env python

from zeeguu.model import RSSFeed, Url, Language
import zeeguu

RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu.db.session

all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title == name:
        print("Updating ... " + name)

        feed.title = input(f'Title ({feed.title}): ') or feed.title
        print(f'new title is: {feed.title}')

        _image_url = input('Icon file: ')
        feed.image_url = Url.find_or_create(session, RESOURCES_FOLDER + _image_url)
        print('new image url: ' + feed.image_url.as_string())

        session.add(feed)
        session.commit()

def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise a lot of time would be
    wasted trying to retrieve the same articles over and over, especially
    the ones which can't be retrieved and thus are never cached.
    """
    zeeguu.log(feed)
    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        zeeguu.log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    for feed_item in feed.feed_items():

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # `url` is not bound when the redirect chain fails,
            # so log the original feed url instead
            zeeguu.log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(feed_item['published'], SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            zeeguu.log(f"can't get time from {url}: {feed_item['published']}")
            continue

        if last_retrieval_time_from_DB:
            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        art = model.Article.find(url)

        if (not last_retrieval_time_seen_this_crawl) or (this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
        else:
            try:
                art = newspaper.Article(url)
                art.download()
                art.parse()

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(art, skipped_due_to_low_quality)
                if quality_article:
                    # Create a new article and save it to the DB
                    new_article = zeeguu.model.Article(
                        Url.find_or_create(session, url),
                        title,
                        ', '.join(art.authors),
                        cleaned_up_text,
                        summary,
                        this_article_time,
                        feed,
                        feed.language
                    )
                    session.add(new_article)
                    session.commit()
                    downloaded += 1

                    add_topics(new_article, session)
                    add_searches(title, url, new_article, session)

                    try:
                        session.commit()
                    except Exception as e:
                        zeeguu.log(f'{LOG_CONTEXT}: Something went wrong when committing words/topic to article: {e}')

            except Exception:
                # raise e
                import sys
                ex = sys.exc_info()[0]
                zeeguu.log(f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(ex)}")

    zeeguu.log(f' Skipped due to time: {skipped} ')
    zeeguu.log(f' Downloaded: {downloaded}')
    zeeguu.log(f' Low Quality: {skipped_due_to_low_quality}')
    zeeguu.log(f' Already in DB: {skipped_already_in_db}')

    if last_retrieval_time_seen_this_crawl:
        feed.last_crawled_time = last_retrieval_time_seen_this_crawl
        session.add(feed)
        session.commit()

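# A sketch of how the crawler above is typically driven: iterate over all
# feeds and let download_from_feed do its bookkeeping. The session and the
# RSSFeed query come from the other snippets in this section; the limit of
# 10 articles per feed is just an illustrative choice.

import zeeguu
from zeeguu.model import RSSFeed

if __name__ == '__main__':
    session = zeeguu.db.session
    for feed in RSSFeed.query.all():
        download_from_feed(feed, session, limit=10)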