def find_or_create(cls, session, user: User, _url, _title: str, _language):
    """
    Create a new object and add it to the db if it's not already there;
    otherwise retrieve the existing object.

    In case of creation, the created object is incomplete.

    :param session: DB session used to persist a newly created object
    :param user: the User the object belongs to
    :param _url: url string; resolved via Url.find_or_create
    :param _title: title passed on to Url / used at creation time
    :param _language: language identifier resolved via Language.find
    :return: the existing or freshly created object
    """
    language = Language.find(_language)
    url = Url.find_or_create(session, _url, _title)

    try:
        return cls.query.filter_by(user=user, url=url).one()
    except NoResultFound:
        try:
            new = cls(user, url, _title, language)
            session.add(new)
            session.commit()
            return new
        except Exception:
            # Most likely a unique-constraint violation: another transaction
            # inserted the same (user, url) row between our failed lookup and
            # our commit. Roll back and return the row the winner created.
            print("seems we avoided a race condition")
            session.rollback()
            return cls.query.filter_by(user=user, url=url).one()
def test_find_or_create_works(self):
    """find_or_create returns a Url whose title matches the one passed in."""
    expected_title = self.url_rule.url.title

    found = Url.find_or_create(session, self.url_rule.url.as_string(), expected_title)

    self.assertEqual(found.title, expected_title)
def __init__(self):
    super().__init__()

    # Base fixture object, exposed under both names.
    self.rss_feed = self._create_model_object()
    self.feed = self.rss_feed
    self.save(self.rss_feed)

    def _make_feed(feed_url, image_url, lang_code):
        # Build and persist one RSSFeed from its url/icon/language triplet.
        language = Language.find_or_create(lang_code)
        url = Url.find_or_create(self.db.session, feed_url)
        icon = Url.find_or_create(self.db.session, image_url)
        feed = RSSFeed.find_or_create(self.db.session, url, "", "",
                                      image_url=icon, language=language)
        self.save(feed)
        return feed

    self.feed1 = _make_feed(URL_OF_FEED_ONE, IMG_URL_OF_FEED_ONE, LANG_OF_FEED_ONE)
    self.feed2 = _make_feed(URL_OF_FEED_TWO, IMG_URL_OF_FEED_TWO, LANG_OF_FEED_TWO)
def download_from_starrred_article(starArticle: StarredArticle, session):
    """
    Download, parse, and persist the article behind a starred article.

    Session is needed because this saves stuff to the DB.

    :param starArticle: StarredArticle whose url is to be fetched
    :param session: DB session used to persist the new Article
    """
    url = str(starArticle.url)

    findart = model.Article.find(url)
    if findart:
        # Guard clause: nothing to do if the article is already stored.
        print(f"Already in the DB: {findart}")
        return

    try:
        art = watchmen.article_parser.get_article(url)
        title = art.title
        summary = art.summary
        word_count = len(art.text.split(" "))

        if word_count < 10:
            zeeguu.log_n_print(
                f" {LOG_CONTEXT}: Can't find text for: {url}")
        elif word_count < Article.MINIMUM_WORD_COUNT:
            zeeguu.log_n_print(
                f" {LOG_CONTEXT}: Skipped. Less than {Article.MINIMUM_WORD_COUNT} words of text. {url}"
            )
        else:
            from zeeguu.language.difficulty_estimator_factory import DifficultyEstimatorFactory

            # Create new article and save it to DB
            new_article = model.Article(
                Url.find_or_create(session, url),
                title,
                ', '.join(art.authors),
                art.text,
                summary,
                datetime.now(),
                RSSFeed.query.first(),
                starArticle.language)
            session.add(new_article)
            session.commit()
            zeeguu.log_n_print(f" {LOG_CONTEXT}: Added: {new_article}")
    except Exception:
        # BUGFIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception, same logging behavior.
        import sys
        ex = sys.exc_info()
        zeeguu.log_n_print(
            f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(ex)}"
        )
def test_try_to_get_race_condition(self):
    """Hammer find_or_create from several threads, then verify the result."""
    expected_url = self.url_rule.url.as_string()
    expected_title = self.url_rule.url.title

    def create_url_in_thread():
        Url.find_or_create(session, expected_url, expected_title)

    workers = []
    # multithreaded connections freeze on mysqldb.
    # so this is here to be tested manually and killed for now...
    # (range(0) keeps the concurrent part disabled)
    for _ in range(0):
        worker = Thread(target=create_url_in_thread, args=())
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    url = Url.find_or_create(session, expected_url, expected_title)
    self.assertEqual(url.title, expected_title)
def threaded_create_url():
    # Thread worker: resolve-or-insert the shared URL; the return value is
    # deliberately discarded (closure vars _url/_title/session from the test).
    Url.find_or_create(session, _url, _title)
# Interactively collect feed metadata, offering values probed from the live
# feed as defaults when the user just presses Enter.
# NOTE(review): rebinding `test_feed` shadows the function of the same name
# for the rest of this scope.
test_feed = test_feed(_feed_url)

feed_name = input(
    f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title
print(f'= {feed_name}')

icon_name = input(
    "Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): '
                    ) or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

# Persist the icon and feed URLs, resolve the language, then create (or
# fetch) the RSSFeed row itself.
icon_url = Url.find_or_create(zeeguu.db.session, RESOURCES_FOLDER + icon_name)
feed_url = Url.find_or_create(zeeguu.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.db.session, feed_url, feed_name, description, icon_url, language)

# Echo back what was stored so the operator can eyeball the result.
print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
print(rss_feed.image_url.as_string())
#!/usr/bin/env python

"""Interactive maintenance script: update the title and icon of an RSS feed
selected by its exact current title."""

from zeeguu.model import RSSFeed, Url, Language
import zeeguu

RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")
session = zeeguu.db.session

all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    # Guard clause: skip every feed whose title doesn't match exactly.
    if feed.title != name:
        continue

    print("Updating ... " + name)

    feed.title = input(f'Title ({feed.title}): ') or feed.title
    print(f'new title is: {feed.title}')

    _image_url = input('Icon file: ')
    feed.image_url = Url.find_or_create(session, RESOURCES_FOLDER + _image_url)
    print('new image url: ' + feed.image_url.as_string())

    session.add(feed)
    session.commit()
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Download new articles from an RSS feed and save them to the DB.

    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise there would be a lot of
    time wasted trying to retrieve the same articles, especially the ones
    which can't be retrieved, so they won't be cached.

    :param feed: the RSSFeed whose items are crawled
    :param session: DB session used to persist articles and the feed itself
    :param limit: maximum number of new articles downloaded in one run
    """
    zeeguu.log(feed)
    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        zeeguu.log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    for feed_item in feed.feed_items():

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # BUGFIX: log the raw feed URL here; `url` is still unbound when
            # _url_after_redirects raises, so logging it was a NameError.
            zeeguu.log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(feed_item['published'], SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # SystemExit / KeyboardInterrupt.
            zeeguu.log(f"can't get time from {url}: {feed_item['published']}")
            continue

        # Skip items older than what we already crawled in a previous run.
        if last_retrieval_time_from_DB:
            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        art = model.Article.find(url)

        # Track the newest publication time seen during this crawl so
        # feed.last_crawled_time can be advanced at the end.
        if (not last_retrieval_time_seen_this_crawl) or (this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
        else:
            try:
                art = newspaper.Article(url)
                art.download()
                art.parse()

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(art, skipped_due_to_low_quality)

                if quality_article:
                    from zeeguu.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                    # Create new article and save it to DB
                    new_article = zeeguu.model.Article(
                        Url.find_or_create(session, url),
                        title,
                        ', '.join(art.authors),
                        cleaned_up_text,
                        summary,
                        this_article_time,
                        feed,
                        feed.language
                    )
                    session.add(new_article)
                    session.commit()
                    downloaded += 1

                    add_topics(new_article, session)
                    add_searches(title, url, new_article, session)

                    # Second commit persists the topics/searches added above.
                    try:
                        session.commit()
                    except Exception as e:
                        zeeguu.log(f'{LOG_CONTEXT}: Something went wrong when committing words/topic to article: {e}')

            except Exception:
                # Best-effort crawl: log the failure type and move on to the
                # next feed item.
                import sys
                ex = sys.exc_info()[0]
                zeeguu.log(f" {LOG_CONTEXT}: Failed to create zeeguu.Article from {url}\n{str(ex)}")

    zeeguu.log(f' Skipped due to time: {skipped} ')
    zeeguu.log(f' Downloaded: {downloaded}')
    zeeguu.log(f' Low Quality: {skipped_due_to_low_quality}')
    zeeguu.log(f' Already in DB: {skipped_already_in_db}')

    if last_retrieval_time_seen_this_crawl:
        feed.last_crawled_time = last_retrieval_time_seen_this_crawl
        session.add(feed)
        session.commit()