def find_url_in_extra_data(self):
    """
    DB structure is a mess!
    There is no convention where the url associated with an event is.
    Thus we need to look for it in different places.

    NOTE: This can be solved by creating a new column called url and
    writing the url only there.

    returns: url if found or None otherwise
    """
    if self.extra_data and self.extra_data != "{}" and self.extra_data != "null":
        try:
            extra_event_data = json.loads(self.extra_data)

            if "articleURL" in extra_event_data:
                url = extra_event_data["articleURL"]
            elif "url" in extra_event_data:
                url = extra_event_data["url"]
            else:
                # There is no url
                return None

            return Url.extract_canonical_url(url)
        except Exception:
            # Some json strings are truncated, and other times extra_event_data
            # is an int; in those cases parsing fails and throws an exception
            return None
    else:
        # The extra_data field is empty
        return None
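A minimal sketch of the three extra_data shapes this method has to cope with; the helper below is a hypothetical stand-in that mirrors the lookup order (articleURL first, then url) without the database or the Url model.

import json

def _url_from_extra_data(extra_data):
    # Hypothetical stand-in mirroring the lookup above, minus canonicalisation.
    if extra_data and extra_data not in ("{}", "null"):
        try:
            parsed = json.loads(extra_data)
            if "articleURL" in parsed:
                return parsed["articleURL"]
            if "url" in parsed:
                return parsed["url"]
        except Exception:
            # truncated json, or a payload that is not a dict
            return None
    return None

assert _url_from_extra_data('{"articleURL": "https://example.com/a"}') == "https://example.com/a"
assert _url_from_extra_data('{"url": "https://example.com/a"}') == "https://example.com/a"
assert _url_from_extra_data('{"articleURL": "https://exam') is None  # truncated payload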
def find_or_create(cls, session, user: User, _url, _title: str, _language):
    """
    Create a new object and add it to the db if it's not already there;
    otherwise retrieve the existing object and update it.
    In case of creation, the created object is incomplete.
    """
    language = Language.find(_language)
    url = Url.find_or_create(session, _url, _title)

    try:
        return cls.query.filter_by(user=user, url=url).one()
    except NoResultFound:
        try:
            new = cls(user, url, _title, language)
            session.add(new)
            session.commit()
            return new
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)
            print("seems we avoided a race condition")
            session.rollback()
            return cls.query.filter_by(user=user, url=url).one()
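A usage sketch of the create-or-retrieve behaviour, assuming a configured session and an existing user; OwningModel is a placeholder name for whichever class defines this classmethod.

# Hypothetical usage; OwningModel, session and user are assumed to exist.
first = OwningModel.find_or_create(session, user, "https://example.com/a", "An article", "de")
again = OwningModel.find_or_create(session, user, "https://example.com/a", "An article", "de")
assert first.id == again.id  # the second call retrieves rather than duplicates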
def test_find_or_create_works(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    url = Url.find_or_create(session, _url, _title)

    self.assertEqual(url.title, _title)
def delete(cls, session, user, _url):
    try:
        url = Url.find(_url)
        item = cls.query.filter_by(user=user, url=url).one()
        session.delete(item)
        session.commit()
    except Exception as e:
        from sentry_sdk import capture_exception

        capture_exception(e)
def _localized_topic_keyword_in_url(self, topic: str, localized: str, keyword: str, url: str):
    topic = Topic(topic)
    localized_topic = LocalizedTopic(topic, self.user.learned_language, localized)
    localized_topic.keywords = keyword

    article = ArticleRule().article
    url = Url.find_or_create(self.db.session, url)
    article.url = url

    assert localized_topic.matches_article(article)
def __init__(self):
    super().__init__()

    self.rss_feed = self._create_model_object()
    self.feed = self.rss_feed
    self.save(self.rss_feed)

    lang1 = Language.find_or_create('de')
    url = Url.find_or_create(self.db.session, url_spiegel_rss)

    self.feed1 = RSSFeed.find_or_create(
        self.db.session, url, "", "", icon_name_spiegel, language=lang1
    )
    self.save(self.feed1)
def test_domain_plus_path_must_be_unique(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title
    _domain = DomainName.get_domain(_url)

    with self.assertRaises(Exception) as context:
        domain = DomainName.find(_domain)
        url = Url(_url, _title, domain)
        session.add(url)
        session.commit()

    self.assertTrue(
        'Duplicate entry' in str(context.exception)
        or 'IntegrityError' in str(context.exception)
    )
def upload_articles(cohort_id):
    """
    uploads articles for a cohort with input from a POST request
    """
    check_permission_for_cohort(cohort_id)

    try:
        for article_data in json.loads(request.data):
            url = Url("userarticle/{}".format(uuid.uuid4().hex))
            title = article_data["title"]
            authors = article_data["authors"]
            content = article_data["content"]
            summary = article_data["summary"]
            published_time = datetime.now()
            language_code = article_data["language_code"]
            language = Language.find(language_code)

            new_article = Article(
                url,
                title,
                authors,
                content,
                summary,
                published_time,
                None,  # rss feed
                language,
            )

            db.session.add(new_article)
            db.session.flush()
            db.session.refresh(new_article)

            cohort = Cohort.find(cohort_id)
            now = datetime.now()
            new_cohort_article_map = CohortArticleMap(cohort, new_article, now)
            db.session.add(new_cohort_article_map)

        db.session.commit()
        return "OK"
    except ValueError:
        flask.abort(400)
        return "ValueError"
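For reference, the endpoint expects the POST body to be a JSON list of article objects with the keys read above; the values here are illustrative only.

# Illustrative request body for upload_articles; keys match the fields read
# above, values are made up.
example_payload = [
    {
        "title": "Die Überschrift",
        "authors": "A. Author",
        "content": "Full text of the article ...",
        "summary": "A short summary ...",
        "language_code": "de",
    }
]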
def test_try_to_get_race_condition(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    def threaded_create_url():
        url = Url.find_or_create(session, _url, _title)

    threads = []

    # multithreaded connections freeze on mysqldb.
    # so this is here to be tested manually and killed for now...
    for i in range(0):
        t = Thread(target=threaded_create_url, args=())
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    url = Url.find_or_create(session, _url, _title)
    self.assertEqual(url.title, _title)
feed_name = input(f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title
print(f'= {feed_name}')

icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): ') or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu.core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(
    zeeguu.core.db.session,
    feed_url,
    feed_name,
    description,
    icon_name=icon_name,
    language=language,
)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item["url"])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}"
        )

    title = feed_item["title"]
    published_datetime = feed_item["published_datetime"]

    try:
        art = model.Article.find(url)
    except Exception:
        import sys

        ex = sys.exc_info()[0]
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}"
        )

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)
        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item["summary"]
        # however, this is not so easy... there have been cases where
        # the summary is just malformed HTML... thus we try to extract
        # the text:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create new article and save it to DB
        new_article = zeeguu.core.model.Article(
            Url.find_or_create(session, url),
            title,
            ", ".join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language,
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        from sentry_sdk import capture_exception

        capture_exception(e)
        log(
            f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
        )
        session.rollback()

    return new_article
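The feed_item argument is expected to be a dict with at least the keys accessed above; a hypothetical example call, assuming a session and an RSSFeed instance are already available:

from datetime import datetime

# Hypothetical feed_item; keys match the lookups in download_feed_item,
# values are illustrative. `session` and `feed` are assumed to exist.
feed_item = {
    "url": "https://www.spiegel.de/some-article.html",
    "title": "Ein Artikel",
    "published_datetime": datetime(2021, 5, 1, 12, 0),
    "summary": "<p>Kurze Zusammenfassung</p>",
}
new_article = download_feed_item(session, feed, feed_item)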
#!/usr/bin/env python

from zeeguu.core.model import RSSFeed, Url, Language
import zeeguu.core

RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu.core.db.session

all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title == name:
        print("Updating ... " + name)

        feed.title = input(f'Title ({feed.title}): ') or feed.title
        print(f'new title is: {feed.title}')

        _image_url = input('Icon file: ')
        feed.image_url = Url.find_or_create(session, RESOURCES_FOLDER + _image_url)
        print('new image url: ' + feed.image_url.as_string())

        session.add(feed)
        session.commit()