def from_url(cls, url: str):
    """
    Build an RSSFeed from the feed found at the given url.

    The url is parsed with feedparser, and the title, the description
    (read from the feed's ``subtitle``) and the image url are taken from
    the parsed feed. Each value falls back to a default when it cannot
    be read: "" for the title, None for the description and the image.

    :param url: address of the feed to parse
    :return: an RSSFeed built from the extracted metadata; the last
        constructor argument is always None
    """
    data = feedparser.parse(url)

    # Exception (rather than a bare except) keeps the best-effort
    # fallbacks while letting KeyboardInterrupt / SystemExit through.
    try:
        title = data.feed.title
    except Exception:
        title = ""

    try:
        description = data.feed.subtitle
    except Exception:
        description = None

    try:
        image_url_string = data.feed.image.href
        print(f'Found image url at: {image_url_string}')
        image_url = Url(image_url_string, title + " Icon")
    except Exception:
        print('Could not find any image url.')
        image_url = None

    feed_url = Url(url, title)
    return RSSFeed(feed_url, title, description, image_url, None)
def start_following_feed():
    """
    Start following a feed for which the client provides all the
    metadata. This is useful for the cases where badly formed feeds
    can't be parsed by feedparser.

    The metadata is read from the form parameter :feed_info:, a JSON
    object from which the keys "image", "language", "url", "title" and
    "description" are read.

    :return: the string "OK"
    """
    # json.loads takes its options as keyword arguments only on
    # Python 3.6+ and has no ``encoding`` option from 3.9 on, so only
    # the payload is passed.
    feed_info = json.loads(request.form.get('feed_info', ''))

    image_url = feed_info["image"]
    language = Language.find(feed_info["language"])
    url_string = feed_info["url"]
    title = feed_info["title"]
    description = feed_info["description"]

    url = Url.find(url_string)
    zeeguu.db.session.add(url)
    # Important to commit this url first; otherwise we end up creating
    # two domains with the same name for both the urls...
    zeeguu.db.session.commit()

    feed_image_url = Url.find(image_url)

    feed_object = RSSFeed.find_or_create(url, title, description,
                                         feed_image_url, language)
    feed_registration = RSSFeedRegistration.find_or_create(
        flask.g.user, feed_object)

    zeeguu.db.session.add_all([feed_image_url, feed_object,
                               feed_registration])
    zeeguu.db.session.commit()

    return "OK"
def set_default_exercise_based_prob():
    """
    Recompute the path and the domain of every stored Url.

    A test request context is pushed first, then each Url returned by
    ``Url.query.all()`` gets its ``path`` and ``domain`` re-derived from
    its ``url`` attribute and is committed together with its DomainName.
    """
    zeeguu.app.test_request_context().push()

    session = zeeguu.db.session
    session.commit()

    for stored_url in Url.query.all():
        stored_url.path = Url.get_path(stored_url.url)

        domain = DomainName.find(Url.get_domain(stored_url.url))
        stored_url.domain = domain

        session.add_all([stored_url, domain])
        session.commit()
def start_following_feeds():
    """
    A user can start following multiple feeds at once.

    The feeds are passed as the post parameter :feeds:
    which contains a json list with URLs for the feeds to be followed.

    For every URL the feed is parsed with feedparser; if no RSSFeed is
    found for that URL yet, one is created from the parsed metadata.
    The current user (``flask.g.user``) is then registered to the feed.

    :return: the string "OK"
    """
    json_array_with_feeds = json.loads(request.form.get('feeds', ''))

    for urlString in json_array_with_feeds:
        feed = feedparser.parse(urlString).feed

        feed_image_url_string = ""
        if "image" in feed:
            feed_image_url_string = feed.image["href"]

        lan = None
        if "language" in feed:
            lan = Language.find(two_letter_language_code(feed))

        url = Url.find(urlString)
        zeeguu.db.session.add(url)
        # Important to commit this url first; otherwise we end up creating
        # two domains with the same name for both the urls...
        zeeguu.db.session.commit()

        feed_object = RSSFeed.find_by_url(url)
        if not feed_object:
            feed_image_url = Url.find(feed_image_url_string)

            # Fall back to the url *string* as title; the Url object
            # itself is not a usable title value.
            title = feed.title if "title" in feed else urlString

            # The description is optional just like the other feed
            # fields, so a feed without one must not abort the request.
            description = feed.get("description")

            feed_object = RSSFeed.find_or_create(url, title, description,
                                                 feed_image_url, lan)
            zeeguu.db.session.add_all([feed_image_url, feed_object])
            zeeguu.db.session.commit()

        feed_registration = RSSFeedRegistration.find_or_create(
            flask.g.user, feed_object)
        zeeguu.db.session.add(feed_registration)
        zeeguu.db.session.commit()

    return "OK"
def _create_model_object(self):
    """
    Produce a Url from a random uri and a random sentence as title.

    Generation is repeated for as long as ``self._exists_in_db``
    reports the produced Url as already present.

    :return: the first produced Url for which ``_exists_in_db`` is falsy
    """
    while True:
        address = self.faker.uri()
        heading = self.faker.sentence()
        candidate = Url.find_or_create(self.db.session, address, heading)

        if not self._exists_in_db(candidate):
            return candidate
def test_feed_items(self):
    """
    Check that the first item of the Bild.de home feed has a truthy
    "published" entry.
    """
    feed_address = Url(
        "http://www.bild.de/rss-feeds/rss-16725492,feed=home.bild.html",
        "Build",
    )
    bild_feed = RSSFeed(
        feed_address,
        "Bild.de Home",
        "build",
        image_url=None,
        language=None,
    )

    first_item = bild_feed.feed_items()[0]
    assert first_item["published"]
def find(cls, url: str):
    """
    Look up the single instance of this class whose ``url`` equals the
    Url object obtained from ``Url.find(url)``.

    :param url: the url, as a string
    :return: the matching object, or None when NoResultFound is raised
    """
    from zeeguu.model import Url

    try:
        wanted = Url.find(url)
        matches = cls.query.filter(cls.url == wanted)
        return matches.one()
    except NoResultFound:
        return None
def test_same_text_does_not_get_created_multiple_Times(self):
    """
    Post the same translation request twice and check that
    ``Text.find_or_create`` yields equal objects after each post.
    """
    context = u'Die kleine Jägermeister'

    with zeeguu.app.app_context():
        story_url = Url.find('http://mir.lu/stories/german/jagermeister',
                             "Die Kleine Jagermeister (Mircea's Stories)")
        german = Language.find('de')
        payload = {
            "url": story_url.as_string(),
            "context": context,
            "word": "Die",
        }

        def bookmark_then_lookup():
            # One round: post the bookmark request, then fetch the text.
            self.api_post('/translate_and_bookmark/de/en', payload)
            return Text.find_or_create(context, german, story_url)

        first_text = bookmark_then_lookup()
        second_text = bookmark_then_lookup()

        assert (first_text == second_text)
def find_or_create(cls, session, user, _origin: str, _origin_lang: str,
                   _translation: str, _translation_lang: str,
                   _context: str, _url: str, _url_title: str,
                   article_id: int):
    """
    If the bookmark does not exist, create it and return it.
    If it exists, ** update the translation ** and return the
    bookmark object.

    :param session: database session used for lookups, add and commit
    :param user: the user owning the bookmark
    :param _origin: the bookmarked word
    :param _origin_lang: language of the word and of the context
    :param _translation: the translation of the word
    :param _translation_lang: language of the translation
    :param _context: the text in which the word was found
    :param _url: not used here; the url is taken from the article
        identified by article_id
    :param _url_title: title passed on to ``Url.find_or_create``
    :param article_id: id of the article the context belongs to; the
        lookup uses ``.one()`` on the filtered query
    :return: the found or newly created bookmark, after it has been
        added to the session and committed
    """
    origin_lang = Language.find_or_create(_origin_lang)
    translation_lang = Language.find_or_create(_translation_lang)

    origin = UserWord.find_or_create(session, _origin, origin_lang)

    article = Article.query.filter_by(id=article_id).one()
    url = Url.find_or_create(session, article.url.as_string(), _url_title)

    context = Text.find_or_create(session, _context, origin_lang, url,
                                  article)

    translation = UserWord.find_or_create(session, _translation,
                                          translation_lang)

    now = datetime.now()

    try:
        # try to find this bookmark
        bookmark = Bookmark.find_by_user_word_and_text(user, origin, context)

        # update the translation
        bookmark.translation = translation
    except sqlalchemy.orm.exc.NoResultFound:
        bookmark = cls(origin, translation, user, context, now)
    # Any other exception propagates unchanged; no catch-and-reraise
    # handler is needed for that.

    session.add(bookmark)
    session.commit()

    return bookmark
def test_url_domain(self):
    """
    Check that ``Url.domain_name()`` yields the scheme-plus-host prefix
    of a random url, e.g. 'https://google.com' for
    'https://google.com/search'.
    """
    full_url = UrlRule().url.as_string()

    # Expected value: everything before '//' plus the part after it, up
    # to the first '/'.
    pieces = full_url.split('//', 1)
    scheme = pieces[0]
    host = pieces[1].split('/', 1)[0]
    domain_should_be = scheme + '//' + host

    url_under_test = Url(full_url, self.faker.word())
    domain_to_check = url_under_test.domain_name()

    assert domain_to_check == domain_should_be, (
        domain_should_be + " should be " + domain_to_check)
def add_bookmark(user, original_language, original_word,
                 translation_language, translation_word, date,
                 the_context, the_url, the_url_title):
    """
    Create and commit a bookmark, then update word probabilities.

    The url, the text, and the lower-cased original and translated words
    are looked up and added to ``zeeguu.db.session`` together with a new
    Bookmark. After the commit,
    ``add_probability_to_existing_words_of_user`` is called with the
    user, the new bookmark and the original language.

    Note that ``the_url_title`` is not used by this function.
    """
    url = Url.find(the_url)
    text = Text.find_or_create(the_context, translation_language, url)
    origin = UserWord.find(original_word.lower(), original_language)
    translation = UserWord.find(translation_word.lower(),
                                translation_language)

    session = zeeguu.db.session
    session.add_all([url, text, origin, translation])

    new_bookmark = Bookmark(origin, translation, user, text, date)
    session.add(new_bookmark)
    session.commit()

    add_probability_to_existing_words_of_user(user, new_bookmark,
                                              original_language)
def bookmark_with_context(from_lang_code, to_lang_code, word_str, url_str,
                          title_str, context_str, translation_str):
    """
    Look up the bookmark for a given word-text pair of the current user
    (``flask.g.user``); if one is found it is reused rather than a new
    one being created.

    :param from_lang_code: code of the language of word and context
    :param to_lang_code: code of the language of the translation
    :param word_str: the bookmarked word
    :param url_str: url where the context was found
    :param title_str: title passed along when looking up the url
    :param context_str: the text surrounding the word
    :param translation_str: the translation of the word
    :return: the id of the bookmark, as a string
    """
    source_language = Language.find(from_lang_code)
    target_language = Language.find(to_lang_code)

    origin_word = UserWord.find(word_str, source_language)

    page_url = Url.find(url_str, title_str)
    zeeguu.db.session.add(page_url)
    zeeguu.db.session.commit()

    text = Text.find_or_create(context_str, source_language, page_url)
    zeeguu.db.session.add(text)
    zeeguu.db.session.commit()

    translated_word = UserWord.find(translation_str, target_language)

    try:
        existing = Bookmark.find_all_by_user_word_and_text(
            flask.g.user, origin_word, text)
        bookmark = existing[0]
        # TODO: Think about updating the date of this bookmark, or maybe
        # creating a duplicate; otherwise, in the history this
        # translation will not be visible!
    except Exception:
        # No usable existing bookmark: build a fresh one.
        bookmark = Bookmark(origin_word, translated_word, flask.g.user,
                            text, datetime.now())

    zeeguu.db.session.add(bookmark)
    bookmark.calculate_probabilities_after_adding_a_bookmark(
        flask.g.user, bookmark.origin.language)
    zeeguu.db.session.commit()

    return str(bookmark.id)
def add_bookmark(db, user, original_language, original_word,
                 translation_language, translation_word, date,
                 the_context, the_url, the_url_title):
    """
    Create a bookmark, add it to ``db.session`` and commit.

    The url, text, original word and translated word are obtained with
    the respective ``find_or_create`` calls on the same session.

    :return: the newly created Bookmark
    """
    session = db.session

    page_url = Url.find_or_create(session, the_url, the_url_title)
    context_text = Text.find_or_create(session, the_context,
                                       translation_language, page_url)
    origin_word = UserWord.find_or_create(session, original_word,
                                          original_language)
    translated_word = UserWord.find_or_create(session, translation_word,
                                              translation_language)

    bookmark = Bookmark(origin_word, translated_word, user, context_text,
                        date)
    session.add(bookmark)
    session.commit()

    return bookmark
def bookmark_with_context(from_lang_code, to_lang_code, word_str, url_str,
                          title_str, context_str, translation_str):
    """
    Find the current user's bookmark for a word-text pair and return its
    id; a new bookmark is created only when the lookup does not produce
    one.

    :param from_lang_code: code of the language of word and context
    :param to_lang_code: code of the language of the translation
    :param word_str: the bookmarked word
    :param url_str: url where the context was found
    :param title_str: title passed along when looking up the url
    :param context_str: the text surrounding the word
    :param translation_str: the translation of the word
    :return: the id of the bookmark, as a string
    """
    def save(entity):
        # Add a single object to the session and commit right away.
        zeeguu.db.session.add(entity)
        zeeguu.db.session.commit()

    from_lang = Language.find(from_lang_code)
    to_lang = Language.find(to_lang_code)
    user_word = UserWord.find(word_str, from_lang)

    url = Url.find(url_str, title_str)
    save(url)

    context = Text.find_or_create(context_str, from_lang, url)
    save(context)

    translation = UserWord.find(translation_str, to_lang)

    try:
        bookmark = Bookmark.find_all_by_user_word_and_text(
            flask.g.user, user_word, context)[0]
        # TODO: Think about updating the date of this bookmark, or maybe
        # creating a duplicate; otherwise, in the history this
        # translation will not be visible!
    except Exception:
        created_at = datetime.now()
        bookmark = Bookmark(user_word, translation, flask.g.user, context,
                            created_at)

    zeeguu.db.session.add(bookmark)
    bookmark.calculate_probabilities_after_adding_a_bookmark(
        flask.g.user, bookmark.origin.language)
    zeeguu.db.session.commit()

    return str(bookmark.id)
def test_one_domain_multiple_urls(self):
    """
    Tests that if multiple URLs are added to the database,
    their DomainName is not added to the database more than once.
    """
    # The 'original' URL that all further URLs are derived from.
    origin_url = UrlRule().url

    # Build a random number (0..10) of URLs that extend the original
    # one, so that they all share its domain.
    how_many = random.randint(0, 10)
    for _ in range(how_many):
        extended_address = origin_url.as_string() + self.faker.word()
        Url(extended_address, self.faker.word())

    domain = origin_url.domain_name()

    try:
        found = DomainName.find(domain)
    except NoResultFound:
        assert False, "No domains found in database"
    except MultipleResultsFound:
        assert False, "There were multiple DomainNames in the database"
    else:
        assert found
def find_or_create(cls, session, _url: str, language=None, sleep_a_bit=False):
    """
    Return the article for the given url. If not found, download and
    extract all the required info for this article and store it.

    :param session: database session used to add and commit the article
    :param _url: url of the article; it is canonicalized with
        ``Url.extract_canonical_url`` before anything else happens
    :param language: language of the article; when falsy it is taken
        from the downloaded article's ``meta_lang``, which in turn is
        filled in with ``detect`` when empty
    :param sleep_a_bit: when true, sleep between 3 and 33 seconds after
        the download
    :return: the found or newly created article. After a database error
        the session is rolled back and the lookup is retried up to ten
        times; None is returned if every retry fails
    :raises Exception: when the downloaded article has no text
    """
    # ``time`` is needed on the retry path regardless of sleep_a_bit, so
    # it is imported unconditionally.
    import time

    from zeeguu.model import Url, Article, Language
    import newspaper

    url = Url.extract_canonical_url(_url)

    try:
        found = cls.find(url)
        if found:
            return found

        art = newspaper.Article(url=url)
        art.download()
        art.parse()

        if art.text == '':
            raise Exception("Newspaper got empty article from: " + url)

        if sleep_a_bit:
            from random import randint

            print("GOT: " + url)
            sleep_time = randint(3, 33)
            print(f"sleeping for {sleep_time}s... so we don't annoy our friendly servers")
            time.sleep(sleep_time)

        if not language:
            if art.meta_lang == '':
                art.meta_lang = detect(art.text)
                zeeguu.log(f"langdetect: {art.meta_lang} for {url}")
            language = Language.find_or_create(art.meta_lang)

        # Create new article and save it to DB
        url_object = Url.find_or_create(session, url)

        new_article = Article(
            url_object,
            art.title,
            ', '.join(art.authors),
            art.text[0:32000],  # any article longer than this will be truncated...
            art.summary,
            None,
            None,
            language
        )
        session.add(new_article)
        session.commit()

        return new_article

    # A tuple is required here: ``A or B`` evaluates to ``A`` alone, so
    # DatabaseError would otherwise never be caught.
    except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
        for i in range(10):
            try:
                session.rollback()
                u = cls.find(url)
                print("Found article by url after recovering from race")
                return u
            except Exception:
                print("Exception of second degree in article..." + str(i))
                time.sleep(0.3)

        # All retries failed.
        return None