def _create_model_object(self):
    """Produce a Url model object that is not yet present in the database.

    Keeps generating random (uri, title) pairs until one is found that
    does not already exist in the DB, then returns that Url.
    """
    while True:
        candidate = Url.find_or_create(
            self.db.session, self.faker.uri(), self.faker.sentence()
        )
        if not self._exists_in_db(candidate):
            return candidate
def find_or_create(
    cls,
    session,
    user,
    _origin: str,
    _origin_lang: str,
    _translation: str,
    _translation_lang: str,
    _context: str,
    _url: str,
    _url_title: str,
    article_id: int,
):
    """
    Find the bookmark for (user, origin word, context) or create it.

    If the bookmark does not exist it is created; if it already exists
    its **translation is updated**. Either way the bookmark is added to
    the session, committed, and returned.

    NOTE(review): the ``_url`` argument is ignored — the URL is taken
    from the article's own url (``article.url.as_string()``) instead;
    the parameter is kept for backward compatibility with callers.

    :param _origin: the word in the original language
    :param _context: the text fragment the word occurred in
    :param _url: unused (see note above); _url_title is still used
    :return: the (new or updated) Bookmark object
    """
    origin_lang = Language.find_or_create(_origin_lang)
    translation_lang = Language.find_or_create(_translation_lang)

    origin = UserWord.find_or_create(session, _origin, origin_lang)

    article = Article.query.filter_by(id=article_id).one()

    url = Url.find_or_create(session, article.url.as_string(), _url_title)

    context = Text.find_or_create(session, _context, origin_lang, url, article)

    translation = UserWord.find_or_create(session, _translation, translation_lang)

    now = datetime.now()

    try:
        # Bookmark already exists: only refresh its translation.
        bookmark = Bookmark.find_by_user_word_and_text(user, origin, context)
        bookmark.translation = translation
    except sqlalchemy.orm.exc.NoResultFound:
        # Not found: create a fresh bookmark stamped with the current time.
        bookmark = cls(origin, translation, user, context, now)
    # FIX: removed the dead `except Exception as e: raise e` clause — it
    # re-raised the exception unchanged and served no purpose.

    session.add(bookmark)
    session.commit()

    return bookmark
def find(cls, url: str):
    """
    Look up a record by its URL string.

    :param url: the URL to search for
    :return: the matching object, or None when nothing is found
    """
    from zeeguu_core.model import Url

    try:
        matching_url = Url.find(url)
        return cls.query.filter(cls.url == matching_url).one()
    except NoResultFound:
        return None
def test_url_domain(self):
    """Tests the correct retrieval of a domain from a random url

    e.g. 'https://google.com' should be retrieved from
    e.g. 'https://google.com/search'
    """
    url_random = UrlRule().url.as_string()

    # Expected domain = scheme + '//' + host, i.e. everything before
    # the first path segment.
    scheme, remainder = url_random.split('//', 1)
    domain_should_be = scheme + '//' + remainder.split('/', 1)[0]

    domain_to_check = Url(url_random, self.faker.word()).domain_name()

    # FIX: the original failure message had expected/actual reversed
    # ("<expected> should be <actual>"); report actual vs expected.
    assert domain_to_check == domain_should_be, (
        domain_to_check + " should be " + domain_should_be
    )
def from_url(cls, url: str):
    """
    Build an RSSFeed object from the feed found at `url`.

    Missing feed fields degrade gracefully: title falls back to "",
    description to None, and a missing image is only logged.

    :param url: the address of the RSS feed
    :return: a new RSSFeed instance
    """
    data = feedparser.parse(url)

    # FIX: the bare `except:` clauses below also swallowed SystemExit /
    # KeyboardInterrupt; feedparser's FeedParserDict raises AttributeError
    # for missing fields, so catch exactly that.
    try:
        title = data.feed.title
    except AttributeError:
        title = ""

    try:
        description = data.feed.subtitle
    except AttributeError:
        description = None

    try:
        image_url_string = data.feed.image.href
        print(f'Found image url at: {image_url_string}')
    except AttributeError:
        print('Could not find any image url.')

    feed_url = Url(url, title)

    return RSSFeed(feed_url, title, description)
def test_one_domain_multiple_urls(self):
    """
    Adding several URLs that share a domain must not create
    duplicate DomainName rows in the database.
    """
    # A base URL, which gets saved to the database by the rule.
    base_url = UrlRule().url

    # Derive a random number of URLs from it, all on the same domain.
    extra_count = random.randint(0, 10)
    for _ in range(extra_count):
        extended_url = base_url.as_string() + self.faker.word()
        Url(extended_url, self.faker.word())

    domain = base_url.domain_name()
    try:
        assert DomainName.find(domain)
    except NoResultFound:
        assert False, "No domains found in database"
    except MultipleResultsFound:
        assert False, "There were multiple DomainNames in the database"
def find_or_create(cls, session, _url: str, language=None, sleep_a_bit=False):
    """
    Find an Article by its canonical URL, or download, parse and save it.

    :param _url: the article URL; canonicalized before lookup
    :param language: optional Language; when omitted it is detected
                     from the downloaded text
    :param sleep_a_bit: when True, sleep a random 3-33s after the
                        download so the source server isn't hammered
    :return: the existing or newly created Article
    """
    from zeeguu_core.model import Url, Article, Language
    import newspaper

    url = Url.extract_canonical_url(_url)

    try:
        found = cls.find(url)
        if found:
            return found

        art = newspaper.Article(url=url)
        art.download()
        art.parse()

        if art.text == '':
            raise Exception("Newspaper got empty article from: " + url)

        if sleep_a_bit:
            import time
            from random import randint

            print("GOT: " + url)
            sleep_time = randint(3, 33)
            print(
                f"sleeping for {sleep_time}s... so we don't annoy our friendly servers"
            )
            time.sleep(sleep_time)

        if not language:
            if art.meta_lang == '':
                art.meta_lang = detect(art.text)
                zeeguu_core.log(f"langdetect: {art.meta_lang} for {url}")
            language = Language.find_or_create(art.meta_lang)

        # Create new article and save it to DB
        url_object = Url.find_or_create(session, url)

        new_article = Article(
            url_object,
            art.title,
            ', '.join(art.authors),
            art.text[0:32000],  # any article longer than this will be truncated...
            art.summary,
            None,
            None,
            language,
        )
        session.add(new_article)
        session.commit()

        return new_article

    # FIX: the original `except A or B:` caught only IntegrityError,
    # because `or` evaluates to its first truthy operand; a tuple is
    # required to catch both exception types.
    except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
        # FIX: `time` was only imported inside the sleep_a_bit branch,
        # so the recovery path raised NameError when sleep_a_bit=False.
        import time

        # Another writer likely inserted the same article first: roll
        # back and retry the lookup a few times before giving up.
        for i in range(10):
            try:
                session.rollback()
                u = cls.find(url)
                print("Found article by url after recovering from race")
                return u
            except Exception:  # narrowed from a bare `except:`
                print("Exception of second degree in article..." + str(i))
                time.sleep(0.3)