def user_article():
    """
    Called user_article because it returns info about the article,
    but also the user-specific data relative to the article.

    Takes url as a URL argument.
    NOTE: the url should be encoded with quote_plus (Python)
    or encodeURIComponent (JavaScript).

    This is not perfectly RESTful, but we're not fundamentalist...
    Currently we want to have the url as the URI for the article,
    and for some reason, if we put the url as part of the path,
    Apache decodes it before we get it in here. So for now,
    we're just not putting it as part of the path.

    :return: json as prepared by
        content_recommender.mixed_recommender.user_article_info
    """
    url = request.args.get('url', '')
    if not url:
        flask.abort(400)

    article = Article.find_or_create(db_session, url)

    return json_result(
        user_article_info(flask.g.user, article, with_content=True))
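# A minimal client-side sketch of calling the endpoint above. The host and
# route below are hypothetical (the route decorator is not shown here), but
# the quote_plus encoding is exactly what the docstring asks for.
from urllib.parse import quote_plus
import requests

endpoint = "https://zeeguu.example/user_article"  # hypothetical host + path
article_url = "https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862"

response = requests.get(endpoint + "?url=" + quote_plus(article_url))
print(response.json())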
def user_article_update():
    """
    Update info about this (user x article) pair.
    In the form data you can provide:
    - liked = True|1|False|0
    - starred -ibidem-

    :return: "OK" on success
    """
    url = request.form.get('url')
    starred = request.form.get('starred')
    liked = request.form.get('liked')

    article = Article.find_or_create(db_session, url)
    user_article = UserArticle.find_or_create(db_session, flask.g.user, article)

    if starred is not None:
        user_article.set_starred(starred in ["True", "1"])

    if liked is not None:
        user_article.set_liked(liked in ["True", "1"])

    db_session.commit()

    return "OK"
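# A matching sketch for the update endpoint, again with a hypothetical host
# and route. liked/starred are sent as form strings, matching the values the
# handler checks for ("True"/"1").
import requests

response = requests.post(
    "https://zeeguu.example/user_article_update",  # hypothetical host + path
    data={"url": "https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862",
          "liked": "1",
          "starred": "False"})
assert response.text == "OK"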
def user_article_info(user: User, article: Article,
                      with_content=False, with_translations=True):
    from zeeguu.model import UserArticle, Bookmark

    prior_info = UserArticle.find(user, article)

    ua_info = article.article_info(with_content=with_content)

    if not prior_info:
        ua_info['starred'] = False
        ua_info['opened'] = False
        ua_info['liked'] = False
        ua_info['translations'] = []
        return ua_info

    # starred and opened store timestamps, so presence means True
    ua_info['starred'] = prior_info.starred is not None
    ua_info['opened'] = prior_info.opened is not None
    ua_info['liked'] = prior_info.liked

    if with_translations:
        translations = Bookmark.find_all_for_user_and_url(user, article.url)
        ua_info['translations'] = [
            each.serializable_dictionary() for each in translations
        ]

    return ua_info
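# For illustration: the shape of the dictionary returned above for a user
# with no prior interaction with the article. The keys contributed by
# article.article_info() are assumed and elided here; only the four keys
# set explicitly in user_article_info are shown.
example_info = {
    # ... fields produced by article.article_info() ...
    'starred': False,
    'opened': False,
    'liked': False,
    'translations': [],
}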
def __init__(self, real=False):
    super().__init__()

    if real:
        self.article = Article.find_or_create(ArticleRule.db.session, TEST_URL)
    else:
        self.article = self._create_model_object()

    self.save(self.article)
def article_feedback(session, value, extra_data):
    # the url that comes from the zeeguu event logger
    # might be the zeeguu url, which is of the form:
    # https://www.zeeguu.unibe.ch/read/article?articleLanguage=de&articleURL=https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862
    # thus we extract only the last part
    url = value.split('articleURL=')[-1]

    article = Article.find_or_create(session, url)

    if "not_finished_for_broken" in extra_data:
        article.vote_broken()
        session.add(article)
        session.commit()
def article_liked(session, value, user, like_value):
    # the url that comes from the zeeguu event logger
    # might be the zeeguu url, which is of the form:
    # https://www.zeeguu.unibe.ch/read/article?articleLanguage=de&articleURL=https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862
    # thus we extract only the last part
    url = value.split('articleURL=')[-1]

    article = Article.find_or_create(session, url)

    ua = UserArticle.find(user, article)
    ua.liked = like_value
    session.add(ua)
    session.commit()

    log(f"{ua}")
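# A quick illustration of the articleURL extraction used by both
# article_feedback and article_liked above, using the example url from
# their comments:
value = ("https://www.zeeguu.unibe.ch/read/article?articleLanguage=de"
         "&articleURL=https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862")
assert value.split('articleURL=')[-1] == \
    "https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862"
# note: if 'articleURL=' is absent, split()[-1] is the whole value, unchanged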
def get_user_article_info():
    """
    Expects one parameter: url

    :return: json dictionary with info
    """
    url = str(request.form.get('url', ''))

    article = Article.find_or_create(db_session, url)

    return json_result(user_article_info(flask.g.user, article))
def _create_model_object(self):
    title = " ".join(self.faker.text().split()[:4])
    authors = self.faker.name()
    content = self.faker.text()
    summary = self.faker.text()
    published = datetime.now() - timedelta(minutes=randint(0, 7200))
    rss_feed = RSSFeedRule().feed
    language = LanguageRule().random
    url = UrlRule().url

    article = Article(url, title, authors, content, summary,
                      published, rss_feed, language)

    if self._exists_in_db(article):
        return self._create_model_object()

    return article
def add_bookmark(db, user, original_language, original_word,
                 translation_language, translation_word,
                 date, the_context, the_url, the_url_title):
    session = db.session

    url = Url.find_or_create(session, the_url, the_url_title)
    article = Article.find_or_create(session, url.as_string())

    text = Text.find_or_create(session, the_context,
                               translation_language, url, article)

    origin = UserWord.find_or_create(session, original_word, original_language)
    translation = UserWord.find_or_create(session, translation_word,
                                          translation_language)

    b1 = Bookmark(origin, translation, user, text, date)
    db.session.add(b1)
    db.session.commit()

    return b1
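# A minimal usage sketch, assuming `db`, a `user`, and two Language objects
# (de, en) are available from the surrounding test fixtures; the literal
# words, context, and title below are made up for illustration.
from datetime import datetime

bookmark = add_bookmark(
    db, user,
    de, "Maler",
    en, "painter",
    datetime.now(),
    "Neandertaler waren Maler.",
    "https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862",
    "Neandertaler waren Künstler")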
def _find_article_in_value_or_extra_data(self, db_session):
    """
    Finds or creates an article for this event.

    :return: article id, or None

    NOTE: when the article cannot be downloaded anymore
    (either because it is no longer available, or because
    the newspaper parser fails), None is returned.
    """
    if self.event in ALL_ARTICLE_INTERACTION_ACTIONS:
        if self.value.startswith('http'):
            url = self.value
        else:
            url = self.find_url_in_extra_data()

        if url:
            return Article.find_or_create(db_session, url, sleep_a_bit=True).id

    return None
def _exists_in_db(obj):
    return Article.exists(obj)
found = 0
not_found = 0

for text in texts:
    if not text.article:
        article_id = None
        if 'articleID' in text.url.as_canonical_string():
            article_id = text.url.as_canonical_string().split("articleID=")[-1]
            # print(f'extracted id: {article_id}')

        if article_id:
            article = Article.query.filter_by(id=article_id).one()
        else:
            article = Article.find(text.url.as_canonical_string())

        if not article:
            not_found += 1
            print(f'not found: {not_found}')
        else:
            found += 1
            text.article = article
            zeeguu.db.session.add(text)

print(f'found: {found}')
visited_url_user_pairs = []

for bookmark in Bookmark.query.all():
    try:
        urlcrop = str(bookmark.text.url).split('articleURL=')[-1]

        # process each (url, user) pair only once
        url_user_hash = urlcrop + bookmark.user.name
        if url_user_hash in visited_url_user_pairs:
            continue
        visited_url_user_pairs.append(url_user_hash)

        article = Article.find_or_create(session, urlcrop,
                                         bookmark.text.language)

        likes = UserActivityData.find(bookmark.user,
                                      extra_filter='title',
                                      extra_value=str(bookmark.text.url.title),
                                      event_filter='UMR - LIKE ARTICLE')
        Nlikes = len(likes)

        # strip a trailing "xtor=RSS..." tracking suffix, if present
        url_end = urlcrop.find("xtor=RSS")
        if url_end < 0:
            url = str(urlcrop)
        else:
            url = str(urlcrop)[:url_end - 1]

        last_opened_act = UserActivityData.find(bookmark.user,
                                                extra_filter='articleURL',
                                                extra_value=url,
                                                event_filter='UMR - OPEN ARTICLE',
                                                only_latest=True)
        if last_opened_act is None:
            last_opened = None
        else:
            last_opened = last_opened_act.time
def test_load_article_without_language_information(self):
    url = 'https://edition.cnn.com/2018/03/12/asia/kathmandu-plane-crash/index.html'
    art = Article.find_or_create(session, url)
    assert art
def test_find_or_create(self):
    self.new_art = Article.find_or_create(session, SOME_ARTICLE_URL)
    assert self.new_art.fk_difficulty
@classmethod
def find_or_create(cls, session, _url: str, language=None, sleep_a_bit=False):
    """
    If not found, download and extract all the required info
    for this article.

    :param _url:
    :return: the Article object
    """
    from zeeguu.model import Url, Article, Language

    import newspaper
    import sqlalchemy.exc
    import time
    from random import randint
    from langdetect import detect  # assumed source of detect(); used for meta_lang fallback

    url = Url.extract_canonical_url(_url)

    try:
        found = cls.find(url)
        if found:
            return found

        art = newspaper.Article(url=url)
        art.download()
        art.parse()

        if art.text == '':
            raise Exception("Newspaper got empty article from: " + url)

        if sleep_a_bit:
            print("GOT: " + url)
            sleep_time = randint(3, 33)
            print(f"sleeping for {sleep_time}s... so we don't annoy our friendly servers")
            time.sleep(sleep_time)

        if not language:
            if art.meta_lang == '':
                art.meta_lang = detect(art.text)
                zeeguu.log(f"langdetect: {art.meta_lang} for {url}")
            language = Language.find_or_create(art.meta_lang)

        # Create new article and save it to DB
        url_object = Url.find_or_create(session, url)

        new_article = Article(
            url_object,
            art.title,
            ', '.join(art.authors),
            art.text[0:32000],  # any article longer than this will be truncated...
            art.summary,
            None,
            None,
            language
        )
        session.add(new_article)
        session.commit()

        return new_article

    # `except A or B:` only catches A; a tuple is needed to catch both
    except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
        for i in range(10):
            try:
                session.rollback()
                u = cls.find(url)
                print("Found article by url after recovering from race")
                return u
            except Exception:
                print("Exception of second degree in article..." + str(i))
                time.sleep(0.3)
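# Usage sketch: resolving an article politely (sleep_a_bit pauses for a
# random interval after each fresh download) while overriding language
# detection. `session` comes from the surrounding zeeguu code;
# Language.find_or_create with a language code is the same call the
# method itself uses.
de = Language.find_or_create('de')
article = Article.find_or_create(
    session,
    "https://www.nzz.ch/wissenschaft/neandertaler-waren-kuenstler-ld.1358862",
    language=de,
    sleep_a_bit=True)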
deleted = []

print("1. finding urls in activity data...")
all_urls = set()
all_activity_data = UserActivityData.query.all()
for each in all_activity_data:
    url = each.find_url_in_extra_data()
    if url:
        all_urls.add(url)
print(f"   ... url count: {len(all_urls)}")

print(f"2. finding articles older than {DAYS} days...")
all_articles = Article.all_older_than(days=DAYS)
print(f"   ... article count: {len(all_articles)}")

for each in all_articles:
    info = UserArticle.find_by_article(each)
    url_found = each.url.as_string() in all_urls

    if info or url_found:
        if info:
            print(f"WON'T DELETE info! {each.id} {each.title}")
            for ainfo in info:
                print(ainfo.user_info_as_string())
        if url_found:
            print(f"WON'T DELETE url_found! {each.id} {each.title}")
    else:
        deleted.append(each.id)
        dbs.delete(each)
import zeeguu
from zeeguu.model import Article, UserArticle
from zeeguu.model.starred_article import StarredArticle

session = zeeguu.db.session

for sa in StarredArticle.query.all():
    try:
        article = Article.find_or_create(session, sa.url.as_string())
        ua = UserArticle.find_or_create(session, sa.user, article,
                                        starred=sa.starred_date)
        session.add(ua)
        session.commit()
        print(f'{sa.starred_date} x {ua.user.name} x {ua.article.title}')
    except Exception as ex:
        print(f'could not import {sa.url.as_string()}')
        print(ex)