def user_article_info(user: User, article: Article, with_content=False, with_translations=True):
    """
    Combine the article's own info with this user's interaction state
    (starred / opened / liked) and, optionally, their translations.

    :param user: the user whose interaction state is looked up
    :param article: the article being described
    :param with_content: if True, include the article body in the result
    :param with_translations: if True, include the user's bookmarks for this article's url
    :return: dict with the article info plus 'starred', 'opened', 'liked'
        and 'translations' keys
    """
    # BUG FIX: Bookmark was missing from this local import although it is
    # used below when with_translations is requested (NameError otherwise).
    from zeeguu_core.model import UserArticle, Bookmark

    prior_info = UserArticle.find(user, article)

    ua_info = article.article_info(with_content=with_content)

    # No prior interaction: return the defaults.
    if not prior_info:
        ua_info['starred'] = False
        ua_info['opened'] = False
        ua_info['liked'] = False
        ua_info['translations'] = []
        return ua_info

    # starred/opened are timestamps in the DB; presence means "happened".
    ua_info['starred'] = prior_info.starred is not None
    ua_info['opened'] = prior_info.opened is not None
    ua_info['liked'] = prior_info.liked

    if with_translations:
        translations = Bookmark.find_all_for_user_and_url(user, article.url)
        ua_info['translations'] = [
            each.serializable_dictionary() for each in translations
        ]

    return ua_info
def article_id():
    """
    Return the id of the article at the given URL, creating
    the article if it does not exist yet.

    Takes url as a request argument; the url should be encoded with
    quote_plus (Python) and encodeURIComponent (Javascript).

    :return: json dict with the article id
    """
    url = request.args.get("url", "")
    if not url:
        flask.abort(400)

    try:
        found = Article.find_or_create(db_session, url)
        return json_result({"article_id": found.id})
    except Exception as e:
        # report to sentry and log locally before failing the request
        from sentry_sdk import capture_exception

        capture_exception(e)
        zeeguu_core.log(e)
        flask.abort(500)
def user_article_info(cls, user: User, article: Article, with_content=False, with_translations=True):
    """
    Enrich the plain article info with this user's interaction state
    (starred / opened / liked) and optionally their translations.
    """
    from zeeguu_core.model import Bookmark

    # Start from the default article info and layer user state on top.
    info = article.article_info(with_content=with_content)

    prior = UserArticle.find(user, article)
    if not prior:
        info.update(starred=False, opened=False, liked=False, translations=[])
        return info

    # starred/opened store timestamps; non-None means the event happened.
    info['starred'] = prior.starred is not None
    info['opened'] = prior.opened is not None
    info['liked'] = prior.liked

    if with_translations:
        bookmarks = Bookmark.find_all_for_user_and_url(user, article.url)
        info['translations'] = [b.serializable_dictionary() for b in bookmarks]

    return info
def create_from_upload(cls, session, title, content, uploader, language):
    """
    Persist a user-uploaded text as a new Article and return its id.
    """
    uploaded = Article(
        None,  # url
        title,
        None,  # authors
        content,
        None,  # summary
        None,  # published
        None,  # rss feed
        language,
        uploader,
    )
    session.add(uploaded)
    session.commit()
    return uploaded.id
def get_cohorts_for_article(article_id):
    """
    Return (as a json string) all the cohorts this article belongs to.
    """
    found = Article.find_by_id(article_id)
    cohorts = CohortArticleMap.get_cohorts_for_article(found)
    return json.dumps(cohorts)
def __init__(self, real=False):
    """
    Rule producing an Article: a real downloaded one when `real` is True,
    otherwise a faker-generated model object. The article is saved either way.
    """
    super().__init__()

    self.article = (
        Article.find_or_create(ArticleRule.db.session, url_diesel_fahrverbote)
        if real
        else self._create_model_object()
    )

    self.save(self.article)
def get_possible_translations(from_lang_code, to_lang_code):
    """
    Returns a list of possible translations in :param to_lang_code
    for :param word in :param from_lang_code.

    You must also specify the :param context, :param url, and :param title
    of the page where the word was found.

    The context is the sentence.

    :return: json array with translations
    """

    data = {"from_lang_code": from_lang_code, "to_lang_code": to_lang_code}
    data["context"] = request.form.get('context', '')
    url = request.form.get('url', '')
    data["url"] = url

    # Resolve the article: reader urls embed either an articleID or an
    # articleURL query parameter; anything else is looked up / created by url.
    article_id = None
    if 'articleID' in url:
        article_id = url.split('articleID=')[-1]
        url = Article.query.filter_by(id=article_id).one().url.as_canonical_string()
    elif 'articleURL' in url:
        url = url.split('articleURL=')[-1]
    else:
        # the url comes from elsewhere not from the reader, so we find or create the article
        article = Article.find_or_create(db_session, url)
        article_id = article.id

    zeeguu_core.log(f"url before being saved: {url}")

    word_str = request.form['word']
    data["word"] = word_str

    title_str = request.form.get('title', '')
    data["title"] = title_str

    zeeguu_core.log(f'translating to... {data["to_lang_code"]}')

    # Trim the context to the relevant part around the word before translating.
    minimal_context, query = minimize_context(
        data["context"], data["from_lang_code"], data["word"])
    zeeguu_core.log(f"Query to translate is: {query}")
    data["query"] = query

    translations = get_all_translations(data).translations
    zeeguu_core.log(f"Got translations: {translations}")

    # translators talk about quality, but our users expect likelihood.
    # rename the key in the dictionary
    for t in translations:
        t['likelihood'] = t.pop("quality")
        t['source'] = t.pop('service_name')

    # The first (best) translation is bookmarked for the user as a side effect.
    best_guess = translations[0]["translation"]

    Bookmark.find_or_create(db_session, flask.g.user,
                            word_str, from_lang_code,
                            best_guess, to_lang_code,
                            minimal_context, url, title_str, article_id)

    return json_result(dict(translations=translations))
def translate_and_bookmark(from_lang_code, to_lang_code):
    """
    @deprecated
    This should be deprecated and /get_possible_translations used instead
    However, it is still used by the zeeguu chrome extension.

    This expects in the post parameter the following:
        - word (to translate)
        - context (surrounding paragraph of the original word )
        - url (of the origin)
        - title (of the origin page)

    /get_possible_translations has very similar behavior, only that
    if focuses on returning the possible alternative translations

    :param from_lang_code:
    :param to_lang_code:
    :return: json with bookmark_id and translation on success
    """

    data = {"from_lang_code": from_lang_code, "to_lang_code": to_lang_code}

    word_str = unquote_plus(request.form['word'])
    data["word"] = word_str

    url_str = request.form.get('url', '')
    data["url"] = url_str

    title_str = request.form.get('title', '')
    data["title"] = title_str

    context_str = request.form.get('context', '')
    data["context"] = context_str

    # the url comes from elsewhere not from the reader, so we find or create the article
    article = Article.find_or_create(db_session, url_str)
    article_id = article.id

    # BUG FIX: pre-define query so the except branch below cannot hit a
    # NameError when minimize_context() raises before assigning it.
    query = ""
    try:
        minimal_context, query = minimize_context(
            data["context"], data["from_lang_code"], data["word"])
        data["query"] = query

        translations = get_all_translations(data).translations
        best_guess = translations[0]["translation"]

        bookmark = Bookmark.find_or_create(db_session, flask.g.user,
                                           word_str, from_lang_code,
                                           best_guess, to_lang_code,
                                           minimal_context, url_str, title_str, article_id)
    except ValueError as e:
        zeeguu_core.log(f"minimize context failed {e}on: {context_str} x {from_lang_code} x {word_str} ")
        return context_str, query

    return json_result(dict(
        bookmark_id=bookmark.id,
        translation=best_guess))
def teacher_texts():
    """
    Returns (as json) info for all the texts owned by this teacher.
    """
    own = Article.own_texts_for_user(flask.g.user)
    return json.dumps([a.article_info_for_teacher() for a in own])
def upload_own_text():
    """
    Create an article from a user-uploaded text (language, title, content
    POST params) and return the new article's id as a string.
    """
    # Discard any stale state on the session before writing.
    db_session.rollback()

    form = request.form
    language = Language.find_or_create(form.get("language", ""))

    new_id = Article.create_from_upload(
        db_session,
        form.get("title", ""),
        form.get("content", ""),
        flask.g.user,
        language,
    )
    return str(new_id)
def contribute_translation(from_lang_code, to_lang_code): """ User contributes a translation they think is appropriate for a given :param word in :param from_lang_code in a given :param context The :param translation is in :param to_lang_code Together with the two words and the textual context, you must submit also the :param url, :param title of the page where the original word and context occurred. :return: in case of success, the bookmark_id and main translation """ # All these POST params are mandatory word_str = unquote_plus(request.form['word']) translation_str = request.form['translation'] url = request.form.get('url', '') context_str = request.form.get('context', '') title_str = request.form.get('title', '') # when a translation is added by hand, the servicename_translation is None # thus we set it to MANUAL service_name = request.form.get('servicename_translation', 'MANUAL') article_id = None if 'articleID' in url: article_id = url.split('articleID=')[-1] url = Article.query.filter_by(id=article_id).one().url.as_canonical_string() elif 'articleURL' in url: url = url.split('articleURL=')[-1] else: # the url comes from elsewhere not from the reader, so we find or creat the article article = Article.find_or_create(db_session, url) article_id = article.id # Optional POST param selected_from_predefined_choices = request.form.get('selected_from_predefined_choices', '') minimal_context, _ = minimize_context(context_str, from_lang_code, word_str) bookmark = Bookmark.find_or_create(db_session, flask.g.user, word_str, from_lang_code, translation_str, to_lang_code, minimal_context, url, title_str, article_id) # Inform apimux about translation selection data = {"word_str": word_str, "translation_str": translation_str, "url": url, "context_size": len(context_str), "service_name": service_name} contribute_trans(data) return json_result(dict(bookmark_id=bookmark.id))
def get_user_article_info():
    """
    expects one parameter: url
    :return: json dictionary with info
    """
    target_url = str(request.form.get("url", ""))

    found = Article.find_or_create(db_session, target_url)
    info = UserArticle.user_article_info(flask.g.user, found)
    return json_result(info)
def delete_articles_older_than(DAYS, print_progress_for_every_article=False):
    """
    Delete articles older than DAYS, skipping any that are still referenced,
    committing in batches of BATCH_COMMIT_SIZE.

    :param DAYS: age threshold in days
    :param print_progress_for_every_article: if True, print each article's id
    """
    print(f"Finding articles older than {DAYS} days...")
    all_articles = Article.all_older_than(days=DAYS)
    print(f" ... article count: {len(all_articles)}")

    i = 0
    # count of referenced (i.e. kept) articles within the current batch
    referenced_in_this_batch = 0
    deleted = []
    for each in all_articles:
        i += 1

        if print_progress_for_every_article:
            print(f"#{i} -- ID: {each.id}")

        # Keep articles that are still referenced elsewhere.
        if is_the_article_referenced(each, True):
            referenced_in_this_batch += 1
            continue

        try:
            # Remove dependent cache rows first so the article delete can succeed.
            articles_cache = ArticlesCache.query.filter_by(
                article_id=each.id).all()
            if articles_cache:
                for each_cache_line in articles_cache:
                    print(
                        f"... ID: {each.id} deleting also cache line: {each_cache_line}"
                    )
                    dbs.delete(each_cache_line)
            deleted.append(each.id)
            dbs.delete(each)

            # Commit every BATCH_COMMIT_SIZE articles scanned.
            if i % BATCH_COMMIT_SIZE == 0:
                print(
                    f"Keeping {referenced_in_this_batch} articles from the last {BATCH_COMMIT_SIZE} batch..."
                )
                dbs.commit()
                print(
                    f"... the rest of {BATCH_COMMIT_SIZE-referenced_in_this_batch} are now deleted!!!"
                )
                referenced_in_this_batch = 0
        except sqlalchemy.exc.IntegrityError as e:
            # Something still references this article at the DB level; skip it.
            traceback.print_exc()
            dbs.rollback()
            continue

    print(f'Deleted: {deleted}')
def _create_model_object(self):
    """Generate a random Article; regenerate on (unlikely) collision with the db."""
    fake = self.faker
    random_title = " ".join(fake.text().split()[:4])
    random_authors = fake.name()
    random_content = fake.text()
    random_summary = fake.text()
    random_published = datetime.now() - timedelta(minutes=randint(0, 7200))
    feed = RSSFeedRule().feed
    lang = LanguageRule().random
    random_url = UrlRule().url

    candidate = Article(random_url, random_title, random_authors,
                        random_content, random_summary, random_published,
                        feed, lang)

    # Duplicate of an existing row: try again with fresh random data.
    if self._exists_in_db(candidate):
        return self._create_model_object()

    return candidate
def add_article_to_cohort():
    """
    Maps the given article to the given cohort (idempotent: an existing
    mapping is left untouched). Requires teacher permission for the cohort.
    """
    cohort = Cohort.find(request.form.get("cohort_id"))

    if not has_permission_for_cohort(cohort.id):
        flask.abort(401)

    article = Article.find_by_id(request.form.get("article_id"))

    existing = CohortArticleMap.find(cohort.id, article.id)
    if not existing:
        db.session.add(CohortArticleMap(cohort, article))
        db.session.commit()

    return "OK"
def delete_article_from_cohort():
    """
    Removes the mapping between the given article and cohort.
    Requires teacher permission for the cohort.
    """
    cohort = Cohort.find(request.form.get("cohort_id"))

    if not has_permission_for_cohort(cohort.id):
        flask.abort(401)

    article = Article.find_by_id(request.form.get("article_id"))

    mapping = CohortArticleMap.find(cohort.id, article.id)
    if not mapping:
        return make_error(401, "That article does not belong to the cohort!")

    db.session.delete(mapping)
    db.session.commit()
    return "OK"
def upload_articles(cohort_id):
    """
    uploads articles for a cohort with input from a POST request
    """
    if not has_permission_for_cohort(cohort_id):
        flask.abort(401)

    try:
        for each in json.loads(request.data):
            # A synthetic unique url identifies each uploaded text.
            new_article = Article(
                Url("userarticle/{}".format(uuid.uuid4().hex)),
                each["title"],
                each["authors"],
                each["content"],
                each["summary"],
                datetime.now(),
                None,  # rss feed
                Language.find(each["language_code"]),
            )

            db.session.add(new_article)
            # flush+refresh so the new row gets its id before mapping it
            db.session.flush()
            db.session.refresh(new_article)

            db.session.add(CohortArticleMap(Cohort.find(cohort_id), new_article))

        db.session.commit()
        return "OK"
    except ValueError:
        flask.abort(400)
        return "ValueError"
def upload_articles(cohort_id):
    '''
        uploads articles for a cohort with input from a POST request
    '''
    if not has_permission_for_cohort(cohort_id):
        flask.abort(401)

    try:
        payload = json.loads(request.data)
        for article_data in payload:
            # a synthetic unique url for each uploaded text
            url = Url('userarticle/{}'.format(uuid.uuid4().hex))
            language = Language.find(article_data['language_code'])

            new_article = Article(
                url,
                article_data['title'],
                article_data['authors'],
                article_data['content'],
                article_data['summary'],
                datetime.now(),
                None,  # rss feed
                language,
            )
            db.session.add(new_article)
            # make the id available before creating the mapping
            db.session.flush()
            db.session.refresh(new_article)

            db.session.add(CohortArticleMap(Cohort.find(cohort_id), new_article))

        db.session.commit()
        return 'OK'
    except ValueError:
        flask.abort(400)
        return 'ValueError'
def article_id():
    """
    returns the article at that URL or creates an article and returns it
    takes url as URL argument
    NOTE: the url should be encoded with quote_plus (Pyton) and encodeURIComponent(Javascript)

    :return: article id
    """
    url = request.args.get('url', '')
    if not url:
        flask.abort(400)

    try:
        found = Article.find_or_create(db_session, url)
        return json_result({'article_id': found.id})
    except Exception as e:
        zeeguu_core.log(e)
        flask.abort(500)
def more_like_this_article(user, count, article_id):
    """
    Given a article ID find more articles like that one via Elasticsearchs "more_like_this" method
    """
    source_article = Article.find_by_id(article_id)

    es_client = Elasticsearch(ES_CONN_STRING)
    query_body = build_more_like_this_query(count, source_article.content, source_article.language)
    response = es_client.search(index=ES_ZINDEX, body=query_body)  # execute search

    hits = response["hits"].get("hits")

    # TODO need to make sure either that the searched on article is always a part of the list \
    # or that it is never there.
    # it could be used to show on website; you searched on X, here is what we found related to X
    similar_articles = _to_articles_from_ES_hits(hits)

    return [UserArticle.user_article_info(user, a) for a in similar_articles]
def _find_article_in_value_or_extra_data(self, db_session):
    """
    Finds or creates an article_id

    return: articleID or NONE

    NOTE: When the article cannot be downloaded anymore,
    either because the article is no longer available or the newspaper.parser() fails
    """
    # Only article-interaction events carry an article reference.
    if self.event not in ALL_ARTICLE_INTERACTION_ACTIONS:
        return None

    # The url may be stored directly in `value` or tucked into extra_data.
    url = self.value if self.value.startswith("http") else self.find_url_in_extra_data()
    if not url:
        return None

    return Article.find_or_create(db_session, url, sleep_a_bit=True).id
# Script fragment: backfill the article reference on Text rows that lack one,
# by extracting an articleID from the text's url or by matching the url itself.
# NOTE(review): `texts` and `not_found` are defined earlier in the script
# (outside this view) — confirm their initialization before reusing.
found = 1
for text in texts:
    # print(text.article_id)
    if not text.article:
        article_id = None
        # reader urls embed the article id as an articleID query parameter
        if 'articleID' in text.url.as_canonical_string():
            article_id = text.url.as_canonical_string().split("articleID=")[-1]
            # print(f'extracted id: {article_id}')
        if article_id:
            article = Article.query.filter_by(id=article_id).one()
        else:
            article = Article.find(text.url.as_canonical_string())
            # print(text.url.as_canonical_string())
        if not article:
            not_found += 1
            print(f'not found: {not_found}')
        else:
            found += 1
            text.article = article
            zeeguu_core.db.session.add(text)
            # print(text)
            # print(article)
            # print(text.url.as_string())
            # print(text.article.url.as_string())
# NOTE(review): placement of this summary print (after the loop) is assumed
# from context — confirm against the original script.
print(f'found: {found}')
def _exists_in_db(candidate):
    """Whether an equivalent article row is already persisted."""
    return Article.exists(candidate)
# Script fragment: before deleting old articles, collect every url referenced
# by user activity data, then scan articles older than DAYS and report the
# ones that must be kept (referenced by UserArticle info or by a logged url).
# NOTE(review): DAYS and the actual deletion step appear to live outside this
# fragment — this part only prints what WON'T be deleted.
deleted = []

print("1. finding urls in activity data...")
all_urls = set()
all_activity_data = UserActivityData.query.all()
for each in all_activity_data:
    url = each.find_url_in_extra_data()
    if url:
        all_urls.add(url)
print(f" ... url count: {len(all_urls)}")

# print(f"2. finding articles older than {DAYS} days...")
all_articles = Article.all_older_than(days=DAYS)
print(f" ... article count: {len(all_articles)}")

i = 0
for each in all_articles:
    i += 1
    info = UserArticle.find_by_article(each)
    url_found = each.url.as_string() in all_urls

    # Keep any article that a user interacted with or whose url was logged.
    if info or url_found:
        if info:
            print(f"WON'T DELETE info! {each.id} {each.title}")
            for ainfo in info:
                print(ainfo.user_info_as_string())
        if url_found:
            print(f"WON'T DELETE url_found! {each.id} {each.title}")
def _to_articles_from_ES_hits(hits):
    """Map Elasticsearch hits to Article objects via their stored _id."""
    return [Article.find_by_id(hit.get("_id")) for hit in hits]
def own_texts():
    """
    Returns (as json) the article info for every text this user uploaded.
    """
    infos = []
    for article in Article.own_texts_for_user(flask.g.user):
        infos.append(article.article_info())
    return json_result(infos)
def get_one_translation(from_lang_code, to_lang_code):
    """
    Addressing some of the problems with the get_next_translations...
    - it should be separated in get_first and get_alternatives
    - alternatively it can be get one and get all

    To think about:
    - it would also make sense to separate translation from
    logging; or at least, allow for situations where a translation
    is not associated with an url... or?

    :return: json array with translations
    """
    word_str = request.form["word"]
    # BUG FIX: default to "" (not None) so the substring check on `url`
    # below cannot raise TypeError; this matches the other endpoints.
    url = request.form.get("url", "")
    title_str = request.form.get("title", "")
    context = request.form.get("context", "")

    # Trim the context to the relevant part around the word.
    minimal_context, query = minimize_context(context, from_lang_code, word_str)

    # If the user already translated this word in this context, prefer that.
    translation = own_translation(
        flask.g.user, word_str, from_lang_code, to_lang_code, minimal_context
    )
    if translation:
        return json_result(dict(translations=translation))

    translations = get_next_results(
        {
            "from_lang_code": from_lang_code,
            "to_lang_code": to_lang_code,
            "url": url,
            "word": word_str,
            "title": title_str,
            "query": query,
            "context": minimal_context,
        },
        number_of_results=1,
    ).translations

    # do we really need this?
    # translators talk about quality, but our users expect likelihood.
    # rename the key in the dictionary
    for t in translations:
        t["likelihood"] = t.pop("quality")
        t["source"] = t["service_name"]

    # Resolve the article: reader urls embed the id as "article?id=N".
    article_id = None
    if "article?id=" in url:
        article_id = url.split("article?id=")[-1]
        url = Article.query.filter_by(id=article_id).one().url.as_canonical_string()
    else:
        # the url comes from elsewhere not from the reader, so we find or creat the article
        article = Article.find_or_create(db_session, url)
        article_id = article.id

    # Bookmark the best guess as a side effect of translating.
    if len(translations) > 0:
        best_guess = translations[0]["translation"]
        Bookmark.find_or_create(
            db_session,
            flask.g.user,
            word_str,
            from_lang_code,
            best_guess,
            to_lang_code,
            minimal_context,
            url,
            title_str,
            article_id,
        )

    return json_result(dict(translations=translations))
def get_next_translations(from_lang_code, to_lang_code):
    """
    Returns a list of possible translations in :param to_lang_code
    for :param word in :param from_lang_code.

    You must also specify the :param context, :param url, and :param title
    of the page where the word was found.

    The context is the sentence.

    :return: json array with translations
    """
    data = {"from_lang_code": from_lang_code, "to_lang_code": to_lang_code}
    data["context"] = request.form.get("context", "")
    url = request.form.get("url", "")

    number_of_results = int(request.form.get("numberOfResults", -1))
    # service/currentTranslation are set on follow-up calls asking for
    # alternatives, so the already-shown result is excluded.
    service_name = request.form.get("service", "")
    exclude_services = [] if service_name == "" else [service_name]

    currentTranslation = request.form.get("currentTranslation", "")
    exclude_results = [] if currentTranslation == "" else [currentTranslation.lower()]

    data["url"] = url

    article_id = request.form.get("articleID", None)
    # BUG FIX (idiom): compare to None with `is`, not `==` (PEP 8).
    if article_id is None:
        # Fall back to extracting the article from the url itself.
        if "articleID" in url:
            article_id = url.split("articleID=")[-1]
            url = Article.query.filter_by(id=article_id).one().url.as_canonical_string()
        elif "articleURL" in url:
            url = url.split("articleURL=")[-1]
        else:
            # the url comes from elsewhere not from the reader, so we find or creat the article
            article = Article.find_or_create(db_session, url)
            article_id = article.id

    zeeguu_core.log(f"url before being saved: {url}")

    word_str = request.form["word"]
    data["word"] = word_str

    title_str = request.form.get("title", "")
    data["title"] = title_str

    zeeguu_core.log(f'translating to... {data["to_lang_code"]}')

    # Trim the context to the relevant part around the word.
    minimal_context, query = minimize_context(
        data["context"], data["from_lang_code"], data["word"]
    )
    zeeguu_core.log(f"Query to translate is: {query}")
    data["query"] = query

    first_call_for_this_word = len(exclude_services) == 0
    if first_call_for_this_word:
        # Prefer the user's own (or crowdsourced) previous translation.
        translations = own_or_crowdsourced_translation(
            flask.g.user, word_str, from_lang_code, to_lang_code, minimal_context
        )
        if translations:
            return json_result(dict(translations=translations))

    translations = get_next_results(
        data,
        exclude_services=exclude_services,
        exclude_results=exclude_results,
        number_of_results=number_of_results,
    ).translations

    # translators talk about quality, but our users expect likelihood.
    # rename the key in the dictionary
    for t in translations:
        t["likelihood"] = t.pop("quality")
        t["source"] = t["service_name"]

    # Only bookmark on the first call, not when fetching alternatives.
    if len(translations) > 0 and first_call_for_this_word:
        best_guess = translations[0]["translation"]
        Bookmark.find_or_create(
            db_session,
            flask.g.user,
            word_str,
            from_lang_code,
            best_guess,
            to_lang_code,
            minimal_context,
            url,
            title_str,
            article_id,
        )

    return json_result(dict(translations=translations))
def find_or_create(cls, session, _url: str, language=None, sleep_a_bit=False):
    """
    If not found, download and extract all the required info for this article.

    :param session: db session
    :param _url: url to look up; canonicalized before use
    :param language: if None, auto-detected from the downloaded text
    :param sleep_a_bit: sleep a random 3-33s after downloading
        (so we don't hammer the scraped servers)
    :return: the existing or newly created Article
    """
    from zeeguu_core.model import Url, Article, Language
    import newspaper
    # BUG FIX: `time` and `randint` were imported only inside the
    # `if sleep_a_bit:` branch, so the race-recovery path below raised
    # NameError on `time.sleep` whenever sleep_a_bit was False.
    import time
    from random import randint

    url = Url.extract_canonical_url(_url)

    try:
        found = cls.find(url)
        if found:
            return found

        art = newspaper.Article(url=url)
        art.download()
        art.parse()

        if art.text == '':
            raise Exception("Newspaper got empty article from: " + url)

        if sleep_a_bit:
            print("GOT: " + url)
            sleep_time = randint(3, 33)
            print(
                f"sleeping for {sleep_time}s... so we don't annoy our friendly servers"
            )
            time.sleep(sleep_time)

        if not language:
            if art.meta_lang == '':
                art.meta_lang = detect(art.text)
                zeeguu_core.log(f"langdetect: {art.meta_lang} for {url}")
            language = Language.find_or_create(art.meta_lang)

        # Create new article and save it to DB
        url_object = Url.find_or_create(session, url)

        new_article = Article(
            url_object,
            art.title,
            ', '.join(art.authors),
            art.text[0:32000],  # any article longer than this will be truncated...
            art.summary,
            None,
            None,
            language)
        session.add(new_article)
        session.commit()

        return new_article
    # BUG FIX: `except A or B:` evaluates the `or` first and only ever
    # caught IntegrityError; a tuple catches both exception types.
    except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
        # Likely a race: another request created the same article.
        # Retry finding it a few times after rolling back.
        for i in range(10):
            try:
                session.rollback()
                u = cls.find(url)
                print("Found article by url after recovering from race")
                return u
            except Exception:
                print("Exception of second degree in article..." + str(i))
                time.sleep(0.3)
def test_load_article_without_language_information(self):
    """find_or_create should succeed even when the language must be auto-detected."""
    created = Article.find_or_create(session, url_plane_crashes)
    assert created
def test_find_or_create(self):
    """A freshly created article gets an fk_difficulty computed."""
    self.new_art = Article.find_or_create(session, url_formation_professionnelle)
    assert self.new_art.fk_difficulty