def t_find_pdf_primarily(self, pub_id: str, title: str, authors: list, url: str):
    """Download a publication's PDF from *url*, store it, and queue indexing.

    If *url* is truthy: download the PDF, save it under FILES_PATH using the
    MD5 hex digest of the URL as the file name, try to extract its text,
    persist the raw bytes (base64) and extracted text to the "publication"
    collection, then trigger Elasticsearch indexing asynchronously.
    Otherwise fall back to a secondary PDF search driven by author names.

    :param pub_id: publication identifier used in the Mongo filter
    :param title: publication title (forwarded to the secondary search)
    :param authors: list of author ids
    :param url: direct PDF URL, or falsy to use the fallback path
    :return: dict with "status" and optionally "path" / "extraction_failure"
    """
    resd = {"status": "ok"}
    if url:
        files_path = get_config("FILES_PATH")
        file_name = md5(url.encode("utf-8")).hexdigest()
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(files_path, exist_ok=True)
        pdf_raw = download(url)
        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'
        # "wb" is sufficient: the file is written once and never read back here.
        with open(full_path, "wb") as f:
            f.write(pdf_raw)
        resd["path"] = full_path
        try:
            content = extract_text_from_pdf(full_path)
        except Exception as e:
            # Best-effort extraction: record the failure and index without text.
            resd["extraction_failure"] = str(e)
            logger.debug(e)
            content = None
        update_result = update_one("publication", {
            "filter": {"id": {"$eq": pub_id}},
            "update": {
                "$set": {
                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                    "content": content
                }
            },
            "upsert": True
        })
        logger.info(f'Update Result: {update_result}')
        t_elasticsearch_indexing.apply_async((pub_id,))
    else:
        # No direct URL: resolve author names and hand off to the
        # secondary (name-based) PDF search task.
        author_docs = find("author", {
            "filter": {"id": {"$in": authors}},
            "projection": {"name": 1}
        })
        t_find_pdf_secondarily.apply_async(
            (pub_id, title, [a["name"] for a in author_docs])
        )
    return resd
import base64
def t_elasticsearch_indexing(self, pub_id: str):
    """Denormalize a publication document and index it into Elasticsearch.

    Steps: load the publication (only matched when it already has a "vector"
    field — NOTE(review): confirm this precondition against the producer of
    the vector; `find_one` returns None otherwise and the code below would
    raise), embed its author documents, compute language + vector from the
    text, add per-language title/content fields (translating when needed),
    persist vector/lang back to Mongo, push the per-language mappings to
    Elasticsearch, and finally index the document.

    :param pub_id: publication identifier
    :return: dict with "status" and the Elasticsearch index "result"
    """
    resd = {"status": "ok"}
    pub = find_one("publication", {
        "filter": {"id": {"$eq": pub_id}, "vector": {"$exists": True}}
    })
    # Replace the author-id list with full author documents.
    pub["authors"] = find("author", {
        "filter": {"id": {"$in": pub.get("authors", [])}},
        "projection": ["id", "name", "affiliation", "citedby", "interests",
                       "organizations"]
    })
    # Strip Mongo-internal / bulky fields that must not reach Elasticsearch.
    pub.pop("created_at", None)
    pub.pop("raw_base64", None)
    pub.pop("title_md5", None)
    pub.pop("_id", None)
    pub_id = pub.pop("id")
    # Build the text fed to the vectorizer: content plus the title,
    # unless the title is a placeholder ("unk_" prefix).
    vector_field_tokens = list()
    if pub.get("content", None):
        vector_field_tokens += pub["content"].split()
    if not pub["title"].startswith("unk_"):
        vector_field_tokens += pub["title"].split()
    vector_field = " ".join(vector_field_tokens)
    vectorizer_response = get_vector(preprocess_text(vector_field))
    pub["lang"] = vectorizer_response["lang"]
    pub["vector"] = vectorizer_response["vector"]
    langs = get_config("LANGUAGES")
    for lang in langs:
        if lang != pub["lang"]:
            # Foreign language: store translated title/content.
            pub[f'title_{lang}'] = preprocess_text(
                translate(pub["title"], lang) or ""
            )
            if str(pub.get("content", None)).strip().lower() not in ["none", ""]:
                pub[f'content_{lang}'] = preprocess_text(
                    translate(pub["content"], lang) or ""
                )
        else:
            # Native language: preprocess as-is, no translation.
            pub[f'title_{lang}'] = preprocess_text(pub["title"])
            pub[f'content_{lang}'] = preprocess_text(pub.get("content", "") or "")
    # The untranslated fields are superseded by the per-language ones.
    if "title" in pub:
        del pub["title"]
    if "content" in pub:
        del pub["content"]
    update_one("publication", {
        "filter": {"id": {"$eq": pub_id}},
        "update": {"$set": {"vector": pub["vector"], "lang": pub["lang"]}}
    })
    for lang in langs:
        publication_mappings["properties"][f'title_{lang}'] = {"type": "text"}
        publication_mappings["properties"][f'content_{lang}'] = {"type": "text"}
    # Try to create the index with the mappings; a 400 response means the
    # index already exists, so PUT the _mappings endpoint instead.
    resp = rq.put(
        get_config("ELASTICSEARCH") + "/publication",
        json={"mappings": publication_mappings}
    )
    if resp.status_code == 400:
        resp = rq.put(
            get_config("ELASTICSEARCH") + "/publication/_mappings",
            json=publication_mappings
        )
    logger.info(f'Mapping Response: {resp.json()}')
    result = es.index(index="publication", body=pub, id=pub_id)
    resd["result"] = result
    return resd
from application import celery, logger
from threading import Thread
from flask import Blueprint, request
def t_authors_scraper(self):
    """Walk every organization's author listing and synchronize authors.

    For each organization, page through its Google-Scholar-style author list
    via rotating proxies: authors not yet in Mongo are dispatched to the
    `t_get_author` task; already-known authors are collected so the
    organization's domain can be $addToSet-ed onto them in one bulk update.

    :return: dict with "status" and the number of organizations processed
    """
    proxy_pool = [
        "http://192.116.142.153:8080",
        "http://192.241.149.83:80",
        "http://192.241.150.4:80"
    ]
    organizations = find("organization")  # every organization we track
    org_count = len(organizations)
    logger.info(f'There are {org_count} organizations.')
    for organization in organizations:
        domain = organization["domain"]
        logger.info(f'Starting for <{domain}>')
        tree, org_href = get_organization_page(domain, choice(proxy_pool))
        offset = 10
        known_author_ids = []
        while True:
            page_authors = get_authors(tree)
            if page_authors is None:
                # No author list on this page: the listing is exhausted.
                break
            for raw_author in page_authors:
                parsed = parse_author(raw_author)
                existing = find_one(
                    "author",
                    {"filter": {"id": {"$eq": parsed["id"]}}}
                )
                if existing:
                    known_author_ids.append(parsed["id"])
                else:
                    logger.info(f'Starting for <{parsed["name"]}>')
                    t_get_author.apply_async((parsed, domain))
            # Rotate proxy and advance the listing by one page (10 authors).
            tree = get_next_page(tree, offset, org_href, choice(proxy_pool))
            offset += 10
        # Tag every already-known author with this organization's domain.
        update_many(
            "author",
            {
                "filter": {"id": {"$in": known_author_ids}},
                "update": {"$addToSet": {"organizations": domain}}
            }
        )
    return {"status": "ok", "num_organizations": org_count}