def author_paper_query(author_ids):
    ''' Query author ids for available papers. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Target
    paa_target = 'PaperId'

    # Query results
    auth_paper_res = list()

    # Query for paa
    paa_s = Search(index='paperauthoraffiliations', using=client)
    paa_s = paa_s.query('terms', AuthorId=author_ids)
    paa_s = paa_s.source([paa_target])

    # Parse to list
    for paa in paa_s.scan():
        auth_paper_res.append(paa[paa_target])

    return auth_paper_res
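# Usage sketch (the id below is a placeholder, not a real MAG AuthorId;
# assumes the 'paperauthoraffiliations' index is populated):
#
#     paper_ids = author_paper_query([1234567890])
#     # -> list of PaperId values, one per authorship record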
def fos_name_level_dict_query(fos_ids):
    ''' Find field of study names and levels from ids. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    fos_target = ['NormalizedName', 'Level']

    # Query for fields of study
    fos_s = Search(index='fieldsofstudy', using=client)
    fos_s = fos_s.query('terms', FieldOfStudyId=fos_ids)
    fos_s = fos_s.source(fos_target)
    fos_s = fos_s.params(request_timeout=30)

    names = dict()
    levels = dict()
    for fos in fos_s.scan():
        # Add names and levels to results, keyed by document id
        names[int(fos.meta.id)] = fos[fos_target[0]]
        levels[int(fos.meta.id)] = fos[fos_target[1]]

    return names, levels
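# Usage sketch (placeholder ids, not real FieldOfStudyIds; assumes the
# 'fieldsofstudy' documents are indexed with FieldOfStudyId as their _id):
#
#     names, levels = fos_name_level_dict_query([111, 222])
#     # names  -> {111: 'computer science', ...}
#     # levels -> {111: 0, ...}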
def cache_paper_info(paper_infos, chunk_size=20, request_timeout=100,
                     additional_tag=None):
    ''' Converts and caches a list of paper info dictionaries in chunks. '''
    # Avoid a mutable default argument
    if additional_tag is None:
        additional_tag = {}

    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Convert into cache-able json
    paper_info_chunks = [paper_infos[i:i + chunk_size]
                         for i in range(0, len(paper_infos), chunk_size)]

    # Cache to database
    for chunk in paper_info_chunks:
        cache_datas = (paper_info_to_cache_json(pi, additional_tag=additional_tag)
                       for pi in chunk)
        helpers.bulk(client, cache_datas, request_timeout=request_timeout,
                     refresh=True, stats_only=True)
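# Usage sketch (assumes paper_info_to_cache_json and a populated paper_infos
# list from elsewhere in this package; the tag is illustrative):
#
#     cache_paper_info(paper_infos, chunk_size=20,
#                      additional_tag={'UpdateVersion': 1})
#
# With chunk_size=20, a list of 50 paper infos is bulk-indexed in three
# chunks of 20, 20 and 10 documents.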
def journal_paper_query(jour_ids):
    ''' Query journal ids for available papers. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Target
    papers_target = 'PaperId'

    # Query results
    jour_paper_res = list()

    # Query for papers
    papers_s = Search(index='papers', using=client)
    papers_s = papers_s.query('terms', JournalId=jour_ids)
    papers_s = papers_s.source([papers_target])
    papers_s = papers_s.params(request_timeout=30)

    # Parse to list
    for papers in papers_s.scan():
        jour_paper_res.append(papers[papers_target])

    return jour_paper_res
def conference_paper_query(conf_ids):
    ''' Query conference series ids for available papers. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Target
    papers_target = 'PaperId'

    # Query results
    conf_paper_res = list()

    # Query for papers
    papers_s = Search(index='papers', using=client)
    papers_s = papers_s.query('terms', ConferenceSeriesId=conf_ids)
    papers_s = papers_s.source([papers_target])
    papers_s = papers_s.params(request_timeout=30)

    # Parse to list
    for papers in papers_s.scan():
        conf_paper_res.append(papers[papers_target])

    return conf_paper_res
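# Usage sketch for the two venue helpers above (placeholder ids; both return
# plain PaperId lists):
#
#     jour_papers = journal_paper_query([1234567])
#     conf_papers = conference_paper_query([7654321])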
# Try load config
try:
    config_path = os.path.join('scripts', SCRIPT_CONFIG)
    with open(config_path, 'r') as f:
        config = json.load(f)
    START_VERSION = config['update_version']
    THREADS = config['threads']
    BATCH_SIZE = config['batch_size']
    print(BATCH_SIZE)
except FileNotFoundError:
    pass

#%%
# Elastic search client
client = Elasticsearch(conf.get('elasticsearch.hostname'))

#%%
# Documents that still need updating: UpdateVersion missing or stale
query = Q('bool',
          should=[~Q('exists', field='UpdateVersion'),
                  Q('range', UpdateVersion={'lt': START_VERSION})],
          minimum_should_match=1)

# Documents that are already up to date and safe to serve from cache
cache_allow = Q('bool',
                must=[Q('exists', field='UpdateVersion'),
                      Q('range', UpdateVersion={'gte': START_VERSION})])

#%%
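# A sketch of which documents each filter above matches, assuming an integer
# UpdateVersion field and START_VERSION = 2 (illustrative values):
#
#     {'PaperId': 1}                      -> query       (field missing)
#     {'PaperId': 2, 'UpdateVersion': 1}  -> query       (stale version)
#     {'PaperId': 3, 'UpdateVersion': 2}  -> cache_allow (up to date)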
import os, sys, json, uuid, hashlib
from multiprocessing import Pool
from elasticsearch_dsl.connections import connections
from datetime import datetime

from graph.schema_cache import BrowseCache, AuthorGroup, PaperGroup, \
    AuthorInfo, PaperInfo
from graph.config import conf

hostname = conf.get("elasticsearch.hostname")


def generate_uuid(seed=None):
    # Return a string in both branches (bare uuid1() would return a UUID object)
    return str(uuid.uuid1()) if not seed \
        else hashlib.sha1(str.encode(seed)).hexdigest()


def init_es():
    connections.create_connection(hosts=hostname, timeout=20)
    print("Elasticsearch connections initialized")


def saveNewAuthorGroupCache(cache):
    print("starting cache")
    init_es()
    assert cache["Type"] in cache_types["author_group"]

    doc = AuthorGroup()
    doc.Type = cache["Type"]
    doc.NormalizedNames = cache["NormalizedNames"]
    doc.DisplayName = cache["DisplayName"]
    doc.Year = cache["Year"] if ("Year" in cache and cache["Year"].isdigit()) else None
    doc.Affiliations = cache.get("Affiliations")
    doc.Keywords = cache.get("Keywords")
    doc.Url = cache.get("Url")
    doc.Citation = cache['Citation']
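# The function above is truncated after the Citation assignment. A plausible
# completion (an assumption, not the original code) would persist the document
# and return its generated id:
#
#     doc.CreatedDate = datetime.now()
#     doc.meta.id = generate_uuid(seed=cache["DisplayName"])
#     doc.save()
#     return doc.meta.id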
def paper_info_cache_query(paper_ids, batch_size=DEFAULT_BATCH,
                           query_filter=None):
    """ Gets paper info from cache. """
    start = datetime.now()

    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Query results
    complete_info = list()
    partial_info = list()
    seen = set()

    # Query for paper info
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.filter('terms', _id=paper_ids)
    paper_info_s = paper_info_s.params(size=batch_size)
    if query_filter is not None:
        paper_info_s = paper_info_s.query(query_filter)

    # Convert query into dictionary format
    for paper_info in paper_info_s.scan():
        paper_info_res = paper_info.to_dict()

        # Remove the creation date for query
        field_del(paper_info_res, 'CreatedDate')

        # Check the type of the result
        if 'FieldsOfStudy' not in paper_info_res:
            continue

        if paper_info_res['cache_type'] == 'partial':
            partial_info.append(paper_info_res)
        else:
            # Skip entries whose references or citations lack fields of study
            skip = False
            for ref in paper_info_res['References']:
                if 'FieldsOfStudy' not in ref:
                    skip = True
                    break
            for cit in paper_info_res['Citations']:
                if 'FieldsOfStudy' not in cit:
                    skip = True
                    break
            if skip:
                continue
            complete_info.append(paper_info_res)

        del paper_info_res['cache_type']

        # Add to seen set
        seen.add(paper_info_res['PaperId'])

    print(batch_size, datetime.now() - start)

    # Check for no results and return
    return {
        'complete': complete_info,
        'partial': partial_info,
        'missing': set(paper_ids) - seen,
    }
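# Usage sketch (placeholder ids; the return value bins ids three ways):
#
#     res = paper_info_cache_query([111, 222, 333])
#     # res['complete'] -> fully linked paper_info dicts
#     # res['partial']  -> dicts cached with cache_type == 'partial'
#     # res['missing']  -> ids with no usable cache entry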
'''
Update cache to include fos information
'''
from datetime import datetime

from graph.config import conf
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

from core.search.query_info_db import paper_info_multiquery
from core.search.cache_data import cache_paper_info

# Constants
NUM_PAPERS = 1

# Elastic search client
client = Elasticsearch(conf.get("elasticsearch.hostname"))

THRESHOLD_DATE = datetime(2019, 3, 6, 10, 43, 45, 734484)

# Memory for deleting entries which no longer exist
last_papers = set()

while True:
    # Specify the query: entries older than the threshold that are still
    # missing the FieldsOfStudy field
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.sort({"CreatedDate": {"order": "desc"}})
    paper_info_s = paper_info_s.update_from_dict({
        "query": {
            "bool": {
                "must_not": [{"exists": {"field": "FieldsOfStudy"}}],
                "must": {
                    "range": {"CreatedDate": {"lt": THRESHOLD_DATE}}
                }
            }
        }
    })
    paper_info_s = paper_info_s.source(['PaperId'])

    # Get number of query results
    results = paper_info_s[:NUM_PAPERS]
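# The raw-dict query above could equivalently be written with Q objects,
# matching the style used elsewhere in this repo (a sketch, not a change
# to the script's behavior):
#
#     fos_missing = ~Q('exists', field='FieldsOfStudy') & \
#                   Q('range', CreatedDate={'lt': THRESHOLD_DATE})
#     paper_info_s = paper_info_s.query(fos_missing)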
def paa_prop_query(paper_ids):
    ''' Get author and affiliation properties for a list of papers. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    paa_targets = ['PaperId', 'AuthorId', 'AffiliationId']

    # Query for paper affiliation
    paa_s = Search(index='paperauthoraffiliations', using=client)
    paa_s = paa_s.query('terms', PaperId=paper_ids)
    paa_s = paa_s.source(paa_targets)
    paa_s = paa_s.params(request_timeout=TIMEOUT)

    # Convert paa into dictionary format
    results = dict()
    auth_ids = set()
    affi_ids = set()
    for paa in paa_s.scan():
        paa_res = paa.to_dict()

        # Get fields
        paper_id = paa_res['PaperId']
        del paa_res['PaperId']

        # Author
        if 'AuthorId' in paa_res:
            auth_ids.add(paa_res['AuthorId'])

        # Affiliation
        if 'AffiliationId' in paa_res:
            affi_ids.add(paa_res['AffiliationId'])

        # Aggregate results
        if paper_id in results:
            results[paper_id].append(paa_res)
        else:
            results[paper_id] = [paa_res]

    # Resolve display names for the collected author and affiliation ids
    auth_names = author_name_dict_query(list(auth_ids))
    affi_names = affiliation_name_dict_query(list(affi_ids))

    # Attach names, dropping records whose ids could not be resolved
    res = dict()
    for p_id, paa_info_list in results.items():
        paa_res = list()
        for paa_info in paa_info_list:
            if 'AuthorId' in paa_info:
                if paa_info['AuthorId'] in auth_names:
                    paa_info['AuthorName'] = auth_names[paa_info['AuthorId']]
                else:
                    continue
            if 'AffiliationId' in paa_info:
                if paa_info['AffiliationId'] in affi_names:
                    paa_info['AffiliationName'] = affi_names[
                        paa_info['AffiliationId']]
                else:
                    continue
            paa_res.append(paa_info)
        res[p_id] = paa_res

    # Return as dictionary
    return res
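# Usage sketch (placeholder ids):
#
#     paa = paa_prop_query([111, 222])
#     # paa[111] -> [{'AuthorId': ..., 'AuthorName': ...,
#     #               'AffiliationId': ..., 'AffiliationName': ...}, ...]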