Exemplo n.º 1
0
def author_paper_query(author_ids):
    ''' Collect the paper ids linked to the given author ids.

    Runs a `terms` query on `AuthorId` against the
    `paperauthoraffiliations` index and returns the `PaperId` value of
    every hit, in the order the scan yields them (one entry per hit).
    '''
    es_client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Only this field is requested from the index
    paper_field = 'PaperId'

    search = (
        Search(index='paperauthoraffiliations', using=es_client)
        .query('terms', AuthorId=author_ids)
        .source([paper_field])
    )

    # scan() streams every matching hit rather than a single page
    return [hit[paper_field] for hit in search.scan()]
Exemplo n.º 2
0
def fos_name_level_dict_query(fos_ids):
    ''' Look up the name and level of fields of study by id.

    Returns a pair `(names, levels)` of dictionaries, both keyed by the
    integer form of each hit's document id (`meta.id`).
    '''
    es_client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Fields fetched from the index
    name_field, level_field = 'NormalizedName', 'Level'

    search = (
        Search(index='fieldsofstudy', using=es_client)
        .query('terms', FieldOfStudyId=fos_ids)
        .source([name_field, level_field])
        .params(request_timeout=30)
    )

    names, levels = dict(), dict()
    for hit in search.scan():
        name = hit[name_field]
        fos_id = int(hit.meta.id)
        names[fos_id] = name
        levels[fos_id] = hit[level_field]

    return names, levels
Exemplo n.º 3
0
def cache_paper_info(paper_infos,
                     chunk_size=20,
                     request_timeout=100,
                     additional_tag=None):
    ''' Converts and caches a list of paper info dictionaries.

    Each entry of `paper_infos` is converted with
    `paper_info_to_cache_json` and the results are sent to Elasticsearch
    through `helpers.bulk`, `chunk_size` entries per bulk call.

    paper_infos     -- sliceable sequence of paper info dictionaries
    chunk_size      -- number of entries sent per bulk request
    request_timeout -- forwarded to `helpers.bulk`
    additional_tag  -- forwarded to `paper_info_to_cache_json` for every
                       entry; defaults to a fresh empty dict per call
    '''
    # A `{}` default would be one dict shared by every call; build a new
    # one per call so nothing can leak between invocations.
    if additional_tag is None:
        additional_tag = {}

    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Cache to database, one bulk request per chunk
    for start in range(0, len(paper_infos), chunk_size):
        chunk = paper_infos[start:start + chunk_size]

        # Lazily convert each entry into its cache-able json form
        cache_datas = (paper_info_to_cache_json(pi,
                                                additional_tag=additional_tag)
                       for pi in chunk)
        helpers.bulk(client,
                     cache_datas,
                     request_timeout=request_timeout,
                     refresh=True,
                     stats_only=True)
Exemplo n.º 4
0
def journal_paper_query(jour_ids):
    ''' Collect the paper ids published in the given journals.

    Runs a `terms` query on `JournalId` against the `papers` index and
    returns the `PaperId` of every hit as a list.
    '''
    es_client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Only this field is requested from the index
    paper_field = 'PaperId'

    search = (
        Search(index='papers', using=es_client)
        .query('terms', JournalId=jour_ids)
        .source([paper_field])
        .params(request_timeout=30)
    )

    # scan() streams every matching hit rather than a single page
    return [hit[paper_field] for hit in search.scan()]
Exemplo n.º 5
0
def conference_paper_query(conf_ids):
    ''' Collect the paper ids belonging to the given conferences.

    Runs a `terms` query on `ConferenceSeriesId` against the `papers`
    index and returns the `PaperId` of every hit as a list.
    '''
    es_client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Only this field is requested from the index
    paper_field = 'PaperId'

    search = (
        Search(index='papers', using=es_client)
        .query('terms', ConferenceSeriesId=conf_ids)
        .source([paper_field])
        .params(request_timeout=30)
    )

    # scan() streams every matching hit rather than a single page
    return [hit[paper_field] for hit in search.scan()]
Exemplo n.º 6
0
# Try load config: values found in scripts/<SCRIPT_CONFIG> override
# START_VERSION, THREADS and BATCH_SIZE.
try:
    config_path = os.path.join('scripts', SCRIPT_CONFIG)

    with open(config_path, 'r') as f:
        config = json.load(f)
except FileNotFoundError:
    # open() signals a missing file with FileNotFoundError (never
    # FileExistsError), so this is the case that must be tolerated.
    # NOTE(review): presumably defaults for the three settings are defined
    # earlier in the script -- confirm.
    pass
else:
    START_VERSION = config['update_version']
    THREADS = config['threads']
    BATCH_SIZE = config['batch_size']
    print(BATCH_SIZE)

#%%
# Elasticsearch connection built from the configured hostname
client = Elasticsearch(conf.get('elasticsearch.hostname'))

#%%
# Matches documents that either have no UpdateVersion field at all or
# whose UpdateVersion is below START_VERSION.
query = Q(
    'bool',
    minimum_should_match=1,
    should=[
        ~Q('exists', field='UpdateVersion'),
        Q('range', UpdateVersion={'lt': START_VERSION}),
    ],
)

# Requires an UpdateVersion field whose value is at least START_VERSION.
cache_allow = Q(
    'bool',
    minimum_should_match=1,
    must=[
        Q('exists', field='UpdateVersion'),
        Q('range', UpdateVersion={'gte': START_VERSION}),
    ],
)

#%%
Exemplo n.º 7
0
import os, sys, json, uuid, hashlib
from multiprocessing import Pool
from elasticsearch_dsl.connections import connections
from datetime import datetime
from graph.schema_cache import BrowseCache, AuthorGroup, PaperGroup, AuthorInfo, PaperInfo
from graph.config import conf

# Elasticsearch host read from the project configuration; used by init_es().
hostname = conf.get("elasticsearch.hostname")


def generate_uuid(seed=None):
    """Return an identifier, deterministic when a seed is supplied.

    With a truthy ``seed`` the result is the SHA-1 hex digest (a ``str``) of
    the encoded seed. With no seed, or a falsy one such as ``""``, the
    result is a fresh ``uuid.uuid1()`` object.
    """
    if seed:
        return hashlib.sha1(str.encode(seed)).hexdigest()
    return uuid.uuid1()


def init_es():
    """Register the elasticsearch_dsl connection to `hostname` (20 s timeout)."""
    connections.create_connection(timeout=20, hosts=hostname)
    print("Elasticsearch connections initialized")

def saveNewAuthorGroupCache(cache):
    """Populate an AuthorGroup document from the ``cache`` mapping.

    ``cache`` must contain "Type", "NormalizedNames", "DisplayName" and
    "Citation"; "Year", "Affiliations", "Keywords" and "Url" are optional
    and stored as None when absent.

    NOTE(review): nothing visible here saves or returns ``doc`` -- the
    function may continue beyond this excerpt; confirm.
    """
    print("starting cache")
    # Make sure a connection is registered before touching documents.
    init_es()
    # Only types listed under cache_types["author_group"] are accepted.
    # NOTE(review): this check disappears under `python -O`.
    assert cache["Type"] in cache_types["author_group"]
    doc = AuthorGroup()
    doc.Type = cache["Type"]
    doc.NormalizedNames = cache["NormalizedNames"]
    doc.DisplayName = cache["DisplayName"]
    # Year is kept only when present and all digits; .isdigit() implies the
    # value is expected to be a string.
    doc.Year = cache["Year"] if ("Year" in cache and cache['Year'].isdigit()) else None
    doc.Affiliations = cache["Affiliations"] if "Affiliations" in cache else None
    doc.Keywords = cache["Keywords"] if "Keywords" in cache else None
    doc.Url = cache['Url'] if 'Url' in cache else None
    doc.Citation = cache['Citation']
Exemplo n.º 8
0
def paper_info_cache_query(paper_ids,
                           batch_size=DEFAULT_BATCH,
                           query_filter=None):
    """ Gets paper info from cache.

    paper_ids    -- ids looked up as `_id` in the `paper_info` index
    batch_size   -- passed as the `size` search parameter of the scan
    query_filter -- optional extra query applied on top of the id filter

    Returns a dictionary with three entries:
      'complete' -- cached entries that are not 'partial' and whose every
                    reference and citation carries 'FieldsOfStudy'
      'partial'  -- cached entries whose `cache_type` is 'partial'
      'missing'  -- set of requested ids that did not end up in either
                    list (not cached, or rejected for lacking
                    'FieldsOfStudy' data)

    The `CreatedDate` and `cache_type` fields are stripped from the
    returned entries.
    """
    start = datetime.now()

    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Query results
    complete_info = list()
    partial_info = list()
    seen = set()

    # Query for paper info
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.filter('terms', _id=paper_ids)
    # Honour the caller's batch size rather than always using the default
    paper_info_s = paper_info_s.params(size=batch_size)
    if query_filter is not None:
        paper_info_s = paper_info_s.query(query_filter)

    # Convert query into dictionary format
    for paper_info in paper_info_s.scan():
        paper_info_res = paper_info.to_dict()

        # Remove the creation date for query
        field_del(paper_info_res, 'CreatedDate')

        # Entries without field-of-study data are treated as not cached
        if 'FieldsOfStudy' not in paper_info_res:
            continue

        # The cache type is bookkeeping only; it is not returned
        cache_type = paper_info_res.pop('cache_type')

        if cache_type == 'partial':
            partial_info.append(paper_info_res)
        else:
            references = paper_info_res['References']
            citations = paper_info_res['Citations']

            # A complete entry needs field-of-study data on every linked
            # paper; otherwise it is left out (and reported as missing).
            incomplete = (
                any('FieldsOfStudy' not in ref for ref in references)
                or any('FieldsOfStudy' not in cit for cit in citations)
            )
            if incomplete:
                continue
            complete_info.append(paper_info_res)

        # Add to seen set
        seen.add(paper_info_res['PaperId'])

    # Timing output for the whole lookup
    print(batch_size, datetime.now() - start)

    # Check for no results and return
    return {
        'complete': complete_info,
        'partial': partial_info,
        'missing': set(paper_ids) - seen
    }
Exemplo n.º 9
0
Update cache to include fos information
'''
from datetime import datetime

from graph.config      import conf
from elasticsearch     import Elasticsearch
from elasticsearch_dsl import Search, Q

from core.search.query_info_db    import paper_info_multiquery
from core.search.cache_data       import cache_paper_info

# Constants
# Upper bound of the result slice taken from the query on each loop pass.
NUM_PAPERS = 1

# Elastic search client
client = Elasticsearch(conf.get("elasticsearch.hostname"))

# Only entries whose CreatedDate is earlier than this are selected below.
THRESHOLD_DATE = datetime(2019, 3, 6, 10, 43, 45, 734484) 

# Memory for deleting entries which no longer exist
last_papers = set()

# NOTE(review): the loop body continues beyond this excerpt; the exit
# condition is not visible here.
while True:
    # Specify the query
    paper_info_s = Search(index='paper_info', using=client)
    # Newest entries first
    paper_info_s = paper_info_s.sort({ "CreatedDate": { "order": "desc" } })
    # Select entries that have no FieldsOfStudy field and were created
    # before THRESHOLD_DATE
    paper_info_s = paper_info_s.update_from_dict({ "query": { "bool": { "must_not": [ { "exists": { "field": "FieldsOfStudy" } } ], "must": { "range": { "CreatedDate": { "lt": THRESHOLD_DATE } } } } } })
    # Only the paper id is needed from each hit
    paper_info_s = paper_info_s.source(['PaperId'])

    # Get number of query results
    # Restrict the search to the first NUM_PAPERS hits
    results = paper_info_s[:NUM_PAPERS]
Exemplo n.º 10
0
def paa_prop_query(paper_ids):
    ''' Get the author/affiliation properties of papers.

    Returns a dictionary mapping each paper id found in the
    `paperauthoraffiliations` index to a list of dictionaries holding the
    `AuthorId` / `AffiliationId` of a hit plus the matching `AuthorName` /
    `AffiliationName`. A hit is dropped when its author id or its
    affiliation id has no entry in the corresponding name lookup.
    '''
    es_client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Fields fetched from the index
    wanted_fields = ['PaperId', 'AuthorId', 'AffiliationId']

    search = (
        Search(index='paperauthoraffiliations', using=es_client)
        .query('terms', PaperId=paper_ids)
        .source(wanted_fields)
        .params(request_timeout=TIMEOUT)
    )

    # First pass: group the raw rows per paper and gather the ids to name
    rows_by_paper = dict()
    author_ids = set()
    affiliation_ids = set()
    for hit in search.scan():
        row = hit.to_dict()

        # The paper id becomes the grouping key and leaves the row
        paper_id = row.pop('PaperId')

        if 'AuthorId' in row:
            author_ids.add(row['AuthorId'])

        if 'AffiliationId' in row:
            affiliation_ids.add(row['AffiliationId'])

        rows_by_paper.setdefault(paper_id, []).append(row)

    # Resolve all names in one query per kind
    author_names = author_name_dict_query(list(author_ids))
    affiliation_names = affiliation_name_dict_query(list(affiliation_ids))

    # Second pass: attach names, dropping rows whose ids cannot be named
    named_by_paper = dict()
    for paper_id, rows in rows_by_paper.items():
        named_rows = list()
        for row in rows:
            if 'AuthorId' in row:
                if row['AuthorId'] not in author_names:
                    continue
                row['AuthorName'] = author_names[row['AuthorId']]

            if 'AffiliationId' in row:
                if row['AffiliationId'] not in affiliation_names:
                    continue
                row['AffiliationName'] = affiliation_names[
                    row['AffiliationId']]

            named_rows.append(row)

        named_by_paper[paper_id] = named_rows

    return named_by_paper