Exemplo n.º 1
0
def calc_hits(address, top_k=10, index_name=INDEX_NAME, repeat_count=5):
    """Rank authors by HITS authority score over the author graph.

    The graph is fetched with ``get_author_graph(es, index_name)``. Iterating
    it yields the authors, and ``author_graph[author]`` yields the authors
    that ``author`` has an edge to (every such target must itself be a key of
    the graph, otherwise a ``KeyError`` is raised).

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        top_k: Maximum number of authors to return.
        index_name: Index the author graph is read from.
        repeat_count: Number of HITS update/normalisation rounds.

    Returns:
        A list of ``(author, authority_score)`` tuples, highest score first,
        with at most ``top_k`` entries.
    """
    es = get_es(address)
    author_graph = get_author_graph(es, index_name)
    i_to_author = list(author_graph)
    author_to_i = {author: i for i, author in enumerate(i_to_author)}
    n = len(i_to_author)

    # Directed edges as (source index, target index); the set removes
    # duplicate links so each pair contributes once per round.
    all_edges = {
        (i, author_to_i[target])
        for i, author in enumerate(i_to_author)
        for target in author_graph[author]
    }

    a = np.ones(n)  # authority scores
    h = np.ones(n)  # hub scores
    for _ in range(repeat_count):
        new_a = np.zeros(n)
        new_h = np.zeros(n)
        # Both updates read the previous round's vectors (simultaneous update).
        for i, j in all_edges:
            new_h[i] += a[j]
            new_a[j] += h[i]

        a = new_a
        h = new_h
        # Normalise to sum 1. With no edges the sums are 0; skip the division
        # so the scores stay 0 instead of turning into NaN.
        a_total = a.sum()
        h_total = h.sum()
        if a_total:
            a /= a_total
        if h_total:
            h /= h_total

    top_author_index = np.argsort(-a)[:top_k]
    return [(i_to_author[i], a[i]) for i in top_author_index]
Exemplo n.º 2
0
def add_data(address, path='data/quotes.json', index_name=INDEX_NAME):
    """Load a JSON file and bulk-index its contents.

    The parsed JSON is converted by ``get_bulk(data, index_name)`` and the
    result is handed to ``helpers.bulk``.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        path: Path of the JSON file to load.
        index_name: Index name forwarded to ``get_bulk``.
    """
    es = get_es(address)
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which breaks on non-ASCII content (e.g. on Windows).
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    helpers.bulk(es, get_bulk(records, index_name))
Exemplo n.º 3
0
def query(address,
          q_title,
          q_abstract,
          q_year,
          use_page_rank=False,
          w_title=1,
          w_abstract=1,
          w_year=1,
          w_page_rank=1000,
          index_name=INDEX_NAME,
          size=10):
    """Search papers by title, abstract and minimum date.

    Builds a ``bool``/``should`` query with three boosted clauses: a ``match``
    on ``paper.title``, a ``match`` on ``paper.abstract`` and a ``range``
    (``gte``) on ``paper.date``. When ``use_page_rank`` is true, a
    ``script_score`` clause over ``paper.page_rank`` is added as well.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        q_title: Text matched against ``paper.title``.
        q_abstract: Text matched against ``paper.abstract``.
        q_year: Lower bound (inclusive) for ``paper.date``.
        use_page_rank: Whether to add the page-rank scoring clause.
        w_title: Boost of the title clause.
        w_abstract: Boost of the abstract clause.
        w_year: Boost of the date clause.
        w_page_rank: Boost of the page-rank clause.
        index_name: Index to search.
        size: Maximum number of hits to return.

    Returns:
        A list with the ``_source['paper']`` value of each hit.
    """
    es = get_es(address)
    should_clauses = [
        {
            'match': {
                'paper.title': {
                    'query': q_title,
                    'boost': w_title,
                }
            }
        },
        {
            'match': {
                'paper.abstract': {
                    'query': q_abstract,
                    'boost': w_abstract,
                }
            }
        },
        {
            'range': {
                'paper.date': {
                    'gte': q_year,
                    'boost': w_year,
                }
            }
        },
    ]
    if use_page_rank:
        should_clauses.append({
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'saturation(doc["paper.page_rank"].value, 1)',
                },
                'boost': w_page_rank,
            },
        })
    body = {'query': {'bool': {'should': should_clauses}}}
    # Keyword arguments: the positional order of body/index is not the same
    # across elasticsearch-py releases.
    hits = es.search(body=body, index=index_name, size=size)['hits']['hits']
    return [doc['_source']['paper'] for doc in hits]
Exemplo n.º 4
0
def add_pagerank(address, index_name=INDEX_NAME, alpha=0.1):
    """Compute a page-rank value per document and store it on the document.

    The ids come from ``get_all_ids``, the transition matrix from
    ``get_transition_matrix`` and the rank vector from
    ``calc_pagerank_vector``; entry ``i`` of the vector is assigned to the
    ``i``-th id. The values are written to ``paper.page_rank`` with a single
    ``update_by_query`` whose script looks each document up by ``ctx._id``.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index that is read from and updated.
        alpha: Forwarded to ``get_transition_matrix`` (presumably the
            teleport probability -- confirm against that helper).
    """
    es = get_es(address)
    doc_ids = get_all_ids(es, index_name)
    P = get_transition_matrix(es, doc_ids, index_name, alpha)
    v = calc_pagerank_vector(P)
    pagerank_by_id = {doc_id: v[i] for i, doc_id in enumerate(doc_ids)}
    # Update the index the ranks were computed from (the ``index_name``
    # argument), not the module-level default index.
    es.update_by_query(
        index=index_name,
        body={
            "script": {
                "source":
                "ctx._source.paper.page_rank = params.pagerank[ctx._id]",
                "params": {
                    "pagerank": pagerank_by_id
                }
            }
        })
Exemplo n.º 5
0
def clear_index(address, index_name=INDEX_NAME):
    """Delete the index and recreate it empty.

    Args:
        address: Passed to ``get_es`` and to ``create_empty_index``.
        index_name: Name of the index to delete and recreate.
    """
    client = get_es(address)
    indices_api = client.indices
    indices_api.delete(index_name)
    # Recreate right away so the index exists again after the call.
    create_empty_index(address, index_name)
Exemplo n.º 6
0
def get_doc_count(address, index_name=INDEX_NAME):
    """Return how many ids ``get_all_ids`` reports for the index.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index whose ids are counted.
    """
    client = get_es(address)
    ids = get_all_ids(client, index_name)
    return len(ids)
Exemplo n.º 7
0
def get_first_doc(address, index_name=INDEX_NAME):
    """Fetch the document whose id comes first in ``get_all_ids``.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index to read from.

    Returns:
        The result of ``es.get`` for the first id, or the string
        ``"No docs in index"`` when there are no ids.
    """
    client = get_es(address)
    ids = get_all_ids(client, index_name)
    if len(ids) > 0:
        return client.get(index_name, ids[0])
    return "No docs in index"