def calc_hits(address, top_k=10, index_name=INDEX_NAME, repeat_count=5):
    """Rank authors by HITS authority score.

    The author graph is read through ``get_author_graph(es, index_name)`` and
    is used as a mapping ``author -> iterable of authors``; every such pair
    becomes a directed edge ``source -> target``.
    NOTE(review): what an edge means (e.g. "cites") is decided by
    ``get_author_graph`` -- confirm there.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        top_k: Maximum number of authors to return.
        index_name: Index the author graph is built from.
        repeat_count: Number of HITS update rounds to run.

    Returns:
        A list of at most ``top_k`` ``(author, authority_score)`` tuples,
        ordered by descending authority score.

    Raises:
        KeyError: If a referenced author is not itself a key of the graph.
    """
    es = get_es(address)
    author_graph = get_author_graph(es, index_name)

    i_to_author = list(author_graph)
    author_to_i = {author: i for i, author in enumerate(i_to_author)}
    n = len(i_to_author)

    # A set collapses repeated (source, target) pairs so that each edge
    # contributes exactly once per round.
    all_edges = {
        (i, author_to_i[target])
        for i, author in enumerate(i_to_author)
        for target in author_graph[author]
    }

    a = np.ones(n)  # authority scores
    h = np.ones(n)  # hub scores
    for _ in range(repeat_count):
        new_a = np.zeros(n)
        new_h = np.zeros(n)
        # Both vectors are updated from the previous round's values.
        for i, j in all_edges:
            new_h[i] += a[j]
            new_a[j] += h[i]
        a = new_a
        h = new_h
        # A graph without edges sums to zero; skip the normalisation there
        # instead of turning every score into NaN.
        a_total = a.sum()
        if a_total:
            a *= 1 / a_total
        h_total = h.sum()
        if h_total:
            h *= 1 / h_total

    top_author_index = np.argsort(-a)[:top_k]
    return [(i_to_author[i], a[i]) for i in top_author_index]
def add_data(address, path='data/quotes.json', index_name=INDEX_NAME):
    """Load a JSON file and bulk-index its contents.

    The parsed JSON is handed to ``get_bulk(data, index_name)`` and the
    result is sent to Elasticsearch with ``helpers.bulk``.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        path: Path of the JSON file to load.
        index_name: Index name forwarded to ``get_bulk``.
    """
    es = get_es(address)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    actions = get_bulk(data, index_name)
    helpers.bulk(es, actions)
def query(address, q_title, q_abstract, q_year, use_page_rank=False, w_title=1, w_abstract=1, w_year=1, w_page_rank=1000, index_name=INDEX_NAME, size=10):
    """Search papers by title, abstract and date, optionally boosted by PageRank.

    Builds a ``bool`` query whose ``should`` clauses are a ``match`` on
    ``paper.title``, a ``match`` on ``paper.abstract`` and a ``range``
    (``gte``) on ``paper.date``, each with its own boost. When
    ``use_page_rank`` is true, a ``script_score`` clause based on
    ``saturation(paper.page_rank, 1)`` is added as well.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        q_title: Text matched against ``paper.title``.
        q_abstract: Text matched against ``paper.abstract``.
        q_year: Lower bound (inclusive) for ``paper.date``.
        use_page_rank: Whether to add the PageRank ``script_score`` clause.
        w_title: Boost of the title clause.
        w_abstract: Boost of the abstract clause.
        w_year: Boost of the date clause.
        w_page_rank: Boost of the PageRank clause.
        index_name: Index to search.
        size: Maximum number of hits to return (defaults to 10).

    Returns:
        A list with the ``_source.paper`` value of every returned hit.
    """
    es = get_es(address)

    should_clauses = [
        {
            'match': {
                'paper.title': {
                    'query': q_title,
                    'boost': w_title,
                }
            }
        },
        {
            'match': {
                'paper.abstract': {
                    'query': q_abstract,
                    'boost': w_abstract,
                }
            }
        },
        {
            'range': {
                'paper.date': {
                    'gte': q_year,
                    'boost': w_year,
                }
            }
        },
    ]

    if use_page_rank:
        should_clauses.append({
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'saturation(doc["paper.page_rank"].value, 1)',
                },
                'boost': w_page_rank,
            },
        })

    body = {
        'query': {
            'bool': {
                'should': should_clauses
            }
        }
    }

    # Keyword arguments: positional API arguments are not accepted by newer
    # elasticsearch-py releases.
    hits = es.search(body=body, index=index_name, size=size)['hits']['hits']
    return [doc['_source']['paper'] for doc in hits]
def add_pagerank(address, index_name=INDEX_NAME, alpha=0.1):
    """Compute a PageRank value per document and store it on each document.

    Document ids come from ``get_all_ids``; the transition matrix is built by
    ``get_transition_matrix`` and turned into a score vector by
    ``calc_pagerank_vector``. Entry ``i`` of that vector is written to
    ``paper.page_rank`` of the ``i``-th document id through an
    update-by-query script.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index that is both read and updated.
        alpha: Forwarded to ``get_transition_matrix``.
            NOTE(review): presumably the teleport probability -- confirm.
    """
    es = get_es(address)
    doc_ids = get_all_ids(es, index_name)
    P = get_transition_matrix(es, doc_ids, index_name, alpha)
    v = calc_pagerank_vector(P)
    pagerank_by_id = {doc_id: v[i] for i, doc_id in enumerate(doc_ids)}
    # Update the index the scores were computed from (the requested
    # ``index_name``), not the module-level default.
    es.update_by_query(
        index=index_name,
        body={
            "script": {
                "source": "ctx._source.paper.page_rank = params.pagerank[ctx._id]",
                "params": {
                    "pagerank": pagerank_by_id
                }
            }
        },
    )
def clear_index(address, index_name=INDEX_NAME):
    """Delete ``index_name`` and recreate it via ``create_empty_index``.

    Args:
        address: Passed to ``get_es`` and to ``create_empty_index``.
        index_name: Index to delete and recreate.
    """
    es = get_es(address)
    # Keyword argument: positional API arguments are not accepted by newer
    # elasticsearch-py releases.
    es.indices.delete(index=index_name)
    create_empty_index(address, index_name)
def get_doc_count(address, index_name=INDEX_NAME):
    """Return the number of document ids ``get_all_ids`` reports for an index.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index whose ids are counted.

    Returns:
        ``len()`` of the collection returned by ``get_all_ids``.
    """
    client = get_es(address)
    ids = get_all_ids(client, index_name)
    return len(ids)
def get_first_doc(address, index_name=INDEX_NAME):
    """Fetch the document whose id comes first in ``get_all_ids``.

    Args:
        address: Passed to ``get_es`` to obtain the Elasticsearch client.
        index_name: Index to read from.

    Returns:
        The response of the client's ``get`` call for the first id, or the
        string ``"No docs in index"`` when ``get_all_ids`` returns nothing.
    """
    es = get_es(address)
    all_ids = get_all_ids(es, index_name)
    if len(all_ids) == 0:
        return "No docs in index"
    # Keyword arguments: positional API arguments are not accepted by newer
    # elasticsearch-py releases.
    return es.get(index=index_name, id=all_ids[0])