import itertools
import json
import multiprocessing
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from math import ceil
from statistics import mean

from elasticsearch import Elasticsearch, ElasticsearchException
from elasticsearch.helpers import bulk

import keyqueries  # project-local module providing the Keyqueries class
# Project-local helpers; the module name "utils" is an assumption.
# print_return is assumed to be a debug helper that prints and returns its argument.
from utils import readJSON, readJSON_, calc_kwss_kqss, print_return


class Searchengine:
    def __init__(self):
        self.INDEX_NAME = "paper"
        self.es_client = Elasticsearch()

    def create_index(self):
        """Creates the Elasticsearch index, dropping any existing one first."""
        is_created = False
        # Index settings: one shard, default analyzer with lowercasing and Porter stemming.
        analyzer = {"tokenizer": "standard", "filter": ["lowercase", "porter_stem"]}
        settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 1,
                "analysis": {"analyzer": {"default": analyzer}},
            },
            "mappings": {
                "dynamic": "true",
                "_source": {"enabled": "true"},
                "properties": {
                    "title": {"type": "text"},
                    "dblpKey": {"type": "keyword"},
                    "doi": {"type": "keyword"},
                    "authors": {"type": "keyword"},
                    "publisher": {"type": "keyword"},
                    "booktitle": {"type": "text"},
                    "keyqueries": {"type": "flattened"},
                    "keywords": {"type": "flattened"},
                    "abstract": {"type": "text"},
                    "fulltext": {"type": "text"},
                },
            },
        }
        print(f"Creating {self.INDEX_NAME} index...")
        try:
            if self.es_client.indices.exists(index=self.INDEX_NAME):
                self.es_client.indices.delete(index=self.INDEX_NAME, ignore=[404])
            self.es_client.indices.create(index=self.INDEX_NAME, body=settings)
            is_created = True
            print("Index created successfully.")
        except Exception as ex:
            print(str(ex))
        return is_created

    def createIndexAndIndexDocs_(self, path):
        self.create_index()
        self.index_data(readJSON_(path))

    def createIndexAndIndexDocs(self, path):
        self.create_index()
        self.index_data(readJSON(path))

    def index_data(self, data, batch_size=10000):
        """Indexes all rows in data in batches of batch_size."""
        for chunk in [data[x:x + batch_size] for x in range(0, len(data), batch_size)]:
            self.index_batch(chunk)
        print("Done indexing.")

    def index_batch(self, docs):
        """Indexes a batch of documents via the bulk helper."""
        requests = []
        for doc in docs:
            requests.append({
                "_op_type": "index",
                "_index": self.INDEX_NAME,
                "_source": doc,
            })
        bulk(self.es_client, requests, refresh=True)

    def run_query_loop(self):
        """Asks the user to enter queries until interrupted."""
        while True:
            try:
                self.title_search(input("Enter query:\n"))
            except KeyboardInterrupt:
                break

    def title_search(self, title, size=10000):
        """Matches the query against the title field (all terms required)."""
        search = {
            "size": size,
            "query": {
                "match": {
                    "title": {"query": title, "operator": "and"}
                }
            },
        }
        return self.es_client.search(index=self.INDEX_NAME, body=search)

    def normal_search(self, query, size=10000):
        """Matches the query against the title, abstract and fulltext fields."""
        search = {
            "size": size,
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["title", "abstract", "fulltext"],
                },
            },
        }
        return self.es_client.search(index=self.INDEX_NAME, body=search)

    def id_search(self, _id):
        return self.es_client.get(index=self.INDEX_NAME, id=_id)

    def normal_search_exclude_ids(self, query, ids, size=10000):
        """Like normal_search, but excludes the documents with the given ids."""
        if not isinstance(ids, list):
            return None
        search = {
            "size": size,
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": query,
                                "fields": ["title", "abstract", "fulltext"],
                            },
                        }
                    ],
                    "must_not": [{"ids": {"values": ids}}],
                }
            },
        }
        return self.es_client.search(index=self.INDEX_NAME, body=search)

    def fill_documents(self, path):
        """Adds abstract, fulltext and acmId to already indexed documents, matched by DOI."""
        docs = readJSON(path)
        doi_id = {hit["_source"]["doi"]: hit["_id"]
                  for hits in self.chunk_iterate_docs() for hit in hits}
        gen = ((doi_id[doc["doi"]], {
            "abstract": doc.get("abstract"),
            "fulltext": doc.get("fulltext"),
            "acmId": doc.get("acmId"),
        }) for doc in docs if doc["doi"] in doi_id)
        self.chunk_update_field(gen)
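    # --- Illustrative sketch, not part of the original API: shows how the search
    # helpers above compose. The query string and ids below are invented for
    # demonstration; ids that do not exist in the index are simply ignored by
    # the must_not clause. ---
    def example_search_excluding(self):
        """Hypothetical demo: search all text fields but skip two known documents."""
        res = self.normal_search_exclude_ids("information retrieval",
                                             ids=["doc-id-1", "doc-id-2"], size=10)
        return [hit["_source"]["title"] for hit in res["hits"]["hits"]]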
    def start(self, size=20):
        papers = []
        while True:
            query = input("Enter the title of a paper to build keyqueries from (empty input to cancel): ")
            if not query:
                break
            response = self.title_search(query)
            if response["hits"]["total"]["value"] == 0:
                print("No documents found, please try another search string.")
                continue
            print("Please select the paper(s) you want to use, e.g. '1, 3, 10'")
            print("\n".join(f'{index} : {paper["_source"]["title"]}'
                            for index, paper in enumerate(response["hits"]["hits"])))
            while True:
                numbers = input()
                if not numbers:
                    break
                numbers = numbers.split(",")
                if all(elem.strip().isdigit() and int(elem) < len(response["hits"]["hits"])
                       for elem in numbers):
                    for n in numbers:
                        papers.append(response["hits"]["hits"][int(n)])
                    break
                else:
                    print("Wrong input, please try again!")
            print("Currently selected papers: [\n {}\n]".format(
                "\n ".join(str(paper["_source"]["title"]) for paper in papers)))
            ask = input("Do you want to add another paper [Y/n]? ")
            if ask == "n":
                break
        if papers:
            print("\n##################### Search result #########################")
            ids = list({paper["_id"] for paper in papers})
            kq, _ = self.select_keyquerie(papers)
            if isinstance(kq, tuple):
                print("Selected KQ: " + " ".join(kq[0]) + "\n")
                result = self.normal_search_exclude_ids(" ".join(kq[0]), ids=ids, size=size)["hits"]["hits"]
            else:
                print("Selected KQ: " + str(kq) + "\n")
                result = self.normal_search_exclude_ids(kq, ids=ids, size=size)["hits"]["hits"]
            for i, hit in enumerate(result):
                print(f'{i} {hit["_source"]["title"]} \nwith score: {hit["_score"]}\n')

    def chunk_iterate_docs(self, page_size=10000, keep_alive="12h"):
        """Yields all documents of the index page by page, using a point in time
        (PIT) plus search_after so results stay consistent across pages."""
        page_size = min(page_size, 10000)  # Elasticsearch caps a single page at 10000 hits
        pit: dict = self.es_client.open_point_in_time(index=self.INDEX_NAME, keep_alive=keep_alive)
        query = {
            "size": page_size,
            "sort": [{"_doc": "asc"}],
            "pit": pit,
        }
        response = self.es_client.search(body=query)
        while response["hits"]["hits"]:
            yield response["hits"]["hits"]
            # continue after the last hit of the current page, using the
            # (possibly updated) PIT id returned by the previous search
            query["search_after"] = response["hits"]["hits"][-1]["sort"]
            query["pit"]["id"] = response["pit_id"]
            response = self.es_client.search(body=query)
        self.es_client.close_point_in_time(body={"id": response["pit_id"]})
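    # --- Illustrative sketch, not part of the original API: demonstrates consuming
    # the PIT-based generator above. Because chunk_iterate_docs yields pages (lists
    # of hits), a nested loop or comprehension is needed to reach single documents. ---
    def example_count_docs(self):
        """Hypothetical demo: count all indexed documents page by page."""
        return sum(len(page) for page in self.chunk_iterate_docs(page_size=500))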
    def update_keyqueries_without_noise(self, new_inputs, num_keywords=9, min_rank=50,
                                        candidate_pos=('NOUN', 'PROPN')):
        self.es_client.indices.refresh(index=self.INDEX_NAME)
        hits = []
        for titles in new_inputs.values():
            for title in titles:
                hits.extend(self.title_search(title, size=1)["hits"]["hits"])
        kwqss = calc_kwss_kqss(hits, num_keywords=num_keywords, min_rank=min_rank,
                               candidate_pos=candidate_pos)
        if kwqss:
            try_n = 1
            while try_n < 5:
                try:
                    self.chunk_update_field(
                        ((_id, {"keywords": kws, "keyqueries": kqs})
                         for (_id, (kws, kqs)) in kwqss.items()),
                        page_size=len(kwqss))
                    break
                except ElasticsearchException as e:
                    print(f"Exception {e} occurred on try {try_n}")
                    print(kwqss)
                    try_n += 1
            else:
                print("Could not persist the calculated keyqueries.")
                raise Exception("updating keyqueries failed after 4 tries")
        else:
            print("No keyqueries calculated.")
        return True

    def update_keyqueries(self, num_keywords=9, min_rank=50, candidate_pos=('NOUN', 'PROPN')):
        self.es_client.indices.refresh(index=self.INDEX_NAME)
        stats = self.es_client.indices.stats(index=self.INDEX_NAME, metric="docs")
        doc_count = stats["indices"][self.INDEX_NAME]["total"]["docs"]["count"]
        cpu_count = multiprocessing.cpu_count() * 2
        chunk_size = min(max(ceil(doc_count / cpu_count), 1), 1000)
        unhandled = "json/unhandled.json"
        exceptions = []
        with open(unhandled, "w+") as fc:
            fc.write("{")
        with ThreadPoolExecutor(max_workers=cpu_count) as executor:
            # broadcast the constant parameters alongside the chunk generator
            for kwqss in executor.map(calc_kwss_kqss,
                                      self.chunk_iterate_docs(page_size=chunk_size),
                                      itertools.repeat(num_keywords),
                                      itertools.repeat(min_rank),
                                      itertools.repeat(candidate_pos)):
                if not kwqss:
                    continue
                try_n = 1
                while try_n < 5:
                    try:
                        # every "_id" gets a "keyqueries" field: a dict mapping the
                        # space-separated keyword concatenation of a keyquery to the
                        # score of the respective "_id" for that keyquery
                        self.chunk_update_field(
                            ((_id, {"keywords": kws, "keyqueries": kqs})
                             for (_id, (kws, kqs)) in kwqss.items()),
                            page_size=len(kwqss))
                        print(f"done {len(kwqss)}")
                        break
                    except ElasticsearchException as e:
                        exceptions.append(e)
                        print(f"Exception occurred on try {try_n}")
                    finally:
                        try_n += 1
                else:
                    # all tries failed: dump the unhandled updates to disk
                    with open(unhandled, "a") as fjson:
                        for (_id, (kws, kqs)) in kwqss.items():
                            fjson.write(f'"{_id}": ')
                            fjson.write(json.dumps((kws, kqs)) + ",")
        # turn the collected fragments into valid JSON (strip the trailing comma)
        with open(unhandled, "r") as fr:
            content = fr.read()
        if content != "{":
            content = content[:-1]
        content += "}"
        with open(unhandled, "w") as fa:
            fa.write(content)
        with open("exceptions.txt", "w") as fexc:
            fexc.writelines(str(e) for e in exceptions)

    def chunk_update_field(self, gen, chunk_size=1000, page_size=None):
        """
        :param gen: iterable yielding (_id, fields) tuples, at most page_size of them
        :param chunk_size: number of update actions sent per bulk() call
        :param page_size: how many updates are retrievable from gen, if known
        :return: None
        """
        gen_ = ({"_index": self.INDEX_NAME, "_op_type": "update", "_id": _id,
                 "doc": print_return(fields)}
                for (_id, fields) in gen)
        # send the update actions in batches of chunk_size
        sent = 0
        while True:
            actions = list(itertools.islice(gen_, chunk_size))
            if not actions:
                break
            bulk(self.es_client, actions)
            sent += len(actions)
        print(f"{sent} documents successfully updated")
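    # --- Illustrative sketch, not part of the original API: minimal call of
    # chunk_update_field with a hand-written generator. The ids and field values
    # are made up; in real use they come from calc_kwss_kqss. ---
    def example_partial_update(self):
        """Hypothetical demo: attach keyword weights to two existing documents."""
        updates = iter([
            ("doc-id-1", {"keywords": {"search": 1.0, "ranking": 0.4}}),
            ("doc-id-2", {"keywords": {"retrieval": 0.9}}),
        ])
        self.chunk_update_field(updates, page_size=2)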
    def select_keyquerie(self, papers, final_kws=9, min_rank=50):
        kqss_v = [paper["_source"].get("keyqueries") for paper in papers
                  if paper["_source"].get("keyqueries")]
        ids = {paper["_id"] for paper in papers}
        allkeys = [key for dic in kqss_v for key in dic]
        revindex = Counter(allkeys)
        # keyqueries that occur for every selected paper
        candidates = [k for k, v in revindex.items() if v >= len(papers)]
        selected = ""
        if candidates:
            print("\n---------------------- Option 1 ------------------------")
            # pick the candidate with the highest average score across all papers
            score = 0
            for temp in candidates:
                aver = sum(number[temp] for number in kqss_v)
                maybe = aver / len(papers)
                if maybe > score:
                    score = maybe
                    selected = temp
            return selected, 1
        solutions = self.option2(papers)
        if solutions:
            print("\n---------------------- Option 2 ------------------------")
            max_keywords = frozenset.union(*solutions.keys())
            k = keyqueries.Keyqueries()
            keyout = set()
            if len(max_keywords) <= final_kws:
                output = k.best_kq(_ids=list(ids), keywords=list(max_keywords), min_rank=min_rank)
                return output, 2
            max_anz = sum(len(v) for v in solutions.values())
            for solution, _ids in solutions.items():
                keywords = dict()
                for _id in _ids:
                    for keyword, value in self.id_search(_id)["_source"].get("keywords").items():
                        if keyword in solution:
                            old_v = keywords.get(keyword, 0)
                            keywords[keyword] = old_v + value
                sorted_merge = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
                # each solution contributes keywords proportionally to the
                # number of documents it covers
                allowed_n = ceil((len(_ids) / max_anz) * final_kws)
                keyout.update({keyword for (keyword, value) in sorted_merge[:allowed_n]})
            output = k.best_kq(_ids=list(ids), keywords=list(keyout), min_rank=min_rank)
            return output, 2
        print("\n---------------------- Option 3 ------------------------")
        k = keyqueries.Keyqueries()
        top_kwss = set()
        for hit in papers:
            kqs_v = hit["_source"].get("keyqueries")
            if kqs_v:
                kqs_v_srtd = sorted(kqs_v.items(), key=lambda item: item[1], reverse=True)
                # the key is a space-separated keyword string, so split it
                top_kwss.update(kqs_v_srtd[0][0].split())
        with ThreadPoolExecutor(max_workers=1) as pool:
            try:
                return next(pool.map(k.best_kq, (list(ids),), (list(top_kwss),), timeout=120), None), 3
            except TimeoutError:
                pass
        return None, None

    def dontcareaboutcoverageofkeyqueries(self, docs_p, top_kqs=5):
        """Returns the keyquery with the highest summed score across all docs,
        ignoring how many of the docs it actually covers."""
        kqs = dict()
        for doc in docs_p:
            for kq_str, score in doc["_source"].get("keyqueries", dict()).items():
                if kq_str and score:
                    kq = frozenset(kq_str.split())
                    kqs[kq] = kqs.get(kq, 0.0) + score
        return sorted(kqs.items(), key=lambda x: x[1], reverse=True)[0]

    def option2(self, docs_p):
        docs = []
        for doc_p in docs_p:
            doc = {"_id": doc_p["_id"]}
            try:
                doc["keyqueries"] = {frozenset(kq_str.split()): score
                                     for kq_str, score in doc_p["_source"]["keyqueries"].items()}
                docs.append(doc)
            except KeyError:
                pass
        id_src = {doc["_id"]: doc["keyqueries"] for doc in docs}
        kqs = set(kq for doc in docs for kq, score in doc["keyqueries"].items())
        ms = {kq: set(doc["_id"] for doc in docs if kq in doc["keyqueries"]) for kq in kqs}

        def kq_sort(kq__doc_ids):
            """
            :param kq__doc_ids: tuple from ms.items(); 1st is a keyquery, 2nd is the
                set of ids of the corresponding docs
            :return: a tuple; 1st is the number of corresponding docs, 2nd the average
                of the scores of the keyquery across those docs
            """
            kq = kq__doc_ids[0]
            doc_ids = kq__doc_ids[1]
            return len(doc_ids), mean(id_src[_id].get(kq) for _id in doc_ids)

        ms = dict(sorted(ms.items(), key=kq_sort, reverse=True))
        return self.greedy(ms, set(id_src.keys()))

    def greedy(self, ms, _ids):
        """Greedy set cover: picks keyqueries (best first) until all _ids are covered."""
        found_docs = set()
        solution = dict()
        for kq, kq_docs in ms.items():
            if not kq_docs.issubset(found_docs):
                solution[kq] = kq_docs
                found_docs.update(kq_docs)
                if found_docs == _ids:
                    break
        return solution

    def full_text_search(self):
        pass  # not implemented yet

    def option4(self, papers):
        print("\n---------------------- Option 4 ------------------------")
        ids = {paper["_id"] for paper in papers}
        score_this = []
        for paper in papers:
            kqs_v = paper["_source"].get("keyqueries")
            kq, _ = sorted(kqs_v.items(), key=lambda x: x[1], reverse=True)[0]
            score_this.append(self.normal_search_exclude_ids(kq, ids=list(ids))["hits"]["hits"])
        thislist = list(itertools.chain.from_iterable(score_this))
        thislist.sort(key=lambda x: x["_score"], reverse=True)
        # keep only the highest-scoring hit per document id
        thislist = [item for item in thislist
                    if item["_score"] == max(item2["_score"] for item2 in thislist
                                             if item["_id"] == item2["_id"])]
        return thislist
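    # --- Illustrative sketch, not part of the original API: tiny worked example of
    # the greedy set cover used by option2. The keyqueries and ids are invented.
    # greedy picks the "deep learning" keyquery first (it covers two ids), then
    # "ranking" to cover the remaining id, so both end up in the solution. ---
    def example_greedy_cover(self):
        """Hypothetical demo of self.greedy on hand-made data."""
        ms = {
            frozenset({"deep", "learning"}): {"a", "b"},
            frozenset({"ranking"}): {"c"},
            frozenset({"deep"}): {"a"},  # redundant: its docs are already covered
        }
        return self.greedy(ms, {"a", "b", "c"})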
    def kqc(self, papers, num_keywords=9, min_rank=50, candidate_pos=('NOUN', 'PROPN')):
        print("\n---------------------- Option 5 ------------------------")
        k = keyqueries.Keyqueries()
        ids = {paper["_id"] for paper in papers}
        kws = k.extract_keywords_kqc(papers, num_keywords=num_keywords, candidate_pos=candidate_pos)
        kq = k.best_kq(_ids=list(ids), keywords=kws, min_rank=min_rank)
        return kq, "kqc"

    def debug_print(self):
        print("hits without kq:",
              sum(1 for hits in self.chunk_iterate_docs() for hit in hits
                  if not hit["_source"].get("keyqueries")))

    def extract_noise(self, size=5000):
        """Samples `size` random documents and writes them to json/noise<size>.json."""
        query = {
            "size": size,
            "query": {
                "function_score": {
                    "query": {"match_all": {}},
                    "random_score": {},
                }
            },
        }
        content = json.dumps([hit["_source"] for hit in
                              self.es_client.search(body=query, index=self.INDEX_NAME)["hits"]["hits"]])
        with open(f"json/noise{size}.json", "w") as file:
            file.write(content)

    def extract_json(self, search_phrase, file_name=None):
        """Writes all title matches for search_phrase to a JSON file."""
        if not file_name:
            file_name = f"{search_phrase}.json"
        with open(file_name, "w") as file:
            file.write(json.dumps([hit["_source"] for hit in
                                   self.title_search(search_phrase, size=1000)["hits"]["hits"]]))
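# --- Usage sketch (assumptions: an Elasticsearch node is reachable on localhost,
# and "papers.json" is a hypothetical JSON list of paper dicts matching the
# mapping defined in create_index). ---
if __name__ == "__main__":
    engine = Searchengine()
    engine.createIndexAndIndexDocs("papers.json")
    response = engine.title_search("neural information retrieval", size=5)
    for hit in response["hits"]["hits"]:
        print(hit["_source"]["title"], hit["_score"])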