def get_similar_docs(self, **kwargs):
    """Find documents whose text duplicates that of already-labeled question tweets.

    Expected kwargs: "index", "session", "text_field", and "questions"
    (a list of dicts each carrying "str_id" and "label").
    Returns a list of dicts {"filename", "label", <text_field>}, one per
    duplicate found (empty list when there are no questions).
    """
    if not kwargs["questions"]:
        return []

    my_connector = Es_connector(index=kwargs["index"])
    duplicated_docs = []

    # Fetch the original textual content of every question tweet that is
    # still in the "proposed" state for this session.
    id_matches = [{"match": {"id_str": {"query": question["str_id"]}}}
                  for question in kwargs["questions"]]
    originals = my_connector.search({
        "query": {
            "bool": {
                "should": id_matches,
                "minimum_should_match": 1,
                "must": [{"match": {kwargs["session"]: "proposed"}}]
            }
        }
    })

    for doc in originals["hits"]["hits"]:
        # Exact-match on the configured text field. This was hard-coded to
        # "text.keyword", which silently broke whenever text_field != "text".
        query = {
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            kwargs["text_field"] + ".keyword": {
                                "value": doc["_source"][kwargs["text_field"]]
                            }
                        }
                    }]
                }
            }
        }
        matching_docs = my_connector.search(query)
        # Strictly greater than 1: a document always matches its own text.
        if matching_docs["hits"]["total"] > 1:
            label = next(q["label"] for q in kwargs["questions"]
                         if q["str_id"] == doc["_source"]["id_str"])
            for dup_doc in matching_docs["hits"]["hits"]:
                duplicated_docs.append({
                    "filename": dup_doc["_source"]["id_str"],
                    "label": label,
                    kwargs["text_field"]: dup_doc["_source"][kwargs["text_field"]]
                })

    return duplicated_docs
def get_ngrams_by_query(self, query="", **kwargs):
    """Aggregate n-gram counts for *query*, sub-bucketed by session label.

    Expected kwargs: "index", "n_size" (e.g. "2" for bigrams),
    "results_size", and "session".
    Returns the raw Elasticsearch response, or {} when the search fails.
    """
    try:
        connector = Es_connector(
            index=kwargs["index"],
            config_relative_path=self.config_relative_path)
        ngram_field = kwargs["n_size"] + "grams.keyword"
        label_field = kwargs["session"] + ".keyword"
        return connector.search({
            "query": query,
            "size": 0,
            "aggs": {
                "ngrams_count": {
                    "terms": {
                        "field": ngram_field,
                        "size": kwargs["results_size"]
                    },
                    # Per-ngram breakdown of classification status.
                    "aggs": {
                        "status": {
                            "terms": {"field": label_field}
                        }
                    }
                }
            }
        })
    except Exception as e:
        # Best-effort: report and fall back to an empty result.
        print('Error: ' + str(e))
        traceback.print_exc()
        return {}
def get_answers(self, **kwargs):
    """Attach the ground-truth label to each question and count mispredictions.

    Expected kwargs: "index", "questions" (dicts with "filename" and
    "pred_label"), "gt_session", and optionally "config_relative_path".
    Mutates each question in place, setting "label" from the ground-truth
    session field. Returns (questions, wrong_labels).
    """
    if "config_relative_path" in kwargs:
        connector = Es_connector(
            index=kwargs["index"],
            config_relative_path=kwargs["config_relative_path"])
    else:
        connector = Es_connector(index=kwargs["index"])

    wrong_labels = 0
    all_ids = self.join_ids(kwargs["questions"])
    res = connector.search({"query": {"match": {"id_str": all_ids}}})
    hits = res["hits"]["hits"]

    for question in kwargs["questions"]:
        question_id = self.classifier.extract_filename_no_ext(
            question["filename"])
        gt_tweet = [tweet for tweet in hits
                    if tweet["_source"]["id_str"] == question_id]
        question["label"] = gt_tweet[0]["_source"][kwargs["gt_session"]]
        if question["pred_label"] != question["label"]:
            wrong_labels += 1

    return kwargs["questions"], wrong_labels
def update_docs_by_ids(self, docs_matches, pred_labed, config_relative_path=None):
    """Write *pred_labed* into the session field of every doc matching *docs_matches*.

    docs_matches is a list of ES "match" clauses combined as a should-query;
    each matching document has its self.session field set to pred_labed.
    No-op when docs_matches is empty.
    """
    if not docs_matches:
        return

    if config_relative_path is not None:
        my_connector = Es_connector(index=self.index,
                                    config_relative_path=config_relative_path)
    else:
        my_connector = Es_connector(index=self.index)

    query = {
        "query": {
            "bool": {
                "should": docs_matches,
                "minimum_should_match": 1
            }
        }
    }
    original_docs = my_connector.search(query)["hits"]["hits"]
    for doc in original_docs:
        # retry_on_conflict absorbs concurrent-update version conflicts.
        my_connector.es.update(
            index=self.index,
            doc_type="tweet",
            id=doc["_id"],
            body={"doc": {self.session: pred_labed}},
            retry_on_conflict=5
        )
def get_sessions(self):
    """Return every document in the sessions index (raw ES response)."""
    connector = Es_connector(index=self.sessions_index,
                             doc_type=self.sessions_doc_type)
    return connector.search({"query": {"match_all": {}}})
def get_clusters(self, index="test3", word=""):
    """Return image-cluster buckets matching *word*, with a sample image and size.

    The cluster-to-image mapping is read from a JSON file named after the
    index ("<index>.json"); each bucket gains "image" (first image of the
    cluster) and "size" (image count) keys.
    """
    connector = Es_connector(index=index)
    response = connector.search({
        "size": 1,
        "query": {
            "simple_query_string": {
                "fields": ["text"],
                "query": word
            }
        },
        "aggs": {
            "group_by_cluster": {
                "terms": {
                    "field": "imagesCluster",
                    "size": 9999
                }
            }
        }
    })

    clusters = response['aggregations']['group_by_cluster']['buckets']
    with open(index + '.json') as f:
        data = json.load(f)

    for cluster in clusters:
        images = data['duplicates'][cluster['key']]
        cluster['image'] = images[0]
        cluster['size'] = len(images)

    return clusters
def get_event_image(self, index="test3", main_term="", related_terms=""):
    """Fetch one tweet carrying media that best matches the event terms.

    related_terms is an iterable of dicts with "word" and "value" (boost);
    main_term gets a fixed boost of 2. Only documents with an
    "extended_entities" field (i.e. attached media) are eligible.
    Returns the raw ES response (size 1).
    """
    my_connector = Es_connector(index=index)
    # Boosted should-clauses: each related term weighted by its value.
    # (An unused accumulator string built here previously has been removed.)
    terms = [{"match": {"text": {"query": t['word'], "boost": t['value']}}}
             for t in related_terms]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

    query = {
        "size": 1,
        "_source": [
            "id_str", "imagesCluster", "session_Twitter2015",
            "extended_entities"
        ],
        "query": {
            "bool": {
                "must": {"exists": {"field": "extended_entities"}},
                "should": terms
            }
        }
    }
    return my_connector.search(query)
def get_session_by_Name(self, name):
    """Look up a session document by its exact s_name (raw ES response)."""
    connector = Es_connector(index=self.sessions_index,
                             doc_type=self.sessions_doc_type)
    exact_name = {"term": {"s_name": name}}
    return connector.search({
        "query": {
            "constant_score": {"filter": exact_name}
        }
    })
def get_cluster_tweets(self, index="test3", cid=0):
    """Return the tweets belonging to image cluster *cid* (raw ES response)."""
    connector = Es_connector(index=index)
    return connector.search({
        "query": {
            "term": {"imagesCluster": cid}
        }
    })
def getMean(self, index="test3", main_term="", related_terms=""):
    """Return the mean relevance _score of documents matching the event terms.

    related_terms is an iterable of dicts with "word" and "value" (boost);
    main_term gets a fixed boost of 2. Returns 0 when nothing matches
    (previously an unguarded ZeroDivisionError).
    """
    my_connector = Es_connector(index=index)
    terms = [{"match": {"text": {"query": t['word'], "boost": t['value']}}}
             for t in related_terms]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

    # Sum the per-document _score server-side; hit count gives the mean.
    # (A dead, immediately-overwritten sort query has been removed.)
    query = {
        "size": 0,
        "query": {
            "bool": {"should": terms}
        },
        "aggs": {
            "sum_scores": {
                "sum": {"script": "_score"}
            }
        }
    }
    res = my_connector.search(query)
    total = res['hits']['total']
    score_sum = res['aggregations']['sum_scores']['value']
    if not total:
        return 0
    return score_sum / total
def get_search_related_classification_data(
        self, index="test3", word="", session="",
        label="confirmed OR proposed OR negative",
        matching_ngrams=None, full_search=False):
    """Count documents per classification label for a search.

    When full_search is True the text filter is skipped and only the
    session-label filter applies; otherwise documents must also match
    *word* in their text. Returns the label-count buckets.

    matching_ngrams is unused here and kept for interface compatibility;
    its default was a mutable list (shared across calls) and is now None.
    """
    if full_search:
        query = {"bool": {"must": [{"match": {session: label}}]}}
    else:
        query = {
            "bool": {
                "must": [
                    {"match": {"text": word}},
                    {"match": {session: label}}
                ]
            }
        }

    my_connector = Es_connector(index=index)
    res = my_connector.search({
        "size": 0,
        "query": query,
        "aggs": {
            "query_classification": {
                "terms": {"field": session + ".keyword"}
            }
        }
    })
    return res['aggregations']['query_classification']['buckets']
def get_valid_tweets(self, index="test3", word=""):
    """Return the hits whose text matches *word*.

    Bug fix: *word* was previously read as an undefined free variable
    (NameError at call time); it is now an explicit parameter with a
    backward-compatible default.
    """
    my_connector = Es_connector(index=index)
    res = my_connector.search({
        "query": {
            "simple_query_string": {
                "fields": ["text"],
                "query": word
            }
        }
    })
    return res['hits']['hits']
print('Number of clusters: %d' % len(data['duplicates'])) print('Index', args.i) my_connector = Es_connector(index=args.i) imgs = 0 count = 0 c_count = 0 for cluster in data['duplicates']: for img in cluster: imgs += 1 print(" Image ", imgs) target_tweet_id = re.search(r'(?<=/)(\d*)_(.*)\.(.*)', img, re.M | re.I) res = my_connector.search( {"query": { "term": { "id_str": target_tweet_id.group(1) } }}) if res['hits']['total'] > 0: id = res['hits']['hits'][0]['_id'] if 'imagesCluster' in res['hits']['hits'][0]['_source']: arr = res['hits']['hits'][0]['_source']['imagesCluster'] if isinstance(arr, list): arr.extend([c_count]) arr = list(set(arr)) update = my_connector.update_field( id, 'imagesCluster', arr) else: update = my_connector.update_field( id, 'imagesCluster', [arr]) else:
data = json.load(f) print('Number of clusters: %d' % len(data['duplicates'])) my_connector = Es_connector(index=args.i) # my_connector = Es_connector(index=args.i, host='http://206.189.211.142', user='', password='') imgs = 0 count = 0 c_count = 0 for cluster in data['duplicates']: for img in cluster: imgs += 1 matchObj = re.match(r'(\d*)_(.*).(.*)', img, re.M | re.I) res = my_connector.search( {"query": { "term": { "id_str": matchObj.group(1) } }}) if res['hits']['total'] > 0: id = res['hits']['hits'][0]['_id'] if 'imagesCluster' in res['hits']['hits'][0]['_source']: arr = res['hits']['hits'][0]['_source']['imagesCluster'] if isinstance(arr, list): print( res['hits']['hits'][0]['_source']['imagesCluster']) arr.extend([c_count]) arr = list(set(arr)) update = my_connector.update_field( id, 'imagesCluster', arr) else: update = my_connector.update_field(
def get_event_clusters(self, index="test3", main_term="", related_terms=""):
    """Return image-cluster buckets for the event terms, with sample image and exact size.

    related_terms is an iterable of dicts with "word" and "value" (boost);
    main_term gets a fixed boost of 2. Each returned bucket gains an
    "image" key (first image of the cluster, read from "<index>.json") and
    a "size" key (exact document count from a per-cluster ES count query).
    """
    my_connector = Es_connector(index=index)
    terms = [{"match": {"text": {"query": t['word'], "boost": t['value']}}}
             for t in related_terms]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

    query = {
        "size": 0,
        "query": {
            "bool": {"should": terms}
        },
        "aggregations": {
            "group_by_cluster": {
                "terms": {
                    "field": "imagesCluster",
                    "size": 999999
                }
            }
        }
    }
    res = my_connector.search(query)
    clusters = res['aggregations']['group_by_cluster']['buckets']

    with open(index + '.json') as f:
        data = json.load(f)

    for cluster in clusters:
        # Exact per-cluster count from ES; the JSON file's image list can
        # disagree with the index, so it is only used for the sample image.
        count_query = {"query": {"term": {"imagesCluster": cluster['key']}}}
        cres = my_connector.count(count_query)
        images = data['duplicates'][cluster['key']]
        cluster['image'] = images[0]
        cluster['size'] = cres['count']

    return clusters
print("You are removing duplicates from the " + args.index + " index.") my_conn = Es_connector(index=args.index) buckets_size = 1 while buckets_size > 0: res = my_conn.search({ "size": 0, "query": { "match_all": {} }, "aggs": { "duplicated_by_str_id": { "terms": { "field": "id_str.keyword", "min_doc_count": 2, "size": 20 } } } }) buckets_size = len(res['aggregations']['duplicated_by_str_id']['buckets']) for bucket in res['aggregations']['duplicated_by_str_id']['buckets']: print("Deleting ", bucket["key"]) duplicated_res = my_conn.search( {"query": { "match": {