예제 #1
0
    def get_similar_docs(self, **kwargs):
        """Return documents whose text duplicates that of the given questions.

        First fetches the questions' own tweets (restricted to those whose
        session field is "proposed"), then, for each one, finds every
        document with the exact same text and tags each duplicate with the
        question's label.

        Expected kwargs: index, questions, session, text_field.
        Returns a list of {"filename", "label", <text_field>} dicts.
        """
        questions = kwargs["questions"]
        if not questions:
            return []

        connector = Es_connector(index=kwargs["index"])
        text_field = kwargs["text_field"]

        id_matches = [{"match": {"id_str": {"query": q["str_id"]}}}
                      for q in questions]

        originals = connector.search({
            "query": {
                "bool": {
                    "should": id_matches,
                    "minimum_should_match": 1,
                    "must": [{"match": {kwargs["session"]: "proposed"}}]
                }
            }
        })

        duplicates = []
        for hit in originals["hits"]["hits"]:
            exact_text_query = {
                "query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "text.keyword": {
                                    "value": hit["_source"][text_field]
                                }
                            }
                        }]
                    }
                }
            }
            matches = connector.search(exact_text_query)
            # More than one hit means the text exists in other docs too.
            if matches["hits"]["total"] > 1:
                label = [q["label"] for q in questions
                         if q["str_id"] == hit["_source"]["id_str"]][0]
                for dup in matches["hits"]["hits"]:
                    duplicates.append({
                        "filename": dup["_source"]["id_str"],
                        "label": label,
                        text_field: dup["_source"][text_field]
                    })

        return duplicates
예제 #2
0
    def get_ngrams_by_query(self, query="", **kwargs):
        """Aggregate n-gram counts, with a per-status sub-breakdown, for a query.

        Expected kwargs: index, n_size (n-gram field prefix), results_size,
        session (status field prefix).
        Returns the raw Elasticsearch response, or {} if the search fails.
        """
        try:
            connector = Es_connector(
                index=kwargs["index"],
                config_relative_path=self.config_relative_path)
            ngram_field = kwargs["n_size"] + "grams.keyword"
            status_field = kwargs["session"] + ".keyword"
            return connector.search({
                "query": query,
                "size": 0,
                "aggs": {
                    "ngrams_count": {
                        "terms": {
                            "field": ngram_field,
                            "size": kwargs["results_size"]
                        },
                        "aggs": {
                            "status": {
                                "terms": {"field": status_field}
                            }
                        }
                    }
                }
            })
        except Exception as e:
            # Best-effort: report the failure and hand back an empty result.
            print('Error: ' + str(e))
            traceback.print_exc()
            return {}
예제 #3
0
    def get_answers(self, **kwargs):
        """Fill in each question's ground-truth label and count mismatches.

        Fetches the ground-truth tweets for all questions in one search,
        copies each tweet's gt_session value into the matching question as
        "label", and tallies how many predicted labels disagree with it.

        Expected kwargs: index, questions, gt_session,
        optionally config_relative_path.
        Returns (questions, wrong_labels).
        """
        if "config_relative_path" in kwargs:
            connector = Es_connector(
                index=kwargs["index"],
                config_relative_path=kwargs["config_relative_path"])
        else:
            connector = Es_connector(index=kwargs["index"])

        questions = kwargs["questions"]
        all_ids = self.join_ids(questions)
        hits = connector.search(
            {"query": {"match": {"id_str": all_ids}}})["hits"]["hits"]

        wrong_labels = 0
        for question in questions:
            question_id = self.classifier.extract_filename_no_ext(
                question["filename"])
            # Expectedly exactly one ground-truth tweet per question id.
            gt_tweet = [t for t in hits
                        if t["_source"]["id_str"] == question_id]
            question["label"] = gt_tweet[0]["_source"][kwargs["gt_session"]]
            if question["pred_label"] != question["label"]:
                wrong_labels += 1

        return questions, wrong_labels
예제 #4
0
    def update_docs_by_ids(self, docs_matches, pred_labed, config_relative_path=None):
        """Write *pred_labed* into the session field of every matching doc.

        Fixes: `!= None` replaced with the idiomatic `is not None`; the
        one-line `else:` and redundant `len(...) > 0` checks replaced with a
        guard clause and a plain loop (an empty hit list iterates zero times).

        docs_matches: list of ES "match"/"should" clauses identifying docs.
        pred_labed: label value written into self.session on each doc.
        config_relative_path: optional path forwarded to Es_connector.
        """
        if not docs_matches:
            return

        if config_relative_path is not None:
            my_connector = Es_connector(index=self.index,
                                        config_relative_path=config_relative_path)
        else:
            my_connector = Es_connector(index=self.index)

        query = {
            "query": {
                "bool": {
                    "should": docs_matches,
                    "minimum_should_match": 1
                }
            }
        }
        original_docs = my_connector.search(query)["hits"]["hits"]

        for doc in original_docs:
            # retry_on_conflict guards against concurrent updates of a doc.
            my_connector.es.update(
                index=self.index,
                doc_type="tweet",
                id=doc["_id"],
                body={"doc": {
                    self.session: pred_labed
                }},
                retry_on_conflict=5
            )
예제 #5
0
    def get_sessions(self):
        """Return every session document stored in the sessions index."""
        connector = Es_connector(index=self.sessions_index,
                                 doc_type=self.sessions_doc_type)
        match_all = {"query": {"match_all": {}}}
        return connector.search(match_all)
예제 #6
0
 def get_clusters(self, index="test3", word=""):
     """Return image-cluster buckets for tweets matching *word*.

     Aggregates matching tweets by imagesCluster, then enriches every
     bucket with a representative image and its size, both taken from
     the <index>.json duplicates dump.
     """
     connector = Es_connector(index=index)
     res = connector.search({
         "size": 1,
         "query": {
             "simple_query_string": {
                 "fields": ["text"],
                 "query": word
             }
         },
         "aggs": {
             "group_by_cluster": {
                 "terms": {
                     "field": "imagesCluster",
                     "size": 9999
                 }
             }
         }
     })
     buckets = res['aggregations']['group_by_cluster']['buckets']
     with open(index + '.json') as f:
         dump = json.load(f)
     for bucket in buckets:
         images = dump['duplicates'][bucket['key']]
         # First duplicate serves as the cluster's representative image.
         bucket['image'] = images[0]
         bucket['size'] = len(images)
     return buckets
예제 #7
0
 def get_event_image(self, index="test3", main_term="", related_terms=""):
     """Fetch one media-bearing tweet representative of an event.

     Fixes: removed the dead `words` accumulator (built but never used)
     and the commented-out experimental query.

     Builds a boosted should-query from the related terms (boost = the
     term's weight) plus the main term (boost 2), restricted to tweets
     that have an extended_entities field, and returns the single best hit.
     """
     my_connector = Es_connector(index=index)
     terms = []
     for t in related_terms:
         terms.append(
             {"match": {
                 "text": {
                     "query": t['word'],
                     "boost": t['value']
                 }
             }})
     terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
     query = {
         "size": 1,
         "_source": [
             "id_str", "imagesCluster", "session_Twitter2015",
             "extended_entities"
         ],
         "query": {
             "bool": {
                 # Only tweets that actually carry media.
                 "must": {
                     "exists": {
                         "field": "extended_entities"
                     }
                 },
                 "should": terms
             }
         }
     }
     res = my_connector.search(query)
     return res
예제 #8
0
 def get_session_by_Name(self, name):
     """Look up a session document by its exact s_name value."""
     connector = Es_connector(index=self.sessions_index,
                              doc_type=self.sessions_doc_type)
     # constant_score: pure filter, no relevance scoring needed.
     exact_name_query = {
         "query": {
             "constant_score": {
                 "filter": {
                     "term": {
                         "s_name": name
                     }
                 }
             }
         }
     }
     return connector.search(exact_name_query)
예제 #9
0
 def get_cluster_tweets(self, index="test3", cid=0):
     """Return all tweets belonging to the image cluster *cid*."""
     connector = Es_connector(index=index)
     cluster_query = {
         "query": {
             "term": {
                 "imagesCluster": cid
             }
         }
     }
     return connector.search(cluster_query)
예제 #10
0
    def getMean(self, index="test3", main_term="", related_terms=""):
        """Return the mean relevance _score of tweets matching the event terms.

        Fixes: removed a dead first `query` assignment that was immediately
        overwritten, removed the unused `words` accumulator, renamed the
        local `sum` (which shadowed the builtin), and guarded against a
        ZeroDivisionError when no documents match (returns 0.0 instead).

        Builds a boosted should-query (related terms weighted by their
        value, the main term with boost 2) and sums _score with a script
        aggregation, dividing by the total hit count.
        """
        my_connector = Es_connector(index=index)
        terms = []
        for t in related_terms:
            terms.append(
                {"match": {
                    "text": {
                        "query": t['word'],
                        "boost": t['value']
                    }
                }})
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

        query = {
            "size": 0,
            "query": {
                "bool": {
                    "should": terms
                }
            },
            "aggs": {
                "sum_scores": {
                    "sum": {
                        "script": "_score"
                    }
                }
            }
        }
        res = my_connector.search(query)
        total = res['hits']['total']
        score_sum = res['aggregations']['sum_scores']['value']
        # Avoid division by zero when nothing matched.
        if not total:
            return 0.0
        return score_sum / total
예제 #11
0
    def get_search_related_classification_data(
            self,
            index="test3",
            word="",
            session="",
            label="confirmed OR proposed OR negative",
            matching_ngrams=None,
            full_search=False):
        """Return label-count buckets for a classification search.

        Fix: `matching_ngrams=[]` was a mutable default argument; the
        default is now None and normalized inside (the parameter is kept
        for interface compatibility — it is not used by this method).

        When full_search is True, counts labels across the whole session;
        otherwise the count is restricted to tweets matching *word*.
        """
        if matching_ngrams is None:
            matching_ngrams = []

        if full_search:
            query = {"bool": {"must": [{"match": {session: label}}]}}
        else:
            query = {
                "bool": {
                    "must": [{
                        "match": {
                            "text": word
                        }
                    }, {
                        "match": {
                            session: label
                        }
                    }]
                }
            }

        my_connector = Es_connector(index=index)
        res = my_connector.search({
            "size": 0,
            "query": query,
            "aggs": {
                "query_classification": {
                    "terms": {
                        "field": session + ".keyword"
                    }
                }
            }
        })
        return res['aggregations']['query_classification']['buckets']
예제 #12
0
 def get_valid_tweets(self, index="test3", word=""):
     """Return the tweets whose text matches *word*.

     Bug fix: the query referenced an undefined name ``word`` (NameError
     at runtime); it is now an explicit keyword parameter (default ""),
     which keeps the old call signature working.
     """
     my_connector = Es_connector(index=index)
     res = my_connector.search({
         "query": {
             "simple_query_string": {
                 "fields": ["text"],
                 "query": word
             }
         }
     })
     return res['hits']['hits']
예제 #13
0
파일: images.py 프로젝트: gbosetti/cati
    print('Number of clusters: %d' % len(data['duplicates']))
    print('Index', args.i)

    my_connector = Es_connector(index=args.i)
    imgs = 0
    count = 0
    c_count = 0
    for cluster in data['duplicates']:
        for img in cluster:
            imgs += 1
            print("     Image ", imgs)
            target_tweet_id = re.search(r'(?<=/)(\d*)_(.*)\.(.*)', img,
                                        re.M | re.I)
            res = my_connector.search(
                {"query": {
                    "term": {
                        "id_str": target_tweet_id.group(1)
                    }
                }})
            if res['hits']['total'] > 0:
                id = res['hits']['hits'][0]['_id']
                if 'imagesCluster' in res['hits']['hits'][0]['_source']:
                    arr = res['hits']['hits'][0]['_source']['imagesCluster']
                    if isinstance(arr, list):
                        arr.extend([c_count])
                        arr = list(set(arr))
                        update = my_connector.update_field(
                            id, 'imagesCluster', arr)
                    else:
                        update = my_connector.update_field(
                            id, 'imagesCluster', [arr])
                else:
예제 #14
0
파일: images.py 프로젝트: FirasOdeh/MABED
        data = json.load(f)

    print('Number of clusters: %d' % len(data['duplicates']))

    my_connector = Es_connector(index=args.i)
    # my_connector = Es_connector(index=args.i, host='http://206.189.211.142', user='', password='')
    imgs = 0
    count = 0
    c_count = 0
    for cluster in data['duplicates']:
        for img in cluster:
            imgs += 1
            matchObj = re.match(r'(\d*)_(.*).(.*)', img, re.M | re.I)
            res = my_connector.search(
                {"query": {
                    "term": {
                        "id_str": matchObj.group(1)
                    }
                }})
            if res['hits']['total'] > 0:
                id = res['hits']['hits'][0]['_id']
                if 'imagesCluster' in res['hits']['hits'][0]['_source']:
                    arr = res['hits']['hits'][0]['_source']['imagesCluster']
                    if isinstance(arr, list):
                        print(
                            res['hits']['hits'][0]['_source']['imagesCluster'])
                        arr.extend([c_count])
                        arr = list(set(arr))
                        update = my_connector.update_field(
                            id, 'imagesCluster', arr)
                    else:
                        update = my_connector.update_field(
예제 #15
0
    def get_event_clusters(self,
                           index="test3",
                           main_term="",
                           related_terms=""):
        """Return enriched image-cluster buckets for an event's tweets.

        Fixes: removed the dead `words` accumulator (built but never used)
        and the large commented-out experimental queries.

        Builds a boosted should-query from the related terms (boost = the
        term's weight) plus the main term (boost 2), aggregates matches by
        imagesCluster, then enriches every bucket with a representative
        image (from the <index>.json duplicates dump) and its tweet count
        (via a per-cluster count query).
        """
        my_connector = Es_connector(index=index)
        terms = []
        for t in related_terms:
            terms.append(
                {"match": {
                    "text": {
                        "query": t['word'],
                        "boost": t['value']
                    }
                }})
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})

        query = {
            "size": 0,
            "query": {
                "bool": {
                    "should": terms
                }
            },
            "aggregations": {
                "group_by_cluster": {
                    "terms": {
                        "field": "imagesCluster",
                        # Effectively unbounded: return every cluster.
                        "size": 999999
                    }
                }
            }
        }
        res = my_connector.search(query)
        clusters = res['aggregations']['group_by_cluster']['buckets']
        with open(index + '.json') as f:
            data = json.load(f)

        for cluster in clusters:
            count_query = {"query": {"term": {"imagesCluster": cluster['key']}}}
            cres = my_connector.count(count_query)
            images = data['duplicates'][cluster['key']]
            cluster['image'] = images[0]
            # Size = number of tweets in the cluster, not distinct images.
            cluster['size'] = cres['count']
        return clusters
예제 #16
0
print("You are removing duplicates from the " + args.index + " index.")

my_conn = Es_connector(index=args.index)
buckets_size = 1

while buckets_size > 0:

    res = my_conn.search({
        "size": 0,
        "query": {
            "match_all": {}
        },
        "aggs": {
            "duplicated_by_str_id": {
                "terms": {
                    "field": "id_str.keyword",
                    "min_doc_count": 2,
                    "size": 20
                }
            }
        }
    })
    buckets_size = len(res['aggregations']['duplicated_by_str_id']['buckets'])

    for bucket in res['aggregations']['duplicated_by_str_id']['buckets']:

        print("Deleting ", bucket["key"])
        duplicated_res = my_conn.search(
            {"query": {
                "match": {