Example #1
    def get_answers(self, **kwargs):

        if "config_relative_path" in kwargs:
            my_connector = Es_connector(
                index=kwargs["index"],
                config_relative_path=kwargs["config_relative_path"])
        else:
            my_connector = Es_connector(index=kwargs["index"])

        wrong_labels = 0

        all_ids = self.join_ids(kwargs["questions"])

        res = my_connector.search({"query": {"match": {"id_str": all_ids}}})

        for question in kwargs["questions"]:

            question_id = self.classifier.extract_filename_no_ext(
                question["filename"])
            gt_tweet = [
                tweet for tweet in res["hits"]["hits"]
                if tweet["_source"]["id_str"] == question_id
            ]
            question["label"] = gt_tweet[0]["_source"][kwargs["gt_session"]]

            if question["pred_label"] != question["label"]:
                wrong_labels += 1

        # print(json.dumps(kwargs["questions"], indent=4, sort_keys=True))
        return kwargs["questions"], wrong_labels
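A hedged note on Example #1: my_connector.search is called without a "size", so Elasticsearch returns only its default 10 hits; a batch with more than ten questions can leave gt_tweet empty for the missing ids. A minimal variant of the lookup, assuming the same my_connector and all_ids built above, passes an explicit size:

        # Hypothetical variant of the lookup in Example #1: request as many hits
        # as there are questions so no ground-truth tweet is dropped by the
        # default size of 10.
        res = my_connector.search({
            "size": len(kwargs["questions"]),
            "query": {"match": {"id_str": all_ids}}
        })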
Example #2
 def get_clusters(self, index="test3", word=""):
     my_connector = Es_connector(index=index)
     res = my_connector.search({
         "size": 1,
         "query": {
             "simple_query_string": {
                 "fields": ["text"],
                 "query": word
             }
         },
         "aggs": {
             "group_by_cluster": {
                 "terms": {
                     "field": "imagesCluster",
                     "size": 9999
                 }
             }
         }
     })
     # print("Clusters")
     # print(res['aggregations']['group_by_cluster']['buckets'])
     clusters = res['aggregations']['group_by_cluster']['buckets']
     with open(index + '.json') as f:
         data = json.load(f)
     for cluster in clusters:
         # print(cluster['key'])
         images = data['duplicates'][cluster['key']]
         # print(images[0])
         cluster['image'] = images[0]
         cluster['size'] = len(images)
     # print(clusters)
     return clusters
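A short usage sketch for Example #2, assuming an index named test3 whose documents carry an imagesCluster field and a sidecar test3.json file with a "duplicates" mapping on disk (all names illustrative). The method also relies on a module-level import json, and dashboard below stands in for whatever object exposes get_clusters:

# Hypothetical caller of get_clusters.
clusters = dashboard.get_clusters(index="test3", word="flood")
for cluster in clusters:
    print(cluster['key'], cluster['size'], cluster['image'])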
Example #3
    def get_ngrams_by_query(self, query="", **kwargs):

        try:
            my_connector = Es_connector(
                index=kwargs["index"],
                config_relative_path=self.config_relative_path)
            full_query = {
                "query": query,
                "size": 0,
                "aggs": {
                    "ngrams_count": {
                        "terms": {
                            "field": kwargs["n_size"] + "grams.keyword",
                            "size": kwargs["results_size"]
                        },
                        "aggs": {
                            "status": {
                                "terms": {
                                    "field": kwargs["session"] + ".keyword"
                                }
                            }
                        }
                    }
                }
            }
            return my_connector.search(full_query)

        except Exception as e:
            print('Error: ' + str(e))
            traceback.print_exc()
            return {}
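The nested terms aggregation in Example #3 groups documents by n-gram (the field name is built as kwargs["n_size"] + "grams.keyword", e.g. "2grams.keyword") and, inside each bucket, by the session label. A hedged sketch of how a caller might walk the response; ngram_reader and the argument values are illustrative, and the .get chain guards against the empty dict returned on error:

res = ngram_reader.get_ngrams_by_query(
    query={"match_all": {}},
    index="test3",
    n_size="2",
    results_size=20,
    session="session_lyon2015")
for bucket in res.get("aggregations", {}).get("ngrams_count", {}).get("buckets", []):
    # bucket["key"] is the n-gram, bucket["doc_count"] its frequency
    for status_bucket in bucket["status"]["buckets"]:
        print(bucket["key"], status_bucket["key"], status_bucket["doc_count"])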
Example #4
    def get_tweets(self, index="test3", word=""):
        my_connector = Es_connector(index=index)
        # res = my_connector.search({
        #         "query": {
        #             "simple_query_string": {
        #               "fields": [
        #                 "text"
        #               ],
        #               "query": word
        #             }
        #           }
        #         })

        # res = my_connector.bigSearch(
        #     {
        #         "_source": ["text", "id_str", "extended_entities", "user", "created_at", "link"],
        #         "query": {
        #             "simple_query_string": {
        #               "fields": [
        #                 "text"
        #               ],
        #               "query": word
        #             }
        #           }
        #     })

        res = my_connector.init_paginatedSearch({
            "query": {
                "simple_query_string": {
                    "fields": ["text"],
                    "query": word
                }
            }
        })
        return res
Example #5
    def update_tweets_state_by_event_ngram(self, **kwargs):

        tweets_connector = Es_connector(index=kwargs["index"],
                                        doc_type="tweet")

        query = {
            "query": {
                "bool": {
                    "should":
                    kwargs["target_terms"],
                    "minimum_should_match":
                    1,
                    "must": [{
                        "match_phrase": {
                            kwargs["ngramsPropName"]: kwargs["ngram"]
                        }
                    }, {
                        "match": {
                            kwargs["session"]: kwargs["query_label"]
                        }
                    }]
                }
            }
        }
        return tweets_connector.update_query(query, kwargs["session"],
                                             kwargs["new_label"])
Example #6
    def set_tweet_state(self, index, session, tid, val):
        tweets_connector = Es_connector(index=index, doc_type="tweet")
        session = 'session_' + session

        query = {"doc": {session: val}}
        res = tweets_connector.update(tid, query)
        return res
Example #7
    def get_sessions(self):
        my_connector = Es_connector(index=self.sessions_index,
                                    doc_type=self.sessions_doc_type)
        query = {"query": {"match_all": {}}}

        res = my_connector.search(query)
        return res
Example #8
 def set_cluster_state(self, index, session, cid, state):
     tweets_connector = Es_connector(index=index, doc_type="tweet")
     # All tweets
     session = 'session_' + session
     query = {"query": {"term": {"imagesCluster": cid}}}
     res = tweets_connector.update_query(query, session, state)
     return res
Example #9
    def update_docs_by_ids(self, docs_matches, pred_label, config_relative_path=None):

        if len(docs_matches) > 0:

            if config_relative_path is not None:
                my_connector = Es_connector(index=self.index, config_relative_path=config_relative_path)
            else:
                my_connector = Es_connector(index=self.index)
            query = {
                "query": {
                    "bool": {
                        "should": docs_matches,
                        "minimum_should_match": 1
                    }
                }
            }
            original_docs = my_connector.search(query)["hits"]["hits"]

            if len(original_docs) > 0:
                for doc in original_docs:
                    # my_connector = Es_connector(index=self.index, config_relative_path=config_relative_path)\
                    my_connector.es.update(
                        index=self.index,
                        doc_type="tweet",
                        id=doc["_id"],
                        body={"doc": {
                            self.session: pred_label
                        }},
                        retry_on_conflict=5
                    )
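The docs_matches argument of Example #9 has the same shape as the docs_ids_matches list built in Example #18: one match clause on id_str per document. A hedged caller sketch; classifier_helper stands in for an instance of the class above and the tweet ids are illustrative:

# Hypothetical caller of update_docs_by_ids.
docs_matches = [{"match": {"id_str": {"query": doc_id}}}
                for doc_id in ["1234567890", "1234567891"]]  # illustrative ids
classifier_helper.update_docs_by_ids(docs_matches, "confirmed")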
Example #10
 def set_search_status(self, index, session, state, word):
     tweets_connector = Es_connector(index=index, doc_type="tweet")
     session = 'session_' + session
     query = {
         "query": {
             "bool": {
                 "must": {
                     "simple_query_string": {
                         "fields": ["text"],
                         "query": word
                     }
                 },
                 "filter": {
                     "bool": {
                         "should": [{
                             "match": {
                                 session: "proposed"
                             }
                         }]
                     }
                 }
             }
         }
     }
     res = tweets_connector.update_query(query, session, state)
     return res
Example #11
    def get_tweets(self, index, doc_field):

        my_connector = Es_connector(index=index)
        all_tweets = []
        query = {
            "_source": [doc_field, "timestamp_ms"],
            "query": {
                "exists": {
                    "field": doc_field
                }
            }
        }
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]

        # Analyse and process page by page
        processed_tweets = 0
        while scroll_size > 0:

            tweets = res["results"]
            all_tweets.extend([{
                '_source': {
                    doc_field: self.tknzr.tokenize(tweet["_source"][doc_field]),
                    "timestamp_ms": tweet["_source"]["timestamp_ms"]
                }
            } for tweet in tweets])
            processed_tweets += scroll_size

            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]

        return all_tweets
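The init/loop paginated-search pattern in Example #11 recurs throughout these snippets. A minimal hedged helper, not part of the original class, that factors the loop out as a generator under the Es_connector interface used above (init_paginatedSearch returning sid/scroll_size/results and loop_paginatedSearch(sid, scroll_size)):

    def scroll_hits(self, connector, query):
        # Illustrative helper: yields every hit of a scroll search, page by page.
        res = connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]
        while scroll_size > 0:
            for hit in res["results"]:
                yield hit
            res = connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]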
Example #12
 def get_event_state_tweets_count(self,
                                  index="test3",
                                  session="",
                                  words="",
                                  state="confirmed"):
     my_connector = Es_connector(index=index)
     query = {
         "query": {
             "bool": {
                 "must": [{
                     "match": {
                         "text": {
                             "query": words
                         }
                     }
                 }],
                 "filter": {
                     "bool": {
                         "should": [{
                             "match": {
                                 "session_" + session: state
                             }
                         }]
                     }
                 }
             }
         }
     }
     res = my_connector.count(query)
     return res['count']
Example #13
 def get_event_tweets(self, index="test3", main_term="", related_terms=""):
     my_connector = Es_connector(index=index)
     terms = []
     words = main_term + ' '
     for t in related_terms:
         terms.append(
             {"match": {
                 "text": {
                     "query": t['word'],
                     "boost": t['value']
                 }
             }})
         words += t['word'] + " "
     terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
     # res = my_connector.search({"query": {"term" : { "text" : word }}})
     # query = {
     #     "bool": {
     #         "must": {
     #             "match": {
     #                 "text": {
     #                     "query": main_term,
     #                     "operator": "or"
     #                 }
     #             }
     #         },
     #         "should": terms
     #     }
     # }
     query = {"sort": ["_score"], "query": {"bool": {"should": terms}}}
     # print(query)
     # res = my_connector.search(query)
     res = my_connector.init_paginatedSearch(query)
     return res
Example #14
 def get_tweets_query_state(self,
                            index="test3",
                            word="",
                            state="proposed",
                            session=""):
     my_connector = Es_connector(index=index)
     query = {
         "query": {
             "bool": {
                 "must": {
                     "simple_query_string": {
                         "fields": ["text"],
                         "query": word
                     }
                 },
                 "filter": {
                     "bool": {
                         "should": [{
                             "match": {
                                 session: state
                             }
                         }]
                     }
                 }
             }
         }
     }
     res = my_connector.init_paginatedSearch(query)
     return res
Example #15
def get_tweets(session, index, state='confirmed'):
    # Get all confirmed tweets
    connector = Es_connector(index=index, doc_type='tweet')

    query = {"query": {"term": {"session_" + session: state}}}

    res = connector.bigSearch(query)
    return res
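A one-line usage sketch for Example #15, assuming a session named lyon2015 so the queried field becomes session_lyon2015 (the session name is illustrative):

# Hypothetical call: fetch every tweet confirmed during session "lyon2015".
confirmed = get_tweets("lyon2015", "test3", state="confirmed")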
Example #16
 def update_session_results(self, id, events, impact_data):
     my_connector = Es_connector(index=self.sessions_index,
                                 doc_type=self.sessions_doc_type)
     res = my_connector.update(
         id, {"doc": {
             "events": events,
             "impact_data": impact_data
         }})
     return res
Example #17
 def get_tweets_state(self, index="test3", session="", state="proposed"):
     my_connector = Es_connector(index=index)
     res = my_connector.init_paginatedSearch(
         {"query": {
             "term": {
                 "session_" + session: state
             }
         }})
     return res
Example #18
    def get_similar_docs(self, **kwargs):

        if len(kwargs["questions"]) == 0:
            return []

        my_connector = Es_connector(index=kwargs["index"])  # , config_relative_path='../')
        duplicated_docs = []

        docs_ids_matches = [{"match": {"id_str": {"query": question["str_id"]}}}
                            for question in kwargs["questions"]]

        docs_original_textual_content = my_connector.search({
            "query": {
                "bool": {
                    "should": docs_ids_matches,
                    "minimum_should_match": 1,
                    "must": [
                        {
                            "match": {
                                kwargs["session"]: "proposed"
                            }
                        }
                    ]
                }
            }
        })

        for doc in docs_original_textual_content["hits"]["hits"]:
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {
                                    "text.keyword": {
                                        "value": doc["_source"][kwargs["text_field"]]
                                    }
                                }
                            }
                        ]
                    }
                }
            }

            matching_docs = my_connector.search(query)
            if matching_docs["hits"]["total"] > 1:

                label = [question for question in kwargs["questions"] if question["str_id"] == doc["_source"]["id_str"]][0]["label"]

                for dup_doc in matching_docs["hits"]["hits"]:
                    duplicated_docs.append({
                        "filename": dup_doc["_source"]["id_str"],
                        "label": label,
                        kwargs["text_field"]: dup_doc["_source"][kwargs["text_field"]]
                    })

        return duplicated_docs
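One portability note on the duplicate check in Example #18: on Elasticsearch 6 and earlier hits.total is an integer, while on 7+ it is an object with a value field. A hedged, version-tolerant variant of the comparison:

            # Hypothetical version-tolerant check (ES 6 returns an int,
            # ES 7+ returns {"value": ..., "relation": ...}).
            total = matching_docs["hits"]["total"]
            total_hits = total["value"] if isinstance(total, dict) else total
            if total_hits > 1:
                pass  # then proceed exactly as in the loop above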
Example #19
    def get_event_tweets2(self,
                          index="test3",
                          main_term="",
                          related_terms="",
                          cid=0):
        my_connector = Es_connector(index=index)
        terms = []
        words = main_term + ' '
        for t in related_terms:
            terms.append(
                {"match": {
                    "text": {
                        "query": t['word'],
                        "boost": t['value']
                    }
                }})
            words += t['word'] + " "
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
        # terms.append({"match": {
        #     "imagesCluster": {
        #         "query": cid
        #     }
        # }})
        # query = {
        #         "query": {
        #                 "bool": {
        #                     "must": {
        #                         "exists": {
        #                             "field": "imagesCluster"
        #                         }
        #                     },
        #                     # "must": { "match": { "imagesCluster" : cid }},
        #                     "should": terms
        #                 }
        #             }
        #         }

        query = {
            "sort": ["_score"],
            "query": {
                "bool": {
                    "should": terms,
                    "minimum_should_match": 1,
                    "must": [{
                        "match": {
                            "imagesCluster": cid
                        }
                    }]
                }
            }
        }

        # res = my_connector.bigSearch(query)
        res = my_connector.init_paginatedSearch(query)
        return res
Example #20
    def get_event_filter_tweets(self,
                                index="test3",
                                main_term="",
                                related_terms="",
                                state="proposed",
                                session=""):
        my_connector = Es_connector(index=index)
        terms = []
        words = main_term + ' '
        for t in related_terms:
            terms.append(
                {"match": {
                    "text": {
                        "query": t['word'],
                        "boost": t['value']
                    }
                }})
            words += t['word'] + " "
        terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
        # query = {
        #     "sort": [
        #         "_score"
        #     ],
        #         "query": {
        #                 "bool": {
        #                     "should": terms
        #                 }
        #             }
        #         }

        query = {
            "sort": ["_score"],
            "query": {
                "bool": {
                    "must": [{
                        "bool": {
                            "should": terms
                        }
                    }],
                    "filter": {
                        "bool": {
                            "should": [{
                                "match": {
                                    session: state
                                }
                            }]
                        }
                    }
                }
            }
        }
        res = my_connector.init_paginatedSearch(query)
        return res
Example #21
 def get_words_count(self, index="test3", words=""):
     my_connector = Es_connector(index=index)
     query = {
         "query": {
             "simple_query_string": {
                 "fields": ["text"],
                 "query": words
             }
         }
     }
     res = my_connector.count(query)
     return res['count']
Example #22
 def export_event(self, index, session):
     my_connector = Es_connector(index=index)
     res = my_connector.bigSearch({
         "_source": {
             "excludes": ["session_*"]
         },
         "query": {
             "term": {
                 "session_" + session: "confirmed"
             }
         }
     })
     return res
Example #23
    def remove_tmp_predictions_field(self, **kwargs):

        my_connector = Es_connector(
            index=kwargs["index"],
            doc_type="tweet")  # config_relative_path='../')

        for answer in kwargs["answers"]:
            res = my_connector.update_by_query(
                {"query": {
                    "match": {
                        "_id": answer["id"]
                    }
                }}, "ctx._source.remove('" + kwargs["session"] + "_tmp')")
Example #24
 def get_event_image(self, index="test3", main_term="", related_terms=""):
     my_connector = Es_connector(index=index)
     terms = []
     words = main_term + ' '
     for t in related_terms:
         terms.append(
             {"match": {
                 "text": {
                     "query": t['word'],
                     "boost": t['value']
                 }
             }})
         words += t['word'] + " "
     terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
     # res = my_connector.search({"query": {"term" : { "text" : word }}})
     # query = {
     #     "bool": {
     #         "must": {
     #             "match": {
     #                 "text": {
     #                     "query": main_term,
     #                     "operator": "or"
     #                 }
     #             }
     #         },
     #         "should": terms
     #     }
     # }
     query = {
         "size":
         1,
         "_source": [
             "id_str", "imagesCluster", "session_Twitter2015",
             "extended_entities"
         ],
         "query": {
             "bool": {
                 "must": {
                     "exists": {
                         "field": "extended_entities"
                     }
                 },
                 "should": terms
             }
         }
     }
     # print(query)
     res = my_connector.search(query)
     return res
Example #25
    def generate_ngrams_for_index(self, **kwargs):

        try:
            # Get the data for performing a paginated search
            self.current_thread_percentage = 0
            print("Starting")
            my_connector = Es_connector(index=kwargs["index"])

            query = kwargs.get('query', {"query": {"match_all": {}}})

            res = my_connector.init_paginatedSearch(query)
            sid = res["sid"]
            scroll_size = res["scroll_size"]
            total = int(res["total"])

            # Analyse and process page by page
            i = 0
            # Guard against an empty index (scroll_size == 0) so the progress update below cannot divide by zero
            total_scrolls = max(int(total / scroll_size), 1) if scroll_size > 0 else 1
            processed_scrolls = 0

            print("from_property:", kwargs['from_property'])

            while scroll_size > 0:
                tweets = res["results"]
                self.gerenate_ngrams_for_tweets(
                    tweets,
                    from_property=kwargs['from_property'],
                    prop=kwargs["prop"],
                    index=kwargs["index"],
                    length=kwargs["length"])

                i += 1
                res = my_connector.loop_paginatedSearch(sid, scroll_size)
                scroll_size = res["scroll_size"]
                processed_scrolls += 1

                self.current_thread_percentage = round(
                    processed_scrolls * 100 / total_scrolls, 0)

                print("Completed: ", self.current_thread_percentage, "%")

            # Set it to 100 at the end so the client knows when to stop asking for more logs
            self.current_thread_percentage = 100

            return True

        except Exception as e:
            print('Error: ' + str(e))
            return False
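A heavily hedged call sketch for Example #25; the exact meaning of prop and length depends on gerenate_ngrams_for_tweets, which is not shown, so the values below (and the ngram_builder instance name) are only illustrative:

# Hypothetical call: build 2-grams from the "text" property of every tweet in the index.
ngram_builder.generate_ngrams_for_index(
    index="test3",
    from_property="text",
    prop="2grams",
    length=2)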
Example #26
 def get_range_count(self, index, start, end):
     my_connector = Es_connector(index=index)
     query = {
         "query": {
             "range": {
                 "timestamp_ms": {
                     "gt": str(start),
                     "lt": str(end)
                 }
             }
         }
     }
     print(query)
     res = my_connector.count(query)
     return res['count']
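Example #26 passes the range bounds as strings, which fits Twitter's timestamp_ms field (an epoch timestamp in milliseconds). A hedged usage sketch with illustrative bounds; dashboard stands in for an instance of the class above:

# Hypothetical call: count tweets falling between two epoch-millisecond timestamps.
n = dashboard.get_range_count("test3", 1496275200000, 1496361600000)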
Example #27
 def get_big_tweets_scroll(self, index="test3", word=""):
     my_connector = Es_connector(index=index)
     res = my_connector.init_paginatedSearch({
         "_source": [
             "text", "id_str", "extended_entities", "user", "created_at",
             "link"
         ],
         "query": {
             "simple_query_string": {
                 "fields": ["text"],
                 "query": word
             }
         }
     })
     return res
Example #28
 def get_end_date(self, index):
     my_connector = Es_connector(index=index)
     res = my_connector.search_size(
         {
             "_source": ["@timestamp", "timestamp_ms"],
             "query": {
                 "match_all": {}
             },
             "sort": [{
                 "@timestamp": {
                     "order": "desc"
                 }
             }]
         }, 1)
     return res['hits']['hits'][0]['_source']
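Example #28 fetches the newest document with a single descending-sorted hit. A hedged mirror for the oldest document, under the same search_size interface (get_start_date is not part of the original snippets):

    def get_start_date(self, index):
        # Illustrative counterpart of get_end_date: ascending sort returns the oldest document.
        my_connector = Es_connector(index=index)
        res = my_connector.search_size(
            {
                "_source": ["@timestamp", "timestamp_ms"],
                "query": {"match_all": {}},
                "sort": [{"@timestamp": {"order": "asc"}}]
            }, 1)
        return res['hits']['hits'][0]['_source']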
Example #29
    def clear_tmp_predictions(self, **kwargs):

        my_connector = Es_connector(
            index=kwargs["index"],
            doc_type="tweet")  # config_relative_path='../')
        res = my_connector.update_by_query(
            {
                "query": {
                    "exists": {
                        "field": kwargs["session"] + "_tmp"
                    }  # e.g. session_lyon2015_test_01_tmp
                }
            },
            "ctx._source." + kwargs["session"] + "_tmp = 'proposed'"
        )  #"ctx._source.remove('" + kwargs["session"] + "_tmp')")
Example #30
    def updatePropertyValue(self, **kwargs):

        tweet = kwargs["tweet"]
        # cnn = Es_connector(index=kwargs["index"]);
        #
        # q = {
        #     "script": {
        #         "inline": "ctx._source." + kwargs["property_name"] + " = params.value",
        #         "lang": "painless",
        #         "params": {
        #             "value": str(kwargs["property_value"])
        #         }
        #     },
        #     "query": {
        #         "match": {
        #             "_id": tweet["_id"]
        #         }
        #     }
        # }
        #
        # cnn.es.update_by_query(body=q, doc_type='tweet', index=kwargs["index"])

        Es_connector(index=kwargs["index"]).es.update(
            index=kwargs["index"],
            doc_type="tweet",
            id=tweet["_id"],
            body={"doc": {
                kwargs["property_name"]: kwargs["property_value"]
            }},
            retry_on_conflict=5)
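A hedged usage sketch for Example #30; helper stands in for whatever object exposes updatePropertyValue, and tweet_hit is assumed to be an Elasticsearch search hit (so it carries "_id"):

# Hypothetical call: write a per-session label onto a single tweet document.
helper.updatePropertyValue(
    index="test3",
    tweet=tweet_hit,
    property_name="session_lyon2015",
    property_value="confirmed")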