def get_tweets(self, index, doc_field):
    """Collect and tokenize every tweet in *index* that has *doc_field* set.

    Runs a paginated (scroll) search restricted to documents where
    *doc_field* exists, tokenizes the field text with ``self.tknzr`` and
    returns a list of hit-shaped dicts holding the token list plus the
    original ``timestamp_ms``.

    NOTE(review): a second ``get_tweets`` with a different signature is
    defined later in this class and will shadow this one — confirm which
    variant callers actually use.
    """
    connector = Es_connector(index=index)
    query = {
        "_source": [doc_field, "timestamp_ms"],
        "query": {"exists": {"field": doc_field}},
    }
    page = connector.init_paginatedSearch(query)
    sid = page["sid"]
    scroll_size = page["scroll_size"]

    tokenized = []
    processed = 0
    # Walk the scroll page by page until Elasticsearch returns no more hits.
    while scroll_size > 0:
        for hit in page["results"]:
            source = hit["_source"]
            tokenized.append({
                "_source": {
                    doc_field: self.tknzr.tokenize(source[doc_field]),
                    "timestamp_ms": source["timestamp_ms"],
                }
            })
        processed += scroll_size
        page = connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = page["scroll_size"]
    return tokenized
def get_tweets_query_state(self, index="test3", word="", state="proposed", session=""):
    """Paginated search for tweets matching *word* whose *session* field equals *state*.

    The keyword match scores the hits; the session/state condition is a
    non-scoring filter.
    """
    connector = Es_connector(index=index)
    keyword_clause = {
        "simple_query_string": {"fields": ["text"], "query": word}
    }
    # Filter (non-scoring) on the per-session label field.
    state_filter = {
        "bool": {"should": [{"match": {session: state}}]}
    }
    query = {
        "query": {
            "bool": {"must": keyword_clause, "filter": state_filter}
        }
    }
    return connector.init_paginatedSearch(query)
def get_event_tweets(self, index="test3", main_term="", related_terms=""):
    """Score-sorted paginated search for tweets matching an event vocabulary.

    Each related term contributes a boosted ``match`` clause on ``text``
    (boost taken from the term's ``value``); the main term is added with a
    fixed boost of 2. Any clause may match (``bool``/``should``), and hits
    are sorted by ``_score``.

    :param index: Elasticsearch index to query.
    :param main_term: primary keyword of the event.
    :param related_terms: iterable of dicts with ``word`` and ``value`` keys.
    :return: the connector's paginated-search state dict (sid, results, ...).
    """
    my_connector = Es_connector(index=index)
    # One boosted clause per related term, plus the main term at boost 2.
    terms = [
        {"match": {"text": {"query": t["word"], "boost": t["value"]}}}
        for t in related_terms
    ]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
    query = {"sort": ["_score"], "query": {"bool": {"should": terms}}}
    return my_connector.init_paginatedSearch(query)
def get_tweets(self, index="test3", word=""):
    """Paginated full-text search for *word* in the ``text`` field.

    :param index: Elasticsearch index to query.
    :param word: query string passed to ``simple_query_string``.
    :return: the connector's paginated-search state dict (sid, results, ...).

    NOTE(review): this redefines the ``get_tweets`` declared earlier in the
    class with a different signature; in Python the later definition wins,
    so the earlier variant is shadowed — confirm that is intended.
    """
    my_connector = Es_connector(index=index)
    query = {
        "query": {
            "simple_query_string": {"fields": ["text"], "query": word}
        }
    }
    return my_connector.init_paginatedSearch(query)
def get_tweets_state(self, index="test3", session="", state="proposed"):
    """Paginated search for tweets whose ``session_<session>`` field equals *state*."""
    field_name = "session_" + session
    query = {"query": {"term": {field_name: state}}}
    return Es_connector(index=index).init_paginatedSearch(query)
def get_event_tweets2(self, index="test3", main_term="", related_terms="", cid=0):
    """Score-sorted paginated search for event tweets within one image cluster.

    Like ``get_event_tweets`` but additionally requires ``imagesCluster`` to
    match *cid*; at least one of the boosted term clauses must also match
    (``minimum_should_match: 1``).

    :param index: Elasticsearch index to query.
    :param main_term: primary keyword of the event (boost 2).
    :param related_terms: iterable of dicts with ``word`` and ``value`` keys.
    :param cid: image-cluster id the tweets must belong to.
    :return: the connector's paginated-search state dict (sid, results, ...).
    """
    my_connector = Es_connector(index=index)
    # One boosted clause per related term, plus the main term at boost 2.
    terms = [
        {"match": {"text": {"query": t["word"], "boost": t["value"]}}}
        for t in related_terms
    ]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
    query = {
        "sort": ["_score"],
        "query": {
            "bool": {
                "should": terms,
                "minimum_should_match": 1,
                "must": [{"match": {"imagesCluster": cid}}],
            }
        },
    }
    return my_connector.init_paginatedSearch(query)
def get_event_filter_tweets(self, index="test3", main_term="", related_terms="", state="proposed", session=""):
    """Score-sorted paginated search for event tweets filtered by label state.

    The boosted term clauses score the hits; the *session*/*state* condition
    is applied as a non-scoring filter.

    :param index: Elasticsearch index to query.
    :param main_term: primary keyword of the event (boost 2).
    :param related_terms: iterable of dicts with ``word`` and ``value`` keys.
    :param state: label value the session field must match.
    :param session: name of the per-session label field.
    :return: the connector's paginated-search state dict (sid, results, ...).
    """
    my_connector = Es_connector(index=index)
    # One boosted clause per related term, plus the main term at boost 2.
    terms = [
        {"match": {"text": {"query": t["word"], "boost": t["value"]}}}
        for t in related_terms
    ]
    terms.append({"match": {"text": {"query": main_term, "boost": 2}}})
    query = {
        "sort": ["_score"],
        "query": {
            "bool": {
                "must": [{"bool": {"should": terms}}],
                "filter": {
                    "bool": {"should": [{"match": {session: state}}]}
                },
            }
        },
    }
    return my_connector.init_paginatedSearch(query)
def generate_ngrams_for_index(self, **kwargs):
    """Generate n-grams for every document of an index, one scroll page at a time.

    Expected kwargs: ``index``, ``from_property``, ``prop``, ``length``;
    optional ``query`` (defaults to match_all). Progress is exposed through
    ``self.current_thread_percentage`` so a client can poll it.

    :return: True on success, False on any error (best-effort: the error is
        printed, not re-raised, so a background run never crashes the caller).
    """
    try:
        self.current_thread_percentage = 0
        print("Starting")
        my_connector = Es_connector(index=kwargs["index"])
        query = kwargs.get('query', {"query": {"match_all": {}}})
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]
        total = int(res["total"])
        # Guard against zero pages (empty index, or total < page size):
        # the original int(total / scroll_size) raised ZeroDivisionError on
        # an empty index and produced 0 pages when total < scroll_size,
        # which then broke the percentage computation below.
        total_scrolls = max(int(total / scroll_size), 1) if scroll_size > 0 else 1
        processed_scrolls = 0
        print("from_property:", kwargs['from_property'])
        while scroll_size > 0:
            tweets = res["results"]
            self.gerenate_ngrams_for_tweets(
                tweets,
                from_property=kwargs['from_property'],
                prop=kwargs["prop"],
                index=kwargs["index"],
                length=kwargs["length"])
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]
            processed_scrolls += 1
            self.current_thread_percentage = round(
                processed_scrolls * 100 / total_scrolls, 0)
            print("Completed: ", self.current_thread_percentage, "%")
        # Force 100% at the end so the polling client knows to stop asking.
        self.current_thread_percentage = 100
        return True
    except Exception as e:
        print('Error: ' + str(e))
        return False
def get_big_tweets_scroll(self, index="test3", word=""):
    """Paginated search for *word* in ``text``, returning a trimmed ``_source``."""
    wanted_fields = [
        "text",
        "id_str",
        "extended_entities",
        "user",
        "created_at",
        "link",
    ]
    query = {
        "_source": wanted_fields,
        "query": {
            "simple_query_string": {"fields": ["text"], "query": word}
        },
    }
    return Es_connector(index=index).init_paginatedSearch(query)
def search_bigrams_related_tweets(self, **kwargs):
    """Paginated search for labelled tweets containing a given n-gram.

    Expected kwargs: ``index``, ``ngramsPropName``, ``ngram``, ``session``,
    ``label``; optional ``full_search`` (bool, default False) and ``word``.
    When ``full_search`` is falsy the tweet text must additionally match
    ``word``.

    :return: the connector's paginated-search state dict (sid, results, ...).
    """
    my_connector = Es_connector(index=kwargs["index"])
    must_clauses = []
    if not kwargs.get('full_search', False):
        # Keyword-restricted search: the text must also match the word.
        must_clauses.append({"match": {"text": kwargs["word"]}})
    must_clauses.append(
        {"match_phrase": {kwargs["ngramsPropName"]: kwargs["ngram"]}})
    must_clauses.append({"match": {kwargs["session"]: kwargs["label"]}})
    # Debug `print(query)` leftover removed.
    query = {"query": {"bool": {"must": must_clauses}}}
    return my_connector.init_paginatedSearch(query)
def download_tweets_from_elastic(self, **kwargs):
    """Download all tweets matching ``kwargs["query"]`` and write them to disk.

    Scrolls through the index page by page, handing each page of hits to
    ``self.write_data_in_folders`` (which files them under
    ``kwargs["folder"]`` using ``kwargs["field"]``).

    Expected kwargs: ``index``, ``query``, ``field``, ``folder``; optional
    ``config_relative_path``, ``debug_limit`` (bool, default False) and
    ``log_enabled`` (bool, default True).

    :return: total number of matching documents reported by Elasticsearch
        (note: not necessarily the number actually written when
        ``debug_limit`` is set).
    """
    debug_limit = kwargs.get("debug_limit", False)
    log_enabled = kwargs.get("log_enabled", True)
    # Only pass config_relative_path through when the caller supplied it,
    # so the connector's own default is used otherwise.
    if "config_relative_path" in kwargs:
        my_connector = Es_connector(
            index=kwargs["index"],
            doc_type="tweet",
            config_relative_path=kwargs["config_relative_path"])
    else:
        my_connector = Es_connector(
            index=kwargs["index"], doc_type="tweet")  # config_relative_path='../')
    res = my_connector.init_paginatedSearch(kwargs["query"])
    sid = res["sid"]
    scroll_size = res["scroll_size"]
    total = int(res["total"])
    processed = len(res["results"])
    # First page is written before the loop; the loop writes each
    # subsequent page as it is fetched.
    self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                               res["results"])
    while scroll_size > 0:
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]
        processed += len(res["results"])
        # Writing the retrieved files into the folders
        self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                   res["results"])
        if log_enabled:
            print("Downloading: ", round(processed * 100 / total, 2), "%")
        if debug_limit:
            # Debug short-circuit: fetch and write exactly one more page,
            # then force the loop to stop (scroll_size = 0). Note this extra
            # page is not counted in `processed`.
            print("\nDEBUG LIMIT\n")
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                       res["results"])
            scroll_size = 0
    return total
def search_event_bigrams_related_tweets(self, **kwargs):
    """Paginated search for labelled tweets matching an event's terms and an n-gram.

    At least one of the caller-supplied ``target_terms`` clauses must match,
    and the tweet must contain the n-gram phrase and carry the session label.
    """
    connector = Es_connector(index=kwargs["index"])
    phrase_clause = {
        "match_phrase": {kwargs["ngramsPropName"]: kwargs["ngram"]}
    }
    label_clause = {"match": {kwargs["session"]: kwargs["label"]}}
    query = {
        "query": {
            "bool": {
                "should": kwargs["target_terms"],
                "minimum_should_match": 1,
                "must": [phrase_clause, label_clause],
            }
        }
    }
    return connector.init_paginatedSearch(query)
index=index, doc_type="tweet", id=tweet["_id"], body={"doc": { output_field: full_text }}) print("Languages for stopwords: ", ngramsAnalizer.retrievedLangs) try: my_connector = Es_connector(index=index) #query = #"query": { #"match_all": {} #} query = {"query": {"match": {"lang": "en or fr or es"}}} res = my_connector.init_paginatedSearch(query=query) sid = res["sid"] scroll_size = res["scroll_size"] init_total = int(res["total"]) accum_total = 0 print("\nTotal = ", init_total) print("\nScroll = ", scroll_size) print("\nLangs = ", langs) while scroll_size > 0: generate_text_images_prop(res["results"], langs) res = my_connector.loop_paginatedSearch(sid, scroll_size) scroll_size = res["scroll_size"]