def get_tweets(self, index, doc_field):
    """Fetch all documents in *index* that contain *doc_field*, tokenizing that field.

    Pages through the whole index with the Elasticsearch scroll API and
    returns a list of dicts shaped like ES hits:
    ``{'_source': {doc_field: <tokens>, 'timestamp_ms': <ts>}}``.

    :param index: name of the Elasticsearch index to read from.
    :param doc_field: source field to extract and tokenize (via ``self.tknzr``).
    :return: list of tokenized pseudo-hits for every matching document.
    """
    my_connector = Es_connector(index=index)
    all_tweets = []
    # Only pull the two fields we need; restrict to docs that have doc_field.
    query = {
        "_source": [doc_field, "timestamp_ms"],
        "query": {
            "exists": {
                "field": doc_field
            }
        }
    }
    res = my_connector.init_paginatedSearch(query)
    sid = res["sid"]
    scroll_size = res["scroll_size"]
    # Analyse and process page by page until the scroll is exhausted.
    # (The original kept a `processed_tweets` counter that was never read;
    # it has been removed as dead code.)
    while scroll_size > 0:
        tweets = res["results"]
        all_tweets.extend([{
            '_source': {
                doc_field: self.tknzr.tokenize(tweet["_source"][doc_field]),
                "timestamp_ms": tweet["_source"]["timestamp_ms"]
            }
        } for tweet in tweets])
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]
    return all_tweets
def download_tweets_from_elastic(self, **kwargs):
    """Stream tweets matching ``kwargs['query']`` out of Elasticsearch into folders.

    Required kwargs: ``index``, ``query``, ``field``, ``folder``.
    Optional kwargs: ``config_relative_path`` (forwarded to Es_connector),
    ``debug_limit`` (stop after one extra page, default False),
    ``log_enabled`` (print download progress, default True).

    :return: total number of matching documents reported by Elasticsearch.
    """
    debug_limit = kwargs.get("debug_limit", False)
    log_enabled = kwargs.get("log_enabled", True)

    # Build the connector arguments once; only forward the config path
    # when the caller supplied one.
    connector_kwargs = {"index": kwargs["index"], "doc_type": "tweet"}
    if "config_relative_path" in kwargs:
        connector_kwargs["config_relative_path"] = kwargs["config_relative_path"]
    my_connector = Es_connector(**connector_kwargs)

    res = my_connector.init_paginatedSearch(kwargs["query"])
    sid = res["sid"]
    scroll_size = res["scroll_size"]
    total = int(res["total"])

    # First page is written immediately; subsequent pages inside the loop.
    processed = len(res["results"])
    self.write_data_in_folders(kwargs["field"], kwargs["folder"], res["results"])

    while scroll_size > 0:
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]
        processed += len(res["results"])
        # Write the freshly retrieved page into the folders.
        self.write_data_in_folders(kwargs["field"], kwargs["folder"], res["results"])
        if log_enabled:
            print("Downloading: ", round(processed * 100 / total, 2), "%")
        if debug_limit:
            # Debug mode: fetch and write one extra page, then bail out.
            print("\nDEBUG LIMIT\n")
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            self.write_data_in_folders(kwargs["field"], kwargs["folder"], res["results"])
            scroll_size = 0

    return total
def generate_ngrams_for_index(self, **kwargs):
    """Generate n-grams for every document page of an index, tracking progress.

    Required kwargs: ``index``, ``from_property``, ``prop``, ``length``.
    Optional kwargs: ``query`` (defaults to match_all).

    Updates ``self.current_thread_percentage`` as pages are processed so a
    client can poll progress; set to 100 on completion.

    :return: True on success, False if any exception was raised.
    """
    try:
        # Get the data for performing a paginated search.
        self.current_thread_percentage = 0
        print("Starting")
        my_connector = Es_connector(index=kwargs["index"])
        query = kwargs.get('query', {"query": {"match_all": {}}})
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]
        total = int(res["total"])

        # BUGFIX: the original `int(total / scroll_size)` raised
        # ZeroDivisionError on an empty index (scroll_size == 0) and produced
        # total_scrolls == 0 whenever total < scroll_size, which then crashed
        # the percentage division inside the loop. Use ceiling division
        # clamped to at least 1.
        if scroll_size > 0:
            total_scrolls = max(1, -(-total // scroll_size))
        else:
            total_scrolls = 1
        processed_scrolls = 0
        print("from_property:", kwargs['from_property'])

        # Analyse and process page by page.
        while scroll_size > 0:
            tweets = res["results"]
            self.gerenate_ngrams_for_tweets(
                tweets,
                from_property=kwargs['from_property'],
                prop=kwargs["prop"],
                index=kwargs["index"],
                length=kwargs["length"])
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]
            processed_scrolls += 1
            self.current_thread_percentage = round(
                processed_scrolls * 100 / total_scrolls, 0)
            print("Completed: ", self.current_thread_percentage, "%")

        # Set it at the end so the client knows when to stop asking for logs.
        self.current_thread_percentage = 100
        return True
    except Exception as e:
        print('Error: ' + str(e))
        return False
def get_tweets_scroll(self, index, sid, scroll_size):
    """Advance the Elasticsearch scroll *sid* by one page and return the raw response."""
    return Es_connector(index=index).loop_paginatedSearch(sid, scroll_size)
print("Languages for stopwords: ", ngramsAnalizer.retrievedLangs)
try:
    my_connector = Es_connector(index=index)
    query = {"query": {"match": {"lang": "en or fr or es"}}}
    res = my_connector.init_paginatedSearch(query=query)
    sid = res["sid"]
    scroll_size = res["scroll_size"]
    init_total = int(res["total"])
    accum_total = 0
    print("\nTotal = ", init_total)
    print("\nScroll = ", scroll_size)
    print("\nLangs = ", langs)
    while scroll_size > 0:
        generate_text_images_prop(res["results"], langs)
        # BUGFIX: count the page that was just processed. The original added
        # the *next* page's scroll_size after the fetch, so the first page was
        # never counted and the reported progress could never reach 100%.
        accum_total += len(res["results"])
        print(accum_total * 100 / init_total, "%")
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]
except Exception as e:
    print('Error: ' + str(e))