Example #1
    def get_tweets(self, index, doc_field):

        my_connector = Es_connector(index=index)
        all_tweets = []
        query = {
            "_source": [doc_field, "timestamp_ms"],
            "query": {
                "exists": {
                    "field": doc_field
                }
            }
        }
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]

        # Analyse and process page by page
        processed_tweets = 0
        while scroll_size > 0:

            tweets = res["results"]
            # Tokenize the target field of each tweet, keeping its timestamp
            all_tweets.extend([{
                '_source': {
                    doc_field: self.tknzr.tokenize(tweet["_source"][doc_field]),
                    "timestamp_ms": tweet["_source"]["timestamp_ms"]
                }
            } for tweet in tweets])
            processed_tweets += scroll_size

            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]

        return all_tweets
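All of these examples share the same pagination contract: init_paginatedSearch opens a scroll and returns a dict with "sid", "scroll_size", "total" and "results", and loop_paginatedSearch fetches the next page. Es_connector is a project-specific wrapper, so the following is only a minimal sketch of what those two methods plausibly wrap, using the official elasticsearch-py client; the connection details, page size and scroll timeout are assumptions.

from elasticsearch import Elasticsearch

class Es_connector:

    def __init__(self, index, doc_type=None, config_relative_path=None):
        # Connection details are an assumption; the real class presumably
        # reads them from a config file (hence config_relative_path)
        self.es = Elasticsearch()
        self.index = index

    def init_paginatedSearch(self, query):
        # Open a scroll context and return the first page
        page = self.es.search(index=self.index, body=query,
                              scroll="2m", size=1000)
        hits = page["hits"]["hits"]
        return {
            "sid": page["_scroll_id"],
            "scroll_size": len(hits),
            "total": page["hits"]["total"],  # an int on ES <= 6, a dict on ES >= 7
            "results": hits,
        }

    def loop_paginatedSearch(self, sid, scroll_size):
        # Fetch the next page of the open scroll context; scroll_size is
        # accepted to match the call sites above but the scroll API does
        # not need it
        page = self.es.scroll(scroll_id=sid, scroll="2m")
        hits = page["hits"]["hits"]
        return {"sid": page["_scroll_id"],
                "scroll_size": len(hits),
                "results": hits}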
Example #2
    def download_tweets_from_elastic(self, **kwargs):

        debug_limit = kwargs.get("debug_limit", False)
        log_enabled = kwargs.get("log_enabled", True)

        if "config_relative_path" in kwargs:
            my_connector = Es_connector(
                index=kwargs["index"],
                doc_type="tweet",
                config_relative_path=kwargs["config_relative_path"])
        else:
            my_connector = Es_connector(
                index=kwargs["index"],
                doc_type="tweet")

        res = my_connector.init_paginatedSearch(kwargs["query"])
        sid = res["sid"]
        scroll_size = res["scroll_size"]
        total = int(res["total"])
        processed = len(res["results"])

        self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                   res["results"])

        while scroll_size > 0:
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]
            processed += len(res["results"])

            # Writing the retrieved files into the folders
            self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                       res["results"])
            if log_enabled:
                print("Downloading: ", round(processed * 100 / total, 2), "%")

            if debug_limit:
                # Debug mode: fetch and write one more page, then stop early
                print("\nDEBUG LIMIT\n")
                res = my_connector.loop_paginatedSearch(sid, scroll_size)
                self.write_data_in_folders(kwargs["field"], kwargs["folder"],
                                           res["results"])
                scroll_size = 0

        return total
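For illustration, a hypothetical invocation of the downloader above; the owning class name (TweetRetriever) and the index, query, field and folder values are all placeholders, not names from the original project.

retriever = TweetRetriever()  # assumed owner class, for illustration only
total = retriever.download_tweets_from_elastic(
    index="tweets",
    query={"query": {"match_all": {}}},
    field="text",
    folder="./downloaded_tweets",
    log_enabled=True)
print(total, "matching tweets")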
Example #3
    def generate_ngrams_for_index(self, **kwargs):

        try:
            # Get the data for performing a paginated search
            self.current_thread_percentage = 0
            print("Starting")
            my_connector = Es_connector(index=kwargs["index"])

            query = kwargs.get('query', {"query": {"match_all": {}}})

            res = my_connector.init_paginatedSearch(query)
            sid = res["sid"]
            scroll_size = res["scroll_size"]
            total = int(res["total"])

            # Analyse and process page by page
            i = 0
            # Ceiling division so the progress percentage cannot exceed
            # 100%, and a guard against an empty first page (scroll_size == 0)
            total_scrolls = -(-total // scroll_size) if scroll_size > 0 else 1
            processed_scrolls = 0

            print("from_property:", kwargs['from_property'])

            while scroll_size > 0:
                tweets = res["results"]
                self.gerenate_ngrams_for_tweets(
                    tweets,
                    from_property=kwargs['from_property'],
                    prop=kwargs["prop"],
                    index=kwargs["index"],
                    length=kwargs["length"])

                i += 1
                res = my_connector.loop_paginatedSearch(sid, scroll_size)
                scroll_size = res["scroll_size"]
                processed_scrolls += 1

                self.current_thread_percentage = round(
                    processed_scrolls * 100 / total_scrolls, 0)

                print("Completed: ", self.current_thread_percentage, "%")

            # Set it to 100 at the end so the client knows when to stop asking for more logs
            self.current_thread_percentage = 100

            return True

        except Exception as e:
            print('Error: ' + str(e))
            return False
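A hypothetical call to the method above; the owning object (analyzer) and every argument value are assumptions made for illustration.

ok = analyzer.generate_ngrams_for_index(
    index="tweets",
    from_property="text",   # field the text is read from (assumed)
    prop="2grams",          # field the n-grams are written to (assumed)
    length=2)               # n-gram size
if not ok:
    print("n-gram generation failed; see the logged error")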
Example #4
    def get_tweets_scroll(self, index, sid, scroll_size):
        # Fetch the next page of an already-open scroll context
        my_connector = Es_connector(index=index)
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        return res
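This wrapper only fetches the next page, leaving the scroll state with the caller. A minimal sketch of driving it from outside; the svc object that owns the method and the index name are assumptions.

res = Es_connector(index="tweets").init_paginatedSearch(
    {"query": {"match_all": {}}})
sid, scroll_size = res["sid"], res["scroll_size"]
while scroll_size > 0:
    # svc is whatever object exposes get_tweets_scroll (an assumption)
    res = svc.get_tweets_scroll("tweets", sid, scroll_size)
    scroll_size = res["scroll_size"]
    # ... process res["results"] here ...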
Example #5
    print("Languages for stopwords: ", ngramsAnalizer.retrievedLangs)


try:
    my_connector = Es_connector(index=index)
    # query = {"query": {"match_all": {}}}
    # With a text-mapped lang field, the match query tokenizes the string
    # below and ORs the tokens, so this effectively matches "en", "fr" or "es"
    query = {"query": {"match": {"lang": "en or fr or es"}}}
    res = my_connector.init_paginatedSearch(query=query)

    sid = res["sid"]
    scroll_size = res["scroll_size"]
    init_total = int(res["total"])
    accum_total = 0

    print("\nTotal = ", init_total)
    print("\nScroll = ", scroll_size)
    print("\nLangs = ", langs)

    while scroll_size > 0:

        generate_text_images_prop(res["results"], langs)
        res = my_connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = res["scroll_size"]
        accum_total += scroll_size
        print(accum_total * 100 / init_total, "%")

except Exception as e:
    print('Error: ' + str(e))
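None of the examples close the scroll context when they finish, so it lives on the cluster until its timeout expires. If Es_connector exposes the underlying client, it can be released explicitly after the loop; clear_scroll is the standard Elasticsearch API for this, but the es attribute on my_connector is an assumption.

# Release the server-side scroll context instead of waiting for it to time
# out; my_connector.es (the raw elasticsearch-py client) is an assumption
my_connector.es.clear_scroll(scroll_id=sid)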