Example #1
    def __init__(self, source_file_path, stopwords_file_path, min_absolute_freq=10, max_relative_freq=0.4, separator='\t', save_voc=False):
        self.source_file_path = source_file_path
        self.size = 0
        self.start_date = '3000-01-01 00:00:00'
        self.end_date = '1000-01-01 00:00:00'
        self.separator = separator

        # load stop-words
        self.stopwords = utils.load_stopwords(stopwords_file_path)

        # identify features
        with open(source_file_path, 'r') as input_file:
            print("Reading from ", input_file)
            csv_reader = csv.reader(input_file, delimiter=self.separator)
            header = next(csv_reader)
            text_column_index = header.index('text')
            date_column_index = header.index('date')
            word_frequency = {}
            for line in csv_reader:
                self.size += 1
                words = self.tokenize(line[text_column_index])
                date = line[date_column_index]
                if date > self.end_date:
                    self.end_date = date
                if date < self.start_date:
                    self.start_date = date
                # update word frequency
                for word in words:
                    if len(word) > 1:
                        word_frequency[word] = word_frequency.get(word, 0) + 1

            print("word_frequency", word_frequency)
            # sort words w.r.t frequency
            vocabulary = list(word_frequency.items())
            vocabulary.sort(key=lambda x: x[1], reverse=True)
            if save_voc:
                with open('vocabulary.pickle', 'wb') as output_file:
                    pickle.dump(vocabulary, output_file)
            self.vocabulary = {}
            vocabulary_size = 0
            # construct the vocabulary map
            for word, frequency in vocabulary:
                if frequency > min_absolute_freq and frequency / self.size < max_relative_freq and word not in self.stopwords:
                    self.vocabulary[word] = vocabulary_size
                    vocabulary_size += 1
            self.start_date = datetime.strptime(self.start_date, "%Y-%m-%d %H:%M:%S")
            self.end_date = datetime.strptime(self.end_date, "%Y-%m-%d %H:%M:%S")
            print('   Corpus: %i tweets, spanning from %s to %s' % (self.size,
                                                                    self.start_date,
                                                                    self.end_date))
            print('   Vocabulary: %d distinct words' % vocabulary_size)
            self.time_slice_count = None
            self.tweet_count = None
            self.global_freq = None
            self.mention_freq = None
            self.time_slice_length = None
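
A minimal usage sketch for the constructor above. The enclosing class is not shown in the snippet, so the name Corpus and the file paths below are assumptions; the input is expected to be a separator-delimited file with 'date' and 'text' columns.

# Hypothetical usage; 'Corpus' is an assumed class name and both paths are placeholders.
corpus = Corpus('input/tweets.csv',
                'stopwords/stopwords-en.txt',
                min_absolute_freq=10,
                max_relative_freq=0.4,
                separator='\t',
                save_voc=True)
print(corpus.size, 'tweets,', len(corpus.vocabulary), 'vocabulary entries')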
Example #2
    def __init__(self, source_file_path, stopwords_file_path, min_absolute_freq=10, max_relative_freq=0.4, separator='\t', save_voc=False):
        self.source_file_path = source_file_path
        self.size = 0
        self.start_date = '3000-01-01 00:00:00'
        self.end_date = '1000-01-01 00:00:00'
        self.separator = separator

        # load stop-words
        self.stopwords = utils.load_stopwords(stopwords_file_path)

        # identify features
        with open(source_file_path, 'r') as input_file:
            csv_reader = csv.reader(input_file, delimiter=self.separator)
            header = next(csv_reader)
            text_column_index = header.index('text')
            date_column_index = header.index('date')
            word_frequency = {}
            for line in csv_reader:
                self.size += 1
                words = self.tokenize(line[text_column_index])
                date = line[date_column_index]
                if date > self.end_date:
                    self.end_date = date
                if date < self.start_date:
                    self.start_date = date
                # update word frequency
                for word in words:
                    if len(word) > 1:
                        word_frequency[word] = word_frequency.get(word, 0) + 1
            # sort words w.r.t frequency
            vocabulary = list(word_frequency.items())
            vocabulary.sort(key=lambda x: x[1], reverse=True)
            if save_voc:
                with open('vocabulary.pickle', 'wb') as output_file:
                    pickle.dump(vocabulary, output_file)
            self.vocabulary = {}
            vocabulary_size = 0
            # construct the vocabulary map
            for word, frequency in vocabulary:
                if frequency > min_absolute_freq and frequency / self.size < max_relative_freq and word not in self.stopwords:
                    self.vocabulary[word] = vocabulary_size
                    vocabulary_size += 1
            self.start_date = datetime.strptime(self.start_date, "%Y-%m-%d %H:%M:%S")
            self.end_date = datetime.strptime(self.end_date, "%Y-%m-%d %H:%M:%S")
            print('   Corpus: %i tweets, spanning from %s to %s' % (self.size,
                                                                    self.start_date,
                                                                    self.end_date))
            print('   Vocabulary: %d distinct words' % vocabulary_size)
            self.time_slice_count = None
            self.tweet_count = None
            self.global_freq = None
            self.mention_freq = None
            self.time_slice_length = None
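
When save_voc is enabled, the sorted (word, frequency) pairs are pickled to 'vocabulary.pickle' before stop-word and frequency filtering. A small sketch of reading that file back, using only the path and structure visible in the code above:

import pickle

# Load the (word, frequency) pairs dumped by the constructor when save_voc=True.
with open('vocabulary.pickle', 'rb') as f:
    vocabulary = pickle.load(f)

# Entries are sorted by descending frequency, so the head is the most common words.
for word, frequency in vocabulary[:10]:
    print(word, frequency)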
Example #3
    def w2v_tweets(self, stopwords_file_path, words, count):
        # Process hits here
        tweets = []
        # load stop-words
        stopwords = utils.load_stopwords(stopwords_file_path)

        # print(stopwords)

        def process_hits(hits, stopwords):
            t = []
            for item in hits:
                # tweet = item['_source']['text'].encode('utf-8', 'ignore').decode('utf-8')
                tweet = item['_source']['text']
                tokenized_tweet = self.tokenize(tweet, stopwords)
                # print(tokenized_tweet)
                t.append(tokenized_tweet)
            return t

        # Check that the index exists
        if not self.es.indices.exists(index=self.index):
            print("Index " + self.index + " does not exist")
            exit()

        body = {
            "query": {
                "bool": {
                    "should": {
                        "match": {
                            "text": {
                                "query": words
                            }
                        }
                    }
                }
            }
        }
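        # The body above is a bool query with a single 'should' match clause:
        # tweets whose 'text' field matches the query words are returned,
        # ranked by relevance score.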
        print(body)

        # Initialize the scroll with a first search request

        # filepath = "models/" + str(hash(words)).replace("-", "") + ".model"
        filepath = "models/" + words.replace(" ", "").replace(",", "") + ".model"
        modelfile = Path(filepath)
        if modelfile.is_file():
            model = gensim.models.Word2Vec.load(filepath)
        else:
            data = self.es.search(index=self.index,
                                  doc_type=self.doc_type,
                                  scroll='2m',
                                  size=self.size,
                                  body=body)

            # Get the scroll ID
            sid = data['_scroll_id']
            scroll_size = len(data['hits']['hits'])

            # Before scroll, process current batch of hits
            tweets = process_hits(data['hits']['hits'], stopwords)

            while scroll_size > 0:
                "Scrolling..."
                data = self.es.scroll(scroll_id=sid, scroll='2m')

                # Process current batch of hits
                tweets = tweets + process_hits(data['hits']['hits'], stopwords)

                # Update the scroll ID
                sid = data['_scroll_id']

                # Get the number of results that returned in the last scroll
                scroll_size = len(data['hits']['hits'])

            # print(texts[0])
            # tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in tweets]
            # tweets = tweets + ['lyon']

            model = gensim.models.Word2Vec(tweets,
                                           min_count=1,
                                           workers=10,
                                           negative=20)
            model.save(filepath)

        pwords = self.tokenize(words, stopwords)
        print("pwords", pwords)
        # context = model.most_similar(positive=['fête','lumières'], topn=10)
        context = model.most_similar(positive=pwords, topn=count)
        # context = model.most_similar(positive=['fête','lumières'], topn=count)
        # context = model.most_similar_cosmul(positive=pwords, topn=5)
        # context = model.similar_by_word(word='lyon', topn=5)

        # context = model.similar_by_vector(vector=['lyon','fdl','fdl2017'], topn=5)

        return context
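
A toy, Elasticsearch-free sketch of the gensim pattern used above (train, save/load, query). The token lists and the 'example.model' path are made up for illustration; recent gensim releases expose most_similar on model.wv, while older ones also accept model.most_similar as written above.

import gensim

# Stand-in corpus: each tweet is a list of tokens, as produced by process_hits().
tweets = [['lyon', 'fete', 'lumieres'],
          ['lyon', 'place', 'bellecour'],
          ['fete', 'lumieres', 'place']]

model = gensim.models.Word2Vec(tweets, min_count=1, workers=2, negative=20)
model.save('example.model')                      # placeholder path
model = gensim.models.Word2Vec.load('example.model')

# Version-safe query: gensim >= 4.0 requires model.wv.most_similar.
print(model.wv.most_similar(positive=['lyon'], topn=2))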
Example #4
    def __init__(self, stopwords_file_path, min_absolute_freq=10, max_relative_freq=0.4, separator='\t', save_voc=False, index='', session=False, filter=False, cluster=2):

        with open('config.json', 'r') as f:
            config = json.load(f)
        #index = config['default']

        self.size = 0
        self.start_date = '3000-01-01 00:00:00'
        self.end_date = '1000-01-01 00:00:00'
        self.separator = separator

        # load stop-words
        self.stopwords = utils.load_stopwords(stopwords_file_path)

        my_connector = Es_connector(index=index)
        if session and filter:
            self.tweets = my_connector.getFilteredTweets(session, filter)
        else:
            self.tweets = my_connector.getTweets()

        if self.tweets:
            # identify features
            word_frequency = {}
            for line in self.tweets:
                self.size += 1
                tweet_text = line['_source']['text']
                if 'imagesCluster' in line['_source']:
                    arr = line['_source']['imagesCluster']
                    if arr:
                        if isinstance(arr, list):
                            cluster_temp = ''.join(' Cluster' + str(cl) + ' ' for cl in arr)
                        else:
                            cluster_temp = ' Cluster' + str(arr)
                        # append the cluster labels, repeated 'cluster' times, to the tweet text
                        tweet_text = tweet_text + cluster_temp * cluster

                #   tokenization
                words = self.tokenize(tweet_text)

                # Updating start and ending dates
                date = line['_source']['timestamp_ms']
                if date > self.end_date:
                    self.end_date = date
                if date < self.start_date:
                    self.start_date = date

                # update word frequency for the vocabulary
                for word in words:
                    if len(word) > 1:
                        word_frequency[word] = word_frequency.get(word, 0) + 1

            # sort words w.r.t frequency
            vocabulary = list(word_frequency.items())
            vocabulary.sort(key=lambda x: x[1], reverse=True)
            if save_voc:
                with open('vocabulary.pickle', 'wb') as output_file:
                    pickle.dump(vocabulary, output_file)
            self.vocabulary = {}
            vocabulary_size = 0
            # construct the vocabulary map
            for word, frequency in vocabulary:
                if frequency > min_absolute_freq and frequency / self.size < max_relative_freq and word not in self.stopwords:
                    self.vocabulary[word] = vocabulary_size
                    vocabulary_size += 1
            # self.start_date = datetime.strptime(self.start_date, "%Y-%m-%d %H:%M:%S")
            self.start_date = time.ctime(int(self.start_date)/1000)
            self.start_date = datetime.strptime(self.start_date, "%a %b %d %H:%M:%S %Y")
            print(self.start_date)
            # self.end_date = datetime.strptime(self.end_date, "%Y-%m-%d %H:%M:%S")
            self.end_date = time.ctime(int(self.end_date)/1000)
            self.end_date = datetime.strptime(self.end_date, "%a %b %d %H:%M:%S %Y")
            print('   Corpus: %i tweets, spanning from %s to %s' % (self.size,
                                                                    self.start_date,
                                                                    self.end_date))
            print('   Vocabulary: %d distinct words' % vocabulary_size)
            self.time_slice_count = None
            self.tweet_count = None
            self.global_freq = None
            self.mention_freq = None
            self.time_slice_length = None
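
Here start_date and end_date are 'timestamp_ms' strings (epoch milliseconds), converted via time.ctime followed by strptime. A sketch of the same conversion done directly with datetime.fromtimestamp, shown only as an equivalent alternative; the timestamp value is a made-up example:

import time
from datetime import datetime

timestamp_ms = '1512387123456'   # example epoch-milliseconds string

# Round-trip used above: ctime formatting, then strptime (drops sub-second precision).
via_ctime = datetime.strptime(time.ctime(int(timestamp_ms) / 1000), "%a %b %d %H:%M:%S %Y")

# Direct conversion; like time.ctime, fromtimestamp interprets the value in local time.
direct = datetime.fromtimestamp(int(timestamp_ms) / 1000)

print(via_ctime, direct)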