# Assumes module-level imports: csv, pickle, datetime (from datetime)
# and the project-local utils module (providing load_stopwords()).
def __init__(self, source_file_path, stopwords_file_path, min_absolute_freq=10,
             max_relative_freq=0.4, separator='\t', save_voc=False):
    self.source_file_path = source_file_path
    self.size = 0
    # Sentinel dates: any real date is earlier than start_date and later than end_date.
    self.start_date = '3000-01-01 00:00:00'
    self.end_date = '1000-01-01 00:00:00'
    self.separator = separator

    # load stop-words
    self.stopwords = utils.load_stopwords(stopwords_file_path)

    # identify features
    with open(source_file_path, 'r') as input_file:
        csv_reader = csv.reader(input_file, delimiter=self.separator)
        header = next(csv_reader)
        text_column_index = header.index('text')
        date_column_index = header.index('date')
        word_frequency = {}
        for line in csv_reader:
            self.size += 1
            words = self.tokenize(line[text_column_index])

            # update the spanned time frame; two separate 'if's are required,
            # otherwise start_date is never updated when dates arrive in ascending order
            date = line[date_column_index]
            if date > self.end_date:
                self.end_date = date
            if date < self.start_date:
                self.start_date = date

            # update word frequency
            for word in words:
                if len(word) > 1:
                    word_frequency[word] = word_frequency.get(word, 0) + 1

    # sort words w.r.t. frequency
    vocabulary = list(word_frequency.items())
    vocabulary.sort(key=lambda x: x[1], reverse=True)
    if save_voc:
        with open('vocabulary.pickle', 'wb') as output_file:
            pickle.dump(vocabulary, output_file)

    # construct the vocabulary map, keeping only frequent enough, non-stop-words
    self.vocabulary = {}
    vocabulary_size = 0
    for word, frequency in vocabulary:
        if frequency > min_absolute_freq and frequency / self.size < max_relative_freq \
                and word not in self.stopwords:
            self.vocabulary[word] = vocabulary_size
            vocabulary_size += 1

    self.start_date = datetime.strptime(self.start_date, "%Y-%m-%d %H:%M:%S")
    self.end_date = datetime.strptime(self.end_date, "%Y-%m-%d %H:%M:%S")
    print('   Corpus: %i tweets, spanning from %s to %s' % (self.size, self.start_date, self.end_date))
    print('   Vocabulary: %d distinct words' % vocabulary_size)

    self.time_slice_count = None
    self.tweet_count = None
    self.global_freq = None
    self.mention_freq = None
    self.time_slice_length = None
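# --- Standalone sketch (not part of the class, illustrative only): shows the
# vocabulary construction performed above, i.e. counting word frequencies and
# keeping only words above a minimum absolute frequency and below a maximum
# relative frequency, stop-words excluded. The toy tweets, thresholds and the
# helper name build_vocabulary_sketch are made up for demonstration.
def build_vocabulary_sketch(tokenized_tweets, stopwords, min_absolute_freq=1, max_relative_freq=0.8):
    word_frequency = {}
    for words in tokenized_tweets:
        for word in words:
            if len(word) > 1:
                word_frequency[word] = word_frequency.get(word, 0) + 1
    vocabulary = {}
    corpus_size = len(tokenized_tweets)
    # most frequent words first, as in the constructor above
    for word, frequency in sorted(word_frequency.items(), key=lambda x: x[1], reverse=True):
        if frequency > min_absolute_freq and frequency / corpus_size < max_relative_freq \
                and word not in stopwords:
            vocabulary[word] = len(vocabulary)
    return vocabulary

# 'the' is dropped as a stop-word, 'event' is kept, rare words are filtered out
print(build_vocabulary_sketch(
    [['the', 'event', 'starts'], ['the', 'event', 'ends'], ['unrelated', 'tweet']],
    stopwords={'the'}))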
def w2v_tweets(self, stopwords_file_path, words, count):
    # Assumes module-level imports: gensim, Path (from pathlib) and the project-local
    # utils module; self.es is an Elasticsearch client, and self.index, self.doc_type
    # and self.size are set elsewhere on the instance.
    tweets = []

    # load stop-words
    stopwords = utils.load_stopwords(stopwords_file_path)

    def process_hits(hits, stopwords):
        # turn a batch of Elasticsearch hits into tokenized tweets
        t = []
        for item in hits:
            # tweet = item['_source']['text'].encode('utf-8', 'ignore').decode('utf-8')
            tweet = item['_source']['text']
            tokenized_tweet = self.tokenize(tweet, stopwords)
            t.append(tokenized_tweet)
        return t

    # check that the index exists
    if not self.es.indices.exists(index=self.index):
        print("Index " + self.index + " does not exist")
        exit()

    # retrieve the tweets matching the query words
    body = {
        "query": {
            "bool": {
                "should": {
                    "match": {
                        "text": {
                            "query": words
                        }
                    }
                }
            }
        }
    }
    print(body)

    # one cached model per query, named after the query words
    # filepath = "models/" + str(hash(words)).replace("-", "") + ".model"
    filepath = "models/" + words.replace(" ", "").replace(",", "") + ".model"
    modelfile = Path(filepath)
    if modelfile.is_file():
        model = gensim.models.Word2Vec.load(filepath)
    else:
        # initialize the scroll with a first search
        data = self.es.search(index=self.index, doc_type=self.doc_type, scroll='2m',
                              size=self.size, body=body)
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])
        # process the first batch of hits before scrolling
        tweets = process_hits(data['hits']['hits'], stopwords)
        while scroll_size > 0:
            print("Scrolling...")
            data = self.es.scroll(scroll_id=sid, scroll='2m')
            # process the current batch of hits
            tweets = tweets + process_hits(data['hits']['hits'], stopwords)
            # update the scroll ID and the size of the last batch
            sid = data['_scroll_id']
            scroll_size = len(data['hits']['hits'])
        # tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in tweets]
        model = gensim.models.Word2Vec(tweets, min_count=1, workers=10, negative=20)
        model.save(filepath)

    # tokenize the query words and retrieve their nearest neighbours in the embedding space
    pwords = self.tokenize(words, stopwords)
    print("pwords", pwords)
    # word vectors live in model.wv (gensim >= 4 removed Word2Vec.most_similar)
    context = model.wv.most_similar(positive=pwords, topn=count)
    # context = model.wv.most_similar_cosmul(positive=pwords, topn=5)
    # context = model.wv.similar_by_word(word='lyon', topn=5)
    return context
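# --- Standalone sketch (illustrative only): the same train-once / cache-on-disk /
# query-most-similar pattern used in w2v_tweets(), on toy tokenized sentences
# instead of Elasticsearch hits. The 'models/demo.model' path, the sentences and
# the helper name cached_most_similar_sketch are made up; gensim must be installed.
import os
import gensim

def cached_most_similar_sketch(tokenized_sentences, query_words, topn=3, filepath="models/demo.model"):
    if os.path.isfile(filepath):
        model = gensim.models.Word2Vec.load(filepath)  # reuse the cached model
    else:
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        model = gensim.models.Word2Vec(tokenized_sentences, min_count=1, workers=2, negative=5)
        model.save(filepath)  # cache for the next call
    # query through model.wv, which works in gensim 3.x and 4.x
    return model.wv.most_similar(positive=query_words, topn=topn)

sentences = [['fete', 'des', 'lumieres', 'lyon'],
             ['lyon', 'festival', 'lights'],
             ['concert', 'place', 'bellecour', 'lyon']]
print(cached_most_similar_sketch(sentences, ['lyon']))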
# Assumes module-level imports: json, pickle, datetime (from datetime),
# plus the project-local utils module and the Es_connector class.
def __init__(self, stopwords_file_path, min_absolute_freq=10, max_relative_freq=0.4,
             separator='\t', save_voc=False, index='', session=False, filter=False, cluster=2):
    with open('config.json', 'r') as f:
        config = json.load(f)
    # index = config['default']
    self.size = 0
    # Sentinel values: any real timestamp_ms string is earlier/later than these.
    self.start_date = '3000-01-01 00:00:00'
    self.end_date = '1000-01-01 00:00:00'
    self.separator = separator

    # load stop-words
    self.stopwords = utils.load_stopwords(stopwords_file_path)

    my_connector = Es_connector(index=index)
    if session and filter:
        self.tweets = my_connector.getFilteredTweets(session, filter)
    else:
        self.tweets = my_connector.getTweets()

    if self.tweets:
        # identify features
        word_frequency = {}
        for line in self.tweets:
            self.size += 1
            tweet_text = line['_source']['text']

            # append image-cluster pseudo-words so that each cluster label is
            # repeated 'cluster' times in the tweet text
            if 'imagesCluster' in line['_source']:
                arr = line['_source']['imagesCluster']
                if arr:
                    cluster_str = ""
                    cluster_temp = ""
                    if isinstance(arr, list):
                        for cl in arr:
                            cluster_temp = cluster_temp + ' Cluster' + str(cl) + ' '
                    else:
                        cluster_temp = ' Cluster' + str(arr)
                    for x in range(cluster):
                        cluster_str = cluster_str + cluster_temp
                    tweet_text = tweet_text + cluster_str

            # tokenization
            words = self.tokenize(tweet_text)

            # update the spanned time frame; two separate 'if's are required,
            # otherwise start_date is never updated when tweets arrive in ascending order
            date = line['_source']['timestamp_ms']
            if date > self.end_date:
                self.end_date = date
            if date < self.start_date:
                self.start_date = date

            # update word frequency for the vocabulary
            for word in words:
                if len(word) > 1:
                    word_frequency[word] = word_frequency.get(word, 0) + 1

        # sort words w.r.t. frequency
        vocabulary = list(word_frequency.items())
        vocabulary.sort(key=lambda x: x[1], reverse=True)
        if save_voc:
            # pickle requires binary mode; open() rejects an encoding argument here
            with open('vocabulary.pickle', 'wb') as output_file:
                pickle.dump(vocabulary, output_file)

        # construct the vocabulary map, keeping only frequent enough, non-stop-words
        self.vocabulary = {}
        vocabulary_size = 0
        for word, frequency in vocabulary:
            if frequency > min_absolute_freq and frequency / self.size < max_relative_freq \
                    and word not in self.stopwords:
                self.vocabulary[word] = vocabulary_size
                vocabulary_size += 1

        # timestamp_ms holds a millisecond epoch; convert it directly to datetime
        # (replaces the time.ctime()/strptime round-trip, which breaks under non-English locales)
        self.start_date = datetime.fromtimestamp(int(self.start_date) / 1000)
        self.end_date = datetime.fromtimestamp(int(self.end_date) / 1000)

        print('   Corpus: %i tweets, spanning from %s to %s' % (self.size, self.start_date, self.end_date))
        print('   Vocabulary: %d distinct words' % vocabulary_size)

        self.time_slice_count = None
        self.tweet_count = None
        self.global_freq = None
        self.mention_freq = None
        self.time_slice_length = None
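# --- Standalone sketch (illustrative only): shows how the constructor above derives
# the corpus time span from Elasticsearch 'timestamp_ms' values (millisecond epoch
# strings). The timestamps and the helper name corpus_span_sketch are made up;
# only the standard library is needed.
from datetime import datetime

def corpus_span_sketch(timestamp_ms_values):
    # comparing the raw strings is safe because all timestamp_ms values have the
    # same number of digits; min/max replaces the running if/if updates above
    start_ms, end_ms = min(timestamp_ms_values), max(timestamp_ms_values)
    start_date = datetime.fromtimestamp(int(start_ms) / 1000)
    end_date = datetime.fromtimestamp(int(end_ms) / 1000)
    return start_date, end_date

start, end = corpus_span_sketch(['1512684000000', '1512773100000', '1512687600000'])
print('Corpus spanning from %s to %s' % (start, end))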