import pickle

# get_20ng and WordVectorizer are project-local helpers, imported elsewhere.
def transform_20ng_dataset(output_file):
    """
    Preprocess the 20NG dataset and store the preprocessed data.

    :param output_file: path of the pickle file to write the preprocessed data to
    :return:
    """
    train_data, train_labels = get_20ng(split='train')
    test_data, test_labels = get_20ng(split='test')

    # fit the vectorizer on the training split only, then reuse its vocabulary
    wv = WordVectorizer()
    wv.fit(train_data)
    glove_embedding = wv.get_glove_embedding()

    train_data = wv.transform(train_data)
    test_data = wv.transform(test_data)

    data = {}
    data['train_data'] = train_data
    data['test_data'] = test_data
    data['train_labels'] = train_labels
    data['test_labels'] = test_labels
    data['word2id'] = wv.word2id
    data['id2word'] = wv.id2word
    data['embedding'] = glove_embedding

    with open(output_file, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
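# A minimal usage sketch (assumed, not from the original code): the output
# path "20ng.pkl" and the downstream loading step are hypothetical, but the
# dictionary keys match the ones written by transform_20ng_dataset above.
import pickle

transform_20ng_dataset("20ng.pkl")

with open("20ng.pkl", "rb") as f:
    data = pickle.load(f)

# the stored vocabulary maps should agree in size, and labels round-trip
assert len(data['word2id']) == len(data['id2word'])
print(len(data['train_labels']), len(data['test_labels']))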
import os
import csv

import gensim

from tweet import Tweet
from word_vectorizer import WordVectorizer

# load the word2vec model
word2vecModule = gensim.models.KeyedVectors.load_word2vec_format(
    './embed_tweets_de_200M_200D/embedding_file', binary=False)

# initialize the WordVectorizer model
word_vectorizer = WordVectorizer(word2vecModule)

# load the train data file to get the tweets
with open('../tweetsCrawler/train.csv', 'r') as train_data:
    # write each tweet and its matrix to vectors.csv
    with open('./vectors.csv', 'w') as vectors_data:
        writer = csv.writer(vectors_data)
        # write the header row to vectors.csv
        writer.writerow([
            "politician_name", "party", "tweet", "matrix",
            "percentageOfMissingWords"
        ])
        # rows to be written to vectors.csv
        out_rows = []
        reader = csv.DictReader(train_data)
        for row in reader:
            current_tweet = Tweet(row['tweet'])
            # preprocess the tweet and get the list of tokens
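# Self-contained sketch of the read-transform-write CSV pattern the script
# above follows, with in-memory dummy data standing in for train.csv and for
# the vectorizer output (every name and value below is illustrative, not the
# project's API).
import csv
import io

dummy_train = io.StringIO(
    "politician_name,party,tweet\n"
    "A. Example,XY,hello world\n"
)

out = io.StringIO()
writer = csv.writer(out)
writer.writerow([
    "politician_name", "party", "tweet", "matrix", "percentageOfMissingWords"
])

out_rows = []
for row in csv.DictReader(dummy_train):
    matrix = [[0.0, 0.0]]   # placeholder for the tweet's vector matrix
    missing = 0.0           # placeholder for the missing-word percentage
    out_rows.append([row["politician_name"], row["party"], row["tweet"],
                     matrix, missing])

writer.writerows(out_rows)
print(out.getvalue())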
import numpy as np
from tqdm import tqdm

from word_vectorizer import WordVectorizer


class RelationVectorizer:
    def __init__(self, word2vec_model_path, max_tokens_length,
                 position_vector=True, word_position_size=10,
                 ner=False, pos=False, dependency=False):
        self.position_vector = position_vector
        self.word_position_size = word_position_size
        self.sentence_vectorizer = WordVectorizer(word2vec_model_path,
                                                  ner=ner, pos=pos,
                                                  dependency=dependency)
        # sizes of the output sequence matrix:
        # m is the number of words in the sequence,
        # n is the size of the vector representation of each word
        self.m = max_tokens_length
        self.n = self.sentence_vectorizer.model.vector_size + 2 * self.word_position_size
        # position embeddings for relative offsets:
        # original index = -(l-1), ..., 0, ..., l-1
        # array index    = 0, ..., l-1, ..., 2l-2
        self.word_position = np.random.rand((2 * max_tokens_length) - 1,
                                            self.word_position_size)

    def tokens_to_vec(self, tokens):
        sentence_vec = []
        for token in tokens:
            vec = self.sentence_vectorizer.word2vec(token)
            if len(vec.shape) == 0:
                print('length vec == 0')
                return []
            sentence_vec.append(vec)
        return np.array(sentence_vec, dtype=np.float32)

    def transform(self, sentence_matrix, labels):
        sentence_matrix_out = np.zeros([0, self.m, self.n], np.float32)
        valid_label = []
        count = 0
        for sentence_elements, label in tqdm(zip(sentence_matrix, labels)):
            count += 1
            sentence_vec = self.tokens_to_vec(sentence_elements["tokens"])
            if len(sentence_vec) == 0:
                print('line %d sentence vector is null' % count)
                continue
            # position embeddings relative to each entity, dimension m x word_position_size
            entity1_vec = self.lookup_word_pos(sentence_elements["ent1_pos"], self.m)
            entity2_vec = self.lookup_word_pos(sentence_elements["ent2_pos"], self.m)
            # zero-pad the word vectors up to m tokens
            pad_size = self.m - sentence_vec.shape[0]
            if pad_size > 0:
                temp = np.zeros((pad_size, self.sentence_vectorizer.model.vector_size))
                sentence_vec = np.vstack([sentence_vec, temp])
            # concatenate word vectors with both position embeddings
            sentence_matrix_vec = np.hstack([sentence_vec, entity1_vec, entity2_vec])
            sentence_matrix_out = np.append(sentence_matrix_out, [sentence_matrix_vec], axis=0)
            valid_label.append(label)
        return sentence_matrix_out, valid_label

    def lookup_word_pos(self, p, sentence_length):
        """
        :param p: position of the entity in the sentence
        :return: array of dimension self.m x self.word_position_size

        Example: if ent1 = 2 and self.m = 10,
        i.e. (w0, w1, w2(e1), w3, w4, w5, w6, w7, w8, w9),
        the relative offsets select word_position[-2:8]; adding (l-1) maps
        the indices into (0, 2l-1), giving word_position[7:17].
        """
        start = -p + sentence_length - 1
        end = start + sentence_length
        return self.word_position[start:end]
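# Worked check of the lookup_word_pos index arithmetic (a standalone sketch;
# the toy sizes below are illustrative, not taken from the original code).
# With m = 10 tokens and an entity at position p = 2, each token i has
# relative offset i - p in [-2, 7]; adding (m - 1) shifts it into [7, 16],
# so the slice word_position[7:17] picks one position embedding per token.
import numpy as np

m, p, word_position_size = 10, 2, 10
word_position = np.random.rand(2 * m - 1, word_position_size)

start = -p + m - 1          # 7
end = start + m             # 17
entity_vec = word_position[start:end]

assert entity_vec.shape == (m, word_position_size)
# row i of the slice corresponds to relative offset (i - p)
for i in range(m):
    assert np.array_equal(entity_vec[i], word_position[(i - p) + (m - 1)])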