Example #1
File: datasets.py Project: zbxzc35/boew
import pickle

# get_20ng and WordVectorizer are helpers provided by the surrounding project

def transform_20ng_dataset(output_file):
    """
    Preprocesses the 20NG dataset and stores the preprocessed data in a pickle file
    :param output_file: path of the output pickle file
    :return:
    """

    train_data, train_labels = get_20ng(split='train')
    test_data, test_labels = get_20ng(split='test')

    wv = WordVectorizer()
    wv.fit(train_data)
    glove_embedding = wv.get_glove_embedding()

    train_data = wv.transform(train_data)
    test_data = wv.transform(test_data)
    data = {}
    data['train_data'] = train_data
    data['test_data'] = test_data
    data['train_labels'] = train_labels
    data['test_labels'] = test_labels
    data['word2id'] = wv.word2id
    data['id2word'] = wv.id2word
    data['embedding'] = glove_embedding

    with open(output_file, "wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
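
A minimal usage sketch for the example above, loading the pickle it writes back into memory. The file name 20ng_dataset.pickle is only an illustrative placeholder, and the last line assumes the embedding returned by get_glove_embedding is a numpy array:

import pickle

# hypothetical output path, produced earlier by something like
# transform_20ng_dataset("20ng_dataset.pickle")
with open("20ng_dataset.pickle", "rb") as f:
    data = pickle.load(f)

# the stored dictionary bundles the vectorized splits with the
# vocabulary mappings and the GloVe embedding matrix
print(len(data["train_data"]), len(data["test_data"]))
print(len(data["word2id"]), len(data["id2word"]))
print(data["embedding"].shape)  # assumes a numpy array (n_words x embedding_dim)
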
Example #2

import os
import gensim
from tweet import Tweet
from word_vectorizer import WordVectorizer
import csv

#load the word2vec model
word2vecModule = gensim.models.KeyedVectors.load_word2vec_format(
    './embed_tweets_de_200M_200D/embedding_file', binary=False)

#initialize the WordVectorizer
word_vectorizer = WordVectorizer(word2vecModule)

# load train data file to get the tweets
with open('../tweetsCrawler/train.csv', 'r') as train_data:
    #write each tweet and its matrix to vectors.csv
    with open('./vectors.csv', 'w') as vectors_data:

        writer = csv.writer(vectors_data)
        #write header row to vectors.csv
        writer.writerow([
            "politician_name", "party", "tweet", "matrix",
            "percentageOfMissingWords"
        ])
        #rows to be written to vectors.csv
        out_rows = []

        reader = csv.DictReader(train_data)
        for row in reader:
            current_tweet = Tweet(row['tweet'])
            #preprocess the tweet, and get list of tokens
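
The loop body is cut off in this excerpt. Below is a hypothetical sketch of how it could continue, indented to slot in where the excerpt breaks off; the token accessor current_tweet.tokens, the train.csv column names mirrored from the header row, and the missing-word bookkeeping are all illustrative assumptions, not the project's actual code:

            # --- hypothetical continuation, not the original project code ---
            tokens = current_tweet.tokens            # assumed accessor for the preprocessed tokens
            matrix, missing = [], 0
            for token in tokens:
                if token in word2vecModule:          # gensim KeyedVectors membership test
                    matrix.append(word2vecModule[token].tolist())
                else:
                    missing += 1                     # token absent from the embedding vocabulary
            percentage_missing = 100.0 * missing / len(tokens) if tokens else 0.0
            out_rows.append([
                row['politician_name'], row['party'], row['tweet'],
                matrix, percentage_missing
            ])

        writer.writerows(out_rows)
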
Example #3

import numpy as np
from tqdm import tqdm
# WordVectorizer is the project's own word/sentence vectorizer helper


class RelationVectorizer():

    def __init__(self, word2vec_model_path, max_tokens_length, position_vector=True, word_position_size=10, ner=False, pos=False, dependency=False):

        self.position_vector = position_vector

        # word_position vectors will be filled when calling init_size function
        self.word_position = None
        self.word_position_size = word_position_size
        self.sentence_vectorizer = WordVectorizer(word2vec_model_path, ner=ner, pos=pos, dependency=dependency)

        # sizes of the output sequence matrix: m is the number of words in the sequence,
        # n is the size of the vector representation of each word in the sequence
        self.m = max_tokens_length
        self.n = self.sentence_vectorizer.model.vector_size + 2 * self.word_position_size
        # original index = -l+1,....,0,...l-1
        # array index    = 0,.......,(l-1),...(2xl)-1
        self.word_position = np.random.rand((2 * max_tokens_length) - 1, self.word_position_size)

    def tokens_to_vec(self, tokens):
        sentence_vec = []
        for token in tokens:
            vec = self.sentence_vectorizer.word2vec(token)
            if len(vec.shape) == 0:
                print('length vec == 0')
                return []
            sentence_vec.append(vec)

        sentence_vec = np.array(sentence_vec, dtype=np.float32)
        return sentence_vec

    def transform(self, sentence_matrix, labels):
        sentence_matrix_out = np.zeros([0, self.m, self.n], np.float32)
        valid_label = []

        count = 0
        for sentence_elements, label in tqdm(zip(sentence_matrix, labels)):
            count += 1
            sentence_vec = self.tokens_to_vec(sentence_elements["tokens"])
            if len(sentence_vec) == 0:
                print('line %d sentence vector is null' % count)
                continue
            entity1_vec = self.lookup_word_pos(sentence_elements["ent1_pos"], self.m)  # dimension m x word_position_size
            entity2_vec = self.lookup_word_pos(sentence_elements["ent2_pos"], self.m)  # dimension m x word_position_size

            pad_size = self.m - sentence_vec.shape[0]
            if pad_size > 0:
                temp = np.zeros((pad_size, self.sentence_vectorizer.model.vector_size))
                sentence_vec = np.vstack([sentence_vec, temp])

            #  merging different parts of vector representation of words
            sentence_matrix_vec = np.hstack([sentence_vec, entity1_vec, entity2_vec])
            sentence_matrix_out = np.append(sentence_matrix_out, [sentence_matrix_vec], axis=0)
            valid_label.append(label)
        return sentence_matrix_out, valid_label

    def lookup_word_pos(self, p, sentence_length):
        """
        :param p: position of entity
        :return: array of dimension self.m x self.word_position_size

        example : if ent1 = 2 self.m = 10   i.e. : (w0, w1, w2(e1), w3, w4, w5, w6, w7, w8, w9)
                  return: word_position[-2:8]   ===
                  add (l-1) to get indices between (0,2l-1) ===>  word_position[7:17]
        """
        start = -p + sentence_length - 1
        end = start + sentence_length
        return self.word_position[start:end]
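
A small standalone check of the relative-position lookup above, using the same numbers as the docstring example (entity at token index 2, m = 10); it only illustrates the index arithmetic, not the full class:

import numpy as np

m = 10                                    # max_tokens_length
word_position_size = 10
# one row per possible relative offset -(m-1) ... (m-1), i.e. 2*m - 1 rows
word_position = np.random.rand(2 * m - 1, word_position_size)

p = 2                                     # entity sits at token index 2
start = -p + m - 1                        # 7: offset -p shifted by (m - 1)
end = start + m                           # 17
print(word_position[start:end].shape)     # (10, 10): one position vector per token

# in transform, each token row then gets
# n = vector_size + 2 * word_position_size columns: the word vector plus the
# two entity-relative position vectors stacked side by side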