def getDataSets(self, training_paths, training_labeled_paths, dev_paths, dev_labeled_paths, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(training_paths)
    ent_x1, ent_x2 = self.getEntData(x1_text, x2_text, training_labeled_paths, max_document_length)
    add_fea = self.getAdditionalFeature(x1_text, x2_text)
    # print(add_fea)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    vocab = vocab_processor.vocabulary_.__dict__['_reverse_mapping']
    self.saveMap(vocab)
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    print(x1[0])
    print(ent_x1[0])
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    ent_x1_shuffled = ent_x1[shuffle_indices]
    ent_x2_shuffled = ent_x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    add_fea_shuffled = add_fea[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    del ent_x1
    del ent_x2
    if dev_paths is None:
        # Split train/dev set
        self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
        # TODO: This is very crude, should use cross-validation
        x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
        x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
        ent_x1_train, ent_x1_dev = ent_x1_shuffled[:dev_idx], ent_x1_shuffled[dev_idx:]
        ent_x2_train, ent_x2_dev = ent_x2_shuffled[:dev_idx], ent_x2_shuffled[dev_idx:]
        y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
        # Split the additional features and raw dev texts the same way; without this,
        # add_fea_train, add_fea_dev, x1_dev_text and x2_dev_text would be undefined
        # in this branch when dev_set is built below.
        add_fea_train, add_fea_dev = add_fea_shuffled[:dev_idx], add_fea_shuffled[dev_idx:]
        x1_dev_text = x1_text[shuffle_indices][dev_idx:]
        x2_dev_text = x2_text[shuffle_indices][dev_idx:]
        print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    else:
        x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train = \
            x1_shuffled, x2_shuffled, ent_x1_shuffled, ent_x2_shuffled, y_shuffled, add_fea_shuffled
        x1_dev_text, x2_dev_text, y_dev = self.getTsvData(dev_paths)
        ent_x1_dev, ent_x2_dev = self.getEntData(x1_dev_text, x2_dev_text, dev_labeled_paths, max_document_length)
        add_fea_dev = self.getAdditionalFeature(x1_dev_text, x2_dev_text)
        x1_dev = np.asarray(list(vocab_processor.transform(x1_dev_text)))
        x2_dev = np.asarray(list(vocab_processor.transform(x2_dev_text)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train)
    dev_set = (x1_dev, x2_dev, ent_x1_dev, ent_x2_dev, y_dev, add_fea_dev, x1_dev_text, x2_dev_text)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

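# A minimal usage sketch for the getDataSets variant above. Assumptions: `inpH` is
# an InputHelper-style instance and the path variables are placeholders, not names
# from this repo. It only shows how the returned tuples unpack: train_set carries
# six aligned arrays, dev_set additionally carries the raw dev texts.
train_set, dev_set, vocab_processor, n_batches = inpH.getDataSets(
    training_paths, training_labeled_paths, None, None,
    max_document_length=15, percent_dev=10, batch_size=64)
x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train = train_set
x1_dev, x2_dev, ent_x1_dev, ent_x2_dev, y_dev, add_fea_dev, x1_dev_text, x2_dev_text = dev_set
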
def myGetDataSets(self, cursor, max_document_length, percent_dev, batch_size, is_char_based, number_of_samples):  # edited
    start_time = time.time()
    cursor.execute('select * from dataset_sentence')
    end_time = time.time()
    print('Time elapsed on running select all: {} seconds.'.format(round(end_time - start_time, 2)))
    start_time = time.time()
    tuples = cursor.fetchmany(number_of_samples)
    end_time = time.time()
    print('Time elapsed on fetching {} lines: {} seconds.'.format(number_of_samples, round(end_time - start_time, 2)))
    x1_text = np.asarray([i[0] for i in tuples])
    x2_text = np.asarray([i[1] for i in tuples])
    y = np.asarray([i[2] for i in tuples])
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format('dataset_id', len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getAquaintTestDataSet(self, data_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getAquaintData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, y

def getPCADataSet(self, data_path, vocab_path, max_document_length):
    x1_temp = self.getJsonPCAData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    del vocab_processor
    gc.collect()
    return x1, x1, np.ones(len(x1))

def getTestDataSet_infer(self, x1_infer, x2_infer, vocab_path, max_document_length):
    x1_temp, x2_temp = np.asarray(x1_infer), np.asarray(x2_infer)
    # x1_temp, x2_temp = self.getTsvTestData_infer(x1_infer, x2_infer)
    # print('This is x1_temp: ', type(x1_temp), x1_temp)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print('len vocab: ', len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2

def getTestDataSet(self, data_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, y

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(training_paths)
    # print('x1_text= {}'.format(x1_text))
    # print('x2_text= {}'.format(x2_text))
    # print('y= {}'.format(y))
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    print('dev_idx= {}'.format(dev_idx))
    # for idx, item in enumerate(y_shuffled):
    #     print('idx={}\n x1_shuffled={}\n x2_shuffled={}\n y_shuffled={}'.format(
    #         idx, x1_shuffled[idx], x2_shuffled[idx], y_shuffled[idx]))
    # exit(0)
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getTestDataSet(self, data_path, ent_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)
    ent_x1, ent_x2 = self.getEntData(x1_temp, x2_temp, ent_path, max_document_length)
    add_fea_test = self.getAdditionalFeature(x1_temp, x2_temp)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, ent_x1, ent_x2, y, x1_temp, x2_temp, add_fea_test

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
    # Tao: x1_text, x2_text, y are all 1-D np arrays
    x1_text, x2_text, y = self.getTsvData(training_paths)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    # Tao: x1 and x2 are both 2-D arrays
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    # Tao:
    # root@docker:/opt/siamese_nn/deep-siamese-text-similarity-master# wc -l person_match.train2
    # 13198 person_match.train2
    # Train/Dev split for person_match.train2: 35634/3960 --> 13198 * 3 = 39594, 35634 + 3960 = 39594
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

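# Worked example of the negative-index split above, using the counts from the
# person_match.train2 note (assuming percent_dev = 10): with len(y_shuffled) = 39594,
# dev_idx = -1 * 39594 * 10 // 100 = -3960, so y_shuffled[:dev_idx] keeps the first
# 35634 rows for training and y_shuffled[dev_idx:] keeps the last 3960 rows for dev.
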
def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size, is_char_based):
    if is_char_based:
        x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)
    else:
        x1_text, x2_text, y = self.getTsvData(training_paths)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)
    # After concatenating the two arrays, fit_transform() first fits the vocabulary
    # on the data and then converts the data into its standard (index) form
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))  # returns a shuffled copy of the index array
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getDataSets(self, data, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(data)
    # Build vocabulary
    logger.info("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    logger.info("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Re-weight the positive samples so the effective positive rate matches pos_rate
    pos_rate = 0.165
    pos_num = y.sum()
    neg_num = y.shape[0] - y.sum()
    logger.info('pos_rate: %s, target pos_rate: %s, pos_num: %s' % (pos_num / y.shape[0], pos_rate, pos_num))
    w = (neg_num * pos_rate) / (pos_num * (1 - pos_rate))
    sample_weight = np.where(y == 1, w, 1)
    calc_pos_rate = (w * pos_num) / (w * pos_num + neg_num)
    logger.info('calc pos_rate: %s' % calc_pos_rate)
    # Stratified split: only the first of the five folds is used as the dev set
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    for train_idx, test_idx in cv.split(x1, y):
        break
    x1_train, x1_dev = x1[train_idx], x1[test_idx]
    x2_train, x2_dev = x2[train_idx], x2[test_idx]
    y_train, y_dev = y[train_idx], y[test_idx]
    sample_weight = (sample_weight[train_idx], sample_weight[test_idx])
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches, sample_weight

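# A worked example of the re-weighting above (hypothetical counts, not taken from
# any dataset): with pos_num = 1000, neg_num = 9000 and the target pos_rate = 0.165,
# w = (9000 * 0.165) / (1000 * 0.835) ≈ 1.778, and the re-weighted positive share
# (w * 1000) / (w * 1000 + 9000) ≈ 0.165, i.e. exactly the requested pos_rate.
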
def getWords(self, word1, word2, vocab_path, max_document_length):
    temp1 = []
    temp2 = []
    temp1.append(word1.lower())
    temp2.append(word2.lower())
    x1_temp = np.asarray(temp1)
    x2_temp = np.asarray(temp2)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, np.asarray(-1)

def getEmbeddingsMap(self, cursor, max_document_length, num_docs):
    print('Loading sentences')
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    ids, sentences = map(list, zip(*datagen.get_sentences_list(cursor, num_docs)))
    # print('Memory (after): {}Mb\n'.format(mem_profile.memory_usage()))
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=False)
    # sentences_array = np.asarray(sentences)  # line in which memory error occurs with full list of datasets (size = 6620242)
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    start_time = time.time()
    vocab_processor.fit_transform(sentences)
    end_time = time.time()
    print('Time elapsed on vocabulary fitting (fit_transform): {} seconds.'.format(round(end_time - start_time, 2)))
    # print('Memory (after): {}Mb'.format(mem_profile.memory_usage()))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    print('Vocabulary created!\n')
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    start_time = time.time()
    embeddings = np.asarray(list(vocab_processor.transform(sentences)))
    end_time = time.time()
    print('Time elapsed on sentences to word ids (transform): {} seconds.'.format(round(end_time - start_time, 2)))
    # print('Memory (after): {}Mb\n'.format(mem_profile.memory_usage()))
    print('Embeddings generated in memory!')
    gc.collect()
    return dict(zip(ids, embeddings)), vocab_processor

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size): x1_text, x2_text, y = self.getTsvData(training_paths) # print('x1_text= {}'.format(x1_text)) # print('x2_text= {}'.format(x2_text)) # print ('y= {}'.format(y)) # Build vocabulary print("Building vocabulary") vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0) vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0)) print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_))) sum_no_of_batches = 0 x1 = np.asarray(list(vocab_processor.transform(x1_text))) x2 = np.asarray(list(vocab_processor.transform(x2_text))) # Randomly shuffle data np.random.seed(131) shuffle_indices = np.random.permutation(np.arange(len(y))) x1_shuffled = x1[shuffle_indices] x2_shuffled = x2[shuffle_indices] y_shuffled = y[shuffle_indices] dev_idx = -1 * len(y_shuffled) * percent_dev // 100 print('dev_idx= {}'.format(dev_idx)) del x1 del x2 # Split train/test set self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0) # TODO: This is very crude, should use cross-validation x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:] x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:] y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:] print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev))) sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size) train_set = (x1_train, x2_train, y_train) dev_set = (x1_dev, x2_dev, y_dev) gc.collect() return train_set, dev_set, vocab_processor, sum_no_of_batches
def toVocabularyIndexVector(self, datax1, datax2, vocab_path, max_document_length):
    """
    Transform the word lists into vocabulary-index vectors.
    :param datax1:
    :param datax2:
    :param vocab_path:
    :param max_document_length:
    :return:
    """
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    datax1 = preprocess_arr(datax1)
    datax2 = preprocess_arr(datax2)
    x1 = np.asarray(list(vocab_processor.transform(datax1)))
    x2 = np.asarray(list(vocab_processor.transform(datax2)))
    del vocab_processor
    gc.collect()
    return x1, x2

class InputHelper():
    def __init__(self, data_dir, input_file, batch_size, sequence_length, is_train=True):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        vocab_file = os.path.join(data_dir, 'useWords.model.vec')
        # vocab_file = os.path.join(data_dir, 'vocab.pkl')
        input_file = os.path.join(data_dir, input_file)
        if not os.path.exists(vocab_file):
            print('reading train file')
            self.preprocess(input_file, vocab_file)
        else:
            print('loading vocab file')
            self.load_vocab(vocab_file)
        if is_train:
            self.create_batches(input_file)
            self.reset_batch()

    def preprocess(self, input_file, vocab_file, min_freq=2):
        token_freq = defaultdict(int)
        for line in open(input_file):
            seq1, seq2, label = line.rstrip().split('\t')
            seq = seq1 + ' ' + seq2
            for token in seq.split(' '):
                token_freq[token] += 1
        token_list = [w for w in token_freq.keys() if token_freq[w] >= min_freq]
        token_list.append('<pad>')
        token_dict = {token: index for index, token in enumerate(token_list)}
        with open(vocab_file, 'w') as f:
            cPickle.dump(token_dict, f)
        self.token_dictionary = token_dict
        self.vocab_size = len(self.token_dictionary)

    def load_vocab(self, vocab_file):
        self.token_dictionary = dict()
        for line in open(vocab_file):
            l = line.strip().split()
            st = l[0].decode('utf-8')
            self.token_dictionary[st] = np.asarray(l[1:])
        # self.vocab_size = len(self.token_dictionary)
        # with open(vocab_file, 'rb') as f:
        #     self.token_dictionary = cPickle.load(f)
        #     self.vocab_size = len(self.token_dictionary)

    def text_to_array(self, text, is_clip=True):
        words = [w for w in jieba.cut(text[0]) if w.strip()]
        words1 = [TOKENIZER_RE.findall(w)[0] for w in words if TOKENIZER_RE.findall(w)]
        if is_clip:
            words1 = words1[:self.sequence_length]
        return words1
        # seq_ids = [int(self.token_dictionary.get(token)) for token in text
        #            if self.token_dictionary.get(token) is not None]
        # if is_clip:
        #     seq_ids = seq_ids[:self.sequence_length]
        # return seq_ids

    def getTsvData(self, filepath):
        print("Loading training data from " + filepath)
        x1 = []
        x2 = []
        y = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 4:  # l[1], l[2] and l[3] are read below, so four fields are required
                continue
            if random() > 0.5:
                x1.append(l[1])
                x2.append(l[2])
            else:
                x1.append(l[2])
                x2.append(l[1])
            y.append(int(l[3]))
        return np.asarray(x1), np.asarray(x2), np.asarray(y)

    def getTsvTestData(self, filepath):
        print("Loading testing/labelled data from " + filepath)
        x1 = []
        x2 = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 3:  # l[1] and l[2] are read below, so three fields are required
                continue
            x1.append(l[1])
            x2.append(l[2])
        return np.asarray(x1), np.asarray(x2)

    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp, x2_temp = self.getTsvTestData(data_path)
        # Restore vocabulary
        self.vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        self.vocab_processor = self.vocab_processor.restore(vocab_path)
        print(len(self.vocab_processor.vocabulary_))
        x1 = np.asarray(list(self.vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(self.vocab_processor.transform(x2_temp)))
        return x1, x2

    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
""" data = np.asarray(data) print(data) print(data.shape) data_size = len(data) num_batches_per_epoch = int(len(data) / batch_size) + 1 for epoch in range(num_epochs): # Shuffle the data at each epoch if shuffle: shuffle_indices = np.random.permutation(np.arange(data_size)) shuffled_data = data[shuffle_indices] else: shuffled_data = data for batch_num in range(num_batches_per_epoch): start_index = batch_num * batch_size end_index = min((batch_num + 1) * batch_size, data_size) yield shuffled_data[start_index:end_index] def padding_seq(self, seq_array, padding_index): for i in xrange(len(seq_array), self.sequence_length): seq_array.append(padding_index) def create_batches(self, text_file): x1 = [] x2 = [] y = [] seq1_array = [] seq2_array = [] # padding_index = self.vocab_size - 1 for line in open(text_file): _, seq1, seq2, label = line.rstrip().split('\t') # seq1_array = self.text_to_array(seq1.decode('utf-8').split(' ')) # seq2_array = self.text_to_array(seq2.decode('utf-8').split(' ')) # self.padding_seq(seq1_array, padding_index) # self.padding_seq(seq2_array, padding_index) label = int(label) x1.append(seq1) x2.append(seq2) y.append(label) self.vocab_processor = MyVocabularyProcessor(self.sequence_length, min_frequency=0) self.vocab_processor.fit_transform(np.concatenate((x2, x1), axis=0)) x1_1 = np.asarray(list(self.vocab_processor.transform(x1))) x2_1 = np.asarray(list(self.vocab_processor.transform(x2))) # x1 = np.array(x1) # x2 = np.array(x2) y = np.array(y) self.num_samples = len(y) self.num_batches = self.num_samples / self.batch_size indices = np.random.permutation(self.num_samples) self.x1 = x1_1[indices] self.x2 = x2_1[indices] self.y = y[indices] def next_batch(self): begin = self.pointer end = self.pointer + self.batch_size x1_batch = self.x1[begin:end] x2_batch = self.x2[begin:end] y_batch = self.y[begin:end] new_pointer = self.pointer + self.batch_size if new_pointer >= self.num_samples: self.eos = True else: self.pointer = new_pointer return x1_batch, x2_batch, y_batch def reset_batch(self): self.pointer = 0 self.eos = False