Example #1
    def load_data(self):
        print(config.get_current_time("load raw data"))
        train = pd.read_csv(config.TRAIN_DIR)
        test = pd.read_csv(config.TEST_DIR)

        ## split into train and val
        train_data, val_data = train_test_split(train,
                                                test_size=0.08,
                                                random_state=2018)
        print("Train data: {}, Valid data: {}, Test data: {}.".format(
            train.shape, val_data.shape, test.shape))

        ## fill in the missing values
        train_X = train_data["question_text"].fillna("_##_").values
        val_X = val_data["question_text"].fillna("_##_").values
        test_X = test["question_text"].fillna("_##_").values

        ## Tokenize the sentences
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts(list(train_X))
        self.word_index = tokenizer.word_index
        train_X = tokenizer.texts_to_sequences(train_X)
        val_X = tokenizer.texts_to_sequences(val_X)
        test_X = tokenizer.texts_to_sequences(test_X)

        ## Pad the sentences
        train_X = pad_sequences(train_X, maxlen=self.data_len)
        val_X = pad_sequences(val_X, maxlen=self.data_len)
        test_X = pad_sequences(test_X, maxlen=self.data_len)

        ## Get the target values
        train_y = train_data['target'].values
        val_y = val_data['target'].values

        # shuffling the data
        np.random.seed(2018)
        trn_idx = np.random.permutation(len(train_X))
        val_idx = np.random.permutation(len(val_X))

        train_X = train_X[trn_idx]
        val_X = val_X[val_idx]
        train_y = train_y[trn_idx]
        val_y = val_y[val_idx]

        train_y = to_categorical(train_y, num_classes=2)
        val_y = to_categorical(val_y, num_classes=2)

        print(config.get_current_time("return data"))
        return train_X, train_y, val_X, val_y, test_X
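A minimal usage sketch for the tuple returned above, assuming `dl` is the data-loader object and `model` an already compiled Keras classifier; the batch size and epoch count are placeholders, not part of the original example:

# Hypothetical usage of load_data(); `dl` and `model` are assumptions.
train_X, train_y, val_X, val_y, test_X = dl.load_data()
model.fit(train_X, train_y,
          validation_data=(val_X, val_y),
          batch_size=512, epochs=2)
test_pred = model.predict(test_X, batch_size=512)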
Example #2
def log_time_helper(mode, is_starting=True):
    if is_starting:
        w = "starting"
    else:
        w = "ending"
    current_time = get_current_time()
    my_logger.info("[+] Mode = {}; {} at {}".format(mode, w, current_time[0]))
    return current_time
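A small usage sketch, assuming `get_current_time()` returns a sequence whose first element is a formatted timestamp (as the `current_time[0]` indexing above suggests):

# Hypothetical usage: bracket a pipeline stage with start/end log lines.
start_time = log_time_helper("train", is_starting=True)
run_training()                                  # placeholder for the timed stage
end_time = log_time_helper("train", is_starting=False)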
Example #3
    def load_topic_info(self):
        '''
        Build the topic lookup tables: topic id -> row index and its inverse.
        :return: None; fills self.topic_dict and self.topic_dict_inv
        '''
        print(config.get_current_time(), "loading topic info")
        with open(config.TOPIC_INFO_DIR, 'r') as f:
            for index, line in enumerate(f.readlines()):
                topic_id = line.strip('\n').split('\t')[0]
                self.topic_dict[topic_id] = index
                self.topic_dict_inv[index] = topic_id
Example #4
    def load_wiki_news_em_matrix(self):
        '''
        Load the wiki-news word vectors and build self.WIKI_NEWS_EM, one row
        per entry in self.word_index; unseen words get random vectors.
        :return: None
        '''
        print(config.get_current_time("load_wiki_news_em_matrix"))
        embeddings_index = dict()

        embedding_max_value = 0
        embedding_min_value = 1

        with open(config.WIKI_NEWS_DIR, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip().split(' ')
                if len(line) != 301:
                    continue

                coefs = np.asarray(line[1:], dtype='float32')

                if np.max(coefs) > embedding_max_value:
                    embedding_max_value = np.max(coefs)
                if np.min(coefs) < embedding_min_value:
                    embedding_min_value = np.min(coefs)

                embeddings_index[line[0]] = coefs

        print(
            config.get_current_time(
                ('Found %s word vectors.' % len(embeddings_index))))

        self.WIKI_NEWS_EM = np.zeros(
            (len(self.word_index) + 1, self.EMBEDDING_DIM))
        for word, i in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words found in the embedding index keep their pretrained vector
                self.WIKI_NEWS_EM[i] = embedding_vector
            else:
                # words missing from the index get a random vector drawn from
                # the observed value range
                self.WIKI_NEWS_EM[i] = np.random.uniform(
                    low=embedding_min_value,
                    high=embedding_max_value,
                    size=self.EMBEDDING_DIM)
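The matrix built here is consumed in Example #11 as the weights of a Keras Embedding layer. A minimal sketch of that wiring, with `dl` standing in for the data-loader object:

from keras.layers import Embedding

def wiki_news_embedding_layer(dl):
    # Mirrors the wiring in Example #11: pretrained weights, fixed input
    # length, and a trainable layer.
    return Embedding(input_dim=len(dl.word_index) + 1,
                     output_dim=dl.EMBEDDING_DIM,
                     weights=[dl.WIKI_NEWS_EM],
                     input_length=dl.data_len,
                     trainable=True)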
Example #5
    def get_quesids(self):
        '''
        Read the question ids from the eval set file.
        :return: list of question id strings
        '''
        question_ids = []

        print(config.get_current_time(), 'loading question eval ids')
        with open(config.QUESTION_EVAL_SET_DIR, 'r') as f:
            for index, line in enumerate(f.readlines()):
                splitted = line.strip('\n').split('\t')
                question_ids.append(splitted[0])

        self.load_topic_info()
        return question_ids
Example #6
    def load_google_news_em_matrix(self):

        print(config.get_current_time("load_google_news_em_matrix"))
        self.GOOGLE_NEWS_EM = np.zeros(
            (len(self.word_index) + 1, self.EMBEDDING_DIM))
        model = gensim.models.KeyedVectors.load_word2vec_format(
            config.GOOGLE_NEWS_DIR, binary=True)

        for word, i in self.word_index.items():
            try:
                embedding_vector = model[word]
            except KeyError:
                # word is not in the GoogleNews vocabulary
                embedding_vector = None

            if embedding_vector is not None:
                self.GOOGLE_NEWS_EM[i] = embedding_vector
            else:
                self.GOOGLE_NEWS_EM[i] = np.random.uniform(
                    low=-0.0018054, high=0.047287, size=self.EMBEDDING_DIM)
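An optional variant (an assumption, not the original code): derive the out-of-vocabulary range from the loaded vectors instead of the hard-coded bounds above. `KeyedVectors.vectors` is the raw embedding matrix in gensim >= 1.0:

# `model` is the KeyedVectors object loaded in this example; 300 matches the
# GoogleNews vector size (EMBEDDING_DIM).
oov_low = float(model.vectors.min())
oov_high = float(model.vectors.max())
oov_vector = np.random.uniform(low=oov_low, high=oov_high, size=300)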
Example #7
    def load_charembedding_matrix(self):

        embeddings_index = dict()

        embedding_max_value = 0
        embedding_min_value = 1

        with open(config.CHAR_EMBEDDING_DIR, 'r') as f:
            for line in f:
                line = line.strip().split(' ')
                if len(line) != 257:
                    continue

                coefs = np.asarray(line[1:], dtype='float32')

                if np.max(coefs) > embedding_max_value:
                    embedding_max_value = np.max(coefs)
                if np.min(coefs) < embedding_min_value:
                    embedding_min_value = np.min(coefs)

                embeddings_index[line[0]] = coefs

        print(config.get_current_time(),
              ('Found %s char vectors.' % len(embeddings_index)))

        self.embedchar_matrix = np.zeros((len(self.char_index) + 1, 256))
        for word, i in self.char_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # chars found in the embedding index keep their pretrained vector
                self.embedchar_matrix[i] = embedding_vector
            else:
                # chars missing from the index get a random vector drawn from
                # the observed value range
                self.embedchar_matrix[i] = np.random.uniform(
                    low=embedding_min_value,
                    high=embedding_max_value,
                    size=256)
Example #8
    def load_train_data(self):
        '''
        Load the question train set and build four padded inputs
        (title_char, title_word, dsp_char, dsp_word) plus the sparse topic
        label matrix.
        :return: (titlechar_array, titleword_array, dspchar_array,
                  dspword_array, label_sparse)
        '''

        title_char_list = []
        title_word_list = []
        dsp_char_list = []
        dsp_word_list = []
        question_ids = []

        print(config.get_current_time(), 'loading question train set file')
        with open(config.QUESTION_TRAIN_SET_DIR, 'r') as f:
            for index, line in enumerate(f.readlines()):
                if index > 500:
                    break
                splitted = line.strip('\n').split('\t')

                if len(splitted) == 1:
                    continue
                elif len(splitted) == 2:
                    continue
                elif len(splitted) == 5:
                    title_char_list.append(splitted[1].replace(',', ' '))
                    title_word_list.append(splitted[2].replace(',', ' '))
                    dsp_char_list.append(splitted[3].replace(',', ' '))
                    dsp_word_list.append(splitted[4].replace(',', ' '))
                    self.max_titlechar_len = max(len(splitted[1].split(',')),
                                                 self.max_titlechar_len)
                    self.max_titleword_len = max(len(splitted[2].split(',')),
                                                 self.max_titleword_len)
                    self.max_dspchar_len = max(len(splitted[3].split(',')),
                                               self.max_dspchar_len)
                    self.max_dspword_len = max(len(splitted[4].split(',')),
                                               self.max_dspword_len)
                    question_ids.append(splitted[0])
                else:
                    continue

        # print('max titlecharlength', self.max_titlechar_len)
        # print('max titleword length', self.max_titleword_len)
        # print('max dspchar length', self.max_dspchar_len)
        # print('max dspword length', self.max_dspword_len)

        pickle.dump(self.tw_len, open(self.savedir + '/tw_len.pkl', 'wb'))
        pickle.dump(self.tc_len, open(self.savedir + '/tc_len.pkl', 'wb'))
        pickle.dump(self.dsppad_length,
                    open(self.savedir + '/dsp_pad_length.pkl', 'wb'))

        # ------titleword--------
        print(config.get_current_time(), 'tokenizer title word working')
        tokenizer_word = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer_word.fit_on_texts(title_word_list + dsp_word_list)
        sequences_titleword = tokenizer_word.texts_to_sequences(
            title_word_list)
        self.word_index = tokenizer_word.word_index
        print(config.get_current_time(),
              'Found %s unique word tokens.' % len(self.word_index))
        titleword_array = pad_sequences(sequences_titleword,
                                        maxlen=self.tw_len)  # return arrays
        pickle.dump(tokenizer_word,
                    open(self.savedir + '/tokenizer_word.pkl', 'wb'))
        print('tokenizer is saved as %s/tokenizer_word.pkl' % (self.savedir))
        # -----titlechar---------
        print(config.get_current_time(), 'tokenizer title char working')
        tokenizer_char = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer_char.fit_on_texts(title_char_list + dsp_char_list)
        sequences_titlechar = tokenizer_char.texts_to_sequences(
            title_char_list)
        self.char_index = tokenizer_char.word_index
        print(config.get_current_time(),
              'Found %s unique char tokens.' % len(self.char_index))
        titlechar_array = pad_sequences(sequences_titlechar,
                                        maxlen=self.tc_len)  # return arrays
        pickle.dump(tokenizer_char,
                    open(self.savedir + '/tokenizer_char.pkl', 'wb'))
        print('tokenizer is saved as %s/tokenizer_char.pkl' % (self.savedir))
        # -----dspchar--------
        print(config.get_current_time(), 'tokenizer dsp char working')
        sequences_dspchar = tokenizer_char.texts_to_sequences(dsp_char_list)
        dspchar_array = pad_sequences(
            sequences_dspchar, maxlen=self.dsppad_length)  # return arrays
        # ---dspword---------
        print(config.get_current_time(), 'tokenizer dsp word working')
        sequences_dspword = tokenizer_word.texts_to_sequences(dsp_word_list)
        dspword_array = pad_sequences(
            sequences_dspword, maxlen=self.dsppad_length)  # return arrays

        self.load_topic_info()

        question_to_label = {}
        print(config.get_current_time(), 'loading train labels')
        with open(config.QUESTION_TOPIC_TRAIN_DIR, 'r') as f:
            for index, line in enumerate(f.readlines()):
                # if index>100000:
                #     break
                splitted = line.strip('\n').split('\t')
                if len(splitted) != 2:
                    print('error: malformed label line %d' % index)
                    continue
                question_to_label[splitted[0]] = [
                    self.topic_dict[i] for i in splitted[1].split(',')
                ]

        print(config.get_current_time(), 'aligning train data and labels')

        row_ = []
        col_ = []
        count_1 = 0
        # label_dense = np.zeros((train_titleword_array.shape[0], 1999))
        for row, quesid in enumerate(question_ids):
            cols = question_to_label.get(quesid)
            if cols is None:
                print('error: no labels found for question %s' % quesid)
                continue
            count_1 += len(cols)
            for k in cols:
                row_.append(row)
            col_.extend(cols)

        data_ = [1] * len(row_)
        label_sparse = csr_matrix((data_, (row_, col_)),
                                  shape=(len(question_ids), 1999))
        # # Shuffle data
        # shuffle_indices = np.random.permutation(np.arange(train_titleword_array.shape[0]))
        # x_word = train_titleword_array[shuffle_indices]
        # x_char = train_titlechar_array[shuffle_indices]
        # row_ = [row_[i] for i in shuffle_indices]
        # col_ = [col_[i] for i in shuffle_indices]
        #
        # # label_dense = label_dense[shuffle_indices]
        # # label_sparse = csr_matrix(([1 for i in range(count_1))],(row_,col_)),shape = ())
        #
        # train_len = int(x_word.shape[0] * 0.9)
        # x_word_train = x_word[:train_len]
        # x_char_train = x_char[:train_len]
        # y_train = label_sparse[:train_len]
        # x_word_test = x_word[train_len:]
        # x_char_test = x_char[train_len:]
        # y_test = label_sparse[train_len:]

        # return (x_word_train, x_char_train, y_train, x_word_test, x_char_test, y_test)
        return titlechar_array, titleword_array, dspchar_array, dspword_array, label_sparse
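The commented-out block above sketches a shuffle and 90/10 split of these arrays; a runnable version of that idea (the seed is borrowed from Example #1, the ratio from the comments, and `dl` is the data-loader object):

# Shuffle and split the arrays returned by load_train_data().
titlechar_array, titleword_array, dspchar_array, dspword_array, label_sparse = \
    dl.load_train_data()

rng = np.random.RandomState(2018)
shuffle_indices = rng.permutation(titleword_array.shape[0])
x_word = titleword_array[shuffle_indices]
x_char = titlechar_array[shuffle_indices]
y_all = label_sparse[shuffle_indices]            # csr_matrix supports row indexing

train_len = int(x_word.shape[0] * 0.9)
x_word_train, x_word_test = x_word[:train_len], x_word[train_len:]
x_char_train, x_char_test = x_char[:train_len], x_char[train_len:]
y_train, y_test = y_all[:train_len], y_all[train_len:]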
Example #9
    def load_pred_data_4part(self):
        '''
        Load the question eval set and build the four padded prediction inputs.
        :return: (titlechar_array, titleword_array, dspchar_array,
                  dspword_array, question_ids)
        '''
        title_char_list = []
        title_word_list = []
        dsp_char_list = []
        dsp_word_list = []
        question_ids = []

        self.tw_len = pickle.load(open(self.savedir + '/tw_len.pkl', 'rb'))
        self.tc_len = pickle.load(open(self.savedir + '/tc_len.pkl', 'rb'))
        self.dsppad_length = pickle.load(
            open(self.savedir + '/dsp_pad_length.pkl', 'rb'))
        print('pad lengths loaded!')

        print(config.get_current_time(), 'loading question eval set file')
        with open(config.QUESTION_EVAL_SET_DIR, 'r') as f:
            for index, line in enumerate(f.readlines()):
                # if index>50000:
                #     break
                splitted = line.strip('\n').split('\t')

                if len(splitted) == 1:
                    print('error!')
                    exit()
                elif len(splitted) == 2:
                    title_char_list.append(splitted[1].replace(',', ' '))
                    title_word_list.append(" ")
                    dsp_char_list.append(" ")
                    dsp_word_list.append(" ")
                elif len(splitted) == 3:
                    title_char_list.append(splitted[1].replace(',', ' '))
                    title_word_list.append(splitted[2].replace(',', ' '))
                    dsp_char_list.append(" ")
                    dsp_word_list.append(" ")
                elif len(splitted) == 4:
                    title_char_list.append(splitted[1].replace(',', ' '))
                    title_word_list.append(splitted[2].replace(',', ' '))
                    dsp_char_list.append(splitted[3].replace(',', ' '))
                    dsp_word_list.append(" ")
                elif len(splitted) == 5:
                    title_char_list.append(splitted[1].replace(',', ' '))
                    title_word_list.append(splitted[2].replace(',', ' '))
                    dsp_char_list.append(splitted[3].replace(',', ' '))
                    dsp_word_list.append(splitted[4].replace(',', ' '))

                question_ids.append(splitted[0])

        tokenizer_word = pickle.load(
            open(self.savedir + '/tokenizer_word.pkl', 'rb'))
        tokenizer_char = pickle.load(
            open(self.savedir + '/tokenizer_char.pkl', 'rb'))
        print('tokenizers loaded!')

        print(config.get_current_time(), 'tokenizer working title char')
        titlechar_sequences_char = tokenizer_char.texts_to_sequences(
            title_char_list)
        self.char_index = tokenizer_char.word_index
        titlechar_array = pad_sequences(titlechar_sequences_char,
                                        maxlen=self.tc_len)  # return arrays

        print(config.get_current_time(), 'tokenizer working title word')
        titleword_sequences_word = tokenizer_word.texts_to_sequences(
            title_word_list)
        self.word_index = tokenizer_word.word_index
        titleword_array = pad_sequences(titleword_sequences_word,
                                        maxlen=self.tw_len)  # return arrays

        print(config.get_current_time(), 'tokenizer working dsp char')
        dspchar_sequences_char = tokenizer_char.texts_to_sequences(
            dsp_char_list)
        dspchar_array = pad_sequences(
            dspchar_sequences_char, maxlen=self.dsppad_length)  # return arrays

        print(config.get_current_time(), 'tokenizer working dsp word')
        dspword_sequences_word = tokenizer_word.texts_to_sequences(
            dsp_word_list)
        dspword_array = pad_sequences(
            dspword_sequences_word, maxlen=self.dsppad_length)  # return arrays

        self.load_topic_info()

        return titlechar_array, titleword_array, dspchar_array, dspword_array, question_ids
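A sketch of the prediction step that typically follows (the input ordering and the top-k value are assumptions; `mymodel` is the trained model wrapper built in Example #10's __main__ block):

titlechar_array, titleword_array, dspchar_array, dspword_array, ques_ids = \
    dl.load_pred_data_4part()
probs = mymodel.model.predict(
    [titlechar_array, titleword_array, dspchar_array, dspword_array],
    batch_size=256)
predlabels = np.argsort(-probs, axis=1)[:, :5]   # top-5 topic indices per question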
Example #10
    with open("final_423.csv", 'w') as f:
        for i in range(predlabels.shape[0]):
            # f.write(ques_ids[i] + "," + ','.join([topic_dict_inv[k] for k in predlabels[i]]) + '\n')
            f.write(ques_ids[i] + "," + ','.join(tmpfunc(predlabels[i])) +
                    '\n')


if __name__ == '__main__':

    if len(sys.argv) < 2:
        print('error: missing mode argument')
        exit()

    mode = sys.argv[1]

    print(config.get_current_time(), 'current mode:', mode)

    if mode == "train":

        save_root_dir = './model_exp'  # your own path, for saving models, tokenizers, ...

        dl = data_loader(save_root_dir)
        datatuple = dl.load_train_data()
        dl.load_charembedding_matrix()
        dl.load_wordembedding_matrix()

        mymodel = MultiModel(w_embed_matrix=dl.embedword_matrix,
                             c_embed_matrix=dl.embedchar_matrix,
                             word_index=dl.word_index,
                             char_index=dl.char_index,
                             titlechar_length=dl.tc_len,
Example #11
    def bulid_model(self):
        '''
        Build and compile the four-branch model: one pretrained embedding per
        branch, a forward/backward LSTM pair with max pooling in each branch,
        and an attention merge over the pooled branch outputs.
        :return: None; sets self.model
        '''

        print(config.get_current_time("building model ------"))

        # ----------- title local w2v ----------
        with tf.device('/gpu:%d' % (0)):
            tl_embedding_layer = Embedding(len(self.word_index) + 1,
                                           self.EMBEDDING_DIM,
                                           weights=[self.GLOVE_EM],
                                           input_length=self.data_len, trainable=True,
                                           embeddings_initializer=initializers.RandomUniform(minval=-0.2, maxval=0.2,
                                                                                         seed=None))
        tl_sequence_input = Input(shape=(self.data_len,), name="title_local_w2v_input")
        tl_embedded_sequences = tl_embedding_layer(tl_sequence_input)
        with tf.device('/gpu:%d' % (0)):
            tl_z_pos = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(tl_embedded_sequences)
            tl_z_neg = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(tl_embedded_sequences)
            tl_z_concat = concatenate([tl_z_pos, tl_embedded_sequences, tl_z_neg], axis=-1)

            tl_z = Dense(512, activation='tanh')(tl_z_concat)
            tl_pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(tl_z)

        # ---------- title ai w2v ----------
        with tf.device('/gpu:%d' % (0)):
            ta_embedding_layer = Embedding(len(self.word_index) + 1,
                                           self.EMBEDDING_DIM,
                                           weights=[self.GOOGLE_NEWS_EM],
                                           input_length=self.data_len, trainable=True,
                                           embeddings_initializer=initializers.RandomUniform(minval=-0.2, maxval=0.2,
                                                                                         seed=None))
        ta_sequence_input = Input(shape=(self.data_len,), name="title_ai_w2v_input")
        ta_embedded_sequences = ta_embedding_layer(ta_sequence_input)
        with tf.device('/gpu:%d' % (0)):
            ta_z_pos = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(ta_embedded_sequences)
            ta_z_neg = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(ta_embedded_sequences)
            ta_z_concat = concatenate([ta_z_pos, ta_embedded_sequences, ta_z_neg], axis=-1)

            ta_z = Dense(512, activation='tanh')(ta_z_concat)
            ta_pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(ta_z)

        # ----------- des local w2v ----------
        with tf.device('/gpu:%d' % (0)):
            dl_embedding_layer = Embedding(len(self.word_index) + 1,
                                           self.EMBEDDING_DIM,
                                           weights=[self.PARAGRAM_EM],
                                           input_length=self.data_len, trainable=True,
                                           embeddings_initializer=initializers.RandomUniform(minval=-0.2, maxval=0.2,
                                                                                         seed=None))
        dl_sequence_input = Input(shape=(self.data_len,), name="des_local_w2v_input")
        dl_embedded_sequences = dl_embedding_layer(dl_sequence_input)
        with tf.device('/gpu:%d' % (0)):
            dl_z_pos = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(dl_embedded_sequences)
            dl_z_neg = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(dl_embedded_sequences)
            dl_z_concat = concatenate([dl_z_pos, dl_embedded_sequences, dl_z_neg], axis=-1)

            dl_z = Dense(512, activation='tanh')(dl_z_concat)
            dl_pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(dl_z)

        # ---------- des ai w2v ----------
        with tf.device('/gpu:%d' % (0)):
            da_embedding_layer = Embedding(len(self.word_index) + 1,
                                           self.EMBEDDING_DIM,
                                           weights=[self.WIKI_NEWS_EM],
                                           input_length=self.data_len, trainable=True,
                                           embeddings_initializer=initializers.RandomUniform(minval=-0.2, maxval=0.2,
                                                                                         seed=None))
        da_sequence_input = Input(shape=(self.data_len,), name="des_ai_w2v_input")
        da_embedded_sequences = da_embedding_layer(da_sequence_input)
        with tf.device('/gpu:%d' % (0)):
            da_z_pos = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(da_embedded_sequences)
            da_z_neg = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(da_embedded_sequences)
            da_z_concat = concatenate([da_z_pos, da_embedded_sequences, da_z_neg], axis=-1)

            da_z = Dense(512, activation='tanh')(da_z_concat)
            da_pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(da_z)

        # ---------- att ----------
        concat_t_d = concatenate([tl_pool_rnn, ta_pool_rnn, dl_pool_rnn, da_pool_rnn], axis=-1)
        concat_t_d = Reshape((2, 512 * 2))(concat_t_d)

        attention = Dense(1, activation='tanh')(concat_t_d)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(512 * 2)(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = multiply([concat_t_d, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=-2), output_shape=(512 * 2,))(sent_representation)

        # ---------- merge_4models ----------
        model_final_ = Dense(2, activation='relu')(sent_representation)
        model_final_ = Dropout(0.5)(model_final_)
        model_final = Dense(2, activation='softmax')(model_final_)

        self.model = Model(inputs=[tl_sequence_input, ta_sequence_input, dl_sequence_input, da_sequence_input],
                           outputs=model_final)
        adam = optimizers.Adam(lr=0.00001)
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=adam,
                           metrics=[f1])

        print(self.model.summary())
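A usage sketch for training the compiled model. It assumes the same padded word sequences from load_data() (Example #1) are fed to all four branches, since every branch shares the same word_index and data_len; hyper-parameters are placeholders:

# `net` is the object on which bulid_model() was called; the embedding
# matrices (GLOVE_EM, GOOGLE_NEWS_EM, PARAGRAM_EM, WIKI_NEWS_EM) are assumed
# to have been loaded beforehand.
train_X, train_y, val_X, val_y, test_X = net.load_data()
net.bulid_model()
net.model.fit([train_X, train_X, train_X, train_X], train_y,
              validation_data=([val_X, val_X, val_X, val_X], val_y),
              batch_size=256, epochs=3)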