Example #1
    def predict_on_file(self,
                        test_path=None,
                        test_result_path=None,
                        chosen_word_dict=None):
        # test_data: label_name \t title \t text
        fw = open(test_result_path, 'w', encoding='utf-8')
        line_cnt = 0

        with open(test_path, 'r', encoding='utf-8') as fr:
            label_dict = get_label_dict()  # {label_name: label_index}
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 3:
                    continue

                y_true_label_name = buf[0].strip()
                y_true_label_index = str(label_dict[y_true_label_name])
                title = buf[1]
                text = buf[2]

                word_dict = {}  # {word: True}
                for word in (title + ' ' + text).split(' '):
                    word = word.strip()
                    if word not in chosen_word_dict:  # feature selection
                        continue
                    if word in word_dict:
                        continue
                    word_dict[word] = True

                probs = {}  # {label_name: prob}
                V = len(chosen_word_dict)
                for label_name, label_prob in self.label_prob.items():
                    prob = np.log(label_prob)
                    N_k = self.label_count[label_name]
                    for word in word_dict.keys():
                        if word not in self.label_word_prob[label_name]:
                            # Laplace smoothing for words unseen in this class
                            word_pos_prob = 1. / (N_k + V)
                        else:
                            word_pos_prob = self.label_word_prob[label_name][word]
                        prob += np.log(word_pos_prob)

                    probs[label_name] = prob

                # === Pick the label with the highest log-probability
                y_pred_label_name = max(probs.items(), key=lambda x: x[1])[0]
                y_pred_label_index = str(label_dict[y_pred_label_name])

                fw.write(y_true_label_index + '\t' + y_pred_label_index + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print(line_cnt)

        fw.close()
        print("Predict done! %s" % test_result_path)
Example #2
def dataset_generator(data_path=None,
                      epochs=10,
                      shuffle_buffer_size=1024,
                      batch_size=16,
                      max_seq_len=100,
                      word2id_dict=None):
    # input_data: label \t vec_1,vec_2,...,vec_300, vector values split by ','
    # Output: inputs [None, 300], label [None, 1]
    # Note: max_seq_len and word2id_dict are unused in this variant; each line
    #   already carries a fixed 300-dim document vector.

    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                label_name = buf[0]
                label = int(label_dict[label_name])

                inputs = np.array([float(x) for x in buf[1].split(',')],
                                  dtype=np.float32)

                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(generator,
                                             output_shapes=((300, ), (1, )),
                                             output_types=(tf.float32, tf.int32))

    return dataset.repeat(epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)
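
A quick smoke test of the pipeline, assuming a data file in the format above exists (the path here is hypothetical):

import tensorflow as tf

dataset = dataset_generator(data_path='train_vec.txt',
                            epochs=1,
                            batch_size=4)
for inputs, labels in dataset.take(1):
    print(inputs.shape)  # (4, 300)
    print(labels.shape)  # (4, 1)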
Example #3
def pretrained_dataset_generator(data_path=None,
                      epochs=10,
                      shuffle_buffer_size=1024,
                      batch_size=16):
    # input_data: label \t vec_1, vec_2, ..., vec_300, split by ','
    # Output: inputs, label; [None, 300], [None, 1]

    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                label_name = buf[0]
                label = int(label_dict[label_name])

                inputs = np.array([float(x) for x in buf[1].split(',')],
                                  dtype=np.float32)

                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(generator,
                                             output_shapes=((300, ), (1, )),
                                             output_types=(tf.float32, tf.int32))

    return dataset.repeat(epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)
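
Since each element is a fixed 300-dim document vector, this dataset can feed a small dense classifier directly. A minimal sketch (hypothetical layer sizes; 14 classes as in the test functions below):

import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(300,)),
    tf.keras.layers.Dense(14, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Hypothetical path; epoch repeats are handled inside the generator.
train_ds = pretrained_dataset_generator(data_path='train_vec.txt', epochs=1)
model.fit(train_ds)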
Example #4
def test_model(model=None, vec_path=None, result_path=None):
    # data: label_name \t vec_1, vec_2, ..., vec_300; split by ','
    # result: true_label_index \t pred_label_index

    with open(vec_path, 'r', encoding='utf-8') as fr:
        with open(result_path, 'w', encoding='utf-8') as fw:

            # {label_name: label_index}
            label_dict = get_label_dict()

            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                label_name = buf[0]
                true_label = label_dict[label_name]

                # [1, 300]
                inputs = np.array([float(x) for x in buf[1].split(',')],
                                  dtype=np.float32).reshape((-1, 300))
                # [1, num_classes=14]
                softmax = model(inputs)

                pred_label = np.argmax(softmax, axis=1)
                fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
Example #5
def compute_tfidf(data_path=None,
                  idf_dict=None,
                  word2id_dict=None,
                  tfidf_path=None):
    # data: label \t title_words \t text_words, split by '\s'
    # idf_dict: {word: idf}
    # word2id_dict: {word: index}
    # tfidf: label_index \s word_index:tfidf \s word_index:tfidf, word_index sorted ASC.

    fw = open(tfidf_path, 'w', encoding='utf-8')
    label_dict = get_label_dict()

    with open(data_path, 'r', encoding='utf-8') as fr:
        for line in fr:
            buf = line[:-1].split('\t')
            if len(buf) != 3:
                continue

            label_name = buf[0]
            label_index = label_dict[label_name]

            title = buf[1]
            text = buf[2]

            # === Count tf
            tf_dict = {}
            for word in (title + ' ' + text).split(' '):
                if word not in word2id_dict:
                    continue

                tf_dict[word] = tf_dict.get(word, 0) + 1

            # === Compute tfidf
            tfidf_dict = {}
            for word, tf in tf_dict.items():
                if word not in idf_dict:
                    continue

                idf = idf_dict[word]
                tfidf = tf * idf

                word_index = word2id_dict[word]
                tfidf_dict[word_index] = tfidf

            # === Store in the format of libsvm
            # LIBSVM: sort key ASC {key: value}.
            tfidf_list = []
            for word_index, tfidf in sorted(tfidf_dict.items(),
                                            key=lambda x: x[0]):
                tfidf_list.append(
                    str(word_index) + ':' + "{:.4f}".format(tfidf))

            if len(tfidf_list) > 0:
                fw.write(str(label_index) + ' ' + ' '.join(tfidf_list) + '\n')
        fw.close()
        print("Write done! %s " % tfidf_path)
Example #6
def test_model(model=None,
               test_path=None,
               result_path=None,
               word_vec_dict=None,
               max_seq_len=100):
    # test_data: label \t title_words \t text_words
    # result: true_label_index \t pred_label_index

    with open(test_path, 'r', encoding='utf-8') as fr:
        with open(result_path, 'w', encoding='utf-8') as fw:

            # {label_name: label_index}
            label_dict = get_label_dict()

            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')

                if len(buf) != 3:  # label \t title \t text
                    continue

                label_name = buf[0]
                true_label = label_dict[label_name]

                title = buf[1]
                text = buf[2]
                words = (title + ' ' + text).split(' ')

                inputs = []
                i = 0
                for word in words:
                    if word not in word_vec_dict:
                        continue
                    word_vec = word_vec_dict[word]
                    inputs.append(word_vec)
                    i += 1
                    if i >= max_seq_len:
                        break

                for _ in range(len(inputs), max_seq_len):
                    inputs.append([0] * 300)  # 300: embedding_dim

                # [1, max_seq_len, 300]
                inputs = np.expand_dims(np.array(inputs, dtype=np.float32),
                                        axis=0)

                # [1, num_classes=14]
                softmax = model(inputs)

                pred_label = np.argmax(softmax, axis=1)
                fw.write(str(true_label) + '\t' + str(pred_label[0]) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
Example #7
def dataset_generator(data_path=None,
                      epochs=10,
                      shuffle_buffer_size=1024,
                      batch_size=16,
                      max_seq_len=100,
                      word_vec_dict=None):
    # input_data: label \t title_words \t text_words, words split by ' '
    # Output: inputs [max_seq_len, 300] (word vectors, zero-padded),
    #   label [1]; shorter sequences are padded, longer ones truncated.

    # {label_name: label_index}
    label_dict = get_label_dict()

    def generator():
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                buf = line[:-1].split('\t')
                if len(buf) != 3:
                    continue
                label_name = buf[0]
                label = int(label_dict[label_name])

                title = buf[1]
                text = buf[2]

                words = (title + ' ' + text).split(' ')

                inputs = []
                for word in words:
                    if word not in word_vec_dict:
                        continue
                    inputs.append(word_vec_dict[word])
                    if len(inputs) >= max_seq_len:
                        break

                for i in range(len(inputs), max_seq_len):
                    # embedding_dim: 300, hard code here
                    inputs.append([0] * 300)

                inputs = np.array(inputs, dtype=np.float32)
                yield inputs, [label]

    dataset = tf.data.Dataset.from_generator(generator,
                                             output_shapes=((
                                                 max_seq_len,
                                                 300,
                                             ), (1, )),
                                             output_types=(tf.float32,
                                                           tf.int32))

    return dataset.repeat(epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)
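
Each element here is a [max_seq_len, 300] matrix, so a sequence model fits. A minimal sketch of a compatible classifier (hypothetical layer sizes; 14 classes as elsewhere in this file):

import tensorflow as tf

model = tf.keras.Sequential([
    # input: (max_seq_len=100, embedding_dim=300)
    tf.keras.layers.Conv1D(128, 3, activation='relu', input_shape=(100, 300)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(14, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Hypothetical paths; word_vec_dict loaded as sketched after Example #6.
train_ds = dataset_generator(data_path='train.txt',
                             epochs=1,
                             word_vec_dict=load_word_vec_dict('vec.txt'))
model.fit(train_ds)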
Example #8
def tfidf_to_libsvm_format(tfidf_path=None,
                           libsvm_path=None,
                           word2id_dict_path=None):
    # tfidf: label_name \t word:tfidf \s word:tfidf
    # libsvm: label_index \s word_index:tfidf, in sorted word_index
    # word2id_dict: {word: index}, start from 0
    with open(word2id_dict_path, 'rb') as fr:
        word2id_dict = pickle.load(fr)
        print("#word2id_dict=%d" % len(word2id_dict))

    with open(libsvm_path, 'w', encoding='utf-8') as fw:
        with open(tfidf_path, 'r', encoding='utf-8') as fr:

            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 2:
                    continue
                label_name = buf[0]
                label_index = label_dict[label_name]

                libsvm_dict = {}

                for pair in buf[1].split(' '):
                    items = pair.split(':')
                    if len(items) != 2:
                        continue
                    word, tfidf = items
                    if word not in word2id_dict:
                        continue
                    word_index = word2id_dict[word]
                    libsvm_dict[word_index] = tfidf

                libsvm_list = []
                # === sort in word_index ASC
                for word_index, tfidf in sorted(libsvm_dict.items(),
                                                key=lambda x: x[0]):
                    libsvm_list.append(str(word_index) + ':' + tfidf)

                fw.write(str(label_index) + ' ' + ' '.join(libsvm_list) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)
Example #9
def word_to_libsvm_format(text_path=None,
                          libsvm_path=None,
                          word2id_dict_path=None,
                          df_info_dict_path=None):
    # text: label_name \t title_words \t text_words; words split by '\s'
    # libsvm: label_index \s word_index:tfidf, in sorted word_index
    # word2id_dict: {word: index}, start from 0
    # df_info: {'df_dict': df_dict, 'total_num_doc': total_num_doc}
    # df_dict: {word: document_frequency}

    with open(word2id_dict_path, 'rb') as fr:
        word2id_dict = pickle.load(fr)
        print("#word2id_dict=%d" % len(word2id_dict))

    with open(df_info_dict_path, 'rb') as fr:
        df_info_dict = pickle.load(fr)
        total_num_doc = df_info_dict['total_num_doc']
        df_dict = df_info_dict['df_dict']
        print('total_num_doc %d' % total_num_doc)
        print("#df_dict=%d" % len(df_dict))

    with open(libsvm_path, 'w', encoding='utf-8') as fw:
        with open(text_path, 'r', encoding='utf-8') as fr:

            label_dict = get_label_dict()
            line_cnt = 0
            for line in fr:
                buf = line[:-1].split('\t')
                if len(buf) != 3:
                    continue
                label_name = buf[0]
                label_index = label_dict[label_name]

                title = buf[1]
                text = buf[2]

                # === count tf in current doc
                tf_dict = {}  # {word: tf_count}
                for word in (title + ' ' + text).split(' '):
                    if (len(word) == 0 or word not in word2id_dict
                            or word not in df_dict):
                        continue

                    tf_dict[word] = tf_dict.get(word, 0) + 1

                # === compute tf-idf
                tfidf_dict = {}  #{word_index: tfidf}
                for word, tf in tf_dict.items():
                    word_index = word2id_dict[word]
                    df = int(df_dict[word])
                    tfidf = tf * np.log(total_num_doc / (df + 1))
                    tfidf_dict[word_index] = tfidf

                # === sort in word_index ASC
                libsvm_list = []
                for word_index, tfidf in sorted(tfidf_dict.items(),
                                                key=lambda x: x[0]):
                    libsvm_list.append(
                        str(word_index) + ':' + "{:.4f}".format(tfidf))

                fw.write(str(label_index) + ' ' + ' '.join(libsvm_list) + '\n')
                line_cnt += 1
                if line_cnt % 1000 == 0:
                    print(line_cnt)
            print("Total line %d" % line_cnt)