    def add_sent_freq_feature(self, data_type):
        """
        Idea from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain (Quora Question Pairs competition).
        Magic features based on question frequency: the idea behind them is that a question asked often has a
        higher chance of being duplicated. A minimal standalone sketch of this trick follows this snippet.
        """

        feat_file = self.format_feature_file(data_type, 'sent_freq')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            sents_dict, p_vc, h_vc = self.get_sent_freq()
            data = pd.DataFrame(self.get_data(data_type))
            data['p_hash'] = data['premise'].map(sents_dict)
            data['h_hash'] = data['hypothesis'].map(sents_dict)
            data['p_freq'] = data['p_hash'].map(
                lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
            data['h_freq'] = data['h_hash'].map(
                lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
            data['freq_mean'] = (data['p_freq'] + data['h_freq']) / 2
            data['freq_cross'] = data['p_freq'] * data['h_freq']
            data['p_freq_sq'] = data['p_freq'] * data['p_freq']
            data['h_freq_sq'] = data['h_freq'] * data['h_freq']

            features = data[[
                'p_freq', 'h_freq', 'freq_mean', 'freq_cross', 'p_freq_sq',
                'h_freq_sq'
            ]].values
            pickle_dump(feat_file, features)
        return features
    def tfidf_model(self):
        print('Logging Info - Get Tf-idf model...')
        tfidf_model_path = os.path.join(FEATURE_DIR,
                                        '{}_tfidf.model'.format(self.genre))
        dict_path = os.path.join(FEATURE_DIR,
                                 '{}_tfidf.dict'.format(self.genre))
        if os.path.exists(tfidf_model_path):
            dictionary = pickle_load(dict_path)
            tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            corpus = [
                text.split() for text in self.train_data['premise'] +
                self.train_data['hypothesis'] + self.dev_data['premise'] +
                self.dev_data['hypothesis'] + self.test_data['premise'] +
                self.test_data['hypothesis']
            ]
            dictionary = corpora.Dictionary(corpus)
            corpus = [dictionary.doc2bow(text) for text in corpus]
            tfidf_model = TfidfModel(corpus)

            del corpus
            tfidf_model.save(tfidf_model_path)
            pickle_dump(dict_path, dictionary)

        return dictionary, tfidf_model
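A minimal standalone sketch of the sentence-frequency "magic feature" used in add_sent_freq_feature above, on made-up toy data (column names only mirror the method; p_vc and h_vc play the roles of the dicts returned by get_sent_freq):

import pandas as pd

# Toy sentence pairs; how often a sentence re-appears becomes the feature.
df = pd.DataFrame({
    'premise':    ['a man sleeps', 'a dog runs', 'a man sleeps'],
    'hypothesis': ['a man rests',  'a dog runs', 'a man rests'],
})

# Map every distinct sentence from either column to an integer id.
sents = pd.concat([df['premise'], df['hypothesis']]).drop_duplicates().reset_index(drop=True)
sent2id = {sent: idx for idx, sent in sents.items()}

df['p_hash'] = df['premise'].map(sent2id)
df['h_hash'] = df['hypothesis'].map(sent2id)

# How often each sentence id occurs on the premise / hypothesis side.
p_vc = df['p_hash'].value_counts().to_dict()
h_vc = df['h_hash'].value_counts().to_dict()

df['p_freq'] = df['p_hash'].map(lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
df['h_freq'] = df['h_hash'].map(lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
print(df[['p_freq', 'h_freq']])  # repeated sentences get higher counts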
Example #3
def main(args):
    set_seed(args.seed)

    elapsed, context_predictions = context_selection(args,
                                                     args.context_config_json)
    logger.info(
        f"Finished context selection, got {len(context_predictions)} paragraphs "
        f"({elapsed:.2f}s elapsed)")

    elapsed, qa_predictions = question_answering(args, args.qa_config_json,
                                                 context_predictions)
    logger.info(
        f"Finished question answering, got {len(qa_predictions)} answer spans "
        f"({elapsed:.2f}s elapsed)")

    predictions = {
        d["id"]: d["answer"]
        for d in map(postprocess, qa_predictions)
    }
    json_dump(predictions, args.predict_json, ensure_ascii=False)

    if args.tmp_dir:
        args.tmp_dir.mkdir(parents=True, exist_ok=True)
        pickle_dump(context_predictions,
                    args.tmp_dir / "context_predictions.pkl")
        pickle_dump(qa_predictions, args.tmp_dir / "qa_predictions.pkl")
    def add_tfidf_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'tfidf')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            dictionary, tfidf_model = self.tfidf_model()
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                hypothesis = hypothesis.split()
                p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)])
                h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)])
                features.append([
                    np.sum(list(p_tfidf.values())),
                    np.sum(list(h_tfidf.values())),
                    np.mean(list(p_tfidf.values())),
                    np.mean(list(h_tfidf.values()))
                ])
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : tfidf feature shape : {}'.format(
            data_type, features.shape))
        return features
    def add_similarity_feature(self, data_type, feat_type, sim_func):
        feat_file = self.format_feature_file(data_type, feat_type)
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            len_dist_feat = np.array([
                sim_func(p, h) for p, h in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis'])
            ])
            features = self.check_and_expand_shape(len_dist_feat)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : {} feature shape : {}'.format(
            data_type, feat_type, features.shape))
        return features
    def gen_all_features(self, data_type, scaled=False):
        if scaled:
            feat_file = self.format_feature_file(data_type, 'all_scaled')
        else:
            feat_file = self.format_feature_file(data_type, 'all')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            features = list()
            feat_types = [('len_dis', length_distance),
                          ('lcs_seq', lcs_seq_norm),
                          ('lcs_str', lcs_str_1_norm),
                          ('edit_dist', edit_distance),
                          ('jaro', jaro_distance),
                          ('jaro_winkler', jaro_winkler_dist), ('fuzz', fuzzy),
                          ('simhash', simhash), ('w_share', word_share),
                          ('w_ngram_dist', word_ngram_distance),
                          ('c_ngram_ol', char_ngram_overlap),
                          ('w_ngram_ol', word_ngram_overlap)]
            for feat_type, sim_func in feat_types:
                features.append(
                    self.add_similarity_feature(data_type, feat_type,
                                                sim_func))

            features.append(
                self.add_weighted_word_ngram_overlap_feature(data_type))
            features.append(self.add_tfidf_feature(data_type))
            features.append(self.add_word_power_feature(data_type))
            features.append(self.add_graph_feature(data_type))
            features = np.concatenate(features, axis=-1)

            if scaled:
                scaler = StandardScaler()
                features = scaler.fit_transform(features)
                joblib.dump(
                    scaler,
                    os.path.join(FEATURE_DIR,
                                 '{}_scaler.model'.format(self.genre)))

            pickle_dump(feat_file, features)

        print('Logging Info - {} : all feature shape : {}'.format(
            data_type, features.shape))
        return features
    def add_word_power_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'word_power')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            power_word = self.get_power_word()
            num_least = 100
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                hypothesis = hypothesis.split()

                rate = [1.0, 1.0]
                share_words = list(set(premise).intersection(set(hypothesis)))
                for word in share_words:
                    if word not in power_word:
                        continue
                    if power_word[word][0] * power_word[word][5] < num_least:
                        # the shared word must appear on both sides in at least num_least sentence pairs
                        continue
                    # shared word, but the pair is not a correct one (label != 2)
                    rate[0] *= (1.0 - power_word[word][6])
                p_diff = list(set(premise).difference(set(hypothesis)))
                h_diff = list(set(hypothesis).difference(set(premise)))
                all_diff = set(p_diff + h_diff)
                for word in all_diff:
                    if word not in power_word:
                        continue
                    if power_word[word][0] * power_word[word][3] < num_least:
                        # the word must appear on one side only in at least num_least sentence pairs
                        continue
                    # non-shared word, but the pair is a correct one (label == 2)
                    rate[1] *= (1.0 - power_word[word][4])
                rate = [1 - num for num in rate]
                features.append(rate)
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : word_power feature shape : {}'.format(
            data_type, features.shape))
        return features
    def generate_graph(self):
        print('Logging Info - Get graph...')
        sent2id_path = os.path.join(FEATURE_DIR,
                                    '{}_graph_sent2id.pkl'.format(self.genre))
        graph_path = os.path.join(FEATURE_DIR,
                                  '{}_graph.pkl'.format(self.genre))
        if os.path.exists(graph_path):
            sent2id = pickle_load(sent2id_path)
            graph = pickle_load(graph_path)
        else:
            sent2id = {}  # sentence to id
            graph = nx.Graph()
            for data_type in ['train', 'dev', 'test']:
                for premise, hypothesis in zip(
                        self.get_data(data_type)['premise'],
                        self.get_data(data_type)['hypothesis']):
                    if premise not in sent2id:
                        sent2id[premise] = len(sent2id)
                    if hypothesis not in sent2id:
                        sent2id[hypothesis] = len(sent2id)
                    p_id = sent2id[premise]
                    h_id = sent2id[hypothesis]

                    match = 0.0
                    premise = premise.split()
                    hypothesis = hypothesis.split()
                    for w1 in premise:
                        if w1 in hypothesis:
                            match += 1

                    if len(premise) + len(hypothesis) == 0:
                        weight = 0.0
                    else:
                        weight = 2.0 * (match /
                                        (len(premise) + len(hypothesis)))
                    graph.add_edge(p_id, h_id, weight=weight)
            pickle_dump(sent2id_path, sent2id)
            pickle_dump(graph_path, graph)
        return sent2id, graph
    def add_weighted_word_ngram_overlap_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'w_ngram_ol_tfidf')
        if os.path.exists(feat_file):
            features = pickle_load(feat_file)
        else:
            dictionary, tfidf_model = self.tfidf_model()
            idf_model = tfidf_model.idfs
            features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                premise = premise.split()
                p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)])
                input_premise = [
                    (word, idf_model.get(dictionary.token2id.get(word, 0),
                                         0.0),
                     p_tfidf.get(dictionary.token2id.get(word, 0), 0.0))
                    for word in premise
                ]

                hypothesis = hypothesis.split()
                h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)])
                input_hypothesis = [
                    (word, idf_model.get(dictionary.token2id.get(word, 0),
                                         0.0),
                     h_tfidf.get(dictionary.token2id.get(word, 0), 0.0))
                    for word in hypothesis
                ]
                features.append(
                    weighted_word_ngram_overlap(input_premise,
                                                input_hypothesis))
            features = np.array(features)
            pickle_dump(feat_file, features)
        print('Logging Info - {} : w_ngram_ol_tfidf feature shape : {}'.format(
            data_type, features.shape))
        return features
    def get_sent_freq(self):
        print('Logging Info - Get sentence frequency...')
        sents_dict_path = os.path.join(FEATURE_DIR,
                                       '{}_sent_dict.pkl'.format(self.genre))
        p_vc_path = os.path.join(FEATURE_DIR,
                                 '{}_premise_vc.pkl'.format(self.genre))
        h_vc_path = os.path.join(FEATURE_DIR,
                                 '{}_hypothesis_vc.pkl'.format(self.genre))
        if os.path.exists(p_vc_path):
            sents_dict = pickle_load(sents_dict_path)
            p_vc = pickle_load(p_vc_path)
            h_vc = pickle_load(h_vc_path)
        else:
            train_data = pd.DataFrame(self.train_data)
            dev_data = pd.DataFrame(self.dev_data)
            test_data = pd.DataFrame(self.test_data)
            all_data = pd.concat([train_data, dev_data, test_data])

            df1 = all_data[['premise']]
            df2 = all_data[['hypothesis']]
            df2.rename(columns={'hypothesis': 'premise'}, inplace=True)

            train_sents = pd.concat([df1, df2])
            train_sents.drop_duplicates(subset=['premise'], inplace=True)
            train_sents.reset_index(inplace=True, drop=True)

            sents_dict = pd.Series(train_sents.index.values,
                                   index=train_sents.premise.values).to_dict()
            all_data['p_hash'] = all_data['premise'].map(sents_dict)
            all_data['h_hash'] = all_data['hypothesis'].map(sents_dict)

            p_vc = all_data.p_hash.value_counts().to_dict()
            h_vc = all_data.h_hash.value_counts().to_dict()

            pickle_dump(sents_dict_path, sents_dict)
            pickle_dump(p_vc_path, p_vc)
            pickle_dump(h_vc_path, h_vc)
            del train_data, dev_data, test_data, all_data
        return sents_dict, p_vc, h_vc
Example #11
        word_ids_test = create_token_ids_matrix(word_tokenizer,
                                                raw_data[variation],
                                                config.word_max_len)

        # prepare n-gram input
        vectorizer = pickle_load(
            format_filename(PROCESSED_DATA_DIR,
                            VECTORIZER_TEMPLATE,
                            variation=variation,
                            type='binary',
                            level='char',
                            ngram_range=(2, 3)))
        n_gram_test = vectorizer.transform(raw_data[variation])

        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TEST_DATA_TEMPLATE,
                            variation=variation), {'sentence': test_data})
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TEST_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='word'), {'sentence': word_ids_test})
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TEST_NGRAM_DATA_TEMPLATE,
                            variation=variation,
                            type='binary',
                            level='char',
                            ngram_range=(2, 3)), {'sentence': n_gram_test})
    def get_power_word(self):
        """
        计算数据中词语的影响力,格式如下:
        词语 --> [0. 出现语句对数量,1. 出现语句对比例,2. 正确语句对比例,3. 单侧语句对比例,4. 单侧语句对正确比例,
                 5. 双侧语句对比例,6. 双侧语句对正确比例]
        """
        print('Logging Info - Get power word...')
        words_power_path = os.path.join(FEATURE_DIR,
                                        '{}_power_word.pkl'.format(self.genre))
        if os.path.exists(words_power_path):
            words_power = pickle_load(words_power_path)
        else:
            words_power = {}
            x_a = [
                text.split() for text in self.train_data['premise'] +
                self.dev_data['premise'] + self.test_data['premise']
            ]
            x_b = [
                text.split() for text in self.train_data['hypothesis'] +
                self.dev_data['hypothesis'] + self.test_data['hypothesis']
            ]
            y = self.train_data['label'] + self.dev_data[
                'label'] + self.test_data['label']
            for i in range(len(x_a)):
                label = y[i]
                q1_words = x_a[i]
                q2_words = x_b[i]
                all_words = set(q1_words + q2_words)
                q1_words = set(q1_words)
                q2_words = set(q2_words)
                for word in all_words:
                    if word not in words_power:
                        words_power[word] = [0. for _ in range(7)]
                    words_power[word][0] += 1.  # number of sentence pairs containing the word
                    words_power[word][1] += 1.  # later normalised to the ratio of pairs containing the word

                    if (word in q1_words) != (word in q2_words):
                        words_power[word][3] += 1.  # one-side-only occurrences
                        if label == 0:
                            words_power[word][2] += 1.  # correct pairs
                            words_power[word][4] += 1.  # correct one-side-only pairs
                    if (word in q1_words) and (word in q2_words):
                        words_power[word][5] += 1.  # both-side occurrences
                        if label == 2:
                            words_power[word][2] += 1.  # correct pairs
                            words_power[word][6] += 1.  # correct both-side pairs

            for word in words_power:
                words_power[word][1] /= len(x_a)  # ratio of pairs containing the word = pair count / total pairs
                words_power[word][2] /= words_power[word][0]  # correct ratio = correct pairs / pair count
                if words_power[word][3] > 1e-6:
                    words_power[word][4] /= words_power[word][3]  # one-side correct ratio = correct one-side / one-side count
                words_power[word][3] /= words_power[word][0]  # one-side ratio = one-side count / pair count
                if words_power[word][5] > 1e-6:
                    words_power[word][6] /= words_power[word][5]  # both-side correct ratio = correct both-side / both-side count
                words_power[word][5] /= words_power[word][0]  # both-side ratio = both-side count / pair count
            del x_a, x_b, y
            pickle_dump(words_power_path, words_power)

        return words_power
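A toy, self-contained walk-through of the words_power bookkeeping above (sentences and labels are made up; as in the method, label 2 is treated as the "correct" pair class and one-side correctness is counted for label 0):

# Slot layout after normalisation: [pair count, pair ratio, correct ratio,
# one-side ratio, one-side correct ratio, both-side ratio, both-side correct ratio]
premises = ['a man sleeps', 'a dog runs']
hypotheses = ['a man rests', 'a dog runs']
labels = [0, 2]

power = {}
for p, h, label in zip(premises, hypotheses, labels):
    p_words, h_words = set(p.split()), set(h.split())
    for word in p_words | h_words:
        stats = power.setdefault(word, [0.0] * 7)
        stats[0] += 1  # pairs the word appears in
        stats[1] += 1  # later divided by the total number of pairs
        if (word in p_words) != (word in h_words):
            stats[3] += 1  # one-side-only occurrence
            if label == 0:
                stats[2] += 1
                stats[4] += 1
        else:  # the word appears on both sides
            stats[5] += 1
            if label == 2:
                stats[2] += 1
                stats[6] += 1

for stats in power.values():
    stats[1] /= len(premises)
    stats[2] /= stats[0]
    if stats[3]:
        stats[4] /= stats[3]
    stats[3] /= stats[0]
    if stats[5]:
        stats[6] /= stats[5]
    stats[5] /= stats[0]

print(power['dog'])  # [1.0, 0.5, 1.0, 0.0, 0.0, 1.0, 1.0]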
Example #13
    if not os.path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR)
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not os.path.exists(MODEL_SAVED_DIR):
        os.makedirs(MODEL_SAVED_DIR)
    if not os.path.exists(SUBMIT_DIR):
        os.makedirs(SUBMIT_DIR)
    if not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # load knowledge base data
    mention_to_entity, entity_to_mention, entity_desc, entity_type = load_kb_data(
        KB_FILENAME)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME),
        mention_to_entity)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME),
                entity_desc)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_TYPE_FILENAME),
                entity_type)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR, ENTITY_TO_MENTION_FILENAME),
        entity_to_mention)

    # load training data
    train_data = load_train_data(CCKS_TRAIN_FILENAME)

    # prepare character embedding
    char_vocab, idx2char, char_corpus = load_char_vocab_and_corpus(
        entity_desc, train_data)
Example #14
File: preprocess.py  Project: schan27/DMT
def prepare_skip_ngram_feature(vectorizer_type, level, ngram, skip_k,
                               train_data, dev_data, variation):
    if level not in ['word', 'char']:
        raise ValueError('Vectorizer Level Not Understood: {}'.format(level))

    if vectorizer_type == 'binary':
        vectorizer = CountVectorizer(binary=True,
                                     tokenizer=make_skip_tokenize(
                                         ngram, skip_k, level))
    elif vectorizer_type == 'tf':
        vectorizer = CountVectorizer(binary=False,
                                     tokenizer=make_skip_tokenize(
                                         ngram, skip_k, level))
    elif vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(
            tokenizer=make_skip_tokenize(ngram, skip_k, level))
    else:
        raise ValueError(
            'Vectorizer Type Not Understood: {}'.format(vectorizer_type))

    train_ngram_feature = vectorizer.fit_transform(train_data['sentence'])
    train_ngram_data = {
        'sentence': train_ngram_feature,
        'label': train_data['label']
    }

    dev_ngram_feature = vectorizer.transform(dev_data['sentence'])
    dev_ngram_data = {
        'sentence': dev_ngram_feature,
        'label': dev_data['label']
    }

    print(
        'Logging info - {}_{}_vectorizer_{}_{}_{} : train_skip_ngram_feature shape: {}, '
        'dev_skip_ngram_feature shape: {}'.format(variation, vectorizer_type,
                                                  level, ngram, skip_k,
                                                  train_ngram_feature.shape,
                                                  dev_ngram_feature.shape))

    # pickle can't pickle lambda functions, so dill is used here: https://github.com/uqfoundation/dill
    with open(
            format_filename(PROCESSED_DATA_DIR,
                            VECTORIZER_TEMPLATE,
                            variation=variation,
                            type=vectorizer_type,
                            level=level,
                            ngram_range='%d_%d' % (ngram, skip_k)),
            'wb') as writer:

        dill.dump(vectorizer, writer)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        TRAIN_NGRAM_DATA_TEMPLATE,
                        variation=variation,
                        type=vectorizer_type,
                        level=level,
                        ngram_range='%d_%d' % (ngram, skip_k)),
        train_ngram_data)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DEV_NGRAM_DATA_TEMPLATE,
                        variation=variation,
                        type=vectorizer_type,
                        level=level,
                        ngram_range='%d_%d' % (ngram, skip_k)), dev_ngram_data)
    return vectorizer, train_ngram_data, dev_ngram_data
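A minimal sketch of the dill round-trip relied on above: a vectorizer whose tokenizer is a closure (as make_skip_tokenize returns) cannot go through the standard pickle module, but dill can serialise it. make_tokenize below is a hypothetical stand-in for make_skip_tokenize:

import dill
from sklearn.feature_extraction.text import CountVectorizer


def make_tokenize(ngram):
    # returns a closure, which plain pickle refuses to serialise
    return lambda text: [text[i:i + ngram] for i in range(len(text) - ngram + 1)]


vectorizer = CountVectorizer(binary=True, tokenizer=make_tokenize(2))
vectorizer.fit(['abcd', 'bcde'])

with open('vectorizer.dill', 'wb') as writer:
    dill.dump(vectorizer, writer)
with open('vectorizer.dill', 'rb') as reader:
    restored = dill.load(reader)

print(restored.transform(['abc']).toarray())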
Example #15
File: preprocess.py  Project: schan27/DMT
def prepare_ngram_feature(vectorizer_type, level, ngram_range, train_data,
                          dev_data, variation):
    if level not in ['word', 'char', 'char_wb']:
        raise ValueError('Vectorizer Level Not Understood: {}'.format(level))
    if not isinstance(ngram_range, tuple):
        raise ValueError('ngram_range should be a tuple, got {}'.format(
            type(ngram_range)))
    if vectorizer_type == 'binary':
        vectorizer = CountVectorizer(binary=True,
                                     analyzer=level,
                                     ngram_range=ngram_range)
    elif vectorizer_type == 'tf':
        vectorizer = CountVectorizer(binary=False,
                                     analyzer=level,
                                     ngram_range=ngram_range)
    elif vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer=level, ngram_range=ngram_range)
    else:
        raise ValueError(
            'Vectorizer Type Not Understood: {}'.format(vectorizer_type))

    train_ngram_feature = vectorizer.fit_transform(train_data['sentence'])
    train_ngram_data = {
        'sentence': train_ngram_feature,
        'label': train_data['label']
    }

    dev_ngram_feature = vectorizer.transform(dev_data['sentence'])
    dev_ngram_data = {
        'sentence': dev_ngram_feature,
        'label': dev_data['label']
    }

    print(
        'Logging info - {}_{}_vectorizer_{}_{} : train_ngram_feature shape: {}, '
        'dev_ngram_feature shape: {}'.format(variation, vectorizer_type, level,
                                             ngram_range,
                                             train_ngram_feature.shape,
                                             dev_ngram_feature.shape))

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        VECTORIZER_TEMPLATE,
                        variation=variation,
                        type=vectorizer_type,
                        level=level,
                        ngram_range=ngram_range), vectorizer)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        TRAIN_NGRAM_DATA_TEMPLATE,
                        variation=variation,
                        type=vectorizer_type,
                        level=level,
                        ngram_range=ngram_range), train_ngram_data)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DEV_NGRAM_DATA_TEMPLATE,
                        variation=variation,
                        type=vectorizer_type,
                        level=level,
                        ngram_range=ngram_range), dev_ngram_data)
    return vectorizer, train_ngram_data, dev_ngram_data
    def add_graph_feature(self, data_type):
        feat_file = self.format_feature_file(data_type, 'graph')
        if os.path.exists(feat_file):
            graph_features = pickle_load(feat_file)
        else:
            sent2id, graph = self.generate_graph()

            n2clique = {}
            cliques = []
            for clique in nx.find_cliques(graph):
                for n in clique:
                    if n not in n2clique:
                        n2clique[n] = []
                    n2clique[n].append(len(cliques))
                cliques.append(clique)

            n2cc = {}
            ccs = []
            for cc in nx.connected_components(graph):
                for n in cc:
                    n2cc[n] = len(ccs)
                ccs.append(cc)

            pagerank = nx.pagerank(graph, alpha=0.9, max_iter=100)

            hits_h, hits_a = nx.hits(graph, max_iter=100)

            indegree_features = list()
            clique_features = list()
            cc_features = list()
            pagerank_features = list()
            hits_features = list()
            shortestpath_features = list()
            # neighbor_features = list()
            for premise, hypothesis in zip(
                    self.get_data(data_type)['premise'],
                    self.get_data(data_type)['hypothesis']):
                p_id = sent2id[premise]
                h_id = sent2id[hypothesis]

                # graph in-degree features
                indegree_features.append(
                    [graph.degree[p_id], graph.degree[h_id]])

                # clique features
                edge_max_clique_size = 0
                num_clique = 0
                for clique_id in n2clique[p_id]:
                    if h_id in cliques[clique_id]:
                        edge_max_clique_size = max(edge_max_clique_size,
                                                   len(cliques[clique_id]))
                        num_clique += 1
                clique_features.append([edge_max_clique_size, num_clique])

                lnode_max_clique_size = 0
                rnode_max_clique_size = 0
                for clique_id in n2clique[p_id]:
                    lnode_max_clique_size = max(lnode_max_clique_size,
                                                len(cliques[clique_id]))

                for clique_id in n2clique[h_id]:
                    rnode_max_clique_size = max(rnode_max_clique_size,
                                                len(cliques[clique_id]))

                clique_features[-1] += [
                    lnode_max_clique_size, rnode_max_clique_size,
                    max(lnode_max_clique_size, rnode_max_clique_size),
                    min(lnode_max_clique_size, rnode_max_clique_size)
                ]

                # connected components features
                cc_features.append([len(ccs[n2cc[p_id]])])

                # page rank features
                pr1 = pagerank[p_id] * 1e6
                pr2 = pagerank[h_id] * 1e6
                pagerank_features.append(
                    [pr1, pr2,
                     max(pr1, pr2),
                     min(pr1, pr2), (pr1 + pr2) / 2.])

                # graph hits features
                h1 = hits_h[p_id] * 1e6
                h2 = hits_h[h_id] * 1e6
                a1 = hits_a[p_id] * 1e6
                a2 = hits_a[h_id] * 1e6
                hits_features.append([
                    h1, h2, a1, a2,
                    max(h1, h2),
                    max(a1, a2),
                    min(h1, h2),
                    min(a1, a2), (h1 + h2) / 2., (a1 + a2) / 2.
                ])

                # graph shortest path features
                shortest_path = -1
                weight = graph[p_id][h_id]['weight']
                graph.remove_edge(p_id, h_id)
                if nx.has_path(graph, p_id, h_id):
                    shortest_path = nx.dijkstra_path_length(graph, p_id, h_id)
                graph.add_edge(p_id, h_id, weight=weight)
                shortestpath_features.append([shortest_path])

                # graph neighbour features
                # l = []
                # r = []
                # l_nb = graph.neighbors(p_id)
                # r_nb = graph.neighbors(h_id)
                # for n in l_nb:
                #     if (n != h_id) and (n != p_id):
                #         l.append(graph[p_id][n]['weight'])
                # for n in r_nb:
                #     if (n != h_id) and (n != p_id):
                #         r.append(graph[h_id][n]['weight'])
                # if len(l) == 0 or len(r) == 0:
                #     neighbor_features.append([0.0] * 11)
                # else:
                #     neighbor_features.append(l + r +
                #                              [len(list((set(l_nb).union(set(r_nb))) ^ (set(l_nb) ^ set(r_nb))))])

            graph_features = np.concatenate(
                (np.array(indegree_features), np.array(clique_features),
                 np.array(cc_features), np.array(pagerank_features),
                 np.array(hits_features), np.array(shortestpath_features)),
                axis=-1)
            pickle_dump(feat_file, graph_features)
        print('Logging Info - {} : graph feature shape : {}'.format(
            data_type, graph_features.shape))
        return graph_features
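A minimal standalone sketch of the networkx statistics that add_graph_feature extracts, on a tiny made-up sentence graph (integer node ids stand in for the sent2id values):

import networkx as nx

g = nx.Graph()
g.add_edge(0, 1, weight=0.8)  # premise 0 <-> hypothesis 1
g.add_edge(1, 2, weight=0.5)
g.add_edge(0, 2, weight=0.3)
g.add_edge(3, 4, weight=0.9)  # a second, disconnected pair

print(g.degree[0])                                    # node degree feature
print([len(c) for c in nx.find_cliques(g)])           # clique sizes, e.g. [3, 2]
print([len(c) for c in nx.connected_components(g)])   # connected component sizes
print(nx.pagerank(g, alpha=0.9)[0])                   # pagerank score of node 0
hubs, authorities = nx.hits(g, max_iter=100)          # HITS hub / authority scores
print(hubs[0], authorities[0])
print(nx.dijkstra_path_length(g, 0, 2))               # weighted shortest path (0.3)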
Example #17
def main():
    process_conf = ProcessConfig()
    # create directory
    if not os.path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR)
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not os.path.exists(MODEL_SAVED_DIR):
        os.makedirs(MODEL_SAVED_DIR)
    if not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # load SNLI, MultiNLI and MLI datasets
    data_train, data_dev, data_test = load_data()
    print('Logging Info - Data: train - {}, dev - {}, test - {}'.format(data_train.shape, data_dev.shape,
                                                                        data_test.shape))

    for genre in GENRES:
        if genre not in data_train.index:
            continue

        analyze_result = {}

        genre_train = data_train.loc[genre]
        genre_dev = data_dev.loc[genre]
        genre_test = data_test.loc[genre]   # might be None
        print('Logging Info - Genre: {}, train - {}, dev - {}, test - {}'.format(genre, genre_train.shape,
                                                                                 genre_dev.shape, genre_test.shape))
        analyze_result.update({'train_set': len(genre_train), 'dev_set': len(genre_dev),
                               'test_set': 0 if genre_test is None else len(genre_test)})

        genre_train_data = process_data(genre_train, process_conf.clean, process_conf.stem)
        genre_dev_data = process_data(genre_dev, process_conf.clean, process_conf.stem)

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(genre_train_data['label'])
        analyze_result.update(dict(('train_cls_{}'.format(cls), percent) for cls, percent in train_label_distribution.items()))
        dev_label_distribution = analyze_class_distribution(genre_dev_data['label'])
        analyze_result.update(dict(('dev_cls_{}'.format(cls), percent) for cls, percent in dev_label_distribution.items()))

        # create tokenizer and vocabulary
        sentences_train = genre_train_data['premise'] + genre_train_data['hypothesis']
        sentences_dev = genre_dev_data['premise'] + genre_dev_data['hypothesis']

        word_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=False)
        char_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)    # just fit on train data
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Genre: {}, word_vocab: {}, char_vocab: {}'.format(genre, len(word_tokenizer.word_index),
                                                                                len(char_tokenizer.word_index)))
        analyze_result.update({'word_vocab': len(word_tokenizer.word_index),
                               'char_vocab': len(char_tokenizer.word_index)})

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(sentences_train, level='word')
        analyze_result.update(dict(('word_{}'.format(k), v) for k, v in word_len_distribution.items()))
        char_len_distribution, char_max_len = analyze_len_distribution(sentences_train, level='char')
        analyze_result.update(dict(('char_{}'.format(k), v) for k, v in char_len_distribution.items()))

        train_word_ids = create_data_matrices(word_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer, genre_train_data, process_conf.padding,
                                              process_conf.truncating, process_conf.n_class, char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, genre_dev_data, process_conf.padding,
                                            process_conf.truncating, process_conf.n_class, char_max_len)

        # create embedding matrix from pretrained word vectors
        glove_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['glove_cc'], word_tokenizer.word_index)
        fasttext_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_cc'], word_tokenizer.word_index)
        fasttext_wiki = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_wiki'], word_tokenizer.word_index)
        # create embedding matrices by training on the NLI data
        w2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)
        w_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        c_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre), genre_train_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre), genre_dev_data)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'word'), train_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'char'), train_char_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'word'), dev_word_ids)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'char'), dev_char_ids)

        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'glove_cc'), glove_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_cc'), fasttext_cc)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_wiki'), fasttext_wiki)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w2v_nil'), w2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c2v_nil'), c2v_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_fasttext_nil'), w_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_fasttext_nil'), c_fasttext_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_glove_nil'), w_glove_nil)
        np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_glove_nil'), c_glove_nil)

        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'word'), word_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'char'), char_tokenizer)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'word'), word_tokenizer.word_index)
        pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'char'), char_tokenizer.word_index)

        if genre_test is not None:
            genre_test_data = process_data(genre_test, process_conf.clean, process_conf.stem)
            test_label_distribution = analyze_class_distribution(genre_test_data['label'])
            analyze_result.update(
                dict(('test_cls_%d' % cls, percent) for cls, percent in test_label_distribution.items()))

            test_word_ids = create_data_matrices(word_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class,
                                                 word_max_len)
            test_char_ids = create_data_matrices(char_tokenizer, genre_test_data, process_conf.padding,
                                                 process_conf.truncating, process_conf.n_class,
                                                 char_max_len)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre), genre_test_data)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'word'), test_word_ids)
            pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'char'), test_char_ids)

        # save analyze result
        analyze_result['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        write_log(format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, genre), analyze_result)
Example #18
File: preprocess.py  Project: schan27/DMT
def process_data():
    config = ModelConfig()

    # create dir
    if not path.exists(PROCESSED_DATA_DIR):
        os.makedirs(PROCESSED_DATA_DIR)
    if not path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not path.exists(MODEL_SAVED_DIR):
        os.makedirs(MODEL_SAVED_DIR)
    if not path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # load datasets
    data_train, data_dev = load_data()
    print('Logging Info - Data: train - {}, dev - {}'.format(
        data_train.shape, data_dev.shape))

    for variation in VARIATIONS:
        if variation not in data_train.index:
            continue

        analyze_result = {}
        variation_train = data_train.loc[variation]
        variation_dev = data_dev.loc[variation]

        print('Logging Info - Variation: {}, train - {}, dev - {}'.format(
            variation, variation_train.shape, variation_dev.shape))
        analyze_result.update({
            'train_set': len(variation_train),
            'dev_set': len(variation_dev)
        })

        variation_train_data = get_sentence_label(variation_train)
        variation_dev_data = get_sentence_label(variation_dev)

        if config.data_augment:
            variation_train_data = augment_data(variation_train_data)
            variation += '_aug'

        # class distribution analysis
        train_label_distribution = analyze_class_distribution(
            variation_train_data['label'])
        analyze_result.update(
            dict(('train_cls_{}'.format(cls), percent)
                 for cls, percent in train_label_distribution.items()))
        dev_label_distribution = analyze_class_distribution(
            variation_dev_data['label'])
        analyze_result.update(
            dict(('dev_cls_{}'.format(cls), percent)
                 for cls, percent in dev_label_distribution.items()))

        # create tokenizer and vocabulary
        sentences_train = variation_train_data['sentence']
        sentences_dev = variation_dev_data['sentence']

        word_tokenizer = Tokenizer(char_level=False)
        char_tokenizer = Tokenizer(char_level=True)
        word_tokenizer.fit_on_texts(sentences_train)
        char_tokenizer.fit_on_texts(sentences_train)
        print('Logging Info - Variation: {}, word_vocab: {}, char_vocab: {}'.
              format(variation, len(word_tokenizer.word_index),
                     len(char_tokenizer.word_index)))
        analyze_result.update({
            'word_vocab': len(word_tokenizer.word_index),
            'char_vocab': len(char_tokenizer.word_index)
        })

        # length analysis
        word_len_distribution, word_max_len = analyze_len_distribution(
            sentences_train, level='word')
        analyze_result.update(
            dict(('word_{}'.format(k), v)
                 for k, v in word_len_distribution.items()))
        char_len_distribution, char_max_len = analyze_len_distribution(
            sentences_train, level='char')
        analyze_result.update(
            dict(('char_{}'.format(k), v)
                 for k, v in char_len_distribution.items()))

        one_hot = config.loss_function != 'binary_crossentropy'
        train_word_ids = create_data_matrices(word_tokenizer,
                                              variation_train_data,
                                              config.n_class, one_hot,
                                              word_max_len)
        train_char_ids = create_data_matrices(char_tokenizer,
                                              variation_train_data,
                                              config.n_class, one_hot,
                                              char_max_len)
        dev_word_ids = create_data_matrices(word_tokenizer, variation_dev_data,
                                            config.n_class, one_hot,
                                            word_max_len)
        dev_char_ids = create_data_matrices(char_tokenizer, variation_dev_data,
                                            config.n_class, one_hot,
                                            char_max_len)

        # create embedding matrix by training on dataset
        w2v_data = train_w2v(sentences_train + sentences_dev,
                             lambda x: x.split(), word_tokenizer.word_index)
        c2v_data = train_w2v(sentences_train + sentences_dev,
                             lambda x: list(x), char_tokenizer.word_index)
        w_fasttext_data = train_fasttext(sentences_train + sentences_dev,
                                         lambda x: x.split(),
                                         word_tokenizer.word_index)
        c_fasttext_data = train_fasttext(sentences_train + sentences_dev,
                                         lambda x: list(x),
                                         char_tokenizer.word_index)
        # w_glove_data = train_glove(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index)
        # c_glove_data = train_glove(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index)

        # save pre-process data
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_DATA_TEMPLATE,
                            variation=variation), variation_train_data)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_DATA_TEMPLATE,
                            variation=variation), variation_dev_data)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='word'), train_word_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='char'), train_char_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='word'), dev_word_ids)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_IDS_MATRIX_TEMPLATE,
                            variation=variation,
                            level='char'), dev_char_ids)

        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='w2v_data'), w2v_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='c2v_data'), c2v_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='w_fasttext_data'), w_fasttext_data)
        np.save(
            format_filename(PROCESSED_DATA_DIR,
                            EMBEDDING_MATRIX_TEMPLATE,
                            variation=variation,
                            type='c_fasttext_data'), c_fasttext_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        # type='w_glove_data'), w_glove_data)
        # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation,
        # type='c_glove_data'), c_glove_data)

        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TOKENIZER_TEMPLATE,
                            variation=variation,
                            level='word'), word_tokenizer)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TOKENIZER_TEMPLATE,
                            variation=variation,
                            level='char'), char_tokenizer)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            variation=variation,
                            level='word'), word_tokenizer.word_index)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            VOCABULARY_TEMPLATE,
                            variation=variation,
                            level='char'), char_tokenizer.word_index)

        # prepare ngram feature
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['char', 'word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3), (2, 3), (1, 3),
                                    (2, 4), (1, 4), (4, 4), (5, 5), (6, 6),
                                    (7, 7), (8, 8)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range,
                                          variation_train_data,
                                          variation_dev_data, variation)

        # prepare skip ngram features
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word', 'char']:
                for ngram in [2, 3]:
                    for skip_k in [1, 2, 3]:
                        prepare_skip_ngram_feature(vectorizer_type, level,
                                                   ngram, skip_k,
                                                   variation_train_data,
                                                   variation_dev_data,
                                                   variation)

        # prepare pos ngram
        variation_train_pos_data = {
            'sentence': [
                get_pos(sentence)
                for sentence in variation_train_data['sentence']
            ],
            'label':
            variation_train_data['label']
        }
        variation_dev_pos_data = {
            'sentence':
            [get_pos(sentence) for sentence in variation_dev_data['sentence']],
            'label':
            variation_dev_data['label']
        }
        for vectorizer_type in ['binary', 'tf', 'tfidf']:
            for level in ['word']:
                for ngram_range in [(1, 1), (2, 2), (3, 3)]:
                    prepare_ngram_feature(vectorizer_type, level, ngram_range,
                                          variation_train_pos_data,
                                          variation_dev_pos_data,
                                          variation + '_pos')

        # save analyze result
        write_log(
            format_filename(LOG_DIR,
                            ANALYSIS_LOG_TEMPLATE,
                            variation=variation), analyze_result)