'relation_' + data_part +
                                                    '_intents.txt')

        rels_web = None
        if add_web:
            rels_web = all_rels_web[index]
            print '[Web] total relations in ', data_part, len(rels_web)
            prepare.save_relation(
                cur_data_dir + 'relation_' + data_part + '_web.txt', rels_web)
            print '[Web] filter queries with duplicated doc ids...'
            prepare.check_filter_query_with_dup_doc(cur_data_dir +
                                                    'relation_' + data_part +
                                                    '_web.txt')

    print 'total corpus ', len(corpus)
    prepare.save_corpus_dmn(cur_data_dir + 'corpus.txt', corpus, '\t')
    print 'preparation finished ...'

    print 'begin preprocess...'
    # Preprocess the corpus file
    preprocessor = Preprocess(word_filter_config={'min_freq': 5})
    dids, docs = preprocessor.run_2d(
        cur_data_dir +
        'corpus.txt')  # docs is [corpus_size, utterance_num, max_text1_len]
    preprocessor.save_word_dict(cur_data_dir + 'word_dict.txt')
    # preprocessor.save_words_df(basedir + 'word_df.txt')

    fout = open(cur_data_dir + 'corpus_preprocessed.txt', 'w')

    for inum, did in enumerate(dids):
        doc_txt = docs[inum]  # 2d list
Пример #2
0
    corpus, rels_train, rels_valid, rels_test = prepare.run_with_train_valid_test_corpus_dmn(
        basedir + train_file, basedir + valid_file, basedir + test_file)
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
        elif data_part == 'valid':
            rels = rels_valid
        else:
            rels = rels_test
        print 'total relations in ', data_part, len(rels)
        prepare.save_relation(basedir + 'relation_' + data_part + '.txt', rels)
        print 'filter queries with duplicated doc ids...'
        prepare.check_filter_query_with_dup_doc(basedir + 'relation_' +
                                                data_part + '.txt')
    print 'total corpus ', len(corpus)
    prepare.save_corpus_dmn(basedir + 'corpus.txt', corpus, '\t')
    print 'preparation finished ...'

    print 'begin preprocess...'
    # Preprocess the corpus file
    # Try without filtering terms by frequency
    preprocessor = Preprocess()
    dids, docs = preprocessor.run_2d_smn(
        basedir +
        'corpus.txt')  # docs is [corpus_size, utterance_num, max_text1_len]
    preprocessor.save_word_dict(basedir + 'word_dict.txt')
    # preprocessor.save_words_df(basedir + 'word_df.txt')

    fout = open(basedir + 'corpus_preprocessed.txt', 'w')
    for inum, did in enumerate(dids):
        doc_txt = docs[inum]  # 2d list