    # data_part / index come from the enclosing loop over the data splits
        'relation_' + data_part + '_intents.txt')
    rels_web = None
    if add_web:
        rels_web = all_rels_web[index]
        print '[Web] total relations in ', data_part, len(rels_web)
        prepare.save_relation(
            cur_data_dir + 'relation_' + data_part + '_web.txt', rels_web)
        print '[Web] filter queries with duplicated doc ids...'
        prepare.check_filter_query_with_dup_doc(
            cur_data_dir + 'relation_' + data_part + '_web.txt')

print 'total corpus ', len(corpus)
prepare.save_corpus_dmn(cur_data_dir + 'corpus.txt', corpus, '\t')
print 'preparation finished ...'

print 'begin preprocess...'
# Preprocess corpus file
preprocessor = Preprocess(word_filter_config={'min_freq': 5})
dids, docs = preprocessor.run_2d(
    cur_data_dir + 'corpus.txt')  # docs is [corpus_size, utterance_num, max_text1_len]
preprocessor.save_word_dict(cur_data_dir + 'word_dict.txt')
# preprocessor.save_words_df(basedir + 'word_df.txt')

fout = open(cur_data_dir + 'corpus_preprocessed.txt', 'w')
for inum, did in enumerate(dids):
    doc_txt = docs[inum]  # 2d list
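# A second variant of the same preparation / preprocessing flow follows.
# It writes its outputs under basedir instead of cur_data_dir, builds the
# relation files in an explicit loop over the train/valid/test splits,
# keeps all terms (Preprocess() without a min_freq filter), and tokenizes
# the corpus with run_2d_smn instead of run_2d.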
corpus, rels_train, rels_valid, rels_test = prepare.run_with_train_valid_test_corpus_dmn(
    basedir + train_file, basedir + valid_file, basedir + test_file)

for data_part in ['train', 'valid', 'test']:
    if data_part == 'train':
        rels = rels_train
    elif data_part == 'valid':
        rels = rels_valid
    else:
        rels = rels_test
    print 'total relations in ', data_part, len(rels)
    prepare.save_relation(basedir + 'relation_' + data_part + '.txt', rels)
    print 'filter queries with duplicated doc ids...'
    prepare.check_filter_query_with_dup_doc(
        basedir + 'relation_' + data_part + '.txt')

print 'total corpus ', len(corpus)
prepare.save_corpus_dmn(basedir + 'corpus.txt', corpus, '\t')
print 'preparation finished ...'

print 'begin preprocess...'
# Preprocess corpus file
# Try not filtering terms by frequency
preprocessor = Preprocess()
dids, docs = preprocessor.run_2d_smn(
    basedir + 'corpus.txt')  # docs is [corpus_size, utterance_num, max_text1_len]
preprocessor.save_word_dict(basedir + 'word_dict.txt')
# preprocessor.save_words_df(basedir + 'word_df.txt')

fout = open(basedir + 'corpus_preprocessed.txt', 'w')
for inum, did in enumerate(dids):
    doc_txt = docs[inum]  # 2d list
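    # The rest of the write loop is not shown in this excerpt. Each doc_txt is
    # a 2-D list of word ids, one inner list per utterance, e.g. (illustrative
    # values only):
    #   doc_txt = [[12, 7, 93], [4, 8, 8], [15, 2, 6]]
    # As a hypothetical sketch only (the actual on-disk format of
    # corpus_preprocessed.txt is defined by the original script and may
    # differ), one line per document with utterances separated by tabs and
    # word ids by spaces could be written as:
    #   fout.write(did + '\t' + '\t'.join(
    #       ' '.join(map(str, utt)) for utt in doc_txt) + '\n')
    # and fout.close() would follow once the loop finishes.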