# Prepare MS MARCO data for MatchZoo: build the corpus and the
# train/valid/test relation files, then preprocess the corpus into
# "doc_id doc_len space-joined-token-ids" lines.
if __name__ == '__main__':
    prepare = Preparation()
    srcdir = './'
    dstdir = './'
    infiles = [
        srcdir + 'MSMARCO-small-mz-train.txt',
        srcdir + 'MSMARCO-mz-dev.txt',
        srcdir + 'MSMARCO-mz-test.txt'
    ]
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(
        infiles[0], infiles[1], infiles[2])
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)
    prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
    prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
    prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
    print('Preparation finished ...')

    preprocessor = Preprocess(word_stem_config={'enable': False},
                              word_filter_config={'min_freq': 2})
    dids, docs = preprocessor.run(dstdir + 'corpus.txt')
    preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
    preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)
    # NOTE(review): the source chunk was truncated mid-write; the loop body
    # below is reconstructed from the identical WikiQA script later in this
    # file — confirm against the original. `with` guarantees the file is
    # closed (original used a bare open()).
    with open(dstdir + 'corpus_preprocessed.txt', 'w') as fout:
        for inum, did in enumerate(dids):
            fout.write('%s %s %s\n' % (did, len(docs[inum]),
                                       ' '.join(map(str, docs[inum]))))
# NOTE(review): this chunk is a fragment of a larger script — rels_train,
# rels_valid, rels_test, basedir, save_space, need_preprocess, prepare and
# corpus are all defined outside this view.
# Fixed: Python 2 `print` statements (a SyntaxError on Python 3, and
# inconsistent with the print() calls used elsewhere in this file) are
# converted to print() with identical output; the redundant list([...])
# wrapper is dropped; the output file is closed via a context manager.
for data_part in ['train', 'valid', 'test']:
    if data_part == 'train':
        rels = rels_train
    elif data_part == 'valid':
        rels = rels_valid
    else:
        rels = rels_test
    print('total relations in ', data_part, len(rels))
    prepare.save_relation(basedir + 'relation_' + data_part + '.txt', rels)
    if save_space == '0':
        # Optional cleanup pass: drop queries that list the same doc twice.
        print('filter queries with duplicated doc ids...')
        prepare.check_filter_query_with_dup_doc(
            basedir + 'relation_' + data_part + '.txt')

print('total corpus ', len(corpus))
if save_space == '0':
    prepare.save_corpus(basedir + 'corpus.txt', corpus)
print('preparation finished ...')

if need_preprocess == '1':
    print('begin preprocess...')
    # Preprocess corpus file
    preprocessor = Preprocess(word_filter_config={'min_freq': 2})
    dids, docs = preprocessor.run(basedir + 'corpus.txt')
    preprocessor.save_word_dict(basedir + 'word_dict.txt')
    # preprocessor.save_words_df(basedir + 'word_df.txt')
    with open(basedir + 'corpus_preprocessed.txt', 'w') as fout:
        for inum, did in enumerate(dids):
            # id text_len text_ids
            fout.write('%s\t%s\t%s\n' % (did, len(docs[inum]),
                                         ' '.join(map(str, docs[inum]))))
tri_dict[triinfo[0]] = len(tri_dict) return tri_dict if __name__ == '__main__': prepare = Preparation() srcdir = './' dstdir = './' infiles = [ srcdir + 'WikiQA-mz-train.txt', srcdir + 'WikiQA-mz-dev.txt', srcdir + 'WikiQA-mz-test.txt'] corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(infiles[0], infiles[1], infiles[2]) print('total corpus : %d ...' % (len(corpus))) print('total relation-train : %d ...' % (len(rel_train))) print('total relation-valid : %d ...' % (len(rel_valid))) print('total relation-test: %d ...' % (len(rel_test))) prepare.save_corpus(dstdir + 'corpus.txt', corpus) prepare.save_relation(dstdir + 'relation_train.txt', rel_train) prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid) prepare.save_relation(dstdir + 'relation_test.txt', rel_test) print('Preparation finished ...') preprocessor = Preprocess(word_stem_config={'enable': False}, word_filter_config={'min_freq': 2}) dids, docs = preprocessor.run(dstdir + 'corpus.txt') preprocessor.save_word_dict(dstdir + 'word_dict.txt', True) preprocessor.save_words_stats(dstdir + 'word_stats.txt', True) fout = open(dstdir + 'corpus_preprocessed.txt', 'w') for inum, did in enumerate(dids): fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum])))) fout.close()