def get_feat(infersent, data_path, verbose=True, layer_norm=False, split_sents=True):
    """Load a CSV corpus and encode its sentences into InferSent features.

    Args:
        infersent: an InferSent model exposing ``build_vocab``; also passed
            through to ``infersent_encode_sents``.
        data_path: path to the CSV corpus consumed by ``load_csv_corpus``.
        verbose: when True, print progress messages for each stage.
        layer_norm: forwarded to ``infersent_encode_sents``.
        split_sents: forwarded to ``infersent_encode_sents``.

    Returns:
        A tuple ``(feats, labels, ids)`` where ``feats`` comes from
        ``infersent_encode_sents``, ``labels`` is a numpy array of the
        corpus labels, and ``ids`` are the corpus ids.
    """
    if verbose:
        print('Loading Text Data from {}'.format(data_path))
    sents, labels, ids = load_csv_corpus(data_path)

    # The vocabulary must be built from this corpus before encoding.
    if verbose:
        print('Building Vocabulary Table for Infersent by {}'.format(data_path))
    infersent.build_vocab(sents, tokenize=False)

    if verbose:
        print('Extracting Feat using Infersent')
    feats = infersent_encode_sents(
        infersent,
        sents,
        split_sents=split_sents,
        layer_norm=layer_norm,
        verbose=False)

    return feats, np.array(labels), ids
# NOTE(review): the original text repeated the tail of get_feat() here at
# module level (build_vocab / encode / `return`) — an orphaned duplicate of
# the function above that is a syntax error outside a function. It has been
# removed; the function itself is defined in full earlier in the file.

if __name__ == '__main__':
    # Command-line entry point: compute TF-IDF features for the training
    # corpus and dump them, together with their labels, into an HDF5 file
    # inside the data directory.
    args = parse_args()
    data_dir = args.db_dir
    model_id = args.model_id
    # Validate user input explicitly instead of `assert`, which is stripped
    # under `python -O`. model_id presumably selects one of three model
    # variants — TODO confirm against parse_args()'s help text.
    if not 0 <= model_id <= 2:
        raise ValueError('model_id must be in [0, 2], got {}'.format(model_id))

    # NOTE(review): these two flags are never read in this script — they look
    # copied from the InferSent pipeline; kept for parity with the original.
    split_sents = True
    layer_norm = False

    train_data_path = os.path.join(data_dir, cfg.TRAIN_DATA_NAME + '.csv')
    # test_data_path = os.path.join(data_dir, cfg.TEST_DATA_NAME+'.csv')
    train_feat_path = os.path.join(data_dir, 'tfidf.h5')

    train_sents, train_labels, _ = load_csv_corpus(train_data_path)
    train_feat = get_tf_idf_feat(train_sents)

    print('Dumping Train Text Feat and Labels into {}'.format(train_feat_path))
    dump_feat(train_feat_path, train_feat, labels=train_labels)
# NOTE(review): this chunk begins mid-way through an argparse helper — the
# enclosing function's `def` line and its earlier add_argument calls lie
# outside the visible region, so the fragment below is kept verbatim and is
# not valid at module level on its own.
        help='the number of seed for each class')
parser.add_argument('--verbose',
                    help='whether to print log',
                    action='store_true')
args = parser.parse_args()
return args

# Script body: randomly sample `seed_num` labelled example ids per class
# from the training CSV and write them, sorted, to the seed file.
args = get_args()
data_dir = args.data_dir
random_seed = args.seed
seed_num = args.seed_num
verbose = args.verbose
# Seed RNGs (and presumably other global state) for reproducible sampling —
# initialize_environment is defined elsewhere; TODO confirm what it seeds.
initialize_environment(random_seed=random_seed)

# Only labels and ids are needed here; the sentence texts are discarded.
_, labels, ids = load_csv_corpus(
    os.path.join(data_dir, cfg.TRAIN_DATA_NAME + '.csv'))

# Group example ids by their class label.
dic = defaultdict(list)
for tmp_id, tmp_label in zip(ids, labels):
    dic[tmp_label].append(tmp_id)

# For each class, shuffle its ids and keep at most `seed_num` of them.
results = []
for l, tmp_ids in dic.items():
    random.shuffle(tmp_ids)
    tmp_ids = tmp_ids[:seed_num]
    results.extend([(tmp_id, l) for tmp_id in tmp_ids])
# Deterministic output order regardless of dict iteration / shuffle order.
results.sort()

# NOTE(review): the nested os.path.join is redundant (single argument) but
# harmless; left as-is.
seed_path = os.path.join(os.path.join(data_dir, cfg.SEED_FILE_NAME))
# NOTE(review): the `with` body continues beyond this chunk — presumably the
# (id, label) pairs are written to `seed_path` here.
with open(seed_path, 'w') as f: