import numpy as np

# load_csv_corpus and infersent_encode_sents are project-local helpers
# assumed to be importable from the surrounding repo.


def get_feat(infersent, data_path, verbose=True, layer_norm=False, split_sents=True):
    """Load a CSV corpus and encode it with InferSent into a feature matrix."""
    if verbose:
        print('Loading Text Data from {}'.format(data_path))
    train_data, train_labels, ids = load_csv_corpus(data_path)
    if verbose:
        print('Building Vocabulary Table for Infersent by {}'.format(data_path))
    infersent.build_vocab(train_data, tokenize=False)
    if verbose:
        print('Extracting Feat using Infersent')
    train_feat = infersent_encode_sents(infersent,
                                        train_data,
                                        split_sents=split_sents,
                                        layer_norm=layer_norm,
                                        verbose=False)
    return train_feat, np.array(train_labels), ids
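For context, Example #1 assumes an already-loaded InferSent encoder. A minimal, hypothetical usage sketch follows; the checkpoint and word-vector paths are placeholders taken from the facebookresearch/InferSent README, not from this project:

# Hypothetical usage; all paths and hyper-parameters below are placeholders.
import torch
from models import InferSent  # model class shipped with the InferSent repo

params = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
          'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
infersent = InferSent(params)
infersent.load_state_dict(torch.load('encoder/infersent1.pkl'))
infersent.set_w2v_path('GloVe/glove.840B.300d.txt')

feat, labels, ids = get_feat(infersent, 'data/train.csv')
print(feat.shape, labels.shape, len(ids))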
Example #2
import os

import numpy as np

# Project-local helpers assumed importable elsewhere in the repo:
# parse_args, cfg, load_csv_corpus, infersent_encode_sents,
# get_tf_idf_feat, dump_feat.


def get_feat(infersent, data_path, verbose=True, layer_norm=False, split_sents=True):
    """Load a CSV corpus and encode it with InferSent into a feature matrix."""
    if verbose:
        print('Loading Text Data from {}'.format(data_path))
    train_data, train_labels, ids = load_csv_corpus(data_path)
    if verbose:
        print(
            'Building Vocabulary Table for Infersent by {}'.format(data_path))
    infersent.build_vocab(train_data, tokenize=False)
    if verbose:
        print('Extracting Feat using Infersent')
    train_feat = infersent_encode_sents(infersent,
                                        train_data,
                                        split_sents=split_sents,
                                        layer_norm=layer_norm,
                                        verbose=False)
    return train_feat, np.array(train_labels), ids
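
# `infersent_encode_sents` is not shown anywhere in these examples. Below is a
# minimal sketch of what the split_sents/layer_norm flags might do; it is an
# assumption about the project's implementation, and only InferSent.encode is
# real API.
import numpy as np


def infersent_encode_sents(infersent, sents, split_sents=True,
                           layer_norm=False, verbose=False):
    if split_sents:
        # Encode each document sentence-by-sentence, then mean-pool (assumption).
        feats = []
        for doc in sents:
            pieces = [s for s in doc.split('.') if s.strip()] or [doc]
            emb = infersent.encode(pieces, tokenize=False, verbose=verbose)
            feats.append(emb.mean(axis=0))
        feat = np.stack(feats)
    else:
        feat = infersent.encode(sents, tokenize=False, verbose=verbose)
    if layer_norm:
        # Normalize each feature vector to zero mean / unit variance (assumption).
        mu = feat.mean(axis=1, keepdims=True)
        sigma = feat.std(axis=1, keepdims=True) + 1e-6
        feat = (feat - mu) / sigma
    return feat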


if __name__ == '__main__':
    args = parse_args()
    data_dir = args.db_dir
    model_id = args.model_id
    assert 0 <= model_id <= 2
    split_sents = True
    layer_norm = False

    train_data_path = os.path.join(data_dir, cfg.TRAIN_DATA_NAME + '.csv')
    # test_data_path = os.path.join(data_dir, cfg.TEST_DATA_NAME+'.csv')
    train_feat_path = os.path.join(data_dir, 'tfidf.h5')

    train_sents, train_labels, _ = load_csv_corpus(train_data_path)
    train_feat = get_tf_idf_feat(train_sents)

    print('Dumping Train Text Feat and Labels into {}'.format(train_feat_path))
    dump_feat(train_feat_path, train_feat, labels=train_labels)
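
`get_tf_idf_feat` and `dump_feat` are also project-local and not shown. A hypothetical sketch of the two helpers, assuming scikit-learn for the TF-IDF matrix and h5py for the `.h5` dump; the real project versions may differ:

# Hypothetical helper sketches; names match the calls above, bodies are assumptions.
import h5py
from sklearn.feature_extraction.text import TfidfVectorizer


def get_tf_idf_feat(sents):
    # Fit a TF-IDF vectorizer on the corpus and return a dense feature matrix.
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(sents).toarray()


def dump_feat(path, feat, labels=None):
    # Store the feature matrix (and optionally the labels) as HDF5 datasets.
    with h5py.File(path, 'w') as f:
        f.create_dataset('feature', data=feat)
        if labels is not None:
            f.create_dataset('label', data=labels)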
Example #3
import argparse
import os
import random
from collections import defaultdict

# Project-local helpers assumed importable: cfg, load_csv_corpus,
# initialize_environment.


def get_args():
    # Argparse setup reconstructed from the attributes read below (assumption).
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory of the corpus csv')
    parser.add_argument('--seed', type=int, help='random seed')
    parser.add_argument('--seed_num',
                        type=int,
                        help='the number of seed for each class')
    parser.add_argument('--verbose',
                        help='whether to print log',
                        action='store_true')
    args = parser.parse_args()
    return args


args = get_args()
data_dir = args.data_dir
random_seed = args.seed
seed_num = args.seed_num
verbose = args.verbose

initialize_environment(random_seed=random_seed)
_, labels, ids = load_csv_corpus(
    os.path.join(data_dir, cfg.TRAIN_DATA_NAME + '.csv'))

dic = defaultdict(list)

for tmp_id, tmp_label in zip(ids, labels):
    dic[tmp_label].append(tmp_id)

results = []
for l, tmp_ids in dic.items():
    random.shuffle(tmp_ids)
    tmp_ids = tmp_ids[:seed_num]
    results.extend([(tmp_id, l) for tmp_id in tmp_ids])
results.sort()

seed_path = os.path.join(data_dir, cfg.SEED_FILE_NAME)
with open(seed_path, 'w') as f:
    # Assumed output format: one "id,label" row per sampled seed example.
    for tmp_id, tmp_label in results:
        f.write('{},{}\n'.format(tmp_id, tmp_label))
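
The loop above samples `seed_num` ids per class without replacement (shuffle, then slice), a common way to pick a labelled seed set for semi-supervised training; sorting the results keeps the output file deterministic for a fixed random seed. A hypothetical invocation (script name and flag values are placeholders):

python sample_seeds.py --data_dir data --seed 42 --seed_num 10 --verbose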