Example #1
# Assumes project-level helpers in scope: get_dataset, get_fold, and the
# shuffling module (shuffle_both / shuffle_pos).
import os
from collections import defaultdict

def setup_rep(technique, dataset, fold, rep,
              noise, shuffled, folddir, repdir):
    if shuffled == 0:
        return []
    repfile = ('%s_%s_%d_%d_%.5f_%d.rep' %
               (technique, dataset, fold, rep, noise, shuffled))
    reppath = os.path.join(repdir, repfile)
    if os.path.exists(reppath):
        with open(reppath, 'r') as f:
            # Each cached line is "bag_index,bag_id,instance_id,label".
            shuffled_bags = map(lambda s: tuple(s.strip().split(',')), f)
            shuffled_bags = [(int(l[0]), l[1], l[2], int(l[3]))
                             for l in shuffled_bags]
    else:
        ids, _, y = get_dataset(dataset)
        test_ids = set(bid for bid, iid in get_fold(folddir, dataset, fold))
        classes = defaultdict(list)
        for (bid, iid), yi in zip(ids, y.flat):
            if bid not in test_ids:
                classes[bid].append(yi)
        bag_ids = list(classes.keys())
        Y = [any(classes[bid]) for bid in bag_ids]

        pos_bags = [bid for bid, Yi in zip(bag_ids, Y) if Yi]
        neg_bags = [bid for bid, Yi in zip(bag_ids, Y) if not Yi]

        n_pos = len(pos_bags)
        n_neg = len(neg_bags)
        initial_bag_ids = pos_bags + neg_bags
        initial_labels = ([True]*n_pos) + ([False]*n_neg)
        initial_bags = [[(bid, iid) for bid, iid in ids if bid == ibid]
                        for ibid in initial_bag_ids]

        if technique == 'shuffle_both':
            shuffled_bags, shuffled_labels = shuffling.shuffle_both(
                shuffled, initial_bags, initial_labels, noise)
        elif technique == 'shuffle_pos':
            shuffled_bags, shuffled_labels = shuffling.shuffle_pos(
                shuffled, initial_bags, initial_labels, noise)
        else:
            raise Exception('Unsupported shuffling technique: %s' % technique)

        # Re-express the shuffled bags as flat per-instance tuples and cache
        # them to the rep file for reuse.
        all_bags = shuffled_bags
        all_labels = shuffled_labels
        shuffled_bags = []
        with open(reppath, 'w+') as f:
            for i, (bag, label) in enumerate(zip(all_bags, all_labels)):
                for bid, iid in bag:
                    linst = (i, bid, iid, int(label))
                    shuffled_bags.append(linst)
                    f.write('%d,%s,%s,%d\n' % linst)

    return shuffled_bags
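
A minimal usage sketch for setup_rep; the directory names and concrete argument values ('musk1', the noise level, and so on) are hypothetical, not taken from the project:

rep_instances = setup_rep('shuffle_pos', 'musk1', fold=0, rep=1,
                          noise=0.05, shuffled=10,
                          folddir='folds', repdir='reps')
# Each entry is (bag_index, bag_id, instance_id, label), matching the
# comma-separated lines cached in the .rep file.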
Example #2
# Assumes project-level names in scope: data, SetSVM, SVC, auc_score, FOLDIR.
import time

def client_target(task, callback):
    key = task['key']
    params = task['params']
    (classifier, dataset, kernel, fold, rep) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    data_dict = data.get_dataset(dataset)

    test_ids = set(data.get_fold(FOLDIR, dataset, fold))
    if rep == 0:
        train_ids = set(data_dict.keys()) - test_ids
    else:
        train_ids = set(data.get_rep(FOLDIR, dataset, fold, rep))

    X_train = [data_dict[bid][0] for bid in train_ids]
    y_train = [data_dict[bid][1] for bid in train_ids]

    X_test = [data_dict[bid][0] for bid in test_ids]
    y_test = [data_dict[bid][1] for bid in test_ids]

    results = {}
    results['stats'] = {}
    results['preds'] = {}
    start = time.time()

    if classifier == 'nsk':
        nsk = SetSVM(SVC, kernel, **params)
        nsk.fit(X_train, y_train)
        predictions = nsk.decision_function(X_test)

    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return

    results['stats']['time'] = time.time() - start
    # test_ids is iterated in the same order used to build X_test above,
    # so predictions align with their bag ids.
    for i, y in zip(test_ids, predictions):
        results['preds'][i] = float(y)

    if len(y_test) > 1:
        print 'Test AUC Score: %f' % auc_score(y_test, predictions)

    print 'Finished task %s.' % str(key)
    return results
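
A sketch of the work unit client_target expects, inferred from the dict accesses above; the concrete values ('musk1', the RBF kernel, the SVM parameters) are hypothetical:

class _Callback(object):
    quit = False  # client_target sets this on unsupported classifiers

task = {
    'key': ('nsk', 'musk1', 'rbf', 0, 0),  # (classifier, dataset, kernel, fold, rep)
    'params': {'C': 1.0},                  # forwarded to SetSVM(SVC, kernel, **params)
}
results = client_target(task, _Callback())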
Example #3
# Assumes project-level names in scope: data, ProgressMonitor.
import os
import numpy as np

def main(dataset, folddir, outputdir, reps=0):
    data_dict = data.get_dataset(dataset)
    folds = data.get_folds(folddir, dataset)

    all_bag_ids = set(data_dict.keys())

    progress = ProgressMonitor(total=reps * len(folds),
                               msg='Generating Replicates')

    for f in range(len(folds)):
        test = data.get_fold(folddir, dataset, f)
        bag_ids = np.array(list(all_bag_ids - set(test)))
        n = len(bag_ids)

        for r in range(1, reps + 1):
            rep_path = os.path.join(outputdir,
                                    '%s_%04d_%06d.rep' % (dataset, f, r))
            if not os.path.exists(rep_path):
                # Bootstrap: draw n bag ids uniformly at random, with replacement.
                sample = np.random.randint(n, size=n)
                sampled_bags = bag_ids[sample]
                with open(rep_path, 'w+') as ofile:
                    ofile.write('\n'.join(str(bid) for bid in sampled_bags.flat))
            progress.increment()
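
A sketch of driving the generator above; the dataset name and directories are hypothetical:

if __name__ == '__main__':
    # Writes one bootstrap replicate (bag ids resampled with replacement)
    # per fold/replicate pair, e.g. reps/musk1_0000_000001.rep.
    main('musk1', 'folds', 'reps', reps=10)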
Example #4
# Assumes project modules in scope: data, model, and the param_t mapping type.
import os
import json
import pickle

import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import mean_squared_error, mean_absolute_error

def main(args):
    out_dir = args.model_dir

    # param.txt holds one "key=value" setting per line.
    with open(os.path.join(out_dir, "param.txt"), "r") as f:
        paramargs = param_t([ln.strip().split("=", 1) for ln in f])

    print("")
    print("Regression evaluator")
    print("  # Score type: {}".format(paramargs.mp_score_type))
    print("  # Output dir: {}".format(out_dir))
    print("  # Param string:\n{}".format(model.param_str(paramargs)))
    print("")

    # To reduce memory consumption    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Load Pretrained Embedding
    pre_embed = data.load_pretrained_embeddings() if args.mp_pretrained else None 

    # Load data and normalize scores. 'Path_to_essay_file(.xlsx)' is the
    # project's placeholder for the essay spreadsheet (./data/ICLE_essays.xlsx).
    essayids, essays, org_scores, scores, prompts, scaler = \
        data.load_annotated_essay_with_normalized_score(
            'Path_to_essay_file(.xlsx)',
            score_source="./essayScore_folds/{}Scores.txt".format(
                paramargs.mp_score_type))
        
    # Get Persing sequence (paragraph sequence of Persing et al., 2010)
    pseqs = np.array([data.get_persing_sequence(e, p)
                      for e, p in zip(essays, prompts)])
    
    # Preprocess data (assert sentence and paragraph boundaries)
    if paramargs.mp_di_aware:
        di_list = data.load_discourse_indicators()
        essays = data.preprocess_essay_withParaBoundary(essays, paramargs,
                                                        boseos=True, bopeop=True)
    elif ((paramargs.mp_model_type == "nea_aft_pretrain"
           and not paramargs.mp_para) or paramargs.mp_no_para):
        essays = data.preprocess_essay_noParaBoundary(essays, paramargs,
                                                      boseos=True)
    else:
        essays = data.preprocess_essay_withParaBoundary(essays, paramargs,
                                                        boseos=True, bopeop=True)
        
    
    if paramargs.mp_prompt:
        prompts = data.preprocess_essay_noParaBoundary(prompts, paramargs,
                                                       boseos=True)
    
    # Get training and validation set!
    id2idx = {v: k for k, v in enumerate(essayids)}
    folds = data.load_folds(
        "./essayScore_folds/{}Folds.txt".format(paramargs.mp_score_type),
        id2idx=id2idx)
    
    assert 0 <= args.fold <= 4
    
    _, _, ts = data.get_fold(folds, args.fold)

    indices = np.arange(len(essays))
    main_essay_v = essays[ts]
    score_v = org_scores[ts]
    indices_v = indices[ts]
    prompt_v = prompts[ts]
    pseq_v = pseqs[indices_v]
    
    print(main_essay_v[:1])
    print(prompt_v[:1])
    
    # Preparing inputs
    model_inputs_v = []
    
    # Text to sequence
    if paramargs.mp_model_type != "only_pseq":
        
        with open(os.path.join(args.model_dir,
                               "tokenizer_f{}.pickle".format(args.fold)),
                  "rb") as f:
            tokenizer_m = pickle.load(f)
        sequences_valid_main = tokenizer_m.texts_to_sequences(main_essay_v)
        lens = [len(e) for e in sequences_valid_main]

        model_inputs_v += [pad_sequences(sequences_valid_main,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        sequence_length_main = model_inputs_v[-1].shape[1]
    
    # Persing sequence to sequence
    sequence_length_pseq = None
    
    if paramargs.mp_pseq or paramargs.mp_model_type == "only_pseq":
        
        with open(os.path.join(args.model_dir,
                               "tokenizer_pseq_f{}.pickle".format(args.fold)),
                  "rb") as f:
            tokenizer_pseq = pickle.load(f)
        sequences_valid_pseq = tokenizer_pseq.texts_to_sequences(pseq_v)
        lens = [len(e) for e in sequences_valid_pseq]

        model_inputs_v += [pad_sequences(sequences_valid_pseq,
                                         maxlen=min(max(lens), data.MAX_PARAGRAPHS))]
        sequence_length_pseq = model_inputs_v[-1].shape[1]
        
    # Prompt
    if paramargs.mp_prompt:
        with open(os.path.join(args.model_dir,
                               "tokenizer_p_f{}.pickle".format(args.fold)),
                  "rb") as f:
            tokenizer_p = pickle.load(f)
        sequences_valid_prompt = tokenizer_p.texts_to_sequences(prompt_v)
        lens = [len(e) for e in sequences_valid_prompt]

        model_inputs_v += [pad_sequences(sequences_valid_prompt,
                                         maxlen=min(max(lens), data.MAX_WORDS))]
        sequence_length_prompt = model_inputs_v[-1].shape[1]

    
    if paramargs.mp_model_type == "only_pseq":
        mainModel = model.pseq_regression(sequence_length_pseq, paramargs)
    else:
        if paramargs.mp_prompt:
            mainModel = model.create_regression_wprompt(
                None,
                tokenizer_m.word_index,
                tokenizer_p.word_index,
                sequence_length_main,
                sequence_length_pseq,
                sequence_length_prompt,
                paramargs,
            )
        else:
            mainModel = model.create_regression(
                None,
                tokenizer_m.word_index,
                sequence_length_main,
                sequence_length_pseq,
                paramargs,
            )
    
    mainModel.summary()
    mainModel.load_weights(os.path.join(args.model_dir,
                                        "regression_f{}.hdf5".format(args.fold)),
                           by_name=True)

    print("Starting evaluation.")
    
    # Perform prediction!
    score_model = mainModel.predict(
        model_inputs_v,
        verbose=1,
        batch_size=32)
    
    
    # Map predictions back to the original score range and flatten.
    score_model_n = scaler.inverse_transform(score_model).tolist()
    score_model_nf = [s for row in score_model_n for s in row]
    
    # Save to the file.
    with open(os.path.join(out_dir,
                           "prediction_f{}.json".format(args.fold)), "w") as f:
        mse = mean_squared_error(score_v, score_model_nf)
        mae = mean_absolute_error(score_v, score_model_nf)
        
        pr = {
            "system": score_model_nf,
            "gold": score_v.tolist(),
            "MSE": mse,
            "MAE": mae,
        }
        
        json.dump(pr, f)
    
        print("MSE:", mse)
        print("MAE:", mae)
Example #5
# Assumes project-level names in scope: data, SetSVM, SVC, auc_score, FOLDIR.
import time
from collections import defaultdict

import numpy as np

def client_target(task, callback):
    key = task['key']
    params = task['params']
    shuffled_bags = task['shuffled_bags']
    (technique, classifier, dataset, kernel, fold, rep, noise, shuffled) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    ids, X, y = data.get_dataset(dataset)
    # Map (bag_id, instance_id) pairs to row indices in the feature matrix X.
    id_index = {}
    for j, i in enumerate(ids):
        id_index[i] = j

    test_ids = set(data.get_fold(FOLDIR, dataset, fold))
    train_ids = set(ids) - test_ids

    X_train = defaultdict(list)
    y_train = defaultdict(bool)
    for bid, iid in train_ids:
        X_train[bid].append(X[id_index[bid, iid]])
        y_train[bid] |= bool(y[id_index[bid, iid]])
    for bag, bid, iid, yi in shuffled_bags:
        X_train[bag].append(X[id_index[bid, iid]])
        y_train[bag] |= bool(yi)
    bags_train = sorted(X_train.keys())
    X_train = map(np.vstack, [X_train[b] for b in bags_train])
    y_train = [y_train[b] for b in bags_train]

    X_test = defaultdict(list)
    y_test = defaultdict(bool)
    for bid, iid in test_ids:
        X_test[bid].append(X[id_index[bid, iid]])
        y_test[bid] |= bool(y[id_index[bid, iid]])
    bags_test = sorted(X_test.keys())
    X_test = map(np.vstack, [X_test[b] for b in bags_test])
    y_test = [y_test[b] for b in bags_test]

    results = {}
    results['stats'] = {}
    results['preds'] = {}
    start = time.time()

    if classifier == 'nsk':
        nsk = SetSVM(SVC, kernel, **params)
        nsk.fit(X_train, y_train)
        predictions = nsk.decision_function(X_test)

    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return

    results['stats']['time'] = time.time() - start
    for i, y in zip(bags_test, predictions):
        results['preds'][i] = float(y)

    if len(y_test) > 1:
        print 'Test AUC Score: %f' % auc_score(y_test, predictions)

    print 'Finished task %s.' % str(key)
    return results
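
The shuffled_bags rows consumed above are the per-instance tuples that setup_rep (Example #1) writes to its .rep cache; a sketch of one row, with hypothetical ids:

example_row = (0, 'bag_17', 'inst_3', 1)  # (bag, bag_id, instance_id, label)
bag, bid, iid, yi = example_row           # matches the unpacking in the loop above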
Example #6
# Assumes project modules in scope: data, model, kr_util.
import os
import json
import pickle
import hashlib

import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def main(args):
    # Name the output directory after the hash of the parameter string,
    # so identical configurations share a directory.
    pstr = model.param_str(args)
    out_dir = os.path.join("output",
                           hashlib.md5(pstr.encode("utf-8")).hexdigest())
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, "param.txt"), "w") as f:
        print(pstr, file=f)

    with open(os.path.join(out_dir, "args.json"), "w") as f:
        json.dump(args.__dict__, f)

    print("")
    print("Regression trainer")
    print("  # Score type: {}".format(args.mp_score_type))
    print("  # Output dir: {}".format(out_dir))
    print("  # Param string:\n{}".format(pstr))
    print("")

    # To reduce memory consumption
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Load Pretrained Embedding
    pre_embed = data.load_pretrained_embeddings(
    ) if args.mp_pretrained else None

    # Load data and Normalize Score
    essayids, essays, _, scores, prompts, _ = data.load_annotated_essay_with_normalized_score(
        'Path_to_essay_file(.xlsx)',
        score_source="data/{}Scores.txt".format(
            args.mp_score_type))  # ./data/ICLE_essays.xlsx

    # Get Persing sequence (paragraph sequence of Persing et al., 2010)
    pseqs = np.array(
        [data.get_persing_sequence(e, p) for e, p in zip(essays, prompts)])

    # Preprocess data (assert sentence and paragraph boundaries)
    if args.mp_di_aware:
        di_list = data.load_discourse_indicators()
        essays = data.preprocess_essay_withParaBoundary(essays,
                                                        args,
                                                        boseos=True,
                                                        bopeop=True)
    elif ((args.mp_model_type == "nea_aft_pretrain" and not args.mp_para)
          or args.mp_no_para):
        essays = data.preprocess_essay_noParaBoundary(essays,
                                                      args,
                                                      boseos=True)
    else:
        essays = data.preprocess_essay_withParaBoundary(essays,
                                                        args,
                                                        boseos=True,
                                                        bopeop=True)

    if args.mp_prompt:
        prompts = data.preprocess_essay_noParaBoundary(prompts,
                                                       args,
                                                       boseos=True)

    # Get training and validation set!
    id2idx = {v: k for k, v in enumerate(essayids)}
    folds = data.load_folds(
        "./essayScore_folds/{}Folds.txt".format(args.mp_score_type),
        id2idx=id2idx)

    assert 0 <= args.fold <= 4

    tr, v, ts = data.get_fold(folds, args.fold)

    if args.mp_divide_data == "half":
        _, tr = train_test_split(tr, test_size=0.50, random_state=args.mp_seed)
    elif args.mp_divide_data == "one_forth":
        _, tr = train_test_split(tr, test_size=0.25, random_state=args.mp_seed)
    elif args.mp_divide_data == "one_eighth":
        _, tr = train_test_split(tr,
                                 test_size=0.125,
                                 random_state=args.mp_seed)

    indices = np.arange(len(essays))
    main_essay_t, main_essay_v = essays[tr], essays[v]
    score_t, score_v = scores[tr], scores[v]
    indices_t, indices_v = indices[tr], indices[v]
    prompt_t, prompt_v = prompts[tr], prompts[v]
    pseq_t, pseq_v = pseqs[indices_t], pseqs[indices_v]

    print(main_essay_v[:2])
    print(prompt_v[:10])

    # Preparing inputs
    model_inputs_t, model_inputs_v = [], []

    # Text to sequence
    if args.mp_model_type != "only_pseq":
        if args.mp_preenc is not None:
            with open(os.path.join(args.mp_preenc, "tokenizer.pickle"),
                      "rb") as f:
                tokenizer_m = pickle.load(f)
        else:
            tokenizer_m = model.create_vocab(main_essay_t, args)
        with open(os.path.join(out_dir,
                               "tokenizer_f{}.pickle".format(args.fold)),
                  "wb") as f:
            pickle.dump(tokenizer_m, f)

        sequences_train_main = tokenizer_m.texts_to_sequences(main_essay_t)
        sequences_valid_main = tokenizer_m.texts_to_sequences(main_essay_v)
        lens = [len(e) for e in sequences_train_main]

        model_inputs_t += [
            pad_sequences(sequences_train_main,
                          maxlen=min(max(lens), data.MAX_WORDS))
        ]
        model_inputs_v += [
            pad_sequences(sequences_valid_main,
                          maxlen=model_inputs_t[-1].shape[1])
        ]

        sequence_length_main = model_inputs_t[-1].shape[1]

    # Persing sequence to sequence
    sequence_length_pseq = None

    if args.mp_pseq or args.mp_model_type == "only_pseq":
        tokenizer_pseq = model.create_vocab_seq(pseq_t, char_level=True)

        with open(
                os.path.join(out_dir,
                             "tokenizer_pseq_f{}.pickle".format(args.fold)),
                "wb") as f:
            pickle.dump(tokenizer_pseq, f)

        sequences_train_pseq = tokenizer_pseq.texts_to_sequences(pseq_t)
        sequences_valid_pseq = tokenizer_pseq.texts_to_sequences(pseq_v)
        lens = [len(e) for e in sequences_train_pseq]

        model_inputs_t += [
            pad_sequences(sequences_train_pseq,
                          maxlen=min(max(lens), data.MAX_PARAGRAPHS))
        ]
        model_inputs_v += [
            pad_sequences(sequences_valid_pseq,
                          maxlen=model_inputs_t[-1].shape[1])
        ]

        sequence_length_pseq = model_inputs_t[-1].shape[1]

    # Prompt
    if args.mp_prompt:
        tokenizer_p = model.create_vocab_prompt(prompt_t, args)

        with open(
                os.path.join(out_dir,
                             "tokenizer_p_f{}.pickle".format(args.fold)),
                "wb") as f:
            pickle.dump(tokenizer_p, f)

        sequences_train_prompt = tokenizer_p.texts_to_sequences(prompt_t)
        sequences_valid_prompt = tokenizer_p.texts_to_sequences(prompt_v)
        lens = [len(e) for e in sequences_train_prompt]

        model_inputs_t += [
            pad_sequences(sequences_train_prompt,
                          maxlen=min(max(lens), data.MAX_WORDS))
        ]
        model_inputs_v += [
            pad_sequences(sequences_valid_prompt,
                          maxlen=model_inputs_t[-1].shape[1])
        ]

        sequence_length_prompt = model_inputs_t[-1].shape[1]

    # Create neural regression model.
    if args.mp_model_type == "only_pseq":
        mainModel = model.pseq_regression(
            sequence_length_pseq,
            args,
        )
        mainModel.summary()

    else:
        if args.mp_prompt:
            mainModel = model.create_regression_wprompt(
                pre_embed,
                tokenizer_m.word_index,
                tokenizer_p.word_index,
                sequence_length_main,
                sequence_length_pseq,
                sequence_length_prompt,
                args,
            )
            mainModel.summary()
        else:
            mainModel = model.create_regression(
                pre_embed,
                tokenizer_m.word_index,
                sequence_length_main,
                sequence_length_pseq,
                args,
            )
            mainModel.summary()

    # Load Pretrained Model
    if args.mp_preenc is not None:
        mainModel.load_weights(os.path.join(args.mp_preenc, "encoder.hdf5"),
                               by_name=True)

    print("Starting training.")

    optimizer_main = keras.optimizers.Adam(clipnorm=args.mp_clipnorm)
    mainModel.compile(optimizer=optimizer_main, loss='mse', metrics=['mse'])
    es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                       min_delta=0,
                                       patience=15,
                                       verbose=0,
                                       mode='auto',
                                       baseline=None,
                                       restore_best_weights=True)
    nbl = kr_util.NBatchLogger(
        os.path.join(out_dir, "logs_f{}.pickle".format(args.fold)))

    mainModel.fit(model_inputs_t,
                  score_t,
                  validation_data=(model_inputs_v, score_v),
                  epochs=100,
                  verbose=1,
                  callbacks=[es, nbl],
                  batch_size=32)

    mainModel.save_weights(
        os.path.join(out_dir, "regression_f{}.hdf5".format(args.fold)))

    print()
    print("# Output dir: {}".format(out_dir))
Example #7
# Assumes project-level names in scope: data, SetSVM, SVC, SVMActiveLearner,
# auc_score, FOLDIR.
import time
from collections import defaultdict

import numpy as np

def client_target(task, callback):
    key = task['key']
    params = task['params']
    labeled = task['labeled']
    (technique, classifier, dataset, kernel, fold, rep, initial, shuffled,
     queries) = key

    print 'Starting task %s...' % str(key)
    print 'Parameters: %s' % str(params)

    ids, X, y = data.get_dataset(dataset)
    # Map (bag_id, instance_id) pairs to row indices in the feature matrix X.
    id_index = {}
    for j, i in enumerate(ids):
        id_index[i] = j

    test_ids = set(data.get_fold(FOLDIR, dataset, fold))
    train_ids = set(ids) - test_ids
    # `labeled` rows are (bag, bag_id, instance_id, label) tuples.
    labeled_ids = set((l[1], l[2]) for l in labeled)
    pool_ids = train_ids - labeled_ids

    X_labeled = defaultdict(list)
    y_labeled = defaultdict(bool)
    for bag, bid, iid, yi in labeled:
        X_labeled[bag].append(X[id_index[bid, iid]])
        y_labeled[bag] |= bool(yi)
    bags_labeled = sorted(X_labeled.keys())
    X_labeled = map(np.vstack, [X_labeled[b] for b in bags_labeled])
    y_labeled = [y_labeled[b] for b in bags_labeled]

    X_pool = defaultdict(list)
    y_pool = defaultdict(bool)
    for bid, iid in pool_ids:
        X_pool[bid].append(X[id_index[bid, iid]])
        y_pool[bid] |= bool(y[id_index[bid, iid]])
    bags_pool = sorted(X_pool.keys())
    X_pool = map(np.vstack, [X_pool[b] for b in bags_pool])
    y_pool = [y_pool[b] for b in bags_pool]

    X_test = defaultdict(list)
    y_test = defaultdict(bool)
    for bid, iid in test_ids:
        X_test[bid].append(X[id_index[bid, iid]])
        y_test[bid] |= bool(y[id_index[bid, iid]])
    bags_test = sorted(X_test.keys())
    X_test = map(np.vstack, [X_test[b] for b in bags_test])
    y_test = [y_test[b] for b in bags_test]

    results = {}
    results['stats'] = {}
    results['preds'] = {}
    start = time.time()

    if classifier == 'nsk':
        cls = SetSVM(SVC, kernel, **params)
        active = SVMActiveLearner(cls, queries)
        predictions = active.learn(X_labeled, y_labeled, X_pool, y_pool,
                                   X_test)
    else:
        print 'Classifier "%s" not supported' % classifier
        callback.quit = True
        return

    results['stats']['time'] = time.time() - start
    for q, preds in enumerate(predictions):
        results['preds'][q] = {}
        for bid, y in zip(bags_test, preds):
            results['preds'][q][bid] = float(y)

    predictions = np.column_stack(predictions).T
    print predictions.shape
    if len(bags_test) > 1:
        print 'Test AUC Scores:'
        for row in predictions:
            print '\t%f' % auc_score(np.array(y_test), row.reshape((-1, )))

    print 'Finished task %s.' % str(key)
    return results
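
results['preds'] here is keyed by query round, then by bag id; a sketch of a work unit and of consuming the output, with all concrete key values hypothetical (_Callback as in the Example #2 sketch, and 'labeled' taking per-instance tuples such as those produced by setup_rep in Example #1):

task = {
    'key': ('shuffle_pos', 'nsk', 'musk1', 'rbf', 0, 1, 10, 10, 5),
    'params': {'C': 1.0},
    'labeled': rep_instances,  # (bag, bag_id, instance_id, label) tuples
}
results = client_target(task, _Callback())
for q in sorted(results['preds']):
    bag_preds = results['preds'][q]  # {bag_id: decision value} after q label queries
    print 'round %d: %d test predictions' % (q, len(bag_preds))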