Example #1
def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        test_x, test_y, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    nn = nlse.NLSE(E, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(test_x)
    avgF1 = f1_score(test_y, y_hat, average="macro")
    acc = accuracy_score(test_y, y_hat)
    if run_id is None:
        run_id = "NLSE"
    dataset = os.path.basename(test)
    hp = {p: hyperparams[p] for p in ["sub_size", "lrate"]}
    results = {"acc":round(acc,3), \
                "avgF1":round(avgF1,3), \
                "model":"NLSE", \
                "dataset":dataset, \
                "run_id":run_id,
                "sub_size":hyperparams["sub_size"],
                "lrate":hyperparams["lrate"]}
    cols = ["dataset", "model", "run_id", "acc", "avgF1", "sub_size"]
    helpers.print_results(
        results,
        columns=["dataset", "run_id", "lrate", "sub_size", "acc", "avgF1"])
    if res_path is not None:
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
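
A minimal call sketch for the `main` function above, assuming the pickled splits and the embedding file already exist and that `sub_size` and `lrate` are the hyperparameters NLSE expects; every file name below is a placeholder:

# hypothetical paths and values, shown only to illustrate the expected arguments
hyperparams = {"sub_size": 10, "lrate": 0.01}
results = main("train.pkl", "dev.pkl", "test.pkl", "embeddings.txt",
               hyperparams, run_id="nlse_run1", res_path="results.txt")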
Example #2
def hypertune(train, dev, emb_path, obj, hyperparams, res_path=None):

    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)

    best_hp = None
    best_score = 0
    for hp in hyperparams:
        #initialize model with the hyperparameters
        nn = nlse.NLSE(E, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            best_score = score
            best_hp = hp
        results = {"score": round(score, 3), "hyper": repr(hp)}
        if res_path is not None:
            helpers.save_results(results, res_path, sep="\t")
        helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
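
A sketch of the grid and scorer this function expects: `hyperparams` is an iterable of keyword dictionaries for NLSE and `obj` is any callable taking (y_true, y_pred). The value ranges and file names below are illustrative only:

from itertools import product
from functools import partial
from sklearn.metrics import f1_score

# macro-averaged F1 as the tuning objective
scorer = partial(f1_score, average="macro")
grid = [{"sub_size": s, "lrate": l}
        for s, l in product([5, 10, 20], [0.01, 0.05])]
best_hp, best_score = hypertune("train.pkl", "dev.pkl", "embeddings.txt",
                                scorer, grid, res_path="hyper_results.txt")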
Example #3
def main(lex_path, test, label_map, run_id, conf={}, dev=None, res_path=None):
	#read test data
	dt = read_dataset(test, labels=label_map.keys())
	X_test = [x[1] for x in dt]
	Y_test = [label_map[x[0]] for x in dt]
	#model
	model = LexiconSentiment(path=lex_path,**conf)
	#if dev data is passed, use this data to fit the threshold
	if dev is not None:
		dt_dev = read_dataset(dev, labels=label_map.keys())
		X_dev = [x[1] for x in dt_dev]
		Y_dev = [label_map[x[0]] for x in dt_dev]		
		print "[fitting]"
		model.fit(X_dev,Y_dev,samples=SAMPLEZ,silent=True)
		conf = model.get_params()
	#test model
	Y_hat = model.predict(X_test)		
	avgF1 = f1_score(Y_test, Y_hat,average="macro") 		
	acc = accuracy_score(Y_test, Y_hat)				
	
	results = {"acc":round(acc,3), 
			   "avgF1":round(avgF1,3),	
				"model":run_id, 
				"dataset":os.path.basename(test), 
				"run_id":run_id
				}
	results.update(conf)
	cols = ["dataset", "model", "run_id", "acc", "avgF1"]	
	helpers.print_results(results, columns=cols)
	if res_path is not None:
		#cols+=["positive_threshold","keep_scores_below","keep_scores_above"]
		helpers.save_results(results, res_path, sep="\t", columns=cols)	
	return results
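
A possible invocation, assuming a three-class task whose label strings match the dataset files; the lexicon path, file names, and label names are placeholders:

# hypothetical label map and paths for illustration
label_map = {"negative": 0, "neutral": 1, "positive": 2}
results = main("lexicon.txt", "test.txt", label_map, "lexicon_baseline",
               dev="dev.txt", res_path="results.txt")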
Example #4
def hypertuner(train,
               dev,
               test,
               emb_path,
               obj,
               hyperparams,
               run_id,
               res_path=None,
               model_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]

    dataset = os.path.basename(test)
    best_hp = None
    best_score = 0
    best_results = None

    for hp in hyperparams:
        #initialize model with the hyperparameters
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hp)
        # nn = nlse.NLSE(E, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_test)
        score = obj(Y_test, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            if model_path is not None:
                nn.save(model_path)
            best_score = score
            best_hp = hp
            acc = accuracy_score(Y_test, Y_hat)
            avgF1 = f1_score(Y_test, Y_hat, average="macro")
            rep_hp = {p: hp[p] for p in ["sub_size", "lrate"]}
            best_results = {"acc":round(acc,3), \
                    "avgF1":round(avgF1,3), \
                    "model":"NLSE", \
                    "dataset":dataset, \
                    "run_id":run_id,
                    "hyper":repr(rep_hp)}
        res = {"score": round(score, 3), "hyper": repr(hp)}
        helpers.print_results(res)

    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(best_results, res_path, cols, sep="\t")
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
Example #5
def main(train,
         dev,
         test,
         emb_path,
         hyperparams,
         run_id=None,
         res_path=None,
         no_hidden=False):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]
    print "[no hidden: {}]".format(no_hidden)
    # set_trace()
    if no_hidden:
        del hyperparams["sub_size"]
        nn = nlse.BOE_plus(E,
                           label_map=label_map,
                           vocab=vocabulary,
                           **hyperparams)
    else:
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(X_test)
    avgF1 = f1_score(Y_test, y_hat, average="macro")
    acc = accuracy_score(Y_test, y_hat)
    if run_id is None:
        run_id = "NLSE"
    dataset = os.path.basename(test)
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "NLSE",
               "dataset": dataset,
               "run_id": run_id}

    helpers.print_results(results,
                          columns=["dataset", "run_id", "acc", "avgF1"])
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results, nn
Example #6
def main(train, test, run_id, features, hyperparameters={}, res_path=None):
    #train and evalute model
    if features[0].lower() == "naive_bayes":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = BernoulliNB()
        model_name = "NaiveBayes"
    elif features[0].lower() == "mlp":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = MLPClassifier(solver='lbfgs',
                              activation="logistic",
                              hidden_layer_sizes=[400])
        model_name = "MLP"
    elif features[0].lower() == "mlp-2":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = MLPClassifier(solver='lbfgs',
                              activation="logistic",
                              hidden_layer_sizes=[400, 100])
        model_name = "MLP-2"
    else:
        X_train, Y_train = get_features(train, features)
        X_test, Y_test = get_features(test, features)
        #initialize model with the hyperparameters
        model = SGDClassifier(random_state=1234, **hyperparameters)
        model_name = "+".join(features)
    model.fit(X_train, Y_train)
    Y_hat = model.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    results = {"acc":round(acc,3), \
      "avgF1":round(avgF1,3),	\
      "model":model_name, \
      "dataset":os.path.basename(test), \
      "run_id":run_id, \
      "train_size":len(X_train), \
      "test_size":len(X_test), \
      "hyper":repr(hyperparameters)}
    cols = ["dataset", "run_id", "acc", "avgF1", "hyper"]
    helpers.print_results(results, columns=cols)
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
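
Call sketches for the branches above; file names are placeholders, and the SGD hyperparameters can be any keyword arguments scikit-learn's SGDClassifier accepts:

# Bernoulli Naive Bayes over a binary bag-of-words (hypothetical paths)
res_nb = main("train.txt", "test.txt", "run1", ["naive_bayes"])

# SGD-trained linear model over an arbitrary feature set
res_sgd = main("train.txt", "test.txt", "run1", ["bow-bin"],
               hyperparameters={"alpha": 0.0001, "penalty": "l2"},
               res_path="results.txt")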
Example #7
def hypertune(train, dev, features, obj, hyperparams, res_path=None):
    X_train, Y_train = get_features(train, features)
    X_dev, Y_dev = get_features(dev, features)
    best_hp = None
    best_score = 0
    for hp in hyperparams:
        #initialize model with the hyperparameters
        model = SGDClassifier(random_state=1234, **hp)
        model.fit(X_train, Y_train)
        Y_hat = model.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        # print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            best_score = score
            best_hp = hp
        results = {"score": round(score, 3), "hyper": repr(hp)}
        if res_path is not None:
            helpers.save_results(results, res_path)
        helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp),
                                               round(best_score, 3))
    return best_hp, best_score
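
Here `hyperparams` is again an iterable of keyword dictionaries, this time for SGDClassifier; a small grid could be built as in this sketch (values and file names are illustrative):

from itertools import product
from sklearn.metrics import accuracy_score

# alpha/penalty grid for the SGD-trained linear model
grid = [{"alpha": a, "penalty": p}
        for a, p in product([1e-4, 1e-3, 1e-2], ["l1", "l2"])]
best_hp, best_score = hypertune("train.txt", "dev.txt", ["bow-bin"],
                                accuracy_score, grid)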
Example #8
def main(train,
         test,
         dev,
         embs_path,
         total_epochs=10,
         weights_file=None,
         results_path=None):
    print "[reading data]"
    train_data = data.read_dataset(train)
    train_docs = [x[1] for x in train_data]
    train_Y = [x[0] for x in train_data]

    test_data = data.read_dataset(test)
    test_docs = [x[1] for x in test_data]
    test_Y = [x[0] for x in test_data]

    dev_data = data.read_dataset(dev)
    dev_docs = [x[1] for x in dev_data]
    dev_Y = [x[0] for x in dev_data]

    #convert labels to one-hot
    label_map = vectorizer.build_vocabulary(test_Y + train_Y + dev_Y)
    train_Y = vectorizer.one_hot(label_map, train_Y)
    dev_Y = vectorizer.one_hot(label_map, dev_Y)
    test_Y = vectorizer.one_hot(label_map, test_Y)
    #convert to argmax
    test_Y = np.argmax(test_Y, axis=1)
    n_labels = len(train_Y[0])
    print "[loading embeddings]"
    wvs = embeddings.embeddings_to_dict(embs_path)
    # preprocessor for texts
    print "[preprocessing...]"
    all_docs = train_docs + test_docs + dev_docs
    max_len = max([len(x.split()) for x in all_docs])
    print "[max len: {}]".format(max_len)
    p = CNN_text.Preprocessor(max_features=len(wvs), maxlen=max_len, wvs=wvs)
    p.preprocess(all_docs)
    train_X = p.build_sequences(train_docs)
    test_X = p.build_sequences(test_docs)
    dev_X = p.build_sequences(dev_docs)
    # then the CNN
    cnn = CNN_text.TextCNN(p,
                           n_labels=n_labels,
                           filters=[2, 3],
                           n_filters=50,
                           dropout=0.0)

    if weights_file:
        cnn.model.load_weights(weights_file)

    epochs_per_iter = 1
    epochs_so_far = 0
    print "training"
    while epochs_so_far < total_epochs:
        cnn.train(train_X,
                  train_Y,
                  nb_epoch=epochs_per_iter,
                  X_val=dev_X,
                  y_val=dev_Y)
        epochs_so_far += epochs_per_iter
        Y_hat = cnn.predict(dev_X)
        acc = accuracy_score(np.argmax(dev_Y, axis=1), Y_hat)
        avgF1 = f1_score(np.argmax(dev_Y, axis=1), Y_hat, average="macro")
        res={"acc":round(acc,3), \
            "avgF1":round(avgF1,3)}
        helpers.print_results(res)
        #print("acc @ epoch %s: %s" % (epochs_so_far, acc))

    Y_hat = cnn.predict(test_X)
    acc = accuracy_score(test_Y, Y_hat)
    avgF1 = f1_score(test_Y, Y_hat, average="macro")

    results = {"acc":round(acc,3), \
            "avgF1":round(avgF1,3), \
            "model":"CNN", \
            "dataset":os.path.basename(test), \
            "run_id":"NEURAL"}
    helpers.print_results(results)
    if results_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, results_path, cols, sep="\t")
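
A possible call to the CNN runner above, assuming plain-text dataset files readable by `data.read_dataset` and an embedding file readable by `embeddings.embeddings_to_dict`; all names are placeholders:

# hypothetical paths for illustration
main("train.txt", "test.txt", "dev.txt", "embeddings.txt",
     total_epochs=20, results_path="results.txt")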
Example #9
                conf, _ = hypertune(tr_fname, ts_fname, args.emb_path,
                                    scorer, hyperparams_grid,
                                    res_path=hyper_results_path)
            #run model with the best hyperparams
            res = main(tr_fname,
                       dev_fname,
                       ts_fname,
                       args.emb_path,
                       conf,
                       run_id=args.run_id,
                       res_path=cv_results_path)
            results.append(res)

        accs = [res["acc"] for res in results]
        f1s = [res["avgF1"] for res in results]

        cv_res = {"acc_mean":round(np.mean(accs),3), \
                "acc_std":round(np.std(accs),3), \
                "avgF1_mean":round(np.mean(f1s),3), \
                "avgF1_std":round(np.std(f1s),3), \
                "model":"NLSE", \
                "dataset":os.path.basename(args.test), \
                "run_id":args.run_id}
        helpers.print_results(cv_res)
        #save the results of each run
        if args.res_path is not None:
            cols = [
                "dataset", "run_id", "model", "acc_mean", "acc_std",
                "avgF1_mean", "avgF1_std"
            ]
            helpers.save_results(cv_res, args.res_path, sep="\t", columns=cols)
Example #10
		predictor.fit(X_train, Y_train)
		Y_hat = predictor.predict(X_test)
		#evaluate
		avgF1 = f1_score(Y_test, Y_hat, average="macro")		
		acc = accuracy_score(Y_test, Y_hat)
		fname = os.path.basename(args.test)
		run_id = args.run_id
		if run_id is None: run_id = "+".join(args.features)				
		results = {"dataset": fname,
					"acc": round(acc, 3), 
				   "avgF1": round(avgF1, 3),
                   "features": "+".join(args.features)+"@"+args.model,				   
				   "run_id": run_id}
		cols = ["dataset", "run_id", "features", "acc", "avgF1"]
		#report
		helpers.print_results(results, columns=cols)
		if args.res_path is not None:
			helpers.save_results(results, args.res_path, columns=cols)
	elif args.type == "continuous":
		#choose model
		if args.model == "linear":
			predictor = SVR(kernel='linear')
		elif args.model == "l1":
			predictor = LinearSVR(loss='epsilon_insensitive')
		elif args.model == "rbf":
			predictor = SVR(kernel='rbf')
		#train and predict
		predictor.fit(X_train, Y_train)
		Y_hat = predictor.predict(X_test)
		#evaluate
		pred_rank = sp.stats.stats.rankdata(Y_hat)
Example #11
    # logging.info('Initialized model from embeddings %s' % args.emb)
    # logging.info("Training data: %s" % args.tr)
    # logging.info("Dev data: %s" % args.dev)
    with open(args.tr, 'rb') as fid:
        train_x, train_y, vocabulary, label_map, E, st, ed = cPickle.load(fid)
        E = E.astype(theano.config.floatX)
    with open(args.dev, 'rb') as fid:
        dev_x, dev_y, _, _ = cPickle.load(fid)
    with open(args.ts, 'rb') as fid:
        ts_x, ts_y, _, _ = cPickle.load(fid)

    nn = train_nlse_reg(train_x, train_y, vocabulary, E, st, ed, args)
    tau = evaluate(nn, ts_x, ts_y)
    fname = os.path.basename(args.ts)
    run_id = args.run_id
    if run_id is None: run_id = "NLSE"
    results = {
        "tau": round(tau, 3),
        "features": "NLSE",
        "subsize": args.sub_size,
        "lrate": args.lrate,
        "dataset": fname,
        "run_id": run_id
    }
    cols = ["dataset", "run_id", "features", "tau"]
    helpers.print_results(
        results, columns=["dataset", "run_id", "lrate", "subsize", "tau"])
    if args.res_path is not None:
        helpers.save_results(results, args.res_path, columns=cols)