Example #1
import numpy as np

import data_io


def getAccSentiment(model, words, f, params=None):
    """Compute sentiment accuracy of model on the tab-separated examples in file f."""
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for line in lines:
        parts = line.split("\t")
        p1 = parts[0]; score = parts[1]
        X1 = data_io.getSeq(p1,words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:  # score in batches of 100 to bound memory use
            x1,m1 = data_io.prepare_data(seq1)
            if params and params.weightfile:
                m1 = data_io.seq2weight(x1, m1, params.weight4ind)
            scores = model.scoring_function(x1,m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:  # score the final partial batch
        x1,m1 = data_io.prepare_data(seq1)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = model.scoring_function(x1,m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds, golds)  # accSentiment is defined elsewhere in the evaluation code
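A hypothetical call site for this evaluator; the model, the params object (carrying weightfile and weight4ind), and the file name below are illustrative rather than taken from the original code:

# Hypothetical usage: `model` exposes scoring_function and `params` was
# built by the training script; the dev file holds "<phrase>\t<label>" lines.
acc = getAccSentiment(model, words, 'sentiment_dev.txt', params)
print('dev accuracy:', acc)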
Example #2
import numpy as np
from sklearn.decomposition import TruncatedSVD

import data_io


def get_pc(data, We, weight4ind, params):
    "Compute the principal components of the weighted-average sentence embeddings."

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    # NOTE: `words` (the word-to-index vocabulary) is not a parameter here;
    # it must be available in the enclosing scope, as in the SIF training script.
    for i in data:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
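The components returned by get_pc are typically subtracted back out of each sentence embedding, the common-component-removal step of SIF. A minimal sketch of that step, assuming emb holds the weighted-average embeddings and pc is the array returned above:

import numpy as np

def remove_pc(emb, pc):
    # Subtract each embedding's projection onto the principal components:
    # emb_i <- emb_i - (emb_i . pc^T) pc
    return emb - emb.dot(pc.T).dot(pc)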
Example #3
import data_io
import params
import SIF_embedding


def simSentenceEmb(termList, mat):
    """Compute pairwise similarities between the terms/phrases in termList
    using the SIF-master code, writing the results into mat."""
    # input
    wordfile = 'helpers/glove.6B.300d.txt'  # word vector file, can be downloaded from the GloVe website
    weightfile = 'SIFmaster/auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in the SIF weighting scheme

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    x, m = data_io.sentences2idx(
        termList, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i

    # fill the pairwise similarity matrix (sim2sentence is defined elsewhere)
    for i in range(len(embedding)):
        for j in range(len(embedding)):
            mat[i, j] = sim2sentence(embedding[i], embedding[j])

    return mat
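sim2sentence is not defined in this snippet; a plausible minimal implementation, assuming it is the usual cosine similarity between two embedding vectors:

import numpy as np

def sim2sentence(v1, v2):
    # Cosine similarity between two sentence embeddings (assumed behavior).
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0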
Example #4
import numpy as np
from scipy.stats import pearsonr, spearmanr

import data_io


def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    """Score the sentence pairs in file f and return the Pearson and
    Spearman correlations against the gold scores."""
    with open(f, 'r') as fin:
        lines = fin.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for line in lines:
        parts = line.split("\t")
        p1 = parts[0]; p2 = parts[1]; score = float(parts[2])
        X1, X2 = data_io.getSeqs(p1,p2,words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1,m1 = data_io.prepare_data(seq1)
    x2,m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We,x1,x2,m1,m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds,golds)[0], spearmanr(preds,golds)[0]
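A scoring_function compatible with the call above could compute a weighted-average embedding for each side and return the per-pair cosine similarity. The sketch below is written under that assumption and is not the exact function from the SIF codebase:

import numpy as np

def weighted_average_sim(We, x1, x2, w1, w2, params):
    # Weighted-average embedding for each sentence in the batch.
    emb1 = np.array([w1[i].dot(We[x1[i]]) / np.count_nonzero(w1[i])
                     for i in range(x1.shape[0])])
    emb2 = np.array([w2[i].dot(We[x2[i]]) / np.count_nonzero(w2[i])
                     for i in range(x2.shape[0])])
    # Per-pair cosine similarity.
    norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)
    return (emb1 * emb2).sum(axis=1) / norms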
Example #5
import data_io
import params
import SIF_embedding

# input
wordfile = '../data/glove.840B.300d.txt'  # word vector file (example path), can be downloaded from the GloVe website
weightfile = '../data/auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in the SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(
    sentences, words
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
param = params.params()  # use a distinct name so the params module is not shadowed
param.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(
    We, x, w, param)  # embedding[i,:] is the embedding for sentence i

print(embedding[0, :])
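For reference, the weight getWordWeight assigns to a word w under the SIF scheme is a / (a + p(w)), where p(w) is the word's relative frequency from weightfile and a is weightpara. A minimal sketch of that computation over a dict of raw counts (the helper name is illustrative):

def sif_weights(counts, a=1e-3):
    # counts: dict mapping word -> raw frequency count
    total = sum(counts.values())
    # weight(w) = a / (a + p(w)), with p(w) = count(w) / total
    return {w: a / (a + c / total) for w, c in counts.items()}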
Example #6
import sys
from time import time

import numpy as np

import data_io
import eval


def train_util(model, train_data, dev, test, train, words, params):
    "Utility function for training the model."
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data),
                                             params.batchsize,
                                             shuffle=True)
            uidx = 0
            for _, train_index in kf:
                uidx += 1
                batch = [train_data[t] for t in train_index]
                # load the word ids
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # load the data
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    raise ValueError('Task should be ent, sim, or sentiment.')
                # train
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask,
                                                g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # evaluate
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", dp, ds, tp, ts, rp, rs)
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print("evaluation: ", ds, ts, rs)
            else:
                raise ValueError('Task should be ent, sim, or sentiment.')
            print('Epoch ', (eidx + 1), 'Cost ', cost)
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interupted")
    end_time = time()
    print("total time:", (end_time - start_time))