Example #1
def get_pc(data, We, weight4ind, params):
    "Comput the principal component"

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
        return emb

    for i in data:
        i[0].populate_embeddings(words)  # `words` (the word-to-index map) is assumed to be defined at module scope
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)
    if params.task == "ent":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores, g1x, g1mask, g2x, g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores, g1x, g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
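The components returned by get_pc feed the usual SIF post-processing step: subtract each sentence embedding's projection onto those components. A minimal sketch of that step under the standard SIF recipe (an assumption; the helper name is hypothetical and not part of this example):

def remove_pc_sketch(emb, pc):
    # emb: (n_sentences, dim) weighted-average embeddings
    # pc:  (npc, dim) principal components as returned by get_pc (svd.components_)
    # assumes the rows of pc are orthonormal, which TruncatedSVD guarantees
    return emb - emb.dot(pc.T).dot(pc)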
Example #2
def getAccSentiment(model,words,f, params=[]):
    f = open(f,'r')
    lines = f.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; score = i[1]
        X1 = data_io.getSeq(p1,words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:
            x1,m1 = data_io.prepare_data(seq1)
            if params and params.weightfile:
                m1 = data_io.seq2weight(x1, m1, params.weight4ind)
            scores = model.scoring_function(x1,m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:
        x1,m1 = data_io.prepare_data(seq1)
        if params and params.weightfile:
            m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        scores = model.scoring_function(x1,m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds,golds)
Example #3
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    f = open(f, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    print(seq1[0])
    print(seq2[0])
    print(scores[0])
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def return_sif(sentences, words, weight4ind, param, Weights):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    # get SIF embedding
    embeddings = SIF_embedding.SIF_embedding(Weights, x, w, param) # embedding[i,:] is the embedding for sentence i
    return embeddings
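A hedged usage sketch for return_sif: the file names and parameter values below are placeholders borrowed from other examples on this page, not part of this snippet.

# hypothetical driver code for return_sif (paths and values are placeholders)
(words, Weights) = data_io.getWordmap('glove.6B.100d.txt')            # word vectors
word2weight = data_io.getWordWeight('enwiki_vocab_min200.txt', 1e-3)  # word frequencies
weight4ind = data_io.getWeight(words, word2weight)
param = params.params()
param.rmpc = 1  # remove the first principal component
embs = return_sif(['this is an example sentence',
                   'this is another sentence that is slightly longer'],
                  words, weight4ind, param, Weights)
print(embs.shape)  # (2, embedding_dim)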
Example #5
    def fit(self, sentences, We, lowercase_tokens, embeddings_format, embeddings_filepath, params, word_map, weight4ind):

        # store these off for pickling or extra transforms
        self.word_map = word_map
        self.weight4ind = weight4ind
        self.params = params
        self.lowercase_tokens = lowercase_tokens
        self.embeddings_format = embeddings_format
        self.embeddings_filepath = embeddings_filepath

        self.sentence_count = len(sentences)

        x, m = data_io.sentences2idx(sentences, self.word_map) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
        w = data_io.seq2weight(x, m, self.weight4ind) # get word weights

        # now let's do some of what happens in src/SIF_embedding.py
        # but also keep some pieces along the way
        #weighted_emb = get_weighted_average(We, x, w)
        weighted_emb = get_weighted_average_alternate(We, x, w)

        self.compute_pc(weighted_emb)

        self.trained = True

        return self.remove_pc(weighted_emb)
Example #6
def main(sentences,
         wordfile: str,
         weightfile: str,
         weightpara: float = 1e-3,
         rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters (bind the params object to a new name: assigning to `params`
    # here would shadow the imported params module and raise UnboundLocalError)
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
Example #7
def getSIFscore(sentences: list, words, weight4ind, rmpc, We, params, sx: int,
                sy: int):
    # load sentences
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('load sentences finished')

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    embeddingSize = len(embedding)
    # print('embeddingSize= ',embeddingSize)

    # collect one vector per sentence (use a fresh loop variable; `x` already holds the word-index array)
    emb = [embedding[i, :] for i in range(embeddingSize)]

    emb1 = emb[sx]
    emb2 = emb[sy]
    inn = (emb1 * emb2).sum()
    emb1norm = numpy.sqrt((emb1 * emb1).sum())
    emb2norm = numpy.sqrt((emb2 * emb2).sum())
    score = inn / emb1norm / emb2norm

    # print(sentences[sx],'--------',sentences[sy],' = ',score,'\n')
    return score
    def compute_sif_emb(self, sentences):
        # load sentences
        x1, m = data_io.sentences2idx(sentences, self.words)
        w1 = data_io.seq2weight(x1, m, self.weight4ind)

        result = get_emb(self.We, x1, w1)
        return result
Example #9
def get_embedding(sentence, words, weight4ind, params, We):
    # load sentences
    xx, mm = data_io.sentences2idx(sentence, words)
    ww = data_io.seq2weight(xx, mm, weight4ind)  # get word weights
    # get SIF embedding
    em = SIF_embedding.SIF_embedding(
        We, xx, ww, params)  # embedding[i,:] is the embedding for sentence i
    return em
Example #10
def sim_badSents(We, words, weight4ind, scoring_function, params, fpc, sent1,
                 sent2):
    seq1 = []
    seq2 = []

    X1, X2 = data_io.getSeqs(sent1, sent2, words)
    seq1.append(X1)
    seq2.append(X2)

    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    preds = np.squeeze(scores)
    preds = preds * 2 + 3
    return preds
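The final line rescales the raw score before returning it. A hedged reading (an assumption, not stated in the snippet): a cosine-style similarity in roughly [-1, 1] is mapped linearly onto the 1-5 range used by STS-style gold scores, as in the standalone sketch below.

import numpy as np

def rescale_to_sts(cosine_scores):
    # assumes the scores lie in [-1, 1]; maps them linearly onto [1, 5]
    return np.asarray(cosine_scores) * 2 + 3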
Example #11
def get_embeddings(words, We, word2weight, weight4ind, filename, params):

  # load sentences
  x, m, _ = data_io.sentiment2idx(filename, words) # x is the array of word indices, m is a mask
  w = data_io.seq2weight(x, m, weight4ind) # get word weights

  # get SIF embedding
  embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
  return embedding
Example #12
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params, fpc,
                       test_name):
    f = open(f, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    index = []
    idx = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
        index.append(idx)
        idx += 1
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    golds = np.asarray(golds)
    scores = scoring_function(We, x1, x2, m1, m2, params, fpc)
    # scores = scoring_function(We, x1, x2, m1, m2, golds, params, fpc)
    # preds = np.squeeze(scores).reshape(-1, 1)
    preds = np.squeeze(scores)
    # print('the prediction list is {}'.format(preds))

    # add SVM predictor
    # clf = pickle.load(open('../score_predictor/model_svm', 'rb'))
    # clf.fit(preds, golds)
    # preds = clf.predict(preds)

    print(preds)
    # np.save(open("../pred_list", 'wb'), preds)
    # np.save(open("../gold_list", 'wb'), golds)
    # show_result_image(preds, golds, index, fpc, test_name)
    # find_bad_scores(preds.tolist(), lower_threshold=2.5, higher_threshold=3.8)
    rmse = sqrt(mean_squared_error(golds, preds))  # square root of the MSE, i.e. the RMSE
    return pearsonr(preds, golds)[0], rmse
Example #13
def get_embs(sentences, params):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
    return embedding
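get_embs reads wordfile, weightfile and weightpara from the enclosing scope. A sketch of the module-level configuration it assumes, with placeholder values borrowed from other examples on this page rather than from this snippet:

# assumed module-level configuration for get_embs (placeholder values)
wordfile = 'glove.6B.100d.txt'          # word vector file, e.g. downloaded from the GloVe website
weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3                       # SIF weighting parameter, usually in [3e-5, 3e-3]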
Example #14
def prepare_first_pc(We, words, weight4ind, generation_function, params, fpc):
    print("reading file: {}.".format(fpc))
    # pre_calculate_first_pc(We, words, fpc, weight4ind, generation_function, params)
    file_name = fpc
    f = os.path.join("../data/", fpc)
    f = open(f, 'r')
    seq = []
    for i in f.readlines():
        X = data_io.getSeq(i, words)
        seq.append(X)
    x, m = data_io.prepare_data(seq)
    m = data_io.seq2weight(x, m, weight4ind)
    generation_function(We, x, m, params, file_name)
Example #15
def getCorrelation(model,words,f, params=[]):
    f = open(f,'r')
    lines = f.readlines()
    preds = []
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = data_io.getSeqs(p1,p2,words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1,m1 = data_io.prepare_data(seq1)
    x2,m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1,x2,m1,m2)
    preds = np.squeeze(scores)
    return pearsonr(preds,golds)[0], spearmanr(preds,golds)[0]
Example #16
def sim_getCorrelation1(We, words, file_index, weight4ind, scoring_function,
                        params):
    f = open(file_index[0], 'r')
    lines = f.readlines()
    f = open(file_index[1], 'r')
    score_lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for index in range(len(lines)):
        i = lines[index]
        j = score_lines[index]
        i = i.split("\t")
        #print(i)
        #print(i)
        p1 = i[0].lower()
        p2 = i[1].lower()
        try:
            score = float(j)
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
        except:
            # skip malformed lines (e.g. the score field cannot be parsed as a float)
            pass
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    # print(x1,x2,m1,m2)
    # print(x1.shape,x2.shape,m1.shape,m2.shape)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0]
Example #17
def generate_vecs(models, document):
    words, weight4ind, rmpc, We = models

    x, m = data_io.sentences2idx(document, words)
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
Example #18
def SIFSentEmbedding(weighttxt,
                     docfile,
                     words,
                     We,
                     weight4ind,
                     weightpara=1e-3,
                     paramm=1):
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # number of principal components to remove in SIF weighting scheme
    sentences = sent_tokenize(docfile)
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    sif_params = params.params()
    sif_params.rmpc = paramm  # treat paramm as the number of principal components to remove
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
Example #19
def sif_embedding(sen):
    import sys
    #sys.path.append("../src")
    #sys.path.append("../data")
    import data_io, params, SIF_embedding
    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # the words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x,m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:',w)
    # set parameters
    # params = params.params()
    sif_params = params.params_all()  # bind to a new name so the imported params module is not shadowed
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
Example #20
def vectorize_sif(filename):
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    #sentiment_file = '../data/sentiment-test' # sentiment data file
    #cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    #sentiment_file = '../data/clean-5.txt'
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    #x, m, _ = data_io.sentiment2idx(sentiment_file, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()
    #params = params.params()
    params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    return embedding
Example #21
def sentences2embeddings(sentences):
	"""
	Input: sentences - a list of sentences
	Output: sentence_embeddings - a list of sentence embeddings (numpy vectors of shape (1,300))

	"""
	# load sentences
	x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
	w = data_io.seq2weight(x, m, weight4ind) # get word weights

	# set parameters
	parameters = params.params()
	parameters.rmpc = rmpc
	# get SIF embedding
	embedding = SIF_embedding.SIF_embedding(We, x, w, parameters) # embedding[i,:] is the embedding for sentence i

	sentence_embeddings = []
	for i in range(len(sentences)):
		e = np.array(embedding[i,:]).reshape((1,300)) # reshape to fit into the function semantic_similarity
		sentence_embeddings.append(e)

	return sentence_embeddings
def get_sent_vec(sentences):
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # call the SIF core algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]

    return get_sent_vec
Example #23
def get_sent_vec(sentences):
    '''
    Compute sentence vectors with the SIF algorithm.
    :param sentences: a list of sentences (each element is a plain string; no need to pre-tokenize with jieba.cut)
    :return: a dict whose keys are the sentence strings and whose values are the sentence vectors
    '''
    import params
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    params = params.params()
    params.rmpc = rmpc

    # call the SIF core algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]

    return get_sent_vec
def sentences2vecs(sentences, We, words, weight4ind):
    x, m = sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    return SIF_embedding.get_weighted_average(We, x, w)
def top_filtering(logits,
                  words,
                  weight4ind,
                  We,
                  tokenizer,
                  history,
                  args,
                  params,
                  embedding1,
                  top_k=0,
                  top_p=0.0,
                  threshold=-float('Inf'),
                  filter_value=-float('Inf'),
                  current_output=None):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    if current_output is None:
        current_output = []
    assert logits.dim() == 1  # Only works for batch size 1 for now - could update but it would obfuscate the code a bit
    top_k = min(top_k, logits.size(-1))

    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
                                                                  None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits,
                                                          dim=-1),
                                                dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
            ..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        indices_to_use = sorted_indices[~sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        if len(indices_to_use) > 1:
            cands = [current_output + [idx] for idx in indices_to_use.tolist()]
            raw_cands = [
                tokenizer.decode(cand, skip_special_tokens=True)
                for cand in cands
            ]
            scores = []
            for i in raw_cands:
                sentences = [i]
                x, m = data_io.sentences2idx(sentences, words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
                scores.append(inn / emb1norm / emb2norm)
                #print(sentences)
            for idx, sim in zip(indices_to_use, scores):
                logits[idx] += sim.item()
            """
            probs = F.softmax(logits, dim=-1)
            index = []
            for i in probs:
                if i > 0:
                    index.append(i)
            prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, len(index))
            text = []
            last_utt = history[-1]
            last = tokenizer.decode(last_utt, skip_special_tokens=True)
            sentences = [last]
            # load sentences
            x, m = data_io.sentences2idx(sentences,
                                            words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
            w = data_io.seq2weight(x, m, weight4ind)  # get word weights

            # set parameters
            global params
            params.rmpc = rmpc
            # get SIF embedding
            embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
            for i in prev:
                text.append(i.item())
            for i in text:
                cand = current_output.copy()
                cand.append(i)
                indice = i
                raw_text=tokenizer.decode(cand, skip_special_tokens=True)
                sentences = [raw_text]
                x, m= data_io.sentences2idx(sentences,
                                                words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2 ).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2  * embedding2 ).sum(axis=1))
                scores = inn / emb1norm / emb2norm
                #print(scores)
                logits[indice] += scores.item()
                cand.clear()
                """

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value

    return logits
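Inside the top_p branch, top_filtering boosts each candidate token's logit by the cosine similarity between the SIF embedding of the extended candidate and embedding1. A standalone sketch of that similarity computation (the helper name is hypothetical; it assumes both inputs are (n, dim) numpy arrays as returned by SIF_embedding.SIF_embedding):

import numpy as np

def sif_cosine(embedding1, embedding2):
    # row-wise cosine similarity between two batches of SIF embeddings
    inn = (embedding1 * embedding2).sum(axis=1)
    emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
    emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
    return inn / emb1norm / emb2norm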
def sample_sequence(personality,
                    history,
                    tokenizer,
                    model,
                    args,
                    words,
                    weight4ind,
                    We,
                    current_output=None):
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []
    last_utt = history[-1]
    last = tokenizer.decode(last_utt, skip_special_tokens=True)
    sentences = [last]
    # load sentences
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    rmpc = 1  # number of principal components to remove in SIF weighting scheme

    # set parameters
    global params
    params.rmpc = rmpc
    # get SIF embedding
    embedding1 = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    for i in range(args.max_length):
        instance, _ = build_input_from_segments(personality,
                                                history,
                                                current_output,
                                                tokenizer,
                                                with_eos=False)

        input_ids = torch.tensor(instance["input_ids"],
                                 device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"],
                                      device=args.device).unsqueeze(0)
        temperature = 1.0
        top_k = 0
        top_p = 0.9

        logits = model(input_ids, token_type_ids=token_type_ids)
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]
        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits,
                               words,
                               weight4ind,
                               We,
                               tokenizer,
                               history,
                               args,
                               params,
                               embedding1,
                               top_k=top_k,
                               top_p=top_p,
                               current_output=current_output)
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(
            probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    warnings.warn(
                        "Warning: model generating special token with probability 1."
                    )
                    break  # avoid infinitely looping over special token
                prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())

    return current_output
Example #27
def train_util(model, train_data, dev, test, train, words, params):
    "utility function for training the model"
    start_time = time()
    try:
        for eidx in range(params.epochs):
            kf = data_io.get_minibatches_idx(len(train_data),
                                             params.batchsize,
                                             shuffle=True)
            uidx = 0
            for _, train_index in kf:
                uidx += 1
                batch = [train_data[t] for t in train_index]
                # load the word ids
                for i in batch:
                    i[0].populate_embeddings(words)
                    if not params.task == "sentiment":
                        i[1].populate_embeddings(words)
                # load the data
                if params.task == "ent":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataEntailment(batch)
                elif params.task == "sim":
                    (scores, g1x, g1mask, g2x,
                     g2mask) = data_io.getDataSim(batch, model.nout)
                elif params.task == "sentiment":
                    (scores, g1x, g1mask) = data_io.getDataSentiment(batch)
                else:
                    raise ValueError('Task should be ent, sim, or sentiment.')
                # train
                if not params.task == "sentiment":
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                        g2mask = data_io.seq2weight(g2x, g2mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g2x, g1mask,
                                                g2mask)
                else:
                    if params.weightfile:
                        g1mask = data_io.seq2weight(g1x, g1mask,
                                                    params.weight4ind)
                    cost = model.train_function(scores, g1x, g1mask)
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN detected')
                # undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[0].unpopulate_embeddings()
                    if not params.task == "sentiment":
                        i[1].representation = None
                        i[1].unpopulate_embeddings()
            # evaluate
            if params.task == "sim":
                dp, ds = eval.supervised_evaluate(model, words, dev, params)
                tp, ts = eval.supervised_evaluate(model, words, test, params)
                rp, rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", dp, ds, tp, ts, rp, rs))
            elif params.task == "ent" or params.task == "sentiment":
                ds = eval.supervised_evaluate(model, words, dev, params)
                ts = eval.supervised_evaluate(model, words, test, params)
                rs = eval.supervised_evaluate(model, words, train, params)
                print(("evaluation: ", ds, ts, rs))
            else:
                raise ValueError('Task should be ent, sim, or sentiment.')
            print(('Epoch ', (eidx + 1), 'Cost ', cost))
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("Training interrupted")
    end_time = time()
    print(("total time:", (end_time - start_time)))
Example #28
	#in sentence_file or '5-15' in sentence_file:
	batch_num = 0
	with open(sentence_file, 'r', encoding='utf-8') as fr:
		print('Processing file', sentence_file, '...')
		p = hnswlib.Index(space='cosine', dim=dimension)
		p.init_index(max_elements = num_elements, ef_construction = 2000, M = 80)
		p.set_ef(1000)
		# Set number of threads used during batch search/construction
		# By default using all available cores
		p.set_num_threads(30)
		for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
			sents = list(map(str.strip, n_lines))
			sent_id = list(map(lambda x:int(x.split('\t')[0]), sents))
			sentences = list(map(lambda x:x.split('\t')[-1], sents))
			x, m = data_io.sentences2idx(sentences, words)
			w = data_io.seq2weight(x, m, weight4ind)

			# get SIF embedding
			embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
			embeddings.normalize(embedding, ["unit", "center"])

			p.add_items(embedding,sent_id)
			print('Finished batch', batch_num, '.', end = '\r')
			batch_num += 1
	print('\nFinished loading', sentence_file, '.')
	out_file = sentence_file+'.ann'
	p.save_index(out_file)
	print('Finished saving', out_file, '.')
	del p

Example #29
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")

# x is the array of word indices, m is the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sample_sents, glove_words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
sample_embedding = SIF_embedding.SIF_embedding(
    We, x, w, params)  # embedding[i,:] is the embedding for sentence i
print("shape of sample sentence embedding is: " + str(sample_embedding.shape))

# serialize for future use
numpy.save('sample_sentence_embedding.npy', sample_embedding)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataname',
                        default='t6',
                        help='dataset name',
                        choices=['t6', 't26', '2C'])
    parser.add_argument('-c',
                        '--classifiername',
                        default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classfier

    # Original SIF paper used glove.840B.300d, we use the ones that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimension if needed
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = [
                "sandy", "queensland", "boston", "west_texas", "oklahoma",
                "alberta"
            ]
            test_list = [
                "{}_glove_token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = [
                "2012_Colorado_wildfires", "2013_Queensland_floods",
                "2013_Boston_bombings", "2013_West_Texas_explosion",
                "2013_Alberta_floods", "2013_Colorado_floods",
                "2013_NY_train_crash"
            ]
            test_list = [
                "{}-tweets_labeled.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = [
                "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                "Boston", "Brisbane", "Dublin", "London", "Sydney"
            ]
            test_list = [
                "{}2C.csv.token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}2C_training.csv".format(disaster) for disaster in disasters
            ]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train,
                                         weight4ind)  # get word weights

            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(
                We, xtrain_windx, w_train,
                paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)

            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(
                We, xtest_windx, w_test,
                paramsss)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name,
                                                 clf_name + str(dim),
                                                 accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim),
                                            roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name,
                                                  clf_name + str(dim),
                                                  precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim),
                                               recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim),
                                           f1_list))
        print(
            "{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
            .format(data_name, clf_name + str(dim), np.mean(accu_list),
                    np.std(accu_list), np.mean(roc_list), np.std(roc_list),
                    np.mean(f1_list), np.std(f1_list), np.mean(precision_list),
                    np.std(precision_list), np.mean(recall_list),
                    np.std(recall_list)))