def doc_word_embed0(path, no_add_set):
    with open(path, 'r') as file:
        lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one list per input line
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        for w in sentence:
            w = w.text.lower()
            # keep each in-vocabulary word once, skipping words already in no_add_set
            if w in embed_map and w not in added_set:
                added_set.add(w)
                words_ar.append(w)
                word_embeds.append(embed_map[w])
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check: collapse all embeddings onto the first word's embedding
        word_embeds[:] = word_embeds[0]
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
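# Hedged usage sketch (not part of the original module; the corpus path is a placeholder):
# doc_word_embed0 returns one 100-dim GloVe vector per distinct in-vocabulary word.
def _example_doc_word_embed0(corpus_path='data/content.txt'):
    words, embeds = doc_word_embed0(corpus_path, no_add_set=set())
    assert embeds.shape == (len(words), 100)
    return words, embeds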
def get_elmo_fea(data, op, wg):
    '''
    Adapted from the public kernel https://www.kaggle.com/wochidadonggua/elmo-baseline,
    modified to concatenate all 3 ELMo layers.
    Assumes these imports are available in this module:
        from allennlp.commands.elmo import ElmoEmbedder
        import allennlp.data.tokenizers.word_tokenizer as word_tokenizer
    '''
    def get_nearest(slot, target):
        # back off to the closest token start offset at or before target
        for i in range(target, -1, -1):
            if i in slot:
                return i

    # add parameter cuda_device=0 to use the GPU
    elmo = ElmoEmbedder(options_file=op, weight_file=wg)
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
    vectors = elmo.embed_sentences(tokens)
    ans = []
    for i, vector in enumerate(vectors):
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()
        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']
        # snap offsets that do not fall on a token boundary to the nearest preceding token
        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)
        # P is a single token. For A and B, average over the tokens in the span.
        emb_P = vector[:, idx[i].index(P_offset), :]
        emb_A = np.mean(vector[:, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :], axis=1)
        emb_B = np.mean(vector[:, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :], axis=1)
        # concatenate the 3 ELMo layers for A, B and the pronoun: 9 x 1024 features per example
        ans.append(np.concatenate([emb_A[0], emb_A[1], emb_A[2],
                                   emb_B[0], emb_B[1], emb_B[2],
                                   emb_P[0], emb_P[1], emb_P[2]], axis=0).reshape(1, -1))
    emb = np.concatenate(ans, axis=0)
    return emb
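# Hedged usage sketch (not part of the original kernel; the GAP tsv path and the ELMo
# option/weight paths below are placeholders): one row of 3 layers x 3 spans x 1024 dims
# is produced per example.
def _example_get_elmo_fea(options_path='elmo_options.json', weights_path='elmo_weights.hdf5'):
    import pandas as pd
    df = pd.read_csv('input/gap-test.tsv', sep='\t')  # placeholder GAP split
    feats = get_elmo_fea(df, options_path, weights_path)
    assert feats.shape == (len(df), 9 * 1024)
    return feats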
def doc_word_embed(path, no_add_set, content_lines=None):
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    # flatten all lines into a single token sequence
    tokens_l1 = []
    for sentence_l in tokens_l:
        tokens_l1.extend(sentence_l)
    tokens_l = [tokens_l1]
    n_avg = 5  # number of consecutive in-vocabulary words averaged into one embedding
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for j, w in enumerate(sentence):
            w = w.text.lower()
            if w in embed_map:  # and w not in added_set:
                if cur_counter == n_avg:  # or j == len(sentence)-1:
                    # flush the chunk: emit the average of the last n_avg word embeddings,
                    # labeled by the word that triggered the flush
                    added_set.add(w)
                    words_ar.append(w)
                    word_embeds.append(cur_embed / n_avg)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0
                else:
                    cur_counter += 1
                    cur_embed += embed_map[w]
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check: collapse all embeddings onto their mean
        word_embeds[:] = word_embeds.mean(0)
    return words_ar, word_embeds
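# Hedged sketch (hypothetical helper, not used by the module): mirrors the n_avg chunking in
# doc_word_embed on a plain list of (word, vector) pairs, to make the flush behaviour explicit.
# A chunk is emitted once the counter reaches n_avg, it is labeled by the word that triggered
# the flush, and that word's own vector is not added to any chunk.
def _chunk_average_sketch(word_vecs, n_avg=5):
    labels, chunks = [], []
    cur, count = None, 0
    for word, vec in word_vecs:
        if count == n_avg:
            labels.append(word)
            chunks.append(cur / n_avg)
            cur, count = None, 0
        else:
            cur = vec.clone() if cur is None else cur + vec
            count += 1
    return labels, chunks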
def doc_sentence_embed(path):
    with open(path, 'r') as file:
        lines = file.readlines()
    # split each line into sentences on '.'
    lines1 = []
    for line in lines:
        lines1.extend(line.lower().split('.'))
    lines = lines1
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    tokens_l = tk.batch_tokenize(lines)
    word_embeds = []
    words_ar = []
    for sentence in tokens_l:
        if len(sentence) < 3:
            continue
        sentence_embed = 0
        has_known_word = False
        for w in sentence:
            w = w.text.lower()
            if w in embed_map:
                sentence_embed += embed_map[w]
                has_known_word = True
        # skip sentences with no in-vocabulary words
        if not has_known_word:
            continue
        words_ar.append(sentence)
        # average over all tokens in the sentence, not just the in-vocabulary ones
        word_embeds.append(sentence_embed / len(sentence))
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
import torch
import sklearn.decomposition as decom
import data
import utils
import numpy as np
import numpy.linalg as linalg
import re
import pdb

# Guard allennlp behind a flag, as some users reported installation issues.
USE_ALLENNLP = False
if USE_ALLENNLP:
    import allennlp.data.tokenizers.word_tokenizer as tokenizer
    from allennlp.data.tokenizers.word_filter import StopwordFilter
    tk = tokenizer.WordTokenizer()
    stop_word_filter = StopwordFilter()
else:
    print('Note: using rudimentary tokenizer; for better results enable allennlp.')
    stop_word_filter = utils.stop_word_filter()
    tk = utils.tokenizer()


def doc_word_embed_content_noise(content_path, noise_path, whiten_path=None,
                                 content_lines=None, noise_lines=None, opt=None):
    '''
    Combines content and noise word embeddings.
    '''
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(content_path, no_add_set,
                                                             content_lines=content_lines)
    words_set = set(content_words_ar)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(noise_path, set(content_words_ar),
                                                         content_lines=noise_lines)
def get_elmo_emb(data_name, op, wg):
    '''
    Builds a DataFrame of averaged ELMo embeddings for the GAP split input/{data_name}.tsv.
    Assumes the same allennlp imports as get_elmo_fea plus pandas (import pandas as pd),
    and that the get_nearest offset back-off helper is available at module scope.
    '''
    elmo = ElmoEmbedder(options_file=op, weight_file=wg, cuda_device=0)
    # data = pd.read_csv("input/gap-validation.tsv", sep='\t')
    data = pd.read_csv(f'input/{data_name}.tsv', sep='\t')
    index = data.index
    columns = ['emb_A', 'emb_B', 'emb_P', 'label']
    emb = pd.DataFrame(index=index, columns=columns)
    emb.index.name = 'ID'
    tk = word_tokenizer.WordTokenizer()
    tokens = tk.batch_tokenize(data.Text)
    idx = []
    for i in range(len(tokens)):
        idx.append([x.idx for x in tokens[i]])
        tokens[i] = [x.text for x in tokens[i]]
    vectors = elmo.embed_sentences(tokens)
    for i, vector in enumerate(vectors):
        A_l = data.iloc[i].A.split()
        B_l = data.iloc[i].B.split()
        P_offset = data.iloc[i]['Pronoun-offset']
        A_offset = data.iloc[i]['A-offset']
        B_offset = data.iloc[i]['B-offset']
        # snap offsets that do not fall on a token boundary to the nearest preceding token
        if P_offset not in idx[i]:
            P_offset = get_nearest(idx[i], P_offset)
        if A_offset not in idx[i]:
            A_offset = get_nearest(idx[i], A_offset)
        if B_offset not in idx[i]:
            B_offset = get_nearest(idx[i], B_offset)
        # average ELMo layers 1 and 2 (the contextual LSTM layers);
        # for A and B also average over the tokens in the span
        emb_P = np.mean(vector[1:3, idx[i].index(P_offset), :], axis=0, keepdims=True)
        emb_A = np.mean(vector[1:3, idx[i].index(A_offset):idx[i].index(A_offset) + len(A_l), :],
                        axis=(1, 0), keepdims=True)
        emb_A = np.squeeze(emb_A, axis=0)
        emb_B = np.mean(vector[1:3, idx[i].index(B_offset):idx[i].index(B_offset) + len(B_l), :],
                        axis=(1, 0), keepdims=True)
        emb_B = np.squeeze(emb_B, axis=0)
        emb_A = emb_A.reshape((1024,))
        emb_B = emb_B.reshape((1024,))
        emb_P = emb_P.reshape((1024,))
        label = 'Neither'
        if data.loc[i, 'A-coref']:
            label = 'A'
        if data.loc[i, 'B-coref']:
            label = 'B'
        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
    return emb
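# Hedged usage sketch (placeholder paths; the split name mirrors the GAP file naming used above):
def _example_get_elmo_emb(options_path='elmo_options.json', weights_path='elmo_weights.hdf5'):
    emb_df = get_elmo_emb('gap-test', options_path, weights_path)  # reads input/gap-test.tsv
    # each of emb_A / emb_B / emb_P is a 1024-dim vector; label is 'A', 'B' or 'Neither'
    return emb_df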