#     tokenizer.save('./', 'token_test')
# else:

#     tokenizer = ByteLevelBPETokenizer(
#         "./{}-vocab.json".format('token_test'),
#         "./{}-merges.txt".format('token_test'),
#         add_prefix_space=True,
#     )

# # Now we can encode
# encoded = tokenizer.encode("will be back later.  http://plurk.com/p/rp3k7,will be back later, loooove u @mahboi #blessed")
# print(encoded.tokens)
# print(encoded.offsets)
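# The commented-out byte-level BPE block above is missing its opening branch
# (the part that originally trained and saved the tokenizer). A minimal,
# hypothetical reconstruction of that branch is sketched below; the
# file-existence check and the tweets.txt corpus are assumptions, not part of
# the original snippet.
# if not os.path.isfile("./{}-vocab.json".format('token_test')):
#     tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
#     tokenizer.train(files=["tweets.txt"], vocab_size=30000, min_frequency=2)
#     tokenizer.save('./', 'token_test')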

from tokenizers import BertWordPieceTokenizer
# My arbitrary sentence
sentence = "[CLS] will be back later.  www.facebook.com ,will be back later, loooove u @mahboi #blessed"
# Path to the BERT vocabulary file
# Instantiate a BERT WordPiece tokenizer
tokenizer = BertWordPieceTokenizer("bert-large-uncased-vocab.txt",
                                   lowercase=True,
                                   clean_text=True)
# Register an extra [LINK] token in the vocabulary
tokenizer.add_tokens(['[LINK]'])

# Pad every encoding out to 100 tokens (newer tokenizers releases name this argument `length`)
tokenizer.enable_padding(max_length=100)
WordPieceEncoder = tokenizer.encode(sentence)
# Print the ids, tokens and offsets
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)
print(tokenizer.get_vocab()['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
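# A possible follow-up (not part of the original): map raw URLs in the text
# onto the [LINK] token registered above, so each link encodes to a single id.
# The regex below is an assumption, not something the original snippet defines.
import re

url_pattern = re.compile(r"(https?://\S+|www\.\S+)")
linked = url_pattern.sub(" [LINK] ", sentence)
encoded_link = tokenizer.encode(linked)
print(tokenizer.token_to_id('[LINK]'))  # id assigned to the added token
print(encoded_link.tokens)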
# Example #2
import os
import csv
from tokenizers import BertWordPieceTokenizer

# Files with commands.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
processed_path = os.path.join(data_path, "processed")
command_templates = os.path.join(processed_path, "command_templates.csv")
command = os.path.join(processed_path, "command.csv")

# Initialize a new tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train([command_templates, command], vocab_size=100)
print("Vocabulary size: ", tokenizer.get_vocab_size())
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

# Samples from 5k - human labels.
# data_00050000_00052798.gif,"Disjoint the given stacks to form a new stack with blue, red blocks.","Make a new stack with blue, red blocks."
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks.
# data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack.
# data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block

# Now, let's use it:
#input = "I can feel the magic, can you?"
#input = "Disjoint the given stacks to form a new stack with blue, red blocks."
#input = "Make a new stack with blue, red blocks."
input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block"
print(input)
encoded = tokenizer.encode(input)  #, return_tensors="pt")
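# A small follow-up sketch (not in the original): inspect what the freshly
# trained 100-token WordPiece vocabulary produces for this command string.
print(encoded.tokens)   # sub-word pieces
print(encoded.ids)      # their integer ids
print(encoded.offsets)  # character spans back into the input string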


# Imports required by the Tweets dataset below (TRAIN_PATH, TEST_PATH and
# augment_n are expected to be defined elsewhere in the original project).
import nltk
import numpy as np
import pandas as pd
from nltk.data import load
from nltk.tokenize import TweetTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset


class Tweets(Dataset):
    def __init__(self, device='cpu', pad=150, test=False, N=4):
        self.samples = []
        self.pad = pad

        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)

        self.tokenizer.enable_padding(max_length=pad - 1)  # -1 for sentiment token

        self.tokenizer.add_special_tokens(['[POS]'])
        self.tokenizer.add_special_tokens(['[NEG]'])
        self.tokenizer.add_special_tokens(['[NEU]'])
        self.vocab = self.tokenizer.get_vocab()

        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }

        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()

        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1

        self.tweet_tokenizer = TweetTokenizer()

        data = None
        if test is True:
            data = pd.read_csv(TEST_PATH).values
            for row in data:
                tid, tweet, sentiment = tuple(row)

                pos_membership = [0] * len(tweet)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

                token_pos = [0] * len(word_to_index)
                # get pos info
                for i, (s, e) in enumerate(offsets):
                    # skip special positions: 0/101/102 are the [PAD]/[CLS]/[SEP] ids in the BERT vocab
                    if word_to_index[i] in (0, 101, 102):
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                token_pos = [0] + token_pos
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                offsets = [(0, 0)] + offsets
                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_pos = np.array(token_pos)

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'offsets': offsets,
                    'raw_tweet': tweet,
                    'pos': token_pos
                })

        else:

            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)

            for row in data:
                tid, tweet, selection, sentiment = tuple(row)

                char_membership = [0] * len(tweet)
                pos_membership = [0] * len(tweet)
                si = tweet.find(selection)
                if si < 0:
                    char_membership[0:] = [1] * len(char_membership)
                else:
                    char_membership[si:si + len(selection)] = [1] * len(selection)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

                token_membership = [0] * len(word_to_index)
                token_pos = [0] * len(word_to_index)

                # Inclusive indices
                start = None
                end = None
                for i, (s, e) in enumerate(offsets):
                    # special positions: 0/101/102 are the [PAD]/[CLS]/[SEP] ids in the BERT vocab
                    if word_to_index[i] in (0, 101, 102):
                        token_membership[i] = -1
                    elif sum(char_membership[s:e]) > 0:
                        token_membership[i] = 1
                        if start is None:
                            start = i + 1
                        end = i + 1

                # get pos info
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] in (0, 101, 102):
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                if start is None:
                    print("Data Point Error")
                    print(tweet)
                    print(selection)
                    continue
                # token_membership = torch.LongTensor(token_membership).to(device)
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                token_membership = [-1] + token_membership
                offsets = [(0, 0)] + offsets
                token_pos = [0] + token_pos

                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_membership = np.array(token_membership).astype('float')
                token_pos = np.array(token_pos)

                # Sanity check: none of the fields stored below may be None
                for field in (tid, sentiment, word_to_index, token_membership,
                              selection, tweet, start, end, offsets):
                    if field is None:
                        raise Exception('None field detected')

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'selection': token_membership,
                    'raw_selection': selection,
                    'raw_tweet': tweet,
                    'start': start,
                    'end': end,
                    'offsets': offsets,
                    'pos': token_pos
                })

    def get_splits(self, val_size=.3):
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        train_indices = indices[0:split]
        valid_indices = indices[split:]
        return train_indices, valid_indices

    def k_folds(self, k=5):
        N = len(self.samples)
        indices = np.random.permutation(N)
        return np.array_split(indices, k)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        try:
            return self.samples[idx]
        except TypeError:
            pass
        return [self.samples[i] for i in idx]
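

# A minimal usage sketch (not part of the original). It assumes the vocabulary
# file, the TRAIN_PATH CSV and the NLTK resources referenced inside Tweets are
# available; N=0 avoids the external augment_n helper.
from torch.utils.data import DataLoader, SubsetRandomSampler

dataset = Tweets(pad=150, test=False, N=0)
train_idx, valid_idx = dataset.get_splits(val_size=0.3)
train_loader = DataLoader(dataset,
                          batch_size=32,
                          sampler=SubsetRandomSampler(train_idx))

batch = next(iter(train_loader))
print(batch['tweet'].shape)      # (batch_size, pad) matrix of token ids
print(batch['selection'].shape)  # per-token selection labels (-1/0/1)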