# Disabled remainder of an earlier ByteLevelBPETokenizer experiment:
#     tokenizer.save('./', 'token_test')
# else:
#     tokenizer = ByteLevelBPETokenizer(
#         "./{}-vocab.json".format('token_test'),
#         "./{}-merges.txt".format('token_test'),
#         add_prefix_space=True,
#     )
#
# # Now we can encode
# encoded = tokenizer.encode("will be back later. http://plurk.com/p/rp3k7,will be back later, loooove u @mahboi #blessed")
# print(encoded.tokens)
# print(encoded.offsets)

from tokenizers import BertWordPieceTokenizer

# Sample text pushed through the WordPiece tokenizer below.
sentence = "[CLS] will be back later. www.facebook.com ,will be back later, loooove u @mahboi #blessed"

# Build a BERT WordPiece tokenizer from the uncased vocabulary file,
# register one extra token and switch on padding.
tokenizer = BertWordPieceTokenizer(
    "bert-large-uncased-vocab.txt",
    lowercase=True,
    clean_text=True,
)
tokenizer.add_tokens(['[LINK]'])
tokenizer.enable_padding(max_length=100)

WordPieceEncoder = tokenizer.encode(sentence)

# Dump the ids, tokens and offsets of the encoding, in that order.
for _attr in ("ids", "tokens", "offsets"):
    print(getattr(WordPieceEncoder, _attr))

# Id stored in the vocabulary for '[PAD]', then the ids decoded back to text.
print(tokenizer.get_vocab()['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
import csv
import os  # required by the os.path.join calls below

from tokenizers import BertWordPieceTokenizer

# Files with commands.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
processed_path = os.path.join(data_path, "processed")
command_templates = os.path.join(processed_path, "command_templates.csv")
command = os.path.join(processed_path, "command.csv")

# Initialize a new (untrained) WordPiece tokenizer.
tokenizer = BertWordPieceTokenizer()

# Then train it on both command files.
tokenizer.train([command_templates, command], vocab_size=100)

# Show the learned vocabulary: its size, then every token with its id.
print("Vocabulary size: ", tokenizer.get_vocab_size())
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

# Samples from 5k - human labels.
# data_00050000_00052798.gif,"Disjoint the given stacks to form a new stack with blue, red blocks.","Make a new stack with blue, red blocks."
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks.
# data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack.
# data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block

# Now, let's use it.
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# code outside this snippet may still refer to it.
#input = "I can feel the magic, can you?"
#input = "Disjoint the given stacks to form a new stack with blue, red blocks."
#input = "Make a new stack with blue, red blocks."
input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block"
print(input)

encoded = tokenizer.encode(input)  #, return_tensors="pt")
class Tweets(Dataset):
    """In-memory dataset of tokenized tweets with sentiment and POS features.

    Every CSV row is turned into one dict stored in ``self.samples``:

    * test rows are unpacked as ``(tid, tweet, sentiment)``;
    * train rows are unpacked as ``(tid, tweet, selection, sentiment)`` and
      additionally carry the token-level mask of the selected text plus the
      inclusive ``start``/``end`` token indices of that selection.

    In every sample a sentiment token id is prepended to the encoded tweet,
    so all per-token arrays are one element longer than the tokenizer output
    and indices into them are shifted by one.
    """

    # Token ids that never receive a POS label or a selection flag.
    # Presumably [PAD], [CLS] and [SEP] of the bert-base-uncased vocabulary
    # loaded in __init__ -- confirm against the vocab file.
    _SPECIAL_IDS = (0, 101, 102)

    def __init__(self, device='cpu', pad=150, test=False, N=4):
        """Load, tokenize and annotate the whole data file.

        Args:
            device: Accepted for interface compatibility; not used here.
            pad: Length of every sample including the prepended sentiment
                token (the tokenizer itself pads to ``pad - 1``).
            test: When exactly ``True`` read ``TEST_PATH``; otherwise read
                ``TRAIN_PATH``.
            N: Passed to ``augment_n`` for the training data when ``N > 0``.

        Raises:
            Exception: If a training sample contains a ``None`` field.
        """
        self.samples = []
        self.pad = pad

        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)
        self.tokenizer.enable_padding(max_length=pad - 1)  # -1 for sentiment token
        for special in ('[POS]', '[NEG]', '[NEU]'):
            self.tokenizer.add_special_tokens([special])
        self.vocab = self.tokenizer.get_vocab()

        # Sentiment label -> id of the matching special token.
        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }

        # POS tag -> integer id; 0 is reserved for unknown / untagged.
        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()
        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1

        self.tweet_tokenizer = TweetTokenizer()

        if test is True:
            for row in pd.read_csv(TEST_PATH).values:
                self.samples.append(self._build_test_sample(row))
        else:
            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)
            for row in data:
                sample = self._build_train_sample(row)
                # None marks a row whose selection hit no token; it is skipped.
                if sample is not None:
                    self.samples.append(sample)

    @staticmethod
    def _align_pos_to_chars(tweet, tagged_tokens, pos_set):
        """Return one POS id per character of ``tweet``.

        ``tagged_tokens`` is a sequence of ``(token, tag)`` pairs in tweet
        order. Each token is searched for from the end of the previous match,
        so repeated tokens map to successive occurrences. Tokens that cannot
        be found, or whose tag is not in ``pos_set``, leave the characters
        at 0.
        """
        labels = [0] * len(tweet)
        cursor = 0
        for token, tag in tagged_tokens:
            start = tweet.find(token, cursor)
            if start < 0:
                # Token text does not occur literally in the tweet; skipping
                # avoids a negative-index slice that would resize `labels`.
                continue
            end = start + len(token)
            if tag in pos_set:
                labels[start:end] = [pos_set[tag]] * len(token)
            # Continue after this match so whitespace between tokens is
            # accounted for.
            cursor = end
        return labels

    @classmethod
    def _token_pos_labels(cls, ids, offsets, char_labels):
        """Return one POS id per token: the most frequent id in its span.

        Special tokens and zero-width spans keep the label 0.
        """
        token_pos = [0] * len(ids)
        for i, (s, e) in enumerate(offsets):
            if ids[i] in cls._SPECIAL_IDS:
                continue
            if s != e:
                sub = char_labels[s:e]
                token_pos[i] = max(set(sub), key=sub.count)
        return token_pos

    def _char_pos_labels(self, tweet):
        """POS-tag ``tweet`` and project the tags onto its characters."""
        words = self.tweet_tokenizer.tokenize(tweet)
        tagged = nltk.pos_tag(words)
        pairs = [(word, item[1]) for word, item in zip(words, tagged)]
        return self._align_pos_to_chars(tweet, pairs, self.pos_set)

    def _build_test_sample(self, row):
        """Build the sample dict for one ``(tid, tweet, sentiment)`` row."""
        tid, tweet, sentiment = tuple(row)

        char_pos = self._char_pos_labels(tweet)
        encoding = self.tokenizer.encode(tweet)
        ids = encoding.ids
        offsets = encoding.offsets
        token_pos = self._token_pos_labels(ids, offsets, char_pos)

        # Prepend the sentiment token and matching neutral entries.
        token_pos = [0] + token_pos
        ids = [self.sent_t[sentiment]] + ids
        offsets = [(0, 0)] + offsets

        return {
            'tid': tid,
            'sentiment': sentiment,
            'tweet': np.array(ids),
            'offsets': np.array([[off[0], off[1]] for off in offsets]),
            'raw_tweet': tweet,
            'pos': np.array(token_pos)
        }

    def _build_train_sample(self, row):
        """Build the sample dict for one training row.

        Returns ``None`` (after printing a diagnostic) when no token overlaps
        the selected text.
        """
        tid, tweet, selection, sentiment = tuple(row)

        # Per-character flag: 1 where the character belongs to the selection.
        # If the selection is not a substring, the whole tweet is selected.
        char_membership = [0] * len(tweet)
        si = tweet.find(selection)
        if si < 0:
            char_membership[0:] = [1] * len(char_membership)
        else:
            char_membership[si:si + len(selection)] = [1] * len(selection)

        char_pos = self._char_pos_labels(tweet)
        encoding = self.tokenizer.encode(tweet)
        ids = encoding.ids
        offsets = encoding.offsets

        # Token mask: -1 special token, 1 selected, 0 otherwise.
        # start/end are inclusive and already shifted by one for the
        # sentiment token that is prepended below.
        token_membership = [0] * len(ids)
        start = None
        end = None
        for i, (s, e) in enumerate(offsets):
            if ids[i] in self._SPECIAL_IDS:
                token_membership[i] = -1
            elif sum(char_membership[s:e]) > 0:
                token_membership[i] = 1
                if start is None:
                    start = i + 1
                end = i + 1

        token_pos = self._token_pos_labels(ids, offsets, char_pos)

        if start is None:
            print("Data Point Error")
            print(tweet)
            print(selection)
            return None

        ids = np.array([self.sent_t[sentiment]] + ids)
        token_membership = np.array([-1] + token_membership).astype('float')
        offsets = np.array([[off[0], off[1]] for off in [(0, 0)] + offsets])
        token_pos = np.array([0] + token_pos)

        required = (tid, sentiment, ids, token_membership, selection, tweet,
                    start, end, offsets)
        if any(field is None for field in required):
            raise Exception('None field detected')

        return {
            'tid': tid,
            'sentiment': sentiment,
            'tweet': ids,
            'selection': token_membership,
            'raw_selection': selection,
            'raw_tweet': tweet,
            'start': start,
            'end': end,
            'offsets': offsets,
            'pos': token_pos
        }

    def get_splits(self, val_size=.3):
        """Return ``(train_indices, valid_indices)`` from a random permutation.

        ``val_size`` is the fraction of samples placed in the validation part.
        """
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        train_indices = indices[0:split]
        valid_indices = indices[split:]
        return train_indices, valid_indices

    def k_folds(self, k=5):
        """Return ``k`` index arrays that partition a random permutation."""
        N = len(self.samples)
        indices = np.random.permutation(N)
        return np.array_split(indices, k)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return one sample, or a list of samples for an iterable of indices."""
        try:
            return self.samples[idx]
        except TypeError:
            # idx is not usable as a single list index; treat it as a
            # collection of indices.
            return [self.samples[i] for i in idx]