def generate_iterators(BATCH_SIZE=32, MAX_LEN=20, load_data=False, embedding=None):
    if not load_data:
        spacy_de = spacy.load('de')
        spacy_en = spacy.load('en')

        def tokenize_de(text):
            return [tok.text for tok in spacy_de.tokenizer(text)]

        def tokenize_en(text):
            return [tok.text for tok in spacy_en.tokenizer(text)]

        BOS_WORD = '<s>'
        EOS_WORD = '</s>'
        DE = data.Field(tokenize=tokenize_de)
        # only the target side needs BOS/EOS markers
        EN = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                        eos_token=EOS_WORD)

        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'), fields=(DE, EN),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
                                  len(vars(x)['trg']) <= MAX_LEN)

        MIN_FREQ = 5
        DE.build_vocab(train.src, min_freq=MIN_FREQ)
        EN.build_vocab(train.trg, min_freq=MIN_FREQ)

        if embedding is not None:
            if embedding in ['FastText', 'fasttext']:
                EN.vocab.load_vectors(vectors=FastText(language='en'))
                DE.vocab.load_vectors(vectors=FastText(language='de'))
            else:
                raise ValueError("Only fasttext is supported at the moment")

        train_iter, val_iter = data.BucketIterator.splits(
            (train, val), batch_size=BATCH_SIZE, device=-1,
            repeat=False, sort_key=lambda x: len(x.src))
        return train_iter, val_iter, EN, DE
    else:  # load pickled splits and fields (does not work reliably)
        with open('train.pkl', 'rb') as f:
            train = pickle.load(f)
        with open('val.pkl', 'rb') as f:
            val = pickle.load(f)
        with open('DE.torchtext.Field.pkl', 'rb') as f:
            DE = pickle.load(f)
        with open('EN.torchtext.Field.pkl', 'rb') as f:
            EN = pickle.load(f)
        # note: keep the caller's BATCH_SIZE instead of hard-coding 32 here
        train_iter, val_iter = data.BucketIterator.splits(
            (train, val), batch_size=BATCH_SIZE, device=-1,
            repeat=False, sort_key=lambda x: len(x.src))
        return train_iter, val_iter, EN, DE
def prepare_data(self):
    e, m, x = read_conll(self.param.training_file_path)

    max_token_length = 0
    for i in range(len(e)):
        current_sentence_len = len(e[i].sentence.tokens)
        if current_sentence_len > max_token_length:
            max_token_length = current_sentence_len

    # reverse the int ids back to token/postag/lemma strings
    sentences = list()
    sentences_postags = list()
    sentences_lemmas = list()
    labels = list()
    for i in range(int(x)):
        sentences.append(
            [VOCDICT.getstr(token) for token in e[i].sentence.tokens])
        sentences_postags.append(
            [POSDICT.getstr(postag) for postag in e[i].sentence.postags])
        sentences_lemmas.append(
            [LEMDICT.getstr(lemma) for lemma in e[i].sentence.lemmas])
        labels.append(list(e[i].targetframedict.keys()))

    tokens_field = Field(sequential=True, fix_length=max_token_length)
    postags_field = Field(sequential=True, fix_length=max_token_length)
    lemmas_field = Field(sequential=True, fix_length=max_token_length)

    tokens_field.build_vocab(sentences, vectors=FastText('simple'))
    postags_field.build_vocab(sentences_postags)
    lemmas_field.build_vocab(sentences_lemmas, vectors=FastText('simple'))
    self.pretrained_embedding = tokens_field.vocab.vectors

    def _preprocess_field(l: list) -> list:
        # multi-hot encoding: 1 at each target position
        return [1 if j in l else 0 for j in range(max_token_length)]

    labels_field = Field(sequential=False, use_vocab=False,
                         preprocessing=_preprocess_field, is_target=True)

    train, val = FrameTargetDataset(
        sentences, sentences_postags, sentences_lemmas, labels,
        fields=[('tokens', tokens_field),
                ('postags', postags_field),
                ('lemmas', lemmas_field),
                ('labels', labels_field)]).split()

    self.train_iter, self.val_iter = BucketIterator.splits(
        datasets=(train, val),
        batch_sizes=(self.batch_size, self.batch_size),
        device=self._d, sort=False)
def __init__(self, max_len, batch_size, max_epochs, device, pretrained):
    text_field = data.Field(lower=True, batch_first=True, fix_length=max_len,
                            init_token='<go>', eos_token='<eos>',
                            unk_token='<unk>', pad_token='<pad>')
    label_field = data.Field(fix_length=max_len - 1, batch_first=True)

    # make splits for data
    unsup_train, unsup_val, unsup_test = NLIGen.splits(text_field)
    train, val, test = datasets.UDPOS.splits(
        (('text', text_field), ('label', label_field)))

    # build the vocabulary
    text_field.build_vocab(unsup_train)  # , vectors="fasttext.simple.300d")
    label_field.build_vocab(train)

    # make iterators for the splits
    self.train_iter, _, _ = data.BucketIterator.splits(
        (unsup_train, unsup_val, unsup_test), batch_size=batch_size,
        device=device, shuffle=True, sort=False)
    _, self.unsup_val_iter, _ = data.BucketIterator.splits(
        (unsup_train, unsup_val, unsup_test),
        batch_size=int(batch_size / 10), device=device,
        shuffle=True, sort=False)
    self.sup_iter, _, _ = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size, device=device,
        shuffle=False, sort=False)
    _, self.val_iter, self.test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=int(batch_size), device=device,
        shuffle=False, sort=False)

    self.vocab = text_field.vocab
    self.tags = label_field.vocab
    self.text_field = text_field
    self.label_field = label_field
    self.device = device
    self.batch_size = batch_size
    self.n_epochs = 0
    self.max_epochs = max_epochs
    if pretrained:
        ftxt = FastText()
        self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
    else:
        self.wvs = None
def __init__(self, emb_dim=50, mbsize=32, main=True, dataset2=None, **kwargs):
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>',
                           lower=True, tokenize='spacy', fix_length=16)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    train, val, test = datasets.SST.splits(
        self.TEXT, self.LABEL, fine_grained=False, train_subtrees=False,
        filter_pred=utils.filter(6))
    self.train = train

    if main:
        train_datasets = [train.text, dataset2.get_train().text] \
            if dataset2 else [train]
        self.TEXT.build_vocab(*train_datasets, vectors=FastText('en'))
        self.LABEL.build_vocab(train)

    self.n_vocab = len(self.TEXT.vocab.itos)
    self.emb_dim = emb_dim

    self.train_iter, self.val_iter, _ = data.BucketIterator.splits(
        (train, val, test), batch_size=mbsize, device=-1,
        shuffle=True, repeat=True)
    self.train_iter = iter(self.train_iter)
    self.val_iter = iter(self.val_iter)
def benchmark_experimental_vectors():
    def _run_benchmark(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText
    fast_text = FastText()
    print("FastText - Not Jit Mode")
    _run_benchmark(tokens, fast_text)

    # experimental FastText
    fast_text_experimental = FastTextExperimental()
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)
    print("FastText Experimental - Not Jit Mode")
    _run_benchmark(tokens, fast_text_experimental)
    print("FastText Experimental - Jit Mode")
    _run_benchmark(tokens, jit_fast_text_experimental)
def get_text_metadata():
    """Return word embeddings for glove/fasttext; None for the 'use' model."""
    if embed_type == 'use':
        return None, None, None

    text_field = data.Field(sequential=True, use_vocab=True,
                            tokenize=tokenize, lower=True)
    captions = get_caption_list()
    preprocessed_caption = pd.DataFrame(
        captions, columns=['caption'])['caption'].apply(
            lambda x: text_field.preprocess(x))

    if embed_type == 'glove':
        text_field.build_vocab(preprocessed_caption,
                               vectors=GloVe(name='6B', dim=300))
    elif embed_type == 'fasttext':
        text_field.build_vocab(preprocessed_caption,
                               vectors=FastText(language='en'))

    word_embeddings = text_field.vocab.vectors
    vocab_size = len(text_field.vocab)
    print("Length of Text Vocabulary: " + str(vocab_size))
    print("Unique Word Vectors",
          torch.unique(text_field.vocab.vectors, dim=0).shape)
    print("Vector size of Text Vocabulary: ", word_embeddings.size())
    return text_field, word_embeddings, vocab_size
def load_fasttext_embedding(_log):
    _log.info('Loading fasttext pretrained embedding')
    ft = FastText(language='id',
                  cache=os.path.join(os.getenv('HOME'), '.vectors_cache'))
    _log.info('Read %d pretrained words with embedding size of %d',
              len(ft.itos), ft.dim)
    return ft
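# Lookup sketch (hypothetical Indonesian tokens; assumes the wiki.id vectors
# above are already cached): a torchtext Vectors object supports single-token
# indexing and batched lookup, and out-of-vocabulary tokens fall back to
# unk_init (zeros by default).
ft = FastText(language='id')
vec = ft['rumah']                                # FloatTensor, shape (300,)
mat = ft.get_vecs_by_tokens(['rumah', 'makan'])  # FloatTensor, shape (2, 300)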
def test_vocab_extend(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    # Build a vocab and get vectors twice to test caching.
    for i in range(2):
        f = FastText(language='simple')
        v = vocab.Vocab(c, min_freq=3,
                        specials=['<unk>', '<pad>', '<bos>'], vectors=f)
        n_vocab = len(v)
        # extend the vocab with the words contained in f.itos
        v.extend(f)
        self.assertGreater(len(v), n_vocab)
        self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
                                      'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_fasttext_simple_en = {
            'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
            'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
        }
        for word in expected_fasttext_simple_en:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_fasttext_simple_en[word])
        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))

    # Delete the vectors after we're done to save disk space on CI.
    if os.environ.get("TRAVIS") == "true":
        vec_file = os.path.join(self.project_root, ".vector_cache",
                                "wiki.simple.vec")
        conditional_remove(vec_file)
def load_word_vectors(word_vectors_name, embedding_size,
                      word_vectors_cache='../data/word_vectors_cache'):
    implemented_vector_embeddings = ('GloVe_6B', 'GloVe_42B', 'GloVe_840B',
                                     'GloVe_twitter.27B', 'FastText_en')
    assert word_vectors_name in implemented_vector_embeddings

    word_vectors = None
    if word_vectors_name == 'GloVe_6B':
        assert embedding_size in (50, 100, 200, 300)
        word_vectors = GloVe(name='6B', dim=embedding_size,
                             cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_42B':
        embedding_size = 300
        word_vectors = GloVe(name='42B', cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_840B':
        embedding_size = 300
        word_vectors = GloVe(name='840B', cache=word_vectors_cache)
    elif word_vectors_name == 'GloVe_twitter.27B':
        assert embedding_size in (25, 50, 100, 200)
        word_vectors = GloVe(name='twitter.27B', dim=embedding_size,
                             cache=word_vectors_cache)
    elif word_vectors_name == 'FastText_en':
        embedding_size = 300
        word_vectors = FastText(language='en', cache=word_vectors_cache)

    return word_vectors, embedding_size
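# Hypothetical caller sketch for load_word_vectors: the helper returns both
# the Vectors object and the effective embedding size (some corpora override
# the requested size), so an embedding layer can be sized without
# hard-coding the dimension.
word_vectors, embedding_size = load_word_vectors('FastText_en', 300)
assert word_vectors['hello'].shape[0] == embedding_size  # always 300 here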
def get_all_vectors(pretrained_model):
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # e.g. glove_840B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # fasttext_wiki_en (third part is the language)
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())

    return emb_vectors
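# Usage sketch (hypothetical spec string): specs follow
# "<name>_<set>[_<dim-or-language>]" and several may be comma-joined.
emb_vectors = get_all_vectors("glove_840B_300,fasttext_wiki_en")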
def benchmark_experimental_vectors():
    def _run_benchmark_lookup(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Lookup time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText construction
    print("Existing FastText - Not Jit Mode")
    t0 = time.monotonic()
    fast_text = FastText()
    print("Construction time:", time.monotonic() - t0)
    _run_benchmark_lookup(tokens, fast_text)

    # experimental FastText construction
    print("FastText Experimental")
    t0 = time.monotonic()
    fast_text_experimental = FastTextExperimental(validate_file=False)
    print("Construction time:", time.monotonic() - t0)

    # non-jit lookup
    print("FastText Experimental - Not Jit Mode")
    _run_benchmark_lookup(tokens, fast_text_experimental)

    # jit lookup
    print("FastText Experimental - Jit Mode")
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)
    _run_benchmark_lookup(tokens, jit_fast_text_experimental)
def main(params):
    # build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        # GloVe embedding (default options: name='840B', dim=300)
        embedding = GloVe(name=params.emb_data, dim=params.emb_dim)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        # torchtext's FastText is parameterized by language only; its
        # vectors are always 300-dimensional, so emb_dim must match.
        embedding = FastText(language=params.emb_data)
    else:
        print("Wrong embedding type")
        exit()

    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size, shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim,
                                         params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []

    # Stop once validation accuracy has not improved for 5 epochs.
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader,
                                             crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)

        if val_acc > best_val_acc and epoch > 0:
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1
        epoch += 1
        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")
    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting("validation_accuracy", "epoch", "accuracy", val_acc_list)
def __init__(self, df: pd.DataFrame, preprocess: bool = True,
             translation_dict: Optional[Dict[str, str]] = None):
    index: List[PIDTitleRecord] = []
    for _, row in df.iterrows():
        title = (preprocess_title(row['title'], translation_dict)
                 if preprocess else row['title'])
        index.append(PIDTitleRecord(pid=row['posting_id'], title=title))
    self._index = index
    self._vocab = FastText()
def __init__(self, df: pd.DataFrame, preprocess: bool = True,
             translation_dict: Optional[Dict[str, str]] = None):
    index: List[TitleLabelRecord] = []
    for _, row in df.iterrows():
        title = (preprocess_title(row['title'], translation_dict)
                 if preprocess else row['title'])
        index.append(
            TitleLabelRecord(title=title, label_group=row['label_group']))
    self._index = index
    self._vocab = FastText()
def __init__(self, batch_size=128):
    self.batch_size = batch_size
    self.TEXT = data.Field()
    self.LABEL = data.Field(sequential=False)
    self.train, self.val, self.test = datasets.SST.splits(
        self.TEXT, self.LABEL, fine_grained=True, train_subtrees=True)
    f = FastText()
    self.TEXT.build_vocab(self.train, vectors=f)
    # Extend the vocab with every word FastText covers, then reload the
    # vectors so the new entries get embedding rows as well (extend alone
    # only updates itos/stoi).
    self.TEXT.vocab.extend(f)
    self.TEXT.vocab.load_vectors(f)
    self.LABEL.build_vocab(self.train)
def __init__(self, args):
    if args.datastories:
        tokenizer = SocialTokenizer(lowercase=True)
    else:
        tokenizer = TweetTokenizer()

    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True, include_lengths=True,
                           lower=True, tokenize=tokenizer.tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = datasets.EMO.splits(
        args, self.RAW, self.TEXT, self.LABEL,
        args.train_data_path, args.valid_data_path, args.test_data_path)

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    if args.fasttext:
        self.FASTTEXT = data.Field(batch_first=True, include_lengths=True,
                                   lower=True, tokenize=tokenizer.tokenize)
        self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
        # set_vectors expects the pretrained embedding's stoi and its raw
        # vector tensor; passing the Vectors object itself would silently
        # zero out every word.
        ft = FastText(language='en')
        self.FASTTEXT.vocab.set_vectors(ft.stoi, ft.vectors, dim=ft.dim)
    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_size=args.batch_size,
                                   device=args.device, repeat=False)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.char_emb:
        self.build_char_vocab()

    with open('./data/vocab.obj', 'wb') as filehandler:
        pickle.dump(self.TEXT.vocab, filehandler)
    with open('./data/label.obj', 'wb') as filehandler:
        pickle.dump(self.LABEL.vocab, filehandler)
def test_vocab_download_fasttext_vectors(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})

    # Build a vocab and get vectors twice to test caching, then once more
    # to test string aliases.
    for i in range(3):
        if i == 2:
            vectors = str("fasttext.simple.300d")  # must handle str on Py2
        else:
            vectors = FastText(language='simple')

        v = vocab.Vocab(c, min_freq=3,
                        specials=['<unk>', '<pad>', '<bos>'],
                        vectors=vectors)

        expected_itos = ['<unk>', '<pad>', '<bos>',
                         'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_fasttext_simple_en = {
            'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
            'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
        }
        for word in expected_fasttext_simple_en:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_fasttext_simple_en[word])
        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        assert_allclose(vectors[v.stoi['OOV token']], np.zeros(300))

    # Delete the vectors after we're done to save disk space on CI.
    if os.environ.get("TRAVIS") == "true":
        vec_file = os.path.join(self.project_root, ".vector_cache",
                                "wiki.simple.vec")
        conditional_remove(vec_file)
def process_text(self, text):
    """Transform each description into a document embedding."""
    # filter text
    text = text.apply(lambda doc: self.filter_text(doc))
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

    # get idf (inverse document frequency) weights
    print('Calculating tf-idf...')
    warnings.filterwarnings("ignore")
    tfidf = TfidfVectorizer(tokenizer=tokenizer)
    tfidf.fit(text.dropna())
    idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

    print('Converting text to document embedding...')
    # idf-weighted average of FastText word vectors
    w2v = FastText(language='en')
    self.text_dim = w2v.dim
    text = text.apply(lambda doc: self.doc2vec(doc, tokenizer, idf, w2v))
    return text
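# Hedged sketch of the kind of idf-weighted averaging that doc2vec (defined
# elsewhere in this class) likely performs; the helper name and the fallback
# idf weight of 1.0 are assumptions, not the author's code.
import torch

def _weighted_doc_vector(doc, tokenizer, idf, w2v):
    tokens = tokenizer(doc)
    if not tokens:
        return torch.zeros(w2v.dim)
    weights = torch.tensor([idf.get(t, 1.0) for t in tokens])
    vecs = w2v.get_vecs_by_tokens(tokens)          # (n_tokens, dim)
    return (vecs * weights.unsqueeze(1)).sum(0) / weights.sum()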
def load_embedding(embed_corpus):
    corpora = ['glove_twitter', 'glove_commoncrawl', 'fasttext_wiki',
               'fasttext_commoncrawl', 'word2vec']
    dim = 300

    os.makedirs('data/glove', exist_ok=True)
    os.makedirs('data/fast_text', exist_ok=True)
    os.makedirs('data/word2vec', exist_ok=True)

    if embed_corpus == 'glove_twitter':
        # GloVe trained on the Twitter corpus
        embedding = GloVe(name='twitter.27B', dim=200, cache='data/glove/')
        dim = 200
    elif embed_corpus == 'glove_commoncrawl':
        # GloVe trained on the Common Crawl corpus
        embedding = GloVe(name='42B', dim=300, cache='data/glove/')
    elif embed_corpus == 'fasttext_wiki':
        # FastText trained on Wikipedia 2017, the UMBC webbase corpus and
        # the statmt.org news dataset
        embedding = FastText(language='en', cache='data/fast_text/')
    elif embed_corpus == 'fasttext_commoncrawl':
        # FastText trained on the Common Crawl corpus
        embedding = Vectors(
            name='crawl-300d-2M.vec',
            url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip',
            cache='data/fast_text/')
    elif embed_corpus == 'word2vec':
        # Word2Vec trained on the Google News corpus
        name = 'GoogleNews-vectors-negative300.txt'
        if os.path.isfile(f'data/word2vec/{name}.pt'):
            embedding = Vectors(name=name, cache='data/word2vec/')
        else:
            raise FileNotFoundError(
                'No torchtext formatted word2vec vectors file found. '
                'See load_word2vec.py to create the necessary pt file. '
                'Requires gensim.')
    else:
        raise ValueError(
            f'Invalid pre-trained word embedding vectors. '
            f'Options are {"/".join(corpora)}.')

    return embedding, dim
@classmethod
def iters(cls, path, vectors_name, vectors_cache, batch_size=64,
          shuffle=True, device=0, vectors=None,
          unk_init=torch.Tensor.zero_):
    """
    :param path: directory containing train, test, dev files
    :param vectors_name: name of word vectors file
    :param vectors_cache: path to directory containing word vectors file
    :param batch_size: batch size
    :param shuffle: whether to shuffle batches
    :param device: GPU device
    :param vectors: custom vectors - either predefined torchtext vectors or
        your own custom Vector classes
    :param unk_init: function used to generate vector for OOV words
    :return: train/val/test BucketIterators
    """
    print("loading vectors")
    if vectors_name == "fasttext":
        vectors = FastText()
    elif "B" in vectors_name:
        vectors = GloVe(vectors_name)
    elif vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache,
                          unk_init=unk_init)
    print("completed vectors loading")

    train, val, test = cls.splits(path)
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    return BucketIterator.splits((train, val, test), batch_size=batch_size,
                                 repeat=False, shuffle=shuffle,
                                 sort_within_batch=True, device=device)
def create_embedding_matrix(self):
    """Return the weight matrix for the current vocab.
    Currently only supports fasttext."""
    import numpy as np
    import torch

    target_vocab = self.label_encoder.classes_
    embedding = FastText('en')
    emb_dim = int(embedding.dim)
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))

    words_found = 0
    for i, word in enumerate(target_vocab):
        # torchtext Vectors never raise KeyError; indexing an OOV word
        # silently returns the unk vector, so check membership explicitly.
        if word in embedding.stoi:
            weights_matrix[i] = embedding[word]
            words_found += 1
        else:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim,))

    return torch.from_numpy(weights_matrix)
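# Follow-up sketch: wiring the returned matrix into an embedding layer;
# "encoder" is a hypothetical instance of the class defining
# create_embedding_matrix, and freezing the weights is an assumption.
import torch.nn as nn

weights = encoder.create_embedding_matrix()
emb_layer = nn.Embedding.from_pretrained(weights.float(), freeze=True)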
def get_vectors(model_name, emb_folder):
    if model_name not in available_models:
        raise AttributeError(
            f'Model name {model_name} is not in model list: {available_models}')

    if not os.path.exists(emb_folder):
        os.mkdir(emb_folder)

    model_type = available_models[model_name]
    if model_type == 'fasttext':
        lang = model_name.split('_')[0]
        fasttext_emb_folder = os.path.join(os.getcwd(), emb_folder, 'fasttext')
        if not os.path.exists(fasttext_emb_folder):
            os.mkdir(fasttext_emb_folder)
        vectors = FastText(language=lang, cache=fasttext_emb_folder)
        # Remove the raw .vec file; the serialized .pt cache is enough.
        fasttext_model_name = os.path.join(fasttext_emb_folder,
                                           f'wiki.{lang}.vec')
        os.remove(fasttext_model_name)
        return vectors
    elif model_type == 'gensim':
        glove_emb_folder = os.path.join(os.getcwd(), emb_folder, 'glove')
        if not os.path.exists(glove_emb_folder):
            os.mkdir(glove_emb_folder)
        api.BASE_DIR = glove_emb_folder
        raw_model_name = model_name.split('_')[1]
        w2v_model_name = raw_model_name + '.txt'
        full_w2v_model_name = os.path.join(glove_emb_folder, w2v_model_name)
        if not os.path.exists(full_w2v_model_name + '.pt'):
            model_gensim = api.load(raw_model_name)
            model_gensim.save_word2vec_format(full_w2v_model_name)
            shutil.rmtree(os.path.join(glove_emb_folder, raw_model_name))
            vectors = Vectors(w2v_model_name, cache=glove_emb_folder)
            os.remove(full_w2v_model_name)
        else:
            vectors = Vectors(w2v_model_name, cache=glove_emb_folder)
        return vectors
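# Usage sketch; 'en_fasttext' and the 'embeddings' folder are illustrative,
# and available_models is assumed to map such names to a 'fasttext' or
# 'gensim' loader type.
vectors = get_vectors('en_fasttext', 'embeddings')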
def __init__(self, df: pd.DataFrame, preprocess: bool = True,
             translation_dict: Optional[Dict[str, str]] = None):
    index: List[TitleTripletRecord] = []
    for _, row in df.iterrows():
        title_a = (preprocess_title(row['title_a'], translation_dict)
                   if preprocess else row['title_a'])
        title_p = (preprocess_title(row['title_p'], translation_dict)
                   if preprocess else row['title_p'])
        title_n = (preprocess_title(row['title_n'], translation_dict)
                   if preprocess else row['title_n'])
        index.append(TitleTripletRecord(title_a=title_a, title_p=title_p,
                                        title_n=title_n))
    self._index = index
    self._vocab = FastText()
def __init__(self, max_len, batch_size, max_epochs, device,
             unsup_proportion, sup_proportion, dev_index=1, pretrained=False):
    text_field = data.Field(lower=True, batch_first=True, fix_length=max_len,
                            pad_token='<pad>', init_token='<go>',
                            is_target=True)
    # init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
    label_field = data.Field(fix_length=max_len - 1, batch_first=True)

    # make splits for data
    # unsup_train, unsup_val, unsup_test = MyPennTreebank.splits(text_field)
    # unsup_train, unsup_val, unsup_test = datasets.PennTreebank.splits(text_field)
    # unsup_train, unsup_val, unsup_test = datasets.WikiText2.splits(text_field)
    unsup_train, unsup_val, unsup_test = datasets.UDPOS.splits(
        (('text', text_field), ('label', label_field)))
    # unsup_train, unsup_val, unsup_test = YahooLM.splits(text_field)
    train, val, test = datasets.UDPOS.splits(
        (('text', text_field), ('label', label_field)))

    # build the vocabulary
    text_field.build_vocab(unsup_train, max_size=VOCAB_LIMIT)
    # , vectors="fasttext.simple.300d")
    label_field.build_vocab(train)

    # self.train_iter, _, _ = data.BPTTIterator.splits(
    #     (unsup_train, unsup_val, unsup_test), batch_size=batch_size,
    #     bptt_len=max_len, device=device, repeat=False, shuffle=False,
    #     sort=False)
    # _, self.unsup_val_iter, _ = data.BPTTIterator.splits(
    #     (unsup_train, unsup_val, unsup_test),
    #     batch_size=int(batch_size / 10), bptt_len=max_len,
    #     device=device, repeat=False, shuffle=False, sort=False)

    # Remake the splits according to the supervision proportions.
    exlist = [ex for ex in train + val]
    train = Dataset(exlist, {'text': text_field, 'label': label_field})
    dev_start, dev_end = (int(len(train) / 5 * (dev_index - 1)),
                          int(len(train) / 5 * dev_index))
    train_start1, train_start2 = 0, dev_end
    train_end1 = int(dev_start * sup_proportion)
    train_end2 = int(dev_end + (len(train) - dev_end) * sup_proportion)
    unsup_start, unsup_end = 0, int(len(unsup_train) * unsup_proportion)

    val = Dataset(train[dev_start:dev_end],
                  {'text': text_field, 'label': label_field})
    train = Dataset(train[train_start1:train_end1] +
                    train[train_start2:train_end2],
                    {'text': text_field, 'label': label_field})
    unsup_train = Dataset(unsup_train[unsup_start:unsup_end],
                          {'text': text_field})

    # make iterators for the splits
    self.train_iter, _, _ = data.BucketIterator.splits(
        (unsup_train, unsup_val, unsup_test), batch_size=batch_size,
        device=device, shuffle=True, sort=False)
    _, self.unsup_val_iter, _ = data.BucketIterator.splits(
        (unsup_train, unsup_val, unsup_test),
        batch_size=int(batch_size / 10), device=device,
        shuffle=False, sort=False)
    self.sup_iter, _, _ = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size, device=device,
        shuffle=False, sort=False)
    _, self.val_iter, self.test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=int(batch_size), device=device,
        shuffle=False, sort=False)

    self.vocab = text_field.vocab
    self.tags = label_field.vocab
    self.text_field = text_field
    self.label_field = label_field
    self.device = device
    self.batch_size = batch_size
    self.n_epochs = 0
    self.max_epochs = max_epochs
    if pretrained:
        ftxt = FastText()
        self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
    else:
        self.wvs = None
def load_img_samples(orig_dir, dest_dir):
    for label in sorted(os.listdir(orig_dir)):
        class_path = f'{orig_dir}/{label}'
        with os.scandir(class_path) as it:
            for _, path in tqdm(enumerate(it)):
                with open(path, 'rb') as f:
                    try:
                        img = Image.open(f)
                        img = img.convert('RGB')
                        img = img.resize((384, 384))
                        if not os.path.exists(f'{dest_dir}/{label}'):
                            os.mkdir(f'{dest_dir}/{label}')
                        img.save(
                            f'{dest_dir}/{label}/'
                            f'{"".join(path.name.split(".")[:-1])}.jpg',
                            "JPEG", quality=100)
                    except UnidentifiedImageError:
                        pass


if __name__ == '__main__':
    fasttext_model = FastText()
    glove_model = GloVe()

    load_img_samples('../data/original/Tobacco3482-jpg',
                     '../data/Tobacco3482-jpg')
    load_txt_samples('../data/original/QS-OCR-small',
                     '../data/QS-OCR-small', fasttext_model)

    for s in ['val', 'test', 'train']:
        load_img_samples(f'../data/original/RVL-CDIP/{s}',
                         f'../data/RVL-CDIP/{s}')
        load_txt_samples(f'../data/original/QS-OCR-Large/{s}',
                         f'../data/QS-OCR-Large/{s}', fasttext_model)
def build_legacy_fasttext_vector_pipeline():
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()
    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None
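# Usage sketch: the pipeline maps a raw string to a (num_tokens, 300) tensor
# of FastText vectors; the sentence is a made-up example.
pipeline, _, _ = build_legacy_fasttext_vector_pipeline()
vecs = pipeline('the quick brown fox')  # tensor of shape (4, 300)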
def __init__(self, glove=True, device=device):
    self.device = device
    nlp = spacy.load("en_core_web_sm")

    char_nesting = Field(batch_first=True, tokenize=list, lower=True)
    char = NestedField(char_nesting, init_token="<sos>",
                       eos_token="<eos>", tokenize="spacy")
    word = Field(init_token="<sos>", eos_token="<eos>",
                 lower=True, tokenize="spacy")
    label = Field(sequential=False, is_target=True, use_vocab=False)

    self.fields = [("question_char", char), ("question_word", word),
                   ("context_char", char), ("context_word", word),
                   ("answer", label)]
    self.dict_fields = {
        "question": [("question_char", char), ("question_word", word)],
        "context": [("context_char", char), ("context_word", word)],
        "answer": ("answer", label),
    }

    self.train_data = self._get_data("../data/train.jsonl")
    self.dev_data = self._get_data("../data/dev.jsonl")

    char.build_vocab(self.train_data)
    if glove:
        word.build_vocab(self.train_data, vectors=GloVe(name="6B", dim=100))
    else:
        word.build_vocab(self.train_data,
                         vectors=FastText(language='en', max_vectors=30000))

    self.char_vocab = char.vocab
    self.word_vocab = word.vocab

    pos = []
    ner = []
    ind2pos = []
    ind2ner = []
    for data in tqdm(self.train_data):
        doc = nlp(' '.join(data.question_word + data.context_word))
        # t - token
        pos.extend([t.pos_ for t in doc])
        ner.extend([t.label_ for t in doc.ents])
        ind2pos.extend([[self.word_vocab.stoi[str(t)], t.pos_] for t in doc])
        ind2ner.extend([[self.word_vocab.stoi[str(t)], t.label_]
                        for t in doc.ents])

    self.pos_voc = {tag: i for i, tag in enumerate(set(pos))}
    self.ner_voc = {tag: i + 1 for i, tag in enumerate(set(ner))}
    self.ner_voc['None'] = 0

    # default values, as used in the DrQA model
    self.ind2pos = defaultdict(lambda: self.pos_voc['X'])   # index of 'X'
    self.ind2ner = defaultdict(lambda: self.ner_voc['None'])  # returns 0
    self.ind2pos.update({tag[0]: self.pos_voc[tag[1]] for tag in ind2pos})
    self.ind2ner.update({tag[0]: self.ner_voc[tag[1]] for tag in ind2ner})
# TEXT and LABEL are Fields defined earlier in the script.
FILE = data.LabelField(sequential=False)

# Load the data.
dataset = data.TabularDataset(path='./document.tsv', format='tsv',
                              fields=[('Text', TEXT), ('Label', LABEL),
                                      ('File', FILE)],
                              skip_header=True)
LABEL.build_vocab(dataset)
FILE.build_vocab(dataset)

train, val, test = dataset.split(split_ratio=[0.7, 0.1, 0.2],
                                 random_state=random.getstate())

TEXT.build_vocab(train, vectors=FastText(language="ja"), min_freq=2)
# Size of the embedding matrix.
print(TEXT.vocab.vectors.size())

# device = torch.device('cpu')
device = torch.device('cuda:0')
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train, val, test), batch_sizes=(16, 16, 1), device=device,
    repeat=False, sort=False)

batch = next(iter(train_iter))
print(batch.Text)
print(batch.Label)
def main(language, hidden_dim, dropout, proc, letter_proc, objective,
         operator, alpha, lr, momentum, optimizer, batch_size, n_epochs,
         pretrained_embeddings, letter_hidden_dim, letter_embedding_dim,
         n_samples, pad_edge, augment, _seed, _run, _log):
    if objective not in ['erm', 'nll']:
        raise ValueError("`objective` should be in ['erm', 'nll'], "
                         "got %s" % objective)

    # Technical
    device = init_system()

    if pad_edge:
        init_token = '<init>'
        eos_token = '<end>'
    else:
        init_token = None
        eos_token = None

    # Data loading using the torchtext abstraction
    tags = ttdata.Field(sequential=True, include_lengths=True,
                        preprocessing=iob1_iobes, init_token=init_token,
                        eos_token=eos_token, pad_token=None,
                        unk_token=None, batch_first=True)
    sentences = ttdata.Field(sequential=True, include_lengths=False,
                             batch_first=True, init_token=init_token,
                             eos_token=eos_token, preprocessing=zero_num)
    letter = ttdata.Field(sequential=True, tokenize=list,
                          include_lengths=True, init_token=None,
                          eos_token=None, preprocessing=zero_num,
                          batch_first=True)
    letters = NestedField(letter, use_vocab=True,
                          tensor_type=torch.FloatTensor,
                          init_token=init_token, eos_token=eos_token)

    if language == 'en':
        fields = [[('sentences', sentences), ('letters', letters)],
                  ('', None), ('', None), ('tags', tags)]
    elif language == 'de':
        fields = [[('sentences', sentences), ('letters', letters)],
                  ('', None), ('', None), ('', None), ('tags', tags)]
    elif language in ['es', 'nl']:
        fields = [[('sentences', sentences), ('letters', letters)],
                  ('', None), ('tags', tags)]
    else:
        raise ValueError('Wrong language')

    tagger_languages = {'en': 'eng', 'nl': 'ned', 'de': 'deu', 'es': 'esp'}

    train_data, val_data, test_data = SequenceTaggingDataset.splits(
        path=expanduser('~/data/sdtw_data/conll'),
        train='%s.train' % tagger_languages[language],
        validation='%s.testa' % tagger_languages[language],
        test='%s.testb' % tagger_languages[language],
        n_samples=n_samples, fields=fields)

    letters.build_vocab(train_data, val_data, test_data)
    tags.build_vocab(train_data)
    tag_itos = tags.vocab.itos
    if pad_edge:
        eos_idx = tags.vocab.stoi[tags.eos_token]
        init_idx = tags.vocab.stoi[tags.init_token]
        tag_itos[eos_idx] = 'O'
        tag_itos[init_idx] = 'O'
    else:
        eos_idx = None
        init_idx = None

    if isinstance(pretrained_embeddings, int):
        sentences.build_vocab(train_data, val_data, test_data)
        embedding_dim = pretrained_embeddings
    else:
        if pretrained_embeddings == 'ner':
            vectors = CaseInsensitiveVectors(
                expanduser('~/data/sdtw_data/ner/%s'
                           % tagger_languages[language]),
                unk_init=lambda x: x.normal_(0, 1),
                cache=expanduser('~/cache'))
        elif 'glove' in pretrained_embeddings:
            _, name, dim = pretrained_embeddings.split('.')
            dim = dim[:-1]
            GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__
            vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache'))
        elif pretrained_embeddings == 'fasttext':
            FastText.__getitem__ = CaseInsensitiveVectors.__getitem__
            FastText.cache = CaseInsensitiveVectors.cache
            vectors = FastText(language=language, cache=expanduser('~/cache'))

        # Extend the vocab with words of the test/val sets that have an
        # embedding in the pre-trained embedding. A production version
        # would do it dynamically at inference time.
        counter = Counter()
        sentences.build_vocab(val_data, test_data)
        for word in sentences.vocab.stoi:
            if word in vectors.stoi or word.lower() in vectors.stoi or \
                    re.sub(r'\d', '0', word.lower()) in vectors.stoi:
                counter[word] = 1
        eval_vocab = Vocab(counter)
        print("%i/%i eval/test word in pretrained"
              % (len(counter), len(sentences.vocab.stoi)))
        sentences.build_vocab(train_data)
        prev_vocab_size = len(sentences.vocab.stoi)
        sentences.vocab.extend(eval_vocab)
        new_vocab_size = len(sentences.vocab.stoi)
        print('New vocab size: %i (was %i)'
              % (new_vocab_size, prev_vocab_size))
        sentences.vocab.load_vectors(vectors)
        embedding_dim = sentences.vocab.vectors.shape[1]

    artifact_dir = _run.info['artifact_dir']
    vocab_dict = {'sentences': sentences.vocab, 'tags': tags.vocab,
                  'letters': letter.vocab}
    torch.save(vocab_dict, open(join(artifact_dir, 'vocab.pt'), 'wb+'))

    unk_idx = sentences.vocab.stoi[sentences.unk_token]
    padding_idx = sentences.vocab.stoi[sentences.pad_token]
    singleton_idx = [tags.vocab.stoi[singleton]
                     for singleton in tags.vocab.stoi if 'S-' in singleton]
    tagset_size = len(tags.vocab)
    vocab_size = len(sentences.vocab)
    letter_size = len(letters.vocab)

    device_iter = -1 if device.type == 'cpu' else device.index
    train_iter, val_iter, test_iter = Iterator.splits(
        (train_data, val_data, test_data), sort_within_batch=True,
        batch_sizes=(batch_size, 512, 512), device=device_iter)
    train_test_iter = Iterator(train_data, sort_within_batch=True,
                               batch_size=512, shuffle=True,
                               device=device_iter)
    eval_iter = {'val': val_iter, 'test': test_iter,
                 'train_test': [next(iter(train_test_iter))]}

    model = Tagger(embedding_dim, vocab_size, tagset_size,
                   hidden_dim=hidden_dim, proc=proc,
                   padding_idx=padding_idx, letter_proc=letter_proc,
                   letter_embedding_dim=letter_embedding_dim,
                   letter_hidden_dim=letter_hidden_dim,
                   letter_size=letter_size, dropout=dropout,
                   eos_idx=eos_idx, init_idx=init_idx, alpha=alpha,
                   operator=operator)

    # Load vectors
    if hasattr(sentences.vocab, 'vectors'):
        model.embedder.word_embeddings.weight.data = sentences.vocab.vectors
        model.embedder.word_embeddings.weight.data[padding_idx].fill_(0.)

    model = model.to(device=device)

    if operator == 'softmax':
        loss_function = OurNLLLoss()
    else:
        loss_function = BinaryMSELoss()

    score_function = functools.partial(ner_score, tag_itos=tag_itos,
                                       format='iobes')

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(params=model.parameters(),
                                    lr=lr * batch_size, momentum=momentum)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    else:
        raise ValueError()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5,
                                  patience=5, threshold=1e-3, cooldown=2)

    for fold in eval_iter:
        _run.info['%s_loss' % fold] = []
        _run.info['%s_prec' % fold] = []
        _run.info['%s_recall' % fold] = []
        _run.info['%s_f1' % fold] = []
    _run.info['epochs'] = []
    _run.info['time'] = []

    last_epoch = floor(train_iter.epoch)
    t0 = time.clock()
    total_time = 0
    for batch in train_iter:
        epoch = floor(train_iter.epoch)
        if epoch > last_epoch:
            # Epoch boundary: evaluate, log and checkpoint.
            t1 = time.clock()
            elapsed = t1 - t0
            total_time += elapsed
            model.eval()
            _log.info("epoch %i, time/epoch %.3f s" % (epoch, elapsed))
            if epoch % 10 == 0:
                dump_model(model, 'model_%i.pt' % epoch)
            for fold in eval_iter:
                this_iter = eval_iter[fold]
                this_iter = iter(this_iter)
                loss, prec, recall, f1 = validate(model, this_iter,
                                                  score_function, objective,
                                                  loss_function)
                if fold == 'val':
                    scheduler.step(loss.item(), epoch=epoch)
                _log.info("%s: loss %.4f, prec %.4f, recall %.4f, f1 %.4f"
                          % (fold, loss, prec, recall, f1))
                _run.info['%s_loss' % fold].append(loss.item())
                _run.info['%s_prec' % fold].append(prec)
                _run.info['%s_recall' % fold].append(recall)
                _run.info['%s_f1' % fold].append(f1)
            _run.info['time'].append(total_time)
            _run.info['epochs'].append(epoch)
            if epoch > n_epochs:
                break
            t0 = time.clock()

        data = make_data(batch, augment=augment, unk_idx=unk_idx,
                         singleton_idx=singleton_idx)
        model.train()
        model.zero_grad()
        loss = compute_loss(model, data, objective, loss_function)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5, norm_type=2)
        optimizer.step()
        last_epoch = epoch

    dump_model(model, 'model_final.pt')
    return _run.info['test_f1'][-1]
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2: build the vocab with several pretrained vector sets at once;
# the per-word vectors are concatenated.
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'),
                                 CharNGram(), FastText()])
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 3: load the vectors object directly
f = FastText()