def __init__(self, graph, path):
    Embedding.__init__(self, graph)
    self.graph = graph
    self.walks = None
    self.embedding = None
    self.path = path
    self.set_paths(path)
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'use_dan',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder/2.tar.gz',
                  format='tar.gz',
                  architecture='DAN',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_large',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-large/3.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_lite',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='na',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/universal-sentence-encoder-lite/2.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.use_module = None
        self.model = None

    def load_model(self, model: str, model_path: str):
        self.use_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.model = model

    def encode(self, texts: list, pooling: Optional[str] = None) -> Optional[np.array]:
        return self.sess.run(self.use_module(texts))
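# A minimal usage sketch, not part of the original module: it assumes a TF1-style
# environment with tensorflow/tensorflow_hub importable, and a Universal Sentence
# Encoder hub module already downloaded and unpacked to the hypothetical directory
# `./models/use_dan`.
if __name__ == '__main__':
    embeddings = Embeddings()
    embeddings.load_model(model='use_dan', model_path='./models/use_dan')
    vectors = embeddings.encode(['hello world', 'embeddings as a service'])
    print(vectors.shape)  # expected (2, 512) for the 512-d USE models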
def __init__(self,
             num_of_actions,
             epsilon=0.0001,
             num_of_neighbours=10,
             cluster_distance=0.008,
             pseudo_counts=0.001,
             maximum_similarity=8,
             episodic_memory_capacity=30000):
    self.epsilon = epsilon
    self.num_of_neighbours = num_of_neighbours
    self.cluster_distance = cluster_distance
    self.pseudo_counts = pseudo_counts
    self.maximum_similarity = maximum_similarity
    self.episodic_memory = deque([], maxlen=episodic_memory_capacity)
    self.moving_average = MovingAverage()
    self.network = Embedding(num_of_actions)
    self.optimizer = tf.keras.optimizers.Adam()
def train(params, files):
    binary = str2bool(params['binary'])
    data_set, peptide_n_mer = read_data_set(files, test_size=0.05, binary=binary)
    print('Train data shape is {}'.format(data_set['X_train'].shape))
    print('Test data shape is {}'.format(data_set['X_test'].shape))

    # variable batch size depending on number of data points
    batch_size = int(np.ceil(len(data_set['X_train']) / 100.0))
    epochs = int(params['epochs'])
    nb_filter = int(params['filter_size'])
    filter_length = int(params['filter_length'])
    dropout = float(params['dropout'])
    lr = float(params['lr'])

    # manually drop the last partial batch
    for name in data_set.keys():
        if data_set[name].shape[0] % batch_size != 0:
            data_set[name] = data_set[name][:-(data_set[name].shape[0] % batch_size)]

    # load in the learned distributed representation HLA-Vec
    hla_vec_obj = Word2Vec.load(files['vector_embedding'])
    hla_vec_embed = hla_vec_obj.wv
    embed_shape = hla_vec_embed.syn0.shape
    embedding_weights = np.random.rand(embed_shape[0] + 1, embed_shape[1])
    for key in AA_IDX.keys():
        embedding_weights[AA_IDX[key], :] = hla_vec_embed[key]
    embedded_dim = embed_shape[1]

    embedding = Embedding(embedded_dim, embedding_weights)
    train_embedding = embedding(torch.from_numpy(data_set['X_train'])).numpy()
    train_embedding = train_embedding.reshape((train_embedding.shape[0], -1))
    test_embedding = embedding(torch.from_numpy(data_set['X_test'])).numpy()
    test_embedding = test_embedding.reshape((test_embedding.shape[0], -1))

    if str2bool(params['binary']):
        data_set['Y_train'] = np.argmax(data_set['Y_train'], -1)[:, np.newaxis]
        data_set['Y_test'] = np.argmax(data_set['Y_test'], -1)[:, np.newaxis]
    else:
        data_set['Y_train'] = data_set['Y_train'][:, np.newaxis]
        data_set['Y_test'] = data_set['Y_test'][:, np.newaxis]

    # weight_space(train_embedding, test_embedding, data_set)
    infinite_fcn(train_embedding, test_embedding, data_set, binary=binary)
    # infinite_resnet(train_embedding, test_embedding, data_set)

    print("The result of Gaussian process regression is:")
    gaussian_process(train_embedding, test_embedding, data_set,
                     is_classifier=False, binary=binary)
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(name=u'use',
              dimensions=512,
              corpus_size='na',
              vocabulary_size='230k',
              download_url='https://tfhub.dev/google/universal-sentence-encoder/2',
              format='.tar.gz',
              architecture='DAN',
              trained_data='wikipedia and other sources',
              language='en'),
    Embedding(name=u'use_large',
              dimensions=512,
              corpus_size='na',
              vocabulary_size='230k',
              download_url='https://tfhub.dev/google/universal-sentence-encoder-large/3',
              format='.tar.gz',
              architecture='Transformer',
              trained_data='wikipedia and other sources',
              language='en'),
    Embedding(name=u'use_lite',
              dimensions=512,
              corpus_size='na',
              vocabulary_size='na',
              download_url='https://tfhub.dev/google/universal-sentence-encoder-lite/2',
              format='.tar.gz',
              architecture='Transformer',
              trained_data='wikipedia and other sources',
              language='en')
]
def __init__(self, graph, save_path):
    Embedding.__init__(self, graph)
    self.set_paths(save_path)
    self.walker = RandomWalker(graph)
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'google_news_300',
                  dimensions=300,
                  corpus_size='100B',
                  vocabulary_size='3M',
                  download_url='https://s3.amazonaws.com/dl4j-distribution/'
                               'GoogleNews-vectors-negative300.bin.gz',
                  format='gz',
                  architecture='skip-gram',
                  trained_data='Google News',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):
        try:
            encoding = 'utf-8'
            unicode_errors = 'strict'

            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'rb')
            header = to_unicode(f.readline(), encoding=encoding)
            # throws for invalid file format
            vocab_size, vector_size = (int(x) for x in header.split())

            binary_len = dtype(real).itemsize * vector_size
            for _ in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; "
                                       "is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(f.read(binary_len), dtype=real).astype(real)
                self.word_vectors[word] = weights
            self.model = model
            print("Model loaded Successfully !")
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> np.array:
        text = texts[0]
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions,
                          dtype="float32")

        tokens = Embeddings._tokens(text)
        vectors = np.array([self.word_vectors[token] for token in tokens
                            if token in self.word_vectors.keys()])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)
        elif pooling == 'max':
            result = np.max(vectors, axis=0)
        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)
        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result
            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([
                tfidf_dict.get(token) * self.word_vectors.get(token)
                for token in tokens
                if token in self.word_vectors.keys() and token in tfidf_dict
            ])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(f'Given pooling method "{pooling}" not implemented in "{self.model}"')

        return result
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'wiki_news_300',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'wiki-news-300d-1M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'wiki_news_300_sub',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'wiki-news-300d-1M-subword.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'common_crawl_300',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'crawl-300d-2M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
        Embedding(name=u'common_crawl_300_sub',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url='https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                               'crawl-300d-2M-subword.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text):
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):
        try:
            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'r')
            next(f)  # skip the header line (vocab size, vector size)
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model = model
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> np.array:
        text = texts[0]
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions,
                          dtype="float32")

        tokens = Embeddings._tokens(text)
        vectors = np.array([self.word_vectors[token] for token in tokens
                            if token in self.word_vectors.keys()])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)
        elif pooling == 'max':
            result = np.max(vectors, axis=0)
        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)
        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result
            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([
                tfidf_dict.get(token) * self.word_vectors.get(token)
                for token in tokens
                if token in self.word_vectors.keys() and token in tfidf_dict
            ])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(f'Given pooling method "{pooling}" not implemented')

        return result
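# A minimal usage sketch, not part of the original file: it assumes the chosen
# fastText .vec archive was already downloaded and unzipped into the hypothetical
# directory `./models/fasttext/wiki_news_300`.
if __name__ == '__main__':
    embeddings = Embeddings()
    embeddings.load_model(model='wiki_news_300',
                          model_path='./models/fasttext/wiki_news_300')
    # encode() reads only the first element of the list and pools its token vectors
    sentence_vector = embeddings.encode(['natural language processing'], pooling='mean')
    print(sentence_vector.shape)  # (300,) for the 300-d wiki_news model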
def __init__(self,
             vocab_size,
             nb_negative,
             embed_dims=128,
             context_dims=128,
             negprob_table=None,
             optimizer='adam'):
    super(NCELangModelV2, self).__init__(weighted_inputs=False)
    self.vocab_size = vocab_size
    self.embed_dim = embed_dims
    self.optimizer = optimizers.get(optimizer)
    self.nb_negative = nb_negative
    self.loss = categorical_crossentropy
    self.loss_fnc = objective_fnc(self.loss)

    if negprob_table is None:
        negprob_table_ = np.ones(shape=(vocab_size,),
                                 dtype=theano.config.floatX) / vocab_size
        negprob_table = theano.shared(negprob_table_)
        self.neg_prob_table = negprob_table_
    else:
        self.neg_prob_table = negprob_table.astype(theano.config.floatX)
        negprob_table = theano.shared(negprob_table.astype(theano.config.floatX))

    self.sampler = TableSampler(self.neg_prob_table)

    self.add_input(name='idxes', ndim=3, dtype='int32')
    self.add_node(Split(split_at=1, split_axis=0),
                  name=('pos_sents', ''), inputs='idxes')

    seq = containers.Sequential()
    seq.add(self.nodes['pos_sents'])
    seq.add(Embedding(vocab_size, embed_dims))
    seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims))
    # seq.add(Dropout(0.5))

    self.add_node(seq, name='seq')
    self.add_node(PartialSoftmax(input_dim=context_dims, output_dim=vocab_size),
                  name='part_prob', inputs=('idxes', 'seq'))
    self.add_node(Dense(input_dim=context_dims, output_dim=1, activation='exponential'),
                  name='normalizer', inputs='seq')
    self.add_node(LookupProb(negprob_table), name='lookup_prob', inputs='idxes')

    test_node = Dense(input_dim=context_dims, output_dim=vocab_size,
                      activation='exponential')
    test_node.params = []
    test_node.W = self.nodes['part_prob'].W
    test_node.b = self.nodes['part_prob'].b
    self.add_node(test_node, name='true_unrm_prob', inputs='seq')
    # self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob')

    self.add_output('pos_prob', node='part_prob')
    self.add_output('neg_prob', node='lookup_prob')
    # self.add_output('pred_prob', node='true_prob')
    self.add_output('normalizer', node='normalizer')
    self.add_output('unrm_prob', node='true_unrm_prob')
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'elmo_bi_lm',
                  dimensions=512,
                  corpus_size='1B',
                  vocabulary_size='5.5B',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
                  format='tar.gz',
                  architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
                  trained_data='One Billion Word Benchmark',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.elmo_module = None
        self.model = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str):
        self.elmo_module = hub.Module(model_path)
        self.model = model

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> Optional[np.array]:
        text_tokens = [Embeddings.tokenize(text) for text in texts]
        max_seq_length = kwargs.get('max_seq_length')
        if max_seq_length:
            text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length)
                           for tokens in text_tokens]
            seq_length = [max_seq_length] * len(texts)
        else:
            seq_length = [len(tokens) for tokens in text_tokens]

        embeddings = self.elmo_module(inputs={"tokens": text_tokens,
                                              "sequence_len": seq_length},
                                      signature="tokens",
                                      as_dict=True)["elmo"]

        if not pooling:
            return embeddings

        if pooling == 'mean':
            return tf.reduce_mean(embeddings, 0)
        elif pooling == 'max':
            return tf.reduce_max(embeddings, 0)
        elif pooling == 'min':
            return tf.reduce_min(embeddings, 0)
        elif pooling == 'mean_max':
            return tf.concat(values=[tf.reduce_mean(embeddings, 0),
                                     tf.reduce_max(embeddings, 0)], axis=0)
        else:
            print(f"Pooling method \"{pooling}\" not implemented")
            return None
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(name=u'infersent_glove',
              dimensions=300,
              corpus_size='570k human-generated English sentence pairs',
              vocabulary_size='na',
              download_url='https://dl.fbaipublicfiles.com/infersent/infersent1.pkl',
              format='tar.gz',
              architecture='cbow',
              trained_data='SNLI dataset',
              language='en'),
    Embedding(name=u'infersent_fasttext',
              dimensions=300,
              corpus_size='570k human-generated English sentence pairs',
              vocabulary_size='na',
              download_url='https://dl.fbaipublicfiles.com/infersent/infersent2.pkl',
              format='tar.gz',
              architecture='cbow',
              trained_data='SNLI dataset',
              language='en')
]

EMBEDDING_MODELS: Dict[str, Embedding] = {
    embedding.name: embedding for embedding in EMBEDDING_MODELS
}
# vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
# print('vocab size {} before shrink'.format(vocab.vocab_len))
vocab.shrink_vocab(2)
print('vocab size {} after shrink'.format(vocab.vocab_len))

print('read vec')
word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
vec = read_vec(pubmed_w2v_path, word_list)
assert vec.shape[0] == vocab.vocab_len

print('build emb layer')
emb = Embedding(vocab.vocab_len, vec.shape[1], padding_idx=0, trainable=False)
emb.initialize_embedding(vec)
emb.cuda()
torch.save(emb.state_dict(), emb_path)

print('dump data')
train_sents = convert_sents_to_idx(train_sents, vocab)
test_sents = convert_sents_to_idx(test_sents, vocab)
valid_sents = convert_sents_to_idx(valid_sents, vocab)
dump_preprocessed_data(opt.train_path, train_sents, train_labels)
dump_preprocessed_data(opt.test_path, test_sents, test_labels)
dump_preprocessed_data(opt.valid_path, valid_sents, valid_labels)
dump_vocab(opt.vocab_path, vocab)
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'twitter_100',
                  dimensions=100,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/q2wof83a0yq7q74/glove.twitter.27B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_200',
                  dimensions=200,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/hfw00m77ibz24y5/glove.twitter.27B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_25',
                  dimensions=25,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/jx97sz8skdp276k/glove.twitter.27B.25d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_50',
                  dimensions=50,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/9mutj8syz3q20e3/glove.twitter.27B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'wiki_100',
                  dimensions=100,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/g0inzrsy1ds3u63/glove.6B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_200',
                  dimensions=200,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/pmj2ycd882qkae5/glove.6B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_300',
                  dimensions=300,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/9jbbk99p0d0n1bw/glove.6B.300d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_50',
                  dimensions=50,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/o3axsz1j47043si/glove.6B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'crawl_42B_300',
                  dimensions=300,
                  corpus_size='42B',
                  vocabulary_size='1.9M',
                  download_url='http://nlp.stanford.edu/data/glove.42B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (42B tokens)',
                  language='en'),
        Embedding(name=u'crawl_840B_300',
                  dimensions=300,
                  corpus_size='840B',
                  vocabulary_size='2.2M',
                  download_url='http://nlp.stanford.edu/data/glove.840B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (840B tokens)',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):
        try:
            model_file = [f for f in os.listdir(model_path)
                          if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'r')
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model = model
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def encode(self, text: str, pooling: str = 'mean', **kwargs) -> np.array:
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions,
                          dtype="float32")

        tokens = Embeddings._tokens(text)
        vectors = np.array([self.word_vectors[token] for token in tokens
                            if token in self.word_vectors.keys()])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)
        elif pooling == 'max':
            result = np.max(vectors, axis=0)
        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)
        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result
            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([
                tfidf_dict.get(token) * self.word_vectors.get(token)
                for token in tokens
                if token in self.word_vectors.keys() and token in tfidf_dict
            ])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(f'Given pooling method "{pooling}" not implemented in "{self.model}"')

        return result
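# A minimal usage sketch, not part of the original file: it assumes the chosen
# GloVe vectors were already downloaded and unzipped into the hypothetical
# directory `./models/glove/wiki_50`. Note that this encode() takes a single
# string rather than a list; the tf-idf-sum pooling path needs a caller-supplied
# {token: idf weight} dict (the one below is a toy example).
if __name__ == '__main__':
    embeddings = Embeddings().load_model(model='wiki_50',
                                         model_path='./models/glove/wiki_50')
    # plain mean pooling over token vectors
    vec = embeddings.encode('the quick brown fox', pooling='mean')
    # tf-idf weighted mean
    weighted = embeddings.encode('the quick brown fox',
                                 pooling='tf-idf-sum',
                                 tfidf_dict={'the': 0.1, 'quick': 0.8,
                                             'brown': 0.7, 'fox': 0.9})
    print(vec.shape, weighted.shape)  # both (50,) for the 50-d wiki model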
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'bert_base_uncased',
                  dimensions=768,
                  corpus_size='3300M',
                  vocabulary_size='30522(sub-word)',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                               'google/bert_uncased_L-12_H-768_A-12/1.tar.gz',
                  format='tar.gz',
                  architecture='Transformer, Layers=12, Hidden = 768, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name=u'bert_base_cased',
                  dimensions=768,
                  corpus_size='3300M',
                  vocabulary_size='30522(sub-word)',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                               'bert_cased_L-12_H-768_A-12/1.tar.gz',
                  format='tar.gz',
                  architecture='Transformer Layers=12, Hidden = 768, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name=u'bert_multi_cased',
                  dimensions=768,
                  corpus_size='3300M',
                  vocabulary_size='30522 (sub-word)',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                               'bert_multi_cased_L-12_H-768_A-12/1.tar.gz',
                  format='tar.gz',
                  architecture='Transformer Layers=12, Hidden = 768, heads = 12',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en'),
        Embedding(name=u'bert_large_uncased',
                  dimensions=1024,
                  corpus_size='3300M',
                  vocabulary_size='30522 (sub-word)',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                               'bert_uncased_L-24_H-1024_A-16/1.tar.gz',
                  format='tar.gz',
                  architecture='Transformer Layers=24, Hidden = 1024, heads = 16',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    tokenizer: FullTokenizer = None

    def __init__(self):
        self.sess = tf.Session()
        self.bert_module = None
        self.model = None

    def create_tokenizer_from_hub_module(self, model_path: str):
        """Get the vocab file and casing info from the Hub module."""
        tokenization_info = self.bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"],
            ]
        )
        Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                             do_lower_case=do_lower_case)

    @staticmethod
    def _model_single_input(text: str, max_seq_length: int
                            ) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = Embeddings.tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = Embeddings.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        self.bert_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.create_tokenizer_from_hub_module(model_path)
        self.model = model
        print("Model loaded Successfully !")

    def encode(self, texts: list, pooling: Optional[str] = None,
               **kwargs) -> Optional[np.array]:
        max_seq_length = kwargs.get('max_seq_length', 128)
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        bert_inputs = dict(
            input_ids=np.array(input_ids),
            input_mask=np.array(input_masks),
            segment_ids=np.array(segment_ids))

        bert_outputs = self.bert_module(bert_inputs, signature="tokens", as_dict=True)
        sequence_output = bert_outputs["sequence_output"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
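# A minimal usage sketch, not part of the original file: it assumes a TF1
# environment with tensorflow_hub and bert's FullTokenizer available, and a BERT
# hub module already unpacked to the hypothetical directory `./models/bert_base_uncased`.
if __name__ == '__main__':
    embeddings = Embeddings()
    embeddings.load_model(model='bert_base_uncased',
                          model_path='./models/bert_base_uncased')
    # token-level output: (batch, max_seq_length, 768) for BERT base
    tokens_out = embeddings.encode(['embeddings as a service'], max_seq_length=64)
    # mean-pooled sentence vectors: (batch, 768)
    pooled = embeddings.encode(['embeddings as a service'],
                               pooling='mean', max_seq_length=64)
    print(tokens_out.shape, pooled.shape)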
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'elmo_bi_lm',
                  dimensions=512,
                  corpus_size='1B',
                  vocabulary_size='5.5B',
                  download_url='https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
                  format='tar.gz',
                  architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
                  trained_data='One Billion Word Benchmark',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.elmo_module = None
        self.model = None
        self.sess = tf.Session()

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str):
        self.elmo_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.model = model

    def encode(self, texts: list, pooling: Optional[str] = None,
               **kwargs) -> Optional[np.array]:
        text_tokens = [Embeddings.tokenize(text) for text in texts]
        max_seq_length = kwargs.get('max_seq_length')
        if max_seq_length:
            text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length)
                           for tokens in text_tokens]
            seq_length = [max_seq_length] * len(texts)
        else:
            seq_length = [len(tokens) for tokens in text_tokens]

        sequence_output = self.elmo_module(inputs={"tokens": text_tokens,
                                                   "sequence_len": seq_length},
                                           signature="tokens",
                                           as_dict=True)["elmo"]
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
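# A minimal usage sketch, not part of the original file: it assumes a TF1
# environment with tensorflow_hub available and an ELMo hub module already
# unpacked to the hypothetical directory `./models/elmo_bi_lm`. Passing
# max_seq_length pads every sentence to the same token count, which the
# "tokens" signature requires for a batch.
if __name__ == '__main__':
    embeddings = Embeddings()
    embeddings.load_model(model='elmo_bi_lm', model_path='./models/elmo_bi_lm')
    texts = ['deep contextual word vectors', 'elmo embeddings']
    # token-level output padded to max_seq_length: shape (len(texts), 16, D)
    token_embeddings = embeddings.encode(texts, max_seq_length=16)
    # max-pooled sentence vectors: shape (len(texts), D)
    pooled = embeddings.encode(texts, pooling='max', max_seq_length=16)
    print(token_embeddings.shape, pooled.shape)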
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'xlnet_large_cased',
                  dimensions=1024,
                  corpus_size='32.89B',
                  vocabulary_size='32000',
                  download_url='https://storage.googleapis.com/xlnet/released_models/'
                               'cased_L-24_H-1024_A-16.zip',
                  format='zip',
                  architecture='Transformer, 24-layer, 1024-hidden, 16-heads',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words, Giga5 (16gb), '
                               'ClueWeb 2012-B(19gb), Common Crawl(78gb)',
                  language='en'),
        Embedding(name=u'xlnet_base_cased',
                  dimensions=768,
                  corpus_size='3.86B',
                  vocabulary_size='32000',
                  download_url='https://storage.googleapis.com/xlnet/released_models/'
                               'cased_L-12_H-768_A-12.zip',
                  format='zip',
                  architecture='Transformer 12-layer, 768-hidden, 12-heads.',
                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding for embedding in EMBEDDING_MODELS
    }

    tokenizer: spm.SentencePieceProcessor = None
    mode_config_path: str = 'xlnet_config.json'
    sentence_piece_model_path: str = 'spiece.model'

    def __init__(self):
        self.xlnet_config = None
        self.run_config = None
        self.model = None
        self.sess = tf.Session()

    @staticmethod
    def load_tokenizer(model_path: str):
        """Load the SentencePiece model shipped with the checkpoint."""
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(os.path.join(model_path, Embeddings.sentence_piece_model_path))
        Embeddings.tokenizer = sp_model

    @classmethod
    def tokenize_fn(cls, text):
        text = preprocess_text(text, lower=False)
        return encode_ids(cls.tokenizer, text)

    @staticmethod
    def _model_single_input(text: str, max_seq_length: int
                            ) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = Embeddings.tokenize_fn(text)
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(SEG_ID_A)
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_A)
        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        input_ids = tokens

        # The mask has 0 for real tokens and 1 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [0] * len(input_ids)

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length:
            delta_len = max_seq_length - len(input_ids)
            input_ids = [0] * delta_len + input_ids
            input_mask = [1] * delta_len + input_mask
            segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
        self.xlnet_config = xlnet.XLNetConfig(
            json_path=os.path.join(model_path, Embeddings.mode_config_path))
        self.run_config = xlnet.create_run_config(is_training=True,
                                                  is_finetune=True,
                                                  FLAGS=Flags)
        self.load_tokenizer(model_path)
        self.model = model
        print("Model loaded Successfully !")

    def encode(self, texts: list, pooling: Optional[str] = None,
               **kwargs) -> Optional[np.array]:
        max_seq_length = kwargs.get('max_seq_length', 128)
        input_ids, input_masks, segment_ids = [], [], []

        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=self.xlnet_config,
            run_config=self.run_config,
            input_ids=np.array(input_ids, dtype=np.int32),
            seg_ids=np.array(segment_ids, dtype=np.int32),
            input_mask=np.array(input_masks, dtype=np.float32))

        self.sess.run(tf.initializers.global_variables())

        # Get the sequence output
        sequence_output = xlnet_model.get_sequence_output()
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
def main():
    opt = Options()
    print('Use {}'.format(opt.pooling_type_str_dict[opt.pooling_type]))
    train_sents, train_labels = pickle.load(open(opt.train_path, 'rb'))
    valid_sents, valid_labels = pickle.load(open(opt.valid_path, 'rb'))
    test_sents, test_labels = pickle.load(open(opt.test_path, 'rb'))
    # np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)

    emb = Embedding(opt.vocab_size, 200, padding_idx=0, trainable=False)
    cnn = ML_CNN.CNN_Module(n_classes=opt.classifier_output_size)
    if opt.use_cuda:
        emb.cuda()
        cnn.cuda()
    param = []
    param.extend(emb.parameters())
    param.extend(cnn.parameters())
    # optimizer = torch.optim.Adam(param, lr=opt.lr, weight_decay=0.01)
    # optimizer = torch.optim.Adam(param, lr=opt.lr, weight_decay=0.00001)
    optimizer = torch.optim.Adam(param, lr=opt.lr)
    criteron = torch.nn.CrossEntropyLoss()

    if opt.restore:
        if os.path.exists(opt.feature_net_path):
            print("Load pretrained embedding")
            emb.load_state_dict(torch.load(opt.feature_net_path))
        else:
            print("No pretrained embedding")
        if os.path.exists(opt.classifier_net_path):
            print("Load pretrained cnn classifier")
            cnn.load_state_dict(torch.load(opt.classifier_net_path))
        else:
            print("No pretrained cnn classifier")

    best_acc = -1
    for epoch in range(opt.max_epochs):
        print("Starting epoch %d" % epoch)
        kf = get_minibatches_idx(len(train_sents), opt.batch_size, shuffle=True)
        epoch_losses = []
        cnn.train()
        emb.train()
        for iteridx, train_index in kf:
            if len(train_index) <= 1:
                continue
            sents = [train_sents[t] for t in train_index]
            labels = [train_labels[t] for t in train_index]
            # X_batch, X_lengths, X_labels = prepare_data_for_rnn(sents, labels)
            X_batch, X_labels = prepare_data_for_cnn(sents, labels)
            X_batch = Variable(X_batch)
            X_labels = Variable(X_labels)
            if opt.use_cuda:
                X_batch = X_batch.cuda()
                X_labels = X_labels.cuda()
            optimizer.zero_grad()
            features = emb(X_batch)
            output = cnn(features)
            loss = criteron(output, X_labels)
            local_loss = loss.data[0]
            epoch_losses.append(local_loss)
            loss.backward()
            optimizer.step()
            if iteridx % opt.print_freq == 0:
                count = output.size(0)
                topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
                topK_acc = [float(tmp) / count for tmp in topK_correct]
                topK_str = " , ".join(["acc@{}: {}".format(k, tmp_acc)
                                       for k, tmp_acc in zip(topK, topK_acc)])
                print("Epoch {} Iteration {} loss: {} , {}".format(
                    epoch + 1, iteridx + 1, local_loss, topK_str))

        ave_loss = sum(epoch_losses) / len(epoch_losses)

        # validation
        kf = get_minibatches_idx(len(valid_sents), opt.batch_size, shuffle=True)
        count = 0
        all_topK_correct = np.zeros(len(topK), dtype=int)
        for _, valid_index in kf:
            emb.eval()
            cnn.eval()
            sents = [valid_sents[t] for t in valid_index]
            labels = [valid_labels[t] for t in valid_index]
            X_batch, X_labels = prepare_data_for_cnn(sents, labels)
            X_batch = Variable(X_batch)
            X_labels = Variable(X_labels)
            if opt.use_cuda:
                X_batch = X_batch.cuda()
                X_labels = X_labels.cuda()
            features = emb(X_batch)
            output = cnn(features)
            topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
            topK_correct = np.array(topK_correct)
            all_topK_correct += topK_correct
            bsize = output.size(0)
            count += bsize
        all_topK_acc = all_topK_correct / float(count)
        all_topK_acc = all_topK_acc.tolist()
        all_topK_str = " , ".join(["val_acc@{}: {}".format(k, tmp_acc)
                                   for k, tmp_acc in zip(topK, all_topK_acc)])
        print("Epoch {} Avg_loss: {}, {}".format(epoch + 1, ave_loss, all_topK_str))

        acc = all_topK_acc[important_K]
        if acc > best_acc:
            print('Dump current model due to current acc {} > past best acc {}'.format(acc, best_acc))
            torch.save(cnn.state_dict(), opt.classifier_net_path)
            best_acc = acc

            # compute and dump per-class F-scores on the test set
            fscore_records = [{k: FScore() for k in topK}
                              for i in range(opt.classifier_output_size)]
            kf = get_minibatches_idx(len(test_sents), opt.batch_size, shuffle=True)
            emb.eval()
            cnn.eval()
            for _, test_index in kf:
                sents = [test_sents[t] for t in test_index]
                labels = [test_labels[t] for t in test_index]
                X_batch, X_labels = prepare_data_for_cnn(sents, labels)
                X_batch = Variable(X_batch)
                X_labels = Variable(X_labels)
                if opt.use_cuda:
                    X_batch = X_batch.cuda()
                    X_labels = X_labels.cuda()
                features = emb(X_batch)
                output = cnn(features)
                update_F1(output.cpu().data, X_labels.cpu().data,
                          opt.classifier_output_size, topK, fscore_records)
            with open('F_score_dir/{}.pkl'.format(epoch + 1), 'w') as f:
                print('dumping fscore in epoch {}'.format(epoch + 1))
                pickle.dump(fscore_records, f)

    print('Loading best model')
    cnn.load_state_dict(torch.load(opt.classifier_net_path))
    print('Testing Data')
    kf = get_minibatches_idx(len(test_sents), opt.batch_size, shuffle=True)
    count = 0
    all_topK_correct = np.zeros(len(topK), dtype=int)
    fscore_records = [{k: FScore() for k in topK}
                      for i in range(opt.classifier_output_size)]
    for _, test_index in kf:
        emb.eval()
        cnn.eval()
        sents = [test_sents[t] for t in test_index]
        labels = [test_labels[t] for t in test_index]
        X_batch, X_labels = prepare_data_for_cnn(sents, labels)
        X_batch = Variable(X_batch)
        X_labels = Variable(X_labels)
        if opt.use_cuda:
            X_batch = X_batch.cuda()
            X_labels = X_labels.cuda()
        features = emb(X_batch)
        output = cnn(features)
        update_F1(output.cpu().data, X_labels.cpu().data,
                  opt.classifier_output_size, topK, fscore_records)
        topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
        topK_correct = np.array(topK_correct)
        all_topK_correct += topK_correct
        bsize = output.size(0)
        count += bsize
    all_topK_acc = all_topK_correct / float(count)
    all_topK_acc = all_topK_acc.tolist()
    all_topK_str = " , ".join(["test_acc@{}: {}".format(k, tmp_acc)
                               for k, tmp_acc in zip(topK, all_topK_acc)])
    print("Training end {}".format(all_topK_str))
    with open('F_score_dir/best.pkl', 'w') as f:
        print('dumping fscore in')
        pickle.dump(fscore_records, f)
def __init__(self, graph, path):
    Embedding.__init__(self, graph)
    self.nodes = np.asarray(list(graph.nodes()))
    self.context_embedding = None
    self.center_embedding = None
    self.set_paths(path)
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(name=u'umlfit',
              dimensions=300,
              corpus_size='570k human-generated English sentence pairs',
              vocabulary_size='230k',
              download_url='http://files.fast.ai/models/wt103/',
              format='.h5',
              architecture='cbow',
              trained_data='Stephen Merity’s Wikitext 103 dataset',
              language='en')
]

EMBEDDING_MODELS: Dict[str, Embedding] = {
    embedding.name: embedding for embedding in EMBEDDING_MODELS
}
def embed_network(self, seed, save_path, algorithm_name, precomputed, training, walks):
    '''
    Embed the network with the given algorithm; the network is replaced
    by its embedding to save memory.
    '''
    save_path = os.path.join(save_path, 'seed_{}'.format(seed))
    if DEBUG:
        save_path += '_debug'
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    if precomputed:
        model = Embedding(self.residual_network)
        model.set_paths(save_path)
        model.load_embedding()
        self.residual_network = model.word_vectors
        self.embedded = True
        return
    elif algorithm_name == 'node2vec':
        model = Node2vec(self.residual_network, save_path)
    elif algorithm_name == 'deep_walk':
        model = DeepWalk(self.residual_network, save_path)
    elif algorithm_name == 'efge':
        model = Efge(self.residual_network, save_path)
    else:
        raise NotImplementedError('embedding is not implemented')

    model.get_walks(**walks)
    model.train(**training)
    model.save_embedding()
    # we replace the network by its embedding to save memory
    self.residual_network = model.word_vectors
    self.embedded = True