def build_model(vocab: Vocabulary, args, **kwargs) -> Model:
    """Build a MortalityClassifier: token embeddings -> CNN encoder -> classifier.

    Args:
        vocab: vocabulary used to size the token embedding table.
        args: namespace providing ``pretrained_WE_path`` (optional pretrained
            embedding file) and ``use_reg`` (enable L2 regularization).
        **kwargs: forwarded unchanged to ``MortalityClassifier``.

    Returns:
        The assembled ``MortalityClassifier`` model.
    """
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200

    # Build the token embedder once; only add the pretrained-file kwargs when a
    # pretrained embedding path was supplied (the original duplicated this call).
    embedding_kwargs = dict(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    if args.pretrained_WE_path:
        embedding_kwargs.update(pretrained_file=args.pretrained_WE_path, vocab=vocab)
    embedder = BasicTextFieldEmbedder({"tokens": Embedding(**embedding_kwargs)})

    # NOTE: num_filters is per ngram size, so the encoder output dim is
    # num_filters * len(ngram_filter_sizes).
    encoder = CnnEncoder(embedding_dim=EMBED_DIMS,
                         ngram_filter_sizes=(2, 3, 5),
                         num_filters=5)

    # Optionally apply the same L2 penalty to embedder, encoder and classifier.
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
def __init__(self,
             char_dict_size: int = 10,
             char_embedding_dim: int = 8,
             word_dict_size: int = 10,
             word_embedding_dim: int = 300,
             char_filter_dim: int = 100,
             n_gram_sizes: Tuple[int, ...] = (5,),
             rnn_dim: int = 100,
             keep_prob: float = .8,
             bidirectional: bool = True):
    """Assemble the BiDAF sub-modules.

    Args:
        char_dict_size: size of the character vocabulary.
        char_embedding_dim: dimension of character embeddings.
        word_dict_size: size of the word vocabulary.
        word_embedding_dim: dimension of word embeddings.
        char_filter_dim: number of CNN filters over character embeddings.
        n_gram_sizes: CNN n-gram window sizes. Default is now the immutable
            tuple ``(5,)`` instead of the original mutable list ``[5]``.
        rnn_dim: hidden size of the RNN modules.
        keep_prob: dropout keep probability (annotation fixed: this is a
            float, not a bool).
        bidirectional: whether the RNNs are bidirectional.
    """
    super(Bidaf, self).__init__()
    self.char_embedding = Embedding(char_dict_size, char_embedding_dim)
    self.word_embedding = Embedding(word_dict_size, word_embedding_dim)
    self.char_cnn_encoder = CNNEncoder(char_embedding_dim, char_filter_dim, n_gram_sizes)
    # Highway network over the concatenated char-CNN + word embedding.
    self.highway = Highway(char_filter_dim + word_embedding_dim, num_layers=2)
    self.contextual_embedding = RNNBaseModule(char_filter_dim + word_embedding_dim,
                                              rnn_dim,
                                              keep_prob=keep_prob,
                                              bidirectional=bidirectional)
    # Modeling layer consumes the bidirectional (2 * rnn_dim) context.
    self.model_layers = RNNBaseModule(rnn_dim * 2,
                                      rnn_dim,
                                      num_layers=2,
                                      keep_prob=keep_prob,
                                      bidirectional=bidirectional)
    self.output_module = MultiDimLinear(rnn_dim * 2, 1)
def create_representation(args):
    """Instantiate the word-representation wrapper selected by the CLI args.

    Dispatches on ``<representation>`` (PPMI / SVD / CHI / plain embedding)
    and on the ``--w+c`` flag (combine word and context vectors).
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)

    if rep_type == 'SVD':
        if not w_c:
            return SVDEmbedding(path, True, eig)
        return EnsembleEmbedding(SVDEmbedding(path, False, eig, False),
                                 SVDEmbedding(path, False, eig, True),
                                 True)

    if rep_type == 'CHI':
        if w_c:
            raise Exception('w+c is not implemented for CHI.')
        return PositiveExplicit(path, True, neg, False)

    # Default: plain word embedding, optionally ensembled with contexts.
    if w_c:
        return EnsembleEmbedding(Embedding(path + '.words', False),
                                 Embedding(path + '.contexts', False),
                                 True)
    return Embedding(path + '.words', True)
def __init__(self, input_size: int = INPUT_SIZE, output_size: int = OUTPUT_SIZE,
             hidden_size: int = HIDDEN_SIZE, embed_size: int = EMBED_SIZE,
             lr: float = LEARNING_RATE, clip_grad: float = CLIP_GRAD,
             init_range: float = INIT_RANGE):
    """Assemble the encoder/decoder layer stacks and record hyper-parameters."""
    # Encoder side: embed input tokens, then run them through an LSTM.
    encoder_embed = Embedding(input_size, embed_size, init_range)
    encoder_lstm = LSTM(embed_size, hidden_size, init_range)
    # Decoder side: its LSTM is chained to the encoder LSTM via `previous`.
    decoder_layers = [
        Embedding(output_size, embed_size, init_range),
        LSTM(embed_size, hidden_size, init_range, previous=encoder_lstm),
        Softmax(hidden_size, output_size, init_range),
    ]
    self.input_layers = [encoder_embed, encoder_lstm]
    self.output_layers = decoder_layers
    # Hyper-parameters kept for training / gradient clipping.
    self.hidden_size = hidden_size
    self.embed_size = embed_size
    self.input_size = input_size
    self.output_size = output_size
    self.lr = lr
    self.clip_grad = clip_grad
def test_embedding():
    """Sanity checks for Embedding.compute and pair_embed (fixed RNG seed)."""
    np.random.seed(42)

    # Expected values carried over from the original test.
    all_ones = np.ones(100, dtype=int)
    assert np.sum(Embedding("00001111", all_ones).compute()) == 0
    assert np.sum(Embedding("10001111", all_ones).compute()) == 1

    bits = np.random.randint(2, size=80)
    first = "1" * 10 + "0" * 5
    second = "1" * 11 + "0" * 4
    assert pair_embed(first, second, bits) == 1
    assert pair_embed("bad", "boy", bits) == 16
def get_image_feature(img_path, img_list_path, model_path, epoch, gpu_id):
    """Extract face embeddings for the images listed in *img_list_path*.

    Each list line is "<relative image name> <10 landmark floats> <faceness score>".

    Args:
        img_path: directory containing the images.
        img_list_path: text file listing images, landmarks and faceness scores.
        model_path: path of the embedding model to load.
        epoch: model epoch/checkpoint to load.
        gpu_id: GPU device used by the embedding model.

    Returns:
        Tuple ``(img_feats, faceness_scores)`` as float32 numpy arrays.
    """
    embedding = Embedding(model_path, epoch, gpu_id)
    # Use a context manager so the list file is closed (the original leaked it).
    with open(img_list_path) as img_list:
        files = img_list.readlines()
    print('files:', len(files))

    faceness_scores = []
    img_feats = []
    for img_index, each_line in enumerate(files):
        if img_index % 500 == 0:
            print('processing', img_index)
        if img_index == 2000:
            # Hard cap kept from the original code — presumably a debug
            # limit; TODO confirm whether it should apply in production.
            break
        name_lmk_score = each_line.strip().split(' ')
        img_name = os.path.join(img_path, name_lmk_score[0])
        img = cv2.imread(img_name)
        # Fields 1..-2 are the 5 (x, y) landmark coordinates.
        lmk = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32)
        lmk = lmk.reshape((5, 2))
        img_feats.append(embedding.get(img, lmk))
        faceness_scores.append(name_lmk_score[-1])

    img_feats = np.array(img_feats).astype(np.float32)
    faceness_scores = np.array(faceness_scores).astype(np.float32)
    return img_feats, faceness_scores
def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int, hid_size: int):
    """Initialize the language recognition module.

    Args:
        data_set: dataset from which the input symbols and output classes
            (languages) are extracted
        ngram_size: size of the n-gram features (1 = unigrams, 2 = bigrams, ...)
        emb_size: size of the character embedding vectors
        hid_size: size of the hidden layer of the scoring FFN
    """
    # Remember the n-gram order used to build features.
    self.ngram_size = ngram_size
    # Embedding sub-module over the feature alphabet of the dataset.
    self.register("emb", Embedding(self.alphabet(data_set), emb_size))
    # Mapping between language labels and integer ids.
    languages = {lang for _, lang in data_set}
    self.enc = Encoding(languages)
    # Scoring feed-forward sub-module.
    self.register("ffn", FFN(idim=emb_size, hdim=hid_size, odim=len(languages)))
    # All registered parameters must require gradients; this check exposes
    # the deliberate "bug" in the embedding module.
    assert all(param.requires_grad is True for param in self.params())
def dimension_afn(x):
    """Run the AFN embedding-dimension estimate on *x* and plot E1/E2 curves."""
    emb = Embedding(x)
    dims = np.arange(1, 22)  # candidate embedding dimensions 1..21 (same as 20 + 2)
    E, Es = emb.afn(x, tau=138, dim=dims, window=45, metric='chebyshev')
    # Consecutive ratios of the AFN statistics.
    ratio_e = E[1:] / E[:-1]
    ratio_es = Es[1:] / Es[:-1]
    emb.plot_afn(dims, ratio_e, ratio_es)
def __init__(
    self,
    dataset='data/185_baseball.csv',
    columns=None,
    tree='ontologies/class-tree_dbpedia_2016-10.json',
    embedding='models/wiki2vec/en.model',
    row_agg_func=mean_of_rows,
    tree_agg_func=np.mean,
    source_agg_func=mean_of_rows,
    max_num_samples=1e6,
    verbose=False,
):
    """Wire up the embedding, dataset and class tree; each may be supplied
    either pre-built or as a path (loaded here)."""
    # Print helper that is a no-op unless verbose is set.
    self.vprint = print if verbose else no_op
    self.max_num_samples = max_num_samples

    # Accept ready-made objects or load them from the given paths.
    if isinstance(embedding, Embedding):
        self.embedding = embedding
    else:
        self.embedding = Embedding(embedding_path=embedding, verbose=verbose)

    if isinstance(dataset, EmbeddedDataset):
        self.dataset = dataset
    else:
        self.dataset = EmbeddedDataset(self.embedding,
                                       columns=columns,
                                       dataset_path=dataset,
                                       verbose=verbose)

    if isinstance(tree, EmbeddedClassTree):
        self.tree = tree
    else:
        self.tree = EmbeddedClassTree(self.embedding, tree_path=tree, verbose=verbose)

    # Aggregation callbacks and the similarity cache.
    self.row_agg_func = row_agg_func
    self.source_agg_func = source_agg_func
    self.tree_agg_func = tree_agg_func
    self.similarity_matrices = {}
def _create_embeddings(self):
    """Create one embedding layer per lookup table in ``self.table_sizes``.

    Populates ``self.embedding_layers`` with an ``Embedding`` of dimension
    ``self.local_embedding_dim`` for each table.
    """
    # Comprehension replaces the manual append loop; the original's
    # enumerate index was unused.
    self.embedding_layers = [
        Embedding(input_dim=table_size,
                  output_dim=self.local_embedding_dim,
                  trainable=self.embedding_trainable)
        for table_size in self.table_sizes
    ]
def __init__(self, config, vocab):
    """Build the embedding, per-branch Conv1d stacks, dropouts and projections."""
    super(Net, self).__init__()
    self.embed = Embedding(config, vocab)

    def make_convs(in_channel, kernel_sizes, output_channels):
        # One Conv1d per (kernel size, out-channels) pair; (kz - 1) // 2
        # padding keeps the sequence length for odd kernels.
        convs = []
        for kz, oc in zip(kernel_sizes, output_channels):
            convs.append(nn.Conv1d(in_channels=in_channel,
                                   out_channels=oc,
                                   kernel_size=kz,
                                   padding=(kz - 1) // 2))
        return nn.ModuleList(convs)

    full_size = sum(config.output_channels)

    # Question/candidate branches plus two pair branches over full_size.
    self.convs_QA = make_convs(config.q_seq_len, config.kernel_sizes, config.output_channels)
    self.convs_QR = make_convs(config.q_seq_len, config.kernel_sizes, config.output_channels)
    self.convs_CA = make_convs(config.c_seq_len, config.kernel_sizes, config.output_channels)
    self.convs_CR = make_convs(config.c_seq_len, config.kernel_sizes, config.output_channels)
    self.convs_PQ = make_convs(full_size, config.kernel_sizes, config.output_channels)
    self.convs_PC = make_convs(full_size, config.kernel_sizes, config.output_channels)

    # One dropout module per branch.
    self.drop_QA = nn.Dropout(config.dropout)
    self.drop_QR = nn.Dropout(config.dropout)
    self.drop_CA = nn.Dropout(config.dropout)
    self.drop_CR = nn.Dropout(config.dropout)
    self.drop_PQ = nn.Dropout(config.dropout)
    self.drop_PC = nn.Dropout(config.dropout)

    # Final projection to a scalar score.
    self.proj1 = nn.Linear(full_size, full_size)
    self.proj2 = nn.Linear(full_size, 1)
def __init__(self, logger=Log(print), embedding_file='data/wiki-news-300d-1M.vec',
             bots_file='data/bots_tweets.txt', human_file='data/human_tweets.txt',
             validation_split=0.2, test_split=0.2, batch_size=50, epochs=25,
             additional_feats_enabled=True, custom_callback=None, early_stopping=5,
             dataset_config=DatasetConfig.USER_STATE):
    """Configure the bot-detection trainer: data sources, embedding and
    training hyper-parameters. The model itself is built later."""
    self.logger = logger
    self.custom_callback = custom_callback

    # Dataset and embedding resources.
    self.dataset = DatasetBuilder(logger, dataset_config)
    # dataset_config unpacks to (config value, human-readable name).
    _, self.dataset_config_name = dataset_config
    self.embedding = Embedding(logger, embedding_file)
    self.bots_file = bots_file
    self.human_file = human_file

    # Training hyper-parameters.
    self.model = None  # initialize later
    self.additional_feats_enabled = additional_feats_enabled
    self.batch_size = batch_size
    self.epochs = epochs
    self.early_stopping = early_stopping
    self.validation_split = validation_split
    self.test_split = test_split

    # Containers filled while preparing the data.
    self.x_bot_tweets = []
    self.bot_tweets = []
    self.bot_test_tweets = []
    self.doc_test_tweets = []
    self.labels_test = []
def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner,
             dropout, dropatt, tie_weight=True, d_embed=None, div_val=1,
             tie_projs=None, pre_lnorm=False, tgt_len=None, ext_len=None,
             mem_len=None, cutoffs=None, adapt_inp=False, same_length=False,
             clamp_len=-1, sample_softmax=-1, demographics_len=0):
    """Memory-transformer LM head with BCE loss and an optional
    demographics input to the final linear layer.

    Fix: ``tie_projs`` and ``cutoffs`` previously used mutable default
    arguments (``[False]`` and ``[]``); ``None`` sentinels now give the same
    effective defaults. Several parameters (tie_weight, div_val, tie_projs,
    mem_len, cutoffs, adapt_inp, same_length, sample_softmax) are accepted
    for interface compatibility but unused in this body.
    """
    super(MemTransformerLM, self).__init__()
    # Re-establish the original default values without sharing mutables.
    tie_projs = [False] if tie_projs is None else tie_projs
    cutoffs = [] if cutoffs is None else cutoffs

    self.n_token = n_token
    # Embedding dim falls back to the model dim when not given.
    d_embed = d_model if d_embed is None else d_embed
    self.d_embed = d_embed
    self.d_model = d_model
    self.n_head = n_head
    self.d_head = d_head
    self.word_emb = Embedding(n_token, d_embed)
    self.drop = nn.Dropout(dropout)
    self.n_layer = n_layer
    self.tgt_len = tgt_len
    self.ext_len = ext_len
    # NOTE(review): raises TypeError if tgt_len/ext_len are left as None —
    # callers apparently always pass them; confirm.
    self.max_klen = tgt_len + ext_len
    self.clamp_len = clamp_len

    # Stack of decoder layers.
    self.layers = nn.ModuleList()
    for i in range(n_layer):
        self.layers.append(
            DecoderLayer(n_head, d_model, d_head, d_inner, dropout,
                         dropatt=dropatt, pre_lnorm=pre_lnorm))
    self.pos_emb = PositionalEmbedding(self.d_model)

    self.loss = nn.BCEWithLogitsLoss()
    self.demographics_len = demographics_len
    # Final projection takes embedding plus demographics features.
    self.fc = nn.Linear(self.d_embed + self.demographics_len, self.n_token, bias=True)
    weights_init(self)
def main():
    """Test entry point: takes a patient ID as the first command-line
    argument and prints the length of each loaded session, then runs the
    embedding / preprocessing demo steps."""
    pid = sys.argv[1]
    patient = Patient(pid)

    # Report how many raw samples each loaded session contains.
    if patient.pre_test is not None:
        print("season start: {}".format(len(patient.pre_test.raw)))
    for idx, test in enumerate(patient.intermediate_tests):
        print("concussion {}: {}".format(idx, len(test.raw)))
    print("season end: {}".format(len(patient.post_test.raw)))

    # STFT then PCA embedding of the pre-season examples.
    prep.stft(patient.pre_test)
    examples = patient.pre_test.get_examples()
    emb = Embedding("pca")
    emb.train(examples)
    emb_examples = emb.embed(examples)

    # Alpha-wave extraction and window plotting.
    prep.extractWaves(patient.pre_test, n=4001, samplingRate=256, wave='alpha')
    patient.pre_test.extract_windows()
    patient.pre_test.plot_windows(
        windows=np.arange(10),
        channels=["c3", "cz", "c4", "p3", "pz", "p4"])

    # Drop into the debugger for interactive inspection (kept from original).
    import pdb
    pdb.set_trace()
def __init__(self, mode=None, tuner=None, model_name="lgb") -> None:
    """Set up stop words, embedding, label maps and the split data frames.

    Args:
        mode: 'train' or 'predict'; defaults to 'train' when falsy.
        tuner: optional hyper-parameter tuner, 'bayes' or 'grid'.
        model_name: identifier of the underlying model (default "lgb").
    """
    # Load stop words with a context manager so the file handle is closed
    # (the original leaked it).
    with open(config.stopwords, encoding='utf-8', mode='r') as f:
        self.stopWords = [x.strip() for x in f.readlines()]

    self.embedding = Embedding()
    self.embedding.load()

    # label <-> index mappings (this handle was leaked too).
    with open(config.label2id_file, encoding='utf-8') as f:
        self.labelToIndex = json.load(f)
    self.ix2label = {v: k for k, v in self.labelToIndex.items()}

    self.mode = mode
    if not self.mode:
        self.mode = 'train'
    self.tuner = tuner
    # NOTE: asserts kept for interface compatibility; they vanish under
    # `python -O` — consider raising ValueError instead.
    assert self.mode in ['train', 'predict']
    if self.tuner:
        assert self.tuner in ['bayes', 'grid']

    # Load only the splits the current mode needs.
    if self.mode == "train":
        self.train_data = pd.read_csv(
            config.train_data_file, sep='\t').dropna().reset_index(drop=True)
        self.dev_data = pd.read_csv(
            config.eval_data_file, sep='\t').dropna().reset_index(drop=True)
    else:
        self.test_data = pd.read_csv(
            config.test_data_file, sep='\t').dropna().reset_index(drop=True)

    # Columns excluded from the feature matrix.
    self.exclusive_col = ['text', 'lda', 'bow', 'label']
    self.model = None
    self.model_name = model_name
def __init__(self, vocab_sizes, embedding_dims, merge_methods, padding_indices,
             fix_embedding, out_method='none', out_dim=None):
    """Multi-feature embedding: one table per feature, with an optional
    linear or MLP output head over the concatenated ('cat') features."""
    super(MultiFeatureEmbedding, self).__init__()
    self._vocab_sizes = vocab_sizes
    self._embedding_dims = embedding_dims
    self._n_feature = len(vocab_sizes)
    self._merge_methods = merge_methods
    self._padding_indices = padding_indices
    self._fix_embedding = fix_embedding

    # One embedding table per feature.
    self.emb_list = nn.ModuleList(
        Embedding(size, dim, pad_idx, fix_embedding)
        for size, dim, pad_idx in zip(vocab_sizes, embedding_dims, padding_indices))

    self._out_method = out_method
    # Raw output dim: only features merged via concatenation contribute.
    cat_dim = 0
    for feature_index, dim in enumerate(embedding_dims):
        if merge_methods[feature_index] == 'cat':
            cat_dim += dim
    self._emb_out_dim = cat_dim

    # Optional output head.
    if out_method == 'none':
        self._out_dim = self._emb_out_dim
    elif out_method == 'linear':
        self._out_dim = out_dim
        self.out_module = nn.Linear(self._emb_out_dim, self._out_dim)
    else:
        self._out_dim = out_dim
        self.out_module = MLP(self._emb_out_dim,
                              [int(self._emb_out_dim / 2), self._out_dim],
                              ['prelu', 'prelu'])
def main(model_num=1):
    """Train CustomModel on the 3-message emotion data and predict on dev."""
    # Hyper-parameters.
    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200  # kept from the original (unused in this function)

    # Load the raw data and carve out a validation split.
    preprocess = Preprocess()
    texts_train, labels_train = preprocess.preprocessData('../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt', mode="train")
    X_train, X_val, y_train, y_val = train_test_split(texts_train, labels_train,
                                                      test_size=0.2, random_state=42)
    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))

    # Pretrained embedding matrix and its tokenizer.
    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    # Each conversation yields three padded index sequences.
    first_train, second_train, third_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    first_val, second_val, third_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    first_dev, second_dev, third_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    # Build, train and run the model.
    model = CustomModel(model_num)
    model.build(embeddings, MAX_SEQUENCE_LENGTH, LSTM_DIM, HIDDEN_LAYER_DIM,
                NUM_CLASSES, noise=GAUSSIAN_NOISE, dropout_lstm=DROPOUT_LSTM,
                dropout=DROPOUT)
    model.summary()
    history = model.train(first_train, second_train, third_train,
                          labels_categorical_train,
                          first_val, second_val, third_val,
                          labels_categorical_val)
    y_pred = model.predict([first_dev, second_dev, third_dev])
def main(model_dir):
    """Smoke-test an Embedding: nearest neighbours, similarity and analogy."""
    # Fix both RNGs for reproducible output.
    np.random.seed(0)
    torch.manual_seed(0)
    emb = Embedding(model_dir)
    print(emb.nearest_k('god'))
    print(emb.similarity('god', 'wickedly'))
    print(emb.analogy('god', 'love', 'satan', k=5))
    print()
def __init__(self):
    """Instantiate every GeoWINE component: image cropper, geolocation
    estimator, embedding model, entity retriever, and the news/events
    API clients.

    NOTE(review): constructing these presumably loads models / opens
    connections, which is why success is announced afterwards — confirm.
    """
    self.img_cropper = ImageCropper()
    self.geolocation_model = GeolocationEstimator()
    self.embedding_model = Embedding()
    self.entity_retriever = EntityRetriever()
    self.news_api = NewsArticlesApi()
    self.events_api = OekgEventsApi()
    print('Loaded GeoWINE successfully.')
def main():
    """Parse CLI arguments, load an embedding, and run the synonym analysis.

    Exits with status 1 if ``--override-delta`` names a model for which no
    delta function is implemented.
    """
    # parse arguments
    parser = ArgumentParser()
    parser.add_argument("MODEL_TYPE", type=str, choices=Embedding.MODELS.keys(),
                        help="Model type of INPUT_EMBEDDING")
    parser.add_argument("INPUT_BENCHMARK", type=str,
                        help="Benchmark directory of INPUT")
    parser.add_argument("INPUT_EMBEDDING", type=str,
                        help="Embedding directory of INPUT")
    parser.add_argument("OUTPUT_DIR", type=str,
                        help="Directory where to output the whole synonym analysis")
    parser.add_argument("-d", "--dimension", type=int, default=100,
                        help="Hidden dimensions of INPUT_EMBEDDING (Default: 100)")
    parser.add_argument("-g", "--ground-truth-available", action="store_true", default=False,
                        help="An optional boolean flag indicating wether INPUT_BENCHMARK contains ground-truth"
                             " synonyms file 'synonyms_id.txt' which should be evaluated (Default: False).")
    parser.add_argument("-o", "--override-delta", type=str, choices=Embedding.MODELS.keys(),
                        required=False, default=None,
                        help="An optional func_delta_r override (MUST BE COMPATIBLE!) (Default: None).")
    args = parser.parse_args()

    # Dispatch table replaces the original nine-branch if/elif chain.
    delta_funcs = {
        "rescal": get_rescal_delta_r,
        "transe": get_transe_delta_r,
        "transh": get_transh_delta_r,
        "transr": get_transr_delta_r,
        "transd": get_transd_delta_r,
        "distmult": get_distmult_delta_r,
        "hole": get_hole_delta_r,
        "complex": get_complex_delta_r,
        "analogy": get_analogy_delta_r,
    }

    # check delta override
    func_delta_r = None
    if args.override_delta:
        func_delta_r = delta_funcs.get(args.override_delta)
        if func_delta_r is None:  # not implemented
            print("func_delta_r not implemented for model " + args.override_delta + ".")
            sys.exit(1)

    # import embedding
    embedding = Embedding(args.INPUT_BENCHMARK,
                          args.INPUT_EMBEDDING,
                          Embedding.MODELS[args.MODEL_TYPE],
                          args.dimension)

    # analyse embedding; pass the ground-truth synonyms file only when available
    fn_synonyms_id = os.path.join(args.INPUT_BENCHMARK, "synonyms_id.txt") \
        if args.ground_truth_available else None
    analyse_embedding(embedding, args.OUTPUT_DIR, fn_synonyms_id, func_delta_r)
def _save_pickle(obj, path, label):
    # Pickle *obj* to *path*, logging what is being saved.
    logging.info('Saving {} to {}'.format(label, path))
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def main(args):
    """Collect vocabulary, build the embedding, then preprocess and pickle
    the train/valid/test splits described by ``<dest_dir>/config.json``.

    Args:
        args: namespace with ``dest_dir`` (output directory containing
            config.json) and ``n_workers`` (parallelism for preprocessing).
    """
    config_path = os.path.join(args.dest_dir, 'config.json')
    logging.info('loading configuration from {}'.format(config_path))
    with open(config_path) as f:
        config = json.load(f)

    preprocessor = Preprocessor(None)

    # Collect words appearing in the data (same valid/train/test order as
    # the original, which repeated this block three times).
    words = set()
    for key in ('valid_json_path', 'train_json_path', 'test_json_path'):
        logging.info('collecting words from {}'.format(config[key]))
        words |= preprocessor.collect_words(config[key], n_workers=args.n_workers)

    logging.info('loading embedding from {}'.format(config['embedding_vec_path']))
    embedding = Embedding(config['embedding_vec_path'], words)
    _save_pickle(embedding, os.path.join(args.dest_dir, 'embedding.pkl'), 'embedding')

    # update embedding used by preprocessor
    preprocessor.embedding = embedding

    # Process and persist each split (original order: train, valid, test).
    for split in ('train', 'valid', 'test'):
        logging.info('Processing {} from {}'.format(split, config[split + '_json_path']))
        dataset = preprocessor.get_dataset(config[split + '_json_path'],
                                           args.n_workers,
                                           config[split + '_dataset'])
        _save_pickle(dataset, os.path.join(args.dest_dir, split + '.pkl'), split)
def dimension_fnn(x):
    """Estimate embedding dimension via false nearest neighbours and plot."""
    emb = Embedding(x)
    dims = np.arange(1, 21)  # candidate dimensions 1..20
    f1, f2, f3 = emb.fnn(x, tau=14, dim=dims, window=10, metric='cityblock')
    emb.plot_fnn(dims, f1, f2, f3)
def forward(self, xs):
    """Embed each of the T time steps of xs (N, T) into an (N, T, D) array,
    keeping one Embedding layer per step in self.layers."""
    batch_size, seq_len = xs.shape
    _, embed_dim = self.W.shape
    out = np.empty((batch_size, seq_len, embed_dim), dtype='f')
    self.layers = []
    for step in range(seq_len):
        layer = Embedding(self.W)
        # Column `step` of xs holds the token ids for this time step.
        out[:, step, :] = layer.forward(xs[:, step])
        self.layers.append(layer)
    return out
def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner,
             dropout, dropatt, d_embed=None, pre_lnorm=False,
             tgt_len=None, ext_len=None, clamp_len=-1):
    """Memory-transformer LM head with a BCE loss and a plain linear
    output layer over the embedding dimension."""
    super(MemTransformerLM, self).__init__()
    # Embedding dim falls back to the model dim when not given.
    if d_embed is None:
        d_embed = d_model
    self.n_token = n_token
    self.d_embed = d_embed
    self.d_model = d_model
    self.n_head = n_head
    self.d_head = d_head
    self.n_layer = n_layer

    self.word_emb = Embedding(n_token, d_embed)
    self.drop = nn.Dropout(dropout)

    # Context-length bookkeeping.
    self.tgt_len = tgt_len
    self.ext_len = ext_len
    self.max_klen = tgt_len + ext_len
    self.clamp_len = clamp_len

    # Stack of identical decoder layers.
    self.layers = nn.ModuleList(
        DecoderLayer(n_head, d_model, d_head, d_inner, dropout,
                     dropatt=dropatt, pre_lnorm=pre_lnorm)
        for _ in range(n_layer))
    self.pos_emb = PositionalEmbedding(self.d_model)

    self.loss = nn.BCEWithLogitsLoss()
    self.fc = nn.Linear(self.d_embed, self.n_token, bias=True)
    weights_init(self)
def _preprocess(self):
    """Clean rows, build the vocab, tokenize questions and embed them.

    Populates ``qs_processed`` (space-joined jieba tokens) and ``qs_embed``
    (sentence embeddings) columns on ``self.data``.
    """
    self.data.dropna(inplace=True)
    # Cast qid once (the original performed this exact cast twice).
    self.data['qid'] = self.data['qid'].astype(int)
    self._gen_vocab()
    # Whitespace-join jieba tokens so downstream code can re-split them.
    self.data['qs_processed'] = self.data['question'].apply(
        lambda x: ' '.join(jieba.cut(x)))
    self.embedding = Embedding(self.config['vocab_path'],
                               self.config['w2v_path'])
    self.data['qs_embed'] = self.data['qs_processed'].apply(
        lambda x: self.embedding.sentence_embedding(x.split()))
def get_embedding_matrix_and_vectorizer(conversations):
    """Fit a vectorizer on *conversations*, persist its tokenizer, and build
    the matching pretrained embedding matrix.

    Returns:
        Tuple ``(embedding_matrix, vectorizer)``.
    """
    from vectorize import Vectorize
    from embedding import Embedding

    vectorizer = Vectorize(conversations, MAX_VOCAB_SIZE)
    vectorizer.save_tokenizer(TOKENIZER_PATH)
    # The embedding matrix rows follow the vectorizer's word -> index map.
    embedding_matrix = Embedding(vectorizer.word2idx).get_embedding_matrix()
    return embedding_matrix, vectorizer
def apply_embedding_transformation(embeddings, positive_seeds, negative_seeds, n_epochs=5,
                                   n_dim=10, force_orthogonal=False, plot=False,
                                   plot_points=50, plot_seeds=False, **kwargs):
    """Learn a linear transformation of *embeddings* that separates the
    positive from the negative seed words, and return the transformed
    embedding truncated to *n_dim* dimensions.

    Args:
        embeddings: embedding object with matrix ``m`` and index ``iw``.
        positive_seeds / negative_seeds: seed word lists supervising the fit.
        n_epochs: number of passes over the minibatch iterator.
        n_dim: number of output dimensions kept after the transformation.
        force_orthogonal: re-orthogonalize the learned matrix after every batch.
        plot: if True and n_dim == 2, scatter-plot sampled words colored by lexicon.
        plot_points: number of words sampled for the plot.
        plot_seeds: plot the seed words themselves instead of non-seed words.
        **kwargs: forwarded to both the minibatch iterator and get_model.
    """
    print("Preparing to learn embedding tranformation")
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds, negative_seeds, **kwargs)
    # Model weights are [Q (transform matrix), b (bias)].
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)
    print("Learning embedding transformation")
    # prog = util.Progbar(n_epochs)
    for epoch in range(n_epochs):
        dataset.shuffle()
        loss = 0
        for i, X in enumerate(dataset):
            # Accumulate loss weighted by batch size.
            loss += model.train_on_batch(X)[0] * X['y'].size
            # After every batch: optionally re-orthogonalize Q, and zero the
            # bias so the learned map stays a pure linear transformation.
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            model.set_weights([Q, np.zeros_like(b)])
        # prog.update(epoch + 1, exact_values=[('loss', loss / dataset.y.size)])
    # Project the full embedding matrix and keep the first n_dim columns.
    Q, b = model.get_weights()
    new_mat = embeddings.m.dot(Q)[:, 0:n_dim]
    #print "Orthogonality rmse", np.mean(np.sqrt(
    #    np.square(np.dot(Q, Q.T) - np.identity(Q.shape[0]))))
    if plot and n_dim == 2:
        # Sample words to display: either the seeds themselves, or only
        # non-seed words, depending on plot_seeds.
        plot_words = positive_seeds + negative_seeds if plot_seeds else \
            [w for w in embeddings if w not in positive_seeds and w not in negative_seeds]
        plot_words = set(random.sample(plot_words, plot_points))
        to_plot = {w: embeddings[w] for w in embeddings if w in plot_words}
        # Color each word by its lexicon polarity (green = positive).
        lexicon = lexicons.load_lexicon()
        plt.figure(figsize=(10, 10))
        for w, e in to_plot.items():
            plt.text(e[0], e[1], w,
                     bbox=dict(facecolor='green' if lexicon[w] == 1 else 'red', alpha=0.1))
        xmin, ymin = np.min(np.vstack(list(to_plot.values())), axis=0)
        xmax, ymax = np.max(np.vstack(list(to_plot.values())), axis=0)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        plt.show()
    # Skip normalization in the 1-D case (sign carries the sentiment score).
    return Embedding(new_mat, embeddings.iw, normalize=n_dim != 1)
def __init__(self, char_vocab_size, word_vocab_size, num_tags):
    """Char+word embedding -> (bi)directional RNN -> linear tag scorer."""
    super(RNN, self).__init__()
    # architecture
    self.embed = Embedding(char_vocab_size, word_vocab_size, EMBED_SIZE)
    # RNN_TYPE selects the recurrent class (e.g. nn.LSTM / nn.GRU) by name;
    # hidden size is split across directions.
    rnn_cls = getattr(nn, RNN_TYPE)
    self.rnn = rnn_cls(
        input_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE // NUM_DIRS,
        num_layers=NUM_LAYERS,
        bias=True,
        batch_first=True,
        dropout=DROPOUT,
        bidirectional=NUM_DIRS == 2,
    )
    # Project RNN output onto the tag space.
    self.out = nn.Linear(HIDDEN_SIZE, num_tags)
def check_embedding_quality(conversations):
    """Report which vocabulary words are missing from the pretrained embedding.

    Returns:
        The out-of-vocabulary words found by ``check_coverage``.
    """
    from vectorize import Vectorize
    from embedding import Embedding

    vectorizer = Vectorize(conversations, MAX_VOCAB_SIZE)
    embed = Embedding(vectorizer.word2idx)
    # Compare the corpus vocabulary against the embedding vocabulary.
    oov_words = embed.check_coverage(vectorizer.word_counts,
                                     embed.get_embedding_vocab())
    print('Collected oov words.')
    return oov_words
def raw_txt_to_embedding(embedding_file, content):
    """Tokenize raw text with the spaCy pipeline (using the project tokenizer)
    and print the per-token embeddings."""
    # spaCy pipeline with PlangTokenizer plugged in as the doc factory.
    nlp = spacy.load('en', create_make_doc=PlangTokenizer)
    embedding = Embedding(embedding_file)
    # Push the text through the pipeline, then map tokens to vectors.
    tokenized = nlp(content)
    content_embedding = embedding.words_to_embeddings(tokenized)
    # TODO: pipe this to a keras model
    print(content_embedding)