class BPETokenizer(Tokenizer):
    """
    BPE (Byte-Pair Encoding) Tokenizer

    text -> ...

    * Args:
        name: tokenizer name [roberta]
    """

    def __init__(self, name, config={}):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.bpe_tokenizer = None

    """ Tokenizers """

    def _roberta(self, text, unit="text"):
        """
        ex)
        """
        if self.bpe_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"], return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path, **self.config)

        return self.bpe_tokenizer._tokenize(text)
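# Hedged usage sketch (not from the project docs): the config keys and file names below
# are placeholders, and it is assumed that the base Tokenizer class routes `tokenize()`
# to `_roberta` when name == "roberta".
#
# bpe_tokenizer = BPETokenizer(
#     "roberta",
#     config={"vocab_path": "vocab.json", "merges_path": "merges.txt"},
# )
# tokens = bpe_tokenizer.tokenize("Byte-pair encoding splits rare words into subwords.")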
def __init__(
    self,
    vocab,
    options_file=DEFAULT_OPTIONS_FILE,
    weight_file=DEFAULT_WEIGHT_FILE,
    do_layer_norm=False,
    dropout=0.5,
    trainable=False,
    project_dim=None,
):
    super(ELMoEmbedding, self).__init__(vocab)
    data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
    option_path = data_handler.read(options_file, return_path=True)
    weight_path = data_handler.read(weight_file, return_path=True)

    self.elmo = Elmo(option_path, weight_path, 1, requires_grad=trainable, dropout=dropout)

    self.project_dim = project_dim
    self.project_linear = None
    if project_dim:
        self.project_linear = nn.Linear(self.elmo.get_output_dim(), project_dim)
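# Minimal sketch of the optional projection step above: when `project_dim` is set, the
# ELMo output is mapped down with a plain nn.Linear. The 1024 hidden size is an
# assumption used only for illustration, not a value taken from this codebase.
import torch
import torch.nn as nn

elmo_output_dim = 1024                               # assumed ELMo representation size
project_dim = 256
project_linear = nn.Linear(elmo_output_dim, project_dim)

elmo_output = torch.randn(2, 7, elmo_output_dim)     # (batch, seq_len, dim)
projected = project_linear(elmo_output)              # -> (2, 7, 256)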
def __init__(
    self, word_embedding, pretrained_path=None, requires_grad=False, residual_embeddings=False
):
    """Initialize an MTLSTM.

    Arguments:
        word_embedding: module used to embed the input tokens
        pretrained_path (str): If not None, load the MTLSTM weights from this checkpoint
        requires_grad (bool): If True, fine-tune the MTLSTM weights
        residual_embeddings (bool): If True, concatenate the input embeddings
            with MTLSTM outputs during forward
    """
    super(MTLSTM, self).__init__()
    self.word_embedding = word_embedding
    self.rnn = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)

    data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
    cove_weight_path = data_handler.read(pretrained_path, return_path=True)
    if torch.cuda.is_available():
        checkpoint = torch.load(cove_weight_path)
    else:
        checkpoint = torch.load(cove_weight_path, map_location="cpu")
    self.rnn.load_state_dict(checkpoint)

    self.residual_embeddings = residual_embeddings
    self.requires_grad = requires_grad
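# Standalone sketch of the CoVe-style weight loading above: a 2-layer BiLSTM whose state
# dict is loaded with a CPU fallback when CUDA is unavailable. The checkpoint path is a
# placeholder, not a file shipped with the project.
import torch
import torch.nn as nn

def load_cove_lstm(weight_path):
    rnn = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)
    if torch.cuda.is_available():
        checkpoint = torch.load(weight_path)
    else:
        checkpoint = torch.load(weight_path, map_location="cpu")
    rnn.load_state_dict(checkpoint)
    return rnn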
def __init__(self, file_paths, dataset_obj):
    self.file_paths = file_paths
    self.dataset_obj = dataset_obj
    self.data_handler = DataHandler(cache_path=CachePath.DATASET)

    # for Concrete DataReader
    self.text_columns = None
def build_with_pretrained_file(self, token_counter):
    data_handler = DataHandler(CachePath.VOCAB)
    vocab_texts = data_handler.read(self.pretrained_path)

    if self.pretrained_path.endswith(".txt"):
        predefine_vocab = vocab_texts.split("\n")
    elif self.pretrained_path.endswith(".json"):
        vocab_texts = json.loads(vocab_texts)  # {token: id}
        predefine_vocab = [
            item[0] for item in sorted(vocab_texts.items(), key=lambda x: x[1])
        ]
    else:
        raise ValueError("Unsupported vocab file extension. Use .txt or .json")

    self.build(token_counter, predefine_vocab=predefine_vocab)
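# Illustration of the .json branch above: a {token: id} mapping is turned into a token
# list ordered by id, which is what `predefine_vocab` ends up holding. The vocab content
# is a toy example.
import json

vocab_json = '{"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3}'
token_to_id = json.loads(vocab_json)
predefine_vocab = [token for token, _ in sorted(token_to_id.items(), key=lambda x: x[1])]
# -> ["[PAD]", "[UNK]", "hello", "world"]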
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)

            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]")
            )

        tokens = []
        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
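# Minimal greedy longest-match sketch of what the wordpiece step does to a single word
# token. The vocab is a toy set; this is not the project's WordpieceTokenizer or its
# [UNK]/vocab handling, only an illustration of the docstring example above.
def toy_wordpiece(word, vocab, unk_token="[UNK]"):
    pieces, start = [], 0
    while start < len(word):
        end = len(word)
        cur_piece = None
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece
            if piece in vocab:
                cur_piece = piece
                break
            end -= 1
        if cur_piece is None:
            return [unk_token]
        pieces.append(cur_piece)
        start = end
    return pieces

toy_vocab = {"He", "##llo", "Wo", "##rld"}
print(toy_wordpiece("Hello", toy_vocab))  # ['He', '##llo']
print(toy_wordpiece("World", toy_vocab))  # ['Wo', '##rld']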
class TextHandler:
    """
    Text Handler

    - vocab and token_counter
    - raw_features -> indexed_features
    - raw_features -> tensor

    * Args:
        token_makers: Dictionary consisting of
            - key: token_name
            - value: TokenMaker (claf.tokens.token_maker)

    * Kwargs:
        lazy_indexing: Apply `Lazy Evaluation` to text indexing
    """

    def __init__(self, token_makers, lazy_indexing=True):
        self.token_makers = token_makers
        self.lazy_indexing = lazy_indexing
        self.data_handler = DataHandler(cache_path=CachePath.TOKEN_COUNTER)

    def build_vocabs(self, token_counters):
        logger.info("Start build vocab")
        vocab_start_time = time.time()

        vocabs = {}
        for token_name, token_maker in self.token_makers.items():
            is_defined_config = type(token_maker.vocab_config) == dict
            if is_defined_config:
                token_counter = token_counters[token_name]
                vocab = self._build_vocab_with_config(token_name, token_maker, token_counter)
            else:
                vocab = Vocab(token_name)
                vocab.init()

            vocabs[token_name] = vocab
            logger.info(
                f" => {token_name} vocab size: {len(vocab)} (use predefine vocab: {vocab.pretrained_path is not None})"
            )

        vocab_elapsed_time = time.time() - vocab_start_time
        logger.info(f"Complete build vocab... elapsed_time: {vocab_elapsed_time}\n")

        # Setting Indexer (vocab)
        for token_name, token_maker in self.token_makers.items():
            token_maker.set_vocab(vocabs[token_name])
        return vocabs

    def _build_vocab_with_config(self, token_name, token_maker, token_counter):
        token_maker.vocab_config["token_name"] = token_name
        vocab = Vocab(**token_maker.vocab_config)
        if vocab.pretrained_path is not None:
            vocab.build_with_pretrained_file(token_counter)
        else:
            vocab.build(token_counter)
        return vocab

    def is_all_vocab_use_pretrained(self):
        for token_name, token_maker in self.token_makers.items():
            if token_maker.vocab_config.get("pretrained_path", None) is None:
                return False
            if token_maker.vocab_config.get("pretrained_token", "") != Vocab.PRETRAINED_ALL:
                return False
        return True

    def make_token_counters(self, texts, config=None):
        token_counters = {}
        for token_name, token_maker in self.token_makers.items():
            token_vocab_config = token_maker.vocab_config
            if type(token_vocab_config) == dict:
                if token_vocab_config.get("pretrained_token", None) == Vocab.PRETRAINED_ALL:
                    # do not use token_counter from dataset -> make empty token_counter
                    texts = [""]

            token_counter = self._make_token_counter(
                texts, token_maker.tokenizer, config=config, desc=f"{token_name}-vocab"
            )

            logger.info(f" * {token_name} token_counter size: {len(token_counter)}")
            token_counters[token_name] = token_counter
        return token_counters

    def _make_token_counter(self, texts, tokenizer, config=None, desc=None):
        tokenizer_name = tokenizer.name

        cache_token_counter = None
        if config is not None:
            data_reader_config = config.data_reader
            cache_token_counter = self.data_handler.cache_token_counter(
                data_reader_config, tokenizer_name
            )

        if cache_token_counter:
            return cache_token_counter
        else:
            tokens = [
                token for text in tqdm(texts, desc=desc) for token in tokenizer.tokenize(text)
            ]
            flatten_list = list(common_utils.flatten(tokens))
            token_counter = Counter(flatten_list)

            if config is not None:  # Cache TokenCounter
                self.data_handler.cache_token_counter(
                    data_reader_config, tokenizer_name, obj=token_counter
                )
            return token_counter

    def index(self, datas, text_columns):
        logger.info(f"Start token indexing, Lazy: {self.lazy_indexing}")
        indexing_start_time = time.time()

        for data_type, data in datas.items():
            if type(data) == list:  # Multi-Data Indexing
                for d in data:
                    self._index_features(
                        d.features, text_columns, desc=f"indexing features ({data_type})"
                    )
            else:
                self._index_features(
                    data.features, text_columns, desc=f"indexing features ({data_type})"
                )

        indexing_elapsed_time = time.time() - indexing_start_time
        logger.info(f"Complete token indexing... elapsed_time: {indexing_elapsed_time} \n")

    def _index_features(self, features, text_columns, desc=None, suppress_tqdm=False):
        for feature in tqdm(features, desc=desc, disable=suppress_tqdm):
            for key, text in feature.items():
                if key not in text_columns:
                    continue

                # Set data_type (text => {"text": ..., "token1": ..., ...})
                if type(feature[key]) != dict:
                    feature[key] = {"text": text}
                if type(text) == dict:
                    text = text["text"]

                for token_name, token_maker in self.token_makers.items():
                    param_key = token_maker.indexer.param_key
                    if param_key == key:
                        continue
                    feature[key][token_name] = self._index_token(token_maker, text, feature)

    def _index_token(self, token_maker, text, data):
        def index():
            indexer = token_maker.indexer
            params = {}
            if token_maker.type_name == TokenMaker.EXACT_MATCH_TYPE:
                param_text = data[indexer.param_key]
                if type(param_text) == dict:
                    param_text = param_text["text"]
                params["query_text"] = param_text
            return indexer.index(text, **params)

        if self.lazy_indexing:
            return index
        else:
            return index()

    def raw_to_tensor_fn(self, data_reader, cuda_device=None, helper={}):
        def raw_to_tensor(inputs):
            is_one = True  # batch_size 1 flag

            feature, _helper = data_reader.read_one_example(inputs)

            nonlocal helper
            helper.update(_helper)

            if type(feature) == list:
                is_one = False
                features = feature
            else:
                features = [feature]

            self._index_features(features, data_reader.text_columns, suppress_tqdm=True)

            if is_one:
                indexed_features = features[0]
            else:
                # when features > 1, need to transpose (dict_of_list -> list_of_dict)
                indexed_features = {}
                for key in features[0]:
                    feature_with_key = [feature[key] for feature in features]
                    indexed_features[key] = transpose(feature_with_key, skip_keys=["text"])

            for key in indexed_features:
                for token_name in self.token_makers:
                    if token_name not in indexed_features[key]:
                        continue

                    indexed_values = indexed_features[key][token_name]
                    if is_one:
                        indexed_values = [indexed_values]

                    tensor = padding_tokens(indexed_values, token_name=token_name)
                    if cuda_device is not None and type(tensor) != list:
                        tensor = tensor.cuda(cuda_device)
                    indexed_features[key][token_name] = tensor

            for key in indexed_features:
                if "text" in indexed_features[key]:
                    del indexed_features[key]["text"]

            return indexed_features, helper

        return raw_to_tensor
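# Standalone sketch of the counting step inside `_make_token_counter`: tokenize every
# text, flatten, and count with collections.Counter. Whitespace split stands in for the
# project's tokenizer; the DataHandler caching is omitted.
from collections import Counter

texts = ["the quick brown fox", "the lazy dog"]
tokens = [token for text in texts for token in text.split()]
token_counter = Counter(tokens)
# Counter({'the': 2, 'quick': 1, 'brown': 1, 'fox': 1, 'lazy': 1, 'dog': 1})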
def __init__(self, config):
    super(NLU, self).__init__(config)

    self.data_handler = DataHandler(CachePath.MACHINE / "nlu")
    self.load()
def build_with_pretrained_file(self, token_counter):
    data_handler = DataHandler(CachePath.VOCAB)
    vocab_texts = data_handler.read(self.pretrained_path)
    predefine_vocab = vocab_texts.split("\n")

    self.build(token_counter, predefine_vocab=predefine_vocab)
class MRCEnsemble(Machine):
    """
    Machine Reading Comprehension Ensemble

    * Args:
        config: machine_config
    """

    def __init__(self, config):
        super(MRCEnsemble, self).__init__(config)

        self.data_handler = DataHandler(CachePath.MACHINE / "mrc_ensemble")
        self.load()

    @overrides
    def load(self):
        mrc_config = self.config.reading_comprehension

        # Model 1 - BERT-Kor
        self.rc_experiment1 = self.make_module(mrc_config.model_1)
        print("BERT-Kor ready ..! \n")

        # # Model 2 - BERT-Multilingual
        # self.rc_experiment2 = self.make_module(mrc_config.model_2)
        # print("BERT-Multilingual ready ..! \n")

        # # Model 3 - DocQA
        # self.rc_experiment3 = self.make_module(mrc_config.model_3)
        # print("DocQA ready ..! \n")

        # # Model 4 - DrQA
        # self.rc_experiment4 = self.make_module(mrc_config.model_4)
        # print("DrQA ready ..! \n")

        print("All ready ..! \n")

    def evaluate(self, file_path, output_path):
        # KorQuAD dataset...

        # def get_answer_after_clustering(predictions):
        #     categories = {}
        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)
        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }
        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)
        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] += l2["score"]
        #
        #     # # count items then score * 1.n
        #     # for k, v in categories.items():
        #     #     ratio = 1 + (len(v["items"]) / 10)
        #     #     v["score"] *= ratio
        #
        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def get_answer_after_clustering_marginal(predictions):
        #     categories = {}
        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)
        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }
        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)
        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] *= l2["score"]
        #             else:
        #                 categories[l1_text]["score"] *= 0.01  # Default value
        #
        #     # count items then score * 1.n
        #     for k, v in categories.items():
        #         ratio = 1 + (len(v["items"]) / 10)
        #         v["score"] *= ratio
        #
        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def post_processing(text):
        #     # detach josa
        #     # josas = ['은', '는', '이', '가', '을', '를', '과', '와', '이다', '다', '으로', '로', '의', '에']
        #     josas = ["는", "를", "이다", "으로", "에", "이라고", "라고", "와의", "인데"]
        #     for josa in josas:
        #         if text.endswith(josa):
        #             text = text[:-len(josa)]
        #             break
        #
        #     # temperature
        #     if text.endswith("°"):
        #         text += "C"
        #
        #     # etc
        #     special_cases = ["(", ",", "였", "."]
        #     for s in special_cases:
        #         if text.endswith(s):
        #             text = text[:-len(s)]
        #     return text

        def _clean_text(text):
            # https://github.com/allenai/document-qa/blob/2f9fa6878b60ed8a8a31bcf03f802cde292fe48b/docqa/data_processing/text_utils.py#L124
            # be consistent with quotes, and replace \u2014 and \u2212 which I have seen being mapped to UNK
            # by glove word vecs
            return (
                text.replace("''", '"')
                .replace("``", '"')
                .replace("\u2212", "-")
                .replace("\u2014", "\u2013")
            )

        predictions = {}
        topk_predictions = {}

        print("Read input_data...")
        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        wrong_count = 0

        print("Start predict 1-examples...")
        for article in tqdm(squad):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    id_ = qa["id"]

                    # Marginal probabilities...
                    # prediction = self.get_predict_with_marginal(context, question)
                    prediction = self.get_predict(context, question)
                    # print("prediction count:", len(prediction))

                    topk_predictions[id_] = prediction
                    predictions[id_] = prediction[0]["text"]

                    # answer_texts = [q["text"] for q in qa["answers"]]

                    # # 1. Highest value
                    # sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    # prediction_text = sorted_prediction[0]["text"]

                    # 2. Cluster by text
                    # prediction_text = get_answer_after_clustering_marginal(prediction)
                    # prediction_text = post_processing(prediction_text)
                    # predictions[id_] = prediction_text

                    # if prediction_text not in answer_texts:
                    #     pred_f1_score = metric_max_over_ground_truths(f1_score, prediction_text, answer_texts)
                    #     if pred_f1_score <= 0.5:
                    #         sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    #         print("predict:", json.dumps(sorted_prediction[:5], indent=4, ensure_ascii=False))
                    #         print("predict_text:", prediction_text)
                    #         print("answers:", qa["answers"], "f1:", pred_f1_score)
                    #         print("-"*50)
                    #         wrong_count += 1

                    # is_answer = False
                    # for pred in prediction:
                    #     if pred["text"] in answer_texts:
                    #         predictions[id_] = pred["text"]
                    #         is_answer = True
                    #         break

                    # if not is_answer:
                    #     prediction_text = sorted(prediction, key=lambda x: x["score"], reverse=True)[0]["text"]
                    #     predictions[id_] = prediction_text
                    #     print("predict:", prediction)
                    #     print("predict_text:", prediction_text)
                    #     print("answers:", qa["answers"])
                    #     print("-"*50)
                    #     wrong_count += 1

        print("total_count:", len(predictions), "wrong_count:", wrong_count)
        print("Completed...!")

        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(topk_predictions, indent=4) + "\n")

        # Evaluate
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json
            if "data" in dataset:
                dataset = dataset["data"]

        # with open(output_path) as prediction_file:
        #     predictions = json.load(prediction_file)

        results = evaluate(dataset, predictions)
        print(json.dumps(results))

    def get_predict(self, context, question):
        raw_feature = {"context": context, "question": question}
        # print(raw_feature)

        # Approach 1. Max Prob
        models = [
            (self.rc_experiment1, 0.94),
            # (self.rc_experiment2, 0.90)
            # (self.rc_experiment3, 0.85),
            # (self.rc_experiment4, 0.84),
        ]
        # models = [self.rc_experiment3, self.rc_experiment4]

        model = models[0][0]
        return sorted(model.predict(raw_feature), key=lambda x: x["score"], reverse=True)
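# Sketch of the input structure `evaluate()` walks over (SQuAD/KorQuAD style). The values
# are toy placeholders; only the nesting ("data" -> "paragraphs" -> "qas") matters.
toy_squad = {
    "data": [
        {
            "paragraphs": [
                {
                    "context": "CLaF is an open-source framework.",
                    "qas": [
                        {"id": "q-001", "question": "What is CLaF?"}
                    ],
                }
            ]
        }
    ]
}

for article in toy_squad["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            print(qa["id"], qa["question"])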
class OpenQA(Machine):
    """
    Open-Domain Question Answering Machine (DrQA)

    DrQA is a system for reading comprehension applied to open-domain question answering.

    * Args:
        config: machine_config
    """

    def __init__(self, config):
        super(OpenQA, self).__init__(config)

        self.data_handler = DataHandler(CachePath.MACHINE / "open_qa")
        self.load()

    @overrides
    def load(self):
        # Tokenizers
        tokenizers_config = convert_config2dict(self.config.tokenizers)
        tokenizers = make_all_tokenizers(tokenizers_config)

        # Knowledge Base
        # - Wiki
        knowledge_base_config = self.config.knowledge_base
        self.docs, doc_name = self._load_knowledge_base(knowledge_base_config)

        # Reasoning
        # - Document Retrieval
        # - Reading Comprehension Experiment
        reasoning_config = self.config.reasoning
        self.document_retrieval = self._load_document_retrieval(
            reasoning_config.document_retrieval, tokenizers["word"], basename=doc_name
        )
        self.rc_experiment = self.make_module(reasoning_config.reading_comprehension)
        print("Ready ..! \n")

    def _load_knowledge_base(self, config):
        docs = read_wiki_articles(config.wiki)  # TODO: fix read whole wiki
        doc_name = f"{os.path.basename(config.wiki)}-{len(docs)}-articles"
        return docs, doc_name

    def _load_document_retrieval(self, config, word_tokenizer, basename="docs"):
        dir_path = f"doc-{config.type}-{config.name}-{word_tokenizer.cache_name}"
        doc_retrieval_path = os.path.join(dir_path, basename)

        config.params = {
            "texts": [doc.title for doc in self.docs],
            "word_tokenizer": word_tokenizer,
        }
        document_retrieval = self.make_module(config)

        doc_retrieval_path = self.data_handler.convert_cache_path(doc_retrieval_path)
        if doc_retrieval_path.exists():
            document_retrieval.load(doc_retrieval_path)
        else:
            print("Start Document Retrieval Indexing ...")
            document_retrieval.init()
            document_retrieval.save(doc_retrieval_path)  # Save Cache
            print("Completed!")

        return document_retrieval

    @overrides
    def __call__(self, question):
        result_docs = self.search_documents(question)

        print("-" * 50)
        print("Doc Scores:")
        for doc in result_docs:
            print(f" - {doc[1]} : {doc[2]}")
        print("-" * 50)

        passages = []
        for result_doc in result_docs:
            doc_index = result_doc[0]
            doc = self.docs[doc_index]
            passages.append(doc.text)

        answers = []
        for passage in passages:
            answer_text = self.machine_reading(passage, question)
            answers.append(answer_text)

        ranked_answers = sorted(answers, key=lambda x: x["score"], reverse=True)
        return ranked_answers

    def search_documents(self, question):
        return self.document_retrieval.get_closest(question)

    def machine_reading(self, context, question):
        raw_feature = {"context": context, "question": question}
        return self.rc_experiment.predict(raw_feature)
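# Minimal retrieve-then-read sketch of the __call__ flow above: score documents, read
# each retrieved passage, then rank answers by score. The `retrieve` and `read`
# callables are stand-ins, not the claf document-retrieval or RC modules.
def toy_open_qa(question, docs, retrieve, read, top_k=3):
    # retrieve: question -> [(doc_index, title, score), ...]
    result_docs = retrieve(question)[:top_k]
    answers = []
    for doc_index, _title, _score in result_docs:
        # read: (context, question) -> {"text": ..., "score": ...}
        answers.append(read(docs[doc_index], question))
    return sorted(answers, key=lambda x: x["score"], reverse=True)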
class WordEmbedding(TokenEmbedding):
    """
    Word Embedding
    Default Token Embedding

    * Args:
        vocab: Vocab (claf.tokens.vocab)

    * Kwargs:
        dropout: dropout probability
        embed_dim: the number of embedding dimensions
        padding_idx: If given, pads the output with the embedding vector at padding_idx
            (initialized to zeros) whenever it encounters the index.
        max_norm: If given, will renormalize the embedding vectors to have a norm lesser
            than this before extracting. Note: this will modify weight in-place.
        norm_type: The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq: if given, this will scale gradients by the inverse of
            frequency of the words in the mini-batch. Default False.
        sparse: if True, gradient w.r.t. weight will be a sparse tensor.
            See Notes under torch.nn.Embedding for more details regarding sparse gradients.
        pretrained_path: pretrained vector path (eg. GloVe)
        trainable: finetune or fixed
    """

    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        self.embed_dim = embed_dim

        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # nn.functional.embedding optional parameters
        # (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
        # check - https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

    def _init_weight(self, trainable=True):
        weight = torch.FloatTensor(self.get_vocab_size(), self.embed_dim)
        weight = torch.nn.Parameter(weight, requires_grad=trainable)
        torch.nn.init.xavier_uniform_(weight)
        return weight

    @overrides
    def forward(self, words):
        input_size = words.size()
        if len(input_size) > 2:
            words = words.view(-1, input_size[-1])

        embedded_words = F.embedding(
            words,
            self.weight,
            padding_idx=self.padding_idx,
            max_norm=self.max_norm,
            norm_type=self.norm_type,
            scale_grad_by_freq=self.scale_grad_by_freq,
            sparse=self.sparse,
        )

        if len(input_size) > 2:
            embedded_size = list(input_size) + [embedded_words.size(-1)]
            embedded_words = embedded_words.view(*embedded_size)
        return self.dropout(embedded_words)

    def _read_pretrained_file(self, file_path):
        words_to_keep = set(self.vocab.get_all_tokens())
        vocab_size = self.get_vocab_size()
        embeddings = {}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading embeddings from file")
        file_path = self.data_handler.read(file_path, return_path=True)
        with open(file_path, "rb") as embeddings_file:
            for line in embeddings_file:
                fields = line.decode("utf-8").rstrip().split(" ")
                if len(fields) - 1 != self.embed_dim:
                    logger.info(
                        f"Found line with wrong number of dimensions (expected {self.embed_dim}, was {len(fields)}): {line}"
                    )
                    continue
                word = fields[0]
                if word in words_to_keep:
                    vector = np.asarray(fields[1:], dtype="float32")
                    embeddings[word] = vector

        if not embeddings:
            raise ValueError(
                "No embeddings of correct dimension found. Check the input dimension value."
            )

        all_embeddings = np.asarray(list(embeddings.values()))
        embeddings_mean = float(np.mean(all_embeddings))
        embeddings_std = float(np.std(all_embeddings))

        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size, self.embed_dim).normal_(
            embeddings_mean, embeddings_std
        )

        match_count = 0
        for i in range(0, vocab_size):
            word = self.vocab.get_token(i)
            if word in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[word])
                match_count += 1
            else:
                # f"Word {word} was not found in the embedding file. Initialising randomly."
                pass

        logger.info(f"Match embedding vocab size: {match_count}. [{match_count}/{vocab_size}]")
        return embedding_matrix

    @overrides
    def get_output_dim(self):
        return self.embed_dim
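# Standalone sketch of the GloVe-format parsing in `_read_pretrained_file`: each line is
# "<word> <v1> ... <vd>", lines with the wrong dimensionality are skipped, and only words
# from the vocab are kept. The input lines here are toy data.
import numpy as np

def read_glove_lines(lines, words_to_keep, embed_dim):
    embeddings = {}
    for line in lines:
        fields = line.rstrip().split(" ")
        if len(fields) - 1 != embed_dim:
            continue  # e.g. header lines or corrupted rows
        word = fields[0]
        if word in words_to_keep:
            embeddings[word] = np.asarray(fields[1:], dtype="float32")
    return embeddings

toy_lines = ["hello 0.1 0.2 0.3", "world 0.4 0.5 0.6"]
print(read_glove_lines(toy_lines, {"hello"}, embed_dim=3))  # {'hello': array([0.1, 0.2, 0.3], dtype=float32)}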