def __init__(self, data_path, label_encoder, embedding_path, tokenizer_model,
             separator, phrase, context, label):
    """
    Set up a dataset of phrases together with their tokenized context
    sentences.

    :param data_path: [String] The path to the csv datafile that needs to be
    transformed into a dataset.
    :param label_encoder: forwarded unchanged to the parent constructor
    :param embedding_path: [String] the path to the pretrained embeddings
    :param tokenizer_model: [String] the model name handed to the SoMaJo
    tokenizer that is used for tokenization
    :param separator: [String] the csv separator
    :param phrase: [String] the label of the column the phrase is stored in
    :param context: [String] the label of the column the context sentences
    are stored in
    :param label: [String] the label of the column the class label is stored
    in
    """
    # the extractor is created first, before the parent constructor runs
    extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
    self._feature_extractor = extractor
    super(PhraseAndContextDatasetStatic, self).__init__(
        data_path,
        label_encoder,
        label=label,
        phrase=phrase,
        separator=separator,
    )

    # raw context sentences, read from the requested column
    # NOTE(review): self.data, self.sentences and self.tokenizer are defined
    # outside this snippet - presumably plain accessors; confirm
    self._sentences = list(self.data[context])

    # camel case tokens are split, sentence splitting is switched off
    self._tokenizer = somajo.SoMaJo(
        tokenizer_model, split_camel_case=True, split_sentences=False)
    self._sentences = self.tokenizer.tokenize_text(self.sentences)

    # keep only the surface text of every token
    token_texts = []
    for tokenized_sentence in self.sentences:
        token_texts.append([token.text for token in tokenized_sentence])
    self._sentences = token_texts

    self._samples = self.populate_samples()
class StaticEmbeddingExtractorTest(unittest.TestCase):
    """
    Tests for StaticEmbeddingExtractor using three embedding files
    (skipgram, structured skipgram and word2vec).
    """

    def setUp(self):
        """
        load the three extractors and look up the embeddings the tests use
        """
        skipgram_file = "embeddings/german-skipgram-mincount-30-ctx-10-dims-300.fifu"
        structgram_file = "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu"
        word2vec_file = "embeddings/encow-adj-n.fifu"

        self.extractor_skipgram = StaticEmbeddingExtractor(skipgram_file)
        self.extractor_structgram = StaticEmbeddingExtractor(structgram_file)
        self.extractor_word2vec = StaticEmbeddingExtractor(word2vec_file)

        # the same word is looked up in both 300-dimensional models
        self.emb_skip = self.extractor_skipgram.get_embedding("Pferd")
        self.emb_struct = self.extractor_structgram.get_embedding("Pferd")

        self.array_embeddings = self.extractor_skipgram.get_array_embeddings(
            ["Schaaf", "Pferd", "Hund", "Katze"])

    def test_dimension(self):
        """
        test dimension of both types of embedding (skipgram and structured
        skipgrams)
        """
        expected = 300
        for embedding in (self.emb_skip, self.emb_struct):
            np.testing.assert_equal(expected, embedding.shape[0])

    def test_embeddings_different(self):
        """
        tests that the two models (structured vs. normal skipgram) return
        different embeddings for the same word
        """
        identical = np.array_equal(self.emb_skip, self.emb_struct)
        np.testing.assert_equal(identical, False)

    def test_array_embedding(self):
        """
        tests length of array of embeddings and dimension of embedding
        """
        expected = 4
        expected_dimensions = 300
        first_embedding = self.array_embeddings[0]
        np.testing.assert_equal(expected, len(self.array_embeddings))
        np.testing.assert_equal(expected_dimensions, first_embedding.shape[0])

    def test_additional(self):
        """
        tests that a nonsense string still yields an embedding whose size
        equals the extractor's embedding_dim
        """
        nonsense_word = "hobadibuduasdfasdfadsdsaadsdfsdfs"
        emb = self.extractor_word2vec.get_embedding(nonsense_word)
        np.testing.assert_equal(emb.shape[0],
                                self.extractor_word2vec.embedding_dim)
def __init__(self, path_to_predictions, embedding_path, data_loader,
             all_labels, max_rank, y_label):
    """
    This class stores the functionality to rank a prediction with respect to
    some gold standard representations and to compute the precision at
    certain ranks or the quartiles

    :param path_to_predictions: [String] The path to numpy array stored
    predictions (number_of_test_instances x embedding_dim)
    :param embedding_path: [String] the path to the embeddings
    :param data_loader: [Dataloader] a data loader that holds the test data;
    only the first batch it yields is read
    :param all_labels: [list(String)] a list of all unique labels that can
    occur in the test data
    :param max_rank: [int] the worst possible rank (even if an instance would
    get a lower rank it is set to this number)
    :param y_label: [String] the key the gold labels are stored under in a
    batch of the test data
    """
    # composed predictions and the embeddings the labels are looked up in
    self._predicted_embeddings = np.load(path_to_predictions,
                                         allow_pickle=True)
    self._embeddings = StaticEmbeddingExtractor(embedding_path)

    # gold labels come from the first batch of the loader
    first_batch = next(iter(data_loader))
    self._true_labels = first_batch[y_label]
    self._max_rank = max_rank

    # label embedding matrix; row order follows the sorted label list
    ordered_labels = sorted(all_labels)
    self._label_embeddings = self._embeddings.get_array_embeddings(
        ordered_labels)
    self._label2index = {
        label_name: position
        for position, label_name in enumerate(ordered_labels)
    }

    # L2-normalize both matrices row-wise (in case they are not normalized)
    label_matrix = torch.from_numpy(np.array(self._label_embeddings))
    self._label_embeddings = F.normalize(label_matrix, p=2, dim=1)
    prediction_matrix = torch.from_numpy(
        np.array(self._predicted_embeddings))
    self._predicted_embeddings = F.normalize(prediction_matrix, p=2, dim=1)

    # ranks first, then the statistics that are derived from them
    rank_output = self.get_target_based_rank()
    (self._ranks,
     self._gold_similarities,
     self._composed_similarities) = rank_output
    self._quartiles, self._result = self.calculate_quartiles(self._ranks)
def setUp(self):
    """
    Prepare paths, label lists, extractors, datasets and data loaders for the
    ranking tests. Both loaders use a batch size equal to the dataset length,
    i.e. one batch holds the whole dataset.
    """
    test_file = "data_ranking/test.txt"
    attribute_file = "data_ranking/attributes.txt"
    bert_name = 'bert-base-german-cased'

    self.predictions = "embeddings/predictions_rank.npy"
    self.embedding_path = "embeddings/ranking_embeddings.fifu"
    self.ranks_path = "data_ranking/ranks.txt"

    # the same file is passed for all three positional data arguments
    self.labels = extract_all_words(
        test_file, test_file, test_file,
        separator=" ", head="head", modifier="modifier", phrase="phrase")

    self.static_extractor = StaticEmbeddingExtractor(self.embedding_path)
    self.contextualized_extractor = BertExtractor(bert_name, 20, False, 4)

    static_dataset = StaticRankingDataset(
        data_path=test_file,
        embedding_path=self.embedding_path,
        separator=" ",
        phrase="phrase",
        mod="modifier",
        head="head",
    )

    # again one file serves as test, training and validation data
    self.attributes = extract_all_labels(
        test_data=attribute_file,
        training_data=attribute_file,
        validation_data=attribute_file,
        separator=" ",
        label="label",
    )

    bert_dataset = ContextualizedRankingDataset(
        data_path=attribute_file,
        mod="modifier",
        head="head",
        label="label",
        bert_model=bert_name,
        batch_size=2,
        lower_case=False,
        max_len=10,
        separator=" ",
        label_definition_path="data_ranking/attribute_definitions",
    )

    self.data_loader = DataLoader(
        dataset=static_dataset,
        shuffle=False,
        batch_size=len(static_dataset),
    )
    self.data_loader_contextualized = DataLoader(
        dataset=bert_dataset,
        shuffle=False,
        batch_size=len(bert_dataset),
    )
def setUp(self):
    """
    load the three extractors (skipgram, structured skipgram, word2vec) and
    look up the embeddings the tests work with
    """
    skipgram_file = "embeddings/german-skipgram-mincount-30-ctx-10-dims-300.fifu"
    structgram_file = "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu"
    word2vec_file = "embeddings/encow-adj-n.fifu"

    self.extractor_skipgram = StaticEmbeddingExtractor(skipgram_file)
    self.extractor_structgram = StaticEmbeddingExtractor(structgram_file)
    self.extractor_word2vec = StaticEmbeddingExtractor(word2vec_file)

    # the same word is looked up in the skipgram and the structgram model
    self.emb_skip = self.extractor_skipgram.get_embedding("Pferd")
    self.emb_struct = self.extractor_structgram.get_embedding("Pferd")

    self.array_embeddings = self.extractor_skipgram.get_array_embeddings(
        ["Schaaf", "Pferd", "Hund", "Katze"])
def __init__(self, data_path, label_encoder, embedding_path, separator,
             phrase, label):
    """
    Set up a dataset of single phrases backed by static embeddings.

    :param data_path: [String] The path to the csv datafile that needs to be
    transformed into a dataset.
    :param label_encoder: forwarded unchanged to the parent constructor
    :param embedding_path: [String] the path to the pretrained embeddings
    :param separator: [String] the csv separator
    :param phrase: [String] the label of the column the phrase is stored in
    :param label: [String] the label of the column the class label is stored
    in
    """
    # the extractor is created first, before the parent constructor runs
    extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
    self._feature_extractor = extractor

    super(SimplePhraseStaticDataset, self).__init__(
        data_path,
        label_encoder,
        label=label,
        phrase=phrase,
        separator=separator,
    )

    # populate_samples() is defined outside this snippet
    self._samples = self.populate_samples()
def __init__(self, data_path, embedding_path, separator, mod, head, phrase):
    """
    This dataset can be used to pretrain a composition model on a
    reconstruction task

    :param data_path: the path to the dataset, should have a header
    :param embedding_path: the path to the pretrained static word embeddings
    :param separator: the separator within the dataset
    :param mod: the name of the column holding the modifier words
    :param head: the name of the column holding the head words
    :param phrase: the name of the column holding the phrases
    """
    self._data = pandas.read_csv(
        data_path, delimiter=separator, index_col=False)

    # one list per column of interest
    # NOTE(review): self.data / self.modifier_words / self.head_words /
    # self.phrases are defined outside this snippet - presumably plain
    # accessors for the underscore attributes; confirm
    self._modifier_words = list(self.data[mod])
    self._head_words = list(self.data[head])
    self._phrases = list(self.data[phrase])

    # sanity check: all three columns must have the same number of entries
    assert (
        len(self.modifier_words)
        == len(self.head_words)
        == len(self.phrases)
    ), "invalid input data, different lenghts"

    extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
    self._feature_extractor = extractor
    self._samples = self.populate_samples()
class Ranker:
    """
    This class stores the functionality to rank a prediction with respect to
    some gold standard representations and to compute summary statistics of
    the ranks (quartiles, share of rank 1 and share of ranks <= 5).

    Everything is computed once in the constructor and exposed through
    read-only properties.
    """

    def __init__(self, path_to_predictions, embedding_path, data_loader,
                 all_labels, max_rank, y_label):
        """
        Load the predictions, look up the label embeddings and compute the
        ranks and their statistics.

        :param path_to_predictions: [String] The path to numpy array stored
        predictions (number_of_test_instances x embedding_dim)
        :param embedding_path: [String] the path to the embeddings
        :param data_loader: [Dataloader] a data loader that holds the test
        data; only the first batch it yields is read, and the ranking loop
        indexes one gold label per prediction, so that batch has to contain
        every test instance
        :param all_labels: [list(String)] a list of all unique labels that can
        occur in the test data
        :param max_rank: [int] the worst possible rank (even if an instance
        would get a lower rank it is set to this number)
        :param y_label: [String] the key the gold labels are stored under in a
        batch of the test data
        """
        # load composed predictions
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects - only point this at prediction files from a trusted source
        self._predicted_embeddings = np.load(path_to_predictions,
                                             allow_pickle=True)
        self._embeddings = StaticEmbeddingExtractor(embedding_path)

        # the correct labels are stored in the first batch
        first_batch = next(iter(data_loader))
        self._true_labels = first_batch[y_label]
        self._max_rank = max_rank

        # construct label embedding matrix, embeddings of labels are looked up
        # in the original embeddings; row order follows the sorted labels
        ordered_labels = sorted(all_labels)
        self._label_embeddings = self._embeddings.get_array_embeddings(
            ordered_labels)
        self._label2index = {
            label: index for index, label in enumerate(ordered_labels)
        }

        # normalize predictions and label embedding matrix (in case they are
        # not normalized) so that dot products are cosine similarities
        self._label_embeddings = F.normalize(
            torch.from_numpy(np.array(self._label_embeddings)), p=2, dim=1)
        self._predicted_embeddings = F.normalize(
            torch.from_numpy(np.array(self._predicted_embeddings)), p=2,
            dim=1)

        # compute the ranks first, the statistics are derived from them
        (self._ranks,
         self._gold_similarities,
         self._composed_similarities) = self.get_target_based_rank()
        self._quartiles, self._result = self.calculate_quartiles(self._ranks)

    def get_target_based_rank(self):
        """
        Computes the ranks of the composed representations, given a matrix of
        gold standard label embeddings. The ordering is relative to the gold
        standard target/label representation.

        :return: a tuple (ranks, gold_similarities, composed_similarities):
        the rank of every composed representation, for every instance the
        vector of similarities between its gold label and all labels, and for
        every instance the similarity between its gold label and its composed
        representation
        """
        all_ranks = []
        gold_similarities = []
        composed_similarities = []

        # get the index for each label in the true labels
        target_idxs = [self.label2index[label] for label in self.true_labels]

        # get a matrix, each row representing the gold representation of the
        # corresponding label
        target_repr = np.take(self.label_embeddings, target_idxs, axis=0)

        # get the similarity between each label and each other possible label
        # result: [labelsize x targetinstances] = for each instance a vector
        # of similarities to each label
        target_dict_similarities = np.dot(self.label_embeddings,
                                          np.transpose(target_repr))

        for i in range(self.predicted_embeddings.shape[0]):
            # compute similarity between the target and the predicted vector
            target_composed_similarity = np.dot(self.predicted_embeddings[i],
                                                target_repr[i])
            label_similarities = target_dict_similarities[:, i]
            composed_similarities.append(target_composed_similarity)
            gold_similarities.append(label_similarities)

            # delete the similarity between the target label and itself
            target_sims = np.delete(label_similarities, target_idxs[i])

            # the rank is the number of vectors with greater similarity than
            # the one between the target representation and the composed one;
            # no sorting is required, just the number of elements that are
            # more similar
            rank = np.count_nonzero(
                target_sims > target_composed_similarity) + 1

            # ranks worse than max_rank are clipped to max_rank
            all_ranks.append(min(rank, self.max_rank))

        return all_ranks, gold_similarities, composed_similarities

    def save_ranks(self, file_to_save):
        """
        Write one line per test instance (gold label, a space, the rank) to a
        file and print a confirmation.

        :param file_to_save: [String] the path of the file the ranks are
        written to; an existing file is overwritten
        """
        with open(file_to_save, "w", encoding="utf8") as f:
            for i, label in enumerate(self.true_labels):
                f.write(label + " " + str(self.ranks[i]) + "\n")
        print("ranks saved to file: " + file_to_save)

    @staticmethod
    def calculate_quartiles(ranks):
        """
        get the quartiles for the data

        :param ranks: a list of ranks
        :return: a tuple (quartiles, result). quartiles holds the first
        quartile, the median and the third quartile; if fewer than three ranks
        are given no quartiles are computed and the ranks are returned
        unchanged instead. result is a string stating the percentage of ranks
        that are equal to 1 and the percentage of ranks that are <= 5 (both
        0.00 for an empty list)
        """
        sorted_data = sorted(ranks)
        total = len(sorted_data)
        leq5 = sum(1 for rank in sorted_data if rank <= 5)
        leq1 = sum(1 for rank in sorted_data if rank == 1)

        # without ranks there is nothing to divide by; report 0% instead of
        # raising a ZeroDivisionError
        percent_leq1 = 100 * leq1 / float(total) if total else 0.0
        percent_leq5 = 100 * leq5 / float(total) if total else 0.0
        result = "%.2f%% of ranks = 1; %.2f%% of ranks <=5" % (percent_leq1,
                                                               percent_leq5)

        if total < 3:
            return ranks, result

        mid_index = math.floor((total - 1) / 2)
        # odd length: the median element belongs to neither half;
        # even length: the data is split into two equally sized halves
        lower_end = mid_index if total % 2 != 0 else mid_index + 1
        quartiles = [
            np.median(part)
            for part in (sorted_data[:lower_end],
                         sorted_data,
                         sorted_data[mid_index + 1:])
        ]
        return quartiles, result

    @property
    def predicted_embeddings(self):
        return self._predicted_embeddings

    @property
    def embeddings(self):
        return self._embeddings

    @property
    def true_labels(self):
        return self._true_labels

    @property
    def max_rank(self):
        return self._max_rank

    @property
    def label_embeddings(self):
        return self._label_embeddings

    @property
    def label2index(self):
        return self._label2index

    @property
    def ranks(self):
        return self._ranks

    @property
    def quartiles(self):
        return self._quartiles

    @property
    def result(self):
        return self._result

    @property
    def gold_similarities(self):
        return self._gold_similarities

    @property
    def composed_similarities(self):
        return self._composed_similarities
argp.add_argument(
    "training_config",
    help="the config that was used to train the model with")
argp.add_argument(
    "model_path",
    help="the path to the model that should be used to construct phrase representations")
# from here on the name argp refers to the parsed arguments, not the parser
argp = argp.parse_args()

with open(argp.training_config, 'r') as config_file:
    training_config = json.load(config_file)

adj2lexunits = get_adj2lexunits(argp.sense_definitions)
descriptions = read_sense_descriptions(argp.sense_definitions)

# the loader is chosen by whether the dataset path contains "Gerco"
load_wsd_dataset = (load_dataset_gerco
                    if "Gerco" in argp.wsd_dataset
                    else load_dataset_wiki)
wsd_dataset = load_wsd_dataset(argp.wsd_dataset)

# embeddings the model was trained with, and separate sense embeddings
static_settings = training_config["feature_extractor"]["static"]
embeddings = StaticEmbeddingExtractor(static_settings["pretrained_model"])
sense_embeddings = StaticEmbeddingExtractor(argp.sense_embeddings)
triples = adj_triples(wsd_dataset, sense_embeddings, embeddings,
                      descriptions)

# model types containing "pretrain" use the single task prediction,
# every other type uses the joint model prediction
predict_phrases = (predict_single_task
                   if "pretrain" in training_config["model"]["type"]
                   else predict_joint_model)
predictions = predict_phrases(model_path=argp.model_path,
                              training_config=training_config,
                              wsd_dataset=wsd_dataset,
                              embedding_extractor=embeddings)

accuracies = disambiguate(wsd_dataset, triples, predictions)
result_final, result_att, result_reconstructed = accuracies
baseline = get_baseline(wsd_dataset, adj2lexunits)

print("accuracy for a random picked sense for this dataset is: %.2f" %
      baseline)
print(
    "accuracy for final phrase for this dataset is: %.2f\naccuracy for attribute phrase is %.2f \naccuracy for "
    "reconstructed phrase is %.2f" %
    (result_final, result_att, result_reconstructed))
scores_path_val = str( Path(config["directory_path"]).joinpath(config["save_name"] + "_scores_val.txt")) scores_path_test = str( Path(config["directory_path"]).joinpath(config["save_name"] + "_scores_test.txt")) # get static embedding extractor and dataset if contextualized embeddings is false if config["feature_extractor"]["contextualized_embeddings"] is False: data_val = StaticRankingDataset( config["validation_data_path"], config["feature_extractor"]["static"]["pretrained_model"], config["data"]["separator"], config["data"]["modifier"], config["data"]["head"], config["data"]["label"]) feature_extractor = StaticEmbeddingExtractor( path_to_embeddings=config["feature_extractor"]["static"] ["pretrained_model"]) # else get contextualised feature extractor and dataset else: bert_parameter = config["feature_extractor"]["contextualized"]["bert"] bert_model = bert_parameter["model"] max_len = bert_parameter["max_sent_len"] lower_case = bert_parameter["lower_case"] batch_size = bert_parameter["batch_size"] data_val = ContextualizedRankingDataset( config["validation_data_path"], bert_model, max_len, lower_case, batch_size,