Пример #1
0
 def __init__(self, data_path, label_encoder, embedding_path,
              tokenizer_model, separator, phrase, context, label):
     """
     Set up a dataset of phrases together with their tokenized context sentences, using static embeddings.

     :param data_path: [String] path to the csv data file that is turned into a dataset
     :param label_encoder: encoder for the class labels; handed unchanged to the parent constructor
     :param embedding_path: [String] path to the pretrained embeddings
     :param tokenizer_model: [String] name of the SoMaJo tokenizer model used for tokenization
     :param separator: [String] the csv separator
     :param phrase: [String] name of the column the phrase is stored in
     :param context: [String] name of the column the context sentences are stored in
     :param label: [String] name of the column the class label is stored in
     """
     # the extractor is set before the parent constructor runs (same ordering as before)
     extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
     self._feature_extractor = extractor
     super(PhraseAndContextDatasetStatic, self).__init__(
         data_path,
         label_encoder,
         label=label,
         phrase=phrase,
         separator=separator)

     # raw context entries taken from the requested column of the loaded data
     self._sentences = list(self.data[context])

     # camel-case splitting is switched on, sentence splitting is switched off
     self._tokenizer = somajo.SoMaJo(tokenizer_model,
                                     split_sentences=False,
                                     split_camel_case=True)
     self._sentences = self.tokenizer.tokenize_text(self.sentences)

     # keep only the text of every token
     token_texts = []
     for sent in self.sentences:
         token_texts.append([token.text for token in sent])
     self._sentences = token_texts

     self._samples = self.populate_samples()
Пример #2
0
class StaticEmbeddingExtractorTest(unittest.TestCase):
    """Checks StaticEmbeddingExtractor lookups against three pretrained embedding files."""

    def setUp(self):
        """Create one extractor per embedding file and look up the vectors shared by the tests."""
        self.extractor_skipgram = StaticEmbeddingExtractor(
            "embeddings/german-skipgram-mincount-30-ctx-10-dims-300.fifu")
        self.extractor_structgram = StaticEmbeddingExtractor(
            "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu")
        self.extractor_word2vec = StaticEmbeddingExtractor(
            "embeddings/encow-adj-n.fifu")

        # the same word, looked up in two different embedding files
        word = "Pferd"
        self.emb_skip = self.extractor_skipgram.get_embedding(word)
        self.emb_struct = self.extractor_structgram.get_embedding(word)

        # several words looked up with a single call
        self.array_embeddings = self.extractor_skipgram.get_array_embeddings(
            ["Schaaf", "Pferd", "Hund", "Katze"])

    def test_dimension(self):
        """
        both embedding types (skipgram and structured skipgram) must yield 300-dimensional vectors
        """
        for embedding in (self.emb_skip, self.emb_struct):
            np.testing.assert_equal(300, embedding.shape[0])

    def test_embeddings_different(self):
        """
        the skipgram and the structured skipgram vector of the same word must not be identical
        """
        identical = np.array_equal(self.emb_skip, self.emb_struct)
        np.testing.assert_equal(identical, False)

    def test_array_embedding(self):
        """
        a lookup of four words must return four embeddings, the first of them with 300 dimensions
        """
        np.testing.assert_equal(4, len(self.array_embeddings))
        first_embedding = self.array_embeddings[0]
        np.testing.assert_equal(300, first_embedding.shape[0])

    def test_additional(self):
        """
        a nonsense string (presumably not in the vocabulary) must still yield a vector of size embedding_dim
        """
        nonsense_word = "hobadibuduasdfasdfadsdsaadsdfsdfs"
        emb = self.extractor_word2vec.get_embedding(nonsense_word)
        np.testing.assert_equal(emb.shape[0],
                                self.extractor_word2vec.embedding_dim)
Пример #3
0
    def __init__(self, path_to_predictions, embedding_path, data_loader,
                 all_labels, max_rank, y_label):
        """
        Rank stored predictions against gold standard label embeddings and summarise the resulting ranks
        (quartiles and share of top ranks).

        :param path_to_predictions: [String] path to the predictions stored as numpy array
        (number_of_test_instances x embedding_dim)
        :param embedding_path: [String] path to the embeddings the labels are looked up in
        :param data_loader: [Dataloader] loader holding the test data; only its first batch is read
        :param all_labels: [list(String)] all unique labels that can occur in the test data
        :param max_rank: [int] the worst possible rank (an instance that would rank worse is set to this number)
        :param y_label: [String] key under which a batch stores the correct labels
        """
        # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, so the file must be trusted
        raw_predictions = np.load(path_to_predictions, allow_pickle=True)
        self._predicted_embeddings = raw_predictions
        self._embeddings = StaticEmbeddingExtractor(embedding_path)

        # the correct labels are read from the first batch of the loader
        first_batch = next(iter(data_loader))
        self._true_labels = first_batch[y_label]
        self._max_rank = max_rank

        # labels are sorted; a label's position in that order is its row in the label embedding matrix
        sorted_labels = sorted(all_labels)
        label_vectors = self._embeddings.get_array_embeddings(sorted_labels)
        self._label2index = {
            label: row
            for row, label in enumerate(sorted_labels)
        }

        # L2-normalise label matrix and predictions row-wise (in case they are not normalised yet)
        self._label_embeddings = F.normalize(
            torch.from_numpy(np.array(label_vectors)), p=2, dim=1)
        self._predicted_embeddings = F.normalize(
            torch.from_numpy(np.array(raw_predictions)), p=2, dim=1)

        # ranks first, then the statistics derived from them
        ranking = self.get_target_based_rank()
        self._ranks, self._gold_similarities, self._composed_similarities = ranking
        self._quartiles, self._result = self.calculate_quartiles(self._ranks)
Пример #4
0
    def setUp(self):
        """Create the paths, vocabularies, extractors, datasets and data loaders used by the ranking tests."""
        test_file = "data_ranking/test.txt"
        attribute_file = "data_ranking/attributes.txt"
        bert_name = 'bert-base-german-cased'

        self.predictions = "embeddings/predictions_rank.npy"
        self.embedding_path = "embeddings/ranking_embeddings.fifu"
        self.ranks_path = "data_ranking/ranks.txt"

        # the same file is passed for all three positional data arguments
        self.labels = extract_all_words(test_file,
                                        test_file,
                                        test_file,
                                        separator=" ",
                                        head="head",
                                        modifier="modifier",
                                        phrase="phrase")
        self.static_extractor = StaticEmbeddingExtractor(self.embedding_path)
        self.contextualized_extractor = BertExtractor(bert_name, 20, False, 4)

        static_dataset = StaticRankingDataset(
            data_path=test_file,
            embedding_path=self.embedding_path,
            separator=" ",
            phrase="phrase",
            mod="modifier",
            head="head")

        # again one file serves as test, training and validation data
        self.attributes = extract_all_labels(test_data=attribute_file,
                                             training_data=attribute_file,
                                             validation_data=attribute_file,
                                             separator=" ",
                                             label="label")
        bert_dataset = ContextualizedRankingDataset(
            data_path=attribute_file,
            mod="modifier",
            head="head",
            label="label",
            bert_model=bert_name,
            batch_size=2,
            lower_case=False,
            max_len=10,
            separator=" ",
            label_definition_path="data_ranking/attribute_definitions")

        # batch size equals the dataset size, so each loader yields the whole dataset in one batch
        self.data_loader = DataLoader(dataset=static_dataset,
                                      shuffle=False,
                                      batch_size=len(static_dataset))
        self.data_loader_contextualized = DataLoader(
            dataset=bert_dataset,
            shuffle=False,
            batch_size=len(bert_dataset))
Пример #5
0
 def setUp(self):
     """Create one extractor per embedding file and look up the vectors shared by the tests."""
     self.extractor_skipgram = StaticEmbeddingExtractor(
         "embeddings/german-skipgram-mincount-30-ctx-10-dims-300.fifu")
     self.extractor_structgram = StaticEmbeddingExtractor(
         "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu")
     self.extractor_word2vec = StaticEmbeddingExtractor(
         "embeddings/encow-adj-n.fifu")

     # the same word, looked up in two different embedding files
     word = "Pferd"
     self.emb_skip = self.extractor_skipgram.get_embedding(word)
     self.emb_struct = self.extractor_structgram.get_embedding(word)

     # several words looked up with a single call
     self.array_embeddings = self.extractor_skipgram.get_array_embeddings(
         ["Schaaf", "Pferd", "Hund", "Katze"])
Пример #6
0
    def __init__(self, data_path, label_encoder, embedding_path, separator,
                 phrase, label):
        """
        Set up a phrase dataset whose features come from pretrained static embeddings.

        :param data_path: [String] path to the csv data file that is turned into a dataset
        :param label_encoder: encoder for the class labels; handed unchanged to the parent constructor
        :param embedding_path: [String] path to the pretrained embeddings
        :param separator: [String] the csv separator
        :param phrase: [String] name of the column the phrase is stored in
        :param label: [String] name of the column the class label is stored in
        """
        # the extractor is set before the parent constructor runs (same ordering as before)
        extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
        self._feature_extractor = extractor

        super(SimplePhraseStaticDataset, self).__init__(
            data_path,
            label_encoder,
            label=label,
            phrase=phrase,
            separator=separator,
        )

        # samples are built last, after the parent constructor has finished
        samples = self.populate_samples()
        self._samples = samples
Пример #7
0
    def __init__(self, data_path, embedding_path, separator, mod, head,
                 phrase):
        """
        This dataset can be used to pretrain a composition model on a reconstruction task.

        :param data_path: the path to the dataset, should have a header
        :param embedding_path: the path to the pretrained static word embeddings
        :param separator: the separator used within the dataset file
        :param mod: the name of the column holding the modifier words
        :param head: the name of the column holding the head words
        :param phrase: the name of the column holding the phrases
        """
        self._data = pandas.read_csv(data_path,
                                     index_col=False,
                                     delimiter=separator)

        # one plain list per column of interest
        self._modifier_words = list(self.data[mod])
        self._head_words = list(self.data[head])
        self._phrases = list(self.data[phrase])

        # all three columns must contain the same number of entries
        column_lengths = (len(self.modifier_words), len(self.head_words),
                          len(self.phrases))
        assert len(set(column_lengths)) == 1, \
            "invalid input data, different lenghts"

        extractor = StaticEmbeddingExtractor(path_to_embeddings=embedding_path)
        self._feature_extractor = extractor
        self._samples = self.populate_samples()
Пример #8
0
class Ranker:
    """
    Ranks composed (predicted) embeddings with respect to gold standard label embeddings and summarises the
    ranks (quartiles, share of rank 1 and share of ranks <= 5).
    """

    def __init__(self, path_to_predictions, embedding_path, data_loader,
                 all_labels, max_rank, y_label):
        """
        Load predictions and label embeddings, normalise both and compute ranks and rank statistics.

        :param path_to_predictions: [String] The path to numpy array stored predictions (number_of_test_instances x
        embedding_dim)
        :param embedding_path: [String] the path to the embeddings
        :param data_loader: [Dataloader] a data loader that holds the test data; only its first batch is read, so
        that batch has to contain every test instance
        :param all_labels: [list(String)] a list of all unique labels that can occur in the test data
        :param max_rank: [int] the worst possible rank (even if an instance would get a lower rank it is set to this
        number)
        :param y_label: [String] the key under which a batch stores the correct labels
        """
        # load composed predictions
        # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, so the file must be trusted
        self._predicted_embeddings = np.load(path_to_predictions,
                                             allow_pickle=True)
        self._embeddings = StaticEmbeddingExtractor(embedding_path)
        data = next(iter(data_loader))
        # the correct labels are stored here
        self._true_labels = data[y_label]
        self._max_rank = max_rank

        # construct label embedding matrix, embeddings of labels are looked up in the original embeddings;
        # a label's position in the sorted list is its row in that matrix
        all_labels = sorted(all_labels)
        self._label_embeddings = self._embeddings.get_array_embeddings(
            all_labels)
        self._label2index = {
            label: index
            for index, label in enumerate(all_labels)
        }
        # normalize predictions and label embedding matrix (in case they are not normalized)
        self._label_embeddings = F.normalize(torch.from_numpy(
            np.array(self._label_embeddings)),
                                             p=2,
                                             dim=1)
        self._predicted_embeddings = F.normalize(torch.from_numpy(
            np.array(self._predicted_embeddings)),
                                                 p=2,
                                                 dim=1)
        # compute the ranks, quartiles and precision
        (self._ranks, self._gold_similarities,
         self._composed_similarities) = self.get_target_based_rank()
        self._quartiles, self._result = self.calculate_quartiles(self._ranks)

    def get_target_based_rank(self):
        """
        Computes the ranks of the composed representations, given a matrix of gold standard label embeddings.
        The ordering is relative to the gold standard target/label representation.
        :return: a tuple of three lists with one entry per predicted instance: the rank (clipped to max_rank), the
        similarities of the gold label to every label, and the similarity between gold label and prediction
        """
        all_ranks = []
        gold_similarities = []
        composed_similarities = []
        # get the index for each label in the true labels
        target_idxs = [self.label2index[label] for label in self.true_labels]

        # get a matrix, each row representing the gold representation of the corresponding label
        target_repr = np.take(self.label_embeddings, target_idxs, axis=0)

        # get the similarity between each label and each other possible label
        # result: [labelsize x targetinstances]  = for each instance a vector of similarities to each label
        target_dict_similarities = np.dot(self.label_embeddings,
                                          np.transpose(target_repr))

        for i in range(self._predicted_embeddings.shape[0]):
            # compute similarity between the target and the predicted vector
            target_composed_similarity = np.dot(self.predicted_embeddings[i],
                                                target_repr[i])
            label_similarities = target_dict_similarities[:, i]
            composed_similarities.append(target_composed_similarity)
            gold_similarities.append(label_similarities)
            # delete the similarity between the target label and itself
            target_sims = np.delete(label_similarities, target_idxs[i])

            # the rank is the number of vectors with greater similarity than the one between
            # the target representation and the composed one; no sorting is required, just
            # the number of elements that are more similar
            rank = np.count_nonzero(
                target_sims > target_composed_similarity) + 1
            # ranks worse than max_rank are clipped
            if rank > self.max_rank:
                rank = self.max_rank
            all_ranks.append(rank)

        return all_ranks, gold_similarities, composed_similarities

    def save_ranks(self, file_to_save):
        """
        Write one line "<label> <rank>" per test instance to a file and print a confirmation.
        :param file_to_save: [String] path of the file the ranks are written to (overwritten if it exists)
        """
        with open(file_to_save, "w", encoding="utf8") as f:
            for i, label in enumerate(self.true_labels):
                f.write(label + " " + str(self.ranks[i]) + "\n")
        print("ranks saved to file: " + file_to_save)

    @staticmethod
    def calculate_quartiles(ranks):
        """
        get the quartiles for the data
        :param ranks: a list of ranks
        :return: a tuple (quartiles, summary). quartiles holds the three quartiles (lower, median, upper); if fewer
        than three ranks are given the ranks themselves are returned instead. summary is a string with the
        percentage of ranks that are equal to 1 and the percentage of ranks that are <= 5 (both reported as 0.00
        for an empty input)
        """
        sorted_data = sorted(ranks)
        total = len(sorted_data)
        leq5 = sum(1 for rank in sorted_data if rank <= 5)
        leq1 = sum(1 for rank in sorted_data if rank == 1)
        # an empty input has no meaningful percentage; report 0 instead of dividing by zero
        share_leq1 = 100 * leq1 / float(total) if total else 0.0
        share_leq5 = 100 * leq5 / float(total) if total else 0.0
        result = "%.2f%% of ranks = 1; %.2f%% of ranks <=5" % (share_leq1,
                                                               share_leq5)
        if total < 3:
            return ranks, result

        mid_index = (total - 1) // 2
        # for an odd number of ranks the median element belongs to neither half,
        # for an even number the lower half ends with the element at mid_index
        lower_end = mid_index if total % 2 != 0 else mid_index + 1
        halves_and_all = (sorted_data[:lower_end], sorted_data,
                          sorted_data[mid_index + 1:])
        quartiles = [np.median(part) for part in halves_and_all]
        return quartiles, result

    @property
    def predicted_embeddings(self):
        return self._predicted_embeddings

    @property
    def embeddings(self):
        return self._embeddings

    @property
    def true_labels(self):
        return self._true_labels

    @property
    def max_rank(self):
        return self._max_rank

    @property
    def label_embeddings(self):
        return self._label_embeddings

    @property
    def label2index(self):
        return self._label2index

    @property
    def ranks(self):
        return self._ranks

    @property
    def quartiles(self):
        return self._quartiles

    @property
    def result(self):
        return self._result

    @property
    def gold_similarities(self):
        return self._gold_similarities

    @property
    def composed_similarities(self):
        return self._composed_similarities
Пример #9
0
    argp.add_argument("training_config", help="the config that was used to train the model with")
    argp.add_argument("model_path", help="the path to the model that should be used to construct phrase representations")
    argp = argp.parse_args()

    with open(argp.training_config, 'r') as f:
        training_config = json.load(f)

    adj2lexunits = get_adj2lexunits(argp.sense_definitions)
    descriptions = read_sense_descriptions(argp.sense_definitions)
    if "Gerco" in argp.wsd_dataset:

        wsd_dataset = load_dataset_gerco(argp.wsd_dataset)
    else:
        wsd_dataset = load_dataset_wiki(argp.wsd_dataset)

    embeddings = StaticEmbeddingExtractor(training_config["feature_extractor"]["static"]["pretrained_model"])
    sense_embeddings = StaticEmbeddingExtractor(argp.sense_embeddings)
    triples = adj_triples(wsd_dataset, sense_embeddings, embeddings, descriptions)
    if "pretrain" in training_config["model"]["type"]:
        predictions = predict_single_task(model_path=argp.model_path, training_config=training_config,
                                          wsd_dataset=wsd_dataset, embedding_extractor=embeddings)
    else:
        predictions = predict_joint_model(model_path=argp.model_path, training_config=training_config,
                                          wsd_dataset=wsd_dataset, embedding_extractor=embeddings)

    result_final, result_att, result_reconstructed = disambiguate(wsd_dataset, triples, predictions)
    baseline = get_baseline(wsd_dataset, adj2lexunits)
    print("accuracy for a random picked sense for this dataset is: %.2f" % baseline)
    print(
        "accuracy for final phrase for this dataset is: %.2f\naccuracy for attribute phrase is %.2f \naccuracy for "
        "reconstructed phrase is %.2f" % (result_final, result_att, result_reconstructed))
    scores_path_val = str(
        Path(config["directory_path"]).joinpath(config["save_name"] +
                                                "_scores_val.txt"))
    scores_path_test = str(
        Path(config["directory_path"]).joinpath(config["save_name"] +
                                                "_scores_test.txt"))

    # get static embedding extractor and dataset if contextualized embeddings is false
    if config["feature_extractor"]["contextualized_embeddings"] is False:
        data_val = StaticRankingDataset(
            config["validation_data_path"],
            config["feature_extractor"]["static"]["pretrained_model"],
            config["data"]["separator"], config["data"]["modifier"],
            config["data"]["head"], config["data"]["label"])
        feature_extractor = StaticEmbeddingExtractor(
            path_to_embeddings=config["feature_extractor"]["static"]
            ["pretrained_model"])
    # else get contextualised feature extractor and dataset
    else:
        bert_parameter = config["feature_extractor"]["contextualized"]["bert"]
        bert_model = bert_parameter["model"]
        max_len = bert_parameter["max_sent_len"]
        lower_case = bert_parameter["lower_case"]
        batch_size = bert_parameter["batch_size"]

        data_val = ContextualizedRankingDataset(
            config["validation_data_path"],
            bert_model,
            max_len,
            lower_case,
            batch_size,