Example #1
    def __init__(
        self,
        vocab,
        options_file=DEFAULT_OPTIONS_FILE,
        weight_file=DEFAULT_WEIGHT_FILE,
        do_layer_norm=False,
        dropout=0.5,
        trainable=False,
        project_dim=None,
    ):
        super(ELMoEmbedding, self).__init__(vocab)
        data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        option_path = data_handler.read(options_file, return_path=True)
        weight_path = data_handler.read(weight_file, return_path=True)

        self.elmo = Elmo(option_path,
                         weight_path,
                         1,  # num_output_representations
                         do_layer_norm=do_layer_norm,
                         requires_grad=trainable,
                         dropout=dropout)

        self.project_dim = project_dim
        self.project_linear = None
        if project_dim:
            self.project_linear = nn.Linear(self.elmo.get_output_dim(),
                                            project_dim)
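
For context, a hypothetical usage sketch (not from the repo): batch_to_ids is allennlp's helper for producing the character-id tensor that Elmo consumes, and the default weights yield 1024-dim representations, which the optional linear layer projects down.

from allennlp.modules.elmo import batch_to_ids

embedding = ELMoEmbedding(vocab, project_dim=256)  # vocab as used elsewhere in claf
character_ids = batch_to_ids([["Hello", "world"]])
elmo_output = embedding.elmo(character_ids)
representation = elmo_output["elmo_representations"][0]  # (1, 2, 1024) with default weights
if embedding.project_linear is not None:
    representation = embedding.project_linear(representation)  # (1, 2, 256)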
Example #2
File: bpe.py  Project: paulsunnypark/claf
class BPETokenizer(Tokenizer):
    """
    BPE (Byte-Pair Encoding) Tokenizer
    text -> ...
    * Args:
        name: tokenizer name [roberta]
    """
    def __init__(self, name, config=None):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config or {}  # avoid a shared mutable default; _roberta mutates this dict

        self.bpe_tokenizer = None

    """ Tokenizers """

    def _roberta(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'ĠWorld']  (byte-level BPE marks word-initial spaces with 'Ġ')
        """
        if self.bpe_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"],
                                                 return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path,
                                                  **self.config)

        return self.bpe_tokenizer._tokenize(text)
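
A hypothetical usage sketch (file names are placeholders): the config must supply "vocab_path" and "merges_path", which are resolved through DataHandler and then removed before the remaining entries are forwarded to RobertaTokenizer as keyword arguments.

tokenizer = BPETokenizer(
    "roberta",
    config={"vocab_path": "roberta-vocab.json", "merges_path": "roberta-merges.txt"},
)
tokens = tokenizer._roberta("Hello world")  # the RobertaTokenizer is loaded lazily on first call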
Example #3
    def __init__(
        self, word_embedding, pretrained_path=None, requires_grad=False, residual_embeddings=False
    ):
        """Initialize an MTLSTM.

        Arguments:
            word_embedding: embedding module that maps word ids to the 300-dim LSTM inputs
            pretrained_path (str): path to the pretrained CoVe LSTM weights
            requires_grad (bool): if True, fine-tune the CoVe weights; if False, keep them fixed
            residual_embeddings (bool): If True, concatenate the input embeddings with MTLSTM outputs during forward
        """
        super(MTLSTM, self).__init__()
        self.word_embedding = word_embedding
        self.rnn = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)

        data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        cove_weight_path = data_handler.read(pretrained_path, return_path=True)

        if torch.cuda.is_available():
            checkpoint = torch.load(cove_weight_path)
        else:
            checkpoint = torch.load(cove_weight_path, map_location="cpu")

        self.rnn.load_state_dict(checkpoint)
        self.residual_embeddings = residual_embeddings
        self.requires_grad = requires_grad
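
To make the constructor concrete, a minimal sketch of the forward pass it sets up; MTLSTM's actual forward is not shown in this example (and the real CoVe implementation packs variable-length sequences before the LSTM), so the body below is an assumption based on the fields initialized above.

# Sketch only, assuming fixed-length inputs.
def forward(self, words):
    embedded = self.word_embedding(words)  # (batch, seq_len, 300)
    outputs, _ = self.rnn(embedded)        # (batch, seq_len, 600): 2-layer BiLSTM
    if self.residual_embeddings:
        outputs = torch.cat([embedded, outputs], dim=-1)
    return outputs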
Example #4
    def build_with_pretrained_file(self, token_counter):
        data_handler = DataHandler(CachePath.VOCAB)
        vocab_texts = data_handler.read(self.pretrained_path)

        if self.pretrained_path.endswith(".txt"):
            predefine_vocab = vocab_texts.split("\n")
        elif self.pretrained_path.endswith(".json"):
            vocab_texts = json.loads(vocab_texts)  # {token: id}
            predefine_vocab = [
                item[0]
                for item in sorted(vocab_texts.items(), key=lambda x: x[1])
            ]
        else:
            raise ValueError("Unsupported vocab extension: expected .txt or .json")

        self.build(token_counter, predefine_vocab=predefine_vocab)
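
For illustration, the two supported file formats with made-up contents; both paths produce predefine_vocab as a list of tokens ordered by id.

import json

# .txt format: one token per line
predefine_vocab = "[PAD]\n[UNK]\nthe".split("\n")

# .json format: {token: id}, reordered by id
vocab_texts = json.loads('{"the": 2, "[PAD]": 0, "[UNK]": 1}')
predefine_vocab = [item[0] for item in sorted(vocab_texts.items(), key=lambda x: x[1])]
assert predefine_vocab == ["[PAD]", "[UNK]", "the"]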
Example #5
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """
    def __init__(self, name, word_tokenizer, config=None):
        super(SubwordTokenizer,
              self).__init__(name,
                             f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config or {}  # avoid a shared mutable default
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
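
A hypothetical instantiation (the stand-in word tokenizer and vocab file name are placeholders): the word tokenizer splits text into words, and each word is further split by the WordpieceTokenizer loaded from vocab_path on first use.

class SimpleWordTokenizer:  # hypothetical stand-in with the interface used above
    cache_name = "word-space"
    def tokenize(self, text):
        return text.split()

tokenizer = SubwordTokenizer(
    "wordpiece",
    SimpleWordTokenizer(),
    config={"vocab_path": "bert-base-uncased-vocab.txt", "unk_token": "[UNK]"},
)
tokenizer._wordpiece("Hello World")         # e.g. ['He', '##llo', 'Wo', '##rld']
tokenizer._wordpiece("Hello", unit="word")  # skips the word-tokenization step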
Example #6
class MRCEnsemble(Machine):
    """
    Machine Reading Comprehension Ensemble

    * Args:
        config: machine_config
    """
    def __init__(self, config):
        super(MRCEnsemble, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "mrc_ensemble")

        self.load()

    @overrides
    def load(self):
        mrc_config = self.config.reading_comprehension

        # Model 1 - BERT-Kor
        self.rc_experiment1 = self.make_module(mrc_config.model_1)
        print("BERT-Kor ready ..! \n")

        # # Model 2 - BERT-Multilingual
        # self.rc_experiment2 = self.make_module(mrc_config.model_2)
        # print("BERT-Multilingual ready ..! \n")

        # # Model 3 - DocQA
        # self.rc_experiment3 = self.make_module(mrc_config.model_3)
        # print("DocQA ready ..! \n")

        # # Model 4 - DrQA
        # self.rc_experiment4 = self.make_module(mrc_config.model_4)
        # print("DrQA ready ..! \n")

        print("All ready ..! \n")

    def evaluate(self, file_path, output_path):
        # KorQuAD dataset...

        # def get_answer_after_clustering(predictions):
        #     categories = {}

        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)

        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }

        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)

        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] += l2["score"]

        #     # # count items then score * 1.n
        #     # for k, v in categories.items():
        #     #     ratio = 1 + (len(v["items"]) / 10)
        #     #     v["score"] *= ratio

        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def get_answer_after_clustering_marginal(predictions):
        #     categories = {}

        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)

        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }

        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)

        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] *= l2["score"]
        #             else:
        #                 categories[l1_text]["score"] *= 0.01  # Default value

        #     # count items then score * 1.n
        #     for k, v in categories.items():
        #         ratio = 1 + (len(v["items"]) / 10)
        #         v["score"] *= ratio

        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def post_processing(text):
        #     # detach josa
        #     # josas = ['은', '는', '이', '가', '을', '를', '과', '와', '이다', '다', '으로', '로', '의', '에']
        #     josas = ["는", "를", "이다", "으로", "에", "이라고", "라고", "와의", "인데"]

        #     for josa in josas:
        #         if text.endswith(josa):
        #             text = text[:-len(josa)]
        #             break

        #     # temperature
        #     if text.endswith("°"):
        #         text += "C"

        #     # etc
        #     special_cases = ["(", ",", "였", "."]
        #     for s in special_cases:
        #         if text.endswith(s):
        #             text = text[:-len(s)]
        #     return text

        def _clean_text(text):
            # https://github.com/allenai/document-qa/blob/2f9fa6878b60ed8a8a31bcf03f802cde292fe48b/docqa/data_processing/text_utils.py#L124
            # be consistent with quotes, and replace \u2014 and \u2212 which I have seen being mapped to UNK
            # by glove word vecs
            return (text.replace("''", '"').replace("``", '"').replace(
                "\u2212", "-").replace("\u2014", "\u2013"))

        predictions = {}
        topk_predictions = {}

        print("Read input_data...")
        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        wrong_count = 0

        print("Start predicting examples...")
        for article in tqdm(squad):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]

                for qa in paragraph["qas"]:
                    question = qa["question"]
                    id_ = qa["id"]

                    # Marginal probabilities...
                    # prediction = self.get_predict_with_marginal(context, question)
                    prediction = self.get_predict(context, question)
                    # print("prediction count:", len(prediction))

                    topk_predictions[id_] = prediction
                    predictions[id_] = prediction[0]["text"]

                    # answer_texts = [q["text"] for q in qa["answers"]]

                    # # 1. Highest value
                    # sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    # prediction_text = sorted_prediction[0]["text"]

                    # # 2. Cluster by text
                    # prediction_text = get_answer_after_clustering_marginal(prediction)
                    # prediction_text = post_processing(prediction_text)

                    # predictions[id_] = prediction_text
                    # if prediction_text not in answer_texts:
                    #     pred_f1_score = metric_max_over_ground_truths(f1_score, prediction_text, answer_texts)

                    #     if pred_f1_score <= 0.5:
                    #         sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    #         print("predict:", json.dumps(sorted_prediction[:5], indent=4, ensure_ascii=False))
                    #         print("predict_text:", prediction_text)
                    #         print("answers:", qa["answers"], "f1:", pred_f1_score)
                    #         print("-"*50)
                    #         wrong_count += 1

                    # is_answer = False
                    # for pred in prediction:
                    #     if pred["text"] in answer_texts:
                    #         predictions[id_] = pred["text"]
                    #         is_answer = True
                    #         break

                    # if not is_answer:
                    #     prediction_text = sorted(prediction, key=lambda x: x["score"], reverse=True)[0]["text"]
                    #     predictions[id_] = prediction_text

                    #     print("predict:", prediction)
                    #     print("predict_text:", prediction_text)
                    #     print("answers:", qa["answers"])
                    #     print("-"*50)
                    #     wrong_count += 1

        print("total_count:", len(predictions), "wrong_count:", wrong_count)

        print("Completed...!")
        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(topk_predictions, indent=4) + "\n")

        # Evaluate
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json
            if "data" in dataset:
                dataset = dataset["data"]
        # with open(output_path) as prediction_file:
        #     predictions = json.load(prediction_file)

        results = evaluate(dataset, predictions)
        print(json.dumps(results))

    def get_predict(self, context, question):
        raw_feature = {"context": context, "question": question}
        # print(raw_feature)

        # Approach 1. Max Prob
        models = [
            (self.rc_experiment1, 0.94),
            # (self.rc_experiment2, 0.90)
            # (self.rc_experiment3, 0.85),
            # (self.rc_experiment4, 0.84),
        ]
        # models = [self.rc_experiment3, self.rc_experiment4]

        model = models[0][0]
        return sorted(model.predict(raw_feature),
                      key=lambda x: x["score"],
                      reverse=True)
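
As written, get_predict only queries the first (model, weight) pair and never uses the weight; the commented-out entries and the clustering helpers above hint at the intended ensembling. Purely as an assumption, a weighted score merge across models might look like the sketch below.

    # Hypothetical sketch, not the repository's implementation.
    def get_predict_ensemble(self, context, question):
        raw_feature = {"context": context, "question": question}
        merged = []
        for model, weight in [(self.rc_experiment1, 0.94)]:  # extend with more (model, weight) pairs
            for pred in model.predict(raw_feature):
                merged.append({"text": pred["text"], "score": pred["score"] * weight})
        return sorted(merged, key=lambda x: x["score"], reverse=True)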
Example #7
    def build_with_pretrained_file(self, token_counter):
        data_handler = DataHandler(CachePath.VOCAB)
        vocab_texts = data_handler.read(self.pretrained_path)
        predefine_vocab = vocab_texts.split("\n")

        self.build(token_counter, predefine_vocab=predefine_vocab)
Example #8
class WordEmbedding(TokenEmbedding):
    """
    Word Embedding
    Default Token Embedding

    * Args:
        vocab: Vocab (claf.tokens.vocab)

    * Kwargs:
        dropout: dropout probability applied to the embedded output
        embed_dim: dimension of the embedding vectors
        padding_idx: If given, pads the output with the embedding vector at padding_idx
            (initialized to zeros) whenever it encounters the index.
        max_norm: If given, will renormalize the embedding vectors to have a norm less
            than this before extracting. Note: this will modify weight in-place.
        norm_type: The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq: If given, this will scale gradients by the inverse of
            frequency of the words in the mini-batch. Default False.
        sparse: if True, gradient w.r.t. weight will be a sparse tensor.
            See Notes under torch.nn.Embedding for more details regarding sparse gradients.
        pretrained_path: path to a pretrained vector file (e.g. GloVe)
        trainable: if True, fine-tune the embeddings; otherwise keep them fixed
    """

    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)

        self.embed_dim = embed_dim
        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # Optional parameters forwarded to nn.functional.embedding:
        #   (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
        # See https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

    def _init_weight(self, trainable=True):
        weight = torch.FloatTensor(self.get_vocab_size(), self.embed_dim)
        weight = torch.nn.Parameter(weight, requires_grad=trainable)
        torch.nn.init.xavier_uniform_(weight)
        return weight

    @overrides
    def forward(self, words):
        input_size = words.size()
        if len(input_size) > 2:
            words = words.view(-1, input_size[-1])

        embedded_words = F.embedding(
            words,
            self.weight,
            padding_idx=self.padding_idx,
            max_norm=self.max_norm,
            norm_type=self.norm_type,
            scale_grad_by_freq=self.scale_grad_by_freq,
            sparse=self.sparse,
        )

        if len(input_size) > 2:
            embedded_size = list(input_size) + [embedded_words.size(-1)]
            embedded_words = embedded_words.view(*embedded_size)
        return self.dropout(embedded_words)

    def _read_pretrained_file(self, file_path):
        words_to_keep = set(self.vocab.get_all_tokens())
        vocab_size = self.get_vocab_size()
        embeddings = {}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading embeddings from file")
        file_path = self.data_handler.read(file_path, return_path=True)
        with open(file_path, "rb") as embeddings_file:
            for line in embeddings_file:
                fields = line.decode("utf-8").rstrip().split(" ")

                if len(fields) - 1 != self.embed_dim:
                    logger.info(
                        f"Found line with wrong number of dimensions (expected {self.embed_dim}, was {len(fields) - 1}): {line}"
                    )
                    continue

                word = fields[0]
                if word in words_to_keep:
                    vector = np.asarray(fields[1:], dtype="float32")
                    embeddings[word] = vector

        if not embeddings:
            raise ValueError(
                "No embeddings of the correct dimension found; check the embed_dim value"
            )

        all_embeddings = np.asarray(list(embeddings.values()))
        embeddings_mean = float(np.mean(all_embeddings))
        embeddings_std = float(np.std(all_embeddings))
        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size, self.embed_dim).normal_(
            embeddings_mean, embeddings_std
        )

        match_count = 0
        for i in range(0, vocab_size):
            word = self.vocab.get_token(i)
            if word in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[word])
                match_count += 1
            else:
                # f"Word {word} was not found in the embedding file. Initialising randomly."
                pass
        logger.info(f"Match embedding vocab size: {match_count}.  [{match_count}/{vocab_size}]")
        return embedding_matrix

    @overrides
    def get_output_dim(self):
        return self.embed_dim
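
For reference, _read_pretrained_file expects the plain-text GloVe format: each line is a token followed by embed_dim space-separated vector components. A made-up example line, parsed the same way as the loader above:

import numpy as np

line = b"the 0.418 0.24968 -0.41242\n"  # hypothetical GloVe line with embed_dim == 3
fields = line.decode("utf-8").rstrip().split(" ")
word, vector = fields[0], np.asarray(fields[1:], dtype="float32")
assert len(vector) == 3  # checked against self.embed_dim in the loader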