Example #1
class BPETokenizer(Tokenizer):
    """
    BPE (Byte-Pair Encoding) Tokenizer
    text -> ...
    * Args:
        name: tokenizer name [roberta]
    """
    def __init__(self, name, config={}):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config

        self.bpe_tokenizer = None

    """ Tokenizers """

    def _roberta(self, text, unit="text"):
        """
        ex)
        """
        if self.bpe_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"],
                                                 return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path,
                                                  **self.config)

        return self.bpe_tokenizer._tokenize(text)
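The key pattern in _roberta is lazy one-time initialization: the heavyweight tokenizer object is built on the first call and reused afterwards. A minimal, self-contained sketch of that pattern (LazyTokenizer and the lambda stand-in are illustrative only, not part of claf):

class LazyTokenizer:
    def __init__(self, config):
        self.config = config
        self.bpe_tokenizer = None

    def tokenize(self, text):
        if self.bpe_tokenizer is None:
            # stand-in for RobertaTokenizer(vocab_path, merges_path, **self.config)
            self.bpe_tokenizer = lambda t: t.split()
        return self.bpe_tokenizer(text)

print(LazyTokenizer({}).tokenize("Hello world"))  # ['Hello', 'world']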
Example #2
    def __init__(
        self,
        vocab,
        options_file=DEFAULT_OPTIONS_FILE,
        weight_file=DEFAULT_WEIGHT_FILE,
        do_layer_norm=False,
        dropout=0.5,
        trainable=False,
        project_dim=None,
    ):
        super(ELMoEmbedding, self).__init__(vocab)
        data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        option_path = data_handler.read(options_file, return_path=True)
        weight_path = data_handler.read(weight_file, return_path=True)

        self.elmo = Elmo(option_path,
                         weight_path,
                         1,
                         requires_grad=trainable,
                         dropout=dropout)

        self.project_dim = project_dim
        self.project_linear = None
        if project_dim:
            self.project_linear = nn.Linear(self.elmo.get_output_dim(),
                                            project_dim)
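When project_dim is set, the ELMo output is passed through the linear layer to reduce its dimensionality. A toy sketch of that projection step (the dimensions below are assumptions for illustration; 1024 is only a typical ELMo output size):

import torch
import torch.nn as nn

elmo_output_dim, project_dim = 1024, 300
project_linear = nn.Linear(elmo_output_dim, project_dim)
contextual = torch.randn(2, 7, elmo_output_dim)   # batch x seq_len x elmo_dim
print(project_linear(contextual).shape)           # torch.Size([2, 7, 300])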
Example #3
    def __init__(
        self, word_embedding, pretrained_path=None, requires_grad=False, residual_embeddings=False
    ):
        """Initialize an MTLSTM.

        Arguments:
            n_vocab (int): If not None, initialize MTLSTM with an embedding matrix with n_vocab vectors
            vectors (Float Tensor): If not None, initialize embedding matrix with specified vectors
            residual_embeddings (bool): If True, concatenate the input embeddings with MTLSTM outputs during forward
        """
        super(MTLSTM, self).__init__()
        self.word_embedding = word_embedding
        self.rnn = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)

        data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        cove_weight_path = data_handler.read(pretrained_path, return_path=True)

        if torch.cuda.is_available():
            checkpoint = torch.load(cove_weight_path)
        else:
            checkpoint = torch.load(cove_weight_path, map_location="cpu")

        self.rnn.load_state_dict(checkpoint)
        self.residual_embeddings = residual_embeddings
        self.requires_grad = requires_grad
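The CUDA branch above only changes how the checkpoint tensors are mapped at load time. The same device-aware loading can be factored into a small helper; a minimal sketch (load_checkpoint is a hypothetical name, not part of the library):

import torch

def load_checkpoint(path):
    # map tensors to CPU when no GPU is available, as in the constructor above
    if torch.cuda.is_available():
        return torch.load(path)
    return torch.load(path, map_location="cpu")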
Example #4
    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer,
              self).__init__(name,
                             f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None
Example #5
    def __init__(self, file_paths, dataset_obj):
        self.file_paths = file_paths
        self.dataset_obj = dataset_obj

        self.data_handler = DataHandler(
            cache_path=CachePath.DATASET)  # for Concrete DataReader
        self.text_columns = None
Example #6
    def build_with_pretrained_file(self, token_counter):
        data_handler = DataHandler(CachePath.VOCAB)
        vocab_texts = data_handler.read(self.pretrained_path)

        if self.pretrained_path.endswith(".txt"):
            predefine_vocab = vocab_texts.split("\n")
        elif self.pretrained_path.endswith(".json"):
            vocab_texts = json.loads(vocab_texts)  # {token: id}
            predefine_vocab = [
                item[0]
                for item in sorted(vocab_texts.items(), key=lambda x: x[1])
            ]
        else:
            raise ValueError(f"support vocab extention. .txt or .json")

        self.build(token_counter, predefine_vocab=predefine_vocab)
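The .json branch recovers the token order by sorting the {token: id} mapping by id. The same idiom in isolation, with toy data instead of a real vocab file:

import json

vocab_texts = json.dumps({"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3})
mapping = json.loads(vocab_texts)  # {token: id}
predefine_vocab = [token for token, _ in sorted(mapping.items(), key=lambda x: x[1])]
assert predefine_vocab == ["[PAD]", "[UNK]", "hello", "world"]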
Example #7
    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)

        self.embed_dim = embed_dim
        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # nn.functional.embedding optional parameters:
        #   (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
        # see https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse
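The constructor only stores these optional embedding parameters; they are later forwarded to nn.functional.embedding. A standalone toy call showing how the stored values are typically passed through:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)                      # vocab_size x embed_dim
indices = torch.tensor([[1, 2, 0], [3, 0, 0]])   # batch x seq_len
embedded = F.embedding(indices, weight, padding_idx=0, max_norm=None,
                       norm_type=2, scale_grad_by_freq=False, sparse=False)
print(embedded.shape)  # torch.Size([2, 3, 4])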
Example #8
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[subword tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """
    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer,
              self).__init__(name,
                             f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
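The data flow of _wordpiece is word tokenization followed by subword tokenization of each word. The same two-stage flow with stand-in tokenizers (the classes below are toys, not the real word tokenizer or WordpieceTokenizer):

class _WhitespaceTokenizer:
    def tokenize(self, text):
        return text.split()

class _CharPairTokenizer:
    def tokenize(self, word):
        # toy "subword" split: consecutive character pairs
        return [word[i:i + 2] for i in range(0, len(word), 2)]

word_tok, sub_tok = _WhitespaceTokenizer(), _CharPairTokenizer()
tokens = [sub for word in word_tok.tokenize("Hello World") for sub in sub_tok.tokenize(word)]
print(tokens)  # ['He', 'll', 'o', 'Wo', 'rl', 'd']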
Example #9
    def __init__(self, name, config={}):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config

        self.bpe_tokenizer = None
Example #10
    def __init__(self, token_makers, lazy_indexing=True):
        self.token_makers = token_makers
        self.lazy_indexing = lazy_indexing

        self.data_handler = DataHandler(cache_path=CachePath.TOKEN_COUNTER)
Example #11
class TextHandler:
    """
    Text Handler

    - vocab and token_counter
    - raw_features -> indexed_features
    - raw_features -> tensor

    * Args:
        token_makers: Dictionary consisting of
            - key: token_name
            - value: TokenMaker (claf.tokens.token_maker)

    * Kwargs:
        lazy_indexing: Apply `Lazy Evaluation` to text indexing
    """

    def __init__(self, token_makers, lazy_indexing=True):
        self.token_makers = token_makers
        self.lazy_indexing = lazy_indexing

        self.data_handler = DataHandler(cache_path=CachePath.TOKEN_COUNTER)

    def build_vocabs(self, token_counters):
        logger.info("Start build vocab")
        vocab_start_time = time.time()

        vocabs = {}
        for token_name, token_maker in self.token_makers.items():
            is_defined_config = type(token_maker.vocab_config) == dict
            if is_defined_config:
                token_counter = token_counters[token_name]
                vocab = self._build_vocab_with_config(token_name, token_maker, token_counter)
            else:
                vocab = Vocab(token_name)
                vocab.init()

            vocabs[token_name] = vocab
            logger.info(
                f" => {token_name} vocab size: {len(vocab)}  (use predefine vocab: {vocab.pretrained_path is not None})"
            )

        vocab_elapsed_time = time.time() - vocab_start_time
        logger.info(f"Complete build vocab...  elapsed_time: {vocab_elapsed_time}\n")

        # Setting Indexer (vocab)
        for token_name, token_maker in self.token_makers.items():
            token_maker.set_vocab(vocabs[token_name])
        return vocabs

    def _build_vocab_with_config(self, token_name, token_maker, token_counter):
        token_maker.vocab_config["token_name"] = token_name
        vocab = Vocab(**token_maker.vocab_config)

        if vocab.pretrained_path is not None:
            vocab.build_with_pretrained_file(token_counter)
        else:
            vocab.build(token_counter)
        return vocab

    def is_all_vocab_use_pretrained(self):
        for token_name, token_maker in self.token_makers.items():
            if token_maker.vocab_config.get("pretrained_path", None) is None:
                return False
            if token_maker.vocab_config.get("pretrained_token", "") != Vocab.PRETRAINED_ALL:
                return False
        return True

    def make_token_counters(self, texts, config=None):
        token_counters = {}
        for token_name, token_maker in self.token_makers.items():
            token_vocab_config = token_maker.vocab_config
            if type(token_vocab_config) == dict:
                if token_vocab_config.get("pretrained_token", None) == Vocab.PRETRAINED_ALL:
                    texts = [
                        ""
                    ]  # do not use token_counter from dataset -> make empty token_counter

            token_counter = self._make_token_counter(
                texts, token_maker.tokenizer, config=config, desc=f"{token_name}-vocab"
            )
            logger.info(f" * {token_name} token_counter size: {len(token_counter)}")

            token_counters[token_name] = token_counter
        return token_counters

    def _make_token_counter(self, texts, tokenizer, config=None, desc=None):
        tokenizer_name = tokenizer.name

        cache_token_counter = None
        if config is not None:
            data_reader_config = config.data_reader
            cache_token_counter = self.data_handler.cache_token_counter(
                data_reader_config, tokenizer_name
            )

        if cache_token_counter:
            return cache_token_counter
        else:
            tokens = [
                token for text in tqdm(texts, desc=desc) for token in tokenizer.tokenize(text)
            ]
            flatten_list = list(common_utils.flatten(tokens))
            token_counter = Counter(flatten_list)

            if config is not None:  # Cache TokenCounter
                self.data_handler.cache_token_counter(
                    data_reader_config, tokenizer_name, obj=token_counter
                )
            return token_counter

    def index(self, datas, text_columns):
        logger.info(f"Start token indexing, Lazy: {self.lazy_indexing}")
        indexing_start_time = time.time()

        for data_type, data in datas.items():
            if type(data) == list:
                # Multi-Data Indexing
                for d in data:
                    self._index_features(
                        d.features, text_columns, desc=f"indexing features ({data_type})"
                    )
            else:
                self._index_features(
                    data.features, text_columns, desc=f"indexing features ({data_type})"
                )

        indexing_elapsed_time = time.time() - indexing_start_time
        logger.info(f"Complete token indexing... elapsed_time: {indexing_elapsed_time}\n")

    def _index_features(self, features, text_columns, desc=None, suppress_tqdm=False):
        for feature in tqdm(features, desc=desc, disable=suppress_tqdm):
            for key, text in feature.items():
                if key not in text_columns:
                    continue

                # Set data_type (text => {"text": ..., "token1": ..., ...})
                if type(feature[key]) != dict:
                    feature[key] = {"text": text}
                if type(text) == dict:
                    text = text["text"]

                for token_name, token_maker in self.token_makers.items():
                    param_key = token_maker.indexer.param_key
                    if param_key == key:
                        continue

                    feature[key][token_name] = self._index_token(token_maker, text, feature)

    def _index_token(self, token_maker, text, data):
        def index():
            indexer = token_maker.indexer
            params = {}
            if token_maker.type_name == TokenMaker.EXACT_MATCH_TYPE:
                param_text = data[indexer.param_key]
                if type(param_text) == dict:
                    param_text = param_text["text"]
                params["query_text"] = param_text
            return indexer.index(text, **params)

        if self.lazy_indexing:
            return index
        else:
            return index()

    def raw_to_tensor_fn(self, data_reader, cuda_device=None, helper={}):
        def raw_to_tensor(inputs):
            is_one = True  # batch_size 1 flag
            feature, _helper = data_reader.read_one_example(inputs)

            nonlocal helper
            helper.update(_helper)

            if type(feature) == list:
                is_one = False
                features = feature
            else:
                features = [feature]

            self._index_features(features, data_reader.text_columns, suppress_tqdm=True)

            if is_one:
                indexed_features = features[0]
            else:  # when features > 1, need to transpose (dict_of_list -> list_of_dict)
                indexed_features = {}
                for key in features[0]:
                    feature_with_key = [feature[key] for feature in features]
                    indexed_features[key] = transpose(feature_with_key, skip_keys=["text"])

            for key in indexed_features:
                for token_name in self.token_makers:
                    if token_name not in indexed_features[key]:
                        continue

                    indexed_values = indexed_features[key][token_name]
                    if is_one:
                        indexed_values = [indexed_values]

                    tensor = padding_tokens(indexed_values, token_name=token_name)
                    if cuda_device is not None and type(tensor) != list:
                        tensor = tensor.cuda(cuda_device)
                    indexed_features[key][token_name] = tensor

            for key in indexed_features:
                if "text" in indexed_features[key]:
                    del indexed_features[key]["text"]

            return indexed_features, helper

        return raw_to_tensor
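_index_token implements lazy indexing by returning a closure instead of the indexed value, so the caller decides when (or whether) to run it. The core of that idea, reduced to a standalone sketch (make_indexer and the ord-based "indexing" are illustrative only):

def make_indexer(value, lazy=True):
    def index():
        return [ord(ch) for ch in value]   # stand-in for the real indexer
    return index if lazy else index()

lazy_result = make_indexer("abc", lazy=True)
print(callable(lazy_result), lazy_result())   # True [97, 98, 99]
print(make_indexer("abc", lazy=False))        # [97, 98, 99]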
Example #12
    def __init__(self, config):
        super(OpenQA, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "open_qa")

        self.load()
Example #13
    def __init__(self, config):
        super(NLU, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "nlu")

        self.load()
Example #14
    def build_with_pretrained_file(self, token_counter):
        data_handler = DataHandler(CachePath.VOCAB)
        vocab_texts = data_handler.read(self.pretrained_path)
        predefine_vocab = vocab_texts.split("\n")

        self.build(token_counter, predefine_vocab=predefine_vocab)
Example #15
class MRCEnsemble(Machine):
    """
    Machine Reading Comprehension Ensemble

    * Args:
        config: machine_config
    """
    def __init__(self, config):
        super(MRCEnsemble, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "mrc_ensemble")

        self.load()

    @overrides
    def load(self):
        mrc_config = self.config.reading_comprehension

        # Model 1 - BERT-Kor
        self.rc_experiment1 = self.make_module(mrc_config.model_1)
        print("BERT-Kor ready ..! \n")

        # # Model 2 - BERT-Multilingual
        # self.rc_experiment2 = self.make_module(mrc_config.model_2)
        # print("BERT-Multilingual ready ..! \n")

        # # Model 3 - DocQA
        # self.rc_experiment3 = self.make_module(mrc_config.model_3)
        # print("DocQA ready ..! \n")

        # # Model 4 - DrQA
        # self.rc_experiment4 = self.make_module(mrc_config.model_4)
        # print("DrQA ready ..! \n")

        print("All ready ..! \n")

    def evaluate(self, file_path, output_path):
        # KorQuAD dataset...

        # def get_answer_after_clustering(predictions):
        # categories = {}

        # for l1 in predictions:
        # l1_text = l1["text"]
        # l1_text_normalized = normalize_answer(l1_text)

        # categories[l1_text] = {
        # "items": [],
        # "score": 0
        # }

        # for l2 in predictions:
        # l2_text = l2["text"]
        # l2_text_normalized = normalize_answer(l2_text)

        # if l1_text_normalized in l2_text_normalized:
        # categories[l1_text]["items"].append(l2)
        # categories[l1_text]["score"] += l2["score"]

        # # # count items then score * 1.n
        # # for k, v in categories.items():
        # # ratio = 1 + (len(v["items"]) / 10)
        # # v["score"] *= ratio

        # highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        # answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        # return answer_text

        # def get_answer_after_clustering_marginal(predictions):
        # categories = {}

        # for l1 in predictions:
        # l1_text = l1["text"]
        # l1_text_normalized = normalize_answer(l1_text)

        # categories[l1_text] = {
        # "items": [],
        # "score": 0
        # }

        # for l2 in predictions:
        # l2_text = l2["text"]
        # l2_text_normalized = normalize_answer(l2_text)

        # if l1_text_normalized in l2_text_normalized:
        # categories[l1_text]["items"].append(l2)
        # categories[l1_text]["score"] *= l2["score"]
        # else:
        # categories[l1_text]["score"] *= 0.01  # Default value

        # # count items then score * 1.n
        # for k, v in categories.items():
        # ratio = 1 + (len(v["items"]) / 10)
        # v["score"] *= ratio

        # highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        # answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        # return answer_text

        # def post_processing(text):
        # # detach josa
        # # josas = ['은', '는', '이', '가', '을', '를', '과', '와', '이다', '다', '으로', '로', '의', '에']
        # josas = ["는", "를", "이다", "으로", "에", "이라고", "라고", "와의", "인데"]

        # for josa in josas:
        # if text.endswith(josa):
        # text = text[:-len(josa)]
        # break

        # # temperature
        # if text.endswith("°"):
        # text += "C"

        # # etc
        # special_cases = ["(", ",", "였", "."]
        # for s in special_cases:
        # if text.endswith(s):
        # text = text[:-len(s)]
        # return text

        def _clean_text(text):
            # https://github.com/allenai/document-qa/blob/2f9fa6878b60ed8a8a31bcf03f802cde292fe48b/docqa/data_processing/text_utils.py#L124
            # be consistent with quotes, and replace \u2014 and \u2212 which I have seen being mapped to UNK
            # by glove word vecs
            return (text.replace("''", '"').replace("``", '"').replace(
                "\u2212", "-").replace("\u2014", "\u2013"))

        predictions = {}
        topk_predictions = {}

        print("Read input_data...")
        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        wrong_count = 0

        print("Start predict 1-examples...")
        for article in tqdm(squad):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]

                for qa in paragraph["qas"]:
                    question = qa["question"]
                    id_ = qa["id"]

                    # Marginal probabilities...
                    # prediction = self.get_predict_with_marginal(context, question)
                    prediction = self.get_predict(context, question)
                    # print("prediction count:", len(prediction))

                    topk_predictions[id_] = prediction
                    predictions[id_] = prediction[0]["text"]

                    # answer_texts = [q["text"] for q in qa["answers"]]

                    # # 1. Highest value
                    # sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    # prediction_text = sorted_prediction[0]["text"]

                    # 2. Cluster by text
                    # prediction_text = get_answer_after_clustering_marginal(prediction)
                    # prediction_text = post_processing(prediction_text)

                    # predictions[id_] = prediction_text
                    # if prediction_text not in answer_texts:
                    # pred_f1_score = metric_max_over_ground_truths(f1_score, prediction_text, answer_texts)

                    # if pred_f1_score <= 0.5:
                    # sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    # print("predict:", json.dumps(sorted_prediction[:5], indent=4, ensure_ascii=False))
                    # print("predict_text:", prediction_text)
                    # print("answers:", qa["answers"], "f1:", pred_f1_score)
                    # print("-"*50)
                    # wrong_count += 1

                    # is_answer = False
                    # for pred in prediction:
                    # if pred["text"] in answer_texts:
                    # predictions[id_] = pred["text"]
                    # is_answer = True
                    # break

                    # if not is_answer:
                    # prediction_text = sorted(prediction, key=lambda x: x["score"], reverse=True)[0]["text"]
                    # predictions[id_] = prediction_text

                    # print("predict:", prediction)
                    # print("predict_text:", prediction_text)
                    # print("answers:", qa["answers"])
                    # print("-"*50)
                    # wrong_count += 1

        print("total_count:", len(predictions), "wrong_count:", wrong_count)

        print("Completed...!")
        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(topk_predictions, indent=4) + "\n")

        # Evaluate
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json
            if "data" in dataset:
                dataset = dataset["data"]
        # with open(output_path) as prediction_file:
        # predictions = json.load(prediction_file)

        results = evaluate(dataset, predictions)
        print(json.dumps(results))

    def get_predict(self, context, question):
        raw_feature = {"context": context, "question": question}
        # print(raw_feature)

        # Approach 1. Max Prob
        models = [
            (self.rc_experiment1, 0.94),
            # (self.rc_experiment2, 0.90)
            # (self.rc_experiment3, 0.85),
            # (self.rc_experiment4, 0.84),
        ]
        # models = [self.rc_experiment3, self.rc_experiment4]

        model = models[0][0]
        return sorted(model.predict(raw_feature),
                      key=lambda x: x["score"],
                      reverse=True)
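get_predict returns the selected model's candidate answers sorted by score, best first. The same ranking step with toy predictions:

predictions = [{"text": "Seoul", "score": 0.72}, {"text": "Busan", "score": 0.91}]
best = sorted(predictions, key=lambda x: x["score"], reverse=True)[0]["text"]
print(best)  # Busan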
Example #16
class OpenQA(Machine):
    """
    Open-Domain Question Answer Machine (DrQA)

    DrQA is a system for reading comprehension applied to open-domain question answering.

    * Args:
        config: machine_config
    """
    def __init__(self, config):
        super(OpenQA, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "open_qa")

        self.load()

    @overrides
    def load(self):
        # Tokenizers
        tokenizers_config = convert_config2dict(self.config.tokenizers)
        tokenizers = make_all_tokenizers(tokenizers_config)

        # Knowledge Base
        # - Wiki
        knowledge_base_config = self.config.knowledge_base
        self.docs, doc_name = self._load_knowledge_base(knowledge_base_config)

        # Reasoning
        # - Document Retrieval
        # - Reading Comprehension Experiment
        reasoning_config = self.config.reasoning

        self.document_retrieval = self._load_document_retrieval(
            reasoning_config.document_retrieval,
            tokenizers["word"],
            basename=doc_name)
        self.rc_experiment = self.make_module(
            reasoning_config.reading_comprehension)
        print("Ready ..! \n")

    def _load_knowledge_base(self, config):
        docs = read_wiki_articles(config.wiki)  # TODO: fix read whole wiki
        doc_name = f"{os.path.basename(config.wiki)}-{len(docs)}-articles"
        return docs, doc_name

    def _load_document_retrieval(self,
                                 config,
                                 word_tokenizer,
                                 basename="docs"):
        dir_path = f"doc-{config.type}-{config.name}-{word_tokenizer.cache_name}"
        doc_retrieval_path = os.path.join(dir_path, basename)

        config.params = {
            "texts": [doc.title for doc in self.docs],
            "word_tokenizer": word_tokenizer,
        }
        document_retrieval = self.make_module(config)

        doc_retrieval_path = self.data_handler.convert_cache_path(
            doc_retrieval_path)
        if doc_retrieval_path.exists():
            document_retrieval.load(doc_retrieval_path)
        else:
            print("Start Document Retrieval Indexing ...")
            document_retrieval.init()
            document_retrieval.save(doc_retrieval_path)  # Save Cache
        print("Completed!")
        return document_retrieval

    @overrides
    def __call__(self, question):
        result_docs = self.search_documents(question)
        print("-" * 50)
        print("Doc Scores:")
        for doc in result_docs:
            print(f" - {doc[1]} : {doc[2]}")
        print("-" * 50)

        passages = []
        for result_doc in result_docs:
            doc_index = result_doc[0]
            doc = self.docs[doc_index]
            passages.append(doc.text)

        answers = []
        for passage in passages:
            answer_text = self.machine_reading(passage, question)
            answers.append(answer_text)

        ranked_answers = sorted(answers,
                                key=lambda x: x["score"],
                                reverse=True)
        return ranked_answers

    def search_documents(self, question):
        return self.document_retrieval.get_closest(question)

    def machine_reading(self, context, question):
        raw_feature = {"context": context, "question": question}
        return self.rc_experiment.predict(raw_feature)
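__call__ chains three steps: retrieve candidate documents, run reading comprehension on each passage, then rank the answers by score. A stand-in sketch of that data flow (the retriever and reader below are dummies, not the claf modules):

docs = ["Paris is the capital of France.", "Berlin is the capital of Germany."]

def search_documents(question):
    # dummy retriever: (doc_index, title, score) tuples, best first
    return [(0, "France", 0.9), (1, "Germany", 0.2)]

def machine_reading(passage, question):
    # dummy reader: pretend the first word is the answer span
    return {"text": passage.split()[0], "score": 1.0 if "France" in passage else 0.3}

question = "What is the capital of France?"
passages = [docs[idx] for idx, _, _ in search_documents(question)]
answers = [machine_reading(p, question) for p in passages]
print(sorted(answers, key=lambda x: x["score"], reverse=True)[0]["text"])  # Paris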
Example #17
    def __init__(self, config):
        super(MRCEnsemble, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "mrc_ensemble")

        self.load()
Example #18
class WordEmbedding(TokenEmbedding):
    """
    Word Embedding
    Default Token Embedding

    * Args:
        vocab: Vocab (claf.tokens.vocab)

    * Kwargs:
        dropout: dropout probability
        embed_dim: embedding dimension (size of each embedding vector)
        padding_idx: If given, pads the output with the embedding vector at padding_idx
            (initialized to zeros) whenever it encounters the index.
        max_norm: If given, will renormalize the embedding vectors to have a norm less
            than this before extracting. Note: this will modify weight in-place.
        norm_type: The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq: if given, this will scale gradients by the inverse of
            frequency of the words in the mini-batch. Default False.
        sparse: if True, gradient w.r.t. weight will be a sparse tensor.
            See Notes under torch.nn.Embedding for more details regarding sparse gradients.
        pretrained_path: pretrained vector path (eg. GloVe)
        trainable: finetune or fixed
    """

    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)

        self.embed_dim = embed_dim
        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # nn.functional.embedding optional parameters:
        #   (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
        # see https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

    def _init_weight(self, trainable=True):
        weight = torch.FloatTensor(self.get_vocab_size(), self.embed_dim)
        weight = torch.nn.Parameter(weight, requires_grad=trainable)
        torch.nn.init.xavier_uniform_(weight)
        return weight

    @overrides
    def forward(self, words):
        input_size = words.size()
        if len(input_size) > 2:
            words = words.view(-1, input_size[-1])

        embedded_words = F.embedding(
            words,
            self.weight,
            padding_idx=self.padding_idx,
            max_norm=self.max_norm,
            norm_type=self.norm_type,
            scale_grad_by_freq=self.scale_grad_by_freq,
            sparse=self.sparse,
        )

        if len(input_size) > 2:
            embedded_size = list(input_size) + [embedded_words.size(-1)]
            embedded_words = embedded_words.view(*embedded_size)
        return self.dropout(embedded_words)

    def _read_pretrained_file(self, file_path):
        words_to_keep = set(self.vocab.get_all_tokens())
        vocab_size = self.get_vocab_size()
        embeddings = {}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading embeddings from file")
        file_path = self.data_handler.read(file_path, return_path=True)
        with open(file_path, "rb") as embeddings_file:
            for line in embeddings_file:
                fields = line.decode("utf-8").rstrip().split(" ")

                if len(fields) - 1 != self.embed_dim:
                    logger.info(
                        f"Found line with wrong number of dimensions (expected {self.embed_dim}, was {len(fields) - 1}): {line}"
                    )
                    continue

                word = fields[0]
                if word in words_to_keep:
                    vector = np.asarray(fields[1:], dtype="float32")
                    embeddings[word] = vector

        if not embeddings:
            raise ValueError(
                "No embeddings of the correct dimension were found; check the embed_dim value"
            )

        all_embeddings = np.asarray(list(embeddings.values()))
        embeddings_mean = float(np.mean(all_embeddings))
        embeddings_std = float(np.std(all_embeddings))
        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size, self.embed_dim).normal_(
            embeddings_mean, embeddings_std
        )

        match_count = 0
        for i in range(0, vocab_size):
            word = self.vocab.get_token(i)
            if word in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[word])
                match_count += 1
            else:
                # f"Word {word} was not found in the embedding file. Initialising randomly."
                pass
        logger.info(f"Match embedding vocab size: {match_count}.  [{match_count}/{vocab_size}]")
        return embedding_matrix

    @overrides
    def get_output_dim(self):
        return self.embed_dim
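forward flattens any extra leading dimensions before calling F.embedding and restores them afterwards, so inputs such as batch x seq_len x word_len also work. The same reshape trick in isolation, with toy sizes:

import torch
import torch.nn.functional as F

weight = torch.randn(20, 8)                  # vocab_size x embed_dim
words = torch.randint(0, 20, (2, 5, 7))      # batch x seq_len x word_len
flat = words.view(-1, words.size(-1))        # (2*5) x 7
embedded = F.embedding(flat, weight)
embedded = embedded.view(*words.size(), weight.size(-1))
print(embedded.shape)  # torch.Size([2, 5, 7, 8])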