Example #1
    def make_x(self, answer, relation):
        parser = get_spacy_parser()

        # NOTE: the question-encoding branch below belongs to an older
        # input layout and is kept commented out for reference only.
        # question = datautils.clean_question(question)
        # q_parsed = parser(question)
        # q_vect = list(
        #     map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG], q_parsed))
        # pad_length = self.MAX_LENGTH - len(q_vect)
        # nil_X = 0
        # x_question = (q_vect + (pad_length) * [nil_X])
        # x_question = np.array(x_question)

        # tokenize the cleaned answer and map each token to its vocabulary
        # index, falling back to the UNK index for out-of-vocabulary tokens
        answer = datautils.clean_question(answer)
        a_parsed = parser(answer)
        a_vect = [
            self.w2idx.get(t.text, self.w2idx[c.UNK_TAG]) for t in a_parsed
        ]

        # right-pad with the nil index up to the fixed input length
        pad_length = self.MAX_LENGTH - len(a_vect)
        nil_X = 0
        x_answer = np.array(a_vect + pad_length * [nil_X])

        # one-hot encode the relation and return both input vectors
        x_rel = datautils.to_ohenc(c.RELATIONS.index(relation),
                                   len(c.RELATIONS))
        return np.array([x_answer, x_rel])
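
Both make_x variants rely on datautils.to_ohenc(index, size) for the relation encoding. Its implementation is not part of these snippets; a minimal sketch of such a one-hot helper, assuming it returns a NumPy vector (the real datautils version may differ):

import numpy as np

def to_ohenc(idx, size):
    # hypothetical stand-in for datautils.to_ohenc: a one-hot vector
    # of length `size` with a single 1.0 at position `idx`
    vec = np.zeros(size)
    vec[idx] = 1.0
    return vec

# to_ohenc(2, 4) -> array([0., 0., 1., 0.])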
Example #2
    def make_XY(self, sents_relations, concepts):
        """Build the model inputs X and the entity-tag targets Y.

        :param sents_relations: list of pairs (question, relation)
        :param concepts: list of pairs (c1, c2)
        :return: (X, Y); X is a list of two arrays (see below), Y an array
            of per-token one-hot tag sequences
        """
        parser = get_spacy_parser()
        X, Y = [], []
        for (sent, relation), c_list in zip(sents_relations, concepts):
            x = self.make_x(sent, relation)

            question = datautils.clean_question(sent)
            q_parsed = parser(question)
            c1, c2 = datautils.clean_concept(
                c_list[0]), datautils.clean_concept(c_list[1])

            # find the character offsets of the two concept mentions
            c1_idx = question.find(c1)
            assert c1_idx != -1
            assert question[c1_idx:c1_idx + len(c1)] == c1

            # mask the first mention with '#' so that searching for c2
            # cannot match inside c1; the mask preserves all offsets
            c2_idx = (question[:c1_idx] + "#" * len(c1) +
                      question[c1_idx + len(c1):]).find(c2)
            if c2_idx != -1:
                assert question[c2_idx:c2_idx + len(c2)] == c2

            # tag each token of the question: LEFT if it falls inside the
            # first mention, RIGHT if inside the second, N (non-entity)
            # otherwise
            def tag_token(t):
                if c1_idx <= t.idx and t.idx + len(t) <= c1_idx + len(c1):
                    tag = c.LEFT_ENT_TAG
                elif (c2_idx != -1 and c2_idx <= t.idx
                      and t.idx + len(t) <= c2_idx + len(c2)):
                    tag = c.RIGHT_ENT_TAG
                else:
                    tag = c.N_ENT_TAG
                return datautils.to_ohenc(c.entity_tags.index(tag),
                                          len(c.entity_tags))

            tags = [tag_token(t) for t in q_parsed]

            # right-pad the tag sequence with NIL up to the fixed length
            nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG),
                                       len(c.entity_tags))
            pad_length = self.MAX_LENGTH - len(tags)
            y = tags + pad_length * [nil_Y]

            X.append(np.array(x))
            Y.append(np.array(y))

        # transpose X into a list of two arrays:
        #     the 1st holds the sentences (in indexed form)
        #     the 2nd holds the relation representations
        # Y is a list of samples, each a list of one-hot tags
        X = [np.array(t) for t in zip(*X)]
        return X, np.array(Y)
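
The '#'-masking line is the subtle step: overwriting the first mention keeps every character offset intact while preventing find from matching c2 inside c1's span. A standalone illustration with made-up strings:

question = "did the cat chase the cat toy"
c1, c2 = "cat", "cat toy"

c1_idx = question.find(c1)   # 8, the first mention
# a naive find(c2) could match overlapping c1's region; masking c1 with
# '#' of the same length avoids that while preserving offsets
masked = question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]
c2_idx = masked.find(c2)     # 22, found in the unmasked tail

assert question[c2_idx:c2_idx + len(c2)] == c2
print(masked)   # "did the ### chase the cat toy"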
Example #3
    def make_x(self, sent):
        parser = get_spacy_parser()
        question = datautils.clean_question(sent)

        # tokenize the question string and map each token to its
        # vocabulary index, using the UNK index for unknown tokens
        q_parsed = parser(question)
        q_vect = [
            self.w2idx.get(t.text, self.w2idx[c.UNK_TAG]) for t in q_parsed
        ]

        # right-pad with the nil index up to the fixed input length
        pad_length = self.MAX_LENGTH - len(q_vect)
        nil_X = 0
        x = q_vect + pad_length * [nil_X]
        return np.array(x)
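
For intuition, here is the index-and-pad step in isolation, with a toy vocabulary and MAX_LENGTH (illustrative values, not from the repo):

# toy vocabulary; in make_x the pad value nil_X is 0, independent of UNK
w2idx = {"where": 1, "is": 2, "rome": 3, "<unk>": 4}
MAX_LENGTH = 6

tokens = ["where", "is", "timbuktu"]                      # "timbuktu" is OOV
q_vect = [w2idx.get(t, w2idx["<unk>"]) for t in tokens]   # [1, 2, 4]
x = q_vect + (MAX_LENGTH - len(q_vect)) * [0]             # [1, 2, 4, 0, 0, 0]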
Example #4
    def make_vocab(self, sents):
        parser = get_spacy_parser()
        for s in sents:
            cleaned_s = datautils.clean_question(s)
            # run only the tokenizer (no tagging/parsing) for speed
            doc = parser.tokenizer(cleaned_s)
            for t in doc:
                # register both the surface form and its lowercased
                # variant, assigning each a fresh consecutive index
                if t.text not in self.w2idx:
                    new_idx = len(self.w2idx)
                    self.w2idx[t.text] = new_idx
                    self.idx2w[new_idx] = t.text
                if t.text.lower() not in self.w2idx:
                    new_idx = len(self.w2idx)
                    self.w2idx[t.text.lower()] = new_idx
                    self.idx2w[new_idx] = t.text.lower()
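
Because both the surface form and its lowercased variant are registered, cased tokens yield two vocabulary entries. The same indexing scheme in isolation:

w2idx, idx2w = {}, {}
for text in ["Paris", "paris", "in"]:   # toy token stream
    for form in (text, text.lower()):
        if form not in w2idx:
            new_idx = len(w2idx)        # consecutive ids: 0, 1, 2, ...
            w2idx[form] = new_idx
            idx2w[new_idx] = form

print(w2idx)   # {'Paris': 0, 'paris': 1, 'in': 2}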
Example #5
    def make_XY(self, questions_answers_relations, concepts):
        parser = get_spacy_parser()
        X, Y = [], []
        for (question, answer,
             relation), c_list in zip(questions_answers_relations, concepts):
            # x = self.make_x(question, answer, relation)
            x = self.make_x(answer, relation)
            answer = datautils.clean_question(answer)
            a_parsed = parser(answer)
            c1, c2 = datautils.clean_concept(
                c_list[0]), datautils.clean_concept(c_list[1])

            # NOTE: the question input layer belongs to an older version,
            # so the question-based offset checks are disabled here:
            # assert c1_idx != -1
            # assert question[c1_idx:c1_idx + len(c1)] == c1
            # c2_idx = (question[:c1_idx] + "#"*len(c1) + question[c1_idx+len(c1):]).find(c2)

            # find the character offsets of both mentions in the answer
            c1_idx = answer.find(c1)
            c2_idx = answer.find(c2)

            # tag each token of the answer: RIGHT if it falls inside the
            # second mention, LEFT if inside the first, N (non-entity)
            # otherwise
            def tag_token(t):
                if (c2_idx != -1 and c2_idx <= t.idx
                        and t.idx + len(t) <= c2_idx + len(c2)):
                    tag = c.RIGHT_ENT_TAG
                elif (c1_idx != -1 and c1_idx <= t.idx
                      and t.idx + len(t) <= c1_idx + len(c1)):
                    tag = c.LEFT_ENT_TAG
                else:
                    tag = c.N_ENT_TAG
                return datautils.to_ohenc(c.entity_tags.index(tag),
                                          len(c.entity_tags))

            tags = [tag_token(t) for t in a_parsed]

            # debug output: concepts, tokenized answer, decoded tag sequence
            print(c1, ";", c2)
            print(a_parsed)
            print(self.to_tag_sequence(tags))

            # right-pad the tag sequence with NIL up to the fixed length
            nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG),
                                       len(c.entity_tags))
            pad_length = self.MAX_LENGTH - len(tags)
            y = tags + pad_length * [nil_Y]

            X.append(np.array(x))
            Y.append(np.array(y))

        # transpose X into [answer representations, relation one-hots]
        X = [np.array(t) for t in zip(*X)]
        return X, np.array(Y)
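
The closing zip(*X) transposes the per-sample list of (answer vector, relation vector) pairs into two parallel arrays, one per model input. In miniature:

import numpy as np

# three samples, each a pair (token-index vector, relation one-hot)
X = [
    (np.array([1, 2, 0]), np.array([1, 0])),
    (np.array([3, 1, 0]), np.array([0, 1])),
    (np.array([2, 2, 2]), np.array([1, 0])),
]

inputs = [np.array(t) for t in zip(*X)]
print(inputs[0].shape)   # (3, 3) -- stacked token-index vectors
print(inputs[1].shape)   # (3, 2) -- stacked relation one-hots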
Example #6
    def make_x(self, sent, relation):
        parser = get_spacy_parser()
        question = datautils.clean_question(sent)

        # tokenize and map each token to its vocabulary index
        # (UNK for unknown tokens)
        q_parsed = parser(question)
        q_vect = [
            self.w2idx.get(t.text, self.w2idx[c.UNK_TAG]) for t in q_parsed
        ]

        # right-pad with the nil index up to the fixed input length
        pad_length = self.MAX_LENGTH - len(q_vect)
        nil_X = 0
        x = np.array(q_vect + pad_length * [nil_X])

        # one-hot encode the relation and return both input vectors
        x_rel = datautils.to_ohenc(c.RELATIONS.index(relation),
                                   len(c.RELATIONS))
        return np.array([x, x_rel])
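
Note that x (length MAX_LENGTH) and x_rel (length len(c.RELATIONS)) generally differ in length, so np.array([x, x_rel]) cannot form a 2-D matrix: older NumPy silently builds a dtype=object array of the two vectors, while NumPy >= 1.24 raises a ValueError unless the object dtype is made explicit:

import numpy as np

x = np.array([5, 9, 4, 0, 0])      # toy token indices (MAX_LENGTH == 5)
x_rel = np.array([0., 1., 0.])     # toy one-hot over 3 relations

pair = np.empty(2, dtype=object)   # explicit ragged pair of vectors
pair[0], pair[1] = x, x_rel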
Example #7
    def normalize_string(self, string):
        return datautils.clean_question(string)
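
clean_question itself lives in datautils, which is not shown here; purely as an assumption about what such a normalizer typically does (not the actual implementation):

import re

def clean_question(string):
    # hypothetical stand-in for datautils.clean_question; assumed
    # behavior only: trim, lowercase, and collapse runs of whitespace
    string = string.strip().lower()
    return re.sub(r"\s+", " ", string)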