def make_x(self, answer, relation):
    parser = get_spacy_parser()
    answer = datautils.clean_question(answer)
    a_parsed = parser(answer)
    # map each token to its vocabulary index, falling back to UNK
    a_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            a_parsed))
    # pad with index 0 up to MAX_LENGTH (assumes len(a_vect) <= self.MAX_LENGTH)
    pad_length = self.MAX_LENGTH - len(a_vect)
    nil_X = 0
    x_answer = np.array(a_vect + pad_length * [nil_X])
    x_rel = datautils.to_ohenc(c.RELATIONS.index(relation), len(c.RELATIONS))
    # return the two inputs as a pair: x_answer and x_rel have different
    # lengths, so stacking them with np.array would create a ragged array
    return [x_answer, x_rel]
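# make_x relies on datautils.to_ohenc to one-hot encode the relation.
# A minimal sketch of what such a helper could look like, assuming it
# maps an index and a vector size to a one-hot numpy vector (this is a
# hypothetical reimplementation for illustration, not the actual
# datautils code):
#
#   def to_ohenc(idx, size):
#       vec = np.zeros(size)
#       vec[idx] = 1.0
#       return vec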
def make_XY(self, sents_relations, concepts):
    """
    :param sents_relations: list of pairs (question, relation)
    :param concepts: list of pairs (c1, c2)
    :return: the model inputs X and the one-hot tag matrix Y
    """
    parser = get_spacy_parser()
    X, Y = [], []
    for (sent, relation), c_list in zip(sents_relations, concepts):
        x = self.make_x(sent, relation)
        question = datautils.clean_question(sent)
        q_parsed = parser(question)
        c1, c2 = datautils.clean_concept(c_list[0]), datautils.clean_concept(c_list[1])
        # find the character offsets of the two concept mentions
        c1_idx = question.find(c1)
        assert c1_idx != -1
        assert question[c1_idx:c1_idx + len(c1)] == c1
        # mask the first mention before searching for the second, so that
        # overlapping mentions cannot be matched twice
        c2_idx = (question[:c1_idx] + "#" * len(c1) + question[c1_idx + len(c1):]).find(c2)
        if c2_idx != -1:
            assert question[c2_idx:c2_idx + len(c2)] == c2
        # iterate over the tokens of the question: a token whose span falls
        # inside a concept mention is tagged as the left or right entity,
        # every other token as a non-entity
        tags = list(
            map(lambda t:
                datautils.to_ohenc(c.entity_tags.index(c.LEFT_ENT_TAG), len(c.entity_tags))
                if (t.idx >= c1_idx and t.idx + len(t) <= c1_idx + len(c1))
                else datautils.to_ohenc(c.entity_tags.index(c.RIGHT_ENT_TAG), len(c.entity_tags))
                if (c2_idx != -1 and t.idx >= c2_idx and t.idx + len(t) <= c2_idx + len(c2))
                else datautils.to_ohenc(c.entity_tags.index(c.N_ENT_TAG), len(c.entity_tags)),
                q_parsed))
        # pad the tag sequence with NIL up to MAX_LENGTH
        nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG), len(c.entity_tags))
        pad_length = self.MAX_LENGTH - len(tags)
        y = tags + pad_length * [nil_Y]
        X.append(x)
        Y.append(np.array(y))
    # transpose the list of (x_sentence, x_relation) pairs so that, at the
    # end, X is a list of two arrays: the 1st is the batch of sentences (as
    # index vectors), the 2nd is the batch of one-hot relation encodings;
    # Y is one padded list of one-hot tags per sample
    X = [np.array(t) for t in zip(*X)]
    return X, np.array(Y)
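# Illustrative walk-through of the tagging logic above (hypothetical
# example; c.entity_tags is assumed to hold the LEFT/RIGHT/N_ENT/NIL tags):
#
#   question = "is a dog an animal"    c1 = "dog", c2 = "animal"
#
#   token:  is     a      dog       an     animal
#   tag:    N_ENT  N_ENT  LEFT_ENT  N_ENT  RIGHT_ENT
#
# the remaining MAX_LENGTH - 5 positions of y are padded with the NIL tag.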
def make_x(self, sent):
    parser = get_spacy_parser()
    question = datautils.clean_question(sent)
    # tokenize the question string
    q_parsed = parser(question)
    # map each token to its vocabulary index, falling back to UNK
    q_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            q_parsed))
    # pad with index 0 up to MAX_LENGTH
    pad_length = self.MAX_LENGTH - len(q_vect)
    nil_X = 0
    x = q_vect + pad_length * [nil_X]
    return np.array(x)
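# Example of the padding scheme (hypothetical values): with
# self.MAX_LENGTH = 6 and q_vect = [4, 12, 7], this make_x returns
# np.array([4, 12, 7, 0, 0, 0]); index 0 is reserved as the padding value.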
def make_vocab(self, sents):
    parser = get_spacy_parser()
    for s in sents:
        cleaned_s = datautils.clean_question(s)
        # only tokenize; the full spaCy pipeline is not needed here
        doc = parser.tokenizer(cleaned_s)
        for t in doc:
            # index both the surface form and its lowercased variant
            if t.text not in self.w2idx:
                new_idx = len(self.w2idx)
                self.w2idx[t.text] = new_idx
                self.idx2w[new_idx] = t.text
            if t.text.lower() not in self.w2idx:
                new_idx = len(self.w2idx)
                self.w2idx[t.text.lower()] = new_idx
                self.idx2w[new_idx] = t.text.lower()
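# Note: the make_x variants pad with index 0 and fall back to c.UNK_TAG,
# so w2idx is assumed to be pre-seeded before make_vocab runs, e.g.
# (hypothetical initialization; the padding entry must sit at index 0):
#
#   self.w2idx = {"<PAD>": 0, c.UNK_TAG: 1}
#   self.idx2w = {0: "<PAD>", 1: c.UNK_TAG}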
def make_XY(self, questions_answers_relations, concepts):
    parser = get_spacy_parser()
    X, Y = [], []
    for (question, answer, relation), c_list in zip(questions_answers_relations, concepts):
        # the question used to be a model input as well; in the current
        # version only the answer and the relation are encoded
        x = self.make_x(answer, relation)
        answer = datautils.clean_question(answer)
        a_parsed = parser(answer)
        c1, c2 = datautils.clean_concept(c_list[0]), datautils.clean_concept(c_list[1])
        # find the character offsets of the two concept mentions in the
        # answer; unlike the question-based variant, a mention may be absent
        c1_idx = answer.find(c1)
        c2_idx = answer.find(c2)
        # tag tokens inside the second mention as the right entity, tokens
        # inside the first mention as the left entity, the rest as non-entities
        tags = list(
            map(lambda t:
                datautils.to_ohenc(c.entity_tags.index(c.RIGHT_ENT_TAG), len(c.entity_tags))
                if (c2_idx != -1 and t.idx >= c2_idx and t.idx + len(t) <= c2_idx + len(c2))
                else datautils.to_ohenc(c.entity_tags.index(c.LEFT_ENT_TAG), len(c.entity_tags))
                if (c1_idx != -1 and t.idx >= c1_idx and t.idx + len(t) <= c1_idx + len(c1))
                else datautils.to_ohenc(c.entity_tags.index(c.N_ENT_TAG), len(c.entity_tags)),
                a_parsed))
        # debug output: show the concepts, the parsed answer and the aligned tags
        print(c1, ";", c2)
        print(a_parsed)
        print(self.to_tag_sequence(tags))
        # pad the tag sequence with NIL up to MAX_LENGTH
        nil_Y = datautils.to_ohenc(c.entity_tags.index(c.NIL_TAG), len(c.entity_tags))
        pad_length = self.MAX_LENGTH - len(tags)
        y = tags + pad_length * [nil_Y]
        X.append(x)
        Y.append(np.array(y))
    # transpose so that X becomes a list of two batch arrays
    # (answer index vectors, one-hot relation encodings)
    X = [np.array(t) for t in zip(*X)]
    return X, np.array(Y)
def make_x(self, sent, relation):
    parser = get_spacy_parser()
    question = datautils.clean_question(sent)
    q_parsed = parser(question)
    # map each token to its vocabulary index, falling back to UNK
    q_vect = list(
        map(lambda t: self.w2idx[t.text] if t.text in self.w2idx else self.w2idx[c.UNK_TAG],
            q_parsed))
    # pad with index 0 up to MAX_LENGTH
    pad_length = self.MAX_LENGTH - len(q_vect)
    nil_X = 0
    x = np.array(q_vect + pad_length * [nil_X])
    x_rel = datautils.to_ohenc(c.RELATIONS.index(relation), len(c.RELATIONS))
    # return the two inputs as a pair: x and x_rel have different lengths,
    # so stacking them with np.array would create a ragged array
    return [x, x_rel]
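# The pair returned by make_x presumably feeds a model with two input
# layers (token indices and one-hot relation). A hedged usage sketch,
# assuming a Keras-style two-input model (encoder, model and the relation
# name are hypothetical):
#
#   x_sent, x_rel = encoder.make_x("where was Dante born", some_relation)
#   pred = model.predict([x_sent[np.newaxis, :], x_rel[np.newaxis, :]])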
def normalize_string(self, string):
    return datautils.clean_question(string)
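# normalize_string simply delegates to datautils.clean_question. A minimal
# sketch of what such a cleaning step could do (an assumption for
# illustration, not the actual datautils code):
#
#   def clean_question(s):
#       return s.strip().replace("?", "")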