# Assumed module header: the functions below are methods excerpted from the
# template/reference extraction class (they rely on self.proc_parse,
# self.references, self.e2f, self.get_refexes and self.get_pronrefs).
import operator
import re
import sys

import nltk
import numpy
from nltk.metrics import edit_distance

import ref_delex  # project module that extracts references for a given tag


def simple_match(self, template, entity_map):
    '''
    Delexicalize literal entity-name mentions, replacing them by
    SIMPLE-<tag> placeholders.
    '''
    delex = []
    refexes = {}
    # Match longer entity names first so substrings of other names do not win
    entities = sorted(entity_map.items(), key=lambda x: len(x[1].name), reverse=True)
    for tag, entity in entities:
        matcher = ' '.join(entity.name.replace('\'', '').replace('\"', '').split('_'))
        matcher = ' '.join(nltk.word_tokenize(matcher)) + ' '
        # re.escape solves the bracket problem of regular expressions
        # (entity names may contain '(' and ')')
        if len(re.findall(re.escape(matcher), template)) > 0:
            template = template.replace(matcher, 'SIMPLE-' + tag + ' ')
            refexes[entity.name] = matcher
        if len(re.findall(re.escape(matcher), template)) == 0:
            delex.append(tag)

    out = self.proc_parse.parse_doc(template)['sentences']
    removals = {i: [] for i in range(len(out))}
    for tag, entity in entities:
        references, entity_removals = ref_delex.get_references(out, 'SIMPLE-' + tag, entity)
        for reference in references:
            reference['tag'] = tag
            # Classify the reference by its determiner
            reference['reftype'] = 'name'
            if reference['determiner'].lower().strip() in ['the', 'a', 'an']:
                reference['reftype'] = 'description'
            elif reference['determiner'].lower().strip() in ['this', 'that', 'these', 'those']:
                reference['reftype'] = 'demonstrative'

            # Rebuild the referring expression: determiner + compounds + name
            reference['refex'] = reference['determiner']
            for compound in sorted(reference['compounds'], key=lambda x: x[0]):
                reference['refex'] = reference['refex'] + ' ' + compound[1]
            reference['refex'] = reference['refex'] + ' ' + refexes[entity.name]

            del reference['determiner']
            del reference['compounds']
        self.references.extend(references)

        for k in entity_removals:
            removals[k].extend(entity_removals[k])

    # Remove marked tokens (determiners and compounds folded into the refex)
    snt_templates = len(out) * ['']
    for i, snt in enumerate(out):
        snt_template = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
        snt_templates[i] = ' '.join(snt_template)
    template = ' '.join(snt_templates).replace('-LRB-', '(').replace('-RRB-', ')').strip()
    return template, delex
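
# Illustrative sketch, not part of the original pipeline: the toy template,
# tags and entity names below are assumptions. It shows the intended effect of
# simple_match, swapping literal entity-name mentions for SIMPLE-<tag>
# placeholders (longest names first).
def _demo_simple_match_replacement():
    template = 'Albert Einstein was born in Ulm . '
    entity_names = {'AGENT-1': 'Albert_Einstein', 'PATIENT-1': 'Ulm'}
    for tag, name in sorted(entity_names.items(), key=lambda x: len(x[1]), reverse=True):
        matcher = ' '.join(name.split('_')) + ' '
        template = template.replace(matcher, 'SIMPLE-' + tag + ' ')
    return template  # 'SIMPLE-AGENT-1 was born in SIMPLE-PATIENT-1 . '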
def similarity_match(self, template, entity_map, delex_tag, nps):
    '''
    Delexicalize the entities that simple_match missed by mapping each one
    to the noun phrase with the smallest edit distance to its name.
    '''
    refexes = {}
    entities = sorted(entity_map.items(), key=lambda x: len(x[1].name), reverse=True)
    for tag, entity in entities:
        # Guard against an empty candidate list before ranking
        if tag not in delex_tag and len(nps) > 0:
            # Rank the candidate noun phrases by edit distance to the entity name
            ranking = {}
            for np in nps:
                ranking[np] = edit_distance(' '.join(entity.name.split('_')), np)
            ranking = sorted(ranking.items(), key=operator.itemgetter(1))

            np = ranking[0][0]
            template = template.replace(np, 'SIMILARITY-' + tag)
            refexes[entity.name] = np
            delex_tag.append(tag)

    out = self.proc_parse.parse_doc(template)['sentences']
    removals = {i: [] for i in range(len(out))}
    for tag, entity in entity_map.items():
        references, entity_removals = ref_delex.get_references(out, 'SIMILARITY-' + tag, entity)
        for reference in references:
            reference['tag'] = tag
            reference['reftype'] = 'name'
            if reference['determiner'].lower().strip() in ['the', 'a', 'an']:
                reference['reftype'] = 'description'
            elif reference['determiner'].lower().strip() in ['this', 'that', 'these', 'those']:
                reference['reftype'] = 'demonstrative'
            reference['refex'] = refexes[entity.name]

            del reference['determiner']
            del reference['compounds']
        self.references.extend(references)

        for k in entity_removals:
            removals[k].extend(entity_removals[k])

    # Remove marked tokens
    snt_templates = len(out) * ['']
    for i, snt in enumerate(out):
        snt_template = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
        snt_templates[i] = ' '.join(snt_template)
    template = ' '.join(snt_templates).replace('-LRB-', '(').replace('-RRB-', ')').strip()
    return template, delex_tag
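
# Illustrative sketch with made-up noun phrases: similarity_match links a
# leftover tag to the candidate noun phrase closest to its entity name by
# edit distance, which here picks the literal name itself (distance 0).
def _demo_similarity_ranking():
    nps = ['the German physicist', 'Albert Einstein', 'Ulm']
    name = ' '.join('Albert_Einstein'.split('_'))
    ranking = sorted(nps, key=lambda np: edit_distance(name, np))
    return ranking[0]  # 'Albert Einstein'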
def reference_match(self, template, entity_map):
    '''
    Delexicalize mentions that match previously extracted referring
    expressions, replacing them by REF-<tag> placeholders.
    '''
    refexes = {}
    delex_tag = []
    matchers = self.get_refexes(entity_map)
    for tag, entity_name, matcher, reftype in matchers:
        matcher = matcher + ' '
        regex = re.findall(re.escape(matcher), template)
        if len(regex) > 0:
            template = template.replace(matcher, 'REF-' + tag + ' ')
            if entity_name not in refexes:
                refexes[entity_name] = []
            # Record one refex entry per matched occurrence
            for _ in regex:
                refexes[entity_name].append({'matcher': matcher, 'reftype': reftype})
        # re.escape solves the bracket problem of regular expressions
        if len(re.findall(re.escape(matcher), template)) == 0:
            delex_tag.append(tag)

    out = self.proc_parse.parse_doc(template)['sentences']
    removals = {i: [] for i in range(len(out))}
    for tag, entity in entity_map.items():
        references, entity_removals = ref_delex.get_references(out, 'REF-' + tag, entity)
        for i, reference in enumerate(references):
            reference['tag'] = tag
            reference['reftype'] = refexes[entity.name][i]['reftype']
            reference['refex'] = refexes[entity.name][i]['matcher']

            del reference['determiner']
            del reference['compounds']
        self.references.extend(references)

        for k in entity_removals:
            removals[k].extend(entity_removals[k])

    # Remove marked tokens
    snt_templates = len(out) * ['']
    for i, snt in enumerate(out):
        snt_template = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
        snt_templates[i] = ' '.join(snt_template)
    template = ' '.join(snt_templates).replace('-LRB-', '(').replace('-RRB-', ')').strip()
    return template, delex_tag
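
# Sketch of the "bracket problem" mentioned above, with a made-up template:
# an unescaped matcher containing parentheses is parsed as a regex group, so
# re.findall misses the literal mention that template.replace would rewrite.
def _demo_bracket_problem():
    template = 'Alan Bean ( astronaut ) retired .'
    matcher = 'Alan Bean ( astronaut ) '
    assert len(re.findall(matcher, template)) == 0             # parens read as a group
    assert len(re.findall(re.escape(matcher), template)) == 1  # literal match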
def _get_references_info(self, out, entities):
    '''
    Get the syntactic position, text and sentence status of the references
    based on the dependency parser.
    :param out: stanford corenlp result
    :param entities: tag - wikipedia id mapping
    :return: references sorted by their position in the text
    '''
    references = []
    for tag, entity in entities.items():
        refs, entity_removals = ref_delex.get_references(out, tag, entity)
        references.extend(refs)

    # Text status: the first mention of an entity in the text is 'new',
    # every later mention is 'given'; the same logic applies per sentence.
    references = sorted(references, key=lambda x: (x['entity'], x['sentence'], x['pos']))
    sentence_statuses = {}
    for i, reference in enumerate(references):
        if i == 0 or (reference['entity'] != references[i - 1]['entity']):
            reference['text_status'] = 'new'
        else:
            reference['text_status'] = 'given'

        if reference['sentence'] not in sentence_statuses:
            sentence_statuses[reference['sentence']] = []

        if reference['entity'] not in sentence_statuses[reference['sentence']]:
            reference['sentence_status'] = 'new'
        else:
            reference['sentence_status'] = 'given'

        sentence_statuses[reference['sentence']].append(reference['entity'])

    references = sorted(references, key=lambda x: x['general_pos'])
    return references
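
# Standalone sketch of the status assignment above on hand-written toy
# references (the dictionary keys mirror the ones _get_references_info uses).
def _demo_reference_statuses():
    references = [
        {'entity': 'Albert_Einstein', 'sentence': 0, 'pos': 0, 'general_pos': 0},
        {'entity': 'Ulm', 'sentence': 0, 'pos': 5, 'general_pos': 1},
        {'entity': 'Albert_Einstein', 'sentence': 1, 'pos': 0, 'general_pos': 2},
    ]
    references = sorted(references, key=lambda x: (x['entity'], x['sentence'], x['pos']))
    sentence_statuses = {}
    for i, reference in enumerate(references):
        first = i == 0 or reference['entity'] != references[i - 1]['entity']
        reference['text_status'] = 'new' if first else 'given'
        seen = sentence_statuses.setdefault(reference['sentence'], [])
        reference['sentence_status'] = 'new' if reference['entity'] not in seen else 'given'
        seen.append(reference['entity'])
    # Einstein: text_status 'new' then 'given'; each mention opens its sentence
    return sorted(references, key=lambda x: x['general_pos'])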
def coreference_match(self, template, entity_map, out_parse):
    '''
    Delexicalize pronominal references using the coreference chains in
    out_parse, replacing each pronoun by the tag of the closest entity.
    '''
    pronrefs = self.get_pronrefs(out_parse)

    for pronref in pronrefs:
        # Rank the entities by the mean edit distance between their names
        # and the nominal references the pronoun corefers with
        ranking = {}
        for tag in entity_map:
            ranking[tag] = []
            for nomref in pronref['nominalrefs']:
                ranking[tag].append(edit_distance(entity_map[tag].name, nomref))
            ranking[tag] = numpy.mean(ranking[tag])
        ranking = sorted(ranking.items(), key=operator.itemgetter(1))
        tag = ranking[0][0]

        # Replace only the first occurrence of the pronoun
        template = template.replace(' ' + pronref['reference'], ' PRON-' + tag, 1)

        out = self.proc_parse.parse_doc(template)['sentences']
        references, removals = ref_delex.get_references(out, 'PRON-' + tag, entity_map[tag])
        for reference in references:
            reference['tag'] = tag
            reference['reftype'] = 'pronoun'
            reference['refex'] = pronref['reference'].lower()
        self.references.extend(references)

        # Remove marked tokens; removals may not cover every sentence index
        snt_templates = len(out) * ['']
        for i, snt in enumerate(out):
            snt_template = [token for j, token in enumerate(snt['tokens'])
                            if j not in removals.get(i, [])]
            snt_templates[i] = ' '.join(snt_template)
        template = ' '.join(snt_templates).replace('-LRB- ', '( ').replace(' -RRB-', ' )') \
            .replace('-LRB-', '(').replace('-RRB-', ')').strip()

    # Keep the tag itself in place of the pronoun
    template = template.replace('PRON-', '')
    return template
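
# Hypothetical example of the coreference ranking: the winning antecedent is
# the entity whose name has the smallest mean edit distance to the nominal
# references in the pronoun's coreference chain (all values made up).
def _demo_pronoun_ranking():
    nominalrefs = ['Einstein', 'the physicist']
    names = {'AGENT-1': 'Albert_Einstein', 'PATIENT-1': 'Ulm'}
    ranking = {}
    for tag, name in names.items():
        ranking[tag] = numpy.mean([edit_distance(name, ref) for ref in nominalrefs])
    return min(ranking.items(), key=operator.itemgetter(1))[0]  # 'AGENT-1'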
def probabilistic_match(self, template, entity_map, predicates, nps):
    '''
    Delexicalize the remaining noun phrases with a lexical translation
    table (self.e2f), linking each one to the most probable entity.
    '''
    def calc_prob(np, wiki):
        # Log-probability of the noun phrase given the entity (or predicate),
        # with a floor probability for words missing from the table
        words = np.split()
        if wiki not in self.e2f:
            return None

        _min = numpy.log(sys.float_info.min)
        prob = 0
        for word in words:
            if word in self.e2f[wiki]:
                prob += numpy.log(self.e2f[wiki][word])
            else:
                prob += _min
        return prob

    refexes = {}
    while len(nps) > 0:
        np = nps[0]
        if np in template:
            # Score the noun phrase against every entity and predicate
            ranking = {}
            for tag, entity in entity_map.items():
                prob = calc_prob(np.lower(), entity.name.lower())
                if prob is not None:
                    ranking[tag] = prob
            for predicate in predicates:
                prob = calc_prob(np.lower(), predicate.lower())
                if prob is not None:
                    ranking[predicate] = prob

            ranking = sorted(ranking.items(), key=operator.itemgetter(1), reverse=True)
            if len(ranking) > 0:
                tag = ranking[0][0]
                # Only delexicalize if an entity (not a predicate) wins
                if tag not in predicates:
                    template = template.replace(np, 'PROBABILISTIC-' + tag)
                    entity = entity_map[tag]
                    refexes[entity.name] = np
        del nps[0]

    out = self.proc_parse.parse_doc(template)['sentences']
    removals = {i: [] for i in range(len(out))}
    for tag, entity in entity_map.items():
        references, entity_removals = ref_delex.get_references(out, 'PROBABILISTIC-' + tag, entity)
        for reference in references:
            reference['tag'] = tag
            reference['reftype'] = 'name'
            reference['refex'] = refexes[entity.name]

            del reference['determiner']
            del reference['compounds']
        self.references.extend(references)

        for k in entity_removals:
            removals[k].extend(entity_removals[k])

    # Remove marked tokens
    snt_templates = len(out) * ['']
    for i, snt in enumerate(out):
        snt_template = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
        snt_templates[i] = ' '.join(snt_template)
    template = ' '.join(snt_templates).replace('-LRB- ', '(').replace(' -RRB-', ')') \
        .replace('-LRB-', '(').replace('-RRB-', ')').strip()
    return template
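
# Toy example of the log-probability scoring inside probabilistic_match; the
# lexical table below is a made-up stand-in for the self.e2f translation model.
def _demo_calc_prob():
    e2f = {'albert_einstein': {'einstein': 0.6, 'physicist': 0.2}}
    _min = numpy.log(sys.float_info.min)  # floor for out-of-vocabulary words
    prob = 0.0
    for word in 'the physicist'.split():
        if word in e2f['albert_einstein']:
            prob += numpy.log(e2f['albert_einstein'][word])
        else:
            prob += _min
    return prob  # log(0.2) plus the floor penalty for the unseen word 'the'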