Exemplo n.º 1
0
    def simple_match(self, template, entity_map):
        '''
        Delexicalise entities whose normalised name occurs verbatim in the template.

        For every entity the name is stripped of quotes, has its underscores turned
        into spaces and is tokenised with nltk. Each literal occurrence of the
        result in the template is replaced by the placeholder ``SIMPLE-<tag>``.
        The rewritten text is parsed with ``self.proc_parse`` and every reference
        reported by ``ref_delex.get_references`` is annotated with ``tag``,
        ``reftype`` and ``refex`` and appended to ``self.references``.

        :param template: text to be delexicalised
        :param entity_map: mapping tag -> entity (object exposing a ``name`` attribute)
        :return: tuple ``(template, delex)``: the template rebuilt from the parsed
                 tokens minus the ones flagged for removal, and the list of tags
                 whose name no longer occurs in the text after replacement
        '''
        delex = []
        refexes = {}

        # Longest names are tried first (presumably so that a short name contained
        # in a longer one cannot break the longer match -- confirm).
        entities = sorted(entity_map.items(), key=lambda x: len(x[1].name), reverse=True)
        for tag, entity in entities:
            matcher = ' '.join(entity.name.replace('\'', '').replace('\"', '').split('_'))
            matcher = ' '.join(nltk.word_tokenize(matcher)) + ' '

            # Plain substring test: the name is literal text, not a pattern, so
            # regex metacharacters in it (brackets, '+', '.', ...) must not be
            # interpreted.
            if matcher in template:
                template = template.replace(matcher, 'SIMPLE-'+tag+' ')
                refexes[entity.name] = matcher

                # The inserted placeholder may itself contain the matcher again;
                # only report the tag when no occurrence is left.
                if matcher not in template:
                    delex.append(tag)

        out = self.proc_parse.parse_doc(template)['sentences']
        removals = dict((i, []) for i in range(len(out)))
        for tag, entity in entities:
            references, entity_removals = ref_delex.get_references(out, 'SIMPLE-'+tag, entity)
            for reference in references:
                reference['tag'] = tag
                reference['reftype'] = 'name'
                determiner = reference['determiner'].lower().strip()
                if determiner in ('the', 'a', 'an'):
                    reference['reftype'] = 'description'
                elif determiner in ('this', 'that', 'these', 'those'):
                    reference['reftype'] = 'demonstrative'

                # Referring expression = determiner + compounds (ordered by their
                # first field) + the matched entity name.
                compounds = [compound[1] for compound in sorted(reference['compounds'], key=lambda x: x[0])]
                reference['refex'] = ' '.join([reference['determiner']] + compounds + [refexes[entity.name]])

                del reference['determiner']
                del reference['compounds']
            self.references.extend(references)

            for k in entity_removals:
                removals[k].extend(entity_removals[k])

        # Remove marked tokens
        snt_templates = []
        for i, snt in enumerate(out):
            kept = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
            snt_templates.append(' '.join(kept))

        template = ' '.join(snt_templates).replace('-LRB-', '(').replace('-RRB-', ')').strip()
        return template, delex
Exemplo n.º 2
0
    def similarity_match(self, template, entity_map, delex_tag, nps):
        '''
        Delexicalise the entities not handled yet by picking, for each of them,
        the candidate phrase with the smallest edit distance to the entity name.

        The chosen phrase is replaced in the template by ``SIMILARITY-<tag>`` and
        the tag is appended to ``delex_tag`` (the caller's list is modified in
        place). The rewritten text is then parsed with ``self.proc_parse`` and
        the references found by ``ref_delex.get_references`` are annotated and
        appended to ``self.references``.

        :param template: text to be delexicalised
        :param entity_map: mapping tag -> entity (object exposing a ``name`` attribute)
        :param delex_tag: list of tags that are already delexicalised
        :param nps: candidate phrases (NOTE(review): presumably noun phrases
                    extracted from the text -- confirm against caller)
        :return: tuple ``(template, delex_tag)``
        '''
        surface_forms = {}

        by_name_length = sorted(entity_map.items(), key=lambda pair: len(pair[1].name), reverse=True)
        for tag, entity in by_name_length:
            if tag in delex_tag:
                continue

            # Score every candidate against the entity name with underscores
            # replaced by spaces.
            target = ' '.join(entity_map[tag].name.split('_'))
            distances = dict((candidate, edit_distance(target, candidate)) for candidate in nps)

            closest = sorted(distances.items(), key=operator.itemgetter(1))[0][0]
            template = template.replace(closest, 'SIMILARITY-'+tag)
            surface_forms[entity.name] = closest

            delex_tag.append(tag)

        sentences = self.proc_parse.parse_doc(template)['sentences']
        removals = dict((idx, []) for idx in range(len(sentences)))
        for tag, entity in entity_map.iteritems():
            found, dropped = ref_delex.get_references(sentences, 'SIMILARITY-'+tag, entity)
            for ref in found:
                ref['tag'] = tag
                ref['reftype'] = 'name'
                determiner = ref['determiner'].lower().strip()
                if determiner in ('the', 'a', 'an'):
                    ref['reftype'] = 'description'
                elif determiner in ('this', 'that', 'these', 'those'):
                    ref['reftype'] = 'demonstrative'

                ref['refex'] = surface_forms[entity.name]

                del ref['determiner']
                del ref['compounds']
            self.references.extend(found)

            for snt_idx in dropped:
                removals[snt_idx].extend(dropped[snt_idx])

        # Rebuild every sentence without the tokens flagged for removal
        rebuilt = []
        for idx, snt in enumerate(sentences):
            kept = [token for pos, token in enumerate(snt['tokens']) if pos not in removals[idx]]
            rebuilt.append(' '.join(kept))

        template = ' '.join(rebuilt).replace('-LRB-', '(').replace('-RRB-', ')').strip()

        return template, delex_tag
Exemplo n.º 3
0
    def reference_match(self, template, entity_map):
        '''
        Delexicalise the referring expressions returned by ``self.get_refexes``.

        ``self.get_refexes(entity_map)`` is expected to yield 4-tuples
        ``(tag, entity_name, refex, reftype)``. Every literal occurrence of
        ``refex`` followed by a space is replaced in the template by
        ``REF-<tag>``. The rewritten text is parsed with ``self.proc_parse`` and
        the references found by ``ref_delex.get_references`` are annotated and
        appended to ``self.references``.

        :param template: text to be delexicalised
        :param entity_map: mapping tag -> entity (object exposing a ``name`` attribute)
        :return: tuple ``(template, delex_tag)``: the template rebuilt from the
                 parsed tokens minus the ones flagged for removal, and the list
                 of tags whose expression no longer occurs after replacement
                 (a tag may appear more than once)
        '''
        refexes = {}
        delex_tag = []

        for tag, entity_name, refex, reftype in self.get_refexes(entity_map):
            matcher = refex + ' '

            # Literal, non-overlapping occurrences; the expression is plain text,
            # so it is never interpreted as a regular expression.
            occurrences = template.count(matcher)
            if occurrences > 0:
                template = template.replace(matcher, 'REF-'+tag+' ')
                # One record per replaced occurrence.
                refexes.setdefault(entity_name, []).extend(
                    {'matcher': matcher, 'reftype': reftype} for _ in range(occurrences))

                # The inserted placeholder may itself contain the matcher again;
                # only report the tag when no occurrence is left.
                if matcher not in template:
                    delex_tag.append(tag)

        out = self.proc_parse.parse_doc(template)['sentences']
        removals = dict((i, []) for i in range(len(out)))
        for tag, entity in entity_map.items():
            references, entity_removals = ref_delex.get_references(out, 'REF-'+tag, entity)
            for i, reference in enumerate(references):
                # NOTE(review): the i-th reference found is paired with the i-th
                # recorded replacement of this entity; this assumes both lists
                # are in the same order -- confirm against get_references.
                reference['tag'] = tag
                reference['reftype'] = refexes[entity.name][i]['reftype']
                reference['refex'] = refexes[entity.name][i]['matcher']

                del reference['determiner']
                del reference['compounds']
            self.references.extend(references)

            for k in entity_removals:
                removals[k].extend(entity_removals[k])

        # Remove marked tokens
        snt_templates = []
        for i, snt in enumerate(out):
            kept = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
            snt_templates.append(' '.join(kept))

        template = ' '.join(snt_templates).replace('-LRB-', '(').replace('-RRB-', ')').strip()
        return template, delex_tag
Exemplo n.º 4
0
    def _get_references_info(self, out, entities):
        '''
        Get syntactic position, text and sentence status of the references based on dependency parser
        :param out: stanford corenlp result
        :param entities: tag - wikipedia id mapping
        :return:
        '''
        references = []
        for tag_entity in entities.iteritems():
            tag, entity = tag_entity
            refs, entity_removals = ref_delex.get_references(out, tag, entity)

            references.extend(refs)

        references = sorted(references,
                            key=lambda x:
                            (x['entity'], x['sentence'], x['pos']))

        sentence_statuses = {}
        for i, reference in enumerate(references):
            if i == 0 or (reference['entity'] != references[i - 1]['entity']):
                reference['text_status'] = 'new'
            else:
                reference['text_status'] = 'given'

            if reference['sentence'] not in sentence_statuses:
                sentence_statuses[reference['sentence']] = []

            if reference['entity'] not in sentence_statuses[
                    reference['sentence']]:
                reference['sentence_status'] = 'new'
            else:
                reference['sentence_status'] = 'given'

            sentence_statuses[reference['sentence']].append(
                reference['entity'])

        references = sorted(references, key=lambda x: x['general_pos'])
        return references
Exemplo n.º 5
0
    def coreference_match(self, template, entity_map, out_parse):
        '''
        Delexicalise the pronominal references returned by ``self.get_pronrefs``.

        For each pronominal reference, every tag is scored with the mean edit
        distance between the entity name and the reference's ``nominalrefs``.
        The first occurrence of the pronoun (preceded by a space) is replaced by
        ``PRON-<tag>`` for the lowest-scoring tag. The text is then re-parsed, the
        references found by ``ref_delex.get_references`` are stored in
        ``self.references`` with reftype ``'pronoun'``, and the ``PRON-`` prefix
        is finally stripped from the rebuilt template.

        :param template: text to be delexicalised
        :param entity_map: mapping tag -> entity (object exposing a ``name`` attribute)
        :param out_parse: parse result handed to ``self.get_pronrefs``
        :return: the rewritten template
        '''
        for pronref in self.get_pronrefs(out_parse):
            # Mean edit distance between each entity name and the nominal references
            scores = {}
            for candidate in entity_map:
                distances = [edit_distance(entity_map[candidate].name, nomref)
                             for nomref in pronref['nominalrefs']]
                scores[candidate] = numpy.mean(distances)

            best = sorted(scores.items(), key=operator.itemgetter(1))[0][0]
            placeholder = 'PRON-' + best

            template = template.replace(' ' + pronref['reference'], ' ' + placeholder, 1)

            sentences = self.proc_parse.parse_doc(template)['sentences']
            found, removals = ref_delex.get_references(sentences, placeholder, entity_map[best])
            surface = pronref['reference'].lower()
            for ref in found:
                ref['tag'] = best
                ref['reftype'] = 'pronoun'
                ref['refex'] = surface
            self.references.extend(found)

            # Rebuild every sentence without the tokens flagged for removal
            rebuilt = []
            for idx, snt in enumerate(sentences):
                kept = [token for pos, token in enumerate(snt['tokens']) if pos not in removals[idx]]
                rebuilt.append(' '.join(kept))

            template = ' '.join(rebuilt)
            template = template.replace('-LRB- ', '( ').replace(' -RRB-', ' )')
            template = template.replace('-LRB-', '(').replace('-RRB-', ')').strip()
            template = template.replace('PRON-', '')

        return template
Exemplo n.º 6
0
    def probabilistic_match(self, template, entity_map, predicates, nps):
        '''
        Delexicalise candidate phrases using the probabilities in ``self.e2f``.

        Every candidate phrase that occurs in the template is scored against each
        entity name and each predicate. When the best-scoring key is not a
        predicate, the phrase is replaced by ``PROBABILISTIC-<tag>``. The
        rewritten text is then parsed with ``self.proc_parse`` and the references
        found by ``ref_delex.get_references`` are stored in ``self.references``
        with reftype ``'name'``.

        :param template: text to be delexicalised
        :param entity_map: mapping tag -> entity (object exposing a ``name`` attribute)
        :param predicates: collection of predicate strings competing with the entities
        :param nps: list of candidate phrases; it is emptied by this call
        :return: the rewritten template
        '''
        def calc_prob(np, wiki):
            '''
            Sum the log of ``self.e2f[wiki][word]`` over the words of ``np``,
            using the log of the smallest positive float for unknown words.
            Returns None when ``wiki`` is not in ``self.e2f``.
            '''
            if wiki not in self.e2f:
                return None

            _min = numpy.log(sys.float_info.min)
            word_probs = self.e2f[wiki]
            prob = 0
            for word in np.split():
                if word in word_probs:
                    prob += numpy.log(word_probs[word])
                else:
                    prob += _min
            return prob

        refexes = {}
        # Iterate over a snapshot; the caller's list is cleared afterwards, as
        # callers may rely on it being consumed.
        for np in list(nps):
            if np not in template:
                continue

            ranking = {}
            for tag, entity in entity_map.items():
                prob = calc_prob(np.lower(), entity.name.lower())
                if prob is not None:
                    ranking[tag] = prob

            for predicate in predicates:
                prob = calc_prob(np.lower(), predicate.lower())
                if prob is not None:
                    ranking[predicate] = prob

            # Nothing could be scored (no key known to self.e2f): leave the
            # phrase untouched instead of failing on an empty ranking.
            if not ranking:
                continue

            # Highest score wins; on ties the first key in the ranking is kept.
            tag = max(ranking.items(), key=operator.itemgetter(1))[0]
            if tag not in predicates:
                template = template.replace(np, 'PROBABILISTIC-'+tag)
                refexes[entity_map[tag].name] = np
        del nps[:]

        out = self.proc_parse.parse_doc(template)['sentences']
        removals = dict((i, []) for i in range(len(out)))
        for tag, entity in entity_map.items():
            references, entity_removals = ref_delex.get_references(out, 'PROBABILISTIC-'+tag, entity)
            for reference in references:
                reference['tag'] = tag
                reference['reftype'] = 'name'
                reference['refex'] = refexes[entity.name]

                del reference['determiner']
                del reference['compounds']
            self.references.extend(references)

            for k in entity_removals:
                removals[k].extend(entity_removals[k])

        # Remove marked tokens
        snt_templates = []
        for i, snt in enumerate(out):
            kept = [token for j, token in enumerate(snt['tokens']) if j not in removals[i]]
            snt_templates.append(' '.join(kept))

        template = ' '.join(snt_templates).replace('-LRB- ', '(').replace(' -RRB-', ')').replace('-LRB-', '(').replace('-RRB-', ')').strip()

        return template