Пример #1
0
    def serialize_numerical(self, subj, fe, url):
        """ Serializes a numerical FE found by the normalizer

            A `Time` FE yields a single statement with property P585, while a
            `Duration` FE yields one statement for each bound found in its
            literal: P580 (start time) for `start` and P582 (end time) for
            `end`. Any other FE type yields nothing.

            :param subj: Subject of the produced statements
            :param dict fe: Frame element; `fe['fe']` is its type and
             `fe['literal']` holds the keyword arguments given to
             `wikidata.format_date` (nested under `start`/`end` for durations)
            :param url: Forwarded to `wikidata.finalize_statement` along with
             the statement (presumably the source of the claim)
            :returns: The result of `wikidata.finalize_statement` for each date
            :type: generator
        """
        literal = fe['literal']
        fe_type = fe['fe']

        if fe_type == 'Time':
            dated = [('P585', literal)]
        elif fe_type == 'Duration':
            # the end bound must use P582: emitting P580 for both bounds
            # would record the end of the interval as a second start time
            bounds = (('start', 'P580'), ('end', 'P582'))
            dated = [(prop, literal[bound]) for bound, prop in bounds
                     if bound in literal]
        else:
            dated = []

        for prop, date in dated:
            value = wikidata.format_date(**date)
            yield wikidata.finalize_statement(subj,
                                              prop,
                                              value,
                                              self.language,
                                              url,
                                              resolve_property=False,
                                              resolve_value=False)
Пример #2
0
    def process_corpus(self, items, output_file, dump_unresolved_file=None, genealogics=None, processes=0):
        """ Serializes all the items and writes the resulting statements

            Items are mapped through `self.serialize_item` via `parallel.map`.
            Successful results are finalized and written to `output_file`, one
            per line; unsuccessful ones are counted as skipped and, if
            requested, dumped as JSON lines to `dump_unresolved_file`.

            :param items: Items handed over to `self.serialize_item`
            :param output_file: File-like object receiving the statements
            :param dump_unresolved_file: Optional file-like object receiving
             the items which could not be serialized
            :param genealogics: When truthy, remember the subject of every
             statement whose url belongs to genealogics.org
            :param int processes: Forwarded to `parallel.map`
            :returns: Tuple <genealogics url to subject mapping, number of
             statements written, number of skipped items>
        """
        progress_message = 'Produced %d statements so far, skipped %d names'
        genealogics_url_to_id = {}
        count = 0
        skipped = 0

        results = parallel.map(self.serialize_item, items, processes, flatten=True)
        for success, item in results:
            if not success:
                skipped += 1
                if dump_unresolved_file:
                    dump_unresolved_file.write(json.dumps(item))
                    dump_unresolved_file.write('\n')
                continue

            subj, prop, val, url = item
            statement = wikidata.finalize_statement(
                subj, prop, val, self.language, url,
                resolve_property=False, resolve_value=False
            )
            if not statement:
                # neither written nor counted as skipped
                continue

            output_file.write(statement.encode('utf8'))
            output_file.write('\n')

            if genealogics and url.startswith('http://www.genealogics.org/'):
                genealogics_url_to_id[url] = subj

            count += 1
            if count % 10000 == 0:
                logger.info(progress_message, count, skipped)

        logger.info(progress_message, count, skipped)
        return genealogics_url_to_id, count, skipped
Пример #3
0
def process_row(data):
    """ Converts a single row into a list of statements

        Every column listed in `COLUMN_TO_PROPERTY` is resolved against
        wikidata; the resolved values are then used both as hints to find the
        subject of the row (column `emergenza`) and as values of the
        statements about that subject.

        :param dict data: The row, mapping column names to utf8 encoded values
        :returns: The statements which could be finalized, or None when the
         subject of the row could not be resolved
    """
    resolved = defaultdict(list)
    for column, raw_value in data.iteritems():
        wikidata_property = COLUMN_TO_PROPERTY.get(column)
        if not wikidata_property:
            continue

        value = wikidata.resolve(wikidata_property, raw_value.decode('utf8'), 'it')
        if value:
            resolved[wikidata_property].append(value)

    # plain dict copy of the resolved values, used as resolution hints
    hints = dict(resolved)
    subject = wikidata.resolver_with_hints('ddd', data['emergenza'], 'it', **hints)
    if not subject:
        logger.warn('could not find the wikidata id of "%s"' % data['emergenza'])
        return None

    candidates = (
        wikidata.finalize_statement(subject, wikidata_property, values,
                                    'it', resolve_property=False,
                                    resolve_value=False)
        for wikidata_property, values in resolved.iteritems()
    )
    return [stmt for stmt in candidates if stmt is not None]
Пример #4
0
    def serialize_numerical(self, subj, fe, data):
        """ Serializes a numerical FE found by the normalizer

            The property of the statements is looked up in `self.lu_fe_map`
            using the LU of the sentence and the type of the FE; nothing is
            produced when no property is mapped. A `Time` FE yields one
            statement, a `Duration` FE one for each of its `start` and `end`
            bounds which is present.

            :param subj: Subject of the produced statements
            :param dict fe: Frame element, with its type in `fe['fe']` and the
             date(s) in `fe['literal']`
            :param dict data: Classification results; `lu` and `url` are read
            :returns: The result of `wikidata.finalize_statement` for each date
            :type: generator
        """
        literal = fe['literal']
        mapping = self.lu_fe_map.get((data['lu'], fe['fe']), {})
        wikidata_property = mapping.get('wid')
        if not wikidata_property:
            logger.debug('skipping *numerical* FE of type "%s" and lu "%s"',
                         fe['fe'], data['lu'])
            return

        if fe['fe'] == 'Time':
            dates = [literal]
        elif fe['fe'] == 'Duration':
            dates = [literal[bound] for bound in ('start', 'end') if bound in literal]
        else:
            dates = []

        for date in dates:
            value = wikidata.format_date(**date)
            yield wikidata.finalize_statement(subj, wikidata_property, value, self.language, data['url'],
                                              resolve_property=False, resolve_value=False)
Пример #5
0
    def resolve_genealogics_family(self, input_file, url_to_id):
        """ Performs a second pass on genealogics to resolve additional family members

            Each row is a JSON object; rows whose `url` is not a key of
            `url_to_id` are ignored. For the others, the family relations
            found under `other` are turned into statements whenever the url of
            the relative is also a key of `url_to_id`.

            :param input_file: Iterable of JSON encoded rows
            :param dict url_to_id: Maps urls to the corresponding subject
            :returns: Tuples <success, item> where item is a statement if
             success is true else it is the name of the family member which
             could not be resolved
            :type: generator
        """
        family_properties = {
            'Family': 'P1038',
            'Father': 'P22',
            'Married': 'P26',
            'Mother': 'P25',
            u'Children\xa0': 'P40',
        }

        for row in input_file:
            data = json.loads(row)
            if not ('url' in data and data['url'] in url_to_id):
                continue

            subj = url_to_id[data['url']]
            for key, value in data.get('other', {}).iteritems():
                prop = family_properties.get(key)
                if prop is None:
                    # not a family relation
                    continue

                if not isinstance(value, list):
                    logger.debug('unexpected value "%s", property "%s" subject %s',
                                 value, key, subj)
                    continue

                for member in value:
                    for name, url in member.iteritems():
                        if url not in url_to_id:
                            logger.debug('skipping "%s" (%s), %s of/with %s',
                                         name.strip(), url, key, subj)
                            yield False, name
                            continue

                        val = url_to_id[url]
                        logger.debug('resolved "%s", %s of/with %s to %s',
                                     name.strip(), key, subj, val)
                        yield True, wikidata.finalize_statement(
                            subj, prop, val, self.language, data['url'],
                            resolve_property=False, resolve_value=False
                        )
Пример #6
0
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            Items without url are skipped altogether. For every subject
            returned by `self.get_subjects`, each FE other than the subject
            itself is serialized: `Time` and `Duration` FEs go through
            `self.serialize_numerical`, the others are resolved against
            wikidata using the property found in `self.fe_to_wid`.

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a str or a dict
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :type: generator
        """
        if input_encoded:
            data = json.loads(data)

        url = data.get('url')
        if not url:
            logger.warn('skipping item without url')
            return

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warn(
                    "Could not resolve Wikidata Item ID of subject '%s'", name)
                unresolved_subject = {
                    'chunk': name,
                    'additional': {
                        'sentence': data['text'],
                        'url': url
                    }
                }
                yield False, unresolved_subject
                continue

            for fe in data['fes']:
                # do not add a statement for the current subject
                if fe['chunk'] == name:
                    continue

                fe_type = fe['fe']
                if fe_type in ('Time', 'Duration'):
                    for statement in self.serialize_numerical(subj, fe, url):
                        yield True, statement
                    continue

                prop = self.fe_to_wid.get(fe_type)
                if not prop:
                    logger.debug('unknown fe type %s, skipping', fe_type)
                    continue

                val = wikidata.resolve(prop, fe['chunk'], self.language)
                if not val:
                    logger.debug(
                        'could not resolve chunk "%s" of fe %s (property is %s)',
                        fe['chunk'], fe_type, prop)
                    unresolved_chunk = {
                        'chunk': fe['chunk'],
                        'additional': {
                            'fe': fe,
                            'sentence': data['text'],
                            'url': url
                        }
                    }
                    yield False, unresolved_chunk
                    continue

                yield True, wikidata.finalize_statement(
                    subj,
                    prop,
                    val,
                    self.language,
                    url,
                    resolve_property=False,
                    resolve_value=False)
Пример #7
0
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            Items without url are skipped altogether. For every subject
            returned by `self.get_subjects`, each FE other than the subject
            itself is serialized: `Time` and `Duration` FEs go through
            `self.serialize_numerical`, the others are mapped to a property
            through `self.lu_fe_map`, checked against the types expected for
            that FE and finally resolved to a wikidata item, falling back to a
            default item when the resolution fails.

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a str or a dict
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :type: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get('url')
        if not url:
            logger.warn('skipping item without url')
            return

        type_prefix_length = len('http://dbpedia.org/ontology/')

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warn("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {'chunk': name, 'additional': {'sentence': data['text'], 'url': url}}
                continue

            all_qualifiers = self.find_qualifiers(data['fes'])
            for fe in data['fes']:
                if fe['chunk'] == name:  # do not add a statement for the current subject
                    continue

                if fe['fe'] in ['Time', 'Duration']:
                    for each in self.serialize_numerical(subj, fe, data):
                        yield True, each
                    continue

                # single lookup of everything the map knows about this (LU, FE) pair
                fe_info = self.lu_fe_map.get((data['lu'], fe['fe']), {})
                prop = fe_info.get('wid')
                if not prop:
                    logger.debug('unknown fe type %s for LU %s, skipping', fe['fe'], data['lu'])
                    continue

                # the link is optional (see below) and may carry no types:
                # default to an empty collection instead of iterating None
                linked_types = fe.get('link', {}).get('types') or ()
                chunk_types = set(t[type_prefix_length:] for t in linked_types)
                fe_types = fe_info.get('types', set())
                # types are only enforced when both sides declare some
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                                 'expected: %s actual %s', fe['chunk'], fe['fe'], fe_types, chunk_types)
                    continue

                val = None
                if 'link' in fe:
                    uri = fe['link']['uri']
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)

                if not val:
                    val = wikidata.resolve(prop, fe['chunk'], self.language)

                if not val:
                    val = 'Q19798648'
                    logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                                 'using default value of %s',
                                 fe['chunk'], fe['fe'], prop, val)

                # flat sequence: property, value, property, value, ...
                stmt_qualifiers = []
                for qualifier_property in fe_info.get('qualifiers', []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url, qualifiers=stmt_qualifiers,
                    resolve_property=False, resolve_value=False
                )