def process_entities(args):
    wikisite = args.wikisite
    if wikisite == "zhwiki":
        title_db, redir_db = 1, 3
    elif wikisite == "enwiki":
        title_db, redir_db = 4, 5
    else:
        raise ValueError("unsupported wikisite: %s" % wikisite)

    title_idx = get_redis_title(title_db)
    redir_idx = get_redis_redir(redir_db)

    logging.info('collect titles for entities...')

    # thousands level
    title_bulk = set()
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            continue
        title = canonicalize(title, redir_idx)
        title_bulk.add(title)
    logging.info("got %d title out of %d entities" % (len(title_bulk), i))

    logging.debug('====================')
    logging.debug('required: ' + u','.join(title_bulk).encode('utf-8'))

    title_text = bulk_query_wikipage(title_idx, title_bulk, redir_idx)
    logging.info("found page amount: %d" % len(title_text))

    logging.debug('====================')
    logging.debug('got text: ' + u','.join(title_text.keys()).encode('utf-8'))

    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            title = ""

        title = canonicalize(title, redir_idx)
        text = title_text.get(title)
        if text is None:
            if title:
                logging.warning("specified title %s not found" %
                                title.encode('utf-8'))
            text = ""
        else:
            logging.debug("title %s found" % title.encode('utf-8'))

        entity[wikisite] = text
        output.write(json.dumps(entity) + '\n')

        if i % 500 == 0:
            logging.info("%d entities processed" % i)
            output.flush()
    output.close()
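Both passes above stream args.entities through the reader helper, which is not shown in any of these examples. Below is a minimal sketch of what it is assumed to do, namely yield one JSON entity per line of a (possibly gzipped) dump file; the name and behaviour are assumptions, not the original implementation.

import gzip
import json

def reader(path):
    # Assumed helper: iterate a JSON-lines dump, yielding one entity per line.
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rb') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except ValueError:
                continue  # skip malformed lines instead of aborting the pass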
Example #2
def process_entities(args):
    logging.info('building title to wiki filename mapping...')
    title_idx = load_idx_mapping(args.wiki_index)
    logging.info('building title redirect mapping...')
    redir_idx = load_redir_mapping(args.redir_file)

    logging.info('collect titles for entities...')

    # thousands level
    wikisite = args.wikisite
    title_bulk = set()
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            continue
        title = redirect(redir_idx, title)
        title = cc.convert(title)
        title_bulk.add(title)
    logging.info("got %d title out of %d entities" % (len(title_bulk), i))

    title_text = bulk_query_wikipage(title_idx, title_bulk)
    logging.info("found page amount: %d" % len(title_text))

    logging.debug('====================')
    logging.debug('got text: ' + u','.join(title_text.keys()).encode('utf-8'))
    logging.debug('--------------------')
    logging.debug('required: ' + u','.join(title_bulk).encode('utf-8'))

    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            title = ""

        title = redirect(redir_idx, title)
        title = cc.convert(title)
        text = title_text.get(title)
        if text is None:
            if title:
                logging.warning("specified title %s not found" %
                                title.encode('utf-8'))
            text = ""
        else:
            logging.debug("title %s found" % title.encode('utf-8'))

        entity[wikisite] = text
        output.write(json.dumps(entity) + '\n')

        if i % 500 == 0:
            logging.info("%d entities processed" % i)
            output.flush()
    output.close()
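The redirect helper used above (and the shape of the mapping returned by load_redir_mapping) is not included in these examples. A hedged sketch follows, assuming the mapping is a plain dict from redirect title to target title and that chains should be followed to a fixed point.

def redirect(redir_idx, title, max_hops=8):
    # Assumed behaviour: follow redirect chains until a non-redirect title is
    # reached, guarding against cycles and unreasonably long chains.
    seen = set()
    while title in redir_idx and title not in seen and len(seen) < max_hops:
        seen.add(title)
        title = redir_idx[title]
    return title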
def find_neighbor(args):

    filelist = [l.rstrip() for l in args.filelist]
    
    qids = set(v for _, _, v in next_neighbor_id(reader(args.entities)))

    written_qids = set()
    output = open(args.output, 'w')
    for d in xrange(args.depth):
        logging.info('========> %d-hop neighbors amount: %d' % (d + 1, len(qids)))

        if args.debug:
            logging.debug(u','.join(qids).encode('utf-8'))
            break

        next_qids = set()
        for i, e in enumerate(next_required_entity_from_files(qids, filelist)):
            if e['id'] not in written_qids:
                output.write(json.dumps(e) + '\n')
                written_qids.add(e['id'])

                for v in next_neighbor_of_entity(e):
                    next_qids.add(v)

            if i % 1000 == 0:
                logging.info("found %d neighbors so far by reading files, %d have been written" % (i, len(written_qids)))
                output.flush()

        qids = next_qids

    output.close()
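next_neighbor_of_entity and the related next_neighbor_id are not shown either. A hedged sketch of the expansion step, assuming a neighbor is any QID referenced by one of the entity's claims via the standard mainsnak.datavalue.value.id layout:

import re

def next_neighbor_of_entity(entity):
    # Assumed helper: yield every QID this entity's claims point at.
    for claims in entity.get('claims', {}).values():
        for claim in claims:
            try:
                value = claim['mainsnak']['datavalue']['value']
            except (KeyError, TypeError):
                continue
            if isinstance(value, dict) and re.match(r'^Q\d+$', value.get('id', u'')):
                yield value['id']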
Example #5
def collect_categories(args):
    categories = set()
    for i, entity in enumerate(reader(args.input)):
        if i % 20000 == 0:
            logging.info(
                "%d data (first) processed: %s" %
                (i, datetime.datetime.now().strftime('%m%d-%H:%M:%S')))
            if args.debug and i / 20000 == 1:
                break

        try:
            subclass_claims = entity['claims']['P279']
            values = filter(None,
                            (claim_value(claim) for claim in subclass_claims))

            # An entity with a non-empty 'subclass_of' property is itself a class.
            # The parent entities indicated by the 'subclass_of' property are all classes.
            # If the class is an 'instance_of' another entity, the other one must also be a class.
            if values:
                categories.add(entity['id'])

                categories.update(values)
                instance_claims = entity['claims']['P31']
                values = filter(None, (claim_value(claim)
                                       for claim in instance_claims))
                categories.update(values)

        except KeyError:
            continue

    return categories
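collect_categories depends on claim_value to turn a P279/P31 claim into a QID string. The real helper is not part of these examples; here is a minimal sketch based on the standard Wikidata claim layout (mainsnak.datavalue.value.id), returning None when a claim carries no entity value.

def claim_value(claim):
    # Assumed helper: extract the target QID of a wikibase-entityid claim.
    try:
        value = claim['mainsnak']['datavalue']['value']
    except (KeyError, TypeError):
        return None
    if isinstance(value, dict):
        return value.get('id')  # e.g. u'Q5'
    return None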
Example #7
def stat(args):
    for i, entity in enumerate(reader(args.entity_file)):
        if i / 50000 == 1:
            break
        
        for claims in entity['claims'].itervalues():
            for c in claims:
                # datatype, datavalue.type, datavalue.value
                x = extract_kv(c)
                print entity['id'].encode('utf-8') + '\t' + u'\t'.join(x).encode('utf-8')
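extract_kv is likewise not shown; a hedged sketch matching the comment above (datatype, datavalue.type, datavalue.value), flattening non-string values to JSON so the tab-separated print stays one line per claim:

import json

def extract_kv(claim):
    # Assumed helper: report a claim's datatype, value type and a flat value.
    snak = claim.get('mainsnak', {})
    datatype = snak.get('datatype', u'')
    datavalue = snak.get('datavalue', {})
    vtype = datavalue.get('type', u'')
    value = datavalue.get('value', u'')
    if not isinstance(value, basestring):
        value = json.dumps(value, ensure_ascii=False)
    return datatype, vtype, value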
def next_required_entity_from_files(qids, filelist):
    i = 0
    for f in filelist:
        logging.info("reading file %s ..." % f)
        for e in reader(f):
            if i % 10000 == 0:
                logging.info('read %d input entities in file' % i)

            i += 1
            qid = e['id']
            if qid in qids:
                yield e
Example #9
def extract(args):
    categories = collect_categories(args)

    logging.info('got all categories, count %d' % len(categories))
    logging.debug('first 100 categories repr: ' +
                  ','.join(repr(x) for x in list(categories)[:100]))

    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.input)):
        if i % 20000 == 0:
            logging.info(
                "%d data (first) processed: %s" %
                (i, datetime.datetime.now().strftime('%m%d-%H:%M:%S')))
            if args.debug and i / 20000 == 1:
                break

        try:
            qid = entity['id']
        except KeyError:
            continue

        if qid not in categories: continue

        if 'claims' not in entity: continue
        claims = entity['claims']
        subclass_claims = claims['P279'] if 'P279' in claims else []  # subclass_of
        instance_claims = claims['P31'] if 'P31' in claims else []  # instance_of
        if len(subclass_claims) + len(instance_claims) == 0: continue

        subclass_values = filter(None, (claim_value(claim)
                                        for claim in subclass_claims))
        instance_values = filter(None, (claim_value(claim)
                                        for claim in instance_claims))

        new_entity = {}
        new_entity['id'] = entity['id']
        try:
            new_entity['enlabel'] = entity['labels']['en']['value']
        except KeyError:
            new_entity['enlabel'] = ""
        try:
            new_entity['zhlabel'] = entity['labels']['zh']['value']
        except KeyError:
            new_entity['zhlabel'] = ""
        new_entity['pids'] = subclass_values + instance_values

        output.write(json.dumps(new_entity) + "\n")
    output.close()
Example #10
def main(filename, dbname, colname):
    conn = pymongo.MongoClient()
    db = conn.get_database(dbname)
    col = db.get_collection(colname)

    logging.getLogger().setLevel(logging.INFO)

    for obj in reader(filename):
        try:
            col.insert_one(obj)
            logging.info('inserted id=%s' % str(obj['id']))
        except pymongo.errors.DuplicateKeyError:
            logging.info('duplicated id=%s' % str(obj['id']))
            continue
        except Exception:
            logging.warning('exception when writing obj %s' % str(obj['id']))
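The DuplicateKeyError branch above only fires if the target collection enforces uniqueness on the relevant field; pymongo does not deduplicate by itself. A hedged one-time setup sketch (database and collection names are placeholders):

import pymongo

# Make 'id' unique so repeated loads of the same dump stay idempotent and
# insert_one raises DuplicateKeyError instead of storing duplicates.
conn = pymongo.MongoClient()
col = conn.get_database('wikidata').get_collection('entities')
col.create_index([('id', pymongo.ASCENDING)], unique=True)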
def bulk_query_wikipage(title_idx, titles, redir_idx={}):
    filelist = sorted(set(filter(None, (title_idx.get(t) for t in titles))))
    logging.info('there are %d files to read, given these %d titles' % (len(filelist), len(titles)))
    texts = {}
    # scan all filelist
    for f in filelist:
        logging.info("reading file %s .." % f)
        for page in reader(f):
            try:
                title = page['title']
                title = cc.convert(redirect(redir_idx, title))
                if title in titles:
                    texts[title] = page['text']
            except KeyError:
                continue
    return texts
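The module-level cc object used in the zhwiki examples is not defined anywhere in these snippets; given the Traditional/Simplified mismatch between zhwiki page titles and Wikidata sitelinks, it is presumably an OpenCC converter. A hedged setup sketch (the 't2s' conversion profile is an assumption):

from opencc import OpenCC

# Assumed global converter: fold titles to Simplified Chinese so titles taken
# from Wikidata sitelinks and titles read from the zhwiki dump compare equal.
cc = OpenCC('t2s')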
Example #14
def bulk_query_wikidata(qids, wikidata_idx):
    filelist = sorted(set(mquery_redis_idx(qids, wikidata_idx)))
    logging.info('there are %d files to read for %d entities' % (len(filelist), len(qids)))
    entities = {}
    for f in filelist:
        logging.info("reading file %s ..." % f)
        for i, entity in enumerate(reader(f)):
            try:
                qid = entity['id']
            except KeyError:
                continue

            if qid in qids and qid not in entities:
                entities[qid] = entity

            if i % 10000 == 0:
                logging.info("%d items processed, current %s" % (i, qid))

        logging.info("%d items cumulated after reading file %s" % (len(entities), f))
    return entities
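get_redis_wikidata and mquery_redis_idx are not shown. A hedged sketch, assuming a Redis database maps each QID to the name of the dump file containing it, so many QIDs can be resolved in a single MGET round trip (host, port and db number are placeholders):

import redis

def get_redis_wikidata(host='127.0.0.1', port=6379, db=0):
    # Assumed index handle: QID -> dump file name.
    return redis.StrictRedis(host=host, port=port, db=db)

def mquery_redis_idx(qids, idx):
    # Resolve many QIDs in one round trip; skip QIDs with no index entry.
    for filename in idx.mget(list(qids)):
        if filename:
            yield filename.decode('utf-8') if isinstance(filename, bytes) else filename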
Example #15
def find_neighbor(args):
    wikidata_idx = get_redis_wikidata()
    es = list(x for x in reader(args.entities))
    logging.info('read %d input entities' % len(es))
    output = open(args.output, 'w')
    for l in xrange(args.depth):
        if len(es) == 0:
            logging.info("early breaking at layer %d because of empty entities" % l)
            break

        logging.info('find %d-hop neighbors for %d inputs' % (l, len(es)))
        qids = set(filter(lambda v: v is not None and re.match(r'^Q\d+$', v),
            (claim_value(c) for e in es for _, cs in e['claims'].iteritems() for c in cs)))
        logging.debug('qids: %s' % u','.join(qids).encode('utf-8'))
        logging.info('there\'re %d neighbors to read' % len(qids))
        es_d = bulk_query_wikidata(qids, wikidata_idx)
        es = [x for x in es_d.itervalues()]

        for e in es:
            output.write(json.dumps(e) + '\n')
def find_entities_by_kinships(args, kinships):
    logging.info('now filtering entities by kinships...')

    # iterate over the outputs
    output = open(args.output, 'w')
    dataset = reader_for_list(args.input_filelist) if args.input_filelist else reader(args.input)
    for i, entity in enumerate(dataset):
        if 'claims' not in entity: continue
        claims = entity['claims']

        subclass_claims = claims['P279'] if 'P279' in claims else [] # subclass_of
        instance_claims = claims['P31'] if 'P31' in claims else [] # instance_of
        if len(subclass_claims) + len(instance_claims) == 0: continue

        categories = filter(lambda x: x is not None, (claim_value(claim) for claim in subclass_claims))
        classes = filter(lambda x: x is not None, (claim_value(claim) for claim in instance_claims))

        if any(x in kinships for x in categories + classes):
            output.write(json.dumps(entity) + '\n')

        if i % 20000 == 0:
            logging.info('categories: %s, classes: %s' % (repr(categories), repr(classes)))
            logging.info('%d entities iterated over: %s' % (i, datetime.datetime.now().strftime('%Y%m%d%H%M%S')))
    output.close()
Example #17
trainer_id = int(sys.argv[1])  # trainer id for each guest
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
# print(job._trainer_send_program)

trainer = FLTrainerFactory().create_fl_trainer(job)
use_cuda = False
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer._current_ep = "127.0.0.1:8192"
trainer.start(place=place)
trainer._logger.setLevel(logging.DEBUG)

g = reader()
if trainer_id > 0:
    for i in range(trainer_id):
        next(g)
data = next(g)
print(data)

output_folder = "fl_model"
step_i = 0
while not trainer.stop():
    step_i += 1
    print("batch %d start train" % step_i)
    trainer.run(feed=data, fetch=[])
    if trainer_id == 0:
        print("start saving model")
        trainer.save_inference_program(output_folder)
Example #18
def dataextractor(filelist):
    for f in open(filelist):
        f = f.rstrip()
        for i, entity in enumerate(reader(f)):
            qid = entity['id']
            yield qid, f, i + 1
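A hypothetical driver for dataextractor, assuming filelist is a text file with one dump path per line; the command-line wiring is an assumption, not part of the original example.

if __name__ == '__main__':
    import sys
    # Print a (qid, source file, line number) triple for every entity found.
    for qid, path, lineno in dataextractor(sys.argv[1]):
        print('%s\t%s\t%d' % (qid, path, lineno))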