Example #1
0
class MigrateDB(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.DBIPFrom = local_configure["DBIPFrom"]
        self.DBIPTo = local_configure["DBIPTo"]
        self.DBPortFrom = local_configure["DBPortFrom"]
        self.DBPortTo = local_configure["DBPortTo"]
        self.DBFrom = local_configure["DBFrom"]
        self.DBTo = local_configure["DBTo"]
        self.MongoFrom = MongoDB(db_server_ip=self.DBIPFrom,
                                 db_server_port=self.DBPortFrom,
                                 database_name=self.DBFrom)
        self.MongoTo = MongoDB(db_server_ip=self.DBIPTo,
                               db_server_port=self.DBPortTo,
                               database_name=self.DBTo)

    def execute(self):
        print("From Relevant docs count:",
              self.MongoFrom.db["RelevantType"].count())
        print("To Relevant docs count:",
              self.MongoTo.db["RelevantType"].count())
        print("Begin migrating DB ...")
        documents = list(self.MongoFrom.db["RelevantType"].find())
        result1 = self.MongoTo.save_to_mongodb_many("RelevantType", documents)
        print("Relevant Finished!", self.MongoTo.db["RelevantType"].count())

        print("From Relevant docs count:",
              self.MongoFrom.db["FutureUseType"].count())
        print("To Relevant docs count:",
              self.MongoTo.db["FutureUseType"].count())
        print("Begin migrating DB ...")
        documents = list(self.MongoFrom.db["FutureUseType"].find())
        result2 = self.MongoTo.save_to_mongodb_many("FutureUseType", documents)
        print("FutureUse Finished!", self.MongoTo.db["FutureUseType"].count())
Example #2
0
    def __init__(self, local_configure):
        super().__init__()
        self.seed_entity_id = local_configure["seed_entity_id"]
        self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.entityType = EntityType(local_configure["EntityType"])
Example #3
0
    def __init__(self, local_configure):
        super().__init__()
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.process_number_max = local_configure["process_number_max"]
        self.batch_size = local_configure["batch_size"]
        self.debug = local_configure["debug"]
        if self.debug:
            self.process_number_max = 5
            self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        else:
            self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.22")
Example #4
0
    def __init__(self, local_configure, global_configure):
        super().__init__()
        self.global_configure = global_configure
        self.local_configure = local_configure
        self.db_name = local_configure["db_name"]
        self.collection_name = local_configure["collection_name"]
        self.data_dir = local_configure["data_dir"]
        self.db_interface = MongoDB(db_server_ip="10.93.128.143",
                                    db_server_port=27017,
                                    database_name=self.db_name)
        self.non_id_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])", re.U)
        self.system_logger = logging.getLogger("system_log")
        pass
Example #5
0
    def __init__(self, local_configure):
        super().__init__()
        self.DBIPFrom = local_configure["DBIPFrom"]
        self.DBIPTo = local_configure["DBIPTo"]
        self.DBPortFrom = local_configure["DBPortFrom"]
        self.DBPortTo = local_configure["DBPortTo"]
        self.DBFrom = local_configure["DBFrom"]
        self.DBTo = local_configure["DBTo"]
        self.MongoFrom = MongoDB(db_server_ip=self.DBIPFrom,
                                 db_server_port=self.DBPortFrom,
                                 database_name=self.DBFrom)
        self.MongoTo = MongoDB(db_server_ip=self.DBIPTo,
                               db_server_port=self.DBPortTo,
                               database_name=self.DBTo)
Example #6
0
    def __init__(self, search_warehouse_configure):

        self.name = search_warehouse_configure.name
        self.host = search_warehouse_configure.host
        self.port = search_warehouse_configure.port
        self.db_name = search_warehouse_configure.db_name
        self.user = search_warehouse_configure.user
        self.pwd = search_warehouse_configure.pwd
        self.sentence_collection_name = search_warehouse_configure.sentence_collection_name
        self.index_dir = search_warehouse_configure.index_dir
        self.memcache_ip_port = search_warehouse_configure.memcache_ip_port

        self.db_client = MongoDB(self.host, self.port, self.db_name, self.user,
                                 self.pwd)
        self.sentence_collection = self.db_client.db[
            self.sentence_collection_name]

        # Pick the index template: if there is a single .bin file use it;
        # otherwise take the newest one by the 8-digit date prefix of its name.
        tmp_files = get_files(self.index_dir, r'.*bin')
        if len(tmp_files) == 1:
            self.index_template = tmp_files[0].replace(".bin", "")
        else:
            files_date = {}
            for filename in tmp_files:
                files_date[filename] = int(os.path.basename(filename)[:8])
            sorted_filenames = sorted(files_date.items(),
                                      key=lambda x: x[1],
                                      reverse=True)
            self.index_template = sorted_filenames[0][0].replace(".bin", "")

        index2pos, bin_handle, word2sentence_tf = load_index(
            self.index_template)

        self.word_index_to_position_in_file = index2pos
        self.index_bin_file_handle = bin_handle
        self.word_index_to_sentence_tf = word2sentence_tf
Example #7
0
def process_chunk(db_configure, query, ids):
    try:
        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        wrap_db = MongoDB(db_configure["host"], int(db_configure["port"]),
                          db_configure["db_name"], db_configure['user'],
                          db_configure['pwd'])
        input_collection = wrap_db.db[db_configure["input_collection_name"]]

        df_dict = defaultdict(int)
        query['_id'] = {'$in': ids}
        response = input_collection.find(query)
        for data in response:
            word_exist = set()
            nlu = process_one(data)
            json = nlu.to_json()
            for i in json:
                ins = json[i]
                for item in ins['分词词性']:
                    elems = item.split("/")
                    word = elems[0].strip()
                    if word == "" or elems[-1] == "True" or elems[1] == "wp":
                        continue
                    word_exist.add(word)

            for word in word_exist:
                df_dict[word] += 1

        return df_dict
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % (exc_type, exc_obj,
                                                  exc_tb.tb_lineno)
        print(msg)
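
The worker above returns a per-chunk document-frequency dict, but the code that dispatches chunks and merges the results is not part of this example. A possible driver is sketched below; the chunking scheme, pool size, and the merge-by-summation step are assumptions. Connections are created inside the worker precisely because pymongo is not fork-safe.

# Hypothetical driver for process_chunk above; not part of the original code.
import multiprocessing
from collections import defaultdict


def build_document_frequency(db_configure, query, all_ids,
                             nworkers=8, chunk_size=10000):
    chunks = [all_ids[i:i + chunk_size]
              for i in range(0, len(all_ids), chunk_size)]
    merged = defaultdict(int)
    with multiprocessing.Pool(processes=nworkers) as pool:
        # Pass a copy of the query because the worker mutates its '_id' key
        results = [pool.apply_async(process_chunk,
                                    (db_configure, dict(query), ids))
                   for ids in chunks]
        for r in results:
            df_dict = r.get()
            if not df_dict:  # the worker returns None after an unexpected error
                continue
            for word, df in df_dict.items():
                merged[word] += df
    return merged
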
Example #8
0
    def __init__(self, local_configure):

        super().__init__()
        self.system_logger = logging.getLogger("system_log")
        self.nworkers = int(local_configure['nworkers'])
        self.db_config = {
            'host': local_configure['host'],
            'port': int(local_configure['port']),
            'user': local_configure['user'],
            'pwd': local_configure['pwd'],
            'input_db_name': local_configure['input_db_name'],
            'sentence_collection_name':
            local_configure['sentence_collection_name']
        }

        self.wrap_processed_db = MongoDB(self.db_config["host"],
                                         self.db_config["port"],
                                         self.db_config['input_db_name'],
                                         self.db_config['user'],
                                         self.db_config['pwd'])
        self.sentence_collection = self.wrap_processed_db.db[
            self.db_config['sentence_collection_name']]
        self.output_file = local_configure['output_file']

        pass
Example #9
0
    def __init__(self, local_configure):

        super().__init__()

        self.local_configure = local_configure
        self.output_file = local_configure["output_file"]
        self.nworkers = int(local_configure["nworkers"])

        self.db_config = {
            'host': local_configure['host'],
            'port': int(local_configure['port']),
            'db_name': local_configure['db_name'],
            'input_collection_name': local_configure['input_collection_name'],
            'user': local_configure['user'],
            'pwd': local_configure['pwd'],
        }

        self.wrap_db = MongoDB(self.db_config['host'],
                               int(self.db_config['port']),
                               self.db_config['db_name'],
                               self.db_config['user'], self.db_config['pwd'])
        self.input_collection = self.wrap_db.db[
            self.db_config['input_collection_name']]

        self.non_id_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])", re.U)
        self.non_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])")
        self.exclude_category = ["kuakua", "kuaixun"]

        self.system_logger = logging.getLogger("system_log")
        pass
Example #10
0
    def process(self):
        self.system_logger.info('initializing...')
        os.makedirs(self.output_dir, exist_ok=True)
        visualizer.copy_config_files(self.output_dir)
        visualizer.KBID_PREFIX['Tencent_KG'] = self.kg_api_url

        self.system_logger.info('searching...')
        input_db = MongoDB(self.db_config['host'], self.db_config['port'],
                           self.db_config['input_db_name'],
                           self.db_config['user'], self.db_config['pwd'])
        input_collection = input_db.db[self.db_config['input_collection_name']]

        query = {"import_date": {"$gt": self.import_date}}
        try:
            docids = input_collection.find(query).distinct('docid')
        except pymongo.errors.OperationFailure:
            # TO-DO: fix the following error:
            # pymongo.errors.OperationFailure: distinct too big, 16mb cap
            docids = [i['docid'] for i in input_collection.find(query)]

        chunk_size = int(len(docids) / self.nworkers)
        self.system_logger.info('# of docs found: %s' % len(docids))
        if len(docids) == 0:
            return
        if chunk_size == 0:
            chunk_size = len(docids)

        self.system_logger.info('# of workers: %s' % self.nworkers)
        self.system_logger.info('chunk size: %s' % chunk_size)
        chunks = []
        for i in range(0, len(docids), chunk_size):
            chunks.append(slice(i, i + chunk_size))
        self.system_logger.info('parent pid: %s' % os.getpid())
        self.system_logger.info('processing...')

        # # Single processing
        # for c in chunks:
        #     process_chunk(self.db_config, docids[c], self.output_dir)

        # Multi-processing
        pool = multiprocessing.Pool(processes=self.nworkers)
        for c in chunks:
            args = (
                self.db_config,
                docids[c],
                self.output_dir,
            )
            pool.apply_async(
                process_chunk,
                args=args,
            )
        pool.close()
        pool.join()
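
The TO-DO above notes that distinct('docid') can fail once the result exceeds the 16 MB cap, and the current fallback re-reads every matching document. One common workaround, not used in the original code, is to deduplicate server-side with an aggregation pipeline, for example:

# Hedged alternative to the distinct() fallback above: group on docid in an
# aggregation pipeline so deduplication happens server-side and is not
# subject to the 16 MB distinct result limit.
def find_distinct_docids(collection, query):
    pipeline = [
        {"$match": query},
        {"$group": {"_id": "$docid"}},
    ]
    return [doc["_id"]
            for doc in collection.aggregate(pipeline, allowDiskUse=True)]
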
Example #11
0
    def get_names(self, props):
        '''
        Extract name list from Tencent KG
        '''
        self.system_logger.info('host: %s' % self.db_config['host'])
        self.system_logger.info('port: %s' % self.db_config['port'])
        self.system_logger.info('db name: %s' % self.db_config['db_name'])
        self.system_logger.info('collection name: %s' %
                                self.db_config['input_collection_name'])

        client = MongoDB(self.db_config['host'], self.db_config['port'],
                         self.db_config['db_name'], self.db_config['user'],
                         self.db_config['pwd'])
        collection = client.db[self.db_config['input_collection_name']]

        kbid2names = defaultdict(lambda: defaultdict(int))
        kbid2types = {}
        kbid2hypernyms = {}
        kbid2popularity = {}
        count = defaultdict(int)
        res = collection.find({})
        self.system_logger.info('# of entries found: %s' % res.count())
        for i in res:
            for p in props:
                try:
                    # if not set([x[0] for x in i['types']]).intersection(TYPES):
                    #     continue
                    kbid2types[i['_id']] = [x[0] for x in i['types']]
                except KeyError:
                    count['missing_type'] += 1

                try:
                    for name in i[p]:
                        kbid2names[i['_id']][name[0]] += 1
                except KeyError:
                    count['missing_%s' % p] += 1

                try:
                    kbid2hypernyms[i['_id']] = [x[0] for x in i['精选上位词']]
                except KeyError:
                    count['missing_hypernyms'] += 1

                try:
                    kbid2popularity[i['_id']] = int(i['popular'][0][0])
                except KeyError:
                    count['missing_popularity'] += 1

        self.system_logger.info('Missing properties:')
        for i in count:
            self.system_logger.info('  %s: %s' % (i, count[i]))

        return kbid2names, kbid2types, kbid2hypernyms, kbid2popularity
Example #12
0
    def __init__(self, local_configure):
        self.host = local_configure['db_host']
        self.port = int(local_configure['db_port'])
        self.db_name = local_configure['db_name']
        self.user = local_configure['user']
        self.pwd = local_configure['pwd']
        self.kg_collection_name = local_configure['kg_collection_name']
        self.linker_collection_name = local_configure['linker_collection_name']

        self.client = MongoDB(self.host, self.port, self.db_name, self.user,
                              self.pwd)
        self.collection_kg = self.client.db[self.kg_collection_name]
        self.collection_mentions = self.client.db[self.linker_collection_name]
Example #13
0
    def __init__(self, local_configure):

        super().__init__()
        self.data_dir = local_configure['data_dir']
        self.output_path = local_configure['output_path']
        self.keywords = local_configure['keywords']
        self.mongo = MongoDB(local_configure['host'],
                             int(local_configure['port']),
                             local_configure['db_name'],
                             local_configure['user'], local_configure['pwd'])
        self.collection = self.mongo.db[local_configure['collection_name']]
        self.system_logger = logging.getLogger("system_log")
        pass
Example #14
0
    def __init__(self, entity_warehouse_configure):

        self.name = entity_warehouse_configure.name
        self.host = entity_warehouse_configure.host
        self.port = entity_warehouse_configure.port
        self.db_name = entity_warehouse_configure.db_name
        self.user = entity_warehouse_configure.user
        self.pwd = entity_warehouse_configure.pwd
        self.collection_name = entity_warehouse_configure.entity_collection_name
        self.mentions_name = entity_warehouse_configure.entity_mentions_name
        self.memcache_ip_port = entity_warehouse_configure.memcache_ip_port

        self.db_client = MongoDB(self.host, self.port, self.db_name, self.user, self.pwd)
        self.collection = self.db_client.db[self.collection_name]
        self.mentions = self.db_client.db[self.mentions_name]
Example #15
0
def process_chunk(db_config, docids, output_dir):
    try:
        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        input_db = MongoDB(db_config['host'], db_config['port'],
                           db_config['input_db_name'], db_config['user'],
                           db_config['pwd'])
        input_collection = input_db.db[db_config['input_collection_name']]

        for docid in docids:
            html = []
            response = input_collection.find({'docid': docid})
            for sent in response:
                # text = ''.join([x.split('/')[0] for x in sent['分词词性']])
                text = sent['raw_sentence']
                entitymentions = []
                if sent['实体链接']:
                    for i in sent['实体链接']:
                        if i['entity']:
                            en = Entity(i['entity']['kbid'])
                        else:
                            en = None
                        em = EntityMention(
                            i['entity_mention'],
                            beg=i['beg'] - sent['sentence_start'],
                            end=i['end'] - sent['sentence_start'],
                            entity=en)
                        entitymentions.append(em)
                h = visualizer.visualize(text, entitymentions, stats=True)
                html.append(h)
            html = visualizer.HTML_TEMPLATE % '<br>\n'.join(html)

            outpath = '%s/%s.html' % (output_dir, docid)
            with open(outpath, 'w') as fw:
                fw.write(html)

    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
            (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
Example #16
0
def process_chunk(db_config, ids):

    try:
        wrap_processed_db = MongoDB(db_config["host"], db_config["port"],
                                    db_config['input_db_name'],
                                    db_config['user'], db_config['pwd'])
        sentence_collection = wrap_processed_db.db[
            db_config['sentence_collection_name']]
        sentences = sentence_collection.find({"_id": {"$in": ids}})

        entity2sentence_candidates = defaultdict()
        for one_sentence in sentences:

            sentence_entity = {}
            for entity_mention in one_sentence["实体链接"]:
                entity_text = entity_mention["entity_mention"]
                entity_id = None
                if "entity" in entity_mention and entity_mention[
                        "entity"] is not None:
                    entity_id = entity_mention["entity"]["kbid"]
                if entity_id is None:
                    continue
                sentence_entity[entity_id] = entity_text

            for id, text in sentence_entity.items():
                entity_key = id + "_" + text
                if entity_key not in entity2sentence_candidates:
                    entity2sentence_candidates[entity_key] = []
                entity2sentence_candidates[entity_key].append(
                    one_sentence["_id"])

        return entity2sentence_candidates
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
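
As with the other chunk workers, the step that merges the per-chunk candidate maps is not shown in this example. A sketch of the assumed merge follows; the function name is illustrative.

# Hypothetical merge of the results returned by process_chunk above.
from collections import defaultdict


def merge_entity2sentence_candidates(per_chunk_results):
    merged = defaultdict(list)
    for chunk_result in per_chunk_results:
        if not chunk_result:  # a worker returns None after an unexpected error
            continue
        for entity_key, sentence_ids in chunk_result.items():
            merged[entity_key].extend(sentence_ids)
    return merged
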
Example #17
0
    #     assert planglinks
    #     logger.info('loading langlinks...')
    #     tmp = json.load(open(planglinks))
    #     for i in tmp:
    #         if i['title'] in langlinks:
    #             count['duplicate'] += 1
    #             continue
    #         langlinks[i['title']] = (i['title_ll'], i['id_ll'])
    #     logger.warning('# of duplicate langlinks: %s' % (count['duplicate']))
    #     logger.info('done.')
    #     del tmp

    logger.info('db name: %s' % db_name)
    logger.info('collection name: %s' % collection_name)
    logger.info('drop collection')
    client = MongoDB(host, port, db_name, user, pwd)
    client.db.drop_collection(collection_name)

    logger.info('importing...')
    pool = multiprocessing.Pool(processes=nworker)
    logger.info('# of workers: %s' % nworker)
    for i in sorted(os.listdir(indir),
                    key=lambda x: os.path.getsize('%s/%s' % (indir, x)),
                    reverse=True):
        inpath = '%s/%s' % (indir, i)
        pool.apply_async(
            import_sents,
            args=(inpath, i),
        )
    pool.close()
    pool.join()
Example #18
0
class BFSKg(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.seed_entity_id = local_configure["seed_entity_id"]
        self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.entityType = EntityType(local_configure["EntityType"])

    def execute(self):
        time_start = datetime.datetime.now()
        to_visit_save_file = self.save_root + "to_visit_save_file.npy"
        visited_save_file = self.save_root + "visited_save_file.npy"
        NoID_save_file = self.save_root + "NoID_save_file.npy"

        # If the .npy checkpoint files already exist, resume the crawl from the saved state. Otherwise, start a fresh crawl from the seed entity.
        NoID = []
        entity_id_visited = []
        entity_id_to_visit = [self.seed_entity_id]
        if os.path.isfile(visited_save_file):
            entity_id_visited = list(
                np.load(visited_save_file, allow_pickle=True))
        if os.path.isfile(to_visit_save_file):
            entity_id_to_visit = list(
                np.load(to_visit_save_file, allow_pickle=True))
        if os.path.isfile(NoID_save_file):
            NoID = list(np.load(NoID_save_file, allow_pickle=True))

        # for document each epoch
        time_old = datetime.datetime.now()
        # print(entity_id_to_visit)
        while len(entity_id_to_visit) != 0:
            # Dequeue the oldest entity in the queue and set it to visited
            entity_id_current = entity_id_to_visit.pop(0)
            # print(entity_id_visited)

            # Check whether the current entity has already been visited
            if entity_id_current not in entity_id_visited:
                data = self.kg_api.retrieve_entity(entity_id_current, False)

                # Check whether the new entity belongs to a designated type. If not, skip it.
                # if type_number not in str(data["type"]):
                #     continue

                # data["popular"] = entity_id_current["popular"]

                # 0. set entity "_id"
                data["_id"] = data.pop("__id", None)
                if data["_id"] == None:
                    if data != defaultdict(list, {'_id': None}):
                        NoID.append(data)
                    continue
                else:
                    data["_id"] = data["_id"][0]

                # 1. Save the entity into the corresponding collection
                NeedSaveFlag = True
                for i in data.get("types", []):
                    if self.entityType.isDesiredType(i):
                        insert_id = self.MongoDB_obj.save_to_mongodb(
                            "RelevantType", data)
                        NeedSaveFlag = False
                        break
                if NeedSaveFlag:
                    insert_id = self.MongoDB_obj.save_to_mongodb(
                        "FutureUseType", data)
                    NeedSaveFlag = False
                entity_id_visited.append(entity_id_current)

                # 2. explore children of current entity
                children = data.get("相关实体", [])
                # children_expand = []
                # for child in children:
                #     children_expand.extend([i["__id"] for i in self.kg_api.retrieve_relevant_entities2(child)["relevant_entity_list"] if i["__id"] not in children_expand and i["__id"] not in entity_id_visited and i["__id"] not in entity_id_to_visit])

                # 3. Enqueue unvisited children
                entity_id_to_visit.extend([
                    child for child in children
                    if child not in entity_id_visited
                    and child not in entity_id_to_visit
                ])
                # entity_id_to_visit.extend(children_expand)

            if len(entity_id_visited) % 20 == 0:
                print("*************************************")
                print(len(entity_id_to_visit))
                print(len(entity_id_visited))
                if os.path.isfile(to_visit_save_file):
                    os.rename(
                        to_visit_save_file, to_visit_save_file[:-4] + "_OLD" +
                        to_visit_save_file[-4:])
                if os.path.isfile(visited_save_file):
                    os.rename(
                        visited_save_file, visited_save_file[:-4] + "_OLD" +
                        visited_save_file[-4:])
                if os.path.isfile(NoID_save_file):
                    os.rename(
                        NoID_save_file,
                        NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
                np.save(to_visit_save_file, entity_id_to_visit)
                np.save(visited_save_file, entity_id_visited)
                np.save(NoID_save_file, NoID)
                time_new = datetime.datetime.now()
                print(time_new - time_start, time_new - time_old)
                time_old = time_new

        print("*************************************")
        print(len(entity_id_to_visit))
        print(len(entity_id_visited))
        if os.path.isfile(to_visit_save_file):
            os.rename(
                to_visit_save_file,
                to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
        if os.path.isfile(visited_save_file):
            os.rename(visited_save_file,
                      visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
        if os.path.isfile(NoID_save_file):
            os.rename(NoID_save_file,
                      NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
        np.save(to_visit_save_file, entity_id_to_visit)
        np.save(visited_save_file, entity_id_visited)
        np.save(NoID_save_file, NoID)
        time_new = datetime.datetime.now()
        print(time_new - time_start, time_new - time_old)
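
Both this tool and the multiprocessing variant in the next example repeat the same rename-then-save checkpointing for the three .npy files. It could be factored into a small helper along these lines (illustrative only, not part of the original tools):

# Hypothetical checkpoint helper: keep one "_OLD" backup of each .npy file,
# then overwrite the current checkpoint, mirroring the inline logic above.
import os
import numpy as np


def checkpoint(path, data):
    if os.path.isfile(path):
        # e.g. to_visit_save_file.npy -> to_visit_save_file_OLD.npy
        os.rename(path, path[:-4] + "_OLD" + path[-4:])
    np.save(path, np.array(data, dtype=object))

# Usage: checkpoint(to_visit_save_file, entity_id_to_visit), and likewise for
# visited_save_file and NoID_save_file.
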
Example #19
0
class BFSMultiprocessing(OfflineTools):
    def __init__(self, local_configure):
        super().__init__()
        self.seed_entity_id = local_configure["seed_entity_id"]
        self.internal_use = local_configure["internal_use"]
        self.kg_api = internal_kg_api.InternalKGAPI(self.internal_use)
        self.save_root = local_configure["save_root"]
        self.entityType = EntityType(local_configure["EntityType"])
        self.process_number_max = local_configure["process_number_max"]
        self.batch_size = local_configure["batch_size"]
        self.debug = local_configure["debug"]
        if self.debug:
            self.process_number_max = 5
            self.MongoDB_obj = MongoDB(db_server_ip="10.12.192.47")
        else:
            self.MongoDB_obj = MongoDB()

    def execute(self):
        time_start = datetime.datetime.now()
        processes = []
        to_visit_save_file = self.save_root + "to_visit_save_file.npy"
        visited_save_file = self.save_root + "visited_save_file.npy"
        NoID_save_file = self.save_root + "NoID_save_file.npy"

        # If the .npy checkpoint files already exist, resume the crawl from the saved state. Otherwise, start a fresh crawl from the seed entity.
        NoID = []
        entity_id_visited = []
        entity_id_to_visit = [self.seed_entity_id]
        if os.path.isfile(visited_save_file):
            entity_id_visited = list(
                np.load(visited_save_file, allow_pickle=True))
        if os.path.isfile(to_visit_save_file):
            entity_id_to_visit = list(
                np.load(to_visit_save_file, allow_pickle=True))
        if os.path.isfile(NoID_save_file):
            NoID = list(np.load(NoID_save_file, allow_pickle=True))

        # for document each epoch
        # Worker Input: seeds_to_visit, entity_id_visited, entity_id_to_visit, EntityType
        # Worker Output: data_relevant, data_future, entity_id_visited_delta, entity_id_to_visit_delta
        time_old = datetime.datetime.now()
        while len(entity_id_to_visit) > 0:
            process_number = int(len(entity_id_to_visit) / self.batch_size)

            if process_number == 0:
                (data_relevant, data_future, entity_id_visited_delta,
                 entity_id_to_visit_delta,
                 NoID_delta) = BFSWorker(entity_id_to_visit, entity_id_visited,
                                         entity_id_to_visit, self.entityType,
                                         self.kg_api)
                entity_id_to_visit = []
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "RelevantType", data_relevant)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "FutureUseType", data_future)
                entity_id_visited.extend(entity_id_visited_delta)
                entity_id_to_visit.extend(entity_id_to_visit_delta)
                NoID.extend(NoID_delta)

                print("*************************************")
                print(len(entity_id_to_visit))
                print(len(entity_id_visited))
                np.save(to_visit_save_file, entity_id_to_visit)
                np.save(visited_save_file, entity_id_visited)
                np.save(NoID_save_file, NoID)
                time_new = datetime.datetime.now()
                print(time_new - time_start, time_new - time_old)
                continue

            if process_number > self.process_number_max:
                process_number = self.process_number_max
            while len(entity_id_to_visit) >= (process_number *
                                              self.batch_size):
                parameters = []
                data_relevant = []
                data_future = []
                entity_id_visited_delta = set()
                entity_id_to_visit_delta = set()
                NoID_delta = []

                # Set parameters for multiprocessing
                for i in range(process_number):
                    temp = [
                        entity_id_to_visit[(i * self.batch_size):(
                            (i + 1) * self.batch_size)], entity_id_visited,
                        entity_id_to_visit, self.entityType, self.kg_api
                    ]
                    parameters.append(temp)

                # Multiprocessing
                with Pool(process_number) as p:
                    result_workers = p.starmap(BFSWorker, parameters)

                # Drop the entities that were just dispatched to the workers
                entity_id_to_visit = entity_id_to_visit[(process_number *
                                                         self.batch_size):]

                # Merge multiprocessing results
                for i in result_workers:
                    data_relevant.extend(i[0])
                    data_future.extend(i[1])
                    entity_id_visited_delta = entity_id_visited_delta | set(
                        i[2])
                    entity_id_to_visit_delta = entity_id_to_visit_delta | set(
                        i[3])
                    NoID_delta.extend(i[4])

                # print(len(data_relevant))
                # print(len(data_future))
                if self.debug:
                    np.save("/home/markzhao/Desktop/results.npy",
                            result_workers)
                    np.save("/home/markzhao/Desktop/data_relevant.npy",
                            data_relevant)
                    np.save("/home/markzhao/Desktop/data_future.npy",
                            data_future)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "RelevantType", data_relevant)
                insert_result = self.MongoDB_obj.save_to_mongodb_many(
                    "FutureUseType", data_future)
                entity_id_visited.extend(list(entity_id_visited_delta))
                entity_id_to_visit.extend(list(entity_id_to_visit_delta))
                NoID.extend(NoID_delta)

                print("*************************************")
                print(len(entity_id_to_visit))
                print(len(entity_id_visited))
                if os.path.isfile(to_visit_save_file):
                    os.rename(
                        to_visit_save_file, to_visit_save_file[:-4] + "_OLD" +
                        to_visit_save_file[-4:])
                if os.path.isfile(visited_save_file):
                    os.rename(
                        visited_save_file, visited_save_file[:-4] + "_OLD" +
                        visited_save_file[-4:])
                if os.path.isfile(NoID_save_file):
                    os.rename(
                        NoID_save_file,
                        NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
                np.save(to_visit_save_file, entity_id_to_visit)
                np.save(visited_save_file, entity_id_visited)
                np.save(NoID_save_file, NoID)
                time_new = datetime.datetime.now()
                print(time_new - time_start, time_new - time_old)
                time_old = time_new

        print("*************************************")
        print(len(entity_id_to_visit))
        print(len(entity_id_visited))
        if os.path.isfile(to_visit_save_file):
            os.rename(
                to_visit_save_file,
                to_visit_save_file[:-4] + "_OLD" + to_visit_save_file[-4:])
        if os.path.isfile(visited_save_file):
            os.rename(visited_save_file,
                      visited_save_file[:-4] + "_OLD" + visited_save_file[-4:])
        if os.path.isfile(NoID_save_file):
            os.rename(NoID_save_file,
                      NoID_save_file[:-4] + "_OLD" + NoID_save_file[-4:])
        np.save(to_visit_save_file, entity_id_to_visit)
        np.save(visited_save_file, entity_id_visited)
        np.save(NoID_save_file, NoID)
        time_new = datetime.datetime.now()
        print(time_new - time_start, time_new - time_old)
Example #20
0
    def process(self):

        log_str = 'searching...'
        self.system_logger.info(log_str)
        print(log_str)

        input_db = MongoDB(self.db_config['host'],
                           self.db_config['port'],
                           self.db_config['input_db_name'],
                           self.db_config['user'],
                           self.db_config['pwd'])

        input_collection = input_db.db[self.db_config['article_collection_name']]

        all_docs = []
        for source_category in self.input_source_category:
            elems = source_category.split("_")
            source = elems[0]
            category = elems[1]
            if category == "all":
                query = {"source": source, "date": {"$gte": self.date_after}}
            else:
                query = {"source": source, "category": category, "date": {"$gte": self.date_after}}

            log_str = 'searching query' + str(query)
            self.system_logger.info(log_str)
            print(log_str)

            try:
                ids = input_collection.find(query).distinct('_id')
            except pymongo.errors.OperationFailure:
                # TO-DO: fix the following error:
                # pymongo.errors.OperationFailure: distinct too big, 16mb cap
                ids = [i['_id'] for i in input_collection.find(query)]

            all_docs.extend(ids)

        log_str = '# of docs found: %s' % len(all_docs)
        self.system_logger.info(log_str)
        print(log_str)

        if len(all_docs) == 0:
            return

        chunk_size = int(len(all_docs) / self.nworkers)
        if chunk_size == 0:
            chunk_size = len(all_docs)

        output_db = MongoDB(self.db_config['host'],
                            self.db_config['port'],
                            self.db_config['output_db_name'],
                            self.db_config['user'],
                            self.db_config['pwd']
                            )
        sentence_collection = output_db.db[self.db_config['sentence_collection_name']]
        current_count = sentence_collection.count()
        if current_count > 0:
            current_count -= 1

        log_str = '# of workers: %s\n chunk size: %s \n' % (self.nworkers, chunk_size)
        self.system_logger.info(log_str)
        print(log_str)

        chunks = []
        for i in range(0, len(all_docs), chunk_size):
            chunks.append(slice(i, i+chunk_size))

        log_str = '# parent pid: %s\n processing...\n' % os.getpid()
        self.system_logger.info(log_str)
        print(log_str)

        # Multi-processing
        pool = multiprocessing.Pool(processes=self.nworkers)
        thread_id = 0
        for c in chunks:
            args = (self.db_config,
                    self.sport_category_filename,
                    {},
                    all_docs[c],
                    self.save_batch_size,
                    self.tmp_dir,
                    thread_id)
            thread_id += 1
            pool.apply_async(process_chunk, args=args,)
        pool.close()
        pool.join()

        log_str = 'start merging...'
        self.system_logger.info('start merging...')
        print(log_str)

        # merge some information
        current_index = current_count + 1
        current_sentence_length = 0
        current_sentence_number = 0
        all_tmp_files = get_files(self.tmp_dir, r".*json")

        group_insert = []
        for index, one_file in enumerate(all_tmp_files):
            sys.stdout.write("%d / %d\r" % (index, len(all_tmp_files)))
            json_str = codecs.open(one_file, 'r', 'utf-8').read()
            insert_sentences = json.loads(json_str)
            for one_sentence in insert_sentences:
                _length = int(one_sentence["sentence_length"])
                one_sentence["sentence_index"] = current_index
                group_insert.append(one_sentence)
                current_index += 1
                current_sentence_length += _length
            current_sentence_number += len(insert_sentences)
            if len(group_insert) == self.insert_batch_size:
                sentence_collection.insert(group_insert)
                group_insert.clear()

        if len(group_insert) > 0:
            sentence_collection.insert(group_insert)
            group_insert.clear()

        avg_length_entry = list(sentence_collection.find({"_id": "avg_length"}))
        if len(avg_length_entry) == 1:
            saved_sentence_length = avg_length_entry[0]["current_sentence_length"]
            saved_sentence_number = avg_length_entry[0]["current_sentence_number"]
            current_sentence_length += saved_sentence_length
            current_sentence_number += saved_sentence_number
            avg_length = float(current_sentence_length / current_sentence_number)
            find_query = {"_id": "avg_length"}
            update_query = {"$set": {"current_sentence_length": current_sentence_length,
                                     "current_sentence_number": current_sentence_number,
                                     "avg_length": avg_length}}
            sentence_collection.update_one(find_query, update_query)
        else:
            avg_length = float(current_sentence_length / current_sentence_number)
            sentence_collection.insert({"_id": "avg_length",
                                        "current_sentence_length": current_sentence_length,
                                        "current_sentence_number": current_sentence_number,
                                        "avg_length": avg_length})

        if current_index != sentence_collection.count():
            self.system_logger.error('sentence index does not match the number of sentences.\n')
            self.system_logger.error('current max sentence index is [%d], but the total sentence count is [%d].\n' %
                                     (current_index - 1, sentence_collection.count() - 1))

        log_str = 'start indexing...'
        self.system_logger.info('start indexing...')
        print("\n", log_str)

        sentence_collection.create_index('docid')
        sentence_collection.create_index('import_date')
        sentence_collection.create_index('news_date')
        sentence_collection.create_index('entity_len')
        sentence_collection.create_index('sentence_index')
        sentence_collection.create_index('sentence_length')
        sentence_collection.create_index('sentence_position')
        key = [('entity_set', 1)]
        pfe = {'entity_set': {'$exists': True}}
        sentence_collection.create_index(key, partialFilterExpression=pfe)
        key = [('category_set', 1)]
        pfe = {'category_set': {'$exists': True}}
        sentence_collection.create_index(key, partialFilterExpression=pfe)

        log_str = 'all done'
        self.system_logger.info('all done')
        print(log_str)

        shutil.rmtree(self.tmp_dir)
Example #21
0
def process_chunk(db_config, sport_category_filename, query, doc_ids, batch_size, tmp_dir, thread_id):
    try:

        sport_category_classifier = SportNewsCategoryClassifier(sport_category_filename)

        # MongoDB is not fork-safe:
        # http://api.mongodb.com/python/current/faq.html#is-pymongo-fork-safe
        input_db = MongoDB(db_config['host'],
                           db_config['port'],
                           db_config['input_db_name'],
                           db_config['user'],
                           db_config['pwd']
                           )
        article_collection = input_db.db[db_config['article_collection_name']]

        output_db = MongoDB(db_config['host'],
                            db_config['port'],
                            db_config['output_db_name'],
                            db_config['user'],
                            db_config['pwd'])

        sentence_collection = output_db.db[db_config['sentence_collection_name']]

        query['_id'] = {'$in': doc_ids}
        response = article_collection.find(query, no_cursor_timeout=True)

        batch_number = 0
        to_insert = []
        all_inserted_sentence = 0
        for one_document in response:

            if len(one_document['content']) == 0:
                continue

            one_document_content = []
            for line in one_document['content']:
                if len(line.strip()) > 0:
                    one_document_content.append(line.strip())
            if len(one_document_content) == 0:
                continue

            docid = one_document['_id']
            search_query = {"docid": docid}
            if len(list(sentence_collection.find(search_query))) > 0:
                continue

            # process title
            title = one_document['title']
            title_query = process_title(title)

            # process document body
            body_query = process_document_body(one_document_content)

            category_result = sport_category_classifier.get_category(title_query.full_entity_ids,
                                                                     body_query,
                                                                     one_document['source'],
                                                                     one_document['category'])

            # generate a sentence from query
            sentence_to_insert = {"_id": '%s_%s' % (docid, 1)}
            search_query = {"_id": sentence_to_insert["_id"]}
            if len(list(sentence_collection.find(search_query))) > 0:
                continue

            entity_set = set(title_query.full_entity_ids)
            if len(entity_set) == 0:
                continue

            sentence_length = 0
            sentence_tokens = []
            for sentence in title_query.sentence_list:
                sentence_length += sentence.sentence_length
                for token in sentence.token_list:
                    sentence_tokens.append(token.original_text + "/" +
                                           str(token.pos) + "/" +
                                           str(token.ner) + "/" +
                                           str(token.is_stop_word))

            sentence_to_insert['分词词性'] = sentence_tokens
            sentence_to_insert['docid'] = docid
            sentence_to_insert['source'] = one_document['source']
            sentence_to_insert['category'] = one_document['category']
            sentence_to_insert['news_date'] = int(one_document['date'])
            sentence_to_insert['import_date'] = int(one_document['import_date'])
            sentence_to_insert['raw_sentence'] = title
            sentence_to_insert['sentence_length'] = sentence_length
            sentence_to_insert['sentence_position'] = 1
            sentence_to_insert['token_number'] = len(sentence_tokens)
            sentence_to_insert['entity_set'] = list(entity_set)
            sentence_to_insert['entity_len'] = len(entity_set)
            sentence_to_insert["topic_category"] = category_result.res

            to_insert.append(sentence_to_insert)

            if body_query is None:
                continue
            # generate sentences from body
            for sentence_index, sentence in enumerate(body_query.sentence_list):

                sentence_to_insert = {"_id": '%s_%s' % (docid, sentence_index + 2)}
                search_query = {"_id": sentence_to_insert["_id"]}
                if len(list(sentence_collection.find(search_query))) > 0:
                    continue

                entity_set = set(sentence.full_entity_ids)
                if len(entity_set) == 0:
                    continue

                sentence_tokens = []
                for token in sentence.token_list:
                    sentence_tokens.append(token.original_text + "/" +
                                           str(token.pos) + "/" +
                                           str(token.ner) + "/" +
                                           str(token.is_stop_word))

                sentence_to_insert['分词词性'] = sentence_tokens
                sentence_to_insert['docid'] = docid
                sentence_to_insert['source'] = one_document['source']
                sentence_to_insert['category'] = one_document['category']
                sentence_to_insert['news_date'] = int(one_document['date'])
                sentence_to_insert['import_date'] = int(one_document['import_date'])
                sentence_to_insert['raw_sentence'] = sentence.raw_sentence
                sentence_to_insert['sentence_length'] = sentence.sentence_length
                sentence_to_insert['sentence_position'] = sentence_index + 2
                sentence_to_insert['token_number'] = len(sentence_tokens)
                sentence_to_insert['entity_set'] = list(entity_set)
                sentence_to_insert['entity_len'] = len(entity_set)
                sentence_to_insert["topic_category"] = category_result.res

                to_insert.append(sentence_to_insert)
                if len(to_insert) >= batch_size:
                    tmp_file = tmp_dir + "/thread_" + str(thread_id) + "_batch_" + str(batch_number) + ".json"
                    batch_number += 1
                    f = codecs.open(tmp_file, 'w', 'utf-8')
                    f.write(json.dumps(to_insert))
                    f.close()
                    print("Successfully saved one batch ([%d] sentences) in file [%s]!" % (len(to_insert), tmp_file))
                    all_inserted_sentence += len(to_insert)
                    to_insert.clear()

        if len(to_insert) > 0:
            tmp_file = tmp_dir + "/thread_" + str(thread_id) + "_batch_" + str(batch_number) + ".json"
            f = codecs.open(tmp_file, 'w', 'utf-8')
            f.write(json.dumps(to_insert))
            f.close()
            print("Successfully saved rest [%d] sentences in file [%s]!" % (len(to_insert), tmp_file))
            all_inserted_sentence += len(to_insert)
            to_insert.clear()

        log_str = "One thread finished! [%d] sentences are saved!" % all_inserted_sentence
        logging.getLogger("system_log").info(log_str)
        print(log_str)

    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s' % \
            (exc_type, exc_obj, exc_tb.tb_lineno)
        print(msg)
Example #22
0
class SaveEntity2DB(OfflineTools):
    def __init__(self, local_configure, global_configure):
        super().__init__()
        self.global_configure = global_configure
        self.local_configure = local_configure
        self.db_name = local_configure["db_name"]
        self.collection_name = local_configure["collection_name"]
        self.data_dir = local_configure["data_dir"]
        self.db_interface = MongoDB(db_server_ip="10.93.128.143",
                                    db_server_port=27017,
                                    database_name=self.db_name)
        self.non_id_char = re.compile("([^\u4E00-\u9FD5a-zA-Z0-9])", re.U)
        self.system_logger = logging.getLogger("system_log")
        pass

    def execute(self):
        for source, data_path in self.data_dir.items():
            if source == "douban_movie":
                self.process_douban_movie(data_path)
            elif source == "qq_music":
                self.process_qq_music(data_path)
            else:
                sys.stdout.write("[%s] is not supported at this moment!" %
                                 source)
                continue

    def is_han(self, text):
        return any('\u4e00' <= char <= '\u9fff' for char in text)

    def process_douban_movie(self, data_path):

        current_category_dirs = os.listdir(data_path)

        for category in current_category_dirs:
            if category in ["configures", "errors", "category_list_json"]:
                continue

            category_dir = os.path.join(data_path, category) + "/"

            log_str = "\nProcessing: %s\n" % category_dir
            sys.stdout.write(log_str)
            self.system_logger.info(log_str)

            all_json_file = get_files(category_dir, r'.*json')
            for index, filename in enumerate(all_json_file):

                sys.stdout.write("%d / %d\r" % (index, len(all_json_file)))

                json_str = codecs.open(filename, 'r', 'utf-8').read()
                json_obj = json.loads(json_str)
                json_obj["_id"] = "douban_" + json_obj["id"]
                json_obj["web_category"] = category
                json_obj["entity_type"] = "Movie"

                try:
                    self.db_interface.save_to_mongodb(self.collection_name,
                                                      json_obj)
                except Exception:
                    output_str = json.dumps(json_obj)
                    self.system_logger.info(
                        "Error writing the following object into DB:\n" +
                        output_str + "\n")
                    sys.stderr.write("Error writing object into DB\n")
                    sys.exit()

    def process_qq_music(self, data_path):

        current_category_dirs = os.listdir(data_path)

        for category in current_category_dirs:
            if category not in [
                    "内地", "台湾", "日本", "新加坡", "泰国", "韩国", "香港", "马来西亚"
            ]:
                continue

            category_dir = os.path.join(data_path, category) + "/"
            sys.stdout.write("\nProcessing: %s\n" % category_dir)
            all_json_file = get_files(category_dir, r'.*_albums.json')

            for index, filename in enumerate(all_json_file):

                sys.stdout.write("%d / %d\r" % (index, len(all_json_file)))

                singer_filename = filename.replace("_albums", "")
                if not os.path.exists(singer_filename):
                    sys.stdout.write(
                        "\n[%s] does not exist in directory [%s]\n" %
                        (singer_filename, category_dir))
                    continue
                json_str = codecs.open(singer_filename, 'r', 'utf-8').read()
                json_obj = json.loads(json_str)
                json_obj["_id"] = "qq_music_singer_" + json_obj["singer_mid"]
                json_obj["entity_type"] = "singer"

                try:
                    self.db_interface.save_to_mongodb(self.collection_name,
                                                      json_obj)
                except Exception:
                    output_str = json.dumps(json_obj)
                    self.system_logger.info(
                        "Error writing the following object into DB:\n" +
                        output_str + "\n")
                    sys.stderr.write("Error writing object into DB\n")
                    sys.exit()
Example #23
0
def import_sents(pdata, name):
    try:
        client = MongoDB(host, port, db_name, user, pwd)
        collection = client.db[collection_name]
        sents = []
        with open(pdata, 'r') as f:
            for line in f:
                d = json.loads(line)
                for sent in d['sentences']:
                    ids = set()
                    titles = set()
                    ids_ll = set()
                    titles_ll = set()
                    for n, i in enumerate(sent['links']):
                        if i['id']:
                            ids.add(i['id'])
                            titles.add(i['title'])
                        if langlinks and i['title'] in langlinks:
                            title_ll, id_ll = langlinks[i['title']]
                            sent['links'][n]['id_ll'] = id_ll
                            sent['links'][n]['title_ll'] = title_ll
                            ids_ll.add(id_ll)
                            titles_ll.add(title_ll)
                    sent['ids_len'] = 0
                    if ids:
                        sent['ids'] = list(ids)
                        sent['ids_len'] = len(ids)
                    sent['ids_ll_len'] = 0
                    if ids_ll:
                        sent['ids_ll'] = list(ids_ll)
                        sent['ids_ll_len'] = len(ids_ll)
                    if titles:
                        sent['titles'] = list(titles)
                    if titles_ll:
                        sent['titles_ll'] = list(titles_ll)
                    sent['source_id'] = d['id']
                    sent['source_title'] = d['title']
                    if sent['source_title'] in langlinks:
                        title_ll, id_ll = langlinks[sent['source_title']]
                        sent['source_id_ll'] = id_ll
                        sent['source_title_ll'] = title_ll
                    sent['_chunk_id'] = name
                    sents.append(sent)
        if sents:
            # Inserting a list is faster than insert_one
            # Reduce the size of the list to reduce RAM usage
            collection.insert(sents)

            # Indexing
            collection.create_index('_chunk_id')
            collection.create_index('source_id')
            collection.create_index('source_title')
            collection.create_index('source_id_ll', sparse=True)
            collection.create_index('source_title_ll', sparse=True)
            collection.create_index('start')
            collection.create_index('end')
            collection.create_index('ids_len')
            collection.create_index('ids_ll_len')
            key = [('ids', 1)]
            pfe = {'ids': {'$exists': True}}
            collection.create_index(key, partialFilterExpression=pfe)
            key = [('ids_ll', 1)]
            pfe = {'ids_ll': {'$exists': True}}
            collection.create_index(key, partialFilterExpression=pfe)
            key = [('titles', 1)]
            pfe = {'titles': {'$exists': True}}
            collection.create_index(key, partialFilterExpression=pfe)
            key = [('titles_ll', 1)]
            pfe = {'titles_ll': {'$exists': True}}
            collection.create_index(key, partialFilterExpression=pfe)
            key = [('source_id', 1), ('ids', 1)]
            pfe = {'ids': {'$exists': True}}
            collection.create_index(key, partialFilterExpression=pfe)

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        msg = 'unexpected error: %s | %s | %s | %s | %s' % \
              (exc_type, exc_obj, exc_tb.tb_lineno, name, d['title'])
        logger.error(msg)
Example #24
0
    def create_metion_table(self):
        def strip_mention(text):
            text = text.replace('\t', ' ').replace('\n',
                                                   ' ').replace('\r', ' ')
            text = text.lower().strip()
            text = text.replace('\\', '')
            text = ' '.join(text.split())
            return text

        def expand_mention(text):
            RE_STRIP = r' \([^)]*\)|\<[^)]*\>|,|"|\.|\'|:|-'
            # STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
            #               'from', 'has', 'he', 'i', 'in', 'is', 'it', 'its', 'of', 'on',
            #               'that', 'the', 'their', 'we', 'to', 'was', 'were', 'with',
            #               'you', 'your', 'yours', 'our', 'ours', 'theirs', 'her',
            #               'hers', 'his', 'him', 'mine', 'or', 'but', 'though', 'since']
            res = []
            # Strip mention
            res.append(''.join(re.sub(RE_STRIP, '', text).strip().split()))
            # # Remove stop words
            # res.append(' '.join([word for word in text.split() \
            #                      if word not in STOP_WORDS]).strip())
            # '·' in Chinese names
            if '·' in text:
                res.append(text.replace('·', '-'))

            return res

        def filter_mention(mention):
            if not mention:
                return False
            if mention == '':
                return False
            return True

        def get_kbid2mention(data):
            res = defaultdict(lambda: defaultdict(int))
            for mention in data:
                for kbid in data[mention]:
                    assert type(data[mention][kbid]) == int
                    res[kbid][mention] = data[mention][kbid]
            return res

        def add_score(data):
            for mention in data:
                c = Counter(data[mention])
                tol = sum(c.values())
                assert type(tol) == int
                for kbid in data[mention]:
                    data[mention][kbid] = data[mention][kbid] / tol

        kbid2names, _, _, kbid2popularity = self.get_names(MENTION_PROPS)

        mention2kbid = defaultdict(lambda: defaultdict(int))
        for kbid in kbid2names:
            for name in kbid2names[kbid]:
                mention = strip_mention(name)
                mentions = [mention]
                mentions.extend(expand_mention(mention))
                mentions = set(mentions)

                for m in mentions:
                    if not filter_mention(m):
                        continue
                    # mention2kbid[m][kbid] += kbid2names[kbid][name]
                    try:
                        mention2kbid[m][kbid] += kbid2popularity[kbid]
                    except KeyError:
                        mention2kbid[m][kbid] += 1

        mention2kbid = dict(mention2kbid)
        with open('%s/mention2kbid_raw.json' % self.output_dir, 'w') as fw:
            json.dump(mention2kbid, fw, indent=4)

        self.system_logger.info('converting kbid2mention..')
        kbid2mention = get_kbid2mention(mention2kbid)
        self.system_logger.info('done.')
        with open('%s/kbid2mention_raw.json' % self.output_dir, 'w') as fw:
            json.dump(kbid2mention, fw, indent=4)

        self.system_logger.info('computing mention2kbid...')
        add_score(mention2kbid)
        with open('%s/mention2kbid.json' % self.output_dir, 'w') as fw:
            json.dump(mention2kbid, fw, indent=4)
        self.system_logger.info('done.')

        self.system_logger.info('computing kbid2mention...')
        add_score(kbid2mention)
        with open('%s/kbid2mention.json' % self.output_dir, 'w') as fw:
            json.dump(kbid2mention, fw, indent=4)
        self.system_logger.info('done.')

        # start insert into mongo db
        self.system_logger.info('db name: %s' % self.db_config['db_name'])
        self.system_logger.info('collection name: %s' %
                                self.db_config["output_collection_name"])
        client = MongoDB(self.db_config['host'], self.db_config['port'],
                         self.db_config['db_name'], self.db_config['user'],
                         self.db_config['pwd'])

        self.system_logger.info('drop collection')
        client.db.drop_collection(self.db_config['output_collection_name'])

        collection = client.db[self.db_config['output_collection_name']]
        self.system_logger.info('processing...')

        to_insert = []
        self.system_logger.info('converting...')  # TO-DO: save RAM
        for mention in mention2kbid:
            if sys.getsizeof(mention) >= 512:
                self.system_logger.warning('mention is too large, skip')
                continue

            entities = sorted(mention2kbid[mention].items(),
                              key=lambda x: x[1],
                              reverse=True)
            ins = {'mention': mention, 'entities': entities}
            to_insert.append(ins)

        self.system_logger.info('importing...')
        try:
            collection.insert(to_insert)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            msg = 'unexpected error: %s | %s | %s' % \
                  (exc_type, exc_obj, exc_tb.tb_lineno)
            self.system_logger.error(msg)

        self.system_logger.info('done.')

        self.system_logger.info('indexing...')
        collection.create_index('mention', unique=True)
        # collection.create_index([('mention', 1), ('entities', 1)], unique=True)
        self.system_logger.info('done.')

        self.system_logger.info(collection)
        self.system_logger.info(collection.count())
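
The mention table built above stores one document per surface form with a popularity-weighted, pre-sorted candidate list. A lookup at linking time might look like the sketch below; the mention/entities field names follow the code above, while the normalization and score cutoff are simplifications.

# Sketch of how the mention table built above could be queried by a linker.
def candidate_entities(collection, surface_form, min_score=0.01):
    # Simplified normalization; the table itself was keyed with strip_mention()
    mention = ' '.join(surface_form.lower().strip().split())
    doc = collection.find_one({'mention': mention})
    if doc is None:
        return []
    # 'entities' is a list of (kbid, score) pairs, already sorted by score desc
    return [(kbid, score) for kbid, score in doc['entities']
            if score >= min_score]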