def destruct_danmaku(cid: int, danmakus: List[CustomTag]):
    danmakuMap: MutableMapping[int, DanmakuDO] = {}
    relationMap: MutableMapping[int, DanmakuRealationDO] = {}
    print('[FORMER] cid: %s, danmakus: %s' % (cid, len(danmakus)))
    for danmaku in danmakus:
        # Fields: appearance time, mode, font size, color, send timestamp,
        # danmaku pool, user hash, database id
        obj: DanmakuDO = DanmakuDO()
        obj.content = danmaku.content
        l: list = danmaku.tag_content.split(',')
        obj.danmaku_epoch = float(l[0])
        obj.mode = int(l[1])
        obj.font_size = int(l[2])
        obj.font_color = int(l[3])
        obj.send_time = datetime.fromtimestamp(int(l[4]),
                                               timezone(timedelta(hours=8)))
        obj.danmaku_pool = int(l[5])
        obj.user_hash = int(l[6], 16)
        # Skip resolving the crc32 user id from the database for now -- too slow.
        # value = crc32.get_value(l[6])
        # if value[0] > 0:
        #     obj.user_id = value[1]
        obj.id = int(l[7])

        relation: DanmakuRealationDO = DanmakuRealationDO()
        relation.cid = cid
        relation.danmaku_id = obj.id

        danmakuMap[obj.id] = obj
        relationMap[relation.danmaku_id] = relation

    session = DBSession()
    try:
        removeExist(cid, danmakuMap, relationMap)
        if len(danmakuMap) == len(relationMap) == 0:
            print('cid: %s, has saved all danmaku' % cid)
            return
        if danmakuMap:
            session.bulk_save_objects(danmakuMap.values())
        if relationMap:
            session.bulk_save_objects(relationMap.values())
        session.commit()
    except BaseException as e:
        session.rollback()
        print(e)
        print('cid: %s, has error.' % cid)
    else:
        print('cid: %s, saved into DB.' % cid)
    finally:
        session.close()
        print('[SAVED] danmakuMap.len: %s' % len(danmakuMap))
        print('[SAVED] relationMap.len: %s' % len(relationMap))
        danmakuMap.clear()
        relationMap.clear()
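
# Illustrative only -- a hypothetical tag_content value and how it maps onto the
# fields parsed in destruct_danmaku above (values are made up):
#   "12.34,1,25,16777215,1541304456,0,1a2b3c4d,1234567890"
#    appear time (s), mode, font size, color, send timestamp (UTC+8),
#    danmaku pool, user hash (hex), database id
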
async def save_cid_aid_relation(cid_aid: MutableMapping[int, int],
                                cid_info: MutableMapping[int, AvDanmakuCid]):
    """Save the av-cid relations."""
    if len(cid_aid) < 1:
        return
    objs: List[AVCidsDO] = []
    sql: str = 'select cid from av_cids where cid in (%s)' % ','.join(
        '%s' % item for item in cid_aid.keys())
    cids: ResultProxy = await execute_sql(sql)
    exist_cids: Set[int] = set()
    for item in cids.fetchall():
        # Collect the relations that already exist in the database.
        exist_cids.add(int(item[0]))
    if len(exist_cids) != len(cid_aid):
        session = DBSession()
        for cid, aid in cid_aid.items():
            if cid in exist_cids:
                continue
            obj: AVCidsDO = AVCidsDO()
            obj.cid = cid
            obj.aid = aid
            objs.append(obj)
        for cid in exist_cids:
            cid_info.pop(cid, None)
        try:
            if len(cid_info) > 0:
                for item in cid_info.values():
                    await execute_sql(
                        "update av_cids set page = %s, page_name = '%s' where cid = %s;"
                        % (item.page, item.pagename, item.cid))
            session.bulk_save_objects(objs)
            session.commit()
        except BaseException as e:
            session.rollback()
            raise e
        else:
            log.info('[Saved] av-cid relation. len: %s' % len(objs))
        finally:
            session.close()
    else:
        log.info('All av-cid relations exist')
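
# Rough sketch of the duplicate check above, assuming a hypothetical
# cid_aid = {111: 10, 222: 10}; the generated SQL would be
#   select cid from av_cids where cid in (111,222)
# cids already present are dropped from the insert list and only get their
# page/page_name refreshed via the update statement.
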
def save_danmaku_to_db(q: Queue,
                       danmakuMap: MutableMapping[int, DanmakuDO],
                       relationMap: MutableMapping[int, DanmakuRealationDO],
                       cid_danmakuIdSet: MutableMapping[int, Set[int]]):
    session = DBSession()
    try:
        remove_db_exist_ids(danmakuMap, relationMap, cid_danmakuIdSet.keys())
        print('[After Removed DB ids] danmaku len: %s, relation len: %s' %
              (len(danmakuMap), len(relationMap)))
        if len(danmakuMap) != len(relationMap):
            raise Exception("danmaku's len is not equal to relation's len")
        if danmakuMap.values():
            session.bulk_save_objects(danmakuMap.values())
        if relationMap.values():
            session.bulk_save_objects(relationMap.values())
        session.commit()
    except BaseException:
        session.rollback()
        # Report the failure to the parent process via the shared queue.
        name = multiprocessing.current_process().name
        _map: MutableMapping[str, str] = {name: traceback.format_exc()}
        q.put(_map)
        print('Oops: ', name)
    else:
        print('Save to DB success, len: %s' % len(danmakuMap))
        # Record the saved danmaku ids in redis, one set per cid.
        for cid, value in cid_danmakuIdSet.items():
            try:
                red.sadd(cid, *value)
            except BaseException:
                traceback.print_exc()
                print('[ERROR] redis. cid: %s' % cid)
        print('[DONE] save danmaku ids to redis')
    finally:
        session.close()
        del danmakuMap
        del relationMap
        gc.collect()
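
# Hypothetical caller sketch (not part of this module): on failure the worker
# pushes its traceback onto the shared queue instead of raising, so a parent
# process could check it like this:
#   q = multiprocessing.Manager().Queue()
#   save_danmaku_to_db(q, danmakuMap, relationMap, cid_danmakuIdSet)
#   if q.qsize() > 0:
#       ...  # a worker failed; its {process_name: traceback} dict is on the queue
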
def main():
    """
    Testing requires adjusting the database, the s3 deletion and the archive directory.
    :return:
    """
    temp_file_dir = 'data-temp/'

    # download data
    log.info("Getting objects' keys")
    keys: Set[str] = _s3.get_all_objects_key()
    if len(keys) < 1:
        log.info("No file in COS!")
        exit(0)
    else:
        local_processing.multi_download(temp_file_dir, keys)
        if not _s3.archive_object(keys):
            log.error("Archive objects failed")
            exit(0)
    log.info("Download files, DONE.")

    # read data
    all_data: MutableMapping[str, AV] = read_file(temp_file_dir)
    log.info("Analyze")

    # analyze in parallel: split all_data into chunks of `size` entries,
    # one apply_async call per chunk
    pool = Pool(processes=cpu_use_number)
    q = multiprocessing.Manager().Queue()
    size = int(math.ceil(len(all_data) / float(cpu_use_number)))
    map_temp: MutableMapping[str, AV] = {}
    res: List[ApplyResult] = list()
    for key, value in all_data.items():
        map_temp[key] = value
        if len(map_temp) % size == 0:
            res.append(pool.apply_async(func=analyze, args=(q, map_temp,)))
            map_temp = {}
    res.append(pool.apply_async(func=analyze, args=(q, map_temp,)))
    pool.close()
    pool.join()
    if q.qsize() > 0:
        # A non-empty queue means a worker process hit an error: raise and stop the job.
        log.error('analyze occurs error')
        raise Exception(q)

    # saving
    all_avinfos: List[AVInfoDO] = []
    all_avstats: List[AVStatDO] = []
    for item in res:
        v = item.get()
        all_avinfos.extend(v[0])
        all_avstats.extend(v[1])

    # remove avinfos that already exist in the db or are duplicated in this batch
    log.info("Remove duplicated avinfo")
    temp: Set[int] = set()
    # db
    for item in all_avinfos:
        temp.add(item.aid)
    session = DBSession()
    sql: str = "select aid from av_info where aid in (%s)" % ",".join(
        "%s" % item for item in temp)
    aids: ResultProxy = session.execute(sql)
    temp.clear()
    for item in aids.fetchall():
        temp.add(int(item[0]))
    temp2: List[AVInfoDO] = []
    # program
    for item in all_avinfos:
        if item.aid not in temp:
            temp2.append(item)
            temp.add(item.aid)
    all_avinfos = temp2

    # db
    log.info("Save infos(%s) and stats(%s)" %
             (len(all_avinfos), len(all_avstats)))
    session.bulk_save_objects(all_avinfos)
    session.bulk_save_objects(all_avstats)
    session.commit()

    # archive
    log.info("Archive")
    for item in all_data.keys():
        index: int = item.find("/online")
        shutil.move(item[:index], "D:/spider archive")
    log.info('[Done]')