Example #1
def destruct_danmaku(cid: int, danmakus: List[CustomTag]):
    danmakuMap: MutableMapping[int, DanmakuDO] = {}
    relationMap: MutableMapping[int, DanmakuRealationDO] = {}
    print('[FORMER] cid: %s, danmakus: %s' % (cid, len(danmakus)))
    for danmaku in danmakus:
        # Fields: appear time, mode, font size, color, send timestamp, danmaku pool, user hash, database ID
        obj: DanmakuDO = DanmakuDO()
        obj.content = danmaku.content
        l: list = danmaku.tag_content.split(',')
        obj.danmaku_epoch = float(l[0])
        obj.mode = int(l[1])
        obj.font_size = int(l[2])
        obj.font_color = int(l[3])
        obj.send_time = datetime.fromtimestamp(int(l[4]),
                                               timezone(timedelta(hours=8)))
        obj.danmaku_pool = int(l[5])
        obj.user_hash = int(l[6], 16)
        # Skip fetching crc32 data straight from the database for now; it is too slow
        # value = crc32.get_value(l[6])
        # if value[0] > 0:
        #   obj.user_id = value[1]
        obj.id = int(l[7])

        relation: DanmakuRealationDO = DanmakuRealationDO()
        relation.cid = cid
        relation.danmaku_id = obj.id

        danmakuMap[obj.id] = obj
        relationMap[relation.danmaku_id] = relation

    session = DBSession()
    try:
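        # Drop entries that already exist in the DB so the bulk insert only sees new rows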
        removeExist(cid, danmakuMap, relationMap)

        if len(danmakuMap) == len(relationMap) == 0:
            print('cid: %s, all danmaku already saved' % cid)
            return

        if danmakuMap:
            session.bulk_save_objects(danmakuMap.values())
        if relationMap:
            session.bulk_save_objects(relationMap.values())
        session.commit()
    except BaseException as e:
        session.rollback()
        print(e)
        print('cid: %s, encountered an error' % cid)
    else:
        print('cid: %s, Saved into DB.' % cid)
    finally:
        session.close()
        print('[SAVED] danmakuMap.len: %s' % len(danmakuMap))
        print('[SAVED] relationMap.len: %s' % len(relationMap))
        danmakuMap.clear()
        relationMap.clear()
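
A minimal invocation sketch, assuming CustomTag can be built bare and exposes the content and comma-separated tag_content attributes the parser reads; every value below is a made-up placeholder:

# Hypothetical usage: parse one raw danmaku for cid 170001 and persist it.
# tag_content fields: appear time, mode, font size, color, send timestamp,
# danmaku pool, user hash (hex), database id.
tag = CustomTag()
tag.content = 'Hello, danmaku!'
tag.tag_content = '12.5,1,25,16777215,1609459200,0,1a2b3c4d,7355608'
destruct_danmaku(170001, [tag])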
Example #2
async def save_cid_aid_relation(cid_aid: MutableMapping[int, int],
                                cid_info: MutableMapping[int, AvDanmakuCid]):
    """
  保存av与cid的关系
  """
    if not cid_aid:
        return
    objs: List[AVCidsDO] = []

    sql: str = 'select cid from av_cids where cid in (%s)' % ','.join(
        '%s' % item for item in cid_aid.keys())

    cids: ResultProxy = await execute_sql(sql)
    exist_cids: Set[int] = set()
    for item in cids.fetchall():
        # Record the relations that already exist in the database
        exist_cids.add(int(item[0]))

    if len(exist_cids) != len(cid_aid):
        session = DBSession()
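        # Build DO objects only for the cids that are not stored yet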
        for cid, aid in cid_aid.items():
            if cid in exist_cids:
                continue
            obj: AVCidsDO = AVCidsDO()
            obj.cid = cid
            obj.aid = aid
            objs.append(obj)
        for cid in exist_cids:
            cid_info.pop(cid, None)

        try:
            if cid_info:
                for item in cid_info.values():
                    await execute_sql(
                        "update av_cids set page = %s, page_name = '%s' where cid = %s;"
                        % (item.page, item.pagename, item.cid))
            session.bulk_save_objects(objs)
            session.commit()
        except BaseException as e:
            session.rollback()
            raise e
        else:
            log.info('[Saved] av-cid relation. len: %s' % len(objs))
        finally:
            session.close()
    else:
        log.info('All av-cid relations already exist')
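
A usage sketch for the coroutine above, assuming AvDanmakuCid can be instantiated bare and exposes the cid, page, and pagename attributes the UPDATE reads; the ids are placeholders:

import asyncio

# Hypothetical data: cid 1001 belongs to aid 2002.
info = AvDanmakuCid()
info.cid = 1001
info.page = 1
info.pagename = 'Part 1'

asyncio.run(save_cid_aid_relation({1001: 2002}, {1001: info}))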
Example #3
def save_danmaku_to_db(q: Queue, danmakuMap: MutableMapping[int, DanmakuDO],
                       relationMap: MutableMapping[int, DanmakuRealationDO],
                       cid_danmakuIdSet: MutableMapping[int, Set[int]]):
    session = DBSession()
    try:
        remove_db_exist_ids(danmakuMap, relationMap, cid_danmakuIdSet.keys())
        print('[After removing DB ids] danmaku len: %s, relation len: %s' %
              (len(danmakuMap), len(relationMap)))

        if len(danmakuMap) != len(relationMap):
            raise Exception("danmaku count does not equal relation count")

        if danmakuMap.values():
            session.bulk_save_objects(danmakuMap.values())
        if relationMap.values():
            session.bulk_save_objects(relationMap.values())
        session.commit()
    except BaseException:
        session.rollback()
        name = multiprocessing.current_process().name
        _map: MutableMapping[str, str] = {name: traceback.format_exc()}
        q.put(_map)
        print('Oops: ', name)
    else:
        print('Saved to DB, len: %s' % len(danmakuMap))
        for cid, value in cid_danmakuIdSet.items():
            try:
                red.sadd(cid, *value)
            except BaseException:
                traceback.print_exc()
                print('[ERROR] redis. cid: %s' % cid)
        print('[DONE] save danmaku ids to redis')
    finally:
        session.close()
        del danmakuMap
        del relationMap
        gc.collect()
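
A worker-style sketch, assuming the maps were filled elsewhere (for example by the parser in Example #1) and that red is the module-level Redis client the function already uses; all ids are placeholders:

from multiprocessing import Manager, Process

q = Manager().Queue()
# Placeholder payloads; a real run would populate these from parsed danmaku.
danmaku_map = {7355608: DanmakuDO()}
relation_map = {7355608: DanmakuRealationDO()}
cid_ids = {170001: {7355608}}

p = Process(target=save_danmaku_to_db,
            args=(q, danmaku_map, relation_map, cid_ids))
p.start()
p.join()
if not q.empty():  # the worker queued a traceback on failure
    print(q.get())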
Example #4
def main():
  """
  测试需要调整数据库, s3删除, archive目录
  :return:
  """
  temp_file_dir = 'data-temp/'

  # download data
  log.info("Getting objects' keys")
  keys: Set[str] = _s3.get_all_objects_key()

  if len(keys) < 1:
    log.info("No file in COS!")
    exit(0)
  else:
    local_processing.multi_download(temp_file_dir, keys)
    if not _s3.archive_object(keys):
      log.error("Archive objects failed")
      exit(0)
    log.info("Download files, DONE.")

  # reading data
  all_data: MutableMapping[str, AV] = read_file(temp_file_dir)

  log.info("Analyze")
  # multi analyze
  pool = Pool(processes=cpu_use_number)
  q = multiprocessing.Manager().Queue()

  size = int(math.ceil(len(all_data) / float(cpu_use_number)))
  map_temp: MutableMapping[str, AV] = {}

  res: List[ApplyResult] = list()
  for key, value in all_data.items():
    map_temp[key] = value
    if len(map_temp) % size == 0:
      res.append(pool.apply_async(func=analyze, args=(q, map_temp,)))
      map_temp = {}
  if map_temp:  # submit any remainder that did not fill a full chunk
    res.append(pool.apply_async(func=analyze, args=(q, map_temp,)))
  pool.close()
  pool.join()
  if q.qsize() > 0:  # a non-empty queue means a worker process raised; abort the job
    log.error('analyze raised an error')
    raise Exception(q.get())

  # saving
  all_avinfos: List[AVInfoDO] = []
  all_avstats: List[AVStatDO] = []
  for item in res:
    v = item.get()
    all_avinfos.extend(v[0])
    all_avstats.extend(v[1])

  # remove avinfos which exist in db already and same in program
  log.info("Remove duplicated avinfo")
  temp: Set[int] = set()  # db
  for item in all_avinfos:
    temp.add(item.aid)
  session = DBSession()
  sql: str = "select aid from av_info where aid in (%s)" % ",".join("%s" % item for item in temp)
  aids: ResultProxy = session.execute(sql)
  temp.clear()
  for item in aids.fetchall():
    temp.add(int(item[0]))

  temp2: List[AVInfoDO] = []  # program
  for item in all_avinfos:
    if item.aid not in temp:
      temp2.append(item)
      temp.add(item.aid)
  all_avinfos = temp2

  # db
  log.info("Save infos(%s) and stats(%s)" % (all_avinfos.__len__(), all_avstats.__len__()))
  session.bulk_save_objects(all_avinfos)
  session.bulk_save_objects(all_avstats)
  session.commit()

  # archive
  log.info("Archive")
  for item in all_data.keys():
    index: int = item.find("/online")
    shutil.move(item[:index], "D:/spider archive")

  log.info('[Done]')
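
A guard like the following keeps spawned worker processes from re-running the pipeline on import, which matters on Windows (the D:/ archive path suggests that platform):

if __name__ == '__main__':
  main()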