def __init__(
        self,
        stddatatype: EStandardDataType,
        platform: str,
        cmdstatus: ECommandStatus,
        cmdrcvmsg: str = None,
        currenttime: str = None,
):
    """Build a command-status output record.

    :param stddatatype: standard data type forwarded to ``OutputData.__init__``.
    :param platform: platform name forwarded to ``OutputData.__init__``.
    :param cmdstatus: an ``ECommandStatus`` member, or any value that
        ``ECommandStatus(int(cmdstatus))`` can convert; values that cannot
        be converted leave the stored status as ``None``.
    :param cmdrcvmsg: optional message; kept only when it is a non-empty
        ``str``, otherwise stored as ``None``.
    :param currenttime: optional timestamp; when it is ``None`` or ``''``
        the current Asia/Shanghai time formatted as
        ``%Y-%m-%d %H:%M:%S`` is stored instead.
    """
    OutputData.__init__(self, platform, stddatatype)
    OutputDataSeg.__init__(self)

    status = None
    if isinstance(cmdstatus, ECommandStatus):
        status = cmdstatus
    else:
        # Best effort on purpose: an unconvertible status simply stays None.
        try:
            status = ECommandStatus(int(cmdstatus))
        except Exception:
            status = None
    self._cmdstatus: ECommandStatus = status

    has_message = isinstance(cmdrcvmsg, str) and cmdrcvmsg != ''
    self._cmdrcvmsg: str = cmdrcvmsg if has_message else None

    # Current time as a formatted string; a caller-supplied value wins.
    shanghai = pytz.timezone('Asia/Shanghai')
    now_text = datetime.now(shanghai).strftime('%Y-%m-%d %H:%M:%S')
    caller_supplied = currenttime is not None and currenttime != ''
    self._time: str = currenttime if caller_supplied else now_text
def __init__(self, task: IscoutTask, platform: str, url: str, source: str,
             rsctp: EResourceType):
    """Create a network resource record bound to an IscoutTask.

    :param task: owning task; must be an ``IscoutTask``.
    :param platform: platform name; must be a ``str``.
    :param url: resource url; must be a ``str``.
    :param source: source label; must be a ``str``.
    :param rsctp: resource type forwarded to ``Resource.__init__``.
    :raises Exception: when ``task``, ``platform``, ``url`` or ``source``
        has the wrong type (checked in that order).
    """
    # (value, required type, error message); the first mismatch raises.
    type_checks = (
        (task, IscoutTask, "Invalid IscoutTask"),
        (platform, str, "Invalid platform"),
        (url, str, "Invalid url"),
        (source, str, "Invalid source"),
    )
    for value, required, message in type_checks:
        if not isinstance(value, required):
            raise Exception(message)

    Resource.__init__(self, url, rsctp)
    OutputData.__init__(self, platform,
                        EStandardDataType.IScoutNetworkResource)
    OutputDataSeg.__init__(self)

    self._task: IscoutTask = task
    self._source: str = source

    # Optional details, all unset until a caller fills them in.
    self.resourceid: str = None
    self.filename: str = None
    self.extension: str = None
    self.remark: str = None
    self.stream = None
def __init__(self, task: IscoutTask, level: int, parentobj: str,
             parentobjtype: EObjectType, url: str, keyword: str):
    """Create a search-engine screenshot record for an IscoutTask.

    :param task: owning task; must be an ``IscoutTask``. Its ``platform``
        is forwarded to ``OutputData.__init__``.
    :param level: stored as-is (its type is not checked).
    :param parentobj: stored as-is.
    :param parentobjtype: must be an ``EObjectType``.
    :param url: must be a non-empty ``str``.
    :param keyword: must be a non-empty ``str``.
    :raises Exception: when ``task``, ``parentobjtype``, ``url`` or
        ``keyword`` fails its check (checked in that order).
    """
    if not isinstance(task, IscoutTask):
        raise Exception("Invalid iscouttask")
    if not isinstance(parentobjtype, EObjectType):
        raise Exception("Invalid parentobjtype")
    if not (isinstance(url, str) and url != ''):
        raise Exception("Invalid url")
    if not (isinstance(keyword, str) and keyword != ''):
        raise Exception("Invalid keyword")

    OutputData.__init__(self, task.platform,
                        EStandardDataType.IScoutScreenShotSE)
    OutputDataSeg.__init__(self)

    # Context this screenshot belongs to.
    self._task: IscoutTask = task
    self._level: int = level
    self._parentobj: str = parentobj
    self.parentobjtype: EObjectType = parentobjtype

    # What was captured.
    self.url = url
    self.keyword: str = keyword

    # No stream is attached at construction time.
    self._stream: io.RawIOBase = None
def delete_complete_file(self, succ: bool, data: OutputData):
    """Callback: delete the Telegram file behind ``data`` after it was read.

    Data objects that do not expose both ``isdeleteable`` and
    ``filepath_telegram``, or whose ``isdeleteable`` is falsy, are left
    untouched. Otherwise the data's stream (if any, and still open) is
    closed and ``data.filepath_telegram`` is unlinked.

    :param succ: result flag passed by the caller; not used here.
    :param data: the data object whose file should be removed.
    :return: ``False`` only when an exception occurred while closing the
        stream or deleting the file; ``True`` in every other case,
        including "nothing to delete", a ``None`` path and a missing file
        (the latter two are logged as errors).
    """
    try:
        if not hasattr(data, "isdeleteable") or not hasattr(
                data, "filepath_telegram"):
            return True
        if not data.isdeleteable:
            return True

        target = data.filepath_telegram
        if target is None:
            self._logger.error("Telegram deleteable filepath is None")
            return True

        # Close the stream first so the file is no longer held open.
        stm = data.get_stream()
        if stm is not None and not stm.closed:
            stm.close()

        if not target.exists():
            self._logger.error(
                "Telegram deletable file is not found: {}".format(target))
            return True

        target.unlink()
        return True
    except Exception as ex:
        # Was a bare ``except:`` that hid the reason and also trapped
        # KeyboardInterrupt/SystemExit; report the failure instead.
        self._logger.error("Delete telegram file failed: {}: {}".format(
            type(ex).__name__, ex))
        return False
def output_to_file(cls, data: OutputData, datastd: OutputDataConfig,
                   targetdir: str) -> bool:
    """Write ``data`` into ``targetdir``.

    When ``data.get_stream()`` yields a readable ``io.IOBase`` the data is
    written as one file (encoded fields followed by the stream). Otherwise
    every chunk produced by ``_get_mutiple_bs`` is written, stopping at
    the first chunk that fails.

    :param data: the object to output; must be an ``OutputData``.
    :param datastd: matching standard; must be an ``OutputDataConfig``.
    :param targetdir: destination directory handed to ``_output_to_file``.
    :return: for stream data, whatever ``_output_to_file`` returns; for
        segmented data ``True`` only if every chunk was written; ``False``
        on invalid arguments or when an exception was logged.
    """
    res: bool = False
    try:
        if not isinstance(data, OutputData):
            cls._logger.error("Invalid OutputData object: {}".format(data))
            return False
        if not isinstance(datastd, OutputDataConfig):
            cls._logger.error(
                "Invalid OutputDataStandard: {}".format(datastd))
            return False

        stm = data.get_stream()
        if isinstance(stm, io.IOBase) and stm.readable():
            # Stream-backed data: one file holding fields plus stream body.
            payload: bytes = cls._get_single_bs(data, datastd, stm)
            return cls._output_to_file(data, payload, datastd, targetdir,
                                       stm)

        # Segment-only data: all() stops at the first failed chunk.
        res = all(
            cls._output_to_file(data, chunk, datastd, targetdir)
            for chunk in cls._get_mutiple_bs(data, datastd))
    except Exception:
        cls._logger.error("Output data error: {} {}\n{}".format(
            data._platform, datastd._uniquename, traceback.format_exc()))
    return res
def __init__(self,
             suffix,
             datatype: EStandardDataType,
             task: Task,
             apptype: int,
             clientid: str,
             is_muti_seg: bool = False):
    """Initialise the shared state of a feed data object.

    :param suffix: file suffix; must be a non-empty ``str``.
    :param datatype: standard data type forwarded to ``OutputData.__init__``.
    :param task: forwarded to ``UniqueData.__init__``.
    :param apptype: forwarded to ``UniqueData.__init__``.
    :param clientid: must be a non-empty ``str``.
    :param is_muti_seg: stored only when it is a real ``bool``; any other
        value is treated as ``False``.
    :raises Exception: when ``clientid`` or ``suffix`` is not a non-empty
        ``str``.
    """
    UniqueData.__init__(self, task, apptype)
    OutputData.__init__(self, self._task.platform, datatype)
    OutputDataSeg.__init__(self)

    if not (isinstance(clientid, str) and clientid != ""):
        raise Exception("Invalid param 'clientid' for FeedDataBase")
    self._clientid: str = clientid

    # Creation time in the Asia/Shanghai (UTC+8) zone.
    shanghai = pytz.timezone('Asia/Shanghai')
    self.time = datetime.datetime.now(shanghai).strftime(
        '%Y-%m-%d %H:%M:%S')

    if not (isinstance(suffix, str) and suffix != ""):
        raise Exception("Suffix is invalid.")

    self._is_muti_seg: bool = (is_muti_seg if isinstance(
        is_muti_seg, bool) else False)
    self._suffix: str = suffix  # file suffix

    self.__innerdatas: list = []  # inner multi-segment data
    self.__innerdata_locker = threading.Lock()

    self._io_stream = None  # data stream downloaded from the network
    # Original note: the length is meant to be taken from the response
    # 'Content-Length' header obtained through `ha`.
    self.stream_length = 0  # size of the downloaded stream, for size filtering
    self.remarks = None  # free-form notes collected while crawling
def output(self, data: OutputData, datastd: OutputDataConfig) -> bool:
    """Output one data object according to its data standard.

    (The original docstring describes this as asynchronous output; the
    calls visible here are made directly.)

    When ``data.get_stream()`` yields a readable ``io.IOBase`` the data is
    sent as a single payload together with the stream. Otherwise every
    chunk from ``_get_mutiple_bs`` is sent; all chunks are attempted even
    after one fails. Afterwards ``data.on_complete(res, data)`` is invoked
    if it is callable.

    :param data: the data object to output; must be an ``OutputData``.
    :param datastd: the data standard for this data; must be an
        ``OutputDataConfig``.
    :return: whether the output succeeded; ``False`` on invalid arguments
        or when an exception was logged.
    """
    res: bool = False
    try:
        if not isinstance(data, OutputData):
            self._logger.error(
                "Invalid OutputData object: {}".format(data))
            return False
        if not isinstance(datastd, OutputDataConfig):
            self._logger.error(
                "Invalid OutputDataStandard: {}".format(datastd))
            return False

        stm = data.get_stream()
        has_stream = issubclass(type(stm), io.IOBase) and stm.readable()
        if has_stream:
            payload: bytes = self._get_single_bs(data, datastd, stm)
            res = self._output_sub(data, payload, datastd, stm)
        else:
            # Send every chunk before judging the overall result.
            outcomes = [
                self._output_sub(data, chunk, datastd)
                for chunk in self._get_mutiple_bs(data, datastd)
            ]
            res = all(outcomes)

        notify = data.on_complete
        if callable(notify):
            notify(res, data)
    except Exception:
        self._logger.error("Output data error: {} {}\n{}".format(
            data._platform, datastd._uniquename, traceback.format_exc()))
    return res
def _get_single_bs(
        cls,
        data: OutputData,
        datastd: OutputDataConfig,
        stm: io.RawIOBase,
        enc: str = "utf-8",
) -> bytes:
    """Build the field bytes for a data object that carries a file body.

    The first segment from ``data.get_output_segs()`` is parsed with
    ``_parse_fields`` and encoded with ``_fields_to_bytes``.

    :param data: must be an ``OutputData``.
    :param datastd: data standard handed to ``_parse_fields``.
    :param stm: the accompanying stream; must be non-``None`` and readable.
    :param enc: encoding handed to ``_fields_to_bytes``.
    :return: the encoded bytes, or ``None`` when the arguments are invalid,
        there is no segment, the parsed fields are not a non-empty dict,
        the encoded bytes are ``None`` or contain no non-zero byte, or an
        exception was logged.
    """
    try:
        usable = (isinstance(data, OutputData) and stm is not None
                  and stm.readable())
        if not usable:
            cls._logger.error(
                "Invalid OutputData object or stream for output single")
            return None

        for segment in data.get_output_segs():
            parsed: dict = cls._parse_fields(segment, datastd)
            if not (isinstance(parsed, dict) and len(parsed) >= 1):
                cls._logger.error(
                    "Invalid fields after check output segment fields:\nplatform:{}\ndatatype:{}"
                    .format(data._platform, data._datatype.name))
                return None

            encoded: bytes = cls._fields_to_bytes(parsed, enc)
            if encoded is None or not any(encoded):
                return None
            return encoded
    except Exception:
        cls._logger.error(
            "Output single data segment error:\nplatform:{}\ndatatype:{}\nerror:{}"
            .format(data._platform, data._datatype.name,
                    traceback.format_exc()))
    return None
def _get_mutiple_bs(
        cls,
        data: OutputData,
        datastd: OutputDataConfig,
        enc: str = "utf-8",
        maxsegcount: int = 1000,
) -> iter:
    """Yield the encoded segments of ``data`` in batches of ``bytes``.

    Each segment from ``data.get_output_segs()`` is parsed with
    ``_parse_fields`` and encoded with ``_fields_to_bytes``; segments with
    no usable fields or bytes are skipped. Encoded segments are appended to
    a buffer that is yielded once ``maxsegcount`` segments were collected,
    and once more at the end for any remainder.

    Nothing is yielded when ``datastd._enable`` is falsy. Iteration stops
    at the first segment that is not an ``OutputDataSeg``; anything
    buffered but not yet yielded at that point is dropped. Errors raised
    while handling a segment are logged and the loop continues.

    :param data: data object providing ``get_output_segs()``.
    :param datastd: data standard used for field parsing.
    :param enc: encoding handed to ``_fields_to_bytes``.
    :param maxsegcount: number of segments per yielded batch.
    :return: generator of ``bytes`` batches.
    """
    # A bytearray grows in place; repeated ``bytes += bytes`` would copy the
    # whole buffer for every segment (quadratic in the batch size).
    pending = bytearray()
    segcount = 0
    try:
        if not datastd._enable:
            cls._logger.debug(
                "Data standard '{}' in platform '{}' is not enabled, data won't output"
                .format(datastd._datatype.name, datastd.owner._platform))
            return True

        for seg in data.get_output_segs():
            try:
                # Build the output bytes for this segment.
                try:
                    if not isinstance(seg, OutputDataSeg):
                        cls._logger.error(
                            "Invalid OutputDataSeg object: {}".format(seg))
                        return False

                    # Check that the segment's fields are valid.
                    fields = cls._parse_fields(seg, datastd)
                    if not isinstance(fields, dict) or len(fields) < 1:
                        continue

                    bs = cls._fields_to_bytes(fields, enc)
                    if bs is None or not any(bs):
                        continue

                    pending += bs
                    segcount += 1
                except Exception:
                    cls._logger.error(
                        "Check segment fields validation failed:\nplatform:{}\ndatatype:{}\nerror:{}"
                        .format(
                            data._platform,
                            data._datatype.name,
                            traceback.format_exc(),
                        ))

                if segcount < maxsegcount:
                    continue

                # Batch limit reached: hand the batch out, then start over.
                try:
                    yield bytes(pending)
                except Exception:
                    cls._logger.error(
                        "Output mutiple segments error: {}".format(
                            traceback.format_exc()))
                finally:
                    pending = bytearray()
                    segcount = 0
            except Exception:
                cls._logger.error(
                    "Check output data segment failed:\nplatform:{}\ndatatype:{}\nerror:{}"
                    .format(data._platform, data._datatype.name,
                            traceback.format_exc()))

        # All segments visited: flush whatever is left.
        if any(pending):
            yield bytes(pending)
    except Exception:
        cls._logger.error(
            "Output mutiple data segment error:\nplatform:{}\ndatatype:{}\nerror:{}"
            .format(data._platform, data._datatype.name,
                    traceback.format_exc()))
def _output_to_file(
        cls,
        data: OutputData,
        bs: bytes,
        datastd: OutputDataConfig,
        targetdir: str,
        stm: io.RawIOBase = None,
) -> bool:
    """Write one output file into ``targetdir`` via a temporary file.

    The payload is first written to a path obtained from
    ``_get_datapath(cls._tmpdir, datastd)``, checked with
    ``data.validate_file`` and then moved to a path obtained from
    ``_get_datapath(targetdir, datastd)``.

    :param data: data object; its ``validate_file`` checks the temp file.
    :param bs: bytes written at the start of the file.
    :param datastd: data standard used to build the file paths.
    :param targetdir: destination directory.
    :param stm: optional stream whose content is appended after ``bs``
        when it is readable.
    :return: ``True`` once the file was moved into place; ``False`` when no
        temp path could be obtained, validation failed, or an exception
        was logged (temp/target leftovers are removed in that case).
    """
    res: bool = False
    tmppath: str = None
    outfi: str = None
    try:
        with cls.tmpdir_locker:
            # Temporary path; creating the file under the lock claims it.
            tmppath = cls._get_datapath(cls._tmpdir, datastd)
            if not isinstance(tmppath, str) or tmppath == "":
                return res

            with open(tmppath, mode="wb") as fs:
                fs.write(bs)
                if stm is not None and stm.readable():
                    # Copy until read() returns nothing. A short read is
                    # not EOF for raw streams, so stopping on one (as the
                    # old loop did) could truncate the file. 1 MiB chunks
                    # replace the previous 1 GiB read request.
                    shutil.copyfileobj(stm, fs, 1024 * 1024)

        # Validation step between "write to temp" and "move to target".
        # Corrupted data is dropped without an error log.
        if not data.validate_file(tmppath):
            cls._logger.debug("Corrupted data: {}".format(tmppath))
            if os.path.isfile(tmppath):
                os.remove(tmppath)
            return res

        with cls.outdir_locker:
            outfi = cls._get_datapath(targetdir, datastd)
            shutil.move(tmppath, outfi)

        res = True
    except Exception:
        # Remove whatever was left behind by the failed attempt.
        if tmppath is not None and tmppath != "" and os.path.isfile(
                tmppath):
            os.remove(tmppath)
        if outfi is not None and outfi != "" and os.path.isfile(outfi):
            os.remove(outfi)
        cls._logger.error("Output data segments sub error: {}".format(
            traceback.format_exc()))
    return res