def to_workflow_dep_graph(proj_folder,
                          output_folder,
                          target_folder_name=None,
                          target_node_names=None,
                          exclude_keys=[],
                          filter_type=None):
    # Copy to avoid the shared-mutable-default pitfall: node names are
    # appended below when a target folder name is given.
    target_node_names = list(target_node_names or [])

    wfp = WorkflowParser()
    obj = wfp.parse_folder(proj_folder)
    proj_name = os.path.basename(proj_folder)
    dest_filepath = '{}/event_dep_[{}]_target_folders[{}]_nodes[{}]_filter_{}'\
        .format(output_folder, proj_name, target_folder_name,
                '-'.join(target_node_names), filter_type)

    # Only one of target_folder_name / target_node_names is supported: when
    # a folder name is given and no node names, derive the node names from
    # the *.script files whose path contains that folder name.
    if target_folder_name and len(target_node_names) == 0:
        for f in FileUtility.list_files_recursive(proj_folder,
                                                  target_suffix='.script'):
            if target_folder_name not in f:
                continue
            target_node_names.append(os.path.basename(f))

    FileUtility.mkdir_p(output_folder)
    wfp.to_workflow_dep_graph(obj,
                              dest_filepath=dest_filepath,
                              target_node_names=target_node_names,
                              filter_type=filter_type)
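# Usage sketch (hedged: the paths and folder name below are hypothetical;
# only the function and its parameters come from this module):
#
#   to_workflow_dep_graph('/data/dwc/MyProject',
#                         '/data/output/dep_graphs',
#                         target_folder_name='daily_jobs',
#                         filter_type='event')
#
# This collects the *.script filenames under paths containing 'daily_jobs'
# and writes the event dependency graph to the generated dest_filepath.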
def __init__(self, start_date=None, *args, **kwargs):
    super(cn_proxy, self).__init__(*args, **kwargs)
    if start_date is None:
        # Default to one day before now.
        self.start_date = (datetime.datetime.now() +
                           datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
    else:
        # Reformat a compact yyyyMMdd string as yyyy-MM-dd.
        self.start_date = "%s-%s-%s" % (start_date[0:4], start_date[4:6],
                                        start_date[6:8])
    script_dir = self.get_script_dir()
    config = FileUtility.load_json_config(
        '%s/../../../../conf/proxy_crawler.config.json' % script_dir)
    timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
    self._logger = get_logger(
        "%s/../../../../log/%s.%s.log" % (script_dir, self.name, timestamp),
        config['log_level'])
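# Behavior of the two start_date branches, illustrated with hypothetical
# values (the dates are examples, not from the source):
#
#   cn_proxy()                        # start_date -> '2019-05-31 12:00:00'
#   cn_proxy(start_date='20190601')   # start_date -> '2019-06-01'
#
# Note the branches yield different precisions: the default path keeps a
# time component, while the explicit yyyyMMdd path is date-only.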
def generate_workflow_dep_graph(dwc_wf_folder,
                                out_folder,
                                target_wf_folders=[],
                                target_node_names=[]):
    FileUtility.mkdir_p(out_folder)
    for wf_folder in os.listdir(dwc_wf_folder):
        if target_wf_folders and wf_folder not in target_wf_folders:
            print('wf_folder [{}] not in target list [{}]'.format(
                wf_folder, target_wf_folders))
            continue
        wf_folder_path = os.path.join(dwc_wf_folder, wf_folder)
        print('wf_folder_path [{}]'.format(wf_folder_path))
        out_sub_folder = os.path.join(out_folder, wf_folder)
        try:
            to_workflow_dep_graph(wf_folder_path,
                                  out_sub_folder,
                                  target_node_names=target_node_names)
        except Exception as ex:
            print("Exception: {}".format(ex))
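# Usage sketch (hypothetical paths):
#
#   generate_workflow_dep_graph('/data/dwc/workflows',
#                               '/data/output/dep_graphs',
#                               target_wf_folders=['MyWorkflow'])
#
# Each workflow folder gets its own sub-folder under out_folder, and an
# exception in one folder is printed without stopping the remaining ones.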
    def run(self):
        # JD check-in entry point (invoked as jd.run() below): log in once,
        # run both check-ins, then shut the webdriver down.
        this_func_name = sys._getframe().f_code.co_name
        self.logger.debug("%s(): start ..." % this_func_name)
        if self.login():
            self.vip_check_in()
            self.finance_check_in()
            self.driver.quit()
        self.logger.debug("%s(): end ..." % this_func_name)
        return


def get_script_dir():
    return os.path.dirname(os.path.realpath(__file__))


if __name__ == '__main__':
    this_func_name = __name__
    script_dir = get_script_dir()
    config = FileUtility.load_json_config('%s/../conf/check_in.config.json' %
                                          script_dir)
    timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
    logger = get_logger("%s/../log/jd.%s.log" % (script_dir, timestamp),
                        config['log_level'])
    logger.debug(
        "%s(): ****************************************************************************************************"
        % this_func_name)
    for item in config['jd']:
        jd = JD(logger, item['user_name'], item['password'])
        jd.run()
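# Shape of check_in.config.json as implied by the reads above ('log_level',
# plus a 'jd' list of accounts); this sketch is an assumption, the real
# file may carry more fields:
#
#   config = {
#       'log_level': 'DEBUG',
#       'jd': [
#           {'user_name': 'alice', 'password': '***'},
#           {'user_name': 'bob', 'password': '***'},
#       ],
#   }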
SPIDER_MODULES = ['proxy_service_provider.kuaidaili.spiders']
NEWSPIDER_MODULE = 'proxy_service_provider.kuaidaili.spiders'

FEED_FORMAT = 'csv'
# Extract and keep as many fields as possible.
FEED_EXPORT_FIELDS = [
    "protocol", "ip", "port", "user_name", "password", "anonymity",
    "support_request_type", "location", "sp", "validation_time",
    "source_site", "source_url"
]
# FEED_URI = script_dir + '/../../../output/%(name)s.%(time)s.csv'
FEED_URI = '%s/../../../output/%s.%s.csv' % (
    script_dir, BOT_NAME, time.strftime('%Y-%m-%d_%H:%M:%S'))
FEED_URI = os.path.realpath(FEED_URI)
FileUtility.write_to_file(
    '%s/../../../output/proxy_crawler.output' % script_dir, FEED_URI + '\n')
CSV_DELIMITER = '\t'
FEED_EXPORTERS = {'csv': '_scrapy.exporters.CsvOptionRespectingItemExporter'}

LOG_LEVEL = 'INFO'
LOG_FILE = '%s/../../../log/%s.%s.log' % (script_dir, BOT_NAME,
                                          time.strftime('%Y-%m-%d_%H:%M:%S'))

ITEM_PIPELINES = {'proxy_service_provider.kuaidaili.pipelines.ETL': 0}
DOWNLOAD_DELAY = 3
EXTENSIONS = {}
CONCURRENT_REQUESTS_PER_DOMAIN = 500
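# FEED_EXPORTERS points at a custom exporter whose source is not shown here.
# A minimal sketch of a delimiter-respecting CSV exporter built on Scrapy's
# CsvItemExporter (an assumption: the real _scrapy.exporters implementation
# may obtain the delimiter differently):
#
#   from scrapy.exporters import CsvItemExporter
#
#   class CsvOptionRespectingItemExporter(CsvItemExporter):
#       def __init__(self, *args, **kwargs):
#           # Fall back to the tab delimiter configured via CSV_DELIMITER.
#           kwargs.setdefault('delimiter', '\t')
#           super(CsvOptionRespectingItemExporter,
#                 self).__init__(*args, **kwargs)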
def parse_file(self, filepath, external_params={}, dest_filepath=None):
    self.logger.info('parse_file [{}]'.format(filepath))
    self.logger.debug('file [{}], external_params = {}'.format(
        filepath, external_params))

    # Date-like external params from config usually arrive as patterns such
    # as yyyy-MM-dd; normalize them to a strftime format and, when there is
    # no explicit override, render the default TARGET_DATE with that format.
    for key in external_params:
        if 'date' in key.lower() or 'hour' in key.lower() \
                or 'time' in key.lower():
            if 'yyyy' in external_params[key] or 'mm' in external_params[key] \
                    or 'dd' in external_params[key]:
                normalized_format = ScopeResolver.to_normalized_time_format(
                    external_params[key])
                normalized_format = normalized_format.replace('{', '') \
                    .replace('}', '') \
                    .replace('@', '') \
                    .replace('"', '')
                self.logger.debug(
                    'external_param datetime format = {}, normalized to {}'
                    .format(external_params[key], normalized_format))
                if key not in self.external_params:
                    self.logger.debug(
                        'use TARGET_DATE [{}] in config.ini as datetime'
                        .format(self.target_date_str))
                    default_datetime = parser.parse(self.target_date_str)
                    self.external_params[key] = '"{}"'.format(
                        default_datetime.strftime(normalized_format))
                    self.logger.debug(
                        'set self.external_params[{}] to [{}]'.format(
                            key, self.external_params[key]))
                continue
        self.external_params[key] = external_params[key]
        self.logger.debug(
            'update external_param key [{}] to value [{}]'.format(
                key, self.external_params[key]))

    content = FileUtility.get_file_content(filepath)
    final_nodes = []
    final_edges = []
    if filepath.endswith('.module'):
        # A .module file contains multiple views; parse each view separately
        # and merge the resulting nodes/edges.
        d = self.get_module_views(content)
        for view_name in d:
            content = d[view_name]
            nodes, edges = self.parse_content(content, external_params)
            self.update_module_view_data(final_nodes, final_edges, nodes,
                                         edges, view_name)
    if filepath.endswith('.view'):
        content = self.remove_view_template(content)
        final_nodes, final_edges = self.parse_content(
            content, external_params)
    if filepath.endswith('.script'):
        final_nodes, final_edges = self.parse_content(
            content, external_params)
    if dest_filepath:
        self.to_graph(dest_filepath, final_nodes, final_edges)
    # Persist cosmos query results (structured-stream sizes).
    if self.b_add_sstream_size:
        self.ssu.refresh_cache()
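# Illustration of the normalization step above (values are hypothetical;
# the actual conversion is done by ScopeResolver.to_normalized_time_format):
#
#   external_params['StartDate'] = '@"{yyyy-MM-dd}"'
#     -> normalized_format = '%Y-%m-%d'          (after stripping { } @ ")
#     -> self.external_params['StartDate'] = '"2019-06-01"'
#        (TARGET_DATE from config.ini rendered with the normalized format)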
def parse_folder(self, folder_root, exclude_keys=None):
    # Copy to avoid the shared-mutable-default pitfall: '/objd/' is
    # appended below and would otherwise accumulate across calls.
    exclude_keys = list(exclude_keys or [])

    files = FileUtility.list_files_recursive(folder_root,
                                             target_suffix='.config')
    masters = {}  # config_filename -> master config dict
    workflows = {}  # process_name -> workflow config dict
    process_filepath = {}  # process_name -> filepath
    wf_groups = {}  # group_name -> list of process_name
    process_event_deps = {}  # process_name -> events the process depends on
    master_wf_groups = {}  # config_filename -> set of group names
    process_master_map = {}  # process_name -> master config filename
    process_group_map = {}  # process_name -> group name
    group_master_map = {}  # group name -> master config name
    script_process_map = {}  # script name -> process_name
    event_interval_map = {}  # event_name -> interval

    exclude_keys.append('/objd/')  # ignored by default

    for filepath in files:
        b_exclude = False
        for key in exclude_keys:
            if key in filepath:
                self.logger.info('skip exclude file [{}]'.format(filepath))
                b_exclude = True
                break
        if b_exclude:
            continue
        self.logger.debug('parse_folder: filepath = {}'.format(filepath))
        try:
            d = self.parse_file(filepath)
        except Exception as e:
            self.logger.warning('skip wrongly parsed file [{}]: {}'.format(
                filepath, e))
            continue
        if d is None:
            continue
        if d['master']:
            self.logger.info('found master config [{}]'.format(filepath))
            filename = os.path.basename(filepath)
            folder_name = os.path.basename(os.path.dirname(filepath))
            key = '{}##{}'.format(folder_name, filename)
            self.logger.info('master config key = [{}]'.format(key))
            if key in masters:
                self.logger.info(
                    'only use the first occurrence of a master config')
                continue
            masters[key] = d
            master_wf_groups[key] = set()
            for process_name in d['workflows']:
                group = d['workflows'][process_name]
                if group not in wf_groups:
                    wf_groups[group] = []
                wf_groups[group].append(process_name)
                if group not in master_wf_groups[key]:
                    master_wf_groups[key].add(group)
                group_master_map[group] = key
                process_group_map[process_name] = group
                process_master_map[process_name] = key
        else:
            process_name = d['process_name']
            class_name = d['class_name']
            if 'ScopeJobRunner' not in class_name:
                self.logger.info('skip non-ScopeJobRunner config.')
                continue
            script_name = os.path.basename(d['ScriptFile'])
            script_name_key = script_name.replace('.script', '')
            config_name = os.path.basename(filepath).replace('.config', '')
            if config_name != process_name:
                self.logger.warning(
                    'config_name != process_name, use config_name as process_name'
                )
                process_name = config_name
                d['process_name'] = config_name
            # keep the first occurrence only
            if script_name not in script_process_map:
                script_process_map[script_name] = d['process_name']
            self.logger.debug('process_name = {}'.format(process_name))
            workflows[process_name] = d
            process_filepath[process_name] = filepath
            if 'EventNamesToCheck' in d:
                process_event_deps[process_name] = d['EventNamesToCheck']
            else:
                process_event_deps[process_name] = ('None', )
            # keep the interval of this event
            if 'EventName' in d:
                event_interval_map[d['EventName']] = \
                    self.normalized_delta_interval(d['DeltaInterval'])

    obj = WorkflowObj()
    obj.workflows = workflows
    obj.master_wf_groups = master_wf_groups
    obj.process_event_deps = process_event_deps
    obj.wf_groups = wf_groups
    obj.process_filepath = process_filepath
    obj.masters = masters
    obj.group_master_map = group_master_map
    obj.process_group_map = process_group_map
    obj.process_master_map = process_master_map
    obj.script_process_map = script_process_map
    obj.event_interval_map = event_interval_map
    return obj
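# Usage sketch for the returned WorkflowObj (paths and names hypothetical):
#
#   wfp = WorkflowParser()
#   obj = wfp.parse_folder('/data/dwc/MyWorkflow')
#   for process_name, events in obj.process_event_deps.items():
#       print(process_name, 'depends on', events)
#   print(obj.script_process_map.get('MyJob.script'))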
def parse_script(proj_folder,
                 workflow_folder,
                 output_folder,
                 target_script_folder=None,
                 target_filenames=None,
                 add_sstream_link=False,
                 add_sstream_size=False,
                 exclude_keys=[],
                 external_params={},
                 master_key=None,
                 target_date_str=None):
    # Copy to avoid mutating a shared default list (names are appended below).
    target_filenames = list(target_filenames or [])

    print('proj_folder [{}]'.format(proj_folder))
    print('workflow_folder [{}]'.format(workflow_folder))
    wfp = WorkflowParser()
    obj = wfp.parse_folder(workflow_folder)

    # Map each script filename to its full path under the project folder.
    script_fullpath_map = {}
    for f in FileUtility.list_files_recursive(proj_folder,
                                              target_suffix='.script'):
        script_fullpath_map[os.path.basename(f)] = f

    if len(target_filenames) == 0:
        print('no specified target_filenames, check target_script_folder [{}]'
              .format(target_script_folder))
        if target_script_folder is not None:
            for f in FileUtility.list_files_recursive(target_script_folder,
                                                      target_suffix='.script'):
                target_filenames.append(os.path.basename(f))
        else:
            print('no specified target_filenames, '
                  'add all scripts that appear in workflows...')
            for script_name in obj.script_process_map:
                print('add script [{}]'.format(script_name))
                target_filenames.append(script_name)

    print('target files:')
    for f in target_filenames:
        print(f)
    if len(target_filenames) == 0:
        print('no target files, abort.')
        return

    if not os.path.isdir(output_folder):
        print('create folder [{}]'.format(output_folder))
        os.makedirs(output_folder)

    arguments_list = []
    for target_filename in target_filenames:
        arguments = (target_filename, wfp, obj, script_fullpath_map,
                     add_sstream_link, add_sstream_size, output_folder,
                     external_params, master_key, target_date_str)
        arguments_list.append(arguments)

    # Parse scripts in parallel, capping the pool at 10 worker processes.
    process_no = min(len(target_filenames), 10)
    if process_no == 1:
        exe_results = [
            parse_script_single(*arguments_list[0]),
        ]
    else:
        pool = mp.Pool(processes=process_no)
        exe_results = pool.starmap(parse_script_single, arguments_list)
    return exe_results
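# Usage sketch (hypothetical paths). Each tuple in arguments_list feeds one
# parse_script_single(...) call, run in parallel via pool.starmap:
#
#   results = parse_script('/data/dwc/MyProject',      # where *.script live
#                          '/data/dwc/MyProject',      # where *.config live
#                          '/data/output/script_graph',
#                          target_filenames=['MyJob.script'])
#
# A non-None entry in results marks a script that failed (see all_in_one).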
def all_in_one(dwc_wf_folder,
               out_folder,
               target_wf_folders=[],
               target_filenames=[],
               keep_exts=['.pdf', '.svg'],
               error_log_filename=None,
               add_sstream_link=False,
               add_sstream_size=False,
               script_root_folder=None):
    # Default target date: six days ago.
    target_date_str = DatetimeUtility.get_datetime(-6, fmt_str='%Y-%m-%d')
    FileUtility.mkdir_p(out_folder)

    error_fp = None
    if error_log_filename:
        filepath = os.path.join(out_folder, error_log_filename)
        if os.path.exists(filepath):
            # Back up the existing error log with a timestamp suffix.
            print('backup existing error log file {}'.format(filepath))
            os.rename(
                filepath, '{}.{}'.format(
                    filepath,
                    DatetimeUtility.get_datetime(0, '%Y-%m-%d_%H%M%S')))
        error_fp = open(filepath, 'w+')

    for wf_folder in os.listdir(dwc_wf_folder):
        if target_wf_folders and wf_folder not in target_wf_folders:
            print('wf_folder [{}] not in target list [{}]'.format(
                wf_folder, target_wf_folders))
            continue
        wf_folder_path = os.path.join(dwc_wf_folder, wf_folder)
        print('wf_folder_path [{}]'.format(wf_folder_path))
        out_sub_folder = os.path.join(out_folder, wf_folder)
        # if os.path.exists(out_sub_folder):
        #     print('skip processed folder [{}]'.format(out_sub_folder))
        #     continue
        try:
            to_workflow_dep_graph(wf_folder_path, out_sub_folder)
            out_script_folder = os.path.join(out_sub_folder, 'script_graph')
            script_folder = wf_folder_path
            # Use the explicitly specified script root folder when given.
            if script_root_folder:
                script_folder = script_root_folder
            results = parse_script(script_folder,
                                   wf_folder_path,
                                   out_script_folder,
                                   target_filenames=target_filenames[:],
                                   add_sstream_link=add_sstream_link,
                                   add_sstream_size=add_sstream_size,
                                   target_date_str=target_date_str)
            for result in results:
                # A non-None result means an error for that file.
                if result:
                    print('error processing file [{}]'.format(result))
                    if error_fp:
                        error_fp.write('{}/{}\n'.format(
                            wf_folder_path, result))
            if keep_exts:
                FileUtility.delete_files_except_exts(out_sub_folder,
                                                     keep_exts)
                FileUtility.delete_files_except_exts(out_script_folder,
                                                     keep_exts)
        except Exception as ex:
            print("Exception: {}".format(ex))

    if error_fp:
        error_fp.close()
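# End-to-end usage sketch (hypothetical paths): renders dependency graphs
# and script graphs for one workflow, keeps only .pdf/.svg outputs, and
# records failing scripts in errors.log under out_folder:
#
#   all_in_one('/data/dwc/workflows',
#              '/data/output/all',
#              target_wf_folders=['MyWorkflow'],
#              error_log_filename='errors.log',
#              add_sstream_link=True)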