Example #1
0
def to_workflow_dep_graph(proj_folder,
                          output_folder,
                          target_folder_name=None,
                          target_node_names=None,
                          exclude_keys=None,
                          filter_type=None):
    """Parse a workflow project folder and render its event-dependency graph.

    Args:
        proj_folder: root folder of the workflow project to parse.
        output_folder: folder the generated graph file is written into
            (created if missing).
        target_folder_name: if given and no explicit node names are passed,
            every '.script' file whose path contains this name becomes a
            target node.
        target_node_names: explicit list of node (script file) names to graph.
        exclude_keys: accepted for interface compatibility; not used here.
        filter_type: forwarded to WorkflowParser.to_workflow_dep_graph.
    """
    # Copy instead of sharing: the previous mutable-default version appended
    # discovered scripts into a default list shared across calls.
    target_node_names = list(target_node_names) if target_node_names else []

    wfp = WorkflowParser()
    obj = wfp.parse_folder(proj_folder)

    proj_name = os.path.basename(proj_folder)
    # NOTE: built before the folder expansion below, so folder-derived node
    # names do not appear in the filename (same as the original behavior).
    dest_filepath = '{}/event_dep_[{}]_target_folders[{}]_nodes[{}]_filter_{}'\
                            .format(output_folder,
                                    proj_name,
                                    target_folder_name,
                                    '-'.join(target_node_names),
                                    filter_type)

    # only support either target_folder_name or target_node_names:
    # expand the folder into node names only when no explicit names exist.
    if target_folder_name and len(target_node_names) == 0:
        for f in FileUtility.list_files_recursive(proj_folder,
                                                  target_suffix='.script'):
            if target_folder_name not in f:
                continue

            target_node_names.append(os.path.basename(f))

    FileUtility.mkdir_p(output_folder)
    wfp.to_workflow_dep_graph(obj,
                              dest_filepath=dest_filepath,
                              target_node_names=target_node_names,
                              filter_type=filter_type)
Example #2
0
    def __init__(self, start_date=None, *args, **kwargs):
        """Initialize the spider with a start date, config, and logger.

        Args:
            start_date: compact 'yyyyMMdd' string; when None, defaults to
                24 hours before now (full timestamp format).
        """
        super(cn_proxy, self).__init__(*args, **kwargs)
        # Dead `self.start_date = None` pre-assignment removed: both branches
        # below assign the attribute unconditionally.
        if start_date is None:
            self.start_date = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
        else:
            # Reformat 'yyyyMMdd' to 'yyyy-MM-dd'.
            self.start_date = "%s-%s-%s" % (start_date[0:4], start_date[4:6], start_date[6:8])

        script_dir = self.get_script_dir()
        config = FileUtility.load_json_config('%s/../../../../conf/proxy_crawler.config.json' % script_dir)
        timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
        # Per-run log file named after the spider; level comes from config.
        self._logger = get_logger("%s/../../../../log/%s.%s.log" % (script_dir, self.name, timestamp), config['log_level'])
Example #3
0
def generate_workflow_dep_graph(dwc_wf_folder,
                                out_folder,
                                target_wf_folders=None,
                                target_node_names=None):
    """Generate an event-dependency graph for each workflow sub-folder.

    Args:
        dwc_wf_folder: parent folder whose direct children are workflow
            folders.
        out_folder: output root; one sub-folder is used per workflow.
        target_wf_folders: if non-empty, only folders named here are
            processed; others are reported and skipped.
        target_node_names: forwarded to to_workflow_dep_graph.
    """
    # None-defaults replace mutable default arguments shared across calls.
    target_wf_folders = target_wf_folders or []
    target_node_names = target_node_names or []

    FileUtility.mkdir_p(out_folder)
    for wf_folder in os.listdir(dwc_wf_folder):
        if target_wf_folders and wf_folder not in target_wf_folders:
            print('wf_folder [{}] not in target list [{}]'.format(
                wf_folder, target_wf_folders))
            continue

        wf_folder_path = os.path.join(dwc_wf_folder, wf_folder)
        print('wf_folder_path [{}]'.format(wf_folder_path))

        out_sub_folder = os.path.join(out_folder, wf_folder)

        # Best-effort: one malformed workflow must not abort the whole batch.
        try:
            to_workflow_dep_graph(wf_folder_path,
                                  out_sub_folder,
                                  target_node_names=target_node_names)
        except Exception as ex:
            print("Exception: {}".format(ex))
Example #4
0
        this_func_name = sys._getframe().f_code.co_name
        self.logger.debug("%s(): start ..." % this_func_name)
        if self.login():
            self.vip_check_in()
            self.finance_check_in()
        self.driver.quit()
        self.logger.debug("%s(): end ..." % this_func_name)
        return


def get_script_dir():
    """Return the absolute directory of this script, symlinks resolved."""
    script_path = os.path.realpath(__file__)
    return os.path.dirname(script_path)


if __name__ == '__main__':
    # Used only as a tag in the log lines below.
    this_func_name = __name__

    script_dir = get_script_dir()
    # Account list and log level come from the sibling conf directory.
    config = FileUtility.load_json_config('%s/../conf/check_in.config.json' %
                                          script_dir)
    timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
    # Per-run log file, timestamped so runs don't overwrite each other.
    logger = get_logger("%s/../log/jd.%s.log" % (script_dir, timestamp),
                        config['log_level'])
    # Visual separator marking the start of a run in the log.
    logger.debug(
        "%s(): ****************************************************************************************************"
        % this_func_name)

    # Run the JD check-in flow once per configured account.
    for item in config['jd']:
        jd = JD(logger, item['user_name'], item['password'])
        jd.run()
Example #5
0
# Scrapy settings for the kuaidaili proxy crawler.
# NOTE(review): `script_dir`, `BOT_NAME`, `os`, `time` and `FileUtility` are
# assumed to be defined earlier in this settings module — confirm upstream.
SPIDER_MODULES = ['proxy_service_provider.kuaidaili.spiders']
NEWSPIDER_MODULE = 'proxy_service_provider.kuaidaili.spiders'

FEED_FORMAT = 'csv'
# Extract and keep as many fields as possible.
FEED_EXPORT_FIELDS = [
    "protocol", "ip", "port", "user_name", "password", "anonymity",
    "support_request_type", "location", "sp", "validation_time", "source_site",
    "source_url"
]

# FEED_URI = script_dir + '/../../../output/%(name)s.%(time)s.csv'
# Timestamped output CSV path, resolved to an absolute path.
FEED_URI = '%s/../../../output/%s.%s.csv' % (
    script_dir, BOT_NAME, time.strftime('%Y-%m-%d_%H:%M:%S'))
FEED_URI = os.path.realpath(FEED_URI)
# Side effect at import time: record the output path so other tooling can
# locate this run's CSV.
FileUtility.write_to_file(
    '%s/../../../output/proxy_crawler.output' % script_dir, FEED_URI + '\n')

CSV_DELIMITER = '\t'
# Custom exporter that honors the delimiter option above.
FEED_EXPORTERS = {'csv': '_scrapy.exporters.CsvOptionRespectingItemExporter'}

LOG_LEVEL = 'INFO'
LOG_FILE = '%s/../../../log/%s.%s.log' % (script_dir, BOT_NAME,
                                          time.strftime('%Y-%m-%d_%H:%M:%S'))

ITEM_PIPELINES = {'proxy_service_provider.kuaidaili.pipelines.ETL': 0}

# Throttle requests to be polite to the source site.
DOWNLOAD_DELAY = 3

EXTENSIONS = {}

CONCURRENT_REQUESTS_PER_DOMAIN = 500
    def parse_file(self, filepath, external_params=None, dest_filepath=None):
        """Parse one .script/.view/.module file into graph nodes and edges.

        Args:
            filepath: file to parse; its suffix selects the parsing mode.
            external_params: optional dict of external parameter overrides.
                date/hour/time-like keys whose values contain yyyy/mm/dd
                placeholders are normalized and, if not already set, defaulted
                from self.target_date_str before being stored on
                self.external_params.
            dest_filepath: when set, the parsed graph is rendered there.
        """
        # None-default replaces a shared mutable default argument.
        if external_params is None:
            external_params = {}

        self.logger.info('parse_file [{}]'.format(filepath))
        self.logger.debug('file [{}], external_params = {}'.format(
            filepath, external_params))

        # keep date key because external params from config is probably yyyy-MM-dd format
        for key in external_params:
            if 'date' in key.lower() or 'hour' in key.lower(
            ) or 'time' in key.lower():
                if 'yyyy' in external_params[key] or 'mm' in external_params[
                        key] or 'dd' in external_params[key]:
                    normalized_format = ScopeResolver.to_normalized_time_format(
                        external_params[key])
                    # Strip decoration so only the strftime-style format is left.
                    normalized_format = normalized_format.replace('{', '') \
                        .replace('}', '') \
                        .replace('@', '') \
                        .replace('"', '')

                    self.logger.debug(
                        'external_param datetime format = {}, normalized to {}'
                        .format(external_params[key], normalized_format))

                    # Only fill in a default when the key was not set before.
                    if key not in self.external_params:
                        self.logger.debug(
                            'use TARGET_DATE [{}] in config.ini as datatime'.
                            format(self.target_date_str))
                        default_datetime = parser.parse(self.target_date_str)
                        self.external_params[key] = '"{}"'.format(
                            default_datetime.strftime(normalized_format))

                        self.logger.debug(
                            'set self.external_params[{}] to [{}]'.format(
                                key, self.external_params[key]))
                        continue

            self.external_params[key] = external_params[key]
            self.logger.debug(
                'update external_param key [{}] to value [{}]'.format(
                    key, self.external_params[key]))

        content = FileUtility.get_file_content(filepath)

        final_nodes = []
        final_edges = []

        if filepath.endswith('.module'):
            # A module contains several views; parse each and merge results.
            d = self.get_module_views(content)

            for view_name in d:
                content = d[view_name]
                nodes, edges = self.parse_content(content, external_params)

                self.update_module_view_data(final_nodes, final_edges, nodes,
                                             edges, view_name)

        if filepath.endswith('.view'):
            content = self.remove_view_template(content)

            final_nodes, final_edges = self.parse_content(
                content, external_params)

        if filepath.endswith('.script'):
            final_nodes, final_edges = self.parse_content(
                content, external_params)

        if dest_filepath:
            self.to_graph(dest_filepath, final_nodes, final_edges)

        # save cosmos querying results
        if self.b_add_sstream_size:
            self.ssu.refresh_cache()
Example #7
0
    def parse_folder(self, folder_root, exclude_keys=None):
        """Parse every .config under folder_root into a WorkflowObj.

        Master configs populate the group/master lookup maps; per-process
        configs (ScopeJobRunner only) populate the workflow, event-dependency
        and event-interval maps.

        Args:
            folder_root: root folder scanned recursively for '.config' files.
            exclude_keys: optional substrings; any filepath containing one is
                skipped. '/objd/' is always excluded.

        Returns:
            WorkflowObj with all collected lookup tables attached.
        """
        # Copy to avoid mutating the caller's list. The old `exclude_keys=[]`
        # default was appended to below, so repeated calls accumulated
        # '/objd/' entries in a list shared across calls.
        exclude_keys = list(exclude_keys) if exclude_keys else []
        exclude_keys.append('/objd/')  # by default, ignore this

        files = FileUtility.list_files_recursive(folder_root,
                                                 target_suffix='.config')

        masters = {}  # config_filename -> master config dict
        workflows = {}  # process_name -> workflow config dict
        process_filepath = {}  # process_name -> filepath
        wf_groups = {}  # group_name -> list of process_name
        process_event_deps = {}  # process_name -> list of depends on events
        master_wf_groups = {}  # config_filename -> list of group names
        process_master_map = {}  # process_name -> master config filename
        process_group_map = {}  # process_name -> group name
        group_master_map = {}  # group name -> master config name
        script_process_map = {}  # script name -> process_name
        event_interval_map = {}  # event_name -> interval

        for filepath in files:
            # Skip any file whose path matches an exclusion substring.
            b_exclude = False
            for key in exclude_keys:
                if key in filepath:
                    self.logger.info('skip exclude file [{}]'.format(filepath))
                    b_exclude = True
                    break

            if b_exclude:
                continue

            self.logger.debug('parse_folder: filepath = {}'.format(filepath))
            # Best-effort: a single malformed config must not abort the scan.
            try:
                d = self.parse_file(filepath)
            except Exception as e:
                self.logger.warning('skip wrongly parsed file [{}]: {}'.format(
                    filepath, e))
                continue

            if d is None:
                continue

            if d['master']:
                self.logger.info('found master config [{}]'.format(filepath))
                # Key on folder + filename so same-named configs in different
                # folders stay distinct.
                filename = os.path.basename(filepath)
                folder_name = os.path.basename(os.path.dirname(filepath))
                key = '{}##{}'.format(folder_name, filename)
                self.logger.info('master config key = [{}]'.format(key))

                if key in masters:
                    self.logger.info(
                        'only use the first occurrence of a master config')
                    continue

                masters[key] = d
                master_wf_groups[key] = set()

                for process_name in d['workflows']:
                    group = d['workflows'][process_name]

                    if group not in wf_groups:
                        wf_groups[group] = []

                    wf_groups[group].append(process_name)

                    if group not in master_wf_groups[key]:
                        master_wf_groups[key].add(group)
                        group_master_map[group] = key

                    process_group_map[process_name] = group
                    process_master_map[process_name] = key
            else:
                process_name = d['process_name']
                class_name = d['class_name']

                if 'ScopeJobRunner' not in class_name:
                    self.logger.info('skip non-ScopeJobRunner config.')
                    continue

                script_name = os.path.basename(d['ScriptFile'])
                script_name_key = script_name.replace('.script', '')
                config_name = os.path.basename(filepath).replace('.config', '')

                # The config filename wins over the declared process name.
                if config_name != process_name:
                    self.logger.warning(
                        'config_name != script_name_key, use config_name as process_name'
                    )
                    process_name = config_name
                    d['process_name'] = config_name

                # keep the first occurrence only
                if script_name not in script_process_map:
                    script_process_map[script_name] = d['process_name']

                self.logger.debug('process_name = {}'.format(process_name))
                workflows[process_name] = d
                process_filepath[process_name] = filepath

                if 'EventNamesToCheck' in d:
                    process_event_deps[process_name] = d['EventNamesToCheck']
                else:
                    process_event_deps[process_name] = ('None', )

                # keep the interval of this event
                if 'EventName' in d:
                    event_interval_map[
                        d['EventName']] = self.normalized_delta_interval(
                            d['DeltaInterval'])

        obj = WorkflowObj()

        obj.workflows = workflows
        obj.master_wf_groups = master_wf_groups
        obj.process_event_deps = process_event_deps
        obj.wf_groups = wf_groups
        obj.process_filepath = process_filepath
        obj.masters = masters
        obj.group_master_map = group_master_map
        obj.process_group_map = process_group_map
        obj.process_master_map = process_master_map
        obj.script_process_map = script_process_map
        obj.event_interval_map = event_interval_map

        return obj
Example #8
0
def parse_script(proj_folder,
                 workflow_folder,
                 output_folder,
                 target_script_folder=None,
                 target_filenames=None,
                 add_sstream_link=False,
                 add_sstream_size=False,
                 exclude_keys=None,
                 external_params=None,
                 master_key=None,
                 target_date_str=None):
    """Parse the target .script files, fanning out over processes when >1.

    Args:
        proj_folder: root folder scanned recursively for '.script' files.
        workflow_folder: folder of workflow .config files, parsed first.
        output_folder: destination for generated graphs (created if missing).
        target_script_folder: fallback source of target names when
            target_filenames is empty.
        target_filenames: explicit script file names to process; when empty,
            targets come from target_script_folder or from all workflow
            scripts.
        add_sstream_link / add_sstream_size: flags forwarded to
            parse_script_single.
        exclude_keys: accepted for interface compatibility; not used here.
        external_params / master_key / target_date_str: forwarded to
            parse_script_single.

    Returns:
        List of parse_script_single results, or None when there is nothing
        to do.
    """
    # Copies/None-defaults replace mutable default arguments that were
    # appended to below and therefore leaked targets across calls.
    target_filenames = list(target_filenames) if target_filenames else []
    external_params = external_params if external_params is not None else {}

    print('proj_folder [{}]'.format(proj_folder))
    print('workflow_folder [{}]'.format(workflow_folder))

    wfp = WorkflowParser()
    obj = wfp.parse_folder(workflow_folder)

    # Map script basename -> full path for every script in the project.
    script_fullpath_map = {}
    for f in FileUtility.list_files_recursive(proj_folder,
                                              target_suffix='.script'):
        script_fullpath_map[os.path.basename(f)] = f

    if len(target_filenames) == 0:
        print('no specified target_filenames, check target_script_folder [{}]'.
              format(target_script_folder))

        if target_script_folder is not None:
            for f in FileUtility.list_files_recursive(target_script_folder,
                                                      target_suffix='.script'):
                target_filenames.append(os.path.basename(f))
        else:
            print(
                'no specified target_filenames, add all scripts appear in workflows...'
            )
            for script_name in obj.script_process_map:
                print('add script [{}]'.format(script_name))
                target_filenames.append(script_name)

    print('target files:')
    for f in target_filenames:
        print(f)

    if len(target_filenames) == 0:
        print('no target files, abort.')
        return

    if not os.path.isdir(output_folder):
        print('create folder [{}]'.format(output_folder))
        os.makedirs(output_folder)

    arguments_list = []
    for target_filename in target_filenames:
        arguments = (target_filename, wfp, obj, script_fullpath_map,
                     add_sstream_link, add_sstream_size, output_folder,
                     external_params, master_key, target_date_str)

        arguments_list.append(arguments)

    # Cap worker count at 10; skip the pool entirely for a single script.
    process_no = min(len(target_filenames), 10)

    if process_no == 1:
        exe_results = [
            parse_script_single(*arguments_list[0]),
        ]
    else:
        # Context manager terminates the pool — the original leaked workers.
        with mp.Pool(processes=process_no) as pool:
            exe_results = pool.starmap(parse_script_single, arguments_list)

    return exe_results
Example #9
0
def all_in_one(dwc_wf_folder,
               out_folder,
               target_wf_folders=None,
               target_filenames=None,
               keep_exts=None,
               error_log_filename=None,
               add_sstream_link=False,
               add_sstream_size=False,
               script_root_folder=None):
    """Generate workflow-dependency and script graphs for every workflow.

    Args:
        dwc_wf_folder: parent folder whose direct children are workflow
            folders.
        out_folder: output root; one sub-folder per workflow (created).
        target_wf_folders: if non-empty, only folders named here are
            processed.
        target_filenames: script names forwarded (as a copy) to parse_script.
        keep_exts: extensions kept in the output folders after processing;
            defaults to ['.pdf', '.svg']. Pass an empty list/falsy value to
            keep everything.
        error_log_filename: when set, failing scripts are written to this
            file under out_folder (an existing file is backed up first).
        add_sstream_link / add_sstream_size: flags forwarded to parse_script.
        script_root_folder: optional override of the script source folder.
    """
    # None-defaults replace mutable default arguments shared across calls;
    # `is None` keeps an explicit empty keep_exts meaning "keep everything".
    target_wf_folders = target_wf_folders or []
    target_filenames = target_filenames or []
    if keep_exts is None:
        keep_exts = ['.pdf', '.svg']

    target_date_str = DatetimeUtility.get_datetime(-6, fmt_str='%Y-%m-%d')

    FileUtility.mkdir_p(out_folder)

    error_fp = None
    if error_log_filename:
        filepath = os.path.join(out_folder, error_log_filename)

        if os.path.exists(filepath):
            # Back up the previous run's error log instead of overwriting it.
            print('backup existing error log file {}'.format(filepath))
            os.rename(
                filepath, '{}.{}'.format(
                    filepath,
                    DatetimeUtility.get_datetime(0, '%Y-%m-%d_%H%M%S')))

        error_fp = open(filepath, 'w+')

    # try/finally guarantees the error log is closed even if an unexpected
    # error escapes the per-workflow handler below.
    try:
        for wf_folder in os.listdir(dwc_wf_folder):
            if target_wf_folders and wf_folder not in target_wf_folders:
                print('wf_folder [{}] not in target list [{}]'.format(
                    wf_folder, target_wf_folders))
                continue

            wf_folder_path = os.path.join(dwc_wf_folder, wf_folder)
            print('wf_folder_path [{}]'.format(wf_folder_path))

            out_sub_folder = os.path.join(out_folder, wf_folder)

            # Best-effort: one broken workflow must not abort the batch.
            try:
                to_workflow_dep_graph(wf_folder_path, out_sub_folder)

                out_script_folder = os.path.join(out_sub_folder,
                                                 'script_graph')
                script_folder = wf_folder_path

                # explicitly specified
                if script_root_folder:
                    script_folder = script_root_folder

                results = parse_script(script_folder,
                                       wf_folder_path,
                                       out_script_folder,
                                       target_filenames=target_filenames[:],
                                       add_sstream_link=add_sstream_link,
                                       add_sstream_size=add_sstream_size,
                                       target_date_str=target_date_str)

                for result in results:
                    # not None means error
                    if result:
                        print('error processing file [{}]'.format(result))

                        if error_fp:
                            error_fp.write('{}/{}\n'.format(
                                wf_folder_path, result))

                if keep_exts:
                    FileUtility.delete_files_except_exts(
                        out_sub_folder, keep_exts)
                    FileUtility.delete_files_except_exts(
                        out_script_folder, keep_exts)
            except Exception as ex:
                print("Exception: {}".format(ex))
    finally:
        if error_fp:
            error_fp.close()