Example #1
def run_pipeline():
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()

    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
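                # exc_value is presumably the string repr of the SystemExit
                # argument (e.g. "ExitCode.other_error"), so eval() with
                # ExitCode in scope reconstructs the enum member.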
                return eval(
                    exc_value,
                    {'ExitCode': ExitCode}, {'exc_value': exc_value})
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
Example #2
def bootstrap(config=None, option=None):
    """entry point; parse command line argument, create pipeline object,
    and run it
    """
    print("+- Apus powered by Ruffus ver {0} -+".format(ruffus.__version__))
    if config is None:
        config = sys.modules['__main__']
    if option is None:
        option = sys.argv[1:]
    config, option = configure(config, option)
    if option.list_tasks:
        tlist = config.get_task_names()
        if not tlist:
            print("no tasks found")
        else:
            for t in tlist:
                print("{0}".format(t))
        sys.exit(0)
    # set up astromatic config
    config.am = am.AmConfig(**config.env_overrides)
    build_pipeline(config)
    # handle redo-all
    if option.redo_all:
        task_list = ruffus.pipeline_get_task_names()
        option.forced_tasks.extend(task_list)
    if len(option.forced_tasks) > 0:
        for t in option.forced_tasks:
            config.logger.info("forced redo: {0}".format(utils.alert(t)))
    cmdline.run(option, checksum_level=1)
Example #3
File: main.py Project: bjpop/lynch_gatk
def main():
    '''Initialise the pipeline, then run it'''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))
    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(progname=program_name), file=sys.stdout)
        print("Error message: {msg}".format(msg=e.message, file=sys.stdout))
        exit(error_codes.DRMAA_ERROR)
    # Parse the configuration file, and initialise global state
    config = Config(options.config)
    config.validate()
    state = State(options=options, config=config, logger=logger,
                  drmaa_session=drmaa_session)
    # Build the pipeline workflow
    pipeline = make_pipeline(state)
    # Run (or print) the pipeline
    cmdline.run(options)
    if drmaa_session is not None:
        # Shut down the DRMAA session
        drmaa_session.exit()
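
Note that the example above only shuts the DRMAA session down on the success path; if cmdline.run() raises, the session is never released. A minimal sketch of a safer shape using try/finally (parse_command_line is assumed from the example above):

import drmaa
from ruffus import cmdline

def main():
    options = parse_command_line()  # assumed helper from the example above
    drmaa_session = drmaa.Session()
    drmaa_session.initialize()
    try:
        # run (or print) the pipeline
        cmdline.run(options)
    finally:
        # release the DRMAA session even if a task raised
        drmaa_session.exit()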
Example #4
File: main.py Project: mawekuwe/OCRmyPDF
def run_pipeline():
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()

    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                return eval(
                    exc_value,
                    {'ExitCode': ExitCode}, {'exc_value': exc_value})
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
Example #5
def main():
    '''Initialise the pipeline, then run it'''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))
    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(
            progname=program_name),
              file=sys.stdout)
        print("Error message: {msg}".format(msg=e.message, file=sys.stdout))
        exit(error_codes.DRMAA_ERROR)
    # Parse the configuration file, and initialise global state
    config = Config(options.config)
    config.validate()
    state = State(options=options,
                  config=config,
                  logger=logger,
                  drmaa_session=drmaa_session)
    # Build the pipeline workflow
    pipeline = make_pipeline(state)
    # Run (or print) the pipeline
    cmdline.run(options)
    if drmaa_session is not None:
        # Shut down the DRMAA session
        drmaa_session.exit()
Example #6
File: main.py Project: concepz/OCRmyPDF
def run_pipeline():
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e',
                     180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(
                    n + 1,
                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
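
traverse_ruffus_exception is defined elsewhere in OCRmyPDF and not shown here. Given the comment above about Ruffus presenting either a flat (task, job, exc, value, stack) tuple or a nested list of them, a minimal sketch of such a traversal might look like the following (the exception-name mapping is illustrative, not the project's actual table):

def traverse_ruffus_exception(exc_args):
    # Recurse until we hit a 5-element (task, job, exc, value, stack) leaf.
    if len(exc_args) == 5 and all(isinstance(s, str) for s in exc_args):
        task_name, job_name, exc_name, exc_value, exc_stack = exc_args
        if exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
            return ExitCode.input_file
        if exc_name == 'subprocess.CalledProcessError':
            return ExitCode.child_process_error
        return None  # unrecognised exception: let the caller handle it
    for item in exc_args:
        if isinstance(item, (list, tuple)):
            exitcode = traverse_ruffus_exception(item)
            if exitcode is not None:
                return exitcode
    return None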
Example #7
File: trenchrun.py Project: sivy/kuiil
def main():
    parser = cmdline.get_argparse(description="Trench Run pipeline")

    args = parser.parse_args()

    if args.target_tasks:
        cmdline.run(args)

    else:
        pipeline_run(publish_data)
Example #8
def run_pipeline():
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(n + 1,
                                                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
Example #9
def main(program_name, program_version, make_pipeline):
    '''Initialise the pipeline, then run it'''
    # Parse command line arguments
    options = parse_command_line(program_version)
    # Initialise the logger
    # logger = Logger(__name__, options.log_file, options.verbose)
    if options.log_file:
        logging.basicConfig(filename=options.log_file,
                            level=LOGGING_LEVEL,
                            filemode="a",
                            format="%(asctime)s %(levelname)s - %(message)s",
                            datefmt="%m-%d-%Y %H:%M:%S")
    logger = logging.getLogger(__name__)
    # Log the command line used to run the pipeline
    logger.info("*** rnapipe ***")
    logger.info(' '.join(sys.argv))
    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(
            progname=program_name),
              file=sys.stdout)
        print("Error message: {msg}".format(msg=e.message, file=sys.stdout))
        exit(error_codes.DRMAA_ERROR)
    # Parse the configuration file, and initialise global state
    config = Config(options.config)
    config.validate()
    state = State(options=options,
                  config=config,
                  logger=logger,
                  drmaa_session=drmaa_session)
    # Build the pipeline workflow
    pipeline = make_pipeline(state)
    # Run (or print) the pipeline
    cmdline.run(options)
    if drmaa_session is not None:
        # Shut down the DRMAA session
        drmaa_session.exit()
Example #10
def main():
    '''Initialise the pipeline, then run it'''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))
    # Set up the DRMAA session for running cluster jobs
    drmaa_session = drmaa.Session()
    drmaa_session.initialize()
    # Parse the configuration file, and initialise global state
    config = Config(options.config)
    config.validate()
    state = State(options=options, config=config, logger=logger,
                  drmaa_session=drmaa_session)
    # Build the pipeline workflow
    pipeline = make_pipeline(state)
    # Run (or print) the pipeline
    cmdline.run(options)
    # Shut down the DRMAA session
    drmaa_session.exit()
Example #11
File: main.py Project: balu-/OCRmyPDF
def run_pipeline():
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                exit_code = getattr(ExitCode, exit_code_name, ExitCode.other_error)
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
Example #12
File: main.py Project: khalidm/crpipe
def main():
    '''Initialise the pipeline, then run it'''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))
    # Set up the DRMAA session for running cluster jobs
    drmaa_session = drmaa.Session()
    drmaa_session.initialize()
    # Parse the configuration file, and initialise global state
    config = Config(options.config)
    config.validate()
    state = State(options=options,
                  config=config,
                  logger=logger,
                  drmaa_session=drmaa_session)
    # Build the pipeline workflow
    pipeline = make_pipeline(state)
    # Run (or print) the pipeline
    cmdline.run(options)
    # Shut down the DRMAA session
    drmaa_session.exit()
Example #13
def run_pipeline():
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()
    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                exit_code = getattr(ExitCode, exit_code_name, ExitCode.other_error)
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
Example #14
def run_pipeline():
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        # Ruffus flattens the exception to a string, throwing away all kinds
        # of helpful details
        # task_name, job_name - ruffus status
        # exc_name - class name of exception
        # exc_value - irritating string that makes impossible to recover
        #   exception object
        # exc_stack - string that contains traceback of exception
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                exit_code = getattr(ExitCode, exit_code_name, ExitCode.other_error)
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                _log.error(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    _log.error("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file
            elif exc_name == 'subprocess.CalledProcessError':
                # It's up to the subprocess handler to report something useful
                msg = "Error occurred while running this command:"
                _log.error(msg + '\n' + exc_value)
                return ExitCode.child_process_error
            elif not options.verbose:
                _log.error(e)

        return ExitCode.other_error
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(n + 1,
                                                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
Example #15
args = parser.parse_args()
params = get_params(args, args.params)
check_params(args, params)

logs_dir = args.outdir + '/logs'
if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

log_file = '%s/log.%s.txt' % (logs_dir, datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
logger, logging_mutex = cmdline.setup_logging(__name__,
                                              log_file,
                                              args.verbose)
print('log_file:', log_file)

cmdline.run(args)

read_pairs = []
if args.fq:
    read_pairs = format_read_pairs(fqs=args.fq)
elif args.fq_list:
    read_pairs = format_read_pairs(list_file=args.fq_list)

history_file = '%s/.ruffus_history.sqlite' % args.outdir
bbt_outdir = '%s/bbt_%s' % (args.outdir, get_version('bbt'))
assembly_outdir = '%s/rnabloom_%s' % (args.outdir, get_version('rnabloom'))
pv_outdir = '%s/pv_%s' % (args.outdir, get_version('pv'))
bbt_prefix = bbt_outdir + '/' + args.sample

# for determining how many procs/threads to give to each analysis
num_analysis = 2
Example #16
def run_pipeline():
    cmdline.run(options, multiprocess=available_cpu_count())
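
available_cpu_count() is a small helper from OCRmyPDF rather than part of Ruffus; a rough sketch of the usual fallback pattern (an assumption, not the project's exact implementation):

import multiprocessing

def available_cpu_count():
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        # cpu_count() can fail on unusual platforms; run single-threaded
        return 1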
Example #17
    except error_drmaa_job as err:
        raise Exception("\n".join(map(str,
                        ["Failed to run:",
                         cmd,
                         err,
                         stdout_res,
                         stderr_res])))

    with logger_mutex:
        logger.debug("kallisto worked")


if __name__ == '__main__':
    cmdline.run(options, multithread=options.jobs)
    drmaa_session.exit()
    pipeline_printout_graph("bulk_rna-seq.jpg", "jpg",
                            [trim_fastq, hisat2, star, kallisto, cufflinks, qorts],
                            no_key_legend=True,
                            ignore_upstream_of_target=True,
                            pipeline_name="bulk RNA-seq",
                            user_colour_scheme={
                                "colour_scheme_index": 2,
                                "Bulk RNA-seq": {"fontcolor": '"#FF3232"'},
                                "Task to run": {"linecolor": '"#0044A0"'},
                                "Final target": {"fillcolor": '"#EFA03B"',
                                                 "fontcolor": "black",
                                                 "dashed": 0},
                            })
    pipeline_printout()
Example #18
def run_pipeline(args=None):
    options = parser.parse_args(args=args)
    options.verbose_abbreviated_path = 1
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_options(options, _log)
    check_dependency_versions(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
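
The recurring comment that "changes to options will not take effect for options that are already bound" is ordinary Python evaluation-time behaviour: values read from options while the pipeline is being built are captured then, so later mutation of options cannot reach them. A tiny self-contained illustration (hypothetical names):

class Options:
    input_file = 'a.pdf'

options = Options()

def build_pipeline(opts):
    bound_input = opts.input_file  # evaluated at construction time

    def task():
        return bound_input

    return task

task = build_pipeline(options)
options.input_file = 'b.pdf'  # too late: the task still sees 'a.pdf'
assert task() == 'a.pdf'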
Example #19
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())

    check_options(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(textwrap.dedent("""\
                Output file location is not writable."""))
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)
        manager.start()

        context = manager.JobContext()
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file != '-':
        if options.output_type == 'pdfa':
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))

                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf
    else:
        _log.info("Output sent to stdout")

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))
    direction = {0: 'n', 90: 'e',
                 180: 's', 270: 'w'}
    orientations = []
    for n, page in enumerate(pdfinfo):
        angle = pdfinfo[n].rotation or 0
        if angle != 0:
            orientations.append('{0}{1}'.format(
                n + 1,
                direction.get(angle, '')))
    if orientations:
        _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
Example #20
              (outfile, infiles[0], infiles[1]))

    out.close()


@follows(exampleCombinations, examplePermutations, exampleProduct)
def advancedRuffus():
    '''
    This is a dummy function to demonstrate the use of dummy functions to run
    subsections of the pipeline.
    Running the pipeline as make advancedRuffus will update, if needed,
    exampleCombinations, examplePermutations and exampleProduct,
    plus any prior steps they depend upon - these are exampleOriginate,
    exampleTransform and exampleSubdivide.
    exampleMerge, exampleSplit and exampleCollate will not be run.
    '''


@follows(basicRuffus, advancedRuffus)
def full():
    '''
    All cgat pipelines should end with a full() function which updates,
    if needed, all branches of the pipeline.
    The @follows statement should ensure that all functions are covered,
    either directly or as prerequisites.
    '''


# this is essential to run the pipeline with ruffus
cmdline.run(options)
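
Per the docstrings above, running a subsection of a CGAT-style pipeline means naming the dummy task as the target; following the docstring's own wording, a hypothetical invocation would be:

python pipeline_example.py make advancedRuffus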
Example #21
            dfm = dfo.join(dft)  # Merge in preparation for comparison
            assert len(dfo) == len(dft) == len(dfm)
            results.append({
                'numerator': info['numerator'],
                'denominator': info['denominator'],
                'pearson': dfm.corr(method='pearson').loc['orig']['xform'],
                'spearman': dfm.corr(method='spearman').loc['origRnk']['xformRnk'],
                'kendall': dfm.corr(method='kendall').loc['origRnk']['xformRnk'],
            })
            lg.info("xform::powall_cmp path::%s ::done" % xfpath)

        newk = 'comparison'
        ous[newk] = pd.DataFrame(results).set_index(
            ['numerator', 'denominator'])
        ous.get_storer(newk).attrs.info = ins.get_storer('orig').attrs.info

    finally:
        ins.close()
        ous.close()


cmdline.run(options,
            checksum_level=rf.ruffus_utility.CHECKSUM_HISTORY_TIMESTAMPS,
            logger=lg)
Example #22
    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )

    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )

    return pipe


if __name__ == "__main__":
    parser = cmdline.get_argparse(description="CNV Calling",
                                  ignored_args=["jobs"])

    options = parser.parse_args()
    options.history_file = os.path.join(WORK_DIR, ".ruffus_history.sqlite")

    pipeline = build_pipeline()

    cmdline.run(options, multithread=3)
Example #23
# parser.add_argument('--pipeline', "-p",
# 					type=str,
# 					choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'],
#                     help="Defining which pipeline to run")

parser.add_argument(
    '--config_file',
    "-cf",
    type=str,
    #metavar="config_file",
    help="yaml file with pipeline parameters")

options = parser.parse_args()

## standard python logger which can be synchronised across concurrent Ruffus tasks
## define logging output with --log_file  log_file_name
logger, logger_mutex = cmdline.setup_logging(__name__, options.log_file,
                                             options.verbose)

# only run the pipeline if we are not just printing / drawing a flowchart
if not options.just_print and \
    not options.flowchart and \
    not options.touch_files_only:

    config_file = open(options.config_file, 'r')
    config = yaml.safe_load(config_file)

    pipeline1a = make_sipp(org_list=config['org_list'], config=config)
    cmdline.run(options, logger=logger)
    sys.exit()
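
The (logger, logger_mutex) pair returned by cmdline.setup_logging() is designed to be shared across concurrent Ruffus jobs: tasks take the mutex before writing. A short sketch of the usual pattern inside a task (the task itself is hypothetical):

from ruffus import transform, suffix

@transform('input.txt', suffix('.txt'), '.out')  # hypothetical task
def process(input_file, output_file):
    open(output_file, 'w').close()
    with logger_mutex:
        # the mutex serialises log writes from parallel jobs
        logger.info('processed {0}'.format(input_file))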
Example #24
File: main.py Project: silasxue/OCRmyPDF
def run_pipeline():
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        # Ruffus flattens the exception to a string, throwing away all kinds
        # of helpful details
        # task_name, job_name - ruffus status
        # exc_name - class name of exception
        # exc_value - irritating string that makes impossible to recover
        #   exception object
        # exc_stack - string that contains traceback of exception
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                exit_code = getattr(ExitCode, exit_code_name, ExitCode.other_error)
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                _log.error(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    _log.error("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file
            elif exc_name == 'subprocess.CalledProcessError':
                # It's up to the subprocess handler to report something useful
                msg = "Error occurred while running this command:"
                _log.error(msg + '\n' + exc_value)
                return ExitCode.child_process_error
            elif not options.verbose:
                _log.error(e)

        return ExitCode.other_error
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e',
                     180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(
                    n + 1,
                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok

Example #25
@follows(merge_hf_vcf)
@transform((filtrate_low_qual, select_snp_variants, select_indel_variants, hardfilter_indel_variants,
            hardfilter_snp_variants),
           suffix(vcf_ext), vcf_ext+zeroed_ext)
def remove_intermediate_vcfs(in_vcf, out):
    zero_file(in_vcf)
    os.remove(in_vcf+'.idx')
    open(out, 'w').close()

@follows(merge_hf_vcf)
@transform(realign_indel, suffix(realignedbam_ext), realignedbam_ext+zeroed_ext)
def remove_realigned_bam(in_fn, out_fn):
    zero_file(in_fn)
    os.remove(in_fn[:-1]+'i')
    open(out_fn, 'w').close()

@follows(merge_hf_vcf)
@transform(get_recal_group, suffix(recal_ext), recal_ext+zeroed_ext)
def remove_read_group_file(in_fn, out_fn):
    zero_file(in_fn[0])
    open(out_fn, 'w').close()

options.history_file = '.gatk_exome_pipeline.ruffus_history.sqlite'

cmdline.run(options, gnu_make_maximal_rebuild_mode=True, checksum_level=1, touch_files_only=True)
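
For reference, checksum_level=1 corresponds to ruffus's CHECKSUM_HISTORY_TIMESTAMPS (example #21 passes the named constant instead of the bare integer). The levels defined in ruffus.ruffus_utility are:

# 0  CHECKSUM_FILE_TIMESTAMPS       rerun decisions from file timestamps only
# 1  CHECKSUM_HISTORY_TIMESTAMPS    also consult the job history (sqlite) file
# 2  CHECKSUM_FUNCTIONS             additionally checksum task function bodies
# 3  CHECKSUM_FUNCTIONS_AND_PARAMS  additionally checksum task parameters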



Example #26
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1
    print("Inside of options is: " + options)

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    preamble(_log)
    check_options(options, _log)

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file,
                                options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #27
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    _log.debug('qpdf ' + qpdf.version())

    check_options(options, _log)

    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(
                "Output file location (" + options.output_file + ") " +
                "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #28
# parser.add_argument('--pipeline', "-p", 
# 					type=str, 
# 					choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'],
#                     help="Defining which pipeline to run")

parser.add_argument('--config_file', "-cf", 
					type=str,
					#metavar="config_file",
					help="yaml file with pipeline parameters")

options = parser.parse_args()



## standard python logger which can be synchronised across concurrent Ruffus tasks
## define logging output with --log_file  log_file_name
logger, logger_mutex = cmdline.setup_logging(__name__, options.log_file, options.verbose)


# only run the pipeline if we are not just printing / drawing a flowchart
if not options.just_print and \
    not options.flowchart and \
    not options.touch_files_only:

    config_file = open(options.config_file, 'r')
    config = yaml.safe_load(config_file)

    pipeline1a = make_sipp(org_list=config['org_list'], config=config)
    cmdline.run(options, logger=logger)
    sys.exit()
Example #29
                continue
            genome = line.rstrip().split("\t")[2]
            scores[genome] += 1

    sorted_scores = sorted(scores.items(), reverse=True,
                           key=operator.itemgetter(1))
    file_root = sam_file.replace(".sorted.sam", "")
    
    fastq_file = file_root + ".fastq"
    bam_file = file_root + ".bam"
    sbam_file = file_root + ".sorted.bam"

    lines = wcl(fastq_file)
    num_lines = int(lines.split()[0])
    num_reads = num_lines // 4

    with open(data_file, "w+") as fh:
        fh.write("### DATA REPORT FOR {} ###\n".format())
        fh.write("fastq lines: \n{}\n".format(lines))
        fh.write("reads: {}\n".format(str(num_reads)))
        fh.write("\n### Genome Hit Data ###")
        for genome,score in sorted_scores:
            fh.write("{}\t{}\n".format(genome, str(score)))

    os.unlink(fastq_file)
    os.unlink(bam_file)
    os.unlink(sbam_file)

# run the pipeline
cmdline.run(options)
Example #30
def run_pipeline():
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    global options
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.output_type == 'pdfa':
        pdfa_info = file_claims_pdfa(options.output_file)
        if pdfa_info['pass']:
            msg = 'Output file is a {} (as expected)'
            _log.info(msg.format(pdfa_info['conformance']))
        else:
            msg = 'Output file was generated but is not PDF/A (seems to be {})'
            _log.warning(msg.format(pdfa_info['conformance']))

            return ExitCode.invalid_output_pdf

    if not qpdf.check(options.output_file, _log):
        _log.warning('Output file: The generated PDF is INVALID')
        return ExitCode.invalid_output_pdf

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(n + 1,
                                                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
Example #31
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    _log.debug('qpdf ' + qpdf.version())

    check_options(options, _log)

    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(
                    textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error("Output file location (" + options.output_file + ") " +
                       "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #32
def f_alogFamily(inputFiles, outputFiles):
    touch(outputFiles)

#---------------------------------------------------------------
# homeobox figure
#
@merge(homeobox_R, "ruffus/figure.f_hb")
def f_hb(inputFiles, outputFiles):
    touch(outputFiles)

#---------------------------------------------------------------
# data s1
#
@merge([calculateTpm_R, downloadGenomes_sh], "ruffus/datas1")
def datas1_R(inputFiles, outputFiles):
    jobScript = 'src/R/datas1.R'
    ntasks = '1'
    cpus_per_task = '1'
    job_name = 'datas1_R'
    jobId = submit_job(jobScript, ntasks, cpus_per_task, job_name)
    # update ruffus flag
    print("[", print_now(), ": Job " + job_name + " run with JobID " + jobId + " ]")
    touch(outputFiles)

# options for visualising
pipeline_printout()
pipeline_printout_graph("ruffus/flowchart." + slurm_jobid + ".pdf", "pdf")

# run the pipeline (disabled for now)
cmdline.run(options, multithread=8)
Example #33
@follows(merge_hf_vcf)
@transform((filtrate_low_qual, select_snp_variants, select_indel_variants,
            hardfilter_indel_variants, hardfilter_snp_variants),
           suffix(vcf_ext), vcf_ext + zeroed_ext)
def remove_intermediate_vcfs(in_vcf, out):
    zero_file(in_vcf)
    os.remove(in_vcf + '.idx')
    open(out, 'w').close()


@follows(merge_hf_vcf)
@transform(realign_indel, suffix(realignedbam_ext),
           realignedbam_ext + zeroed_ext)
def remove_realigned_bam(in_fn, out_fn):
    zero_file(in_fn)
    os.remove(in_fn[:-1] + 'i')
    open(out_fn, 'w').close()


@follows(merge_hf_vcf)
@transform(get_recal_group, suffix(recal_ext), recal_ext + zeroed_ext)
def remove_read_group_file(in_fn, out_fn):
    zero_file(in_fn[0])
    open(out_fn, 'w').close()


options.history_file = '.gatk_exome_pipeline.ruffus_history.sqlite'

cmdline.run(options,
            gnu_make_maximal_rebuild_mode=True,
            checksum_level=1,
            touch_files_only=True)