Example #1
def main():
    global sc
    global rgc
    global _LOGGER
    parser = build_parser()
    parser = logmuse.add_logging_options(parser)
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        print("No subcommand given")
        sys.exit(1)

    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.info("Welcome to the SeqCol API app")

    # demo_filepath="/home/nsheff/code/seqcolapi/seqcolapi/seqcolapi_config_demo.yaml"
    scc = SeqColConf(filepath=args.config)
    _LOGGER.info(f"Connecting to database... {scc.database.host}")
    pgdb = RDBDict(scc.database.name, scc.database.user, scc.database.password,
                   scc.database.host, scc.database.port)

    rgc = refget.RefGetClient(scc.refget_provider_apis, pgdb)

    sc = SeqColClient(database=pgdb,
                      api_url_base=scc.refget_provider_apis,
                      schemas=scc.schemas)
    seqcolapi_port = args.port if args.port else scc.server.port
    _LOGGER.info("Running on port {}".format(seqcolapi_port))
    uvicorn.run(app, host=scc.server.host, port=seqcolapi_port)
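
These examples share the same wiring: logmuse.add_logging_options() attaches the standard verbosity/silence/logdev flags to an argparse parser, and logmuse.logger_via_cli() then builds a configured logger from the parsed namespace. A minimal, self-contained sketch of that shared pattern (the --input option is purely illustrative):

import argparse

import logmuse


def build_parser():
    # Hypothetical application parser; logmuse adds its own logging flags later.
    parser = argparse.ArgumentParser(description="demo application")
    parser.add_argument("--input", help="path to an input file")
    return parser


def main():
    parser = logmuse.add_logging_options(build_parser())
    args = parser.parse_args()
    # make_root=True configures the root logger, so messages from libraries
    # propagate through the same handlers as the application's own logger.
    logger = logmuse.logger_via_cli(args, make_root=True)
    logger.info("Input: %s", args.input)


if __name__ == "__main__":
    main()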
Example #2
def test_silence(parser, cmdl, flag, hdlr_type):
    """ Log silencing generates a null handler. """
    opts = parser.parse_args(cmdl)
    assert getattr(opts, SILENCE_LOGS_OPTNAME.lstrip("-")) is flag
    logger = logger_via_cli(opts)
    hs = logger.handlers
    assert 1 == len(hs)
    assert isinstance(hs[0], hdlr_type)
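
The parser fixture and parameters are not shown in this snippet. A hedged sketch of how cmdl, flag, and hdlr_type might be supplied, assuming the silence option is the --silent flag added by add_logging_options and that silencing swaps the default logging.StreamHandler for a logging.NullHandler:

import logging

import pytest

# Hypothetical parametrization; the real option string comes from
# SILENCE_LOGS_OPTNAME, and `parser` is assumed to be a fixture that has
# already had logmuse.add_logging_options() applied.
@pytest.mark.parametrize(
    "cmdl,flag,hdlr_type",
    [(["--silent"], True, logging.NullHandler),
     ([], False, logging.StreamHandler)],
)
def test_silence_sketch(parser, cmdl, flag, hdlr_type):
    ...  # same body as Example #2 above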
Example #3
def main():
    """ Primary workflow """
    from inspect import getdoc
    parser = logmuse.add_logging_options(
        build_argparser(getdoc(PipestatManager)))
    args = parser.parse_args()
    if args.command is None:
        parser.print_help(sys.stderr)
        sys.exit(1)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug("Args namespace:\n{}".format(args))
    if args.database_config and not args.schema:
        parser.error("the following arguments are required: -s/--schema")
    psm = PipestatManager(
        name=args.namespace,
        schema_path=args.schema,
        results_file=args.results_file,
        database_config=args.database_config
    )
    if args.command == REPORT_CMD:
        value = args.value
        result_metadata = psm.schema[args.result_identifier]
        if result_metadata[SCHEMA_TYPE_KEY] in ["object", "image", "file"] \
                and os.path.exists(expandpath(value)):
            from json import load
            _LOGGER.info(f"Reading JSON file with object type value: "
                         f"{expandpath(value)}")
            with open(expandpath(value), "r") as json_file:
                value = load(json_file)
        psm.report(
            result_identifier=args.result_identifier,
            record_identifier=args.record_identifier,
            value=value,
            force_overwrite=args.overwrite,
            strict_type=not args.try_convert
        )
        sys.exit(0)
    if args.command == INSPECT_CMD:
        print("\n")
        print(psm)
        if args.data:
            print("\nData:")
            print(psm.data)
        sys.exit(0)
    if args.command == REMOVE_CMD:
        psm.remove(
            result_identifier=args.result_identifier,
            record_identifier=args.record_identifier
        )
        sys.exit(0)
    if args.command == RETRIEVE_CMD:
        print(psm.retrieve(
            result_identifier=args.result_identifier,
            record_identifier=args.record_identifier
        ))
        sys.exit(0)
Example #4
def main():
    """ Primary workflow """

    parser = logmuse.add_logging_options(build_argparser())
    args = parser.parse_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)

    msg = "Input: {input}; Parameter: {parameter}"
    _LOGGER.info(msg.format(input=args.input, parameter=args.parameter))
Example #5
def logger_via_cli(opts, **kwargs):
    """
    Build and initialize logger from CLI specification.

    :param argparse.Namespace opts: parse of command-line interface
    :param kwargs: keyword arguments to pass along to underlying logmuse function
    :return logging.Logger: newly created and configured logger
    """
    from copy import deepcopy
    import logmuse
    kwds = deepcopy(kwargs)
    # By default, don't require the logging options to have been added to the parser.
    kwds.setdefault("strict", False)
    return logmuse.logger_via_cli(opts, **kwds)
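
A short usage sketch for this wrapper: because strict defaults to False, a namespace that never went through add_logging_options still yields a usable logger rather than raising, which is what the comment in the wrapper describes (the parser below is illustrative):

import argparse

# Hypothetical parser with no logmuse logging options attached.
parser = argparse.ArgumentParser()
parser.add_argument("--input")
opts = parser.parse_args(["--input", "data.txt"])

logger = logger_via_cli(opts)  # the wrapper above; strict=False by default
logger.info("Processing %s", opts.input)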
Example #6
def main(cmdl):
    """ Run the script. """

    args = _parse_cmdl(cmdl)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)

    _LOGGER.debug("Creating counter")
    counter = ReadCounter(args.readsfile,
                          cores=args.cores,
                          outfile=args.outfile,
                          action="CountReads",
                          limit=args.limit)
    _LOGGER.debug("Registering files")
    counter.register_files()

    _LOGGER.info("Counting reads: {}".format(args.readsfile))
    good_chromosomes = counter.run()
    _LOGGER.info("Collecting read counts: {}".format(args.outfile))
    counter.combine(good_chromosomes, chrom_sep="\n")
Example #7
def main():
    """ Primary workflow """

    parser = logmuse.add_logging_options(arguments.build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)

    _LOGGER.info("Welcome to bedshift version {}".format(__version__))
    _LOGGER.info("Shifting file: '{}'".format(args.bedfile))

    if not args.bedfile:
        parser.print_help()
        _LOGGER.error("No BED file given")
        sys.exit(1)

    if args.chrom_lengths:
        pass
    elif args.genome:
        try:
            import refgenconf

            rgc = refgenconf.RefGenConf(refgenconf.select_genome_config())
            args.chrom_lengths = rgc.seek(args.genome, "fasta", None,
                                          "chrom_sizes")
        except ModuleNotFoundError:
            _LOGGER.error(
                "You must have package refgenconf installed to use a refgenie genome"
            )
            sys.exit(1)

    msg = arguments.param_msg

    if args.repeat < 1:
        _LOGGER.error("repeats specified is less than 1")
        sys.exit(1)

    if args.outputfile:
        outfile_base = args.outputfile
    else:
        outfile_base = "bedshifted_{}".format(os.path.basename(args.bedfile))

    _LOGGER.info(
        msg.format(
            bedfile=args.bedfile,
            chromsizes=args.chrom_lengths,
            droprate=args.droprate,
            dropfile=args.dropfile,
            addrate=args.addrate,
            addmean=args.addmean,
            addstdev=args.addstdev,
            addfile=args.addfile,
            valid_regions=args.valid_regions,
            shiftrate=args.shiftrate,
            shiftmean=args.shiftmean,
            shiftstdev=args.shiftstdev,
            shiftfile=args.shiftfile,
            cutrate=args.cutrate,
            mergerate=args.mergerate,
            outputfile=outfile_base,
            repeat=args.repeat,
            yaml_config=args.yaml_config,
        ))

    bedshifter = Bedshift(args.bedfile, args.chrom_lengths)
    _LOGGER.info(f"Generating {args.repeat} repetitions...")

    pct_reports = [int(x * args.repeat / 100) for x in [5, 25, 50, 75, 100]]

    for i in range(args.repeat):
        n = bedshifter.all_perturbations(
            args.addrate,
            args.addmean,
            args.addstdev,
            args.addfile,
            args.valid_regions,
            args.shiftrate,
            args.shiftmean,
            args.shiftstdev,
            args.shiftfile,
            args.cutrate,
            args.mergerate,
            args.droprate,
            args.dropfile,
            args.yaml_config,
        )
        if args.repeat == 1:
            bedshifter.to_bed(outfile_base)
            _LOGGER.info(
                "REGION COUNT | original: {}\tnew: {}\tchanged: {}\t\noutput file: {}"
                .format(
                    bedshifter.original_num_regions,
                    bedshifter.bed.shape[0],
                    str(n),
                    outfile_base,
                ))
        else:
            basename, ext = os.path.splitext(os.path.basename(outfile_base))
            dirname = os.path.dirname(outfile_base)
            digits = int(math.log10(args.repeat)) + 1

            rep = str(i + 1).zfill(digits)
            modified_outfile_path = os.path.join(dirname,
                                                 f"{basename}_rep{rep}{ext}")
            bedshifter.to_bed(modified_outfile_path)

            pct_finished = int((100 * (i + 1)) / args.repeat)
            if i + 1 in pct_reports:
                _LOGGER.info(
                    f"Rep {i+1}. Finished: {pct_finished}%. Output file: {modified_outfile_path}"
                )

        bedshifter.reset_bed()
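
In the else branch above, the per-repetition output names are built by zero-padding the repetition index to the width of the total repeat count. A small standalone sketch of that naming scheme, with illustrative values:

import math
import os


def rep_filename(outfile_base, i, repeat):
    # Return the output path for repetition i (0-based) out of `repeat` total.
    basename, ext = os.path.splitext(os.path.basename(outfile_base))
    dirname = os.path.dirname(outfile_base)
    digits = int(math.log10(repeat)) + 1  # width needed for the largest index
    rep = str(i + 1).zfill(digits)
    return os.path.join(dirname, f"{basename}_rep{rep}{ext}")


# e.g. rep_filename("out/bedshifted_test.bed", 4, 100) -> "out/bedshifted_test_rep005.bed"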
Example #8
def main():
    """Primary workflow"""
    from inspect import getdoc

    parser = logmuse.add_logging_options(
        build_argparser(getdoc(PipestatManager)))
    args = parser.parse_args()
    if args.command is None:
        parser.print_help(sys.stderr)
        sys.exit(1)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug("Args namespace:\n{}".format(args))
    if args.config and not args.schema and args.command != STATUS_CMD:
        parser.error("the following arguments are required: -s/--schema")
    psm = PipestatManager(
        namespace=args.namespace,
        schema_path=args.schema,
        results_file_path=args.results_file,
        config=args.config,
        database_only=args.database_only,
        status_schema_path=args.status_schema,
        flag_file_dir=args.flag_dir,
    )
    if args.command == REPORT_CMD:
        value = args.value
        if psm.schema is None:
            raise SchemaNotFoundError(msg="report", cli=True)
        result_metadata = psm.schema[args.result_identifier]
        if (result_metadata[SCHEMA_TYPE_KEY] in [
                "object",
                "image",
                "file",
        ] and os.path.exists(expandpath(value))):
            from json import load

            _LOGGER.info(
                f"Reading JSON file with object type value: {expandpath(value)}"
            )
            with open(expandpath(value), "r") as json_file:
                value = load(json_file)
        psm.report(
            record_identifier=args.record_identifier,
            values={args.result_identifier: value},
            force_overwrite=args.overwrite,
            strict_type=args.skip_convert,
        )
    if args.command == INSPECT_CMD:
        print("\n")
        print(psm)
        if args.data and not args.database_only:
            print("\nData:")
            print(psm.data)
    if args.command == REMOVE_CMD:
        psm.remove(
            result_identifier=args.result_identifier,
            record_identifier=args.record_identifier,
        )
    if args.command == RETRIEVE_CMD:
        print(
            psm.retrieve(
                result_identifier=args.result_identifier,
                record_identifier=args.record_identifier,
            ))
    if args.command == STATUS_CMD:
        if args.subcommand == STATUS_GET_CMD:
            print(psm.get_status(record_identifier=args.record_identifier))
        if args.subcommand == STATUS_SET_CMD:
            psm.set_status(
                status_identifier=args.status_identifier,
                record_identifier=args.record_identifier,
            )
    sys.exit(0)
Example #9
def main():
    """Run the script."""
    cmdl = sys.argv[1:]
    args = _parse_cmdl(cmdl)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)
    delete_sra = False  # initialize to False
    # Name the pipeline run after the first element to convert.
    # Maybe we should just have a separate pipeline for each file?

    if args.sample_name:
        run_name = "_".join(uniqify(args.sample_name))
    else:
        primary_srr_acc = os.path.splitext(os.path.basename(args.srr[0]))[0]
        run_name = primary_srr_acc

    if args.output_parent:
        outfolder = os.path.join(args.output_parent, run_name)
    else:
        outfolder = os.path.join(args.srafolder, "sra_convert_pipeline",
                                 run_name)

    _LOGGER.info("Using outfolder: {}".format(outfolder))
    nfiles = len(args.srr)
    failed_files = []

    pm = pypiper.PipelineManager(name="sra_convert",
                                 outfolder=outfolder,
                                 args=args)

    for i in range(nfiles):
        srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0]
        pm.info("Processing {} of {} files: {}".format(str(i + 1), str(nfiles),
                                                       srr_acc))

        bamfile = os.path.join(args.bamfolder, srr_acc + ".bam")
        fq_prefix = os.path.join(args.fqfolder, srr_acc)

        if args.mode == "convert":
            infile = args.srr[i]
            if not os.path.isfile(infile):
                pm.warning("Couldn't find sra file at: {}.".format(infile))
                failed_files.append(args.srr[i])
            if args.format == "fastq":
                # fastq-dump --split-files will produce *_1.fastq and *_2.fastq
                # for paired-end data, and only *_1.fastq for single-end data.
                outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix)
                cmd = "fastq-dump {data_source} --split-files --gzip -O {outfolder}".format(
                    data_source=infile, outfolder=args.fqfolder, nofail=True)
            elif args.format == "bam":
                outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam")
                cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format(
                    data_source=infile, outfile=outfile, nofail=True)
            else:
                raise KeyError("Unknown format: {}".format(args.format))

            target = outfile
            ret = pm.run(cmd, target=target)
            if ret == 0:
                pm.info("Already completed files: {}".format(failed_files))
                try:
                    failed_files.remove(infile)
                except:
                    pass

        elif args.mode == "delete_bam":
            pm.timestamp("Cleaning bam file: {}".format(bamfile))
            pm.clean_add(bamfile)
        elif args.mode == "delete_fq":
            pm.timestamp("Cleaning fastq file(s): {}*".format(fq_prefix))
            fq_prefix = os.path.join(args.fqfolder, srr_acc)
            pm.clean_add("{fq_prefix}.fastq.gz".format(fq_prefix=fq_prefix))
            pm.clean_add(
                "{fq_prefix}_[0-9].fastq.gz".format(fq_prefix=fq_prefix))
        elif args.mode == "delete_sra":
            delete_sra = True
            # if specifically requested to delete sra files

        if not args.keep_sra and os.path.isfile(outfile):
            # Only delete if the output file was created...
            # we can't trust the sra toolkit return codes because they
            # can return 0 even if the command didn't complete, causing us to
            # delete the sra file when we have no other copy of that data.
            delete_sra = True

        if delete_sra:
            pm.timestamp("Cleaning sra file: {}".format(infile))
            pm.clean_add(infile)

    if len(failed_files) > 0:
        pm.fail_pipeline(
            Exception("Unable to locate the following files: {}".format(
                ",".join(failed_files))))

    pm.stop_pipeline()
Example #10
    parser.add_argument('--retain-temp',
                        action='store_true',
                        default=False,
                        help="Retain temporary files? Default: False")

    parser = logmuse.add_logging_options(parser)
    args = parser.parse_args(cmdl)
    if not (args.exactbw or args.smoothbw):
        parser.error('No output requested, use --exactbw and/or --smoothbw')
    return args


if __name__ == "__main__":

    args = parse_args(sys.argv[1:])
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)

    if args.mode == "dnase":
        shift_factor = {"+": 1, "-": 0}  # DNase
    elif args.mode == "atac":
        shift_factor = {"+": 4, "-": -5}  # ATAC
    else:
        shift_factor = {"+": 0, "-": 0}
    ct = CutTracer(reads_filename=args.infile,
                   chrom_sizes_file=args.chrom_sizes_file,
                   scale=args.scale,
                   variable_step=args.variable_step,
                   exactbw=args.exactbw,
                   smoothbw=args.smoothbw,
                   step_size=args.step_size,
                   bedout=args.bedout,
Example #11
def main():
    """ Primary workflow """

    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)
    logmuse.logger_via_cli(args, name=refgenconf.__name__)

    _LOGGER.debug("Args: {}".format(args))

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    gencfg = yacman.select_config(args.genome_config,
                                  CFG_ENV_VARS,
                                  check_exist=not args.command == INIT_CMD,
                                  on_missing=lambda fp: fp)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    if args.command == INIT_CMD:
        _LOGGER.info("Initializing refgenie genome configuration")
        _writeable(os.path.dirname(gencfg), strict_exists=True)
        refgenie_init(gencfg, args.genome_server)
        sys.exit(0)

    rgc = RefGenConf(gencfg)

    if args.command == BUILD_CMD:
        refgenie_build(rgc, args)

    elif args.command == GET_ASSET_CMD:
        _LOGGER.debug("getting asset: '{}/{}'".format(args.genome, args.asset))
        print(" ".join(
            [rgc.get_asset(args.genome, asset) for asset in args.asset]))
        return

    elif args.command == INSERT_CMD:
        if len(args.asset) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            # recast from list to str
            args.asset = args.asset[0]
        refgenie_add(rgc, args)

    elif args.command == PULL_CMD:
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: "
                          "{}".format(target, outdir))
            return
        rgc.pull_asset(args.genome,
                       args.asset,
                       gencfg,
                       unpack=not args.no_untar)

    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        pfx, genomes, assets = _exec_list(rgc, args.command == LIST_REMOTE_CMD)
        _LOGGER.info("{} genomes: {}".format(pfx, genomes))
        _LOGGER.info("{} assets:\n{}".format(pfx, assets))
Example #12
def main():
    """ Primary workflow """

    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    logger_kwargs = {"level": args.verbosity, "devmode": args.logdev}
    logmuse.init_logger(name="yacman", **logger_kwargs)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)

    _LOGGER.debug("Command given: {}".format(args.command))

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    if args.command == "init":
        bulkercfg = args.config
        _LOGGER.debug("Initializing bulker configuration")
        _is_writable(os.path.dirname(bulkercfg), check_exist=False)
        bulker_init(bulkercfg, DEFAULT_CONFIG_FILEPATH, args.engine)
        sys.exit(0)

    bulkercfg = select_bulker_config(args.config)
    bulker_config = yacman.YacAttMap(filepath=bulkercfg, writable=False)

    if args.command == "list":
        # Output header via logger and content via print so the user can
        # redirect the list from stdout if desired without the header as clutter

        if args.simple:
            fmt = "{namespace}/{crate}:{tag}"
        else:
            _LOGGER.info("Available crates:")
            fmt = "{namespace}/{crate}:{tag} -- {path}"

        if bulker_config.bulker.crates:
            for namespace, crates in bulker_config.bulker.crates.items():
                for crate, tags in crates.items():
                    for tag, path in tags.items():
                        print(
                            fmt.format(namespace=namespace,
                                       crate=crate,
                                       tag=tag,
                                       path=path))
        else:
            _LOGGER.info(
                "No crates available. Use 'bulker load' to load a crate.")
        sys.exit(1)

    # For all remaining commands we need a crate identifier

    _LOGGER.info("Bulker config: {}".format(bulkercfg))
    if args.command == "activate":
        try:
            cratelist = parse_registry_paths(
                args.crate_registry_paths,
                bulker_config.bulker.default_namespace)
            _LOGGER.debug(cratelist)
            _LOGGER.info("Activating bulker crate: {}{}".format(
                args.crate_registry_paths, " (Strict)" if args.strict else ""))
            bulker_activate(bulker_config,
                            cratelist,
                            echo=args.echo,
                            strict=args.strict,
                            prompt=args.no_prompt)
        except KeyError as e:
            parser.print_help(sys.stderr)
            _LOGGER.error("{} is not an available crate".format(e))
            sys.exit(1)
        except MissingCrateError as e:
            _LOGGER.error("Missing crate: {}".format(e))
            sys.exit(1)
        except AttributeError as e:
            _LOGGER.error(
                "Your bulker config file is outdated, you need to re-initialize it: {}"
                .format(e))
            sys.exit(1)

    if args.command == "run":
        try:
            cratelist = parse_registry_paths(args.crate_registry_paths)
            _LOGGER.info("Activating crate: {}\n".format(
                args.crate_registry_paths))
            bulker_run(bulker_config, cratelist, args.cmd, strict=args.strict)
        except KeyError as e:
            parser.print_help(sys.stderr)
            _LOGGER.error("{} is not an available crate".format(e))
            sys.exit(1)
        except MissingCrateError as e:
            _LOGGER.error("Missing crate: {}".format(e))
            sys.exit(1)

    if args.command == "load":
        bulker_config.make_writable()
        manifest, cratevars = load_remote_registry_path(
            bulker_config, args.crate_registry_paths, args.manifest)
        exe_template_jinja = None
        build_template_jinja = None
        shell_template_jinja = None

        exe_template = mkabs(bulker_config.bulker.executable_template,
                             os.path.dirname(bulker_config._file_path))
        build_template = mkabs(bulker_config.bulker.build_template,
                               os.path.dirname(bulker_config._file_path))
        try:
            shell_template = mkabs(bulker_config.bulker.shell_template,
                                   os.path.dirname(bulker_config._file_path))
        except AttributeError:
            _LOGGER.error(
                "You need to re-initialize your bulker config or add a 'shell_template' attribute."
            )
            sys.exit(1)

        try:
            assert (os.path.exists(exe_template))
        except AssertionError:
            _LOGGER.error(
                "Bulker config points to a missing executable template: {}".
                format(exe_template))
            sys.exit(1)

        with open(exe_template, 'r') as f:
            # with open(DOCKER_TEMPLATE, 'r') as f:
            contents = f.read()
            exe_template_jinja = jinja2.Template(contents)

        try:
            assert (os.path.exists(shell_template))
        except AssertionError:
            _LOGGER.error(
                "Bulker config points to a missing shell template: {}".format(
                    shell_template))
            sys.exit(1)

        with open(shell_template, 'r') as f:
            # with open(DOCKER_TEMPLATE, 'r') as f:
            contents = f.read()
            shell_template_jinja = jinja2.Template(contents)

        if args.build:
            try:
                assert (os.path.exists(build_template))
            except AssertionError:
                _LOGGER.error(
                    "Bulker config points to a missing build template: {}".
                    format(build_template))
                sys.exit(1)

            _LOGGER.info(
                "Building images with template: {}".format(build_template))
            with open(build_template, 'r') as f:
                contents = f.read()
                build_template_jinja = jinja2.Template(contents)

        bulker_load(manifest,
                    cratevars,
                    bulker_config,
                    exe_jinja2_template=exe_template_jinja,
                    shell_jinja2_template=shell_template_jinja,
                    crate_path=args.path,
                    build=build_template_jinja,
                    force=args.force)

    if args.command == "inspect":
        if args.crate_registry_paths == "":
            _LOGGER.error(
                "No active crate. Inspect requires a provided crate, or a currently active crate."
            )
            sys.exit(1)
        manifest, cratevars = load_remote_registry_path(
            bulker_config, args.crate_registry_paths, None)
        manifest_name = cratevars['crate']

        print("Bulker manifest: {}".format(args.crate_registry_paths))
        crate_path = os.path.join(bulker_config.bulker.default_crate_folder,
                                  cratevars['namespace'], manifest_name,
                                  cratevars['tag'])
        if not os.path.isabs(crate_path):
            crate_path = os.path.join(
                os.path.dirname(bulker_config._file_path), crate_path)
        print("Crate path: {}".format(crate_path))
        import glob
        filenames = glob.glob(os.path.join(crate_path, "*"))
        available_commands = [
            x for x in [os.path.basename(x) for x in filenames] if x[0] != "_"
        ]
        print("Available commands: {}".format(available_commands))
Example #13
def main():
    """ Primary workflow """
    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}")
    _LOGGER.debug(f"Args: {args}")

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    if args.command == ALIAS_CMD and not args.subcommand:
        parser.print_help()
        _LOGGER.error("No alias subcommand command given")
        sys.exit(1)

    gencfg = select_genome_config(
        filename=args.genome_config,
        check_exist=not args.command == INIT_CMD,
        on_missing=lambda fp: fp,
        strict_env=True,
    )
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    skip_read_lock = _skip_lock(args.skip_read_lock, gencfg)

    # From user input we want to construct a list of asset dicts, where each
    # asset has a genome name, asset name, and tag
    if "asset_registry_paths" in args and args.asset_registry_paths:
        _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths))
        asset_list = [parse_registry_path(x) for x in args.asset_registry_paths]

        for a in asset_list:
            # every asset must have a genome, either provided via registry path
            # or the args.genome arg.
            if not a["genome"]:
                if args.genome:
                    a["genome"] = args.genome
                else:
                    _LOGGER.error(
                        "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format(
                            a["genome"], a["asset"], a["tag"]
                        )
                    )
                    sys.exit(1)
            else:
                if args.genome and args.genome != a["genome"]:
                    _LOGGER.warn(
                        "Two different genomes specified for asset '{}'.".format(
                            a["asset"]
                        )
                    )

    else:
        if args.command in GENOME_ONLY_REQUIRED and not args.genome:
            parser.error("You must provide either a genome or a registry path")
            sys.exit(1)
        if args.command in ASSET_REQUIRED:
            parser.error("You must provide an asset registry path")
            sys.exit(1)

    if args.command == INIT_CMD:
        _LOGGER.debug("Initializing refgenie genome configuration")
        entries = OrderedDict(
            {
                CFG_VERSION_KEY: REQ_CFG_VERSION,
                CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
                CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
                CFG_GENOMES_KEY: None,
            }
        )
        if args.settings_json:
            if os.path.isfile(args.settings_json):
                with open(args.settings_json, "r") as json_file:
                    data = json.load(json_file)
                entries.update(data)
            else:
                raise FileNotFoundError(
                    "JSON file with config init settings does not exist: {}".format(
                        args.settings_json
                    )
                )
        if args.genome_folder:
            entries.update({CFG_FOLDER_KEY: args.genome_folder})
        if args.remote_url_base:
            entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base})
        if args.genome_archive_folder:
            entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder})
        if args.genome_archive_config:
            entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config})
        _LOGGER.debug("initializing with entries: {}".format(entries))
        rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock)
        rgc.initialize_config_file(os.path.abspath(gencfg))

    elif args.command == BUILD_CMD:
        if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]):
            _LOGGER.error("Build can only build assets for one genome")
            sys.exit(1)
        recipe_name = None
        if args.recipe:
            if len(asset_list) > 1:
                _LOGGER.error("Recipes cannot be specified for multi-asset builds")
                sys.exit(1)
            recipe_name = args.recipe
        if args.requirements:
            for a in asset_list:
                recipe = recipe_name or a["asset"]
                if recipe not in asset_build_packages.keys():
                    _raise_missing_recipe_error(recipe)
                _LOGGER.info("'{}' recipe requirements: ".format(recipe))
                _make_asset_build_reqs(recipe)
            sys.exit(0)
        refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args)

    elif args.command == GET_ASSET_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        check = args.check_exists if args.check_exists else None
        for a in asset_list:
            _LOGGER.debug(
                "getting asset: '{}/{}.{}:{}'".format(
                    a["genome"], a["asset"], a["seek_key"], a["tag"]
                )
            )
            print(
                rgc.seek(
                    a["genome"],
                    a["asset"],
                    a["tag"],
                    a["seek_key"],
                    strict_exists=check,
                )
            )
        return

    elif args.command == INSERT_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)

        if len(asset_list) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            sk = args.seek_keys
            if sk:
                sk = json.loads(args.seek_keys)
            rgc.add(
                path=args.path,
                genome=asset_list[0]["genome"],
                asset=asset_list[0]["asset"],
                tag=asset_list[0]["tag"],
                seek_keys=sk,
                force=args.force,
            )

    elif args.command == PULL_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)

        # existing assets overwriting
        if args.no_overwrite:
            force = False
        elif args.force_overwrite:
            force = True
        else:
            force = None
        # large archive pulling
        if args.no_large:
            force_large = False
        elif args.pull_large:
            force_large = True
        else:
            force_large = None
        # batch mode takes precedence over other choices
        if args.batch:
            force_large = True
            force = False

        outdir = rgc.data_dir
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        if not perm_check_x(outdir):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to: {}".format(outdir))
            return

        for a in asset_list:
            rgc.pull(
                a["genome"],
                a["asset"],
                a["tag"],
                force=force,
                force_large=force_large,
                size_cutoff=args.size_cutoff,
            )

    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        console = Console()
        if args.command == LIST_REMOTE_CMD:
            num_servers = 0
            bad_servers = []
            for server_url in rgc[CFG_SERVERS_KEY]:
                num_servers += 1
                try:
                    table = rgc.get_asset_table(
                        genomes=args.genome, server_url=server_url
                    )
                except (DownloadJsonError, ConnectionError, MissingSchema):
                    bad_servers.append(server_url)
                    continue
                else:
                    console.print(table)
            if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers:
                _LOGGER.error(
                    "Could not list assets from the following servers: {}".format(
                        bad_servers
                    )
                )
        else:
            if args.recipes:
                print(", ".join(sorted(list(asset_build_packages.keys()))))
            else:
                console.print(rgc.get_asset_table(genomes=args.genome))

    elif args.command == GETSEQ_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        print(rgc.getseq(args.genome, args.locus))

    elif args.command == REMOVE_CMD:
        force = args.force
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        for a in asset_list:
            a["tag"] = a["tag"] or rgc.get_default_tag(
                a["genome"], a["asset"], use_existing=False
            )
            _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
            if a["seek_key"] is not None:
                raise NotImplementedError("You can't remove a specific seek_key.")
            gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]}
            try:
                if not rgc.is_asset_complete(**gat):
                    with rgc as r:
                        r.cfg_remove_assets(**gat)
                    _LOGGER.info(
                        "Removed an incomplete asset "
                        "'{genome}/{asset}:{tag}'".format(**gat)
                    )
                    return
            except (KeyError, MissingAssetError, MissingGenomeError):
                _LOGGER.info(
                    "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat)
                )
                return
        if len(asset_list) > 1:
            if not query_yes_no(
                "Are you sure you want to remove {} assets?".format(len(asset_list))
            ):
                _LOGGER.info("Action aborted by the user")
                return
            force = True
        for a in asset_list:
            rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force)

    elif args.command == TAG_CMD:
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only tag 1 asset at a time")
        if args.default:
            # set the default tag and exit
            with rgc as r:
                r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
            sys.exit(0)
        rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force)

    elif args.command == ID_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        if len(asset_list) == 1:
            g, a = asset_list[0]["genome"], asset_list[0]["asset"]
            t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
            print(rgc.id(g, a, t))
            return
        for asset in asset_list:
            g, a = asset["genome"], asset["asset"]
            t = asset["tag"] or rgc.get_default_tag(g, a)
            print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
        return
    elif args.command == SUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        rgc.subscribe(urls=args.genome_server, reset=args.reset)
        return
    elif args.command == UNSUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        rgc.unsubscribe(urls=args.genome_server)
        return
    elif args.command == ALIAS_CMD:
        rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
        if args.subcommand == ALIAS_GET_CMD:
            if args.aliases is not None:
                for a in args.aliases:
                    print(rgc.get_genome_alias_digest(alias=a))
                return
            console = Console()
            console.print(rgc.genome_aliases_table)

        if args.subcommand == ALIAS_SET_CMD:
            rgc.set_genome_alias(
                digest=args.digest,
                genome=args.aliases,
                reset_digest=args.reset,
                create_genome=args.force,
            )
            return
        elif args.subcommand == ALIAS_REMOVE_CMD:
            rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases)
            return

    elif args.command == COMPARE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
        res = rgc.compare(
            args.genome1[0], args.genome2[0], explain=not args.no_explanation
        )
        if args.no_explanation:
            print(res)

    elif args.command == UPGRADE_CMD:
        upgrade_config(
            target_version=args.target_version, filepath=gencfg, force=args.force
        )
Example #14
def create_logger():
    return logger_via_cli(opts, strict=strict)
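
This fragment refers to opts and strict from an enclosing scope that is not shown. A hedged guess at what that context could look like (the factory name and signature are hypothetical):

def make_logger_factory(opts, strict=False):
    # Hypothetical enclosing scope that binds `opts` and `strict`.
    def create_logger():
        return logger_via_cli(opts, strict=strict)

    return create_logger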
Example #15
def main():
    """ Primary workflow """
    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug("refgenie {}".format(__version__))
    _LOGGER.debug("Args: {}".format(args))

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    gencfg = refgenconf.select_genome_config(
        filename=args.genome_config,
        check_exist=not args.command == INIT_CMD,
        on_missing=lambda fp: fp,
        strict_env=True)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    # From user input we want to construct a list of asset dicts, where each
    # asset has a genome name, asset name, and tag

    if "asset_registry_paths" in args and args.asset_registry_paths:
        _LOGGER.debug("Found registry_path: {}".format(
            args.asset_registry_paths))
        asset_list = [
            parse_registry_path(x) for x in args.asset_registry_paths
        ]

        for a in asset_list:
            # every asset must have a genome, either provided via registry path
            # or the args.genome arg.
            if not a["genome"]:
                if args.genome:
                    a["genome"] = args.genome
                else:
                    _LOGGER.error(
                        "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference."
                        .format(a["genome"], a["asset"], a["tag"]))
                    sys.exit(1)
            else:
                if args.genome and args.genome != a["genome"]:
                    _LOGGER.warn(
                        "Two different genomes specified for asset '{}'.".
                        format(a["asset"]))

    else:
        if args.command in GENOME_ONLY_REQUIRED and not args.genome:
            parser.error("You must provide either a genome or a registry path")
            sys.exit(1)
        if args.command in ASSET_REQUIRED:
            parser.error("You must provide an asset registry path")
            sys.exit(1)

    if args.command == INIT_CMD:
        _LOGGER.debug("Initializing refgenie genome configuration")
        rgc = RefGenConf(entries=OrderedDict(
            {
                CFG_VERSION_KEY: REQ_CFG_VERSION,
                CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
                CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
                CFG_GENOMES_KEY: None
            }))
        rgc.initialize_config_file(os.path.abspath(gencfg))

    elif args.command == BUILD_CMD:
        if not all(
            [x["genome"] == asset_list[0]["genome"] for x in asset_list]):
            _LOGGER.error("Build can only build assets for one genome")
            sys.exit(1)
        recipe_name = None
        if args.recipe:
            if len(asset_list) > 1:
                _LOGGER.error(
                    "Recipes cannot be specified for multi-asset builds")
                sys.exit(1)
            recipe_name = args.recipe
        if args.requirements:
            for a in asset_list:
                recipe = recipe_name or a["asset"]
                if recipe not in asset_build_packages.keys():
                    _raise_missing_recipe_error(recipe)
                _LOGGER.info("'{}' recipe requirements: ".format(recipe))
                _make_asset_build_reqs(recipe)
            sys.exit(0)
        refgenie_build(gencfg, asset_list[0]["genome"], asset_list,
                       recipe_name, args)

    elif args.command == GET_ASSET_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        check = args.check_exists if args.check_exists else None
        for a in asset_list:
            _LOGGER.debug("getting asset: '{}/{}.{}:{}'".format(
                a["genome"], a["asset"], a["seek_key"], a["tag"]))
            print(
                rgc.seek(a["genome"],
                         a["asset"],
                         a["tag"],
                         a["seek_key"],
                         strict_exists=check))
        return

    elif args.command == INSERT_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            refgenie_add(rgc, asset_list[0], args.path, args.force)

    elif args.command == PULL_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        force = None if not args.force else True
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: {}".format(
                target, outdir))
            return

        for a in asset_list:
            rgc.pull(a["genome"],
                     a["asset"],
                     a["tag"],
                     unpack=not args.no_untar,
                     force=force)

    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if args.command == LIST_REMOTE_CMD:
            num_servers = 0
            # Keep all servers so that child updates maintain server list
            server_list = rgc[CFG_SERVERS_KEY]
            bad_servers = []
            for server_url in rgc[CFG_SERVERS_KEY]:
                num_servers += 1
                try:
                    rgc[CFG_SERVERS_KEY] = server_url
                    pfx, genomes, assets, recipes = _exec_list(
                        rgc, args.command == LIST_REMOTE_CMD, args.genome)
                    if assets is None and genomes is None:
                        continue
                    _LOGGER.info("{} genomes: {}".format(pfx, genomes))
                    if args.command != LIST_REMOTE_CMD:  # Not implemented yet
                        _LOGGER.info("{} recipes: {}".format(pfx, recipes))
                    _LOGGER.info("{} assets:\n{}\n".format(pfx, assets))
                except (DownloadJsonError, ConnectionError):
                    bad_servers.append(server_url)
                    continue
            if num_servers >= len(server_list) and bad_servers:
                _LOGGER.error(
                    "Could not list assets from the following server(s): {}".
                    format(bad_servers))
            # Restore original server list, even when we couldn't find assets on a server
            rgc[CFG_SERVERS_KEY] = server_list
        else:  # Only check local assets once
            _LOGGER.info("Server subscriptions: {}".format(", ".join(
                rgc[CFG_SERVERS_KEY])))
            pfx, genomes, assets, recipes = _exec_list(
                rgc, args.command == LIST_REMOTE_CMD, args.genome)
            _LOGGER.info("{} genomes: {}".format(pfx, genomes))
            if args.command != LIST_REMOTE_CMD:  # Not implemented yet
                _LOGGER.info("{} recipes: {}".format(pfx, recipes))
            _LOGGER.info("{} assets:\n{}".format(pfx, assets))

    elif args.command == GETSEQ_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.getseq(args.genome, args.locus)

    elif args.command == REMOVE_CMD:
        force = args.force
        rgc = RefGenConf(filepath=gencfg)
        for a in asset_list:
            a["tag"] = a["tag"] or rgc.get_default_tag(
                a["genome"], a["asset"], use_existing=False)
            _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
            if a["seek_key"] is not None:
                raise NotImplementedError(
                    "You can't remove a specific seek_key.")
            bundle = [a["genome"], a["asset"], a["tag"]]
            try:
                if not rgc.is_asset_complete(*bundle):
                    with rgc as r:
                        r.cfg_remove_assets(*bundle)
                    _LOGGER.info(
                        "Removed an incomplete asset '{}/{}:{}'".format(
                            *bundle))
                    return
            except (KeyError, MissingAssetError, MissingGenomeError):
                _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle))
                return
        if len(asset_list) > 1:
            if not query_yes_no(
                    "Are you sure you want to remove {} assets?".format(
                        len(asset_list))):
                _LOGGER.info("Action aborted by the user")
                return
            force = True
        for a in asset_list:
            rgc.remove(genome=a["genome"],
                       asset=a["asset"],
                       tag=a["tag"],
                       force=force)

    elif args.command == TAG_CMD:
        rgc = RefGenConf(filepath=gencfg)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only tag 1 asset at a time")
        if args.default:
            # set the default tag and exit
            with rgc as r:
                r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
            sys.exit(0)
        rgc.tag(a["genome"], a["asset"], a["tag"], args.tag)

    elif args.command == ID_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) == 1:
            g, a = asset_list[0]["genome"], asset_list[0]["asset"]
            t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
            print(rgc.id(g, a, t))
            return
        for asset in asset_list:
            g, a = asset["genome"], asset["asset"]
            t = asset["tag"] or rgc.get_default_tag(g, a)
            print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
        return
    elif args.command == SUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.subscribe(urls=args.genome_server, reset=args.reset)
        return
    elif args.command == UNSUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.unsubscribe(urls=args.genome_server)
        return
Example #16
def test_opts_added_none_used(parser):
    """ Addition of logging options allows logger_via_cli to complete. """
    opts = parser.parse_args([])
    assert all(hasattr(opts, _rawopt(n)) for n in LOGGING_CLI_OPTDATA)
    logger = logger_via_cli(opts)
    assert isinstance(logger, logging.Logger)
Example #17
def run_geofetch(cmdl):
    """ Main script driver/workflow """

    args = _parse_cmdl(cmdl)
    global _LOGGER
    _LOGGER = logger_via_cli(args, name="geofetch")

    if args.name:
        project_name = args.name
    else:
        project_name = os.path.splitext(os.path.basename(args.input))[0]

    def render_env_var(ev):
        return "{} ({})".format(ev, expandpath(ev))

    metadata_expanded = expandpath(args.metadata_folder)
    _LOGGER.info("Given metadata folder: {} ({})".format(
        args.metadata_folder, metadata_expanded))
    if os.path.isabs(metadata_expanded):
        metadata_raw = args.metadata_folder
    else:
        metadata_expanded = os.path.abspath(metadata_expanded)
        metadata_raw = os.path.abspath(args.metadata_folder)

    _LOGGER.info("Initial raw metadata folder: {}".format(
        render_env_var(metadata_raw)))
    if not args.no_subfolder:
        metadata_expanded = os.path.join(metadata_expanded, project_name)
        metadata_raw = os.path.join(metadata_raw, project_name)
    _LOGGER.info("Final raw metadata folder: {}".format(
        render_env_var(metadata_raw)))

    # Some sanity checks before proceeding
    if args.bam_folder and not which("samtools"):
        raise SystemExit("For SAM/BAM processing, samtools should be on PATH.")

    acc_GSE_list = parse_accessions(args.input, metadata_expanded,
                                    args.just_metadata)

    # Loop through each accession.
    # This will process that accession, produce metadata and download files for
    # the GSM #s included in the list for each GSE#.
    # acc_GSE = "GSE61150" # example

    # This loop populates a list of metadata.
    metadata_dict = OrderedDict()
    subannotation_dict = OrderedDict()
    failed_runs = []

    for acc_GSE in acc_GSE_list.keys():
        _LOGGER.info("Processing accession: " + acc_GSE)
        if len(re.findall(GSE_PATTERN, acc_GSE)) != 1:
            print(len(re.findall(GSE_PATTERN, acc_GSE)))
            _LOGGER.warning("This does not appear to be a correctly formatted "
                            "GSE accession! Continue anyway...")

        # Get GSM#s (away from sample_name)
        GSM_limit_list = list(acc_GSE_list[acc_GSE].keys()
                              )  #[x[1] for x in acc_GSE_list[acc_GSE]]

        if (len(acc_GSE_list[acc_GSE]) > 0):
            _LOGGER.info("Limit to: {}".format(list(
                acc_GSE_list[acc_GSE])))  # a list of GSM#s

        if args.refresh_metadata:
            _LOGGER.info("Refreshing metadata...")
        # For each GSE acc, produce a series of metadata files
        file_gse = os.path.join(metadata_expanded, acc_GSE + '_GSE.soft')
        file_gsm = os.path.join(metadata_expanded, acc_GSE + '_GSM.soft')
        file_sra = os.path.join(metadata_expanded, acc_GSE + '_SRA.csv')
        file_srafilt = os.path.join(metadata_expanded,
                                    acc_GSE + '_SRA_filt.csv')

        # Grab the GSE and GSM SOFT files from GEO.
        # The GSE file has metadata describing the experiment, which includes
        # The SRA number we need to download the raw data from SRA
        # The GSM file has metadata describing each sample, which we will use to
        # produce a sample annotation sheet.
        if not os.path.isfile(file_gse) or args.refresh_metadata:
            Accession(acc_GSE).fetch_metadata(file_gse)
        else:
            _LOGGER.info("Found previous GSE file: " + file_gse)

        if not os.path.isfile(file_gsm) or args.refresh_metadata:
            Accession(acc_GSE).fetch_metadata(file_gsm, typename="GSM")
        else:
            _LOGGER.info("Found previous GSM file: " + file_gsm)

        # A simple state machine to parse SOFT formatted files (Here, the GSM file)
        gsm_metadata = OrderedDict()
        # For multi samples (samples with multiple runs), we keep track of these
        # relations in a separate table, which is called the subannotation table.
        gsm_multi_table = OrderedDict()
        # save the state
        current_sample_id = None
        current_sample_srx = False
        for line in open(file_gsm, 'r'):
            line = line.rstrip()
            if line[0] == "^":
                pl = parse_SOFT_line(line)
                if len(acc_GSE_list[acc_GSE]
                       ) > 0 and pl['SAMPLE'] not in GSM_limit_list:
                    #sys.stdout.write("  Skipping " + a['SAMPLE'] + ".")
                    current_sample_id = None
                    continue
                current_sample_id = pl['SAMPLE']
                current_sample_srx = False
                columns_init = [("sample_name", ""), ("protocol", ""),
                                ("organism", ""), ("read_type", ""),
                                ("data_source", None), ("SRR", None),
                                ("SRX", None)]
                gsm_metadata[current_sample_id] = OrderedDict(columns_init)

                _LOGGER.info("Found sample: {}".format(current_sample_id))
            elif current_sample_id is not None:
                try:
                    pl = parse_SOFT_line(line)
                except IndexError:
                    # TODO: do we "fail the current sample" here and remove it
                    # from gsm_metadata? Or just skip the line?
                    _LOGGER.debug(
                        "Failed to parse alleged SOFT line for sample "
                        "ID {}; line: {}".format(current_sample_id, line))
                    continue
                gsm_metadata[current_sample_id].update(pl)

                # For processed data, here's where we would download it
                if args.processed and not args.just_metadata:
                    found = re.findall(SUPP_FILE_PATTERN, line)
                    if found:
                        print(pl[list(pl.keys())[0]])

                # Now convert the ids GEO accessions into SRX accessions
                if not current_sample_srx:
                    found = re.findall(EXPERIMENT_PATTERN, line)
                    if found:
                        _LOGGER.info("(SRX accession: {})".format(found[0]))
                        srx_id = found[0]
                        gsm_metadata[srx_id] = gsm_metadata.pop(
                            current_sample_id)
                        gsm_metadata[srx_id][
                            "gsm_id"] = current_sample_id  # save the GSM id
                        current_sample_id = srx_id
                        current_sample_srx = True

        # GSM SOFT file parsed, save it in a list
        metadata_dict[acc_GSE] = gsm_metadata

        # Parse out the SRA project identifier from the GSE file
        acc_SRP = None
        for line in open(file_gse, 'r'):
            found = re.findall(PROJECT_PATTERN, line)
            if found:
                acc_SRP = found[0]
                _LOGGER.info("Found SRA Project accession: {}".format(acc_SRP))
                break
            # For processed data, here's where we would download it
            if args.processed and not args.just_metadata:
                found = re.findall(SER_SUPP_FILE_PATTERN, line)
                if found:
                    pl = parse_SOFT_line(line)
                    file_url = pl[list(pl.keys())[0]].rstrip()
                    _LOGGER.info("File: " + str(file_url))
                    # download file
                    if args.geofolder:
                        data_folder = os.path.join(args.geofolder, acc_GSE)
                        print(file_url, data_folder)
                        subprocess.call(['wget', file_url, '-P', data_folder])

        if not acc_SRP:
            # If I can't get an SRA accession, maybe raw data wasn't submitted to SRA
            # as part of this GEO submission. Can't proceed.
            _LOGGER.warning(
                "\033[91mUnable to get SRA accession (SRP#) from GEO GSE SOFT file. No raw data?\033[0m"
            )
            # But wait; another possibility: there may be no SRP linked to the
            # GSE, but there could still be an SRX linked to each GSM.
            if len(gsm_metadata) == 1:
                acc_SRP = list(gsm_metadata.keys())[0]
                _LOGGER.warning("But the GSM has an SRX number; instead of an "
                                "SRP, using SRX identifier for this sample: " +
                                acc_SRP)
            else:
                # More than one sample? not sure what to do here. Does this even happen?
                continue

        # Now we have an SRA accession, grab the SraRunInfo metadata sheet:
        # The SraRunInfo sheet has additional sample metadata, which we will
        # combine with the GSM file to produce a single sample annotation sheet.
        if not os.path.isfile(file_sra) or args.refresh_metadata:
            Accession(acc_SRP).fetch_metadata(file_sra)
        else:
            _LOGGER.info("Found previous SRA file: " + file_sra)

        _LOGGER.info("SRP: {}".format(acc_SRP))

        # Parse metadata from SRA
        # Produce an annotated output from the GSM and SRARunInfo files.
        # This will merge the GSM and SRA sample metadata into a dict of dicts,
        # with one entry per sample.
        # NB: There may be multiple SRA runs (and thus lines in the RunInfo
        # file) corresponding to each sample.
        if not args.processed:
            file_read = open(file_sra, 'r')
            file_write = open(file_srafilt, 'w')
            _LOGGER.info("Parsing SRA file to download SRR records")
            initialized = False

            input_file = csv.DictReader(file_read)
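            # The code below relies on a few standard SraRunInfo columns:
            # "Run" (SRR accession), "Experiment" (SRX accession),
            # "SampleName", and "LibraryLayout" (SINGLE/PAIRED).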
            for line in input_file:
                if not initialized:
                    initialized = True
                    w = csv.DictWriter(file_write, line.keys())
                    w.writeheader()
                #print(line)
                #print(gsm_metadata[line['SampleName']])
                # SampleName is not necessarily the GSM number, though frequently it is
                #gsm_metadata[line['SampleName']].update(line)

                # Only download if it's in the include list:
                experiment = line["Experiment"]
                run_name = line["Run"]
                if experiment not in gsm_metadata:
                    # print("Skipping: {}".format(experiment))
                    continue

                # local convenience variable
                # possibly set in the input tsv file
                sample_name = None  # initialize to empty
                try:
                    sample_name = acc_GSE_list[acc_GSE][
                        gsm_metadata[experiment]["gsm_id"]]
                except KeyError:
                    pass
                if not sample_name:  # covers both None and ""
                    temp = gsm_metadata[experiment]['Sample_title']
                    # Now do a series of transformations to cleanse the sample name
                    temp = temp.replace(" ", "_")
                    # Do people put commas in their sample names? Yes.
                    temp = temp.replace(",", "_")
                    temp = temp.replace("__", "_")
                    sample_name = temp
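                    # e.g. a hypothetical title "K562, H3K27ac rep 1" becomes
                    # "K562_H3K27ac_rep_1" after the replacements above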

                # Record that there's SRA data for this run, and set a few
                # columns that are used as input to looper.
                # print("Updating columns for looper")
                update_columns(gsm_metadata,
                               experiment,
                               sample_name=sample_name,
                               read_type=line['LibraryLayout'])

                # Some experiments are flagged in SRA as having multiple runs.
                if gsm_metadata[experiment].get("SRR") is not None:
                    # This SRX number already has an entry in the table.
                    _LOGGER.info("Found additional run: {} ({})".format(
                        run_name, experiment))

                    if isinstance(gsm_metadata[experiment]["SRR"], _STRING_TYPES) \
                            and experiment not in gsm_multi_table:
                        # Only one has been stuck in so far, make a list
                        gsm_multi_table[experiment] = []
                        # Add first the original one, which was stored as a string
                        # previously
                        gsm_multi_table[experiment].append([
                            sample_name, experiment,
                            gsm_metadata[experiment]["SRR"]
                        ])
                        # Now append the current SRR number in a list as [SRX, SRR]
                        gsm_multi_table[experiment].append(
                            [sample_name, experiment, run_name])
                    else:
                        # this is the 3rd or later sample; the first two are done,
                        # so just add it.
                        gsm_multi_table[experiment].append(
                            [sample_name, experiment, run_name])

                    if args.split_experiments:
                        # Duplicate the gsm metadata for this experiment (copy to make sure
                        # it's not just an alias).
                        rep_number = len(gsm_multi_table[experiment])
                        new_SRX = experiment + "_" + str(rep_number)
                        gsm_metadata[new_SRX] = copy.copy(
                            gsm_metadata[experiment])
                        # gsm_metadata[new_SRX]["SRX"] = new_SRX
                        gsm_metadata[new_SRX]["sample_name"] += "_" + str(
                            rep_number)
                        gsm_metadata[new_SRX]["SRR"] = run_name
                    else:
                        # Either way, set the srr code to multi in the main table.
                        gsm_metadata[experiment]["SRR"] = "multi"
                else:
                    # The first SRR for this SRX is added to GSM metadata
                    gsm_metadata[experiment]["SRR"] = run_name

                #gsm_metadata[experiment].update(line)

                # Write to filtered SRA Runinfo file
                w.writerow(line)
                _LOGGER.info("Get SRR: {} ({})".format(run_name, experiment))
                bam_file = "" if args.bam_folder == "" else os.path.join(
                    args.bam_folder, run_name + ".bam")

                # TODO: sam-dump has a built-in prefetch. I don't have to do
                # any of this stuff... This also solves the bad sam-dump issues.

                if os.path.exists(bam_file):
                    _LOGGER.info("BAM found:" + bam_file)
                else:
                    if not args.just_metadata:
                        # Use the 'prefetch' utility from the SRA Toolkit
                        # to download the raw reads.
                        # (http://www.ncbi.nlm.nih.gov/books/NBK242621/)

                        # Set up a simple loop to try a few times in case of failure
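                        # The loop below makes up to NUM_RETRIES attempts,
                        # sleeping t * 2 seconds after each failure
                        # (2 s, 4 s, 6 s, ...), before giving up on the run.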
                        t = 0
                        while True:
                            t = t + 1
                            subprocess_return = subprocess.call([
                                'prefetch', run_name, '--max-size', '50000000'
                            ])
                            if subprocess_return == 0:
                                break
                            if t >= NUM_RETRIES:
                                _LOGGER.info(
                                    "Prefetch retries failed. Try this sample later"
                                )
                                failed_runs.append(run_name)
                                break
                            _LOGGER.info(
                                "Prefetch attempt failed, wait a few seconds to try again"
                            )
                            time.sleep(t * 2)
                    else:
                        _LOGGER.info("Dry run (no data download)")

                    if args.bam_conversion and args.bam_folder != '':
                        _LOGGER.info("Converting to bam: " + run_name)
                        sra_file = os.path.join(args.sra_folder,
                                                run_name + ".sra")
                        if not os.path.exists(sra_file):
                            _LOGGER.info("SRA file doesn't exist, please "
                                         "download it first: " + sra_file)
                            continue

                        # The -u here allows unaligned reads, and seems to be
                        # required for some sra files regardless of aligned state
                        cmd = "sam-dump -u " + \
                              os.path.join(args.sra_folder, run_name + ".sra") + \
                              " | samtools view -bS - > " + bam_file
                        #sam-dump -u SRR020515.sra | samtools view -bS - > test.bam

                        _LOGGER.info("Conversion command: {}".format(cmd))
                        subprocess.call(cmd, shell=True)

                # check to make sure it worked
                # NS: Sometimes sam-dump fails, yielding an empty bam file, but
                # a fastq-dump works. This happens on files with bad quality
                # encodings. I contacted GEO about it in December 2015
                # Here we check the file size and use fastq -> bam conversion
                # if the sam-dump failed.
                if args.bam_conversion and args.bam_folder != '':
                    st = os.stat(bam_file)
                    # print("File size: " + str(st.st_size))
                    if st.st_size < 100:
                        _LOGGER.warning(
                            "Bam conversion failed with sam-dump. Trying fastq-dump..."
                        )
                        # recreate?
                        cmd = "fastq-dump --split-3 -O " + \
                              os.path.realpath(args.sra_folder) + " " + \
                              os.path.join(args.sra_folder, run_name + ".sra")
                        _LOGGER.info("Command: {}".format(cmd))
                        subprocess.call(cmd, shell=True)
                        if not args.picard_path:
                            _LOGGER.warning(
                                "Can't convert the fastq to bam without picard path"
                            )
                        else:
                            # was it paired data? you have to process it differently
                            # so it knows it's paired end
                            fastq0 = os.path.join(args.sra_folder,
                                                  run_name + ".fastq")
                            fastq1 = os.path.join(args.sra_folder,
                                                  run_name + "_1.fastq")
                            fastq2 = os.path.join(args.sra_folder,
                                                  run_name + "_2.fastq")

                            cmd = "java -jar " + args.picard_path + " FastqToSam"
                            if os.path.exists(fastq1) and os.path.exists(
                                    fastq2):
                                cmd += " FASTQ=" + fastq1
                                cmd += " FASTQ2=" + fastq2
                            else:
                                cmd += " FASTQ=" + fastq0
                            cmd += " OUTPUT=" + bam_file
                            cmd += " SAMPLE_NAME=" + run_name
                            cmd += " QUIET=true"
                            _LOGGER.info("Conversion command: {}".format(cmd))
                            subprocess.call(cmd, shell=True)

            file_read.close()
            file_write.close()

        # accumulate subannotations
        subannotation_dict[acc_GSE] = gsm_multi_table

    # Combine individual accessions into project-level annotations, and write
    # individual accession files (if requested)

    metadata_dict_combined = OrderedDict()
    for acc_GSE, gsm_metadata in metadata_dict.items():
        file_annotation = os.path.join(metadata_expanded,
                                       acc_GSE + '_annotation.csv')
        if args.acc_anno:
            write_annotation(gsm_metadata,
                             file_annotation,
                             use_key_subset=args.use_key_subset)
        metadata_dict_combined.update(gsm_metadata)

    subannotation_dict_combined = OrderedDict()
    for acc_GSE, gsm_multi_table in subannotation_dict.items():
        file_subannotation = os.path.join(metadata_expanded,
                                          acc_GSE + '_subannotation.csv')
        if args.acc_anno:
            write_subannotation(gsm_multi_table, file_subannotation)
        subannotation_dict_combined.update(gsm_multi_table)

    _LOGGER.info("Finished processing {} accession(s)".format(
        len(acc_GSE_list)))

    if len(failed_runs) > 0:
        _LOGGER.warning(
            "The following samples could not be downloaded: {}".format(
                failed_runs))

    # if user specified a pipeline interface path, add it into the project config
    if args.pipeline_interfaces:
        file_pipeline_interfaces = args.pipeline_interfaces
    else:
        file_pipeline_interfaces = "null"

    _LOGGER.info(
        "Creating complete project annotation sheets and config file...")
    # If the project included more than one GSE, we can now output combined
    # annotation tables for the entire project.

    # Write combined annotation sheet
    file_annotation = os.path.join(metadata_raw,
                                   project_name + '_annotation.csv')
    write_annotation(metadata_dict_combined,
                     file_annotation,
                     use_key_subset=args.use_key_subset)

    # Write combined subannotation table
    if len(subannotation_dict_combined) > 0:
        file_subannotation = os.path.join(metadata_raw,
                                          project_name + '_subannotation.csv')
        write_subannotation(subannotation_dict_combined, file_subannotation)
    else:
        file_subannotation = "null"

    # Write project config file

    if not args.config_template:
        geofetchdir = os.path.dirname(__file__)
        args.config_template = os.path.join(geofetchdir,
                                            "config_template.yaml")

    with open(args.config_template, 'r') as template_file:
        template = template_file.read()

    template_values = {
        "project_name": project_name,
        "annotation": file_annotation,
        "subannotation": file_subannotation,
        "pipeline_interfaces": file_pipeline_interfaces
    }

    for k, v in template_values.items():
        placeholder = "{" + str(k) + "}"
        template = template.replace(placeholder, str(v))
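    # Simple placeholder substitution: a hypothetical template line such as
    # "name: {project_name}" would become "name: my_project" for a project
    # called my_project.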

    config = os.path.join(metadata_raw, project_name + "_config.yaml")
    _write(config, template, msg_pre="  Config file: ")
Exemplo n.º 18
0
def test_typical_verbosity(parser, verbosity):
    """ Typical verbosity specifications yield logger with expected level. """
    opts = parser.parse_args([VERBOSITY_OPTNAME, str(verbosity)])
    logger = logger_via_cli(opts)
    exp = getattr(logging, LEVEL_BY_VERBOSITY[verbosity - 1])
    _assert_level(logger, exp)
Exemplo n.º 19
0
def main():
    """Primary workflow"""

    parser = logmuse.add_logging_options(build_argparser())
    # args, remaining_args = parser.parse_known_args()
    args = parser.parse_args()

    logger_kwargs = {"level": args.verbosity, "devmode": args.logdev}
    logmuse.init_logger("yacman", **logger_kwargs)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    if args.command == "init":
        divcfg = args.config
        _LOGGER.debug("Initializing divvy configuration")
        is_writable(os.path.dirname(divcfg), check_exist=False)
        divvy_init(divcfg, DEFAULT_CONFIG_FILEPATH)
        sys.exit(0)

    _LOGGER.debug("Divvy config: {}".format(args.config))
    divcfg = select_divvy_config(args.config)
    _LOGGER.info("Using divvy config: {}".format(divcfg))
    dcc = ComputingConfiguration(filepath=divcfg)

    if args.command == "list":
        # Output header via logger and content via print so the user can
        # redirect the list from stdout if desired without the header as clutter
        _LOGGER.info("Available compute packages:\n")
        print("{}".format("\n".join(dcc.list_compute_packages())))
        sys.exit(1)

    # Any non-divvy arguments will be passed along as key-value pairs
    # that can be used to populate the template.
    # keys = [str.replace(x, "--", "") for x in remaining_args[::2]]
    # cli_vars = dict(zip(keys, remaining_args[1::2]))
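    # e.g. a hypothetical CLI input "--compute mem=16G cores=4" yields
    # cli_vars == {"mem": "16G", "cores": "4"}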
    if args.compute:
        cli_vars = {y[0]: y[1] for y in [x.split("=") for x in args.compute]}
    else:
        cli_vars = {}

    if args.command == "write" or args.command == "submit":
        try:
            dcc.activate_package(args.package)
        except AttributeError:
            parser.print_help(sys.stderr)
            sys.exit(1)

        if args.settings:
            _LOGGER.info("Loading settings file: %s", args.settings)
            with open(args.settings, "r") as f:
                vars_groups = [cli_vars, yaml.load(f, SafeLoader)]
        else:
            vars_groups = [cli_vars]
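        # A hypothetical settings YAML such as
        #   mem: 32000
        #   cores: 8
        # would be loaded into a dict and passed along with cli_vars below.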

        _LOGGER.debug(vars_groups)
        if args.command == "write":
            dcc.write_script(args.outfile, vars_groups)
        elif args.command == "submit":
            dcc.submit(args.outfile, vars_groups)