Exemplo n.º 1
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the germline set it infers to <outfname>.

    Returns immediately if utils.output_exists() reports that <outfname> is
    already there. If --n-random-queries is set, a subsampled fasta is written
    under <outdir> (an existing one is reused) and used as input instead of
    <infname>. The igdiscover commands are run through a bash script written to
    <outdir>/run.sh, after which <outdir>/work/final/database/<REGION>.fasta is
    converted to a glfo and written with glutils.write_glfo().

    Configuration is read from the module-level <args> namespace.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(
            infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = [
        '#!/bin/bash',
        'export PATH=%s:$PATH' % args.condapath,
        'export PYTHONNOUSERSITE=True',  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
        'cd %s' % outdir,
        'igdiscover init --db db --single-reads %s work' % infname,  # prepares to run, putting files into <outdir>
        'cp %s work/' % os.path.basename(args.yamlfname),
        'cd work',
        'igdiscover run',
    ]
    utils.simplerun('\n'.join(cmds) + '\n',
                    cmdfname=outdir + '/run.sh',
                    print_time='igdiscover',
                    debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(
        igdiscover_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)

    # str.rstrip() removes a *set of characters*, not a suffix, so it would also eat the end of a parent
    # directory whose name is made of those characters (e.g. '/x/high/igh' -> '/x'). Remove the trailing
    # '/<locus>' explicitly instead.
    locus_suffix = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Exemplo n.º 2
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the germline set it infers to <outfname>.

    Returns immediately if utils.output_exists() reports that <outfname> is
    already there. If --n-random-queries is set, a subsampled fasta is written
    under <outdir> (an existing one is reused) and used as input instead of
    <infname>. The igdiscover commands (prefixed by getpathcmd() and a conda
    activate of args.env_label) are run through a bash script written to
    <outdir>/run.sh, after which <outdir>/work/final/database/<REGION>.fasta is
    converted to a glfo and written with glutils.write_glfo().

    Configuration is read from the module-level <args> namespace.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(
            infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += [
        'conda activate %s' % args.env_label,
        'cd %s' % outdir,
        'igdiscover init --db db --single-reads %s work' % infname,  # prepares to run, putting files into <outdir>
        'cp %s work/' % os.path.basename(args.yamlfname),
        'cd work',
        'igdiscover run',
    ]
    utils.simplerun('\n'.join(cmds) + '\n',
                    cmdfname=outdir + '/run.sh',
                    print_time='igdiscover',
                    debug=True)

    template_gldir = args.glfo_dir  # if args.glfo_dir is not None else 'data/germlines/ XXX human'  # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up)
    glfo = glutils.create_glfo_from_fasta(
        igdiscover_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)

    # str.rstrip() removes a *set of characters*, not a suffix, so it would also eat the end of a parent
    # directory whose name is made of those characters (e.g. '/x/high/igh' -> '/x'). Remove the trailing
    # '/<locus>' explicitly instead.
    locus_suffix = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Exemplo n.º 3
0
 def write(self, outfname, is_data, reco_info=None, true_partition=None,
           n_to_write=None, calc_missing_values='none', partition_lines=None):
     """Write partition lines to the csv file <outfname>.

     If <partition_lines> is None, they are obtained from
     self.get_partition_lines() using the remaining arguments. Each row's
     'partition' value is joined into a string (':' between the entries of a
     cluster, ';' between clusters), and 'bad_clusters', if present, is joined
     with ';'. Note that the rows are modified in place before being written.

     Raises Exception if <outfname> does not have a .csv suffix.
     """
     if utils.getsuffix(outfname) != '.csv':
         raise Exception('unhandled file extension %s' % outfname)

     if partition_lines is None:
         partition_lines = self.get_partition_lines(is_data,
                                                    reco_info=reco_info,
                                                    true_partition=true_partition,
                                                    n_to_write=n_to_write,
                                                    calc_missing_values=calc_missing_values)

     with open(outfname, 'w') as csvfile:
         csv_writer = csv.DictWriter(csvfile, self.get_headers(is_data))
         csv_writer.writeheader()
         for pline in partition_lines:
             cluster_strs = []
             for cluster in pline['partition']:
                 cluster_strs.append(':'.join(cluster))
             pline['partition'] = ';'.join(cluster_strs)
             if 'bad_clusters' in pline:
                 pline['bad_clusters'] = ';'.join(pline['bad_clusters'])
             csv_writer.writerow(pline)
Exemplo n.º 4
0
def read_partis_output(partition_file, glfo_dir=None, locus=None):
    """Read a partis output file and return (glfo, annotation_list, cpath).

    For any suffix other than .yaml, germline info is first read from
    <glfo_dir> (or default_glfo_dir if <glfo_dir> is empty/None) for <locus> and
    handed to utils.read_output(); for .yaml files None is passed instead.
    """
    if utils.getsuffix(partition_file) == ".yaml":
        input_glfo = None
    else:
        input_glfo = glutils.read_glfo(glfo_dir or default_glfo_dir, locus)

    # returns glfo from the file if it's there, otherwise it returns the one we passed in
    glfo, annotation_list, cpath = utils.read_output(partition_file, glfo=input_glfo)
    return glfo, annotation_list, cpath
Exemplo n.º 5
0
    def readfile(self, fname):
        """Read the partition file <fname> into this object.

        .csv files are read with csv.DictReader and their rows passed to
        self.readlines(lines, process_csv=True); .yaml files are handed to
        utils.read_yaml_output() with cpath=self.

        Raises Exception if <fname> is None, the file has size zero, a csv file
        has no 'partition' column, or the suffix is neither .csv nor .yaml.
        """
        if fname is None:
            raise Exception('can\'t read NoneType partition file')
        if os.stat(fname).st_size == 0:
            raise Exception('partition file %s has size zero' % fname)

        suffix = utils.getsuffix(fname)
        if suffix == '.csv':
            with open(fname, 'r') as infile:
                reader = csv.DictReader(infile)
                if 'partition' not in reader.fieldnames:
                    raise Exception('\'partition\' not among headers in %s, maybe this isn\'t a partition file? (if you\'re running \'view-output\' on a deprecated csv output file, you may need to run \'view-annotations\' instead, to tell it that this is an annotation file rather than a partition file)' % fname)
                lines = list(reader)  # read everything while the file is still open
            self.readlines(lines, process_csv=True)
        elif suffix == '.yaml':
            utils.read_yaml_output(fname, cpath=self)
        else:
            # previously formatted the undefined name <outfname>, which raised NameError instead of this message
            raise Exception('unhandled annotation file suffix %s' % fname)
Exemplo n.º 6
0
 def write_presto_partitions(self, outfname, input_info):
     """Write the best partition to the fasta file <outfname> in presto format.

     Each sequence in self.partitions[self.i_best] gets a header of the form
     '><uid>|CLONE=<cluster index>', with the index counting clusters from
     zero. Every uid must have exactly one sequence in <input_info>.
     """
     print('   writing presto partition %s' % outfname)
     assert utils.getsuffix(outfname) in ['.fa', '.fasta']  # already checked in processargs.py
     with open(outfname, 'w') as fastafile:
         for iclust, cluster in enumerate(self.partitions[self.i_best]):
             for uid in cluster:
                 seqs = input_info[uid]['seqs']
                 assert len(seqs) == 1
                 fastafile.write('>%s|CLONE=%d\n%s\n' % (uid, iclust, seqs[0]))
Exemplo n.º 7
0
    def readfile(self, fname):
        """Read the partition file <fname> into this object.

        .csv files are read with csv.DictReader and their rows passed to
        self.readlines(lines, process_csv=True); .yaml files are handed to
        utils.read_yaml_output() with cpath=self.

        Raises Exception if <fname> is None, the file has size zero, a csv file
        has no 'partition' column, or the suffix is neither .csv nor .yaml.
        """
        if fname is None:
            raise Exception('can\'t read NoneType partition file')
        if os.stat(fname).st_size == 0:
            raise Exception('partition file %s has size zero' % fname)

        suffix = utils.getsuffix(fname)
        if suffix == '.csv':
            with open(fname, 'r') as infile:
                reader = csv.DictReader(infile)
                if 'partition' not in reader.fieldnames:
                    raise Exception(
                        '\'partition\' not among headers in %s, maybe this isn\'t a partition file?'
                        % fname)
                lines = list(reader)  # read everything while the file is still open
            self.readlines(lines, process_csv=True)
        elif suffix == '.yaml':
            utils.read_yaml_output(fname, cpath=self)
        else:
            # previously formatted the undefined name <outfname>, which raised NameError instead of this message
            raise Exception('unhandled annotation file suffix %s' % fname)
Exemplo n.º 8
0
    def generate_trees(self, seed, outfname, workdir):
        """Obtain tree strings and (re)write them to <outfname>.

        Trees come from self.run_treesim() by default, or from
        self.read_input_tree_file() if --input-simulation-treefname was set.
        <outfname> is removed and then rewritten: a .yaml suffix gets a json
        dump holding the per-region branch length ratios plus the trees, a .nwk
        suffix gets one tree string per line. Any other suffix fails an
        assertion (after the file has been opened for writing).
        """
        if self.args.input_simulation_treefname is not None:  # read trees from a file that was set on the command line
            _ages, treestrs = self.read_input_tree_file(outfname)
        else:  # default: generate our own trees
            _ages, treestrs = self.run_treesim(seed, outfname, workdir)
        # remove it here, just to make clear that we *re*write it in self.post_process_trees() so that recombinator can later read it
        os.remove(outfname)

        if self.args.debug or utils.getsuffix(outfname) == '.nwk':
            dtrees = []
            for tstr in treestrs:
                dtrees.append(treeutils.get_dendro_tree(treestr=tstr, suppress_internal_node_taxa=True))
            leaf_heights = [treeutils.get_mean_leaf_height(tree=dtree) for dtree in dtrees]
            leaf_counts = [treeutils.get_n_leaves(dtree) for dtree in dtrees]
            print('    mean over %d trees:   depth %.5f   leaves %.2f' % (len(leaf_heights), numpy.mean(leaf_heights), numpy.mean(leaf_counts)))

        # each tree is written with branch length the mean branch length over the whole sequence (which is different for each tree), but recombinator also needs the relative length for each region (which is the same, it's an average over the whole repertoire)
        with open(outfname, 'w') as treefile:
            suffix = utils.getsuffix(outfname)
            if suffix == '.yaml':
                length_ratios = {}
                for region in utils.regions:
                    length_ratios[region] = self.branch_lengths[region]['mean'] / self.branch_lengths['all']['mean']
                json.dump({'branch-length-ratios': length_ratios, 'trees': treestrs}, treefile)
            elif suffix == '.nwk':
                print('    writing trees to %s' % outfname)
                for treestr in treestrs:
                    treefile.write(treestr + '\n')
            else:
                assert False
Exemplo n.º 9
0
def run_igblast(infname, outfname):
    """Run igblastn on <infname>, writing its output to <outfname>.

    Returns immediately if utils.output_exists() reports that <outfname> is
    already there. If --n-random-queries is set, a subsampled fasta is written
    next to <outfname> (an existing one is reused) and used as the query file.
    The commands are run through a bash script written to <args.workdir>/run.sh.

    Configuration is read from the module-level <args> namespace.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print('%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning'))

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(
            infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = os.path.dirname(outfname) + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igblast (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igblast (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    script_lines = [
        '#!/bin/bash',
        'cd %s/%s' % (args.igbdir, args.locus),
        'export PATH=%s:$PATH' % args.condapath,
    ]

    # assemble the single igblastn command line from its space-separated pieces
    igblast_parts = ['igblastn']
    for region in utils.regions:
        igblast_parts.append('-germline_db_%s %s%s-unaligned.fasta' % (region.upper(), args.locus, region))
    igblast_parts.append('-auxiliary_data optional_file/%s_gl.aux' % args.species)
    igblast_parts.append('-domain_system imgt -ig_seqtype Ig -organism %s -outfmt \'7 std qseq sseq btop\'' % args.species)
    igblast_parts.append('-num_threads %d' % utils.auto_n_procs())
    igblast_parts.append('-query ' + infname + ' -out ' + outfname)
    script_lines.append(' '.join(igblast_parts))

    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname=args.workdir + '/run.sh')
Exemplo n.º 10
0
def read_sw_info(sw_cache, locus):
    """Read the sw cache file <sw_cache> and return its annotations keyed by uid.

    For a .csv cache, the glfo location passed to
    process_partis.read_partis_output() is <sw_cache> with its suffix replaced
    by "-glfo"; otherwise None is passed. Every annotation must hold exactly
    one unique id.
    """
    if utils.getsuffix(sw_cache) == ".csv":
        sw_cache_glfo = utils.replace_suffix(sw_cache, "-glfo")
    else:
        sw_cache_glfo = None

    _, sw_annotations, _ = process_partis.read_partis_output(sw_cache, sw_cache_glfo, locus)

    annotations_by_uid = {}
    for annotation in sw_annotations:
        uids = annotation["unique_ids"]
        # would only fail if this was not actually an sw cache file, checking to illustrate sw case is special
        assert len(uids) == 1
        annotations_by_uid[uids[0]] = annotation
    return annotations_by_uid
Exemplo n.º 11
0
sys.path.insert(1, partis_dir + '/python')

import utils
import glutils
from clusterpath import ClusterPath

# command line: which output file to read, plus where to find germline info for the deprecated csv format
parser = argparse.ArgumentParser()
parser.add_argument('--fname', default=partis_dir + '/test/reference-results/partition-ref-simu.yaml')
parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()

if utils.getsuffix(args.fname) == '.csv':
    print('  reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir)
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)
else:
    glfo = None

glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo)

# with no partitions there is nothing to list, so show one annotation and stop
if cpath is None or len(cpath.partitions) == 0:
    print('no partitions read from %s, so just printing first annotation:' % args.fname)
    utils.print_reco_event(annotation_list[0])
    sys.exit(0)

print(utils.color('green', 'list of partitions:'))
# 'abbreviate' print little 'o's instead of the full sequence ids
cpath.print_partitions(abbreviate=True)
Exemplo n.º 12
0
def read_sequence_file(infname,
                       is_data,
                       n_max_queries=-1,
                       args=None,
                       simglfo=None,
                       quiet=False,
                       more_input_info=None):
    """Read query sequences (and, for simulation, their true annotations) from <infname>.

    The reader is chosen by file suffix: suffixes in <delimit_info> are read
    with csv.DictReader, .fa/.fasta/.fastx with utils.read_fastx(), and .yaml
    with utils.read_yaml_output().

    Arguments:
      infname: input file path
      is_data: if False, each line is also stored in <reco_info> and must contain 'v_gene'
      n_max_queries: if positive, stop after this many queries have been added
      args: optional namespace; if not None, its istartstop, name_column, seq_column, abbreviate,
        queries, reco_ids, seed_unique_id and input_metafname attributes are consulted
      simglfo: if not None, used to add implicit info to each <reco_info> line (for non-data
        .yaml input it is replaced by the glfo read from the file)
      quiet: suppresses the --n-max-queries message (and only that message)
      more_input_info: optional extra entries merged into <input_info> with update()

    Returns (input_info, reco_info, yaml_glfo):
      input_info: OrderedDict from uid to {'unique_ids': [uid], 'seqs': [seq]}, plus any
        utils.input_metafile_keys values copied over from simulation lines
      reco_info: None if <is_data>, otherwise an OrderedDict from uid to a deep copy of the line
      yaml_glfo: the glfo returned by utils.read_yaml_output() for .yaml input, otherwise None

    Raises Exception for an unhandled suffix, a missing sequence column, a line with more than
    one unique id, a uid that is already present after abbreviation, characters outside
    utils.alphabet, simulation input without 'v_gene', or if no sequences end up being read.
    """
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(
            infname
        )  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
            infname,
            n_max_queries=n_max_queries,
            synth_single_seqs=True,
            dont_add_implicit_info=True
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        # a uid that was already read gets '-2', '-3', ... appended until it is unique
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        # checked again because --abbreviate may have replaced <uid> after the renaming above
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[
                        uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else))
                    input_info[uid][line_key] = copy.deepcopy(
                        reco_info[uid][line_key]
                    )  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print '  %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname,
                          input_info.values(),
                          debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
Exemplo n.º 13
0
def process(args):
    if args.action == 'run-viterbi':
        print'  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print'  note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'
    if args.calculate_alternative_naive_seqs:
        print '    note: replacing deprecated option \'--calculate-alternative-naive-seqs\' with new option \'--calculate-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.calculate_alternative_annotations = True
        delattr(args, 'calculate_alternative_naive_seqs')

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        print '  commit: %s' % subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        cmd = 'git describe --always --tags'
        out, err = utils.simplerun(cmd, return_out_err=True, debug=False)
        if '-' in out:
            if out.count('-') == 2:
                tag, n_ahead, commit_hash_abbrev = out.strip().split('-')
                ahead_str = ''
                if int(n_ahead) > 0:
                    ahead_str = '  (well, %d commits ahead of)' % int(n_ahead)
                print '     tag: %s%s' % (tag, ahead_str)
            else:
                print '    couldn\'t figure out tag from \'%s\' output: %s' % (cmd, out)
        else:
            tag = out.strip()
            print '     tag: %s' % tag

        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print '  note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1]
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output and not args.airr_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color('yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception('can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified for plot-partitions')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print '  %s setting subsimproc' % utils.color('red', 'warning')
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print '  note: no --outfname specified, so nothing will be written to disk'
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the wole run_simulation() in bin/partis to handle None type outfname
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir) and len(set(os.listdir(args.parameter_dir)) & set(parameter_type_choices)) == 0:
            raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.infname is None and args.action not in ['simulate', 'view-output', 'view-annotations', 'view-partitions', 'view-cluster-annotations', 'plot-partitions', 'view-alternative-annotations', 'get-tree-metrics', 'get-linearham-info']:
        raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])
Exemplo n.º 14
0
# Remaining optional arguments (the parser itself, and the positional arguments read below as args.infile and args.outfile, are defined earlier in the script).
parser.add_argument('--indel-reversed-seqs', action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.')
parser.add_argument('--glfo-dir', help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.')
parser.add_argument('--locus', default='igh', help='only used for old-style csv output files')
parser.add_argument('--plotdir', help='if set, plot annotation parameters from --fname to --plotdir and exit (you still have to set outfile, sorry, it\'s nice having it be a positional arg, but it doesn\'t get used). To add e.g. per-gene-per-position plots comment/uncomment args in the call below.')
parser.add_argument('--fasta-info-separator', default=' ', help='character to use ')

# Backwards compatibility: sys.argv is rewritten in place *before* parse_args() is called, so the old option-style invocation ends up looking like the new positional one.
if 'extract-fasta.py' in sys.argv[0]:  # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
    print '  note: running deprecated script %s, which currently is just a link pointing to %s' % (os.path.basename(sys.argv[0]), os.path.basename(os.path.realpath( __file__)))
    print '  note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)'
    # insert the two old option values into the arg list (NOTE(review): presumably right after sys.argv[0] -- confirm against utils.insert_in_arglist), then drop the old options together with their values
    utils.insert_in_arglist(sys.argv, [utils.get_val_from_arglist(sys.argv, '--input-file'), utils.get_val_from_arglist(sys.argv, '--fasta-output-file')], sys.argv[0])
    utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True)
    utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True)

args = parser.parse_args()
args.extra_columns = utils.get_arg_list(args.extra_columns)
# only csv, tsv and fasta output suffixes are accepted; anything else stops here with an AssertionError
assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta']

# csv input needs germline info from a separate directory, so fall back to the human germline directory shipped under partis_dir if --glfo-dir wasn't given
default_glfo_dir = partis_dir + '/data/germlines/human'
if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None:
    print '  note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir
    args.glfo_dir = default_glfo_dir
glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus)

# Plotting mode: count parameters over every annotation, write the plots to --plotdir, and exit without going on to the rest of the script.
if args.plotdir is not None:
    from parametercounter import ParameterCounter
    # set [0, 0] (one entry each for '5p' and '3p') for every region on args before handing it to ParameterCounter (NOTE(review): presumably ParameterCounter reads args.region_end_exclusions -- confirm)
    setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions})  # hackity hackity hackity
    pcounter = ParameterCounter(glfo, args)
    for line in annotation_list:
        pcounter.increment(line)
    pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True
    sys.exit(0)
Exemplo n.º 15
0
# Backwards compatibility: sys.argv is rewritten in place *before* parse_args() is called, so the old option-style invocation ends up looking like the new positional one.
if 'extract-fasta.py' in sys.argv[
        0]:  # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
    print '  note: running deprecated script %s, which currently is just a link pointing to %s' % (
        os.path.basename(
            sys.argv[0]), os.path.basename(os.path.realpath(__file__)))
    print '  note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)'
    # insert the two old option values into the arg list (NOTE(review): presumably right after sys.argv[0] -- confirm against utils.insert_in_arglist), then drop the old options together with their values
    utils.insert_in_arglist(sys.argv, [
        utils.get_val_from_arglist(sys.argv, '--input-file'),
        utils.get_val_from_arglist(sys.argv, '--fasta-output-file')
    ], sys.argv[0])
    utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True)
    utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True)

args = parser.parse_args()
args.extra_columns = utils.get_arg_list(args.extra_columns)
# only csv, tsv and fasta output suffixes are accepted; anything else stops here with an AssertionError
assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta']

# csv input needs germline info from a separate directory, so fall back to the human germline directory shipped under partis_dir if --glfo-dir wasn't given
default_glfo_dir = partis_dir + '/data/germlines/human'
if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None:
    print '  note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir
    args.glfo_dir = default_glfo_dir
glfo, annotation_list, cpath = utils.read_output(args.infile,
                                                 glfo_dir=args.glfo_dir,
                                                 locus=args.locus)

# Plotting mode: build a ParameterCounter from the germline info and the (augmented) args.
if args.plotdir is not None:
    from parametercounter import ParameterCounter
    # set [0, 0] (one entry each for '5p' and '3p') for every region on args before handing it to ParameterCounter (NOTE(review): presumably ParameterCounter reads args.region_end_exclusions -- confirm)
    setattr(args, 'region_end_exclusions',
            {r: [0 for e in ['5p', '3p']]
             for r in utils.regions})  # hackity hackity hackity
    pcounter = ParameterCounter(glfo, args)
Exemplo n.º 16
0
    'point in max-abs-diff above which we assume most sequences are chimeric')
parser.add_argument('--title')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()
# translate the two shorthand titles to their display text; any other value (including an unset title) passes through unchanged
args.title = {
    'good': 'none',
    'chimeras': 'all chimeras',
}.get(args.title, args.title)

def gk(uids):
    """Return the key for a group of sequences: the strings in <uids> joined by colons."""
    separator = ':'
    return separator.join(uids)


# germline info is only read separately for csv input; for anything else read_output() is handed glfo=None
glfo = glutils.read_glfo(args.glfo_dir, args.locus) if utils.getsuffix(args.infile) == '.csv' else None
glfo, annotation_list, _ = utils.read_output(args.infile, glfo=glfo)

# index each annotation by its first unique id, keeping the order in which they were read
annotations = collections.OrderedDict(
    (line['unique_ids'][0], line) for line in annotation_list)

# for each uid: the two values returned by get_chimera_max_abs_diff() for the first sequence (iseq=0), stored under 'imax' and 'max_abs_diff'
chfo = {
    uid: dict(
        zip(('imax', 'max_abs_diff'),
            utils.get_chimera_max_abs_diff(
                annotations[uid], iseq=0, chunk_len=args.chunk_len)))
    for uid in annotations
}
Exemplo n.º 17
0
    def run_treesim(self, seed, outfname, workdir):
        if self.args.debug or utils.getsuffix(outfname) == '.nwk':
            print '  generating %d tree%s,' % (
                self.args.n_trees, utils.plural(self.args.n_trees)),
            if self.args.constant_number_of_leaves:
                print 'all with %s leaves' % str(self.args.n_leaves)
            else:
                print 'n-leaves from %s' % (
                    'hist in parameter dir' if self.final_nldist == 'hist' else
                    '%s distribution with parameter %s' %
                    (self.final_nldist, str(self.args.n_leaves)))
            if self.args.debug:
                print '        mean branch lengths from %s' % (
                    self.parameter_dir
                    if self.parameter_dir is not None else 'scratch')
                for mtype in [
                        'all',
                ] + utils.regions:
                    print '         %4s %7.3f (ratio %7.3f)' % (
                        mtype, self.branch_lengths[mtype]['mean'],
                        self.branch_lengths[mtype]['mean'] /
                        self.branch_lengths['all']['mean'])

        ages, treestrs = [], []

        cmd_lines = []
        pkgname = 'TreeSim'  # TreeSimGM when root_mrca_weibull_parameter is set, otherwise TreeSim
        if self.args.root_mrca_weibull_parameter is not None:
            pkgname += 'GM'
        cmd_lines += ['require(%s, quietly=TRUE)' % pkgname]
        cmd_lines += ['set.seed(' + str(seed) + ')']
        for itree in range(self.args.n_trees):
            n_leaves = self.choose_n_leaves()
            age = self.choose_full_sequence_branch_length()
            ages.append(age)
            if n_leaves == 1:  # add singleton trees by hand
                treestrs.append('t1:%f;' % age)
                continue
            treestrs.append(None)

            # NOTE these simulation functions seem to assume that we want all the extant leaves to have the same height. Which is kind of weird. Maybe makes more sense at some point to change this.
            params = {'n': n_leaves, 'numbsim': self.n_trees_each_run}
            if self.args.root_mrca_weibull_parameter is None:
                fcn = 'sim.bd.taxa.age'
                params['lambda'] = 1  # speciation_rate
                params['mu'] = 0.5  # extinction_rate
                params['age'] = age
            else:
                fcn = 'sim.taxa'
                params['distributionspname'] = '"rweibull"'
                params[
                    'distributionspparameters'] = 'c(%f, 1)' % self.args.root_mrca_weibull_parameter
                params[
                    'labellivingsp'] = '"t"'  # TreeSim doesn't let you do this, but a.t.m. this is their default
            cmd_lines += [
                'trees <- %s(%s)' % (fcn, ', '.join(
                    ['%s=%s' % (k, str(v)) for k, v in params.items()]))
            ]
            cmd_lines += [
                'write.tree(trees[[1]], \"' + outfname + '\", append=TRUE)'
            ]

        if None not in treestrs:  # if every tree has one leaf, we don't need to run R
            open(outfname, 'w').close()
        else:
            if os.path.exists(outfname):
                os.remove(outfname)
            utils.run_r(
                cmd_lines,
                workdir,
                print_time='tree generation' if self.args.debug else None)

        with open(outfname) as treefile:
            for itree, tstr in enumerate(treestrs):
                if tstr is None:
                    treestrs[itree] = treefile.readline().strip()
            if None in treestrs:
                raise Exception(
                    'didn\'t read enough trees from %s: still %d empty places in treestrs'
                    % (outfname, treestrs.count(None)))

        # rescale branch lengths (TreeSim lets you specify the number of leaves and the height at the same time, but TreeSimGM doesn't, and TreeSim's numbers are usually a little off anyway... so we rescale everybody)
        for itree in range(len(ages)):
            treestrs[itree] = '(%s):0.0;' % treestrs[itree].rstrip(
                ';'
            )  # the trees it spits out have non-zero branch length above root (or at least that's what the newick strings turn into when dendropy reads them), which is f****d up and annoying, so here we add a new/real root at the top of the original root's branch
            treestrs[itree] = treeutils.rescale_tree(ages[itree],
                                                     treestr=treestrs[itree])

        return ages, treestrs
Exemplo n.º 18
0
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)'
        args.action = 'annotate'

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (
            args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.n_procs = utils.get_arg_list(args.n_procs, intify=True)
    args.n_fewer_procs = args.n_procs[0] if len(
        args.n_procs) == 1 else args.n_procs[1]
    args.n_procs = args.n_procs[0]
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' %
                            (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region,
                                               intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception(
            'n-max-per-region should be of the form \'x:y:z\', but I got ' +
            str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(
        args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(
            args.write_additional_cluster_annotations) != 2:
        raise Exception(
            '--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s'
            % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(
        args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.region_end_exclusions = {
        r: [
            args.region_end_exclusion_length if
            ('%s_%s' % (r, e)) in utils.real_erosions else 0
            for e in ['5p', '3p']
        ]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.initial_match_mismatch = utils.get_arg_list(
        args.initial_match_mismatch, intify=True)
    if len(args.initial_match_mismatch) != 2:
        raise Exception(
            '--initial-match-mismatch should be of the form \'match:mismatch\', but I got '
            + str(args.n_max_per_region))
    args.annotation_clustering_thresholds = utils.get_arg_list(
        args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds,
                                                   floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [
                int(cluster_size)
                for cluster_size in args.small_clusters_to_ignore.split('-')
            ]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(
                args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip(
        )  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' %
                            (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception(
                'can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [
                args.seed_unique_id
            ] + args.queries_to_include  # may as well put it first, I guess (?)

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    # if n_procs < 1 or n_procs > 9999:  # It happened, at least once. You know, probably.
    #     raise Exception('bad n_procs %s' % n_procs)
    if args.n_procs > args.n_max_procs:
        print 'reducing n procs %d to --n-max-procs %d' % (args.n_procs,
                                                           args.n_max_procs)
        args.n_procs = args.n_max_procs
    if args.n_fewer_procs > args.n_max_procs:
        print 'reducing n procs %d to --n-max-procs %d' % (args.n_fewer_procs,
                                                           args.n_max_procs)
        args.n_fewer_procs = args.n_max_procs

    if args.print_git_commit or args.action == 'version':
        print 'RUN ' + ' '.join(sys.argv)
        tag = check_output(['git', 'tag']).split()[-1]
        print '       tag %s' % tag
        print '    commit %s' % check_output(['git', 'rev-parse', 'HEAD'
                                              ]).strip()
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception(
                'can only pass true clonal families to multi-hmm together on simulation and with --is-simu set'
            )
        if args.n_simultaneous_seqs is not None:
            raise Exception(
                'can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs'
            )

    if args.no_indels and args.gap_open_penalty < 1000:
        print 'forcing --gap-open-penalty to 1000 to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)'
        args.gap_open_penalty = 1000

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not

        def choose_random_subdir(dirname):
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname

        if args.batch_system is not None and os.path.exists(
                '/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir(
                '/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) +
                '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir(
                '/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
            if args.batch_system is not None:
                print '  %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (
                    utils.color('red',
                                'warning'), args.batch_system, args.workdir)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (
                utils.color('red', 'warning'), args.workdir)

    if args.cluster_annotation_fname is None and args.outfname is not None:
        args.cluster_annotation_fname = args.outfname.replace(
            utils.getsuffix(args.outfname), '-cluster-annotations.csv')

    if args.calculate_alternative_naive_seqs or (
            args.action == 'view-alternative-naive-seqs'
            and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception(
                'have to specify --outfname in order to calculate alternative naive sequences'
            )
        args.persistent_cachefname = args.outfname.replace(
            '.csv', '-hmm-cache.csv')
        if args.calculate_alternative_naive_seqs and os.path.exists(
                args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print '  note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname
                os.remove(args.persistent_cachefname)
            else:
                raise Exception(
                    'persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)'
                    % args.persistent_cachefname)

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color(
            'yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception(
                'can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.presto_output and args.aligned_germline_fname is None:
        raise Exception(
            'in order to get presto output, you have to set --aligned-germline-fname (a fasta file with germline alignments for every germline gene)'
        )

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception(
            'if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters '
        )

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.flat_mute_freq is not None or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch
Exemplo n.º 19
0
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     simglfo=None,
                     quiet=False):
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
Exemplo n.º 20
0
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print '  note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'

    args.light_chain_fractions = utils.get_arg_list(args.light_chain_fractions,
                                                    key_val_pairs=True,
                                                    floatify=True)
    if args.light_chain_fractions is not None and not utils.is_normed(
            args.light_chain_fractions.values()):
        raise Exception('--light-chain-fractions %s don\'t add to 1: %f' %
                        (args.light_chain_fractions,
                         sum(args.light_chain_fractions.values())))
    if args.action == 'merge-paired-partitions':
        assert args.paired_loci
    if args.paired_loci:
        args.locus = None
        if [args.infname, args.paired_indir].count(None) == 0:
            raise Exception('can\'t specify both --infname and --paired-indir')
        if args.outfname is not None:
            raise Exception(
                'can\'t set --outfname if --paired-loci is set (use --paired-outdir)'
            )
        if args.plotdir == 'paired-outdir':
            args.plotdir = args.paired_outdir
        if args.plotdir is None and args.action == 'plot-partitions':
            args.plotdir = args.paired_outdir
    else:
        assert args.paired_indir is None
    if not args.paired_loci and (args.paired_indir is not None
                                 or args.paired_outdir is not None):
        raise Exception(
            '--paired-loci must be set if either --paired-indir or --paired-outdir is set'
        )
    if args.reverse_negative_strands and not args.paired_loci:
        raise Exception(
            '--reverse-negative-strands has no effect unless --paired-loci is set (maybe need to run bin/split-loci.py separately?)'
        )

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' %
                            (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region,
                                               intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception(
            'n-max-per-region should be of the form \'x:y:z\', but I got ' +
            str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(
        args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(
            args.write_additional_cluster_annotations) != 2:
        raise Exception(
            '--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s'
            % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(
        args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices,
                                              intify_with_ranges=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths,
                                                   intify=True)

    args.region_end_exclusions = {
        r: [
            args.region_end_exclusion_length if
            ('%s_%s' % (r, e)) in utils.real_erosions else 0
            for e in ['5p', '3p']
        ]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(
        args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception(
            'wrong length for --typical-genes-per-region-per-subject, has to be three'
        )
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {
        r:
        tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)]
        for r in utils.regions
    }
    delattr(args,
            'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject'
            )  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(
        args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds,
                                                   floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [
                int(cluster_size)
                for cluster_size in args.small_clusters_to_ignore.split('-')
            ]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(
                args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip(
        )  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' %
                            (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception(
                'can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [
                args.seed_unique_id
            ] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        utils.get_version_info(debug=True)
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.collapse_duplicate_sequences and not args.is_data:
        print '  %s collapsing duplicates on simulation, which is often not a good idea since it makes keeping track of performance harder (e.g. purity/completeness of partitions is harder to calculate)' % utils.color(
            'red', 'warning')

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception(
                'can only pass true clonal families to multi-hmm together on simulation and with --is-simu set'
            )
        if args.n_simultaneous_seqs is not None:
            raise Exception(
                'can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs'
            )
        if args.all_seqs_simultaneous:
            raise Exception(
                'can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs'
            )
        if args.action == 'partition':
            raise Exception(
                'can\'t set --simultaneous-true-clonal-seqs when partitioning')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception(
            'doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.'
        )

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' %
                            args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(
        args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print '  note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [
                    n for n in args.n_indels_per_indeld_seq if n > 1
                ]
                args.n_indels_per_indeld_seq = [
                    n for n in args.n_indels_per_indeld_seq if n <= 1
                ]
        else:
            raise Exception(
                '--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500'
                % args.indel_location)

    if args.locus is not None and 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (
                utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output and not args.airr_output and not args.generate_trees:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' %
                            utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (
                utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'
                           ] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception(
                'have to use \'view-output\' action to view .yaml output files'
            )

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(
                args.outfname) != '.tsv':
            raise Exception(
                '--outfname suffix has to be .tsv for annotation with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(
                args.outfname) not in ['.fa', '.fasta']:
            raise Exception(
                '--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            assert args.locus is not None
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (
                args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception(
                '--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output'
                % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) == '.tsv':
            print '  note: writing only airr .tsv to %s' % args.outfname
        elif utils.getsuffix(args.outfname) in ['.yaml', '.csv']:
            print '  note: writing both partis %s to %s and airr .tsv to %s' % (
                utils.getsuffix(args.outfname), args.outfname,
                utils.replace_suffix(args.outfname, '.tsv'))
        else:
            raise Exception(
                '--outfname suffix has to be either .tsv or .yaml if --airr-output is set (got %s)'
                % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(
            args.outfname
    ) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix(
            '-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None and args.paired_outdir is None:
        raise Exception(
            'have to specify --outfname in order to calculate alternative annotations'
        )
    if args.subcluster_annotation_size == 'None':  # i want it turned on by default, but also to be able to turn it off on the command line
        args.subcluster_annotation_size = None
    else:
        args.subcluster_annotation_size = int(
            args.subcluster_annotation_size
        )  # can't set it in add_argument(), sigh
    if args.subcluster_annotation_size is not None:
        if args.calculate_alternative_annotations or args.write_additional_cluster_annotations is not None:
            raise Exception(
                'can\'t set either --calculate-alternative-annotations or --write-additional-cluster-annotations if --subcluster-annotation-size is also set (you get duplicate annotations, which confuses and crashes things, plus it doesn\'t really make sense -- alternative annotations should be calculated on the subcluster annotations now)'
            )
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(
                args.outfname
            ) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.min_largest_cluster_size is not None and args.n_final_clusters is not None:
        print '  note: both --min-largest-cluster-size and --n-final-clusters are set, which means we\'ll stop clustering when *either* of their criteria are satisfied (not both)'  # maybe it should be both, but whatever

    if not args.paired_loci and (args.action == 'get-selection-metrics'
                                 or args.get_selection_metrics):
        if args.outfname is None and args.selection_metric_fname is None:
            print '    %s calculating selection metrics, but neither --outfname nor --selection-metric-fname were set, which means nothing will be written to disk' % utils.color(
                'yellow', 'warning')
        elif args.selection_metric_fname is None and args.action == 'get-selection-metrics' and not args.add_selection_metrics_to_outfname:
            args.selection_metric_fname = utils.insert_before_suffix(
                '-selection-metrics', args.outfname)

    if args.plot_annotation_performance:
        if args.plotdir is None and args.print_n_worst_annotations is None:
            raise Exception(
                'doesn\'t make sense to set --plot-annotation-performance but not either of --plotdir or --print-n-worst-annotations (we\'ll spend all the cycles counting things up but then they\'ll just disappear from memory without being recorded).'
            )
        if not args.is_simu:
            raise Exception(
                'can\'t plot performance unless --is-simu is set (and this is simulation)'
            )
    if args.print_n_worst_annotations is not None and not args.plot_annotation_performance:
        raise Exception(
            '--plot-annotation-performance must be set if you\'re setting --print-worst-annotations'
        )
    if not args.paired_loci and (
            args.action == 'plot-partitions' or args.action == 'annotate'
            and args.plot_partitions) and args.plotdir is None:
        raise Exception('--plotdir must be specified if plotting partitions')
    if args.action == 'annotate' and args.plot_partitions and args.input_partition_fname is None:  # could set this up to use e.g. --simultaneous-true-clonal-seqs as well, but it can't atm
        print '  %s running annotate with --plot-partitions, but --input-partition-fname is not set, which likely means the partitions will be trivial/singleton partitions' % utils.color(
            'yellow', 'warning')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.action == 'simulate':
        if args.n_trees is None and not args.paired_loci:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.n_procs > args.n_sim_events:
            print '  note: reducing --n-procs to %d (was %d) so it isn\'t bigger than --n-sim-events' % (
                args.n_sim_events, args.n_procs)
            args.n_procs = args.n_sim_events
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        if args.outfname is None and args.paired_outdir is None:
            print '  note: no %s specified, so nothing will be written to disk' % (
                '--paired-outdir' if args.paired_loci else '--outfname')
            args.outfname = get_dummy_outfname(
                args.workdir
            )  # hackey, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname

        if args.simulate_from_scratch:
            args.rearrange_from_scratch = True
            args.mutate_from_scratch = True
        if args.rearrange_from_scratch and not args.force_dont_generate_germline_set:  # i would probably just default to always generating germline sets when rearranging from scratch, but bin/test-germline-inference.py (and any other case where you want to dramatically restrict the germline set) really argue for a way to force just using the genes in the germline dir
            args.generate_germline_set = True
        if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
            assert args.mutate_from_scratch
        if args.mutate_from_scratch and not args.no_per_base_mutation:
            print '  note: setting --no-per-base-mutation since --mutate-from-scratch was set'
            args.no_per_base_mutation = True

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception(
                    'can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)'
                )
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception(
                    'can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set'
                )
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir'
            )
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir'
            )
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception(
                'have to either set --rearrange-from-scratch or --reco-parameter-dir (or --simulate-from-scratch)'
            )
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception(
                'have to either set --mutate-from-scratch or --shm-parameter-dir (or --simulate-from-scratch)'
            )

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception(
                'can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)'
            )

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

        if args.generate_trees:
            assert args.n_procs == 1  # not set up to handle output, and also no need

        if args.treefname is not None:
            raise Exception(
                '--treefname was set for simulation action (probably meant to use --input-simulation-treefname)'
            )

    if args.parameter_dir is not None and not args.paired_loci:  # if we're splitting loci, this isn't the normal parameter dir, it's a parent of that
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir):
            pdirs = [
                d for d in os.listdir(args.parameter_dir) if os.path.isdir(d)
            ]
            if len(pdirs) > 0 and len(
                    set(pdirs) & set(utils.parameter_type_choices)) == 0:
                raise Exception(
                    'couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?'
                    % (args.parameter_dir, ' '.join(
                        utils.parameter_type_choices), ' '.join(
                            os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.action not in actions_not_requiring_input and [
            args.infname, args.paired_indir
    ].count(None) == 2:
        if args.paired_loci:
            raise Exception(
                '--infname or --paired-indir is required for action \'%s\' with --paired-loci'
                % args.action)
        else:
            raise Exception('--infname is required for action \'%s\'' %
                            args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception(
                'have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs'
            )
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(
                args.extra_annotation_columns, ['linearham-info'])

    if args.ete_path is not None and args.ete_path == 'None':  # it's nice to be able to unset this from the command line (so we don't make the slow tree plots)
        args.ete_path = None
Exemplo n.º 21
0
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)'
        args.action = 'annotate'

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (
            args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' %
                            (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region,
                                               intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception(
            'n-max-per-region should be of the form \'x:y:z\', but I got ' +
            str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(
        args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(
            args.write_additional_cluster_annotations) != 2:
        raise Exception(
            '--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s'
            % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(
        args.extra_annotation_columns, choices=utils.extra_annotation_headers)
    if args.linearham:
        assert args.action == 'partition', '--linearham mode must be run with \'partis partition\''
        args.extra_annotation_columns = utils.add_lists(
            args.extra_annotation_columns, ['flexbounds', 'relpos'])

    args.cluster_indices = utils.get_arg_list(args.cluster_indices,
                                              intify=True)

    args.region_end_exclusions = {
        r: [
            args.region_end_exclusion_length if
            ('%s_%s' % (r, e)) in utils.real_erosions else 0
            for e in ['5p', '3p']
        ]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.annotation_clustering_thresholds = utils.get_arg_list(
        args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds,
                                                   floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [
                int(cluster_size)
                for cluster_size in args.small_clusters_to_ignore.split('-')
            ]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(
                args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip(
        )  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' %
                            (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception(
                'can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [
                args.seed_unique_id
            ] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        print 'RUN ' + ' '.join(sys.argv)
        tag = subprocess.check_output(['git', 'tag']).split()[-1]
        print '       tag %s' % tag
        print '    commit %s' % subprocess.check_output(
            ['git', 'rev-parse', 'HEAD']).strip()
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception(
                'can only pass true clonal families to multi-hmm together on simulation and with --is-simu set'
            )
        if args.n_simultaneous_seqs is not None:
            raise Exception(
                'can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs'
            )

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' %
                            args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(
        args.n_indels_per_indeld_seq, intify=True)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not

        def choose_random_subdir(dirname):
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname

        if args.batch_system is not None and os.path.exists(
                '/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir(
                '/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) +
                '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir(
                '/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
            if args.batch_system is not None:
                print '  %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (
                    utils.color('red',
                                'warning'), args.batch_system, args.workdir)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (
                utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' %
                            utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still work fine, but the new default .yaml format is much cleaner, and includes annotations, partitions, and germline info in the same file.' % (
                utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'
                           ] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception(
                'have to use \'view-output\' action to view .yaml output files'
            )

    if args.presto_output:
        if args.action == 'annotate' and utils.getsuffix(
                args.outfname) != '.tsv':
            raise Exception(
                '--outfname suffix has to be .tsv for annotation with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(
                args.outfname) not in ['.fa', 'fasta']:
            raise Exception(
                '--outfname suffix has to be .fa or .fasta for partition with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            raise Exception(
                'in order to get presto output, you have to set --aligned-germline-fname to a fasta file with germline alignments for every germline gene, an example is located in data/germlines/imgt-aligned-igh.fa (this isn\'t set by default because imgt alignments are subject to change)'
            )

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(
            args.outfname
    ) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix(
            '-cluster-annotations', args.outfname)

    if args.calculate_alternative_naive_seqs or (
            args.action == 'view-alternative-naive-seqs'
            and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception(
                'have to specify --outfname in order to calculate alternative naive sequences'
            )
        args.persistent_cachefname = utils.insert_before_suffix(
            '-hmm-cache', args.outfname)
        if args.calculate_alternative_naive_seqs and os.path.exists(
                args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print '  note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname
                os.remove(args.persistent_cachefname)
            else:
                raise Exception(
                    'persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)'
                    % args.persistent_cachefname)

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color(
            'yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception(
                'can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified ')

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print '  %s setting subsimproc' % utils.color('red', 'warning')
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print '  note: no --outfname specified, so nothing will be written to disk'
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception(
                    'can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)'
                )
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception(
                    'can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set'
                )
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir'
            )
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir'
            )
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception(
                'have to either set --rearrange-from-scratch or --reco-parameter-dir'
            )
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception(
                'have to either set --mutate-from-scratch or --shm-parameter-dir'
            )

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception(
                'can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)'
            )

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception(
            'if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters '
        )

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.infname is None and args.action not in [
            'simulate', 'view-output', 'view-annotations', 'view-partitions',
            'view-cluster-annotations', 'plot-partitions',
            'view-alternative-naive-seqs'
    ]:
        raise Exception('--infname is required for action \'%s\'' %
                        args.action)
Exemplo n.º 22
0
# Remaining command-line options (<parser> is defined outside this excerpt, as are the options
# behind args.new_seq_file, args.partis_output_file, args.glfo_dir and args.partition_index used below).
parser.add_argument('--locus', default='igh')
parser.add_argument('--outfile', required=True, help='output partis yaml file')
parser.add_argument('--debug', action='store_true')
parser.add_argument(
    '--n-test-subset-seqs',
    type=int,
    help=
    'take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)'
)
args = parser.parse_args()

# read the new sequences (each entry is indexed by 'name' further down)
new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
print '    read %d seqs from %s' % (len(new_seqfos), args.new_seq_file)

# for csv output the germline info has to be read separately from --glfo-dir; otherwise <glfo> is
# passed to read_output() as None
glfo = None
if utils.getsuffix(args.partis_output_file) == '.csv':
    print '    reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

# read_output() hands back the germline info, the annotations, and the cluster path
glfo, annotation_list, cpath = utils.read_output(args.partis_output_file,
                                                 glfo=glfo,
                                                 locus=args.locus)
if args.partition_index is not None:
    print '  using non-best partition index %d (best is %d)' % (
        args.partition_index, cpath.i_best)
# use the best partition unless a specific index was requested
partition = cpath.partitions[cpath.i_best if args.
                             partition_index is None else args.partition_index]
print '    read partition with %d clusters from %s' % (len(partition),
                                                       args.partis_output_file)

# names of all the new sequences
new_uids = set(sfo['name'] for sfo in new_seqfos)