예제 #1
0
def read_ramesh_file(fname, outdir, debug=False):
    seqfos = utils.read_fastx(fname)
    glseqs = {
        l: {r: {}
            for r in utils.loci[l]}
        for l in utils.loci if 'ig' in l
    }
    for sfo in seqfos:
        if os.path.basename(fname) == 'coding.fa':
            meta = [x.strip('[]').split('=') for x in sfo['infostrs']]
            mdict = {m[0]: m[1] for m in meta if len(m) == 2}
            if 'gene' not in mdict:
                print 'no gene for %s' % sfo['infostrs']
                continue
            gene = mdict['gene']
        else:
            mdict = {}
            gene = sfo['name']
        if debug:
            print gene
        if utils.is_constant_gene(gene):
            if debug:
                print '  constant'
            continue
        region = utils.get_region(gene)
        utils.split_gene(gene)
        # if 'partial' in mdict:
        #     gene += '_partial_%s' % mdict['partial'].replace('\'', '').replace(',', '')
        if sfo['seq'] in glseqs[utils.get_locus(gene)][region].values():
            if debug:
                print '  duplicate'
            continue
        glseqs[utils.get_locus(gene)][region][gene] = sfo['seq']

    return glseqs
예제 #2
0
파일: glutils.py 프로젝트: Annak17/partis
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    for seq_record in SeqIO.parse(fname, 'fasta'):
        linefo = [p.strip() for p in seq_record.description.split('|')]

        # first get gene name
        if linefo[0][:2] != 'IG':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = linefo[imgt_info_indices.index('gene')]
            functionality = linefo[imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality == 'P':
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = linefo[0]
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))

        # then the sequence
        seq = str(seq_record.seq).upper()
        if not aligned:
            seq = utils.remove_gaps(seq)
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))

        seqs[utils.get_region(gene)][gene] = seq

    if n_skipped_pseudogenes > 0:
        print '    skipped %d pseudogenes' % n_skipped_pseudogenes
예제 #3
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        import plotting
        print '  plotting parameters',
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall)

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[1]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = 'string' if column in self.string_columns else 'int'

            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv)

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print '(%.1f sec)' % (time.time()-start)
예제 #4
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seqfo in utils.read_fastx(fname):
        # first get gene name
        if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = seqfo['infostrs'][imgt_info_indices.index('gene')]
            functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = seqfo['name']
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))
        if gene in seqs[utils.get_region(gene)]:
            raise Exception('gene name %s appears twice in %s' % (gene, fname))

        # then the sequence
        seq = seqfo['seq']
        if not aligned:
            seq = utils.remove_gaps(seq)
        if 'Y' in seq:
            print '      replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene))
            seq = seq.replace('Y', 'N')
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [len(gl) for gl in seq_to_gene_map.values()]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print '  mutliple names in %s for the following sequences:' % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print '    %-50s   %s' % (' '.join(genelist), seq)
        raise Exception('please de-duplicate the fasta and re-run.')

    if n_skipped_pseudogenes > 0:
        print '    skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
예제 #5
0
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seq_record in SeqIO.parse(fname, "fasta"):
        linefo = [p.strip() for p in seq_record.description.split("|")]

        # first get gene name
        if linefo[0][:2] != "IG":  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = linefo[imgt_info_indices.index("gene")]
            functionality = linefo[imgt_info_indices.index("functionality")]
            if functionality not in functionalities:
                raise Exception("unexpected functionality %s in %s" % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = linefo[0]
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(
            os.path.basename(fname)
        ):  # if <aligned> is True, file name is expected to be whatever
            raise Exception(
                "gene %s from %s has unexpected region %s" % (gene, os.path.basename(fname), utils.get_region(gene))
            )
        if gene in seqs[utils.get_region(gene)]:
            raise Exception("gene name %s appears twice in %s" % (gene, fname))

        # then the sequence
        seq = str(seq_record.seq).upper()
        if not aligned:
            seq = utils.remove_gaps(seq)
        if "Y" in seq:
            print "      replacing Y --> N (%d of 'em) in %s" % (seq.count("Y"), utils.color_gene(gene))
            seq = seq.replace("Y", "N")
        if (
            len(seq.strip("".join(utils.expected_characters))) > 0
        ):  # return the empty string if it only contains expected characters
            raise Exception(
                "unexpected character %s in %s (expected %s)"
                % (seq.strip("".join(utils.expected_characters)), seq, " ".join(utils.expected_characters))
            )
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [
        len(gl) for gl in seq_to_gene_map.values()
    ]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print "  mutliple names in %s for the following sequences:" % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print "    %-50s   %s" % (" ".join(genelist), seq)
        raise Exception("please de-duplicate the fasta and re-run.")

    if n_skipped_pseudogenes > 0:
        print "    skipped %d %s pseudogenes (leaving %d)" % (
            n_skipped_pseudogenes,
            utils.get_region(os.path.basename(fname)),
            len(seqs[utils.get_region(os.path.basename(fname))]),
        )
예제 #6
0
def process(args):
    if args.action == 'run-viterbi':
        print'  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print'  note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'
    if args.calculate_alternative_naive_seqs:
        print '    note: replacing deprecated option \'--calculate-alternative-naive-seqs\' with new option \'--calculate-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.calculate_alternative_annotations = True
        delattr(args, 'calculate_alternative_naive_seqs')

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        print '  commit: %s' % subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        cmd = 'git describe --always --tags'
        out, err = utils.simplerun(cmd, return_out_err=True, debug=False)
        if '-' in out:
            if out.count('-') == 2:
                tag, n_ahead, commit_hash_abbrev = out.strip().split('-')
                ahead_str = ''
                if int(n_ahead) > 0:
                    ahead_str = '  (well, %d commits ahead of)' % int(n_ahead)
                print '     tag: %s%s' % (tag, ahead_str)
            else:
                print '    couldn\'t figure out tag from \'%s\' output: %s' % (cmd, out)
        else:
            tag = out.strip()
            print '     tag: %s' % tag

        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print '  note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1]
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output and not args.airr_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color('yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception('can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified for plot-partitions')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print '  %s setting subsimproc' % utils.color('red', 'warning')
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print '  note: no --outfname specified, so nothing will be written to disk'
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the wole run_simulation() in bin/partis to handle None type outfname
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir) and len(set(os.listdir(args.parameter_dir)) & set(parameter_type_choices)) == 0:
            raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.infname is None and args.action not in ['simulate', 'view-output', 'view-annotations', 'view-partitions', 'view-cluster-annotations', 'plot-partitions', 'view-alternative-annotations', 'get-tree-metrics', 'get-linearham-info']:
        raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])
예제 #7
0
def process(args):
    """Validate, normalize, and cross-check the parsed command line arguments in <args>, modifying <args> in place.

    The steps, in order:
      - deprecated action names are replaced with their current names
      - list-valued options are converted with utils.get_arg_list()
      - several derived attributes are set (e.g. region_end_exclusions, min_allele_prevalence_fractions, is_data),
        and the attributes min_allele_prevalence_fraction and typical_genes_per_region_per_subject are deleted
      - defaults that depend on other options are filled in (e.g. workdir, sw_debug, cluster_annotation_fname)
      - inconsistent or unsupported option combinations are rejected

    Parameters:
        args: the argument namespace. The attributes read here are assumed to have been added by the argument parser elsewhere in this file (not visible from this function).

    Returns:
        None (everything is communicated by modifying <args>).

    Raises:
        Exception: for invalid values or incompatible combinations of options.
        AssertionError: for the few consistency checks that are written as asserts.
        SystemExit: via sys.exit(0) after printing version info if the action is 'version'.
    """
    # ----------------------------------------------------------------------------------------
    # deprecated action names
    if args.action == 'run-viterbi':
        print('  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)')
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print('  note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)')
        args.action = 'view-alternative-annotations'

    # ----------------------------------------------------------------------------------------
    # paired heavy/light options
    args.light_chain_fractions = utils.get_arg_list(args.light_chain_fractions, key_val_pairs=True, floatify=True)
    if args.light_chain_fractions is not None and not utils.is_normed(args.light_chain_fractions.values()):
        raise Exception('--light-chain-fractions %s don\'t add to 1: %f' % (args.light_chain_fractions, sum(args.light_chain_fractions.values())))
    if args.action == 'merge-paired-partitions':
        assert args.paired_loci
    if args.paired_loci:
        args.locus = None
        if [args.infname, args.paired_indir].count(None) == 0:
            raise Exception('can\'t specify both --infname and --paired-indir')
        if args.outfname is not None:
            raise Exception('can\'t set --outfname if --paired-loci is set (use --paired-outdir)')
        if args.plotdir == 'paired-outdir':
            args.plotdir = args.paired_outdir
        if args.plotdir is None and args.action == 'plot-partitions':
            args.plotdir = args.paired_outdir
    else:
        assert args.paired_indir is None  # note that this fires before the more descriptive check just below can complain about --paired-indir
    if not args.paired_loci and (args.paired_indir is not None or args.paired_outdir is not None):
        raise Exception('--paired-loci must be set if either --paired-indir or --paired-outdir is set')
    if args.reverse_negative_strands and not args.paired_loci:
        raise Exception('--reverse-negative-strands has no effect unless --paired-loci is set (maybe need to run bin/split-loci.py separately?)')

    # ----------------------------------------------------------------------------------------
    # convert list-valued options
    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify_with_ranges=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    # per-region [5p, 3p] exclusion lengths: the single command line value for real erosions, zero otherwise
    args.region_end_exclusions = {
        r: [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    # scale the (single) min prevalence fraction for each region by the ratio of the typical number of v genes to the typical number of genes for that region
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {
        r: tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)]
        for r in utils.regions
    }
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:  # an inclusive range 'lo-hi'
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)

    # ----------------------------------------------------------------------------------------
    # seed sequence
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        utils.get_version_info(debug=True)
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.collapse_duplicate_sequences and not args.is_data:
        print('  %s collapsing duplicates on simulation, which is often not a good idea since it makes keeping track of performance harder (e.g. purity/completeness of partitions is harder to calculate)' % utils.color('red', 'warning'))

    # ----------------------------------------------------------------------------------------
    # simultaneous sequence options
    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
        if args.action == 'partition':
            raise Exception('can\'t set --simultaneous-true-clonal-seqs when partitioning')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    # ----------------------------------------------------------------------------------------
    # indel options
    if args.no_indels:
        print('forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty)
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    # this used to be nested inside a check that the frequency was positive, which meant that negative values were never rejected
    if args.indel_frequency is not None and (args.indel_frequency < 0. or args.indel_frequency > 1.):
        raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        try:
            indel_position = int(args.indel_location)
        except ValueError:  # not an integer string, so fall through to the descriptive exception below (rather than crashing with a bare ValueError)
            indel_position = None
        if indel_position is not None and 0 <= indel_position < 500:
            args.indel_location = indel_position
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print('  note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1])
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if args.locus is not None and 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    # ----------------------------------------------------------------------------------------
    # workdir and batch system
    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print('%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir))

    # ----------------------------------------------------------------------------------------
    # output file names and formats
    if args.outfname is not None and not args.presto_output and not args.airr_output and not args.generate_trees:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print('  %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname)))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            assert args.locus is not None
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)  # note that this uses the germline dir before the species subdir is (maybe) appended to it further down
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)
    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) == '.tsv':
            print('  note: writing only airr .tsv to %s' % args.outfname)
        elif utils.getsuffix(args.outfname) in ['.yaml', '.csv']:
            print('  note: writing both partis %s to %s and airr .tsv to %s' % (utils.getsuffix(args.outfname), args.outfname, utils.replace_suffix(args.outfname, '.tsv')))
        else:
            raise Exception('--outfname suffix has to be either .tsv or .yaml if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    # ----------------------------------------------------------------------------------------
    # alternative/subcluster annotations
    if args.calculate_alternative_annotations and args.outfname is None and args.paired_outdir is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.subcluster_annotation_size == 'None':  # i want it turned on by default, but also to be able to turn it off on the command line
        args.subcluster_annotation_size = None
    else:
        args.subcluster_annotation_size = int(args.subcluster_annotation_size)  # can't set it in add_argument(), sigh
    if args.subcluster_annotation_size is not None:
        if args.calculate_alternative_annotations or args.write_additional_cluster_annotations is not None:
            raise Exception('can\'t set either --calculate-alternative-annotations or --write-additional-cluster-annotations if --subcluster-annotation-size is also set (you get duplicate annotations, which confuses and crashes things, plus it doesn\'t really make sense -- alternative annotations should be calculated on the subcluster annotations now)')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.min_largest_cluster_size is not None and args.n_final_clusters is not None:
        print('  note: both --min-largest-cluster-size and --n-final-clusters are set, which means we\'ll stop clustering when *either* of their criteria are satisfied (not both)')  # maybe it should be both, but whatever

    # ----------------------------------------------------------------------------------------
    # selection metrics and plotting
    if not args.paired_loci and (args.action == 'get-selection-metrics' or args.get_selection_metrics):
        if args.outfname is None and args.selection_metric_fname is None:
            print('    %s calculating selection metrics, but neither --outfname nor --selection-metric-fname were set, which means nothing will be written to disk' % utils.color('yellow', 'warning'))
        elif args.selection_metric_fname is None and args.action == 'get-selection-metrics' and not args.add_selection_metrics_to_outfname:
            args.selection_metric_fname = utils.insert_before_suffix('-selection-metrics', args.outfname)

    if args.plot_annotation_performance:
        if args.plotdir is None and args.print_n_worst_annotations is None:
            raise Exception('doesn\'t make sense to set --plot-annotation-performance but not either of --plotdir or --print-n-worst-annotations (we\'ll spend all the cycles counting things up but then they\'ll just disappear from memory without being recorded).')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set (and this is simulation)')
    if args.print_n_worst_annotations is not None and not args.plot_annotation_performance:
        raise Exception('--plot-annotation-performance must be set if you\'re setting --print-worst-annotations')
    if not args.paired_loci and (args.action == 'plot-partitions' or (args.action == 'annotate' and args.plot_partitions)) and args.plotdir is None:
        raise Exception('--plotdir must be specified if plotting partitions')
    if args.action == 'annotate' and args.plot_partitions and args.input_partition_fname is None:  # could set this up to use e.g. --simultaneous-true-clonal-seqs as well, but it can't atm
        print('  %s running annotate with --plot-partitions, but --input-partition-fname is not set, which likely means the partitions will be trivial/singleton partitions' % utils.color('yellow', 'warning'))

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    # ----------------------------------------------------------------------------------------
    # simulation
    if args.action == 'simulate':
        if args.n_trees is None and not args.paired_loci:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.n_procs > args.n_sim_events:
            print('  note: reducing --n-procs to %d (was %d) so it isn\'t bigger than --n-sim-events' % (args.n_sim_events, args.n_procs))
            args.n_procs = args.n_sim_events
        if args.n_max_queries != -1:
            print('  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)')

        if args.outfname is None and args.paired_outdir is None:
            print('  note: no %s specified, so nothing will be written to disk' % ('--paired-outdir' if args.paired_loci else '--outfname'))
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname

        if args.simulate_from_scratch:
            args.rearrange_from_scratch = True
            args.mutate_from_scratch = True
        if args.rearrange_from_scratch and not args.force_dont_generate_germline_set:  # i would probably just default to always generating germline sets when rearranging from scratch, but bin/test-germline-inference.py (and any other case where you want to dramatically restrict the germline set) really argue for a way to force just using the genes in the germline dir
            args.generate_germline_set = True
        if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
            assert args.mutate_from_scratch
        if args.mutate_from_scratch and not args.no_per_base_mutation:
            print('  note: setting --no-per-base-mutation since --mutate-from-scratch was set')
            args.no_per_base_mutation = True

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir (or --simulate-from-scratch)')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir (or --simulate-from-scratch)')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

        if args.generate_trees:
            assert args.n_procs == 1  # not set up to handle output, and also no need

        if args.treefname is not None:
            raise Exception('--treefname was set for simulation action (probably meant to use --input-simulation-treefname)')

    # ----------------------------------------------------------------------------------------
    # parameter and germline dirs
    if args.parameter_dir is not None and not args.paired_loci:  # if we're splitting loci, this isn't the normal parameter dir, it's a parent of that
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir):
            # os.listdir() returns bare entry names, so they have to be joined to the parent dir before testing them (otherwise isdir() looks in the current working directory, and this check never fires)
            pdirs = [d for d in os.listdir(args.parameter_dir) if os.path.isdir(os.path.join(args.parameter_dir, d))]
            if len(pdirs) > 0 and len(set(pdirs) & set(utils.parameter_type_choices)) == 0:
                raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(utils.parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print('  non-human species \'%s\', turning on allele clustering' % args.species)
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    # ----------------------------------------------------------------------------------------
    # input requirements
    if args.action not in actions_not_requiring_input and [args.infname, args.paired_indir].count(None) == 2:
        if args.paired_loci:
            raise Exception('--infname or --paired-indir is required for action \'%s\' with --paired-loci' % args.action)
        else:
            raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])

    if args.ete_path == 'None':  # it's nice to be able to unset this from the command line (so we don't make the slow tree plots)
        args.ete_path = None
예제 #8
0
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)'
        args.action = 'annotate'

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (
            args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' %
                            (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region,
                                               intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception(
            'n-max-per-region should be of the form \'x:y:z\', but I got ' +
            str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(
        args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(
            args.write_additional_cluster_annotations) != 2:
        raise Exception(
            '--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s'
            % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(
        args.extra_annotation_columns, choices=utils.extra_annotation_headers)
    if args.linearham:
        assert args.action == 'partition', '--linearham mode must be run with \'partis partition\''
        args.extra_annotation_columns = utils.add_lists(
            args.extra_annotation_columns, ['flexbounds', 'relpos'])

    args.cluster_indices = utils.get_arg_list(args.cluster_indices,
                                              intify=True)

    args.region_end_exclusions = {
        r: [
            args.region_end_exclusion_length if
            ('%s_%s' % (r, e)) in utils.real_erosions else 0
            for e in ['5p', '3p']
        ]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.annotation_clustering_thresholds = utils.get_arg_list(
        args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds,
                                                   floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [
                int(cluster_size)
                for cluster_size in args.small_clusters_to_ignore.split('-')
            ]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(
                args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip(
        )  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' %
                            (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception(
                'can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [
                args.seed_unique_id
            ] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        print 'RUN ' + ' '.join(sys.argv)
        tag = subprocess.check_output(['git', 'tag']).split()[-1]
        print '       tag %s' % tag
        print '    commit %s' % subprocess.check_output(
            ['git', 'rev-parse', 'HEAD']).strip()
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception(
                'can only pass true clonal families to multi-hmm together on simulation and with --is-simu set'
            )
        if args.n_simultaneous_seqs is not None:
            raise Exception(
                'can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs'
            )

    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' %
                            args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(
        args.n_indels_per_indeld_seq, intify=True)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not

        def choose_random_subdir(dirname):
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname

        if args.batch_system is not None and os.path.exists(
                '/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir(
                '/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) +
                '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir(
                '/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
            if args.batch_system is not None:
                print '  %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (
                    utils.color('red',
                                'warning'), args.batch_system, args.workdir)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (
                utils.color('red', 'warning'), args.workdir)

    if args.outfname is not None and not args.presto_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' %
                            utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print '  %s --outfname uses deprecated file format %s. This will still work fine, but the new default .yaml format is much cleaner, and includes annotations, partitions, and germline info in the same file.' % (
                utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'
                           ] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception(
                'have to use \'view-output\' action to view .yaml output files'
            )

    if args.presto_output:
        if args.action == 'annotate' and utils.getsuffix(
                args.outfname) != '.tsv':
            raise Exception(
                '--outfname suffix has to be .tsv for annotation with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(
                args.outfname) not in ['.fa', 'fasta']:
            raise Exception(
                '--outfname suffix has to be .fa or .fasta for partition with --presto-output (got %s)'
                % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            raise Exception(
                'in order to get presto output, you have to set --aligned-germline-fname to a fasta file with germline alignments for every germline gene, an example is located in data/germlines/imgt-aligned-igh.fa (this isn\'t set by default because imgt alignments are subject to change)'
            )

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(
            args.outfname
    ) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix(
            '-cluster-annotations', args.outfname)

    if args.calculate_alternative_naive_seqs or (
            args.action == 'view-alternative-naive-seqs'
            and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception(
                'have to specify --outfname in order to calculate alternative naive sequences'
            )
        args.persistent_cachefname = utils.insert_before_suffix(
            '-hmm-cache', args.outfname)
        if args.calculate_alternative_naive_seqs and os.path.exists(
                args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print '  note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname
                os.remove(args.persistent_cachefname)
            else:
                raise Exception(
                    'persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)'
                    % args.persistent_cachefname)

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color(
            'yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception(
                'can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified ')

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print '  %s setting subsimproc' % utils.color('red', 'warning')
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print '  note: no --outfname specified, so nothing will be written to disk'
        if args.n_max_queries != -1:
            print '  note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception(
                    'can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)'
                )
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception(
                    'can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set'
                )
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir'
            )
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception(
                'doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir'
            )
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception(
                'have to either set --rearrange-from-scratch or --reco-parameter-dir'
            )
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception(
                'have to either set --mutate-from-scratch or --shm-parameter-dir'
            )

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception(
                'can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)'
            )

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception(
            'if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters '
        )

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print '  non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.infname is None and args.action not in [
            'simulate', 'view-output', 'view-annotations', 'view-partitions',
            'view-cluster-annotations', 'plot-partitions',
            'view-alternative-naive-seqs'
    ]:
        raise Exception('--infname is required for action \'%s\'' %
                        args.action)
예제 #9
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        print "  plotting parameters",
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall)

        overall_plotdir = plotdir + "/overall"

        for column in self.counts:
            if column == "all":
                continue
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[
                        1
                    ]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = "string" if column in self.string_columns else "int"

            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv,
            )

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + "/" + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + "-" + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(
                        hist,
                        plotname=plotname,
                        plotdir=thisplotdir,
                        xtitle=plotconfig.plot_titles.get(column, column),
                        plottitle=gene,
                        errors=True,
                        write_csv=True,
                        only_csv=only_csv,
                    )
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print "(%.1f sec)" % (time.time() - start)
예제 #10
0
def process(args):
    if args.action == 'run-viterbi':
        print '  note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)'
        args.action = 'annotate'

    if args.chain is not None:
        print '    note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (
            args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.n_procs = utils.get_arg_list(args.n_procs, intify=True)
    args.n_fewer_procs = args.n_procs[0] if len(
        args.n_procs) == 1 else args.n_procs[1]
    args.n_procs = args.n_procs[0]
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' %
                            (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region,
                                               intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception(
            'n-max-per-region should be of the form \'x:y:z\', but I got ' +
            str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(
        args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(
            args.write_additional_cluster_annotations) != 2:
        raise Exception(
            '--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s'
            % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(
        args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.region_end_exclusions = {
        r: [
            args.region_end_exclusion_length if
            ('%s_%s' % (r, e)) in utils.real_erosions else 0
            for e in ['5p', '3p']
        ]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.initial_match_mismatch = utils.get_arg_list(
        args.initial_match_mismatch, intify=True)
    if len(args.initial_match_mismatch) != 2:
        raise Exception(
            '--initial-match-mismatch should be of the form \'match:mismatch\', but I got '
            + str(args.n_max_per_region))
    args.annotation_clustering_thresholds = utils.get_arg_list(
        args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds,
                                                   floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:
            lo, hi = [
                int(cluster_size)
                for cluster_size in args.small_clusters_to_ignore.split('-')
            ]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(
                args.small_clusters_to_ignore, intify=True)
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip(
        )  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' %
                            (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception(
                'can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [
                args.seed_unique_id
            ] + args.queries_to_include  # may as well put it first, I guess (?)

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    # if n_procs < 1 or n_procs > 9999:  # It happened, at least once. You know, probably.
    #     raise Exception('bad n_procs %s' % n_procs)
    if args.n_procs > args.n_max_procs:
        print 'reducing n procs %d to --n-max-procs %d' % (args.n_procs,
                                                           args.n_max_procs)
        args.n_procs = args.n_max_procs
    if args.n_fewer_procs > args.n_max_procs:
        print 'reducing n procs %d to --n-max-procs %d' % (args.n_fewer_procs,
                                                           args.n_max_procs)
        args.n_fewer_procs = args.n_max_procs

    if args.print_git_commit or args.action == 'version':
        print 'RUN ' + ' '.join(sys.argv)
        tag = check_output(['git', 'tag']).split()[-1]
        print '       tag %s' % tag
        print '    commit %s' % check_output(['git', 'rev-parse', 'HEAD'
                                              ]).strip()
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception(
                'can only pass true clonal families to multi-hmm together on simulation and with --is-simu set'
            )
        if args.n_simultaneous_seqs is not None:
            raise Exception(
                'can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs'
            )

    if args.no_indels and args.gap_open_penalty < 1000:
        print 'forcing --gap-open-penalty to 1000 to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)'
        args.gap_open_penalty = 1000

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    if args.workdir is None:  # set default here so we know whether it was set by hand or not

        def choose_random_subdir(dirname):
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname

        if args.batch_system is not None and os.path.exists(
                '/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir(
                '/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) +
                '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir(
                '/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
            if args.batch_system is not None:
                print '  %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (
                    utils.color('red',
                                'warning'), args.batch_system, args.workdir)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (
                utils.color('red', 'warning'), args.workdir)

    if args.cluster_annotation_fname is None and args.outfname is not None:
        args.cluster_annotation_fname = args.outfname.replace(
            utils.getsuffix(args.outfname), '-cluster-annotations.csv')

    if args.calculate_alternative_naive_seqs or (
            args.action == 'view-alternative-naive-seqs'
            and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception(
                'have to specify --outfname in order to calculate alternative naive sequences'
            )
        args.persistent_cachefname = args.outfname.replace(
            '.csv', '-hmm-cache.csv')
        if args.calculate_alternative_naive_seqs and os.path.exists(
                args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print '  note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname
                os.remove(args.persistent_cachefname)
            else:
                raise Exception(
                    'persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)'
                    % args.persistent_cachefname)

    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color(
            'yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception(
                'can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')

    if args.parameter_type != 'hmm':
        print '  using non-default parameter type \'%s\'' % args.parameter_type

    if args.presto_output and args.aligned_germline_fname is None:
        raise Exception(
            'in order to get presto output, you have to set --aligned-germline-fname (a fasta file with germline alignments for every germline gene)'
        )

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception(
            'if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters '
        )

    if os.path.exists(args.default_initial_germline_dir + '/' +
                      args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception(
                '--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d'
                % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.flat_mute_freq is not None or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch