Пример #1
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and convert its final database into a germline set at <outfname>.

    Does nothing if utils.output_exists() reports that <outfname> is already there. Otherwise sets up
    <outdir> with prepare_igdiscover_outdir(), optionally subsamples the input to --n-random-queries
    sequences, runs 'igdiscover init' and 'igdiscover run' from a generated <outdir>/run.sh, and finally
    writes igdiscover's inferred sequences as a glfo directory.

    Arguments:
        infname: input sequence file handed to 'igdiscover init --single-reads'
        outfname: germline fasta path that is expected to exist afterwards (also used to work out the
            germline directory that we write to)
        outdir: igdiscover working directory (run.sh, the subsampled input and work/ all live in here)

    Reads the module-level <args> (n_random_queries, region, locus, env_label, yamlfname, glfo_dir,
    simulation_germline_dir).

    Raises AssertionError if <outfname> is not the path that glutils.get_fname() gives for <args.region>
    in the germline directory derived from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname  # from here on, igdiscover only sees the subsampled file

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # --glfo-dir is required now, so there is no longer a fallback to a default germline dir
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' to get the parent germline dir. NOTE str.rstrip() can't be used for this:
    # it treats its argument as a *set of characters*, so it would also eat into any parent dir name that
    # happens to end with one of those characters.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Пример #2
0
def get_gls_fname(
    outdir,
    method,
    locus,
    sim_truth=False,
    data=False,
    annotation_performance_plots=False
):  # NOTE duplicates/depends on code in test-germline-inference.py
    """Return the path in which to look for the germline set (or annotation performance plots) for <method>.

    Arguments:
        outdir: base output directory
        method: inference method name ('partis', 'full', 'igdiscover', or anything containing 'tigger')
        locus: passed through to glutils.get_fname()
        sim_truth: if set (and <data> isn't), point at the simulation truth germline dir
        data: if set, use the directory layout for real data samples
        annotation_performance_plots: if set, return the annotation performance plot dir instead,
            ignoring every other option

    NOTE(review): <region> is neither a parameter nor a local here, so it is presumably a module-level
    name -- confirm.
    """
    if annotation_performance_plots:
        plotdir = outdir + '/' + method
        return plotdir + '/annotation-performance-plots/sw/mutation'

    is_partis = method in ('partis', 'full')
    gls_dir = outdir
    if data:
        # NOTE the partis path is inside the datascripts output dir, and doesn't use <method> (since we only
        # have partis for a method a.t.m., although could use --label or --extra-str to differentiate)
        gls_dir += '/hmm/germline-sets' if is_partis else '/' + method
    elif sim_truth:
        gls_dir += '/germlines/simulation'
    elif is_partis:
        gls_dir += '/' + method + '/sw/germline-sets'
    elif method == 'igdiscover' or 'tigger' in method:
        gls_dir += '/' + method
    else:
        assert False
    return glutils.get_fname(gls_dir, locus, region)
Пример #3
0
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """Clean up the germline sequences in <glseqs> and write them to <outdir>, one locus at a time.

    For each locus: removes any existing glfo files from <outdir>, writes the raw sequences to per-region
    fastas, reads them back in as a glfo (using the macaque germline set as a template to fill in the
    extra info), runs trim_and_remove_genes() on every v gene, removes any gene whose sequence has
    ambiguous bases, and writes the resulting glfo back to <outdir>.

    Arguments:
        glseqs: nested dict indexed as glseqs[locus][region][gene], with the sequence as the value
        outdir: germline directory to (over)write
        debug: if set, print info on the genes that are processed/removed
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)
        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fn = glutils.get_fname(outdir, locus, region)
            if not os.path.exists(os.path.dirname(fn)):
                os.makedirs(os.path.dirname(fn))
            with open(fn, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir,
                                 locus,
                                 template_glfo=template_glfo,
                                 remove_bad_genes=True,
                                 debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            # group the genes by family: a list of (family label, {gene : seq}) pairs, sorted by label
            group_labels = sorted(
                set([utils.gene_family(g) for g in glfo['seqs'][region]]))
            gene_groups[region] = [(glabel, {
                g: glfo['seqs'][region][g]
                for g in glfo['seqs'][region] if utils.gene_family(g) == glabel
            }) for glabel in group_labels]
        for region in [r for r in utils.regions if r in gene_groups]:
            if debug:
                print '%s' % utils.color('reverse_video',
                                         utils.color('green', region))
            for group_label, group_seqs in gene_groups[
                    region]:  # ok, this isn't really doing anything any more
                if debug:
                    print '  %s' % utils.color('blue', group_label)
                # <group_seqs> is a separate dict built above, so looping over it doesn't touch glfo['seqs'] directly
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region,
                                          gene,
                                          seq,
                                          glfo,
                                          template_glfo,
                                          debug=debug)

        # remove any seqs with ambiguous bases
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            # NOTE(review): this relies on python 2's .items() returning a list (i.e. a snapshot), since
            # glutils.remove_gene() presumably deletes from this same dict -- confirm before porting to python 3
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
Пример #4
0
def get_outfname(args, method, annotation_performance_plots=False, return_parent_gl_dir=False):
    """Return the output location for <method> under <args.outdir>.

    Arguments:
        args: object with <outdir> and <locus> attributes
        method: inference method name (used as the subdirectory name)
        annotation_performance_plots: if set, return the annotation performance plot dir, i.e. the
            product of running partis annotation with --plot-annotation-performance (takes
            precedence over <return_parent_gl_dir>)
        return_parent_gl_dir: if set, return the germline directory itself rather than the v
            germline fasta inside it
    """
    method_dir = args.outdir + '/' + method
    if annotation_performance_plots:
        return method_dir + '/annotation-performance-plots'

    # default: output is igh/ighv.fasta
    if method in ('partis', 'full'):  # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
        method_dir += '/sw/germline-sets'
    if return_parent_gl_dir:
        return method_dir
    return glutils.get_fname(method_dir, args.locus, 'v')
Пример #5
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and convert its final database into a germline set at <outfname>.

    Does nothing if utils.output_exists() reports that <outfname> is already there. Otherwise sets up
    <outdir> with prepare_igdiscover_outdir(), optionally subsamples the input to --n-random-queries
    sequences, runs 'igdiscover init' and 'igdiscover run' from a generated <outdir>/run.sh, and finally
    writes igdiscover's inferred sequences as a glfo directory.

    Arguments:
        infname: input sequence file handed to 'igdiscover init --single-reads'
        outfname: germline fasta path that is expected to exist afterwards (also used to work out the
            germline directory that we write to)
        outdir: igdiscover working directory (run.sh, the subsampled input and work/ all live in here)

    Reads the module-level <args> (n_random_queries, region, locus, condapath, yamlfname, glfo_dir,
    simulation_germline_dir).

    Raises AssertionError if <outfname> is not the path that glutils.get_fname() gives for <args.region>
    in the germline directory derived from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname  # from here on, igdiscover only sees the subsampled file

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' to get the parent germline dir. NOTE str.rstrip() can't be used for this:
    # it treats its argument as a *set of characters*, so it would also eat into any parent dir name that
    # happens to end with one of those characters.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Пример #6
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on the annotations in <infname> and write the resulting germline set.

    Does nothing if utils.output_exists() reports that <outfname> is already there. Otherwise writes an
    R command file to <args.workdir>, runs it, then post-processes the fasta that tigger wrote: alleles
    that aren't in the template germline set are added to it, and template genes that tigger didn't
    report are removed.

    Arguments:
        infname: tab-separated annotation file, read in R with read.csv()
        outfname: germline fasta path that is expected to exist afterwards (also used to work out the
            germline directory that we write to)
        outdir: directory to which tigger writes tigger.fasta

    Reads the module-level <args> (n_procs, workdir, glfo_dir, locus, region).

    Raises AssertionError if <outfname> is not the path that glutils.get_fname() gives for <args.region>
    in the germline directory derived from it.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # everything before the '+' is used as the template gene
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a snapshot of the keys: glutils.remove_gene() presumably deletes from this same dict,
    # and changing a dict's size while iterating over it raises a RuntimeError
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Strip the trailing '/<locus>' to get the parent germline dir. NOTE str.rstrip() can't be used for this:
    # it treats its argument as a *set of characters*, so it would also eat into any parent dir name that
    # happens to end with one of those characters.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Пример #7
0
def prepare_igdiscover_outdir(outdir):
    """Set up <outdir> so that igdiscover can be run in it.

    Creates <outdir> and <outdir>/db if necessary, puts one germline file per region in db/ (a symlink
    to the file in <args.glfo_dir> for regions that <args.locus> has, otherwise a dummy one-sequence
    file), writes a modified copy of the template config <args.yamlfname> to <outdir>, and removes any
    existing <outdir>/work.

    Reads the module-level <args> (glfo_dir, locus, yamlfname, species).

    Raises:
        Exception: if a germline file that we need to link to doesn't exist
        AssertionError: if <args.species> is neither 'human' nor 'macaque'
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if os.path.exists(outdir + '/db'):
        for fn in [get_igd_glsfname(outdir, r) for r in utils.regions]:
            # lexists() rather than exists(), since the latter is False for a dangling symlink, which would
            # then survive and (because of the islink() check below) never get replaced with a valid one
            if os.path.lexists(fn):
                os.remove(fn)
    else:
        os.makedirs(outdir + '/db')
    for region in utils.regions:
        targetname = glutils.get_fname(args.glfo_dir, args.locus, region)
        linkname = get_igd_glsfname(outdir, region)
        if region in utils.getregions(args.locus):
            if not os.path.exists(targetname):
                raise Exception('gl file %s d.n.e.' % targetname)
            if not os.path.islink(linkname):
                subprocess.check_call(['ln', '-s', targetname, linkname])
        else:  # this locus doesn't have this region, so write a dummy file with one short sequence
            with open(linkname, 'w') as dummy_d_file:
                dummy_d_file.write('>%sDx-x*x\n%s\n' % (args.locus.upper(), 'aa'))

    # this is the .yaml in igdiscover/ (but *not* in igdiscover/work/): it has to be written in the parent
    # workdir, and then gets copied to work/
    cfgfname = outdir + '/' + os.path.basename(args.yamlfname)
    if os.path.exists(cfgfname):
        os.remove(cfgfname)
    with open(args.yamlfname) as cfgfile:  # whereas this is the template .yaml in partis/test/
        # safe_load() rather than a bare yaml.load(): the latter with no Loader is deprecated (and an error in
        # newer PyYAML versions), and a plain config file doesn't need arbitrary python object construction
        cfgdata = yaml.safe_load(cfgfile)
    if True:  #not args.gls_gen:
        for filtername in ['pre_germline_filter', 'germline_filter']:
            for cfgvar in ['unique_js', 'unique_cdr3s']:
                cfgdata[filtername][cfgvar] = 0
    if args.species != 'human':
        if args.species == 'macaque':
            cfgdata['species'] = 'rhesus_monkey'
        else:
            assert False, 'unhandled species \'%s\'' % args.species
    with open(cfgfname, 'w') as cfgfile:
        yaml.dump(cfgdata, cfgfile, width=200)

    if os.path.exists(outdir + '/work'):  # igdiscover writes too many different things in there to remove them one by one, so we need the '-r'
        subprocess.check_call(['rm', '-r', outdir + '/work'])
Пример #8
0
def get_gls_fname(
    region,
    outdir,
    method,
    locus,
    sim_truth=False,
    data=False,
    annotation_performance_plots=False
):  # NOTE duplicates/depends on code in test-germline-inference.py
    """Return the germline fasta path for <region>, or the annotation performance plot dir if that is requested.

    Arguments:
        region: passed through to glutils.get_fname()
        outdir: base output directory
        method: inference method name
        locus: passed through to glutils.get_fname()
        sim_truth, data: passed through to get_gls_dir()
        annotation_performance_plots: if set, return the plot dir and ignore <region>, <locus>,
            <sim_truth> and <data>
    """
    if not annotation_performance_plots:
        germline_dir = get_gls_dir(outdir, method, sim_truth=sim_truth, data=data, annotation_performance_plots=annotation_performance_plots)
        return glutils.get_fname(germline_dir, locus, region)

    method_dir = outdir + '/' + method
    return method_dir + '/annotation-performance-plots/sw/mutation'
Пример #9
0
def get_gls_fname(
        outdir,
        method,
        locus,
        sim_truth=False,
        data=False
):  # NOTE duplicates/depends on code in test-allele-finding.py
    """Return the path in which to look for the germline set for <method>.

    Arguments:
        outdir: base output directory
        method: inference method name ('partis', 'full', or 'tigger')
        locus: passed through to glutils.get_fname()
        sim_truth: if set (and <data> isn't), point at the simulation truth germline dir
        data: if set, use the directory layout for real data samples

    NOTE(review): <region> is neither a parameter nor a local here, so it is presumably a module-level
    name -- confirm.
    """
    is_partis = method in ('partis', 'full')
    gls_dir = outdir
    if data:
        # NOTE the partis path is inside the datascripts output dir, and doesn't use <method> (since we only
        # have partis for a method a.t.m., although could use --label or --extra-str to differentiate)
        gls_dir += '/hmm/germline-sets' if is_partis else '/' + method
    elif sim_truth:
        gls_dir += '/germlines/simulation'
    elif is_partis:
        gls_dir += '/' + method + '/sw/germline-sets'
    elif method == 'tigger':
        gls_dir += '/' + method
    else:
        assert False
    return glutils.get_fname(gls_dir, locus, region)
Пример #10
0
def prepare_igdiscover_outdir(outdir):
    """Set up <outdir> so that igdiscover can be run in it.

    Creates <outdir> and <outdir>/db if necessary, symlinks the germline file for each region from
    <args.glfo_dir> into db/, writes a modified copy of the template config <args.yamlfname> to
    <outdir>, and removes any existing <outdir>/work.

    Reads the module-level <args> (glfo_dir, locus, yamlfname).

    Raises subprocess.CalledProcessError if 'ln' or 'rm' fails.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if os.path.exists(outdir + '/db'):
        for fn in [get_igd_glsfname(outdir, r) for r in utils.regions]:
            # lexists() rather than exists(), since the latter is False for a dangling symlink, which would
            # then survive and make the 'ln -s' below fail because the link name is already taken
            if os.path.lexists(fn):
                os.remove(fn)
    else:
        os.makedirs(outdir + '/db')
    for region in utils.regions:
        subprocess.check_call(['ln', '-s', glutils.get_fname(args.glfo_dir, args.locus, region), get_igd_glsfname(outdir, region)])

    # this is the .yaml in igdiscover/ (but *not* in igdiscover/work/): it has to be written in the parent
    # workdir, and then gets copied to work/
    cfgfname = outdir + '/' + os.path.basename(args.yamlfname)
    if os.path.exists(cfgfname):
        os.remove(cfgfname)
    with open(args.yamlfname) as cfgfile:  # whereas this is the template .yaml in partis/test/
        # safe_load() rather than a bare yaml.load(): the latter with no Loader is deprecated (and an error in
        # newer PyYAML versions), and a plain config file doesn't need arbitrary python object construction
        cfgdata = yaml.safe_load(cfgfile)
    if True:  #not args.gls_gen:
        for filtername in ['pre_germline_filter', 'germline_filter']:
            for cfgvar in ['unique_js', 'unique_cdr3s']:
                cfgdata[filtername][cfgvar] = 0
    with open(cfgfname, 'w') as cfgfile:
        yaml.dump(cfgdata, cfgfile, width=200)

    if os.path.exists(outdir + '/work'):  # igdiscover writes too many different things in there to remove them one by one, so we need the -r
        subprocess.check_call(['rm', '-r', outdir + '/work'])
Пример #11
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on the annotations in <infname> and write the resulting germline set.

    Does nothing if utils.output_exists() reports that <outfname> is already there. Otherwise writes an
    R command file to <args.workdir>, runs it with utils.run_cmd(), prints (and then removes) the 'out'
    and 'err' files in <args.workdir>, and converts the fasta that tigger wrote into a glfo directory.

    If R fails with tigger's 'Not enough sample sequences' error, an empty tigger fasta is written and
    we carry on; for any other failure the 'out' and 'err' files are dumped and we exit with R's return
    code.

    Arguments:
        infname: tab-separated annotation file, read in R with read.csv()
        outfname: germline fasta path that is expected to exist afterwards (also used to work out the
            germline directory that we write to)
        outdir: directory to which tigger writes tigger.fasta

    Reads the module-level <args> (tuned_tigger_params, workdir, glfo_dir, locus, region,
    simulation_germline_dir).

    Raises AssertionError if <outfname> is not the path that glutils.get_fname() gives for <args.region>
    in the germline directory derived from it.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = [
        'library(ggplot2)',
        'library(tigger, warn.conflicts=FALSE)',
        'library(dplyr, warn.conflicts=FALSE)',
    ]
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    # rcmds += ['sessionInfo()']
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname

    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:  # wait for R to finish
        time.sleep(0.01)
    if proc.returncode != 0:  # tigger crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ferr.read()
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            # treat this as "tigger found nothing", i.e. write an empty fasta and carry on
            with open(tigger_outfname, 'w') as dummy_outfasta:
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)

    for oe in ['err', 'out']:
        with open(args.workdir + '/' + oe) as oefile:
            print(oefile.read())
        os.remove(args.workdir + '/' + oe)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(tigger_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' to get the parent germline dir. NOTE str.rstrip() can't be used for this:
    # it treats its argument as a *set of characters*, so it would also eat into any parent dir name that
    # happens to end with one of those characters.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Пример #12
0
def get_outfname(args, method):
    """Return the path of the v germline fasta that <method> writes under <args.outdir>.

    Arguments:
        args: object with <outdir> and <locus> attributes
        method: inference method name (used as the subdirectory name)
    """
    # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
    germline_subdir = '/sw/germline-sets' if method in ('partis', 'full') else ''
    return glutils.get_fname(args.outdir + '/' + method + germline_subdir, args.locus, 'v')