def read_linearham_output(): lh_info = {} clusterdirs = glob.glob( '%s/linearham/%s/cluster*' % (args.basedir, args.locus) ) # /mcmciter10000_mcmcthin10_tuneiter5000_tunethin100_numrates4_seed0/burninfrac0.1_subsampfrac0.05/aa_naive_seqs.dnamap' for cdir in clusterdirs: input_seqfos = utils.read_fastx('%s/input_seqs.fasta' % cdir) input_uids = [ sfo['name'] for sfo in input_seqfos if sfo['name'] != 'naive' ] outfnames = subprocess.check_output( ['find', cdir, '-name', 'aa_naive_seqs.dnamap']).strip().split() if len(outfnames) == 0: print ' no linearham output for %s in %s' % ( os.path.basename(cdir), cdir) continue elif len(outfnames) != 1: raise Exception('too many outfnames %s' % outfnames) clusterfo = [] with open(outfnames[0]) as outfile: for line in outfile: if line[0] == '>': # just skip these for now, we're just printing nucleotide level stuff, not aa continue prob, naive_seq = line.strip().split(',') clusterfo.append((naive_seq, float(prob))) clusterfo = sorted( clusterfo, key=operator.itemgetter(1), reverse=True ) # it's sorted by aa naive seq in the file, and within that I think by nuc naive seq? Anyway, we need to make sure lh_info[':'.join(input_uids)] = clusterfo return lh_info
def read_ramesh_file(fname, outdir, debug=False): seqfos = utils.read_fastx(fname) glseqs = { l: {r: {} for r in utils.loci[l]} for l in utils.loci if 'ig' in l } for sfo in seqfos: if os.path.basename(fname) == 'coding.fa': meta = [x.strip('[]').split('=') for x in sfo['infostrs']] mdict = {m[0]: m[1] for m in meta if len(m) == 2} if 'gene' not in mdict: print 'no gene for %s' % sfo['infostrs'] continue gene = mdict['gene'] else: mdict = {} gene = sfo['name'] if debug: print gene if utils.is_constant_gene(gene): if debug: print ' constant' continue region = utils.get_region(gene) utils.split_gene(gene) # if 'partial' in mdict: # gene += '_partial_%s' % mdict['partial'].replace('\'', '').replace(',', '') if sfo['seq'] in glseqs[utils.get_locus(gene)][region].values(): if debug: print ' duplicate' continue glseqs[utils.get_locus(gene)][region][gene] = sfo['seq'] return glseqs
def run_igdiscover(infname, outfname, outdir): if utils.output_exists(args, outfname): return prepare_igdiscover_outdir(outdir) if args.n_random_queries is not None: sub_infname = outdir + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname)))) if os.path.exists(sub_infname): print ' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries else: print ' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries) with open(sub_infname, 'w') as sub_infile: for seqfo in seqfos: sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq'])) infname = sub_infname igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper() cmds = getpathcmd() cmds += ['conda activate %s' % args.env_label] cmds += ['cd %s' % outdir] cmds += ['igdiscover init --db db --single-reads %s work' % infname] # prepares to run, putting files into <outdir> cmds += ['cp %s work/' % os.path.basename(args.yamlfname)] cmds += ['cd work'] cmds += ['igdiscover run'] utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True) template_gldir = args.glfo_dir # if args.glfo_dir is not None else 'data/germlines/ XXX human' # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up) glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir) out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus) assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname glutils.write_glfo(out_gldir, glfo, debug=True)
def alternative_naives_with_probabilities(f):
    """
    Read the ranked naive probabilities fasta <f>, and return a list of (seq, probability) tuples,
    where the probability is whatever follows "_probability_" in each sequence's name.
    """
    naive_probs = []
    for seqfo in partisutils.read_fastx(f):
        naive_seq = seqfo["seq"]
        prob_str = seqfo["name"].split("_probability_")[1]
        naive_probs.append((naive_seq, float(prob_str)))
    return naive_probs
def vsearch_cluster_v_seqs(self, qr_seqs, threshold, debug=False): # then vsearch cluster the v-sequences in <qr_seqs> using a heuristic j-mutation-based threshold msa_fname = self.args.workdir + '/msa.fa' print ' vsearch clustering %d %s segments with threshold %.2f (*300 = %d)' % (len(qr_seqs), self.region, threshold, int(threshold * 300)) assert self.region == 'v' # would need to change the 300 _ = utils.run_vsearch('cluster', qr_seqs, self.args.workdir + '/vsearch', threshold=threshold, msa_fname=msa_fname, vsearch_binary=self.args.vsearch_binary) msa_info = [] msa_seqs = utils.read_fastx(msa_fname) for seqfo in msa_seqs: if seqfo['name'][0] == '*': # start of new cluster (centroid is first, and is marked with a '*') centroid = seqfo['name'].lstrip('*') msa_info.append({'centroid' : centroid, 'seqfos' : [{'name' : centroid, 'seq' : seqfo['seq']}]}) # I don't seem to actually be using the identity of the centroid sequence for anything elif seqfo['name'] == 'consensus': msa_info[-1]['cons_seq'] = seqfo['seq'].replace('+', '') # gaaaaah not sure what the +s mean else: msa_info[-1]['seqfos'].append(seqfo) os.remove(msa_fname) n_initial_clusters = len(msa_info) print ' read %d vsearch clusters (%d sequences))' % (n_initial_clusters, sum([len(cfo['seqfos']) for cfo in msa_info])) # then throw out smaller clusters # n_seqs_min = max(self.absolute_n_seqs_min, self.min_cluster_fraction * len(msa_info)) n_seqs_min = self.absolute_n_seqs_min clusterfos = [cfo for cfo in msa_info if len(cfo['seqfos']) >= n_seqs_min] print ' removed %d clusters with fewer than %d sequences' % (n_initial_clusters - len(clusterfos), n_seqs_min) clusterfos = sorted(clusterfos, key=lambda cfo: len(cfo['seqfos']), reverse=True) if len(clusterfos) > self.max_number_of_clusters: print ' taking the %d largest clusters (removing %d)' % (self.max_number_of_clusters, len(clusterfos) - self.max_number_of_clusters) clusterfos = clusterfos[:self.max_number_of_clusters] return clusterfos, msa_info
def run_partis(infname, outfname):
    """Run partis cache-parameters (smith-waterman only, presto output format) on <infname>, writing to <outfname>.

    Does nothing if utils.output_exists() says the output is already there. The aligned germline
    fastas for all regions are merged (dropping repeated sequences) into one temporary file in
    args.workdir, which is passed to partis and removed afterwards.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # deduplicate before passing to partis (keyed by seq so it's easy to check for duplicates)
    names_by_seq = {}
    for region in utils.regions:
        for sfo in utils.read_fastx(get_glfname(region, aligned=True)):
            if sfo['seq'] not in names_by_seq:  # first name we see for a sequence wins
                names_by_seq[sfo['seq']] = '|'.join(sfo['infostrs'])

    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        for seq, gene in names_by_seq.items():
            merged_file.write('>%s\n%s\n' % (gene, seq))

    cmd_parts = [
        './bin/partis cache-parameters',
        ' --infname ' + infname,
        ' --leave-default-germline',
        ' --presto-output --only-smith-waterman',
        ' --outfname ' + outfname,
    ]
    if args.glfo_dir is not None:
        cmd_parts.append(' --initial-germline-dir ' + args.glfo_dir)
    cmd_parts.append(' --aligned-germline-fname ' + aligned_germline_fname)
    cmd_parts.append(' --n-procs ' + str(args.n_procs))

    utils.simplerun(''.join(cmd_parts), print_time='partis annotation')

    os.remove(aligned_germline_fname)
def run_igdiscover(infname, outfname, outdir): if utils.output_exists(args, outfname): return prepare_igdiscover_outdir(outdir) if args.n_random_queries is not None: sub_infname = outdir + '/' + os.path.basename( infname.replace( utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname)))) if os.path.exists(sub_infname): print ' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries else: print ' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries) with open(sub_infname, 'w') as sub_infile: for seqfo in seqfos: sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq'])) infname = sub_infname igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper( ) cmds = ['#!/bin/bash'] cmds += ['export PATH=%s:$PATH' % args.condapath] cmds += [ 'export PYTHONNOUSERSITE=True' ] # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448) cmds += ['cd %s' % outdir] cmds += ['igdiscover init --db db --single-reads %s work' % infname ] # prepares to run, putting files into <outdir> cmds += ['cp %s work/' % os.path.basename(args.yamlfname)] cmds += ['cd work'] cmds += ['igdiscover run'] utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True) template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human' glfo = glutils.create_glfo_from_fasta( igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir) out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus) assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname glutils.write_glfo(out_gldir, glfo, debug=True)
def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
    """Return the list of leaf node sequences that bppseqgen wrote to cmdfo['outfname'], then clean up.

    If <n_leaf_nodes> is 1, only the first sequence in the file is kept (the file also has an extra
    leaf that was added earlier). Removes the output file, everything in cmdfo['other-files'], and
    the (by then expected to be empty) cmdfo['workdir'].

    Raises AssertionError if the number of sequences read doesn't equal <n_leaf_nodes>.
    """
    bpp_fname = cmdfo['outfname']
    only_one_leaf = n_leaf_nodes == 1

    leaf_seqs = []
    for sfo in utils.read_fastx(bpp_fname):  # get the leaf node sequences from the file that bppseqgen wrote
        leaf_seqs.append(sfo['seq'])
        if only_one_leaf:  # skip the extra leaf we added earlier
            break
    assert n_leaf_nodes == len(leaf_seqs)
    # self.check_tree_simulation(leaf_seq_fname, chosen_tree)

    # clean up
    os.remove(cmdfo['outfname'])
    for extra_fname in cmdfo['other-files']:
        os.remove(extra_fname)
    os.rmdir(cmdfo['workdir'])

    return leaf_seqs
def run_tigger(infname, outfname, outdir): if utils.output_exists(args, outfname, offset=8): return rcmds = ['library(tigger)', 'library(dplyr)'] # rcmds += ['data(sample_db, germline_ighv)'] db_name = 'annotations' gls_name = 'gls' rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)] rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))] tigger_outfname = outdir + '/tigger.fasta' rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)] # rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)] rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)] rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname] cmdfname = args.workdir + '/tigger-in.cmd' with open(cmdfname, 'w') as cmdfile: cmdfile.write('\n'.join(rcmds) + '\n') cmdstr = 'R --slave -f ' + cmdfname utils.simplerun(cmdstr, shell=True, print_time='tigger') # post-process tigger .fa gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human' glfo = glutils.read_glfo(gldir, args.locus) tigger_alleles = set() for seqfo in utils.read_fastx(tigger_outfname): seq = seqfo['seq'].replace(utils.gap_chars[0], '') # it should be just dots... 
tigger_alleles.add(seqfo['name']) if seqfo['name'] not in glfo['seqs'][args.region]: newfo = {'gene' : seqfo['name'], 'seq' : seq} use_template_for_codon_info = False if '+' in newfo['gene']: newfo['template-gene'] = newfo['gene'].split('+')[0] use_template_for_codon_info = True glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True) elif glfo['seqs'][args.region][seqfo['name']] != seq: print '%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']) for gene in glfo['seqs'][args.region]: # remove them afterwards so we can use existing ones to get codon info if gene not in tigger_alleles: glutils.remove_gene(glfo, gene) out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus) assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname glutils.write_glfo(out_gldir, glfo) os.remove(cmdfname)
def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
    """Read the leaf sequences that bppseqgen wrote to cmdfo['outfname'], then clean up its files.

    The dummy leaf (named <dummy_name_so_bppseqgen_doesnt_break>) is ignored, and single quotes are
    stripped from the other names, which must then be exactly t1, t2, ... t<N>.

    Returns zip(*names_seqs), where <names_seqs> is the list of (name, seq) pairs in t1...t<N> order
    (i.e. the names and the seqs as two parallel sequences).

    Raises Exception if a name of the form t<n> is missing, and AssertionError if the number of
    leaves isn't <n_leaf_nodes>.
    """
    seqs_by_name = {}
    for sfo in utils.read_fastx(cmdfo['outfname']):  # get the leaf node sequences from the file that bppseqgen wrote
        # in the unlikely (impossible unless we change tree generators and don't tell them to use the same leaf names) event that we get a non-dummy leaf with the dummy name, it'll fail at the assertion below
        if sfo['name'] != dummy_name_so_bppseqgen_doesnt_break:
            seqs_by_name[sfo['name'].strip('\'')] = sfo['seq']

    # make sure names are all of form t<n>, and keep track of which sequences goes with which name (have to keep around the t<n> labels so we can translate the tree labels, in event.py)
    names_seqs = []
    try:
        for ileaf in range(len(seqs_by_name)):
            leaf_name = 't' + str(ileaf + 1)
            names_seqs.append((leaf_name, seqs_by_name[leaf_name]))
    except KeyError as ke:
        raise Exception('leaf name %s not as expected in bppseqgen output %s' % (ke, cmdfo['outfname']))

    assert n_leaf_nodes == len(names_seqs)

    # clean up
    os.remove(cmdfo['outfname'])
    for extra_fname in cmdfo['other-files']:
        os.remove(extra_fname)
    os.rmdir(cmdfo['workdir'])

    return zip(*names_seqs)
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False): n_skipped_pseudogenes = 0 seq_to_gene_map = {} for seqfo in utils.read_fastx(fname): # first get gene name if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR': # if it's an imgt file, with a bunch of header info (and the accession number first) gene = seqfo['infostrs'][imgt_info_indices.index('gene')] functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')] if functionality not in functionalities: raise Exception('unexpected functionality %s in %s' % (functionality, fname)) if skip_pseudogenes and functionality in pseudogene_funcionalities: n_skipped_pseudogenes += 1 continue else: # plain fasta with just the gene name after the '>' gene = seqfo['name'] utils.split_gene(gene) # just to check if it's a valid gene name if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)): # if <aligned> is True, file name is expected to be whatever raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene))) if gene in seqs[utils.get_region(gene)]: raise Exception('gene name %s appears twice in %s' % (gene, fname)) # then the sequence seq = seqfo['seq'] if not aligned: seq = utils.remove_gaps(seq) if 'Y' in seq: print ' replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene)) seq = seq.replace('Y', 'N') if len(seq.strip(''.join(utils.expected_characters))) > 0: # return the empty string if it only contains expected characters raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters))) if seq not in seq_to_gene_map: seq_to_gene_map[seq] = [] seq_to_gene_map[seq].append(gene) seqs[utils.get_region(gene)][gene] = seq tmpcounts = [len(gl) for gl in seq_to_gene_map.values()] # number of names corresponding to each sequence (should all be ones) if tmpcounts.count(1) != len(tmpcounts): print ' 
mutliple names in %s for the following sequences:' % fname for seq, genelist in seq_to_gene_map.items(): if len(genelist) > 1: print ' %-50s %s' % (' '.join(genelist), seq) raise Exception('please de-duplicate the fasta and re-run.') if n_skipped_pseudogenes > 0: print ' skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
def run_igblast(infname, outfname): if utils.output_exists(args, outfname, offset=8): return if args.glfo_dir is not None: print '%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color( 'red', 'warning') if args.n_random_queries is not None: sub_infname = os.path.dirname(outfname) + '/' + os.path.basename( infname.replace( utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname)))) if os.path.exists(sub_infname): print ' --n-random-queries: leaving existing fasta for igblast (hopefully it has %d queries)' % args.n_random_queries else: print ' --n-random-queries: writing new fasta for igblast (%d queries)' % args.n_random_queries seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries) with open(sub_infname, 'w') as sub_infile: for seqfo in seqfos: sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq'])) infname = sub_infname cmds = ['#!/bin/bash'] cmds += ['cd %s/%s' % (args.igbdir, args.locus)] cmds += ['export PATH=%s:$PATH' % args.condapath] cmds += ['igblastn'] for tmpreg in utils.regions: cmds[-1] += ' -germline_db_%s %s%s-unaligned.fasta' % ( tmpreg.upper(), args.locus, tmpreg) cmds[-1] += ' -auxiliary_data optional_file/%s_gl.aux' % args.species cmds[ -1] += ' -domain_system imgt -ig_seqtype Ig -organism %s -outfmt \'7 std qseq sseq btop\'' % args.species cmds[-1] += ' -num_threads %d' % utils.auto_n_procs() cmds[-1] += ' -query ' + infname + ' -out ' + outfname utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.workdir + '/run.sh')
def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
    """Return the leaf sequences that bppseqgen wrote to cmdfo['outfname'] (as a list), then clean up its files.

    The dummy leaf (named <dummy_name_so_bppseqgen_doesnt_break>) is ignored, and single quotes are
    stripped from the other names, which must then be exactly t1, t2, ... t<N>. The returned
    sequences are in that order.

    Raises Exception if a name of the form t<n> is missing, and AssertionError if the number of
    leaves isn't <n_leaf_nodes>.
    """
    seqs_by_name = {}
    for sfo in utils.read_fastx(cmdfo['outfname']):  # get the leaf node sequences from the file that bppseqgen wrote
        # in the unlikely (impossible unless we change tree generators and don't tell them to use the same leaf names) event that we get a non-dummy leaf with the dummy name, it'll fail at the assertion below
        if sfo['name'] != dummy_name_so_bppseqgen_doesnt_break:
            seqs_by_name[sfo['name'].strip('\'')] = sfo['seq']

    # put the sequences in t1, t2, ... order
    mutated_seqs = []
    try:
        for ileaf in range(len(seqs_by_name)):
            mutated_seqs.append(seqs_by_name['t' + str(ileaf + 1)])
    except KeyError as ke:
        raise Exception('leaf name %s not as expected in bppseqgen output %s' % (ke, cmdfo['outfname']))

    assert n_leaf_nodes == len(mutated_seqs)

    # clean up
    os.remove(cmdfo['outfname'])
    for extra_fname in cmdfo['other-files']:
        os.remove(extra_fname)
    os.rmdir(cmdfo['workdir'])

    return mutated_seqs
import json import operator # if you move this script, you'll need to change this method of getting the imports partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '') sys.path.insert(1, partis_dir + '/python') import utils parser = argparse.ArgumentParser() parser.add_argument('infname') #default='/fh/fast/matsen_e/data/goo-dengue-10x/test/filtered_contig.fasta') parser.add_argument('outfname') parser.add_argument('--droplet-id-separator', default='_', help='everything in the sequence id before this character is treated as the droplet id, e.g. for the default, the uid AAACGGGCAAGCGAGT-1_contig_2 has a droplet id of AAACGGGCAAGCGAGT-1') args = parser.parse_args() seqfos = utils.read_fastx(args.infname) droplet_ids = {} for sfo in seqfos: did = utils.get_droplet_id(sfo['name']) if did not in droplet_ids: droplet_ids[did] = [] droplet_ids[did].append(sfo['name']) print ' read %d sequences with %d droplet ids' % (len(seqfos), len(droplet_ids)) count_info = {} for dlist in droplet_ids.values(): if len(dlist) not in count_info: count_info[len(dlist)] = 0 count_info[len(dlist)] += 1 print ' contigs per' print ' droplet count fraction'
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False):
    """
    Read sequences from <infname>, which is read with csv.DictReader if its suffix is .csv or .tsv,
    and otherwise is handed to utils.read_fastx().

    Returns the tuple (input_info, reco_info):
      - input_info: OrderedDict with, for each uid, {'unique_ids' : [uid], 'seqs' : [seq]}
      - reco_info: None if <is_data> is set, otherwise an OrderedDict with a deep copy of the full
        input line for each uid (with implicit info added if <simglfo> is set)

    If <n_max_queries> is greater than zero, reading stops after that many queries have been added.
    <args>, if set, is used for: istartstop, name_column, seq_column, abbreviate, queries, reco_ids,
    and seed_unique_id. <quiet> only suppresses the --n-max-queries message.

    Raises Exception if: no sequence column is found, a line has more than one unique id, a uid
    ends up duplicated, a sequence has characters that aren't in utils.alphabet, simulation info is
    missing when <is_data> isn't set, or no sequences were read.
    """
    suffix = utils.getsuffix(infname)
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)  # NOTE(review): this is never explicitly closed in this function
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False  # set to True if we see args.seed_unique_id among the uids (passed to post_process() below)
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1  # index of the current line (counts every line from <reader>, including ones that get skipped)
    for line in reader:
        iline += 1
        if args is not None:
            # only look at lines with index in [istartstop[0], istartstop[1])
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            # move user-specified name and sequence columns to the keys that are used below
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        # once we've found one line without a name column, *every* subsequent line gets its name from the <iname> counter
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)  # presumably converts the multi-sequence keys to lists (they're indexed with [0] below) -- TODO confirm
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        # if we've already seen this uid, append -2, -3, etc. until we get one we haven't seen
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print ' %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            # skip anything that isn't among the requested queries or reco ids
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        # <uid> may have been changed by abbreviating, so check again
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        # removing all the characters in utils.alphabet should leave nothing (python 2 form of str.translate(), i.e. the first arg is the table and the second the characters to delete)
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        # only checked before the first query has been added
        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print ' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print ' --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False, more_input_info=None):
    """
    Read sequences from <infname>, whose type is determined by its suffix: suffixes in <delimit_info>
    are read with csv.DictReader, .fa/.fasta/.fastx with utils.read_fastx(), and .yaml with
    utils.read_yaml_output(). Any other suffix raises an Exception.

    Returns the tuple (input_info, reco_info, yaml_glfo):
      - input_info: OrderedDict with, for each uid, {'unique_ids' : [uid], 'seqs' : [seq]} (plus, for
        simulation, any of the keys in utils.input_metafile_keys.values() that were in the input line)
      - reco_info: None if <is_data> is set, otherwise an OrderedDict with a deep copy of the full
        input line for each uid (with implicit info added if a simulation glfo is available)
      - yaml_glfo: the glfo that was read from a yaml input file, otherwise None

    If <n_max_queries> is greater than zero, reading stops after that many queries have been added.
    <args>, if set, is used for: istartstop, name_column, seq_column, abbreviate, queries, reco_ids,
    seed_unique_id, and input_metafname. <quiet> only suppresses the --n-max-queries message.
    <more_input_info>, if set, is merged into <input_info> after the file has been read.

    Raises Exception if: the suffix isn't handled, no sequence column is found, a line has more
    than one unique id, a uid ends up duplicated, a sequence has characters that aren't in
    utils.alphabet, simulation info is missing when <is_data> isn't set, or no sequences were read.
    """
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(
            infname
        )  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
            infname,
            n_max_queries=n_max_queries,
            synth_single_seqs=True,
            dont_add_implicit_info=True
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False  # set to True if we see args.seed_unique_id among the uids (passed to post_process() below)
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1  # index of the current line (counts every line from <reader>, including ones that get skipped)
    for line in reader:
        iline += 1
        if args is not None:
            # only look at lines with index in [istartstop[0], istartstop[1])
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            # move user-specified name and sequence columns to the keys that are used below
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        # once we've found one line without a name column, *every* subsequent line gets its name from the <iname> counter
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        if suffix != '.yaml':  # yaml lines are used as they come from utils.read_yaml_output()
            utils.process_input_line(line)  # presumably converts the multi-sequence keys to lists (they're indexed with [0] below) -- TODO confirm
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        # if we've already seen this uid, append -2, -3, etc. until we get one we haven't seen
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print ' %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            # skip anything that isn't among the requested queries or reco ids
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        # <uid> may have been changed by abbreviating, so check again
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[
                        uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else))
                    input_info[uid][line_key] = copy.deepcopy(
                        reco_info[uid][line_key]
                    )  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print ' --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print ' %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)  # for uids that are in both, the entry from <more_input_info> replaces the one from the file
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname,
                          input_info.values(),
                          debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent):
    """Turn the bcr-phylo output files in <outdir> into partis annotation(s) for event <ievent>.

    Reads the kd values (csv), the newick tree, the mutated sequences and the target sequences
    from <outdir>. If utils.output_exists() reports that the kd file is missing, the pickle
    --> csv/newick conversion script is run first.

    Returns a list with one annotation per entry of <naive_events[ievent]> if args.paired_loci
    is set, otherwise a single annotation built from <naive_events[ievent]> and <glfos[0]>.
    """
    # ----------------------------------------------------------------------------------------
    def split_seqfos(seqfos):
        """Split each joint heavy+light sequence in <seqfos> into a heavy and a light seqfo (returns both lists)."""
        hline, lline = naive_events[ievent]
        padseq = utils.pad_nuc_seq(hline['naive_seq'])  # doesn't depend on the sequence, so only work it out once
        hseqfos, lseqfos = [], []
        for sfo in seqfos:
            assert len(sfo['seq']) == len(padseq) + len(lline['naive_seq'])
            # heavy part is the first len(naive heavy) characters, light part is everything after the *padded* heavy length
            hseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][:len(hline['naive_seq'])]})
            lseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][len(padseq):]})
        return hseqfos, lseqfos

    # ----------------------------------------------------------------------------------------
    def read_kdvals(kdfname):
        """Read the per-node info in the csv <kdfname> into a dict keyed by uid."""
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),  # column is optional, and is left as whatever string was in the file
                    'target_index': int(line['target_index']),
                }
        return nodefo

    # ----------------------------------------------------------------------------------------
    def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None):
        """Build one multi-sequence annotation from <naive_line> and the mutated sequences <sfos>.

        If <locus> is set, '-<locus>' is appended to every uid (in the annotation, in a copy of
        <nodefo>, and in a copy of the tree <dtree>), and the 'paired-uids' are rewritten to
        refer to the mutated sequence names.
        """
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # only here to report which event failed; not a bare except, so ctrl-c/sys.exit() still get through
                # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
                lines = traceback.format_exception(*sys.exc_info())
                print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)  # labels may get translated below, and <dtree> is shared between loci
        if locus is not None:
            def ltr(u):
                return u + '-' + locus
            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo  # rebinds the local name only, the caller's dict is untouched
            treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
            final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

        if args.debug:
            utils.print_reco_event(final_line)

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:  # only a warning: the lookups just below raise KeyError for these uids
            print(' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide edge lengths by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]

        return final_line

    # ----------------------------------------------------------------------------------------
    assert args.stype == 'selection'  # i don't know that non-'selection' is possible or has any point at this point (can just set selection strength to zero)
    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
    if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
        utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
    nodefo = read_kdvals(kdfname)
    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    if args.paired_loci:
        mevents = []
        for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
            # use this locus's part of the target sequences (<tsfos>), not the unsplit heavy+light <target_seqfos>
            mevents.append(get_mature_line(sfos, tline, glfos[tline['loci'][0]], nodefo, dtree, tsfos, locus=tline['loci'][0]))
        return mevents
    else:
        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Build a multi-sequence annotation from <naive_line> and the bcr-phylo output in <outdir>.

    Each mutated sequence gets a copy of <naive_line> with its name and sequence swapped in, and
    the copies are then merged into one annotation. If args.stype is 'selection', affinity info
    (from the kd csv) and the newick tree are added as well. Target sequences are always added.
    Returns the resulting annotation.
    """
    mutated_seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    n_naive_ids = len(naive_line['unique_ids'])
    assert n_naive_ids == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    naive_indelfo = naive_line['indelfos'][0]
    assert not indelutils.has_indels(naive_indelfo)  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # one single-sequence annotation per mutated sequence
    reco_info = collections.OrderedDict()
    for seqfo in mutated_seqfos:
        single_line = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(single_line)
        del single_line['tree']
        single_line['unique_ids'] = [seqfo['name']]
        # both 'seqs' and 'input_seqs' have to be replaced, since the naive line's values are in both
        single_line['seqs'] = [seqfo['seq']]
        single_line['input_seqs'] = [seqfo['seq']]
        single_line['duplicates'] = [[]]
        reco_info[seqfo['name']] = single_line
        utils.add_implicit_info(glfo, single_line)

    all_names = [seqfo['name'] for seqfo in mutated_seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_names, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            nodefo = {
                row['uid']: {
                    'kd': float(row['kd']),
                    'relative_kd': float(row['relative_kd']),
                    'lambda': row.get('lambda', None),
                    'target_index': int(row['target_index']),
                }
                for row in csv.DictReader(kdfile)
            }

        uids = final_line['unique_ids']
        only_in_kdfile = set(nodefo) - set(uids)  # probably just bcr-phylo discarding internal nodes (they're not in the newick/fasta files)
        if len(only_in_kdfile) > 0:
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (only_in_kdfile))
        only_in_line = set(uids) - set(nodefo)
        if len(only_in_line) > 0:
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / nodefo[uid]['kd'] for uid in uids]
        final_line['relative_affinities'] = [1. / nodefo[uid]['relative_kd'] for uid in uids]
        final_line['lambdas'] = [nodefo[uid]['lambda'] for uid in uids]
        final_line['nearest_target_indices'] = [nodefo[uid]['target_index'] for uid in uids]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        seq_lengths = [len(seq) for seq in final_line['seqs']]
        tree.scale_edges(1. / numpy.mean(seq_lengths))  # divide edge lengths by the mean sequence length
        if args.debug:
            ascii_tree = treeutils.get_ascii_tree(dendro_tree=tree)
            print(utils.pad_lines(ascii_tree, padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [target['seq'] for target in target_seqfos]

    return final_line
args = parser.parse_args()

# a bare file name (no directory part) is taken to be relative to the current directory
if os.path.dirname(args.fname) == '':
    args.fname = '%s/%s' % (os.getcwd(), args.fname)
# output dir defaults to whatever utils.getprefix() makes of the input file name
if args.outdir is None:
    args.outdir = utils.getprefix(args.fname)

# don't touch existing output unless we were told to overwrite it
if any(os.path.exists(ofn) for ofn in paircluster.paired_dir_fnames(args.outdir)):
    if not args.overwrite:
        print(' split-loci.py output exists and --overwrite was not set, so not doing anything: %s' % args.outdir)
        sys.exit(0)
    paircluster.clean_paired_dir(args.outdir)

seqfos = utils.read_fastx(args.fname, n_max_queries=args.n_max_queries)
if args.fasta_info_index is not None:  # take the sequence name from the requested entry of each sequence's info strings
    for seqfo in seqfos:
        seqfo['name'] = seqfo['infostrs'][args.fasta_info_index]
if args.reverse_negative_strands:
    # NOTE this is not on an equal footing with <seqfos>, since we add all the vsearch info to <seqfos>, then use it do decide on locus, and then to write output
    revfos = []
    for seqfo in seqfos:
        revfos.append({'name': seqfo['name'], 'seq': utils.revcomp(seqfo['seq'])})

# if the germline dir has a species subdir, use that instead (ick that is hackey)
species_dir = args.germline_dir + '/' + args.species
if os.path.exists(species_dir):
    args.germline_dir = species_dir
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Build a multi-sequence annotation from <naive_line> and the bcr-phylo output in <outdir>.

    Each mutated sequence gets a copy of <naive_line> with its name and sequence swapped in, and
    the copies are merged into one annotation. If args.stype is 'selection', affinities (1/kd)
    and the newick tree are added. The target sequences, and for each input sequence the index
    of the target with the smallest amino acid hamming distance, are always added.
    Returns the resulting annotation.
    """
    fasta_fname = '%s/%s.fasta' % (outdir, args.extrastr)
    mutated_seqfos = utils.read_fastx(fasta_fname)  # output mutated sequences from bcr-phylo

    n_naive_ids = len(naive_line['unique_ids'])
    assert n_naive_ids == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    naive_indelfo = naive_line['indelfos'][0]
    assert not indelutils.has_indels(naive_indelfo)  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # one single-sequence annotation per mutated sequence
    reco_info = collections.OrderedDict()
    for seqfo in mutated_seqfos:
        single_line = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(single_line)
        del single_line['tree']
        single_line['unique_ids'] = [seqfo['name']]
        # both 'seqs' and 'input_seqs' have to be replaced, since the naive line's values are in both
        single_line['seqs'] = [seqfo['seq']]
        single_line['input_seqs'] = [seqfo['seq']]
        reco_info[seqfo['name']] = single_line
        utils.add_implicit_info(glfo, single_line)

    all_names = [seqfo['name'] for seqfo in mutated_seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_names, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            kdvals = {row['uid']: float(row['kd']) for row in csv.DictReader(kdfile)}

        uids = final_line['unique_ids']
        only_in_kdfile = set(kdvals) - set(uids)  # probably just bcr-phylo discarding internal nodes (they're not in the newick/fasta files)
        if len(only_in_kdfile) > 0:
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (only_in_kdfile))
        only_in_line = set(uids) - set(kdvals)
        if len(only_in_line) > 0:
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))
        final_line['affinities'] = [1. / kdvals[uid] for uid in uids]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            ascii_tree = treeutils.get_ascii_tree(dendro_tree=tree)
            print(utils.pad_lines(ascii_tree, padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [target['seq'] for target in target_seqfos]

    # for each input sequence, find the target that's closest in amino acid hamming distance
    from Bio.Seq import Seq
    nearest_indices = []
    final_line['nearest_target_indices'] = nearest_indices  # same list object that gets filled in the loop below
    aa_targets = [Seq(target_seq).translate() for target_seq in final_line['target_seqs']]
    for mature_seq in final_line['input_seqs']:
        aa_mature = Seq(mature_seq).translate()
        aa_hdists = []
        for aa_target in aa_targets:
            aa_hdists.append(utils.hamming_distance(aa_target, aa_mature, amino_acid=True))
        smallest = min(aa_hdists)
        nearest_indices.append(aa_hdists.index(smallest))  # NOTE doesn't do anything differently if there's more than one min

    return final_line
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

sys.path.insert(0, 'python')  # has to happen before the next two imports
import utils
import mds

# ----------------------------------------------------------------------------------------
# command line options
parser = argparse.ArgumentParser()
parser.add_argument('--n-clusters', required=True, type=int)
parser.add_argument('--n-components', default=2, type=int)
parser.add_argument('--plotdir')
parser.add_argument('--workdir', default='/tmp/dralph/mds/%s' % str(random.randint(0, 999999)))  # random subdir, chosen when the option is defined
parser.add_argument('--seed', default=1, type=int)
args = parser.parse_args()

# read at most 500 sequences, and rename each one to its index in the list
seqfos = utils.read_fastx('v-qr.fa', n_max_queries=500)
for iseq, _ in enumerate(seqfos):
    seqfos[iseq]['name'] = str(iseq)

# mds.run_sklearn_mds(args.n_components, args.n_clusters, seqfos, args.seed, plotdir=args.plotdir)
mds.bios2mds_kmeans_cluster(
    args.n_components,
    args.n_clusters,
    seqfos,
    args.workdir,
    args.seed,
    plotdir=args.plotdir,
)
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """Read input sequences (and, for simulation, the true annotations) from <infname>.

    <infname> is read with csv.DictReader if its suffix is .csv or .tsv, and with
    utils.read_fastx() otherwise.

    Arguments:
      infname: input file name
      is_data: if False, each line must have a 'v_gene' column and is also stored in <reco_info>
      n_max_queries: stop after adding this many sequences (not applied if <= 0)
      args: optional command line args. The attributes read here are istartstop, name_column,
        seq_column, abbreviate, queries, reco_ids and seed_unique_id.
      glfo: only used to warn if it's None when reading simulation
      simglfo: if set, used to add implicit info to each simulation line

    Returns:
      (input_info, reco_info): <input_info> is an ordered dict from uid to a dict with
      'unique_ids' and 'seqs' (one-item lists). <reco_info> is an ordered dict from uid to a copy
      of the line read from the file, or None if <is_data> is set.

    Raises:
      Exception: no sequence column, multi-sequence lines, duplicate uids, unexpected
        characters in a sequence, missing simulation info, or no sequences read at all.
    """
    if not is_data and glfo is None:
        print(' WARNING glfo is None, so not adding implicit info')

    suffix = os.path.splitext(infname)[1]
    seqfile = None  # only set for csv/tsv (where we open the file ourselves), so we know whether to close it below
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True,
                                  queries=(args.queries if args is not None else None), n_max_queries=n_max_queries)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    try:
        for line in reader:
            iline += 1
            if args is not None:
                if args.istartstop is not None:  # only use lines with index in [start, stop)
                    if iline < args.istartstop[0]:
                        continue
                    if iline >= args.istartstop[1]:
                        break
                if args.name_column is not None:
                    line['unique_ids'] = line[args.name_column]
                    del line[args.name_column]
                if args.seq_column is not None:
                    line['input_seqs'] = line[args.seq_column]
                    if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                        del line[args.seq_column]
            if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
                print(' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning')))
                iname = 0
            if iname is not None:
                line['unique_ids'] = '%09d' % iname
                iname += 1
            if 'input_seqs' not in line and 'seq' not in line:
                raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
            utils.process_input_line(line)
            if len(line['unique_ids']) > 1:
                raise Exception('can\'t yet handle multi-seq csv input files')
            uid = line['unique_ids'][0]
            inseq = line['input_seqs'][0]

            # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above it has them
            # if any(fc in uid for fc in utils.forbidden_characters):
            #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
            if args is not None:
                if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                    uid = abbreviate(used_names, potential_names, uid)
                if args.queries is not None and uid not in args.queries:
                    continue
                if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                    continue
                if args.seed_unique_id is not None and uid == args.seed_unique_id:
                    found_seed = True

            if uid in input_info:
                raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

            if any(ch not in utils.alphabet for ch in inseq):  # i.e. something's left over after removing every character that's in the alphabet
                raise Exception('unexpected character (not among %s) in input sequence with id %s:\n %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))

            input_info[uid] = {'unique_ids': [uid, ], 'seqs': [inseq, ]}

            if n_queries_added == 0 and is_data and 'v_gene' in line:
                print(' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname)

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s' % infname)
                reco_info[uid] = copy.deepcopy(line)
                if simglfo is not None:
                    utils.add_implicit_info(simglfo, reco_info[uid])

            n_queries_added += 1
            if n_max_queries > 0 and n_queries_added >= n_max_queries:
                break
    finally:
        if seqfile is not None:  # close it whether we finished the loop, broke out of it, or raised
            seqfile.close()

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
default=partis_dir + '/data/germlines/human',
    help=
    'germline info directory. Only used if --partis-output-file is an old-style .csv, and this default dir may work if your output file doesn\'t have novel inferred genes. Otherwise, is the germline info dir from the partis inferred parameter directory corresponding to your output file --partis-output-file.'
)
parser.add_argument('--locus', default='igh')
parser.add_argument('--outfile', required=True, help='output partis yaml file')
parser.add_argument('--debug', action='store_true')
parser.add_argument(
    '--n-test-subset-seqs',
    type=int,
    help=
    'take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)'
)
args = parser.parse_args()

# read the new sequences, asking read_fastx() to sanitize them
new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
print ' read %d seqs from %s' % (len(new_seqfos), args.new_seq_file)

# germline info is only read separately for .csv output files; otherwise read_output() is handed glfo=None and whatever it returns is used
glfo = None
if utils.getsuffix(args.partis_output_file) == '.csv':
    print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)
glfo, annotation_list, cpath = utils.read_output(args.partis_output_file, glfo=glfo, locus=args.locus)

# use the best partition (index cpath.i_best) unless args.partition_index is set
if args.partition_index is not None:
    print ' using non-best partition index %d (best is %d)' % (
        args.partition_index, cpath.i_best)
partition = cpath.partitions[cpath.i_best if args.partition_index is None else args.partition_index]
def get_new_alignments(glfo, region, debug=False):
    """Align all the <region> genes in <glfo> to each other by running mafft on them.

    Writes the sequences to files in a fresh temporary directory, runs 'mafft --merge' on them
    through the shell, and reads the aligned sequences back in. The temporary files and
    directory are removed afterwards, also if something goes wrong along the way.

    Arguments:
      glfo: germline info. Only glfo['seqs'][region] (gene name --> sequence) is used.
      region: which region's genes to align
      debug: print progress info if set, and also the new alignments if larger than 1

    Returns:
      dict from gene name to aligned sequence, or None if there are no genes for <region>.

    Raises:
      Exception: if mafft's output has a gene that isn't in glfo['seqs'][region]
    """
    aligned_seqs = {}

    genes_with_alignments = set(aligned_seqs)  # used to already have some sequences aligned, and may as well keep around the code to handle that case
    genes_without_alignments = set(glfo['seqs'][region]) - set(aligned_seqs)
    if len(genes_without_alignments) == 0:
        if debug:
            print(' no missing %s alignments' % region)
        return

    if debug:
        print(' missing alignments for %d %s genes' % (len(genes_without_alignments), region))
        if len(aligned_seqs) > 0:
            print(' existing alignments:')
            for g, seq in aligned_seqs.items():
                print(' %s %s' % (seq, utils.color_gene(g)))

    # find the longest aligned sequence, so we can pad everybody else with dots on the right out to that length
    biggest_length = None
    if len(genes_with_alignments) > 0:
        biggest_length = max(len(aligned_seqs[gene]) for gene in genes_with_alignments)

    tmpdir = tempfile.mkdtemp()
    already_aligned_fname = tmpdir + '/already-aligned.fasta'
    not_aligned_fname = tmpdir + '/not-aligned.fasta'
    msa_table_fname = tmpdir + '/msa-table.txt'
    aligned_and_not_fnamefname = tmpdir + '/aligned-and-not.fasta'
    mafft_outfname = tmpdir + '/everybody-aligned.fasta'
    try:
        with open(already_aligned_fname, 'w') as tmpfile, open(msa_table_fname, 'w') as msafile:
            n_written = 0
            for gene in genes_with_alignments:
                dotstr = '.' * (biggest_length - len(aligned_seqs[gene]))
                alistr = aligned_seqs[gene] + dotstr
                tmpfile.write('>%s\n%s\n' % (gene, alistr.replace('.', '-')))
                n_written += 1
            # the table line lists the (1-based) index of each already-aligned sequence, each preceded by a space
            msa_str = ''.join(' ' + str(index) for index in range(1, n_written + 1))
            msafile.write('%s # %s\n' % (msa_str, already_aligned_fname))
        with open(not_aligned_fname, 'w') as tmpfile:
            for gene in genes_without_alignments:
                tmpfile.write('>%s\n%s\n' % (gene, glfo['seqs'][region][gene]))

        check_call('cat ' + already_aligned_fname + ' ' + not_aligned_fname + ' >' + aligned_and_not_fnamefname, shell=True)

        # actually run mafft
        cmd = 'mafft --merge ' + msa_table_fname + ' ' + aligned_and_not_fnamefname + ' >' + mafft_outfname  # options=  # "--localpair --maxiterate 1000"
        if debug:
            print(' RUN %s' % cmd)
        proc = Popen(cmd, shell=True, stderr=PIPE)
        proc.communicate()  # stdout goes to the output file; stderr is read here but never printed, and the exit status isn't checked

        # deal with fasta output
        for seqfo in utils.read_fastx(mafft_outfname):
            gene = seqfo['name']
            seq = seqfo['seq']
            if gene not in glfo['seqs'][region]:  # only really possible if there's a bug in the preceding fifty lines, but oh well, you can't be too careful
                raise Exception('unexpected gene %s in mafft output' % gene)
            aligned_seqs[gene] = seq  # overwrite the old alignment with the new one

        if debug > 1:
            print(' new alignments:')
            for g, seq in aligned_seqs.items():
                print(' %s %s %s' % (seq, utils.color_gene(g, width=12 if region == 'v' else 8), '<--- new' if g in genes_without_alignments else ''))
    finally:
        # clean up whatever got written, even if one of the steps above raised
        for fname in (already_aligned_fname, not_aligned_fname, msa_table_fname, aligned_and_not_fnamefname, mafft_outfname):
            if os.path.exists(fname):
                os.remove(fname)
        os.rmdir(tmpdir)

    return aligned_seqs
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Build a multi-sequence annotation from <naive_line> and the bcr-phylo output in <outdir>.

    Each mutated sequence gets a copy of <naive_line> with its name and sequence swapped in, and
    the copies are then merged into one annotation. If args.stype is 'selection', affinity info
    (from the kd csv) and the newick tree are added as well; the pickle --> csv/newick conversion
    script is only run if utils.output_exists() reports that the kd file is missing. Target
    sequences are always added.

    <ievent> is used in the failure message, and passed on when setting the reco id.
    Returns the resulting annotation.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # one single-sequence annotation per mutated sequence
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # only here to report which event failed; not a bare except, so ctrl-c/sys.exit() still get through
            # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            lines = traceback.format_exception(*sys.exc_info())
            print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),  # column is optional, and is left as whatever string was in the file
                    'target_index': int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:  # only a warning: the lookups just below raise KeyError for these uids
            print(' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide edge lengths by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line