def read_annotations(fname, glfo):
    """Load the cluster annotations that accompany a partition csv file.

    The annotation file name is derived from <fname> by replacing '.csv'
    with '-cluster-annotations.csv'. There is one row per cluster; rows with
    an empty 'v_gene' (i.e. no annotation could be found) are skipped.

    Each kept row is passed through utils.process_input_line() (converts the
    csv strings to floats/ints/dicts/etc.) and utils.add_implicit_info()
    (adds the redundant info that isn't written to the csv).

    Returns a dict mapping getkey(row['unique_ids']) to the processed row.
    """
    annotation_path = fname.replace('.csv', '-cluster-annotations.csv')
    by_cluster = {}
    with open(annotation_path) as handle:
        for row in csv.DictReader(handle):
            if row['v_gene'] != '':
                utils.process_input_line(row)
                utils.add_implicit_info(glfo, row)
                by_cluster[getkey(row['unique_ids'])] = row
    return by_cluster
#!/usr/bin/env python import csv import sys partis_path = '.' # edit this if you're not running from the main partis dir sys.path.insert(1, partis_path + '/python') import utils import glutils from clusterpath import ClusterPath # read default germline info glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh') print 'first parse an annotation csv file:' with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: if line['v_gene'] == '': # failed (i.e. couldn't find an annotation) continue utils.process_input_line(line) utils.add_implicit_info(glfo, line) utils.print_reco_event(line) break print 'then parse a partition csv file:' cp = ClusterPath() cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv') cp.print_partitions(abbreviate=True)
def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False, more_input_info=None):
    """Read query sequences (and, for simulation, the true annotations) from <infname>.

    The reader is chosen from the suffix returned by utils.getsuffix(): suffixes
    in <delimit_info> are read with csv.DictReader, .fa/.fasta/.fastx with
    utils.read_fastx(), and .yaml with utils.read_yaml_output(). Any other
    suffix raises an Exception.

    Arguments:
      infname: path of the input file.
      is_data: if False, every line must contain 'v_gene', and a deep copy of the
        line is stored in <reco_info>.
      n_max_queries: if > 0, stop after this many queries have been added (also
        passed to the fastx and yaml readers).
      args: optional options object. Attributes read here: istartstop,
        name_column, seq_column, abbreviate, queries, reco_ids, seed_unique_id,
        input_metafname.
      simglfo: if not None, used to add implicit info to each <reco_info> entry.
        For non-data yaml input it is rebound to the glfo read from the yaml file.
      quiet: suppresses only the --n-max-queries message.
      more_input_info: optional dict that is merged into <input_info> (with
        dict.update()) after the file has been read.

    Returns:
      (input_info, reco_info, yaml_glfo). <input_info> is an OrderedDict keyed by
      uid with 'unique_ids' and 'seqs' single-element lists; <reco_info> is None
      if <is_data>, otherwise an OrderedDict keyed by uid; <yaml_glfo> is None
      unless the input was a yaml file.

    Raises:
      Exception on an unhandled suffix, a missing sequence column, a multi-seq
      line, a uid seen twice, a sequence character not in utils.alphabet,
      missing simulation info, or if no sequences were read.
    """
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(infname)  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True, n_max_queries=n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
                                  queries=(args.queries if (args is not None and not args.abbreviate) else None))  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(infname, n_max_queries=n_max_queries, synth_single_seqs=True, dont_add_implicit_info=True)  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            # restrict to the half-open line range [istartstop[0], istartstop[1])
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            # move user-specified columns over to the internal key names
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:  # once set, every subsequent line gets a counter-based name
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            # uid already read: append '-2', '-3', ... until we find one that's unused
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print ' %s uid %s already read from input file %s, so replacing with new uid %s' % (utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set([ch for ch in inseq if ch not in utils.alphabet])
            raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {'unique_ids': [uid, ], 'seqs': [inseq, ]}

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else))
                    input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should effect, this is the only one I care about right now
                print ' --n-max-queries: stopped after reading %d queries from input file' % len(input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print ' %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (utils.color('red', 'note:'), len(set(more_input_info) & set(input_info)), ' '.join(set(more_input_info) & set(input_info)))  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)  # entries from <more_input_info> overwrite any with the same uid
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname, input_info.values(), debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
# print utils.print_reco_event(line)


# formatting necessity
def getkey(uid_list):
    """Return the dictionary key for a cluster: its unique ids joined by ':'."""
    return ':'.join(uid_list)


# creates a dictionary with keys = unique_ids and values = annotations
annotations = {}
with open(args.infile.replace('.csv', '-cluster-annotations.csv')) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:  # there's a line for each cluster
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
        # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
        annotations[getkey(line['unique_ids'])] = line

# sort by size (largest cluster first); <sorted_clusters> holds the dictionary keys, not the annotations
sorted_clusters = sorted(annotations, key=lambda q: len(annotations[q]['unique_ids']), reverse=True)
#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])]  # checks if the cluster contains ANY non-functional sequences

# total size of repertoire (number sequences)
# NOTE each entry of <sorted_clusters> is the colon-joined key *string*, so len(cluster) would count characters;
# the number of sequences in a cluster is the length of its 'unique_ids' list
n_total = sum(len(annotations[cluster]['unique_ids']) for cluster in sorted_clusters)
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """Return sequence info read from a .csv/.tsv file or, for any other suffix, via utils.read_fastx().

    Arguments:
      infname: path of the input file.
      is_data: if False, every line must contain 'v_gene', and a deep copy of the
        line is stored in <reco_info>.
      n_max_queries: if > 0, stop after this many queries have been added (also
        passed to utils.read_fastx()).
      args: optional options object. Attributes read here: queries, abbreviate,
        istartstop, name_column, seq_column, reco_ids, seed_unique_id.
      glfo: only used to print a warning if it is None for non-data input.
      simglfo: if not None, used to add implicit info to each <reco_info> entry.

    Returns:
      (input_info, reco_info). <input_info> is an OrderedDict keyed by uid with
      'unique_ids' and 'seqs' single-element lists; <reco_info> is None if
      <is_data>, otherwise an OrderedDict keyed by uid.

    Raises:
      Exception on a missing sequence column, a multi-seq line, a uid seen
      twice, a sequence character not in utils.alphabet, missing simulation
      info, or if no sequences were read.
    """

    if not is_data and glfo is None:
        print ' WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:  # the regex matched, but the suffix has extra characters (e.g. '.csvx')
            assert False
        seqfile = open(infname)  # NOTE(review): never explicitly closed in this function
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True, queries=(args.queries if args is not None else None), n_max_queries=n_max_queries)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:  # <potential_names> is only defined (and only used) when abbreviating
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            # restrict to the half-open line range [istartstop[0], istartstop[1])
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            # move user-specified columns over to the internal key names
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:  # once set, every subsequent line gets a counter-based name
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

        # delete every allowed character: anything left over is unexpected
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception('unexpected character (not among %s) in input sequence with id %s:\n %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        input_info[uid] = {'unique_ids': [uid, ], 'seqs': [inseq, ]}

        # only checked for the first query that gets added
        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print ' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
def extract_seqs(fnam):
    '''
    Reads a partis cluster-annotations file and extracts relevant information and sequences.

    Germline info is read once: from '<first path element>/_output/<last path
    element>/hmm/germline-sets' (derived from the part of <fnam> before
    '_partitions') if args.allele_finding is set, otherwise from the default
    human germline directory under <partis_path>.

    For every row (clonal family) that can be processed, the naive sequence and
    the input sequences are trimmed with repair_seq(), sequences with indels,
    partis-annotated stops or failing the repair_seq() keep check are dropped,
    and the remainder is deduplicated on the amino acid level. Families with
    fewer than args.MIN_OBS deduplicated sequences are skipped.

    Returns (sequences_i, info_i):
      sequences_i: list of ['naive_seq', <naive amino acid sequence>] entries.
      info_i: list of dicts with keys 'fnam', 'v_gene', 'd_gene', 'j_gene',
        'naive_seq', 'naive_seq_DNA', 'Nmuts', 'AAseqs' and 'DNAseqs'. The last
        three lists are index-aligned: entry k of each refers to the same
        deduplicated sequence.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        data = list(csv.DictReader(fh))

    sequences_i = list()
    info_i = list()

    # Work out the germline directory up front so that it is defined in both
    # modes (it is also reported when an annotation fails, see below):
    if args.allele_finding:
        fnam_base = fnam.split('_partitions')[0].split('/')
        glfo_dir = '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1])
    else:
        glfo_dir = partis_path + '/data/germlines/human'
    glfo = glutils.read_glfo(glfo_dir, locus=args.LOCUS)

    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            # Match on the message text: testing membership on the exception
            # object itself never does a substring search.
            if 'failed annotation' in str(e):
                print('Reading from')
                print(glfo_dir)
                print(e)
            continue

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except is meant to provide backwards compatability:
        try:
            lseq = row['input_seqs'][:]
        except KeyError:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))

        # Only keep sequences without indels and stop codons and minimum length amino acid length (QC):
        ### ir == '' or s == ir  <-- No indels
        ### stop is False        <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(..., keep_check=True)  <-- Checks whether the sequence is long enough or has stop codons
        keep_idx = [bool((ir == '' or s == ir) and stop is False and
                         repair_seq(s, naiveDNA, vj_bounds, keep_check=True))
                    for s, ir, stop in zip(lseq, ir_lseq, stop_seq)]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep]
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # And mutation frequencies:
        mut_freqs = [f for f, keep in zip(row['mut_freqs'], keep_idx) if keep]
        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts:
        Nmuts = [int(round(float(f) * len(s.strip('N')))) for f, s in zip(mut_freqs, lseq)]

        # Deduplicate AAseqs and lseq according to the duplications on amino acid level.
        # <aa_order> records first-seen order, so that the deduplicated amino
        # acid list below lines up with <lseq_unique> (a plain dict does not
        # guarantee any iteration order here):
        aa_order = list()
        lAAseq_dict = dict()
        lseq_unique = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_dict:
                lAAseq_dict[aa].append(i)
            else:
                lAAseq_dict[aa] = [i]
                aa_order.append(aa)
                lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds))
        assert(len(lAAseq_dict) == len(lseq_unique))

        # Make the deduplicated sequence list and the mutation rates
        # (mean mutation count over the duplicates, rounded):
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        for aa in aa_order:
            Nmut_list = [float(Nmuts[i]) for i in lAAseq_dict[aa]]
            lAAseq_dedup.append(aa)
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(lseq_unique))

        # Exclude small clonal families after all the QC and deduplication:
        if len(lAAseq_dedup) < args.MIN_OBS:
            continue

        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:],
                       'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]})

    return(sequences_i, info_i)
def partis_naive_seq(lseq, fnam):
    '''
    Given a number of sequences infer the naive sequence using partis.

    The sequences are written to a randomly named fasta file in TMPDIR, partis
    is run on it in 'partition' mode (stdout goes to a log file in TMPDIR), and
    the first row of the resulting cluster-annotations file is used. The naive
    sequence and the first input sequence are passed through
    repair_new_naive(); any codon of the naive sequence that translates to a
    stop is replaced by the corresponding codon of the first input sequence.

    lseq: iterable of input sequences.
    fnam: not used by this function; kept so existing callers keep working.

    Returns the (possibly stop-codon repaired) naive DNA sequence.
    Re-raises any exception from processing the partis output after printing
    it. The temporary files are removed in all cases.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'

    # Everything that can create temporary files is inside the try, so that
    # the clean up below also runs if writing the input file fails half way.
    try:
        # Write input fasta file for partis:
        with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho:
            for i, s in enumerate(lseq):
                fho.write('>{}\n{}\n'.format(str(i), s))

        # Run partis:
        cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
        # os.system(cmd)  # Print partis STDOUT to screen
        os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh:
            data = list(csv.DictReader(fh))
        # There should really only be one clonal family, but there often are more, so just take the first (largest)

        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(data[0])  # Process dataframe row
            # The default germline set is used here (not a per-sample one):
            glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
            utils.add_implicit_info(glfo, data[0])  # Adding germline info
        except Exception as e:
            print(e)
            raise  # bare raise keeps the original traceback

        naiveDNA = data[0]['naive_seq'][:]
        first_lseq = data[0]['input_seqs'][:][0]
        vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1])
        naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds)
        first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds)
        # A length mismatch is only reported, not treated as fatal:
        if len(first_lseq) != len(naiveDNA):
            print('len(first_lseq) != len(data[0]["naive_seq"])')
            print(len(first_lseq))
            print(first_lseq)
            print(len(naiveDNA))
            print(naiveDNA)

        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print('Found stop codon in inferred naive sequnce, will replace with input sequence codon.')
            print('Before replacement: {}'.format(naiveDNA))
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(vj_bounds[0], vj_bounds[1], 3):
                if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()):
                    naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3]
            naiveDNA = ''.join(naiveDNA_l)
            print('After replacement: {}'.format(naiveDNA))
            if naiveDNA == first_lseq:
                print('Complaining to say naiveDNA == first_lseq (nothing bad just to be sure the repair is not just replacing the naive sequence with the input entirely)')
        return(naiveDNA)
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
def extract_seqs(fnam, uid2iso):
    '''
    Reads a partis cluster-annotations file and extracts relevant information and sequences.

    The locus is taken from the part of <fnam> before '_partitions' ('igk' if
    it contains 'IgK', 'igl' if it contains 'IgL', otherwise 'igh'), and
    germline info is read from '<cwd>/_output/<that prefix>/hmm/germline-sets'.

    Rows (clonal families) that cannot be processed, are flagged invalid, have
    a CDR3 length that is not a multiple of three, have a naive sequence that
    fails the repair_seq() keep check, or have fewer than MIN_OBS sequences
    left after QC are skipped.

    fnam: path of the cluster-annotations csv file.
    uid2iso: mapping from sequence uid to isotype; every uid of a kept sequence
      (including its duplicates) is looked up in it.

    Returns (sequences_i, info_i):
      sequences_i: list of ['naive_seq', <naive amino acid sequence>] entries.
      info_i: list of dicts with keys 'fnam', 'v_gene', 'd_gene', 'j_gene',
        'naive_seq', 'naive_seq_DNA', 'Nmuts', 'abundance', 'AAseqs',
        'DNAseqs', 'UID', 'isotype', 'CDR3_start' and 'CDR3_end'.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        data = list(csv.DictReader(fh))

    sequences_i = list()
    info_i = list()

    # None of this depends on the row, so work it out once:
    fnam_base = fnam.split('_partitions')[0]
    if 'IgK' in fnam_base:
        locus = 'igk'
    elif 'IgL' in fnam_base:
        locus = 'igl'
    else:
        locus = 'igh'
    glfo_dir = '{}/_output/{}/hmm/germline-sets'.format(os.getcwd(), fnam_base)
    # Read lazily inside the per-row try below: a failure to read the germline
    # set is then still reported and skipped per row, but a successful read is
    # done only once instead of once per row.
    glfo = None

    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            if glfo is None:
                glfo = glutils.read_glfo(glfo_dir, locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            print('First skip')
            print(e)
            continue

        # One uid list per sequence: its duplicates (if any) followed by its own uid
        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        if row['invalid'] is True or (cdr3_bounds[0]-cdr3_bounds[1]) % 3 != 0:
            print('Invalid clonal family, skipping.')
            continue

        # There has been a name change and this try/except
        # is meant to provide backwards compatability:
        try:
            lseq = row['input_seqs'][:]
        except KeyError:
            lseq = row['seqs'][:]

        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            if len(lseq) > 100:
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))

        # Only keep sequences without indels and stop codons and minimum length amino acid length:
        ### ir == '' or s == ir  <-- No indels
        ### stop is False        <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(..., keep_check=True)  <-- Checks whether the sequence is long enough or has stop codons
        keep_idx = [bool((ir == '' or s == ir) and stop is False and
                         repair_seq(s, naiveDNA, vj_bounds, keep_check=True))
                    for s, ir, stop in zip(lseq, ir_lseq, stop_seq)]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep]
        # Exclude small clonal families:
        if len(lseq) < MIN_OBS:
            continue
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]

        # Throw out the per-sequence info for the discarded sequences:
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep]
        uids = [s for s, keep in zip(uids, keep_idx) if keep]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))

        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:],
                       'abundance': abundance[:], 'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:],
                       'isotype': iso_list[:], 'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})

    return(sequences_i, info_i)
def write_partis_data_from_annotations(
        output_genes,
        output_seqs,
        path_to_annotations,
        metadata,
        filters=None,
        seq_filters=None,
        min_clonal_family_size=0,
        min_seq_len=0,
        max_mut_pct=1.,
        min_mut_pct=0.,
        clone_str='',
        region='v',
        germline_family='v',
    ):
    """
    Function to read partis annotations csv

    @param output_genes: path of the germline csv file to write
    @param output_seqs: path of the sequence csv file to write
    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc. None (the default) means no filtering
    @param seq_filters: same as filters, but for sequences, e.g., {'indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length; sequences must be strictly longer than this, not counting 'n' characters
    @param max_mut_pct: maximum mutation percentage (exclusive)
    @param min_mut_pct: minimum mutation percentage (inclusive)
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj'); 'vdj' uses the full naive sequence and 'seqs',
        any single region uses that region's '<region>_gl_seq' and '<region>_qr_seqs'
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    @raise ValueError: if region or germline_family is not one of the allowed values

    @write genes to output_genes and seqs to output_seqs
    """
    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" % (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" % (region, regions))

    # None instead of {} as default, so no dict object is shared between calls
    filters = {} if filters is None else filters
    seq_filters = {} if seq_filters is None else seq_filters

    # partis is expected in a 'partis' directory next to this file; it has to be
    # on sys.path before its modules can be imported
    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    with open(output_genes, 'w') as genes_file, open(output_seqs, 'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file, ['germline_name', 'germline_sequence'])
        gene_writer.writeheader()

        seq_header = [
            'germline_name',
            'sequence_name',
            'sequence',
            'germline_family',
            'v_gene',
            'region',
        ]
        # every dataset-level key is written alongside each sequence; with no
        # datasets at all only the headers are written
        if partition_info:
            seq_header += list(partition_info[0])

        seq_writer = csv.DictWriter(seqs_file, seq_header)
        seq_writer.writeheader()

        for data_idx, data_info in enumerate(partition_info):
            if any(data_info[key] not in values for key, values in filters.items()):
                continue
            glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    if line['v_gene'] == '':  # failed annotations
                        continue

                    # add goodies from partis
                    process_input_line(line)
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    if n_seqs < min_clonal_family_size:
                        # don't take small clonal families---for data quality purposes
                        continue

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                    else:
                        # use the requested region's sequences (not always the v ones)
                        gl_seq = line['{}_gl_seq'.format(region)].lower()
                        all_seqs = [seq.lower() for seq in line['{}_qr_seqs'.format(region)]]

                    idx_list = [
                        # frequency filter
                        set(i for i, val in enumerate(line['mut_freqs'])
                            if val < max_mut_pct and val >= min_mut_pct),
                        # sequence length filter ('n' characters don't count)
                        set(i for i, val in enumerate(all_seqs)
                            if len(val.replace('n', '')) > min_seq_len),
                    ]
                    for key, values in seq_filters.items():
                        idx_list.append(set(i for i, val in enumerate(line[key]) if val in values))

                    good_seq_idx = set.intersection(*idx_list)
                    if not good_seq_idx:
                        # no sequences after filtering... skip
                        continue

                    gl_name = 'clone{}-{}-{}'.format(data_idx, idx, clone_str)
                    gene_writer.writerow({
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,
                    })

                    # sorted so that the rows come out in a reproducible order
                    for good_idx in sorted(good_seq_idx):
                        base_dict = {
                            'germline_name': gl_name,
                            'sequence_name': '-'.join([gl_name, line['unique_ids'][good_idx]]),
                            'sequence': all_seqs[good_idx],
                            'germline_family': line['{}_gene'.format(germline_family)][:5],
                            'v_gene': line['v_gene'],
                            'region': region,
                        }
                        # dataset-level info overrides the per-sequence keys on a name clash
                        for key, value in data_info.items():
                            base_dict[key] = value
                        seq_writer.writerow(base_dict)
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None, name_column=None, seq_column=None, seed_unique_id=None, abbreviate_names=False):
    """
    Read sequence info from a .csv, .tsv, .fasta/.fa or .fastq/.fq file.

    Arguments:
      fname: input file; the format is chosen from its extension
      is_data: if False, every line must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
      glfo: germline info passed to utils.add_implicit_info() for simulation lines (skipped, with a warning, if None)
      n_max_queries: stop after accepting this many sequences (no limit if not positive)
      queries: if set, only keep sequences whose id is in this collection
      reco_ids: if set, only keep lines whose 'reco_id' is in this collection
      name_column, seq_column: headers expected in the file (default to 'unique_id' and 'seq')
      seed_unique_id: if set, this id has to be among the sequences that are kept
      abbreviate_names: if set, each id is replaced by the return value of abbreviate()

    Returns (input_info, reco_info). <input_info> is an OrderedDict from unique id to a dict with keys 'unique_id' and 'seq'.
    <reco_info> is None if <is_data>, otherwise an OrderedDict from unique id to a deep copy of the processed line.

    Raises Exception for: unhandled file extension, missing mandatory headers, forbidden characters in an id, duplicate ids,
    missing simulation info, nothing read, or seed id not found.
    """

    # WARNING defaults for <name_column> and <seq_column> also set in partis (since we call this from places other than partis, but we also want people to be able set them from the partis command line)
    internal_name_column = 'unique_id'  # key we use in the internal dictionaries
    internal_seq_column = 'seq'
    if name_column is None:  # header we expect in the file
        name_column = internal_name_column
    if seq_column is None:
        seq_column = internal_seq_column

    if not is_data and glfo is None:
        print(' WARNING glfo is None, so not adding implicit info')

    delimiters = {'.csv' : ',', '.tsv' : '\t'}
    suffix = os.path.splitext(fname)[1]

    input_info = OrderedDict()
    reco_info = None if is_data else OrderedDict()
    n_queries = 0
    found_seed = False
    used_names = set()  # for abbreviating
    potential_names = list(string.ascii_lowercase) if abbreviate_names else None

    seqfile = None  # only set for csv/tsv input, and closed in the <finally> below (the csv reader is lazy, so the file has to stay open during the loop)
    try:
        if suffix in delimiters:
            seqfile = opener('r')(fname)
            reader = csv.DictReader(seqfile, delimiter=delimiters[suffix])
        else:
            if suffix == '.fasta' or suffix == '.fa':
                ftype = 'fasta'
            elif suffix == '.fastq' or suffix == '.fq':
                ftype = 'fastq'
            else:  # this also catches things like '.csvx', which used to hit a bare assert
                raise Exception('couldn\'t handle file extension for %s' % fname)
            reader = []
            for seq_record in SeqIO.parse(fname, ftype):
                # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
                # NOTE can't pre-filter on the raw name if we're abbreviating, since <queries> is compared to the abbreviated id in the loop below
                if queries is not None and not abbreviate_names and seq_record.name not in queries:
                    continue
                reader.append({name_column : seq_record.name, seq_column : str(seq_record.seq).upper()})
                if n_max_queries > 0 and len(reader) >= n_max_queries:
                    break

        for line in reader:
            if name_column not in line or seq_column not in line:
                raise Exception('mandatory headers \'%s\' and \'%s\' not both present in %s    (you can set column names with --name-column and --seq-column)' % (name_column, seq_column, fname))
            if name_column != internal_name_column or seq_column != internal_seq_column:
                translate_columns(line, {name_column : internal_name_column, seq_column: internal_seq_column})
            utils.process_input_line(line)
            unique_id = line[internal_name_column]

            ## Actually deal with colons properly since they come up VERY OFTEN in sequence IDs
            unique_id = unique_id.replace(":", "_")
            if any(fc in unique_id for fc in utils.forbidden_characters):
                raise Exception('found a forbidden character (one of %s) in sequence id \'%s\' -- sorry, you\'ll have to replace it with something else' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), unique_id))

            if abbreviate_names:
                unique_id = abbreviate(used_names, potential_names, unique_id)

            # if command line specified query or reco ids, skip other ones
            if queries is not None and unique_id not in queries:
                continue
            if reco_ids is not None and line['reco_id'] not in reco_ids:
                continue

            if unique_id in input_info:
                raise Exception('found id %s twice in file %s' % (unique_id, fname))

            if seed_unique_id is not None and unique_id == seed_unique_id:
                found_seed = True

            input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[internal_seq_column]}

            if n_queries == 0 and is_data and 'v_gene' in line:
                print(' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % fname)

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s' % fname)
                reco_info[unique_id] = copy.deepcopy(line)
                reco_info[unique_id]['unique_id'] = unique_id  # in case we're abbreviating
                if glfo is not None:
                    utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False, existing_implicit_keys=('cdr3_length', ))  # single seqs, since each seq is on its own line in the file

            n_queries += 1
            if n_max_queries > 0 and n_queries >= n_max_queries:
                break
    finally:
        if seqfile is not None:
            seqfile.close()

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for     queries: %s   reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    if seed_unique_id is not None and not found_seed:
        raise Exception('couldn\'t find seed %s in %s' % (seed_unique_id, fname))

    return (input_info, reco_info)
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """
    Read sequence info from a .csv, .tsv, .fasta/.fa or .fastq/.fq file.

    Arguments:
      infname: input file; the format is chosen from its extension
      is_data: if False, every line must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
      n_max_queries: stop after accepting this many sequences (no limit if not positive)
      args: optional command line namespace; the attributes read here are queries, reco_ids, abbreviate, istartstop,
            name_column, seq_column, seed_unique_id, seed_seq and random_seed_seq (seed_unique_id may also be set here)
      glfo: only used to decide whether to print the 'glfo is None' warning
      simglfo: germline info passed to utils.add_implicit_info() for simulation lines (skipped if None)

    Returns (input_info, reco_info). <input_info> is an OrderedDict from uid to {'unique_ids' : [uid], 'seqs' : [seq]}.
    <reco_info> is None if <is_data>, otherwise an OrderedDict from uid to a deep copy of the processed line.

    Raises Exception for: unhandled file extension, no sequence column, multi-seq lines, duplicate uids, unexpected
    characters in a sequence, missing simulation info, inconsistent --istartstop/--queries/--reco-ids/seed arguments,
    or if nothing was read.
    """

    if not is_data and glfo is None:
        print(' WARNING glfo is None, so not adding implicit info')

    delimiters = {'.csv' : ',', '.tsv' : '\t'}
    suffix = os.path.splitext(infname)[1]

    input_info = OrderedDict()
    reco_info = None if is_data else OrderedDict()
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    abbreviating = args is not None and args.abbreviate
    potential_names = list(string.ascii_lowercase) if abbreviating else None
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1

    seqfile = None  # only set for csv/tsv input, and closed in the <finally> below (the csv reader is lazy, so the file has to stay open during the loop)
    try:
        if suffix in delimiters:
            seqfile = opener('r')(infname)
            reader = csv.DictReader(seqfile, delimiter=delimiters[suffix])
        else:
            if suffix == '.fasta' or suffix == '.fa':
                ftype = 'fasta'
            elif suffix == '.fastq' or suffix == '.fq':
                ftype = 'fastq'
            else:  # this also catches things like '.csvx', which used to hit a bare assert
                raise Exception('couldn\'t handle file extension for %s' % infname)
            reader = []
            already_printed_forbidden_character_warning = False
            for seq_record in SeqIO.parse(infname, ftype):
                # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
                # NOTE can't pre-filter on the raw name if we're abbreviating, since --queries is compared to the abbreviated uid in the loop below
                if args is not None and args.queries is not None and not args.abbreviate and seq_record.name not in args.queries:
                    continue
                uid = seq_record.name
                if any(fc in uid for fc in utils.forbidden_characters):
                    translated_uid = uid.translate(utils.forbidden_character_translations)
                    if not already_printed_forbidden_character_warning:
                        print(' %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, translated_uid))
                        already_printed_forbidden_character_warning = True
                    uid = translated_uid
                reader.append({'unique_ids' : uid, 'input_seqs' : str(seq_record.seq).upper()})
                if n_max_queries > 0 and len(reader) >= n_max_queries:
                    break

        for line in reader:
            iline += 1
            if args is not None:
                if args.istartstop is not None:
                    if iline < args.istartstop[0]:
                        continue
                    if iline >= args.istartstop[1]:
                        break
                # NOTE pop() before assigning, so that passing the internal key itself as the column name doesn't delete the value we just set
                if args.name_column is not None:
                    line['unique_ids'] = line.pop(args.name_column)
                if args.seq_column is not None:
                    line['input_seqs'] = line.pop(args.seq_column)
            if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
                print(' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning')))
                iname = 0
            if iname is not None:
                line['unique_ids'] = '%09d' % iname
                iname += 1
            if 'input_seqs' not in line and 'seqs' not in line and 'seq' not in line:
                raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
            utils.process_input_line(line)
            if len(line['unique_ids']) > 1:
                raise Exception('can\'t yet handle multi-seq csv input files')
            uid = line['unique_ids'][0]
            inseq = line['input_seqs'][0]

            # NOTE forbidden characters are only handled in the fast[aq] loop above, since otherwise we have no way of knowing how to interpret special characters... so if someone passes in a csv with special characters as part of a uid this will break

            if args is not None:
                if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                    uid = abbreviate(used_names, potential_names, uid)
                if args.queries is not None and uid not in args.queries:
                    continue
                if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                    continue
                if args.seed_unique_id is not None and uid == args.seed_unique_id:
                    found_seed = True

            if uid in input_info:
                raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

            if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
                raise Exception('unexpected character (not among %s) in input sequence with id %s:\n %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))

            input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

            if n_queries_added == 0 and is_data and 'v_gene' in line:
                print(' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname)

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s' % infname)
                reco_info[uid] = copy.deepcopy(line)
                if simglfo is not None:
                    utils.add_implicit_info(simglfo, reco_info[uid])

            n_queries_added += 1
            if n_max_queries > 0 and n_queries_added >= n_max_queries:
                break
    finally:
        if seqfile is not None:
            seqfile.close()

    if args is not None:
        if args.istartstop is not None:
            n_lines_in_file = iline + 1
            if n_lines_in_file < args.istartstop[1]:
                raise Exception('--istartstop upper bound %d larger than number of lines in file %d' % (args.istartstop[1], n_lines_in_file))
        if len(input_info) == 0:
            if args.queries is not None:
                raise Exception('didn\'t find the specified --queries (%s) in %s' % (str(args.queries), infname))
            if args.reco_ids is not None:
                raise Exception('didn\'t find the specified --reco-ids (%s) in %s' % (str(args.reco_ids), infname))
        if args.queries is not None:
            missing_queries = set(args.queries) - set(input_info)
            extra_queries = set(input_info) - set(args.queries)  # this is just checking for a bug in the code just above here...
            if len(missing_queries) > 0:
                raise Exception('didn\'t find some of the specified --queries: %s' % ' '.join(missing_queries))
            if len(extra_queries) > 0:
                raise Exception('extracted uids %s that weren\'t specified with --queries' % ' '.join(extra_queries))
        if args.seed_unique_id is not None:
            if found_seed:
                if args.seed_seq is not None:  # we don't check whether the two sequences agree, we just don't allow both
                    raise Exception('--seed-seq was specified, but --seed-unique-id was also present in input file')
            else:
                if args.seed_seq is None:
                    raise Exception('couldn\'t find seed unique id %s in %s' % (args.seed_unique_id, infname))
                add_seed_seq(args, input_info, reco_info, is_data)
        elif args.seed_seq is not None:
            args.seed_unique_id = 'seed-seq'
            add_seed_seq(args, input_info, reco_info, is_data)
        elif args.random_seed_seq:  # already checked (in bin/partis) that other seed args aren't set
            args.seed_unique_id = random.choice(list(input_info.keys()))
            print(' chose random seed unique id %s' % args.seed_unique_id)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
def _get_clonal_family_stats(path_to_annotations, metadata, use_np=False, use_immunized=False, locus=''):
    '''
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata
    @param use_np: use nonproductive seqs only? (otherwise every sequence counts)
    @param use_immunized: for Cui data, only use datasets whose 'group' is 'immunized'?
    @param locus: only use datasets with this locus; an empty string (the default) means no locus filtering

    @return list of dicts (one per annotation line with at least one usable sequence) with keys
        'n_taxa', 'germline_sequence', 'germline_name' and 'v_call'
    '''

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        # NOTE these keys are per-sequence lists, so they have to be combined
        # element by element (or-ing the lists themselves just returns the first non-empty list)
        def good_seq(seqs):
            return [
                stop or not in_frame or mutated
                for stop, in_frame, mutated in zip(seqs['stops'], seqs['in_frames'], seqs['mutated_invariants'])
            ]
    else:
        # return all sequences
        def good_seq(seqs):
            return [True for _ in seqs['seqs']]

    partis_python_path = os.path.dirname(os.path.realpath(__file__)) + '/partis' + '/python'

    all_germline_dicts = []
    for data_info in partition_info:
        if use_immunized and data_info['group'] != 'immunized':
            continue
        # only filter on locus if one was requested (skipping everything for the default '' made the default useless)
        if locus and data_info['locus'] != locus:
            continue

        # partis is only imported once we know there's something to read; don't add the same path on every iteration
        if partis_python_path not in sys.path:
            sys.path.insert(1, partis_python_path)
        from utils import add_implicit_info, process_input_line
        import glutils

        glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])
        with open(data_info['annotations_file'], "r") as csvfile:
            for idx, line in enumerate(csv.DictReader(csvfile)):
                if len(line['input_seqs']) == 0:
                    # sometimes data will have empty clusters (this is still the raw csv string)
                    continue
                # add goodies from partis
                process_input_line(line)
                add_implicit_info(glfo, line)
                n_good_seqs = sum(1 for is_good in good_seq(line) if is_good)
                if n_good_seqs == 0:
                    # no usable sequences in this cluster... skip
                    continue
                all_germline_dicts.append({
                    'n_taxa': n_good_seqs,
                    'germline_sequence': disambiguate(line['v_gl_seq'].lower()),
                    'germline_name': '-'.join([line['v_gene'], str(idx)]),
                    'v_call': line['v_gene'],
                })

    return all_germline_dicts
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """
    Read sequence info from a .csv, .tsv, .fasta/.fa or .fastq/.fq file.

    Arguments:
      fname: input file; the format is chosen from its extension
      is_data: if False, every line must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
      glfo: germline info used to recompute implicit info for simulation lines (skipped if None)
      n_max_queries: stop after accepting this many sequences (no limit if not positive)
      queries: if set, only keep sequences whose id is in this collection
      reco_ids: if set, only keep lines whose 'reco_id' is in this collection

    Returns (input_info, reco_info). <input_info> is an OrderedDict from unique id to a dict with keys 'unique_id' and 'seq'.
    <reco_info> is None if <is_data>, otherwise an OrderedDict from unique id to a (shallow) copy of the processed line.

    Raises Exception for: unhandled file extension, a ':' in a sequence id, missing simulation info, or if nothing was read.
    """

    # (name column, sequence column) that we expect for each kind of file
    default_columns = ('unique_id', 'seq')
    alternate_columns = ('name', 'nucleotide')

    suffix = os.path.splitext(fname)[1]

    input_info = OrderedDict()
    reco_info = None if is_data else OrderedDict()
    n_queries = 0

    seqfile = None  # only set for csv/tsv input, and closed in the <finally> below (the csv reader is lazy, so the file has to stay open during the loop)
    try:
        if suffix == '.csv':
            name_column, seq_column = default_columns
            seqfile = opener('r')(fname)
            reader = csv.DictReader(seqfile, delimiter=',')
        elif suffix == '.tsv':
            name_column, seq_column = alternate_columns
            seqfile = opener('r')(fname)
            reader = csv.DictReader(seqfile, delimiter='\t')
        else:
            if suffix == '.fasta' or suffix == '.fa':
                ftype = 'fasta'
            elif suffix == '.fastq' or suffix == '.fq':
                ftype = 'fastq'
            else:
                raise Exception('couldn\'t handle file extension for %s' % fname)
            name_column, seq_column = default_columns
            reader = []
            for seq_record in SeqIO.parse(fname, ftype):
                # if command line specified query ids, skip other ones (we don't look for reco ids in a fast[aq] file)
                if queries is not None and seq_record.name not in queries:
                    continue
                reader.append({name_column : seq_record.name, seq_column : str(seq_record.seq).upper()})
                if n_max_queries > 0 and len(reader) >= n_max_queries:
                    break

        for line in reader:
            if '.csv' in fname and name_column not in line:  # hackey hackey hackey
                name_column, seq_column = alternate_columns
            utils.process_input_line(line)
            unique_id = line[name_column]
            if ':' in unique_id:
                raise Exception('found a \':\' in sequence id \'%s\' -- you\'ll have to replace it with something else, as we use \':\'s internally to concatenate sequence ids' % unique_id)

            # if command line specified query or reco ids, skip other ones
            if queries is not None and unique_id not in queries:
                continue
            if reco_ids is not None and line['reco_id'] not in reco_ids:
                continue

            input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
                true_line = dict(line)
                if 'indels' not in line:  # TODO unhackify this
                    true_line['indels'] = None
                elif line['indels']['reversed_seq'] != '':  # TODO unhackify this
                    true_line['seq'] = line['indels']['reversed_seq']
                if glfo is not None:
                    utils.remove_implicit_info(true_line, multi_seq=False)
                    utils.add_implicit_info(glfo, true_line, multi_seq=False)  # each seq is on its own line in the file
                reco_info[unique_id] = true_line

            n_queries += 1
            if n_max_queries > 0 and n_queries >= n_max_queries:
                break
    finally:
        if seqfile is not None:
            seqfile.close()

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))

    return (input_info, reco_info)
# Example script: parse one annotation from a partis annotation csv, then print the partitions from a partition csv.
import csv
import sys
partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation), so there's nothing to process or print
            continue
        utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(glfo, line)  # add stuff to <line> that isn't written to the csv
        utils.print_reco_event(glfo['seqs'], line)
        # the slice below runs from the 'v' codon position through three bases past the 'j' codon position
        cdr3_start = line['codon_positions']['v']
        cdr3_stop = line['codon_positions']['j'] + 3
        print('')
        print(' should match the above:')
        print(' %s naive cdr3' % line['naive_seq'][cdr3_start : cdr3_stop])
        print(' %s mature' % line['indel_reversed_seqs'][0][cdr3_start : cdr3_stop])
        print('')
        break  # only show the first successful annotation

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """
    Read sequence info from a csv, tsv, fasta or fastq file.

    The file type is chosen by looking for '.csv', '.tsv', '.fasta'/'.fa' or '.fastq'/'.fq' anywhere in <fname>
    (in that order of precedence).

    Arguments:
      fname: input file
      is_data: if False, every line must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
      glfo: germline info passed to utils.add_match_info() for simulation lines (skipped if None)
      n_max_queries: stop after accepting this many sequences (no limit if not positive)
      queries: if set, only keep sequences whose id is in this collection
      reco_ids: if set, only keep lines whose 'reco_id' is in this collection

    Returns (input_info, reco_info). <input_info> is an OrderedDict from unique id to a dict with keys 'unique_id' and 'seq'.
    <reco_info> is None if <is_data>, otherwise an OrderedDict from unique id to a (shallow) copy of the processed line.

    Raises Exception for: unrecognized file format, missing simulation info, or if nothing was read.
    """

    # (name column, sequence column) that we expect for each kind of file
    default_columns = ('unique_id', 'seq')
    alternate_columns = ('name', 'nucleotide')

    input_info = OrderedDict()
    reco_info = None if is_data else OrderedDict()
    n_queries = 0

    seqfile = None  # only set for csv/tsv input, and closed in the <finally> below (the csv reader is lazy, so the file has to stay open during the loop)
    try:
        if '.csv' in fname:
            name_column, seq_column = default_columns
            seqfile = opener('r')(fname)
            reader = csv.DictReader(seqfile, delimiter=',')
        elif '.tsv' in fname:
            name_column, seq_column = alternate_columns
            seqfile = opener('r')(fname)
            reader = csv.DictReader(seqfile, delimiter='\t')
        elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
            name_column, seq_column = default_columns
            # NOTE have to look for the fastq extensions first, since '.fa' is a substring of '.fastq'
            ftype = 'fastq' if ('.fastq' in fname or '.fq' in fname) else 'fasta'
            reader = []
            for seq_record in SeqIO.parse(fname, ftype):
                # if command line specified query ids, skip other ones here, so that <n_max_queries> counts accepted sequences (same as for csv files)
                if queries is not None and seq_record.name not in queries:
                    continue
                reader.append({name_column : seq_record.name, seq_column : str(seq_record.seq).upper()})
                if n_max_queries > 0 and len(reader) >= n_max_queries:
                    break
        else:
            raise Exception('unrecognized file format %s' % fname)

        for line in reader:
            if '.csv' in fname and name_column not in line:  # hackey hackey hackey
                name_column, seq_column = alternate_columns
            utils.process_input_line(line,
                                     int_columns=('v_5p_del', 'd_5p_del', 'cdr3_length', 'j_5p_del', 'j_3p_del', 'd_3p_del', 'v_3p_del'),
                                     literal_columns=('indels', ))  # NOTE trailing comma: without it this is just the string 'indels'
            unique_id = line[name_column]

            # if command line specified query or reco ids, skip other ones
            if queries is not None and unique_id not in queries:
                continue
            if reco_ids is not None and line['reco_id'] not in reco_ids:
                continue

            input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
                true_line = dict(line)
                if 'indels' not in line:  # TODO unhackify this
                    true_line['indels'] = None
                elif line['indels']['reversed_seq'] != '':  # TODO unhackify this
                    true_line['seq'] = line['indels']['reversed_seq']
                if glfo is not None:
                    utils.add_match_info(glfo, true_line)
                reco_info[unique_id] = true_line

            n_queries += 1
            if n_max_queries > 0 and n_queries >= n_max_queries:
                break
    finally:
        if seqfile is not None:
            seqfile.close()

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))

    return (input_info, reco_info)
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False):
    """
    Read sequence info from a .csv or .tsv file, or (for any other suffix) from whatever utils.read_fastx() handles.

    Arguments:
      infname: input file; the format is chosen from utils.getsuffix(infname)
      is_data: if False, every line must also carry simulation info (a 'v_gene' column), which is returned in <reco_info>
      n_max_queries: stop after accepting this many sequences (no limit if not positive)
      args: optional command line namespace; the attributes read here are queries, reco_ids, abbreviate, istartstop,
            name_column, seq_column and seed_unique_id (it's also passed on to post_process())
      simglfo: germline info passed to utils.add_implicit_info() for simulation lines (skipped if None)
      quiet: if set, don't print the message about stopping at --n-max-queries

    Returns (input_info, reco_info). <input_info> is an OrderedDict from uid to {'unique_ids' : [uid], 'seqs' : [seq]}.
    <reco_info> is None if <is_data>, otherwise an OrderedDict from uid to a deep copy of the processed line.

    Raises Exception for: unhandled csv-like suffix, no sequence column, multi-seq lines, duplicate uids after
    abbreviation, unexpected characters in a sequence, missing simulation info, or if nothing was read.
    """

    delimiters = {'.csv' : ',', '.tsv' : '\t'}
    suffix = utils.getsuffix(infname)

    input_info = OrderedDict()
    reco_info = None if is_data else OrderedDict()
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    abbreviating = args is not None and args.abbreviate
    potential_names = list(string.ascii_lowercase) if abbreviating else None
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1

    seqfile = None  # only set for csv/tsv input, and closed in the <finally> below (the csv reader is lazy, so the file has to stay open during the loop)
    try:
        if suffix in delimiters:
            seqfile = open(infname)
            reader = csv.DictReader(seqfile, delimiter=delimiters[suffix])
        elif re.search(r'\.[ct]sv', suffix) is not None:  # looks like csv/tsv, but isn't exactly either (this used to be a bare assert)
            raise Exception('couldn\'t handle file extension for %s' % infname)
        else:
            reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True,
                                      n_max_queries=n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it messes with the istartstop treatment in the loop below
                                      queries=(args.queries if (args is not None and not args.abbreviate) else None))  # NOTE also can't filter on args.queries here if we're also translating

        for line in reader:
            iline += 1
            if args is not None:
                if args.istartstop is not None:
                    if iline < args.istartstop[0]:
                        continue
                    if iline >= args.istartstop[1]:
                        break
                # NOTE in both cases below we remove the old key *before* setting the new one, so that passing the internal key itself as the column name doesn't delete the value we just set
                if args.name_column is not None:
                    line['unique_ids'] = line.pop(args.name_column)
                if args.seq_column is not None:
                    column_seqs = line[args.seq_column]
                    if args.seq_column != 'seqs':  # weird backwards compatibility edge case: a 'seqs' column is left in place
                        del line[args.seq_column]
                    line['input_seqs'] = column_seqs
            if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
                print(' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning')))
                iname = 0
            if iname is not None:
                line['unique_ids'] = '%09d' % iname
                iname += 1
            if 'input_seqs' not in line and 'seq' not in line:
                raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
            utils.process_input_line(line)
            if len(line['unique_ids']) > 1:
                raise Exception('can\'t yet handle multi-seq csv input files')
            uid = line['unique_ids'][0]
            if uid in input_info:  # duplicate uid: tack on the first numerical suffix that makes it unique
                new_uid = uid
                iid = 2
                while new_uid in input_info:
                    new_uid = uid + '-' + str(iid)
                    iid += 1
                print(' %s uid %s already read from input file %s, so replacing with new uid %s' % (utils.color('yellow', 'warning'), uid, infname, new_uid))
                uid = new_uid
            inseq = line['input_seqs'][0]

            # NOTE it would be nice to check here for forbidden characters in <uid> (the fastx reader is asked to sanitize), but it's hard because we won't have read the csv properly if it has them

            if args is not None:
                if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                    uid = abbreviate(used_names, potential_names, uid)
                if args.queries is not None and uid not in args.queries:
                    continue
                if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                    continue
                if args.seed_unique_id is not None and uid == args.seed_unique_id:
                    found_seed = True

            if uid in input_info:  # can still happen if abbreviating gave us a uid that's already there
                raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

            unexpected_chars = sorted(set(ch for ch in inseq if ch not in utils.alphabet))
            if len(unexpected_chars) > 0:
                raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.nukes + utils.ambiguous_bases, uid, inseq))

            # da business
            input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

            if n_queries_added == 0 and is_data and 'reco_id' in line:
                print(' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname)

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s' % infname)
                reco_info[uid] = copy.deepcopy(line)
                if simglfo is not None:
                    utils.add_implicit_info(simglfo, reco_info[uid])

            n_queries_added += 1
            if n_max_queries > 0 and n_queries_added >= n_max_queries:
                if not quiet:  # <quiet> only affects this print statement at the moment
                    print(' --n-max-queries: stopped after reading %d queries from input file' % len(input_info))
                break
    finally:
        if seqfile is not None:
            seqfile.close()

    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
def run_partis(seq):
    '''
    Infer VDJ genes and the naive sequence using partis.

    Writes <seq> to a temporary fasta file in TMPDIR, runs `partis annotate` on it (locus and species come from the
    module-level <args>), and reads the first annotation from the output csv. Both the inferred naive sequence and the
    input sequence are then extended to the full germline V/J length (the input is padded with N, and the Ns are
    filled in by repair_seq()) and trimmed to a multiple of three. The temporary files are removed in all cases.

    Returns (naiveDNA, full_input_seq, (v_gene, d_gene, j_gene)).

    Raises Exception if partis gave no usable annotation, if the input contains a stop codon or indels, if too much of
    the 5-prime/3-prime end is missing, or if a stop codon in the inferred naive sequence can't be repaired.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR + '/' + inpf + '.fa', 'w') as fho:
        fho.write('>{}\n{}\n'.format('input_sequence', seq))
    # Run partis:
    cmd = '{}/bin/partis annotate --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(
        partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR + '/' + outf + '.csv') as fh:
            data = list(csv.DictReader(fh))
        if not data:
            raise Exception('partis did not write any annotation for the input sequence.')
        ann = data[0]
        if ann.get('v_gene') == '':  # partis writes an empty gene for annotations that failed
            raise Exception('partis failed to annotate the input sequence.')

        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(ann)       # Process dataframe row
            utils.add_implicit_info(glfo, ann)  # Adding germline info
        except Exception as e:
            print(e)
            raise  # bare raise, so the original traceback is kept

        # 'stops' is read like the other per-sequence keys used below ('indelfos', 'input_seqs'), i.e. as a list
        # with one entry per sequence, so an identity check against True would never fire; a plain bool still works
        stops = ann['stops']
        if isinstance(stops, (list, tuple)):
            has_stop = any(stops)
        else:
            has_stop = stops is True
        if has_stop:
            raise Exception(
                'Input sequence contain stop codon. This is no valid.')
        if ann['v_5p_del'] > 30 or ann['j_3p_del'] > 12:
            raise Exception(
                'Incomplete input sequence error. 5-prime end missing {} nt and 3-prime missing {} nt. Max allowed is 30 and 12, respectively.'
                .format(ann['v_5p_del'], ann['j_3p_del']))
        if ann['indelfos'][0]['indels']:
            raise Exception(
                'Input sequence contains indels, this is currently not supported.'
            )

        # Extract full size VDJ sequence for both the inferred naive and the input:
        full_gl_v = glfo['seqs']['v'][ann['v_gene']]  # Germline V
        full_gl_j = glfo['seqs']['j'][ann['j_gene']]  # Germline J
        gl_v_5p_del = full_gl_v[:ann['v_5p_del']]  # 5-prime not included in input
        gl_j_3p_del = full_gl_j[(len(full_gl_j) - ann['j_3p_del']):]  # 3-prime not included in input
        naiveDNA = gl_v_5p_del + ann['naive_seq'] + gl_j_3p_del  # Add the missing positions
        full_input_seq = 'N' * ann['v_5p_del'] + ann['input_seqs'][0] + 'N' * ann['j_3p_del']  # N pad the input sequence
        assert (len(naiveDNA) == len(full_input_seq))

        # Remove the untranslated end:
        if len(naiveDNA) % 3 != 0:
            naiveDNA = naiveDNA[0:-(len(naiveDNA) % 3)]
        if len(full_input_seq) % 3 != 0:
            full_input_seq = full_input_seq[0:-(len(full_input_seq) % 3)]
        if len(naiveDNA) != len(full_input_seq):
            raise Exception(
                'Sequences not equally long after trimming.\nInput: {}\nNaive: {}\n.'
                .format(full_input_seq, naiveDNA))

        # Replace Ns in input sequence with naive DNA bases:
        full_input_seq = repair_seq(full_input_seq, naiveDNA[:])

        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print('Found stop codon in inferred naive sequnce, will replace with input sequence codon.')
            print(' '.join(['Before replacement:', naiveDNA]))
            naiveDNA_l = list(naiveDNA)
            for codon in range(0, len(naiveDNA), 3):
                if '*' == str(Seq(naiveDNA[codon:codon + 3], generic_dna).translate()):
                    naiveDNA_l[codon:codon + 3] = full_input_seq[codon:codon + 3]
            naiveDNA = ''.join(naiveDNA_l)
            print(' '.join(['After replacement:', naiveDNA]))
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            raise Exception('Naive sequence could not be repaired.')
        if naiveDNA == full_input_seq:
            print('Warning: input sequence is identical to the inferred naive sequence.')
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))

    return (naiveDNA, full_input_seq, (ann['v_gene'], ann['d_gene'], ann['j_gene']))