def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    """Import pre-aligned sequences from a tab-delimited file into ``sample``.

    Every row is turned into a ``VDJSequence`` (from its ``SEQUENCE_ID`` and
    ``SEQUENCE_IMGT`` columns, with ``.`` rewritten to ``-``) and aligned with
    ``create_alignment``.  Alignments whose aligned sequences compare equal
    under ``dnautils.equal`` are merged by summing their copy numbers.  The
    surviving alignments are validated with ``props.validate`` and stored via
    ``add_sequences``; any sequence raising ``AlignmentException`` along the
    way is recorded with ``add_noresults_for_vdj`` instead.

    Finally the sample's ``v_ties_len`` / ``v_ties_mutations`` are set to the
    mean V length / V mutation fraction of the stored alignments (left
    untouched when nothing was stored) and the session is committed.

    :param session: database session handed to the helper functions and
        committed at the end
    :param fmt: input flavour; only ``'adaptive'`` is special-cased, in which
        case each row is first converted by ``extract_adaptive_sequence``
    :param handle: open, iterable text handle of the tab-delimited input
    :param sample: sample the sequences belong to
    :param v_germlines: V germlines passed through to the alignment helpers
    :param j_germlines: J germlines passed through to the alignment helpers
    :param props: object whose ``validate(alignment)`` raises
        ``AlignmentException`` for alignments that should be rejected
    """
    reader = csv.DictReader(handle, delimiter='\t')
    # Alignments grouped by aligned-sequence length, so a new alignment is
    # only compared against candidates of the same length.
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                # The row could not be converted; record a placeholder
                # sequence so the failure is still accounted for.
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue

        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])

        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            aligned = alignment.sequence.sequence
            for other in uniques.setdefault(len(aligned), []):
                if dnautils.equal(other.sequence.sequence, aligned):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                # No equal sequence of this length has been seen yet.
                uniques[len(aligned)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    # Flatten the buckets, shortest sequences first.
    ordered = [aln for length in sorted(uniques) for aln in uniques[length]]

    lens = []
    muts = []
    for unique in ordered:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Report the sequence that actually failed.  The previous code
            # passed ``seq``, i.e. whatever row happened to be read last in
            # the loop above (and an unbound name for an empty file).
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if lens:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
def read_input(path):
    """Read every record of a FASTA/FASTQ file into ``VDJSequence`` objects.

    The file is parsed as FASTA when ``path`` ends in ``.fasta`` and as FASTQ
    otherwise.  A record is skipped silently when building its
    ``VDJSequence`` (including the quality conversion) raises ``ValueError``.
    No de-duplication happens here: one object is produced per record.

    :param path: path of the sequence file
    :returns: list of ``VDJSequence`` objects in file order
    """
    file_format = 'fasta' if path.endswith('.fasta') else 'fastq'
    records = SeqIO.parse(path, file_format)

    logger.info('Parsing input')
    sequences = []
    for rec in records:
        try:
            parsed = VDJSequence(
                seq_id=rec.description,
                sequence=str(rec.seq),
                quality=funcs.ord_to_quality(
                    rec.letter_annotations.get('phred_quality')))
        except ValueError:
            continue
        else:
            sequences.append(parsed)

    logger.info('There are {} sequences'.format(len(sequences)))
    return sequences
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    """Locally re-align a sample's problematic sequences to V/J germline ties.

    The sample's sequences flagged ``probable_indel_or_misalign`` and its
    ``NoResult`` rows are written to a FASTA file under ``temp`` and aligned
    against the V-tie index (via ``align_reference``; the log messages name
    bowtie2).  Whatever remains of each read after its V alignment is then
    aligned against the J-tie index, and a ``VDJAlignment`` is assembled for
    every read that passes the length checks below.

    :param session: database session used for the two queries
    :param sample: sample to process; its ``id``, ``v_ties_len`` and
        ``v_ties_mutations`` are read
    :param indexes: set-like collection of bucket names whose indexes already
        exist in ``temp``; the current bucket is added when it is missing
    :param temp: directory in which FASTA files and indexes are written
    :param v_germlines: V germlines (provides the buckets and the ties)
    :param j_germlines: J germlines (provides ties and ``upstream_of_cdr3``)
    :param nproc: forwarded unchanged to ``align_reference``
    :returns: list of dicts with keys ``r_type``, ``pk``, ``sample_id`` and
        ``alignment``; ``None`` when the sample has neither indels nor
        noresults
    """
    indels = session.query(Sequence.ai, Sequence.seq_id, Sequence.sample_id,
                           Sequence.sequence).filter(
        Sequence.sample_id == sample.id,
        Sequence.probable_indel_or_misalign == 1)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(NoResult.sample_id == sample.id)

    # NOTE(review): each count() is evaluated again for the log line below;
    # presumably every call issues its own query -- consider caching them.
    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
        sample.id, indels.count(), noresults.count()))

    # The index name is derived from the sample's mutation and length
    # buckets, so samples falling into the same bucket share one index.
    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''), len_bucket)
    sample_v_germlines = get_formatted_ties(
        v_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(
        j_germlines.all_ties(sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    # Write both record kinds to one FASTA file.  The header encodes the
    # record type and its keys as "name=value" fields separated by "|"; it
    # is split apart again in the J loop below.
    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(
            get_fasta({
                'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                    r.ai, r.sample_id, r.seq_id): r.sequence
                for r in indels
            }))
        fh.write(
            get_fasta({
                'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
                    r.pk, r.sample_id, r.seq_id): r.sequence
                for r in noresults
            }))

    # Per-read V alignment results, keyed by the reader's 'seq_id' field
    # (presumably the FASTA header written above -- verify in get_reader).
    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(
            align_reference(temp, 'v_genes_{}'.format(bucket), seq_path,
                            nproc)):
        # Shift the reported offset down by one (presumably 1-based ->
        # 0-based; confirm against the aligner's output format).
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
            min_size=CDR3_OFFSET, **line)
        # Nothing left after the V alignment means nothing to align to J.
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene], ref,
                                            seq, line['ref_offset'])
        # Skip reads whose gapped V reference is shorter than CDR3_OFFSET.
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    # Second pass: align what is left after the V alignment to the J index.
    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        # NOTE(review): dict.iteritems() exists only on Python 2.
        seqs = {
            k: v['v_rem_seq']
            for k, v in alignments.iteritems() if len(v['v_rem_seq']) > 0
        }
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(
            align_reference(temp, 'j_genes_{}'.format(bucket), seq_path,
                            nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        # Only ``ref`` and ``rem_seqs`` are used from this call; ``seq`` is
        # never read again in this loop.
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3, **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        # Rebuild the read from its V part plus remainder, then drop what is
        # left over after the J alignment.
        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        # Move the CDR3 end back by one for each inspected non-gap
        # reference character.
        # NOTE(review): for i == 0, ref[-i] is ref[0] (the FIRST character);
        # the other iterations look at the last upstream_of_cdr3 - 1
        # characters.  Confirm whether ref[-(i + 1)] was intended.
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end

        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        # Germline = gapped V reference, placeholders across the CDR3, then
        # the tail of the J reference.
        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        # Recover the record type and keys from the FASTA header.  For
        # Sequence records the second field is ``ai``; it is stored under
        # 'pk' in the task either way.
        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)
        ]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-')))
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        # Germline and sequence must line up position by position.
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        # NOTE(review): reads ``seq_start`` although ``seq_offset`` was
        # assigned above; presumably derived from it -- verify on
        # VDJAlignment.
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        # V length excludes the leading offset and gaps within the V region.
        alignment.v_length = (alignments[line['seq_id']]['cdr3_start'] -
                              alignment.seq_offset) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             float(alignment.v_length))
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
def parse_airr(line, v_germlines, j_germlines):
    """Convert one AIRR-style record into a proxied ``VDJAlignment``.

    :param line: mapping holding the record's fields (``sequence_id``,
        ``sequence_alignment``, ``rev_comp``, ``v_call``, ``j_call``,
        ``junction_aa``, ``germline_alignment``, ``cdr3``, ``cdr3_start``,
        the ``v_*`` / ``j_*`` coordinate and identity fields, ...)
    :param v_germlines: V germlines; ``get_ties`` is called with the V calls
    :param j_germlines: J germlines; ``get_ties`` is called with the J calls
    :returns: ``funcs.ClassProxy`` wrapping the populated ``VDJAlignment``
    :raises AlignmentException: when ``v_call``, ``j_call`` or
        ``junction_aa`` is empty, when the V call is not in the germline
        database, or when the computed CDR3 start is not at the expected
        position
    """
    seq = VDJSequence(
        seq_id=line['sequence_id'].replace('reversed|', ''),
        sequence=line['sequence_alignment'],
        rev_comp=line['rev_comp'] == 'T',
    )

    required = (line['v_call'], line['j_call'], line['junction_aa'])
    if not all(required):
        raise AlignmentException(seq, 'Missing v_gene, j_gene, or junction_aa')

    # Number of germline positions preceding the start of the V alignment.
    v_offset = int(line['v_germline_start']) - 1
    seq.pad(v_offset)

    try:
        v_germ_seq = v_germlines.get_ties(line['v_call'].split(','))
    except KeyError:
        raise AlignmentException(
            seq, 'V-gene {} not in germline database'.format(line['v_call'])
        )

    # Germline = ungapped V prefix that precedes the alignment, followed by
    # the germline alignment from the record.
    ungapped_v_prefix = v_germ_seq.replace('-', '')[:v_offset]
    germline = ''.join([ungapped_v_prefix, line['germline_alignment']])

    # Append the missing portion, if any, of the J to the germline
    j_germ_seq = j_germlines.get_ties(line['j_call'].split(','))
    append_j = len(j_germ_seq) - int(line['j_germline_end'])
    if append_j > 0:
        germline += j_germ_seq[-append_j:]
        seq.pad_right(append_j)

    gapped_seq, gaps_added = add_imgt_gaps(v_germ_seq, seq)
    gapped_germ = add_imgt_gaps(v_germ_seq, VDJSequence('', germline))
    germline = gapped_germ[0].sequence

    cdr3_start = int(line['cdr3_start']) - int(line['v_sequence_start'])
    # Push the start of the CDR3 based on number of IMGT gaps added.  Then
    # move it back by 3 because IgBLAST's CDR3 excludes the preserved
    # Cysteine
    cdr3_start += gaps_added - 3
    cdr3_start += gapped_seq.sequence[:cdr3_start].count('-')
    cdr3_start += v_offset
    cdr3_end = cdr3_start + len(line['cdr3']) + 6
    # If there is an insertion in the CDR3 but not junction, increase CDR3
    # length
    cdr3_end += germline[cdr3_end - 3:cdr3_end].count('-')

    cdr3_seq = gapped_seq.sequence[cdr3_start:cdr3_end]
    germline_cdr3 = germline[cdr3_start:cdr3_end]

    # Blank out the germline across the CDR3 and rebuild the sequence as a
    # plain string around the same boundaries.
    cdr3_mask = '.' * (cdr3_end - cdr3_start)
    germline = ''.join(
        [germline[:cdr3_start], cdr3_mask, germline[cdr3_end:]])
    seq_str = ''.join([
        gapped_seq.sequence[:cdr3_start],
        cdr3_seq,
        gapped_seq.sequence[cdr3_end:],
    ])

    total_insertions = line['v_germline_alignment'].count('-')
    correct_cdr3_start = CDR3_OFFSET + total_insertions
    if cdr3_start != correct_cdr3_start:
        raise AlignmentException(
            seq, 'CDR3 starts at {} instead of {} ({} insertions)'.format(
                cdr3_start, correct_cdr3_start, total_insertions))

    final_seq = VDJSequence(line['sequence_id'], seq_str.replace('.', '-'))
    alignment = funcs.ClassProxy(VDJAlignment(final_seq))
    alignment.germline = germline.replace('.', '-')
    alignment.v_gene = {GeneName(c) for c in line['v_call'].split(',')}
    alignment.j_gene = {GeneName(c) for c in line['j_call'].split(',')}
    alignment.cdr3_start = cdr3_start
    alignment.cdr3_num_nts = len(cdr3_seq)
    alignment.locally_aligned = True
    alignment.germline_cdr3 = germline_cdr3
    alignment.seq_offset = v_offset
    alignment.v_length = int(line['v_alignment_end'])
    alignment.j_length = (int(line['j_alignment_end']) -
                          int(line['j_alignment_start']))
    alignment.v_mutation_fraction = (100 - float(line['v_identity'])) / 100
    # Skipping the germline_cdr3 field and instead populating its
    # dependencies via the proxy
    alignment.j_match = float(line['j_identity']) * alignment.j_length / 100
    alignment.post_cdr3_length = len(alignment.sequence.sequence) - cdr3_end
    # Gap positions are taken from the strings that still carry the '.'
    # CDR3 mask, not from the '-'-normalised copies stored above.
    alignment.insertions = funcs.gap_positions(germline)
    alignment.deletions = funcs.gap_positions(seq_str)

    return alignment
def read_file(session, handle, sample, v_germlines, j_germlines, columns,
              remaps):
    """Align the sequences of a tab-delimited file using its own gene calls.

    Rows are first collapsed by ``_collapse_seqs``.  For every collapsed
    entry the ``IGHV...`` / ``IGHJ...`` names are extracted from the
    configured gene columns, J names are optionally remapped, and the names
    present in the germline collections are forced onto a ``VDJSequence``
    which is then analyzed.  Entries with identical analyzed sequences are
    merged by concatenating their ids; failures are stored through
    ``add_as_noresult``.  When anything aligned, the sample's
    ``v_ties_mutations`` / ``v_ties_len`` are set to the mean mutation
    fraction / V length and the unique sequences are stored with
    ``add_uniques``.  The session is committed every 1000 entries and once
    more at the end.

    :param session: database session
    :param handle: open, iterable text handle of the tab-delimited input
    :param sample: sample the sequences belong to
    :param v_germlines: container of known V gene names (membership-tested)
    :param j_germlines: container of known J gene names (membership-tested)
    :param columns: column configuration; ``v_gene``, ``j_gene``,
        ``full_sequence``, ``ties``, ``trim_to`` and ``max_padding`` are read
    :param remaps: ``None`` or a mapping from a J-name prefix to the name
        that replaces any J gene starting with that prefix
    """
    seqs = _collapse_seqs(session, sample,
                          csv.DictReader(handle, delimiter='\t'), columns)

    aligned_seqs = {}
    missed = 0
    # Number of entries seen.  Tracked separately from the loop index so the
    # summary below is right for empty input ("0 / 0", previously "1 / 0")
    # and uses the true total as denominator (previously one too small).
    processed = 0
    for index, seq in enumerate(seqs):
        processed = index + 1
        if index > 0 and index % 1000 == 0:
            logger.info('Finished {}'.format(index))
            session.commit()

        orig_v_genes = set(
            re.findall('IGHV[^ ,]+', seq['record'][columns.v_gene]))
        orig_j_genes = set(
            re.findall('IGHJ[^ ,]+', seq['record'][columns.j_gene]))

        if remaps is not None:
            remapped_j_genes = set()
            for j in orig_j_genes:
                # .items() works on Python 2 and 3 alike.
                for remap_from, remap_to in remaps.items():
                    if j.startswith(remap_from):
                        remapped_j_genes.add(remap_to)
                        break
                else:
                    # No prefix matched; keep the original name.
                    remapped_j_genes.add(j)
            orig_j_genes = remapped_j_genes

        # Real lists rather than filter(): on Python 3 filter() is lazy and
        # the len() checks below would raise TypeError.
        v_genes = [v for v in orig_v_genes if v in v_germlines]
        j_genes = [j for j in orig_j_genes if j in j_germlines]

        vdj = VDJSequence(seq['seq_ids'],
                          seq['record'][columns.full_sequence],
                          v_germlines, j_germlines,
                          force_vs=v_genes, force_js=j_genes)
        try:
            if not v_genes:
                raise AlignmentException('No valid V germline for {}'.format(
                    ','.join(sorted(orig_v_genes))))
            if not j_genes:
                raise AlignmentException('No valid J germline for {}'.format(
                    ','.join(sorted(orig_j_genes))))
            vdj.analyze()
            if vdj.sequence in aligned_seqs:
                aligned_seqs[vdj.sequence].ids += vdj.ids
            else:
                aligned_seqs[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
            missed += 1

    logger.info('Aligned {} / {} sequences'.format(processed - missed,
                                                   processed))

    logger.info('Collapsing ambiguous character sequences')
    if aligned_seqs:
        count = float(len(aligned_seqs))
        avg_mut = sum(
            v.mutation_fraction for v in aligned_seqs.values()) / count
        avg_len = sum(v.v_length for v in aligned_seqs.values()) / count
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        if columns.ties:
            add_uniques(session, sample, aligned_seqs.values(),
                        realign_mut=avg_mut, realign_len=avg_len,
                        trim_to=columns.trim_to,
                        max_padding=columns.max_padding)
        else:
            add_uniques(session, sample, aligned_seqs.values())
    session.commit()
def do_task(self, args):
    """Identify the sequences of one sample file and store the results.

    ``args`` must provide ``meta`` (sample metadata including
    ``sample_name``) and ``path`` (the FASTA/FASTQ file; FASTA when the path
    ends in ``.fasta``).  Identical reads are collapsed, every unique read
    is aligned with an ``AnchorAligner``, alignments with the same aligned
    sequence are merged, and failures are stored through
    ``add_as_noresult``.  When at least one read aligned, the sample's
    V-tie statistics are updated and the alignments are passed to
    ``add_uniques``.
    """
    meta = args['meta']
    self.info('Starting sample {}'.format(meta['sample_name']))
    study, sample = self._setup_sample(meta)

    input_path = args['path']
    file_format = 'fasta' if input_path.endswith('.fasta') else 'fastq'
    records = SeqIO.parse(input_path, file_format)

    # Collapse identical sequences: one VDJSequence per distinct read, with
    # the descriptions of all its occurrences collected in ``ids``.
    self.info('\tCollapsing identical sequences')
    vdjs = {}
    for rec in records:
        try:
            raw = str(rec.seq)
            if raw not in vdjs:
                vdjs[raw] = VDJSequence(
                    ids=[],
                    sequence=raw,
                    quality=funcs.ord_to_quality(
                        rec.letter_annotations.get('phred_quality')))
            vdjs[raw].ids.append(rec.description)
        except ValueError:
            continue

    alignments = {}
    aligner = AnchorAligner(self._v_germlines, self._j_germlines)
    self.info('\tAligning {} unique sequences'.format(len(vdjs)))

    # Attempt to align all unique sequences
    for raw in funcs.periodic_commit(self._session, sorted(vdjs)):
        # Take the entry out of the pending dict as soon as it is handled.
        vdj = vdjs.pop(raw)
        try:
            alignment = aligner.get_alignment(vdj)
            aligned = alignment.sequence.sequence
            if aligned not in alignments:
                # First time this aligned sequence is seen.
                alignments[aligned] = alignment
            else:
                # Already known: only carry the ids over.
                alignments[aligned].sequence.ids.extend(
                    alignment.sequence.ids)
        except AlignmentException as e:
            add_as_noresult(self._session, vdj, sample, str(e))
        except Exception:
            self.error(
                '\tUnexpected error processing sequence {}\n\t{}'.format(
                    vdj.ids[0], traceback.format_exc()))

    if alignments:
        count = float(len(alignments))
        avg_len = sum(a.v_length for a in alignments.values()) / count
        avg_mut = sum(
            a.v_mutation_fraction for a in alignments.values()) / count
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len

        self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                  'Length={}'.format(len(alignments), round(avg_mut, 2),
                                     round(avg_len, 2)))
        add_uniques(self._session, sample, alignments.values(),
                    self._props, aligner, avg_len, avg_mut)

    self._session.commit()
    self.info('Completed sample {}'.format(sample.name))
def do_task(self, args):
    """Identify the sequences of one sample file and store the results.

    ``args`` must provide ``meta`` (sample metadata), ``path`` (directory)
    and ``fn`` (file name; parsed as FASTA when it ends in ``.fasta``,
    otherwise as FASTQ).  Identical reads are collapsed, every unique read
    is analyzed, reads with the same analyzed sequence are merged, and
    failures are stored through ``add_as_noresult``.  When at least one read
    was analyzed successfully, the sample's V-tie statistics are updated and
    the sequences are passed to ``add_uniques``.
    """
    meta = args['meta']
    self.info('Starting sample {}'.format(meta.get('sample_name')))
    study, sample = self._setup_sample(meta)

    vdjs = {}
    parser = SeqIO.parse(
        os.path.join(args['path'], args['fn']),
        'fasta' if args['fn'].endswith('.fasta') else 'fastq')

    # Collapse identical sequences
    self.info('\tCollapsing identical sequences')
    for record in parser:
        seq = str(record.seq)
        if seq not in vdjs:
            vdjs[seq] = VDJSequence(
                ids=[],
                seq=seq,
                v_germlines=self._v_germlines,
                j_germlines=self._j_germlines,
                quality=funcs.ord_to_quality(
                    record.letter_annotations.get('phred_quality')))
        vdjs[seq].ids.append(record.description)

    self.info('\tAligning {} unique sequences'.format(len(vdjs)))
    # Attempt to align all unique sequences.  ``vdjs`` is reused for the
    # results: each raw entry is removed before it is analyzed and put back
    # under its analyzed sequence on success, so once the loop is done the
    # dict holds only successfully analyzed sequences.  The keys are
    # snapshotted by sorted(), so re-inserting during the loop is safe.
    for sequence in funcs.periodic_commit(self._session,
                                          sorted(vdjs.keys())):
        vdj = vdjs[sequence]
        del vdjs[sequence]
        try:
            # The alignment was successful.  If the aligned sequence
            # already exists, append the seq_ids.  Otherwise add it as a
            # new unique sequence.
            vdj.analyze()
            if vdj.sequence in vdjs:
                vdjs[vdj.sequence].ids += vdj.ids
            else:
                vdjs[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(self._session, vdj, sample, str(e))
        except Exception:
            # Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and made the task impossible to
            # interrupt; only ordinary errors are logged and skipped.
            self.error(
                '\tUnexpected error processing sequence {}\n\t{}'.format(
                    vdj.ids[0], traceback.format_exc()))

    if vdjs:
        count = float(len(vdjs))
        avg_len = sum(v.v_length for v in vdjs.values()) / count
        avg_mut = sum(v.mutation_fraction for v in vdjs.values()) / count
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len

        self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                  'Length={}'.format(len(vdjs), round(avg_mut, 2),
                                     round(avg_len, 2)))
        add_uniques(self._session, sample, vdjs.values(), avg_len, avg_mut,
                    self._min_similarity, self._max_vties, self._trim_to,
                    self._max_padding)

    self._session.commit()
    self.info('Completed sample {}'.format(sample.name))