def get_selection(session, filter_type=None, sample_ids=None):
    """Stream selection-pressure records as TSV output.

    Args:
        session: SQLAlchemy session used to query ``SelectionPressure``.
        filter_type: ``'overall'`` keeps only rows whose ``sample_id`` is
            NULL; ``'samples'`` keeps only per-sample rows.  Any other value
            leaves the query unfiltered.
        sample_ids: Optional collection of sample IDs, consulted only when
            ``filter_type == 'samples'``.  When non-empty, rows are
            restricted to those samples; otherwise every row with a
            non-NULL ``sample_id`` is returned.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each selection-pressure record.
    """
    query = session.query(SelectionPressure).options(
        joinedload(SelectionPressure.clone),
        joinedload(SelectionPressure.sample),
    )

    if filter_type == 'overall':
        query = query.filter(SelectionPressure.sample_id.is_(None))
    elif filter_type == 'samples':
        if sample_ids:
            # filter() returns a new query object; it must be rebound or the
            # restriction to ``sample_ids`` is silently dropped.
            query = query.filter(
                SelectionPressure.sample_id.in_(sample_ids))
        else:
            query = query.filter(~SelectionPressure.sample_id.is_(None))

    # Every table column except the surrogate key and the sample foreign
    # key, which is replaced by the human-readable 'sample' column below.
    base_fields = [
        name for name in SelectionPressure.__table__.c.keys()
        if name not in ('id', 'sample_id')
    ]

    writer = StreamingTSV(['sample', 'subject'] + base_fields)
    yield writer.writeheader()

    for sel in yield_limit(query, SelectionPressure.id):
        row = {f: getattr(sel, f) for f in base_fields}
        # Rows without a sample hold the across-all-samples values.
        row['sample'] = sel.sample.name if sel.sample else 'All Samples'
        row['subject'] = sel.clone.subject.identifier
        yield writer.writerow(row)
def get_sample_vdjtools(session, sample, min_clone_size, clone_features):
    """Stream one sample's clone abundances in the VDJtools column layout.

    Args:
        session: SQLAlchemy session used to query ``CloneStats``.
        sample: Object whose ``id`` selects the ``CloneStats`` rows to use.
        min_clone_size: Feature groups with a summed count below this value
            are omitted from the output (they still contribute to the
            frequency denominator).
        clone_features: Lookup indexed by clone ID that yields a
            ``(v, j, cdr3_nt)`` key for that clone.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each retained feature group, in
        descending order of count.
    """
    writer = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])

    # Collapse clones sharing the same (v, j, cdr3_nt) features.
    abundance = Counter()
    per_clone = session.query(
        CloneStats.clone_id, CloneStats.total_cnt
    ).filter(
        CloneStats.sample_id == sample.id
    )
    for record in per_clone:
        abundance[clone_features[record.clone_id]] += record.total_cnt

    grand_total = sum(abundance.values())

    yield writer.writeheader()
    for features, copies in abundance.most_common():
        if copies >= min_clone_size:
            v_gene, j_gene, cdr3_nt = features
            yield writer.writerow({
                'count': copies,
                'freq': copies / grand_total,
                'cdr3nt': cdr3_nt,
                'cdr3aa': aas_from_nts(cdr3_nt),
                'v': v_gene,
                'd': '.',  # the D column is always written as a placeholder
                'j': j_gene,
            })
def get_immunedb_output(session, clones):
    """Stream clones with their aggregate values using the default fields.

    Args:
        session: Accepted for signature parity; not used by this function.
        clones: Mapping of clone object to an aggregate dict providing
            ``'counts'`` (with ``'copies'`` and ``'instances'``),
            ``'top_seq'`` and ``'avg_v_identity'``.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each clone.
    """
    writer = StreamingTSV(DEFAULT_CLONE_FIELDS)
    yield writer.writeheader()

    for clone, aggregate in clones.items():
        tallies = aggregate['counts']
        record = get_clone_row(clone)
        # Overlay the aggregate values on top of the base clone row.
        record.update({
            'copies': tallies['copies'],
            'instances': tallies['instances'],
            'top_copy_seq': aggregate['top_seq'],
            'avg_v_identity': round(aggregate['avg_v_identity'], 4),
        })
        yield writer.writerow(record)
def get_clone_summary(session, include_lineages):
    """Stream a per-clone summary table.

    Args:
        session: SQLAlchemy session used to query ``Clone``.
        include_lineages: When true, a ``'lineage'`` column holding
            ``clone.tree`` is appended to the output.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each clone.
    """
    columns = [
        'clone_id', 'subject', 'v_gene', 'j_gene', 'functional',
        'insertions', 'deletions', 'cdr3_nt', 'cdr3_num_nt', 'cdr3_aa',
        'uniques', 'instances', 'copies', 'germline', 'parent_id',
        'avg_mutations_per_copy',
    ]
    if include_lineages:
        columns.append('lineage')

    writer = StreamingTSV(columns)
    yield writer.writeheader()

    absent = object()
    for clone in yield_limit(session.query(Clone), Clone.id):
        record = {}
        # Copy across every column that is directly an attribute of the
        # clone; columns with no matching attribute are simply left out.
        # NOTE(review): 'cdr3_num_nt' may not match any Clone attribute
        # (elsewhere the attribute is spelled ``cdr3_num_nts``) -- confirm.
        for name in writer.fieldnames:
            value = getattr(clone, name, absent)
            if value is not absent:
                record[name] = value

        # Derived or renamed columns override anything copied above.
        record['clone_id'] = clone.id
        record['subject'] = clone.subject.identifier
        record['functional'] = 'T' if clone.functional else 'F'
        record['insertions'] = clone._insertions
        record['deletions'] = clone._deletions
        record['uniques'] = clone.overall_unique_cnt
        record['instances'] = clone.overall_instance_cnt
        record['copies'] = clone.overall_total_cnt
        record['avg_mutations_per_copy'] = round(
            clone.overall_stats.total_mutations(normalize=True), 2)

        if include_lineages:
            record['lineage'] = clone.tree

        yield writer.writerow(record)
def get_samples(session, for_update=False):
    """Stream one row per sample, ordered by sample name.

    NOTE(review): a later ``get_samples`` definition accepting
    ``sample_ids`` also appears in this source; if both live in the same
    module the later one replaces this one -- confirm which is intended.

    Args:
        session: SQLAlchemy session used for all queries.
        for_update: When true, only the ``name``/``new_name`` columns (plus
            metadata keys) are declared; otherwise the ID, subject,
            sequence statistics and clone count columns are declared.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each sample.
    """
    # Distinct metadata keys, alphabetically, become trailing columns.
    key_query = session.query(SampleMetadata.key).group_by(
        SampleMetadata.key
    ).order_by(SampleMetadata.key)
    metadata_keys = [entry.key for entry in key_query]

    # Number of distinct clones observed in each sample.
    count_query = session.query(
        CloneStats.sample_id,
        func.count(CloneStats.clone_id.distinct()).label('clones')
    ).filter(
        ~CloneStats.sample_id.is_(None)
    ).group_by(CloneStats.sample_id)
    clones_by_sample = {
        entry.sample_id: entry.clones for entry in count_query
    }

    fields = ['name', 'new_name'] if for_update else [
        'id', 'name', 'subject', 'input_sequences', 'identified',
        'in_frame', 'stops', 'functional', 'clones'
    ]
    fields.extend(metadata_keys)

    writer = StreamingTSV(fields)
    yield writer.writeheader()

    for sample in session.query(Sample).order_by(Sample.name):
        record = {
            'id': sample.id,
            'name': sample.name,
            'new_name': sample.name,
        }
        # Samples without stats fall back to a Passthrough stand-in.
        stats = sample.stats or Passthrough()
        if not for_update:
            record['subject'] = sample.subject.identifier
            record['input_sequences'] = (
                stats.sequence_cnt + stats.no_result_cnt)
            record['identified'] = stats.sequence_cnt
            record['in_frame'] = stats.in_frame_cnt
            record['stops'] = stats.stop_cnt
            record['functional'] = stats.functional_cnt
            record['clones'] = clones_by_sample.get(sample.id, 0)

        record.update(sample.metadata_dict)
        yield writer.writerow(record)
def get_clone_overlap(session):
    """Stream per-sample clone statistics, one row per clone/sample pair.

    Only ``CloneStats`` rows with a non-NULL ``sample_id`` are included.

    Args:
        session: SQLAlchemy session used to query ``CloneStats``.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each matching ``CloneStats`` row.
    """
    columns = [
        'clone_id', 'sample', 'uniques', 'copies', 'avg_mutations_per_copy'
    ]
    writer = StreamingTSV(columns)

    has_no_sample = CloneStats.sample_id.is_(None)
    per_sample_stats = session.query(CloneStats).filter(~has_no_sample)

    yield writer.writeheader()
    for entry in yield_limit(per_sample_stats, CloneStats.id):
        record = {
            'clone_id': entry.clone_id,
            'sample': entry.sample.name,
            'uniques': entry.unique_cnt,
            'copies': entry.total_cnt,
            'avg_mutations_per_copy': round(
                entry.total_mutations(normalize=True), 2),
        }
        yield writer.writerow(record)
def get_vdjtools_output(session, clones):
    """Stream aggregated clone abundances in the VDJtools column layout.

    Clones sharing the same ``(v_gene, j_gene, cdr3_nt)`` are collapsed into
    a single row whose count is the sum of their copies.

    Args:
        session: Accepted for signature parity; not used by this function.
        clones: Mapping of clone object to an aggregate dict providing
            ``agg['counts']['copies']``.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each feature group, in descending
        order of count.
    """
    writer = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])

    counts = Counter()
    for clone, agg in clones.items():
        key = (clone.v_gene, clone.j_gene, clone.cdr3_nt)
        counts[key] += agg['counts']['copies']

    # The denominator is the sum of the final per-key counts.  Adding
    # ``counts[key]`` inside the loop would re-add earlier copies whenever
    # two clones share a key, inflating the total and deflating every freq.
    total_copies = sum(counts.values())

    yield writer.writeheader()
    for key in sorted(counts, key=counts.get, reverse=True):
        count = counts[key]
        v, j, cdr3_nt = key
        yield writer.writerow({
            'count': count,
            # Guard against a zero total (all copies 0) to avoid dividing
            # by zero.
            'freq': count / total_copies if total_copies else 0.0,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v,
            'd': '.',  # the D column is always written as a placeholder
            'j': j,
        })
def get_samples(session, for_update=False, sample_ids=None):
    """Stream one row per sample, ordered by sample name.

    Args:
        session: SQLAlchemy session used for all queries.
        for_update: When true, only the ``name``/``new_name`` columns (plus
            metadata keys) are declared; otherwise the ID, subject,
            sequence statistics, clone averages and clone count columns
            are declared.
        sample_ids: Optional collection of sample IDs.  When non-empty, only
            those samples are exported; otherwise all samples are.

    Yields:
        The result of ``writer.writeheader()`` followed by the result of
        ``writer.writerow(...)`` for each sample.
    """
    # Distinct metadata keys, alphabetically, become trailing columns.
    meta = [
        s.key for s in session.query(SampleMetadata.key).group_by(
            SampleMetadata.key).order_by(SampleMetadata.key)
    ]

    # Number of distinct clones observed in each sample.
    clone_cnts = {
        s.sample_id: s.clones for s in session.query(
            CloneStats.sample_id,
            func.count(CloneStats.clone_id.distinct()).label('clones')
        ).filter(
            ~CloneStats.sample_id.is_(None)
        ).group_by(CloneStats.sample_id)
    }

    if for_update:
        fields = ['name', 'new_name']
    else:
        fields = [
            'id', 'name', 'subject', 'input_sequences', 'identified',
            'in_frame', 'stops', 'avg_clone_cdr3_num_nts',
            'avg_clone_v_identity', 'functional', 'clones'
        ]
    fields.extend(meta)

    writer = StreamingTSV(fields)
    yield writer.writeheader()

    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))

    for sample in samples.order_by(Sample.name):
        row = {'id': sample.id, 'name': sample.name, 'new_name': sample.name}
        # Samples without stats fall back to a Passthrough stand-in.
        stats = sample.stats if sample.stats else Passthrough()
        if not for_update:
            # An aggregate query still returns a row when nothing matches,
            # with a NULL average, so test the value and not just the row.
            v_iden_row = session.query(
                func.avg(CloneStats.avg_v_identity).label('avg')
            ).filter(
                CloneStats.sample_id == sample.id
            ).first()
            v_iden = v_iden_row.avg if v_iden_row is not None else None

            cdr3_lens = [
                c.clone.cdr3_num_nts
                for c in session.query(CloneStats).filter(
                    CloneStats.sample_id == sample.id)
            ]
            cdr3_len = sum(cdr3_lens) / len(cdr3_lens) if cdr3_lens else None

            row.update({
                'subject': sample.subject.identifier,
                'input_sequences': stats.sequence_cnt + stats.no_result_cnt,
                'identified': stats.sequence_cnt,
                'in_frame': stats.in_frame_cnt,
                'stops': stats.stop_cnt,
                # Only round real numbers; samples with no clone stats
                # report 'NA' instead of raising from round().
                'avg_clone_v_identity': (
                    round(v_iden, 5) if v_iden is not None else 'NA'),
                'avg_clone_cdr3_num_nts': (
                    round(cdr3_len, 5) if cdr3_len is not None else 'NA'),
                'functional': stats.functional_cnt,
                'clones': clone_cnts.get(sample.id, 0)
            })

        row.update(sample.metadata_dict)
        yield writer.writerow(row)