Exemplo n.º 1
0
def get_sample_vdjtools(session, sample, min_clone_size, clone_features):
    """Stream one sample's clones in the VDJtools column layout.

    Copy numbers from the sample's ``CloneStats`` rows are pooled by the
    value that ``clone_features`` maps each clone ID to.  That value is
    unpacked as ``(v, j, cdr3_nt)``.

    :param session: database session used to query ``CloneStats``
    :param sample: object whose ``id`` selects the rows to export
    :param min_clone_size: pooled entries with fewer copies are skipped
    :param clone_features: mapping from clone ID to its feature tuple
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per retained entry, in descending
        order of pooled copies

    """
    tsv = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])

    rows = session.query(
        CloneStats.clone_id,
        CloneStats.total_cnt
    ).filter(
        CloneStats.sample_id == sample.id
    )

    pooled = Counter()
    for row in rows:
        pooled[clone_features[row.clone_id]] += row.total_cnt

    # The denominator covers every pooled copy, including entries that the
    # size threshold drops below.
    grand_total = sum(pooled.values())

    yield tsv.writeheader()
    for (v_gene, j_gene, cdr3), copies in pooled.most_common():
        if copies < min_clone_size:
            continue
        yield tsv.writerow({
            'count': copies,
            'freq': copies / grand_total,
            'cdr3nt': cdr3,
            'cdr3aa': aas_from_nts(cdr3),
            'v': v_gene,
            'd': '.',  # the D column is always written as a placeholder
            'j': j_gene,
        })
Exemplo n.º 2
0
def get_selection(session, filter_type=None, sample_ids=None):
    """Stream selection-pressure records as TSV rows.

    :param session: database session used to query ``SelectionPressure``
    :param filter_type: ``'overall'`` keeps only rows whose ``sample_id``
        is NULL; ``'samples'`` keeps only rows tied to a sample; any other
        value applies no filter
    :param sample_ids: with ``filter_type='samples'``, an optional
        collection of sample IDs to restrict the rows to; when empty or
        ``None`` all per-sample rows are returned
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per record

    """
    query = session.query(SelectionPressure).options(
        joinedload(SelectionPressure.clone),
        joinedload(SelectionPressure.sample),
    )
    if filter_type == 'overall':
        query = query.filter(SelectionPressure.sample_id.is_(None))
    elif filter_type == 'samples':
        if sample_ids:
            # filter() returns a new query instead of mutating in place, so
            # the result has to be re-bound or the restriction is lost.
            query = query.filter(
                SelectionPressure.sample_id.in_(sample_ids))
        else:
            query = query.filter(~SelectionPressure.sample_id.is_(None))

    # Every table column except the two replaced by the friendlier
    # 'sample' and 'subject' columns below.  Built as a new list so the
    # collection returned by keys() is never modified.
    base_fields = [
        f for f in SelectionPressure.__table__.c.keys()
        if f not in ('id', 'sample_id')
    ]

    writer = StreamingTSV(['sample', 'subject'] + base_fields)
    yield writer.writeheader()

    for sel in yield_limit(query, SelectionPressure.id):
        row = {f: getattr(sel, f) for f in base_fields}
        # Rows without a sample are labelled as covering all samples.
        row['sample'] = sel.sample.name if sel.sample else 'All Samples'
        row['subject'] = sel.clone.subject.identifier
        yield writer.writerow(row)
Exemplo n.º 3
0
def get_immunedb_output(session, clones):
    """Stream aggregated clones using the default clone column layout.

    :param session: unused by this function; kept for a uniform signature
    :param clones: mapping from a clone to its aggregate, a mapping with
        the keys ``'counts'`` (itself holding ``'copies'`` and
        ``'instances'``), ``'top_seq'`` and ``'avg_v_identity'``
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per clone

    """
    tsv = StreamingTSV(DEFAULT_CLONE_FIELDS)
    yield tsv.writeheader()

    for clone, aggregate in clones.items():
        tallies = aggregate['counts']
        # Start from the generic clone row, then overlay the aggregates.
        record = get_clone_row(clone)
        record.update({
            'copies': tallies['copies'],
            'instances': tallies['instances'],
            'top_copy_seq': aggregate['top_seq'],
            'avg_v_identity': round(aggregate['avg_v_identity'], 4),
        })
        yield tsv.writerow(record)
def get_samples(session, for_update=False, sample_ids=None):
    """Stream one TSV row per sample, ordered by sample name.

    :param session: database session used for all queries
    :param for_update: when true, only the ``name``/``new_name`` columns
        (plus metadata) are declared and the statistics are not gathered
    :param sample_ids: optional collection of sample IDs to restrict the
        export to; a falsy value exports every sample
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per sample

    """
    key_query = session.query(SampleMetadata.key).group_by(
        SampleMetadata.key).order_by(SampleMetadata.key)
    metadata_keys = [entry.key for entry in key_query]

    # Distinct clones per sample, skipping rows that have no sample ID.
    count_query = session.query(
        CloneStats.sample_id,
        func.count(CloneStats.clone_id.distinct()).label('clones')
    ).filter(
        ~CloneStats.sample_id.is_(None)
    ).group_by(CloneStats.sample_id)
    clones_by_sample = {
        entry.sample_id: entry.clones for entry in count_query
    }

    columns = ['name', 'new_name'] if for_update else [
        'id', 'name', 'subject', 'input_sequences', 'identified',
        'in_frame', 'stops', 'functional', 'clones'
    ]
    columns += metadata_keys
    tsv = StreamingTSV(columns)
    yield tsv.writeheader()

    sample_query = session.query(Sample)
    if sample_ids:
        sample_query = sample_query.filter(Sample.id.in_(sample_ids))

    for sample in sample_query.order_by(Sample.name):
        record = {
            'id': sample.id,
            'name': sample.name,
            'new_name': sample.name
        }
        # Samples without statistics fall back to a Passthrough stand-in.
        stats = sample.stats or Passthrough()
        if not for_update:
            record['subject'] = sample.subject.identifier
            record['input_sequences'] = (
                stats.sequence_cnt + stats.no_result_cnt)
            record['identified'] = stats.sequence_cnt
            record['in_frame'] = stats.in_frame_cnt
            record['stops'] = stats.stop_cnt
            record['functional'] = stats.functional_cnt
            record['clones'] = clones_by_sample.get(sample.id, 0)

        record.update(sample.metadata_dict)
        yield tsv.writerow(record)
Exemplo n.º 5
0
def get_clone_summary(session, include_lineages):
    """Stream one summary TSV row per clone.

    :param session: database session used to query ``Clone``
    :param include_lineages: when true, a ``lineage`` column holding
        ``clone.tree`` is appended
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per clone

    """
    columns = [
        'clone_id', 'subject', 'v_gene', 'j_gene', 'functional', 'insertions',
        'deletions', 'cdr3_nt', 'cdr3_num_nt', 'cdr3_aa', 'uniques',
        'instances', 'copies', 'germline', 'parent_id',
        'avg_mutations_per_copy'
    ]
    if include_lineages:
        columns.append('lineage')
    tsv = StreamingTSV(columns)

    yield tsv.writeheader()

    # Sentinel that tells "attribute missing" apart from a stored None.
    absent = object()
    for clone in yield_limit(session.query(Clone), Clone.id):
        # Seed the row with every column that is also a clone attribute.
        record = {}
        for name in tsv.fieldnames:
            value = getattr(clone, name, absent)
            if value is not absent:
                record[name] = value

        # Columns that are derived or live under a different name.
        record['clone_id'] = clone.id
        record['subject'] = clone.subject.identifier
        record['functional'] = 'T' if clone.functional else 'F'
        record['insertions'] = clone._insertions
        record['deletions'] = clone._deletions
        record['uniques'] = clone.overall_unique_cnt
        record['instances'] = clone.overall_instance_cnt
        record['copies'] = clone.overall_total_cnt
        record['avg_mutations_per_copy'] = round(
            clone.overall_stats.total_mutations(normalize=True), 2)

        if include_lineages:
            record['lineage'] = clone.tree
        yield tsv.writerow(record)
Exemplo n.º 6
0
def get_samples(session, for_update=False):
    """Stream one TSV row for every sample, ordered by sample name.

    :param session: database session used for all queries
    :param for_update: when true, only the ``name``/``new_name`` columns
        (plus metadata) are declared and the statistics are not gathered
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per sample

    """
    metadata_keys = []
    for entry in session.query(SampleMetadata.key).group_by(
            SampleMetadata.key).order_by(SampleMetadata.key):
        metadata_keys.append(entry.key)

    # Distinct clones per sample, skipping rows that have no sample ID.
    clone_totals = {}
    for entry in session.query(
        CloneStats.sample_id,
        func.count(CloneStats.clone_id.distinct()).label('clones')
    ).filter(
        ~CloneStats.sample_id.is_(None)
    ).group_by(CloneStats.sample_id):
        clone_totals[entry.sample_id] = entry.clones

    if not for_update:
        header = ['id', 'name', 'subject', 'input_sequences', 'identified',
                  'in_frame', 'stops', 'functional', 'clones']
    else:
        header = ['name', 'new_name']
    header.extend(metadata_keys)

    tsv = StreamingTSV(header)
    yield tsv.writeheader()

    for sample in session.query(Sample).order_by(Sample.name):
        record = dict(id=sample.id, name=sample.name, new_name=sample.name)
        # Samples without statistics fall back to a Passthrough stand-in.
        stats = sample.stats or Passthrough()
        if not for_update:
            record['subject'] = sample.subject.identifier
            record['input_sequences'] = (
                stats.sequence_cnt + stats.no_result_cnt)
            record['identified'] = stats.sequence_cnt
            record['in_frame'] = stats.in_frame_cnt
            record['stops'] = stats.stop_cnt
            record['functional'] = stats.functional_cnt
            record['clones'] = clone_totals.get(sample.id, 0)

        record.update(sample.metadata_dict)
        yield tsv.writerow(record)
Exemplo n.º 7
0
def get_clone_overlap(session):
    """Stream one TSV row per (clone, sample) statistics record.

    Only ``CloneStats`` rows with a non-NULL ``sample_id`` are exported.

    :param session: database session used to query ``CloneStats``
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per statistics record

    """
    tsv = StreamingTSV(
        ['clone_id', 'sample', 'uniques', 'copies', 'avg_mutations_per_copy'])

    per_sample = session.query(CloneStats).filter(
        ~CloneStats.sample_id.is_(None))

    yield tsv.writeheader()
    for entry in yield_limit(per_sample, CloneStats.id):
        record = {
            'clone_id': entry.clone_id,
            'sample': entry.sample.name,
            'uniques': entry.unique_cnt,
            'copies': entry.total_cnt,
            'avg_mutations_per_copy': round(
                entry.total_mutations(normalize=True), 2),
        }
        yield tsv.writerow(record)
Exemplo n.º 8
0
def get_vdjtools_output(session, clones):
    """Stream aggregated clones in the VDJtools column layout.

    Clones sharing the same ``(v_gene, j_gene, cdr3_nt)`` are pooled into
    one output row.

    :param session: unused by this function; kept for a uniform signature
    :param clones: mapping from a clone (exposing ``v_gene``, ``j_gene``
        and ``cdr3_nt``) to an aggregate whose ``['counts']['copies']``
        holds the clone's copy number
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per pooled entry, in descending order
        of copies

    """
    writer = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])
    counts = Counter()
    total_copies = 0
    for clone, agg in clones.items():
        copies = agg['counts']['copies']
        counts[(clone.v_gene, clone.j_gene, clone.cdr3_nt)] += copies
        # Add only this clone's copies.  Adding the running per-key total
        # would count earlier clones again whenever a key repeats and
        # deflate every frequency.
        total_copies += copies

    yield writer.writeheader()
    for key, count in counts.most_common():
        v, j, cdr3_nt = key
        yield writer.writerow({
            'count': count,
            # Avoid dividing by zero when every clone reports zero copies.
            'freq': count / total_copies if total_copies else 0,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v,
            'd': '.',  # the D column is always written as a placeholder
            'j': j,
        })
Exemplo n.º 9
0
def get_samples(session, for_update=False, sample_ids=None):
    """Stream one TSV row per sample, ordered by sample name.

    Unless ``for_update`` is set, each row also carries the sample's
    sequence statistics, its distinct clone count, and the average V
    identity and CDR3 length over its clones (``'NA'`` when the sample has
    no clone statistics).

    :param session: database session used for all queries
    :param for_update: when true, only the ``name``/``new_name`` columns
        (plus metadata) are declared and the statistics are not gathered
    :param sample_ids: optional collection of sample IDs to restrict the
        export to; a falsy value exports every sample
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per sample

    """
    meta = [
        s.key for s in session.query(SampleMetadata.key).group_by(
            SampleMetadata.key).order_by(SampleMetadata.key)
    ]

    # Distinct clones per sample, skipping rows that have no sample ID.
    clone_cnts = {
        s.sample_id: s.clones
        for s in session.query(
            CloneStats.sample_id,
            func.count(CloneStats.clone_id.distinct()).label('clones')).filter(
                ~CloneStats.sample_id.is_(None)).group_by(CloneStats.sample_id)
    }

    if for_update:
        fields = ['name', 'new_name']
    else:
        fields = [
            'id', 'name', 'subject', 'input_sequences', 'identified',
            'in_frame', 'stops', 'avg_clone_cdr3_num_nts',
            'avg_clone_v_identity', 'functional', 'clones'
        ]
    fields.extend(meta)
    writer = StreamingTSV(fields)
    yield writer.writeheader()
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    for sample in samples.order_by(Sample.name):
        row = {'id': sample.id, 'name': sample.name, 'new_name': sample.name}
        # Samples without statistics fall back to a Passthrough stand-in.
        stats = sample.stats if sample.stats else Passthrough()
        if not for_update:
            v_iden = session.query(
                func.avg(CloneStats.avg_v_identity).label('avg')).filter(
                    CloneStats.sample_id == sample.id).first()
            # An aggregate query returns a row even when nothing matched,
            # with a NULL average, so test the value and not just the row.
            avg_v_identity = v_iden.avg if v_iden else None

            cdr3_lens = [
                c.clone.cdr3_num_nts for c in session.query(CloneStats).filter(
                    CloneStats.sample_id == sample.id)
            ]
            # Round only a real average; round('NA', 5) raises TypeError.
            if cdr3_lens:
                avg_cdr3_len = round(sum(cdr3_lens) / len(cdr3_lens), 5)
            else:
                avg_cdr3_len = 'NA'

            row.update({
                'subject':
                sample.subject.identifier,
                'input_sequences':
                stats.sequence_cnt + stats.no_result_cnt,
                'identified':
                stats.sequence_cnt,
                'in_frame':
                stats.in_frame_cnt,
                'stops':
                stats.stop_cnt,
                'avg_clone_v_identity':
                round(avg_v_identity, 5)
                if avg_v_identity is not None else 'NA',
                'avg_clone_cdr3_num_nts':
                avg_cdr3_len,
                'functional':
                stats.functional_cnt,
                'clones':
                clone_cnts.get(sample.id, 0)
            })

        row.update(sample.metadata_dict)
        yield writer.writerow(row)
Exemplo n.º 10
0
def get_samples(session, for_update=False, sample_ids=None):
    """Stream one TSV row per sample, ordered by sample name.

    Unless ``for_update`` is set, each row also carries the sample's
    sequence statistics, its distinct clone count, and the average V
    identity and CDR3 length over its clones (``'NA'`` when the sample has
    no clone statistics).

    :param session: database session used for all queries
    :param for_update: when true, only the ``name``/``new_name`` columns
        (plus metadata) are declared and the statistics are not gathered
    :param sample_ids: optional collection of sample IDs to restrict the
        export to; a falsy value exports every sample
    :returns: generator yielding the result of ``writeheader()`` followed
        by one ``writerow()`` result per sample

    """
    meta = [
        s.key for s in session.query(SampleMetadata.key).group_by(
            SampleMetadata.key).order_by(SampleMetadata.key)
    ]

    # Distinct clones per sample, skipping rows that have no sample ID.
    clone_cnts = {s.sample_id: s.clones for s in session.query(
        CloneStats.sample_id,
        func.count(CloneStats.clone_id.distinct()).label('clones')
    ).filter(
        ~CloneStats.sample_id.is_(None)
    ).group_by(CloneStats.sample_id)}

    if for_update:
        fields = ['name', 'new_name']
    else:
        fields = ['id', 'name', 'subject', 'input_sequences', 'identified',
                  'in_frame', 'stops', 'avg_clone_cdr3_num_nts',
                  'avg_clone_v_identity', 'functional', 'clones']
    fields.extend(meta)
    writer = StreamingTSV(fields)
    yield writer.writeheader()
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    for sample in samples.order_by(Sample.name):
        row = {
            'id': sample.id,
            'name': sample.name,
            'new_name': sample.name
        }
        # Samples without statistics fall back to a Passthrough stand-in.
        stats = sample.stats if sample.stats else Passthrough()
        if not for_update:
            v_iden = session.query(
                func.avg(CloneStats.avg_v_identity).label('avg')
            ).filter(
                CloneStats.sample_id == sample.id
            ).first()
            # An aggregate query returns a row even when nothing matched,
            # with a NULL average, so test the value and not just the row.
            avg_v_identity = v_iden.avg if v_iden else None

            cdr3_lens = [
                c.clone.cdr3_num_nts
                for c in session.query(
                    CloneStats
                ).filter(
                    CloneStats.sample_id == sample.id
                )
            ]
            # Round only a real average; round('NA', 5) raises TypeError.
            if cdr3_lens:
                avg_cdr3_len = round(sum(cdr3_lens) / len(cdr3_lens), 5)
            else:
                avg_cdr3_len = 'NA'

            row.update({
                'subject': sample.subject.identifier,
                'input_sequences': stats.sequence_cnt +
                stats.no_result_cnt,
                'identified': stats.sequence_cnt,
                'in_frame': stats.in_frame_cnt,
                'stops': stats.stop_cnt,
                'avg_clone_v_identity': round(avg_v_identity, 5)
                if avg_v_identity is not None else 'NA',
                'avg_clone_cdr3_num_nts': avg_cdr3_len,
                'functional': stats.functional_cnt,
                'clones': clone_cnts.get(sample.id, 0)
            })

        row.update(sample.metadata_dict)
        yield writer.writerow(row)