Example #1
def collapse_identical(session, buckets):
    for i, bucket in enumerate(buckets):
        clones = session.query(Clone.id, Clone.cdr3_aa).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.v_gene == bucket.v_gene,
            Clone.j_gene == bucket.j_gene,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
            Clone._insertions == bucket._insertions,
            Clone._deletions == bucket._deletions,
        )
        if clones.count() < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, len(buckets), clones.count()))
        uniques = {}
        for c in clones:
            uniques.setdefault(c.cdr3_aa, []).append(c.id)
        uniques = [sorted(u) for u in uniques.values() if len(u) > 1]
        if len(uniques) > 1:
            logger.info('Collapsing {} duplicate CDR3s'.format(len(uniques)))
        for identical in uniques:
            rep_id = identical[0]
            session.query(Sequence).filter(
                Sequence.clone_id.in_(identical)).update(
                    {'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(Clone.id.in_(
                identical[1:])).delete(synchronize_session=False)
    session.commit()
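
A minimal, database-free sketch of the grouping step above may help: clones that share an identical CDR3 amino-acid sequence are mapped onto the lowest clone ID, which is exactly what the Sequence update and Clone delete then apply. The (clone_id, cdr3_aa) tuples and the build_remap helper below are hypothetical stand-ins for the Clone query rows, not part of the original code.

def build_remap(clones):
    # Group clone IDs by their CDR3 amino-acid string.
    uniques = {}
    for clone_id, cdr3_aa in clones:
        uniques.setdefault(cdr3_aa, []).append(clone_id)

    # For each group, the lowest ID is kept and the rest are re-pointed to it.
    remap = {}
    for ids in uniques.values():
        ids = sorted(ids)
        rep_id = ids[0]
        for other in ids[1:]:
            remap[other] = rep_id
    return remap

print(build_remap([(7, 'CARDY'), (3, 'CARDY'), (9, 'CTRWF')]))  # {7: 3}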
Example #2
def aggregate_results(results, session, sample):
    alignments = {}
    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene), alignment.cdr3_num_nts,
               tuple(alignment.insertions), tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    copies = 0
    for result in noresults:
        orig_id = result['vdj'].seq_id
        copies += result['vdj'].copy_number
        for i in range(result['vdj'].copy_number):
            result['vdj'].seq_id = '{}_{}'.format(orig_id, i)
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
        if copies % 1000 == 0:
            session.commit()

    session.commit()
    return alignments
Example #3
def write_pooled_clones(session,
                        out_format,
                        sample_ids=None,
                        pool_on=('sample', ),
                        zipped=False,
                        **kwargs):
    # Samples and subjects can't be combined with other features
    exclusives = set(pool_on).intersection(set(('sample', 'subject')))
    if len(pool_on) > 1 and exclusives:
        pool_on = (list(exclusives)[0], )
        logger.warning('You specified pooling on {feat} which '
                       'cannot be combined with other features.'
                       '  Using only {feat}.'.format(feat=pool_on[0]))

    logger.info('Writing clones pooled by {} in {} format'.format(
        ','.join(pool_on), out_format))

    sample_ids = sample_ids or [s.id for s in session.query(Sample)]
    aggregated = get_pooled_samples(session, sample_ids, pool_on)

    output_func = {
        'immunedb': get_immunedb_output,
        'vdjtools': get_vdjtools_output
    }[out_format]
    with ExportWriter(zipped=zipped) as fh:
        for (subject, feature_value), clones in aggregated.items():
            logger.info('Pooling subject {} for feature(s) {}'.format(
                subject, ','.join(feature_value)))
            fh.set_filename(get_filename(subject, pool_on, feature_value))
            fh.write(output_func(session, clones))
        return fh.get_zip_value()
Example #4
def write_vdjtools(session, args):
    clone_features = get_clone_features(session)
    for sample in session.query(Sample):
        logger.info('Exporting VDJTools format for sample {}'.format(
            sample.name))
        write_tsv('{}.sample.txt'.format(sample.name), get_sample_vdjtools,
                  session, sample, args.min_clone_size, clone_features)
Example #5
def write_selection(session, sample_ids=None, filter_type='both', zipped=False,
                    **kwargs):
    logger.info('Exporting selection pressure')
    with ExportWriter(zipped=zipped) as fh:
        fh.set_filename('selection_pressure.tsv')
        fh.write(get_selection(session, filter_type, sample_ids))
        return fh.get_zip_value()
Example #6
def write_samples(session, sample_ids=None, for_update=False, zipped=False,
                  **kwargs):
    logger.info('Exporting samples')
    with ExportWriter(zipped) as fh:
        fh.set_filename('samples.tsv')
        fh.write(get_samples(session, for_update, sample_ids))
        return fh.get_zip_value()
Example #7
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(Sequence).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(Sequence).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id
                    or len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break

    session.commit()
Example #8
def write_samples(session, sample_ids=None, for_update=False, zipped=False,
                  **kwargs):
    logger.info('Exporting samples')
    with ExportWriter(zipped) as fh:
        fh.set_filename('samples.tsv')
        fh.write(get_samples(session, for_update, sample_ids))
        return fh.get_zip_value()
Example #9
def run_import(session, args, remaps=None):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)

    study, new = funcs.get_or_create(session, Study, name=args.study_name)

    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(session,
                                      Sample,
                                      name=args.sample_name,
                                      study=study)
    if new:
        sample.date = args.date
        logger.info('Created new sample "{}"'.format(sample.name))
        for key in ('subset', 'tissue', 'disease', 'lab', 'experimenter',
                    'ig_class', 'v_primer', 'j_primer'):
            setattr(sample, key, vars(args).get(key, None))
        subject, new = funcs.get_or_create(session,
                                           Subject,
                                           study=study,
                                           identifier=args.subject)
        sample.subject = subject
        session.commit()
    else:
        logger.error('Sample "{}" already exists'.format(args.sample_name))
        return

    with open(args.input_file) as fh:
        read_file(session, fh, sample, v_germlines, j_germlines, args, remaps)
Example #10
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    else:
        if args.subject_ids is not None:
            clones = session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids))
        else:
            clones = session.query(Clone.id)

    if not args.force:
        clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]
    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()

    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick,
            args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies,
            args.min_seq_samples,
            args.exclude_stops,
            args.full_seq,
            post_tree_hook=minimize_tree))

    tasks.start()
Example #11
def create_sample(session, metadata):
    study, new = funcs.get_or_create(
        session, Study, name=metadata['study_name'])

    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(
        session, Sample, name=metadata['sample_name'], study=study)
    if new:
        logger.info('Created new sample "{}"'.format(sample.name))
        for key, value in metadata.items():
            if key not in REQUIRED_FIELDS:
                session.add(SampleMetadata(
                    sample=sample,
                    key=key,
                    value=value
                ))

        subject, new = funcs.get_or_create(
            session, Subject, study=study,
            identifier=metadata['subject'])
        sample.subject = subject
        session.commit()
    else:
        logger.error(
            'Sample "{}" already exists'.format(metadata['sample_name']))
        return
    return sample
Example #12
def add_sequences_from_sample(session, sample, sequences, props):
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                add_sequences(session, [alignment],
                              sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']).delete(
                        synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign': alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                    alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                    alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This line doesn't actually add anything to the DB; it's just
                # there to validate the fields
                Sequence(**fields)

                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']).update(
                        fields, synchronize_session=False)
        except ValueError:
            continue
Example #13
def collapse_duplicates(bucket):
    use_heuristic = len(bucket) > 10000
    bucket = sorted(bucket, key=lambda s: s.sequence.copy_number, reverse=True)

    if use_heuristic:
        uniques = {}
        for alignment in bucket:
            if alignment.cdr3 not in uniques:
                uniques[alignment.cdr3] = alignment
            else:
                uniques[alignment.cdr3].sequence.copy_number += (
                    alignment.sequence.copy_number)

        logger.info(
            'Bucket {v_gene} {j_gene} {cdr3_num_nts} had {cnt} '
            'sequences.  Used heuristic to reduce to {new_cnt}.'.format(
                v_gene=[g.name for g in bucket[0].v_gene],
                j_gene=[g.name for g in bucket[0].j_gene],
                cdr3_num_nts=bucket[0].cdr3_num_nts,
                cnt=len(bucket),
                new_cnt=len(uniques)))
        bucket = sorted(uniques.values(),
                        key=lambda s: s.sequence.copy_number,
                        reverse=True)

    return collapse_duplicate_alignments(bucket)
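
The heuristic above only merges exact CDR3 matches, summing copy numbers into the highest-copy alignment, before the expensive pairwise collapse runs. Here is a stand-alone sketch of that step; _Seq and _Alignment are hypothetical stand-ins for the real sequence and alignment objects:

class _Seq:
    def __init__(self, copy_number):
        self.copy_number = copy_number


class _Alignment:
    def __init__(self, cdr3, copies):
        self.cdr3 = cdr3
        self.sequence = _Seq(copies)


def heuristic_collapse(bucket):
    # The highest-copy alignment for each CDR3 is seen first and kept.
    bucket = sorted(bucket, key=lambda a: a.sequence.copy_number, reverse=True)
    uniques = {}
    for alignment in bucket:
        if alignment.cdr3 not in uniques:
            uniques[alignment.cdr3] = alignment
        else:
            uniques[alignment.cdr3].sequence.copy_number += (
                alignment.sequence.copy_number)
    return list(uniques.values())


merged = heuristic_collapse(
    [_Alignment('TGTGCA', 5), _Alignment('TGTGCA', 2), _Alignment('TGCAAA', 1)])
print([(a.cdr3, a.sequence.copy_number) for a in merged])
# [('TGTGCA', 7), ('TGCAAA', 1)]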
Example #14
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
Example #15
def collapse_similar_cdr3s(session, buckets, difference_allowed):
    logger.info('Collapsing similar clones in {} buckets'.format(
        buckets.count()))
    for i, bucket in enumerate(buckets):
        clones = session.query(Clone.id, Clone.cdr3_aa, Clone.cdr3_nt).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
        ).order_by(Clone.overall_total_cnt.desc())
        if clones.count() < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, buckets.count(), clones.count()))
        reduced = {}
        for c in clones:
            for larger_cdr3_nt, others in reduced.items():
                if (dnautils.hamming(larger_cdr3_nt, c.cdr3_nt) <=
                        difference_allowed):
                    others.append(c.id)
                    break
            else:
                reduced[c.cdr3_nt] = [c.id]

        for collapse in reduced.values():
            rep_id, others = collapse[0], collapse[1:]
            session.query(Sequence).filter(
                Sequence.clone_id.in_(others)).update(
                    {'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(
                Clone.id.in_(others)).delete(synchronize_session=False)
    session.commit()
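
The reduction above is a greedy, order-dependent grouping: because clones are visited from largest to smallest, each CDR3 either joins the first already-kept CDR3 within difference_allowed mismatches or starts its own group. A database-free sketch of that logic, with a plain hamming() helper standing in for dnautils.hamming:

def hamming(a, b):
    return sum(1 for x, y in zip(a, b) if x != y)


def group_similar(cdr3s, difference_allowed):
    reduced = {}
    for clone_id, cdr3_nt in cdr3s:
        for larger_cdr3_nt, others in reduced.items():
            if hamming(larger_cdr3_nt, cdr3_nt) <= difference_allowed:
                others.append(clone_id)
                break
        else:
            reduced[cdr3_nt] = [clone_id]
    return list(reduced.values())


print(group_similar(
    [(1, 'TGTGCA'), (2, 'TGTGCC'), (3, 'AAAAAA')], difference_allowed=1))
# [[1, 2], [3]]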
Example #16
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(
        Sequence
    ).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id
    ).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(
            Sequence
        ).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id or
                    len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break

    session.commit()
Example #17
def collapse_identical(session, buckets):
    for i, bucket in enumerate(buckets):
        clones = session.query(
            Clone.id, Clone.cdr3_aa
        ).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.v_gene == bucket.v_gene,
            Clone.j_gene == bucket.j_gene,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
            Clone._insertions == bucket._insertions,
            Clone._deletions == bucket._deletions,
        )
        if clones.count() < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, len(buckets), clones.count()))
        uniques = {}
        for c in clones:
            uniques.setdefault(c.cdr3_aa, []).append(c.id)
        uniques = [sorted(u) for u in uniques.values() if len(u) > 1]
        if len(uniques) > 1:
            logger.info('Collapsing {} duplicate CDR3s'.format(len(uniques)))
        for identical in uniques:
            rep_id = identical[0]
            session.query(Sequence).filter(
                Sequence.clone_id.in_(identical)
            ).update({'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(
                Clone.id.in_(identical[1:])
            ).delete(synchronize_session=False)
    session.commit()
Example #18
def run_fix_sequences(session, args):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3)

    indexes = set()
    props = IdentificationProps(**args.__dict__)
    for sample in session.query(Sample):
        sequences = process_sample(session, sample, indexes, args.temp,
                                   v_germlines, j_germlines, args.nproc)
        add_sequences_from_sample(session, sample, sequences, props)
        remove_duplicates(session, sample)

    logger.info('Updating copy numbers')
    session.connection(mapper=Sequence).execute(
        text('''
        UPDATE
            sequences
        SET
            copy_number = 1 + (
                SELECT
                    COUNT(*)
                FROM
                    duplicate_sequences
                WHERE
                    duplicate_seq_ai = ai
            )
    '''))
Example #19
def process_data(input_data, process_func, aggregate_func, nproc,
                 generate_args={}, process_args={}, aggregate_args={}):
    if callable(input_data):
        start = time.time()
        input_data = input_data(**generate_args)
        logger.info('Generate time: {}'.format(time.time() - start))

    with mp.Manager() as manager:
        proxy_data = manager.list(input_data)
        pool = mp.Pool(processes=nproc)
        f = functools.partial(
            subcaller,
            functools.partial(process_func, **process_args),
            proxy_data
        )
        start = time.time()
        logger.info('Waiting on pool {}'.format(process_func.__name__))

        res = [r for r in pool.map(f, range(len(proxy_data))) if r is not None]
        pool.close()
    logger.info('Pool done: {}'.format(time.time() - start))

    start = time.time()
    logger.info('Waiting on aggregation {}'.format(aggregate_func.__name__))
    ret = aggregate_func(res, **aggregate_args)
    logger.info('Done aggregation: {}'.format(time.time() - start))

    return ret
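
process_data is essentially "optionally generate the input, fan it out over a multiprocessing pool, then hand the surviving results to a single aggregation call". A self-contained sketch of that flow, without the Manager proxy and logging; square and total are hypothetical process/aggregate functions:

import multiprocessing as mp


def square(x):
    return x * x


def total(results):
    return sum(results)


if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        # Mirrors the pool.map call above; None results would be dropped.
        mapped = [r for r in pool.map(square, range(10)) if r is not None]
    print(total(mapped))  # 285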
Example #20
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase + string.ascii_lowercase +
                              string.digits) for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists.  To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user)
                    )
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))

            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir, '{}.json'.format(
            args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False
Example #21
def _get_root_connection(host, user, admin_pass=None):
    if admin_pass is None:
        try:
            return _connect(host, user)
        except Exception:
            logger.info('Failed connection with empty root password')
            admin_pass = getpass.getpass('MySQL password for ({}):'.format(
                user))
    return _connect(host, user, admin_pass)
Example #22
def _get_root_connection(host, user, admin_pass=None):
    if admin_pass is None:
        try:
            return _connect(host, user)
        except Exception as e:
            logger.info('Failed connection with empty root password')
            admin_pass = getpass.getpass(
                'MySQL password for ({}):'.format(user))
    return _connect(host, user, admin_pass)
Example #23
def _get_user_pass(conn, host, user, existing_password):
    with conn.cursor() as cursor:
        while True:
            db_pass = getpass.getpass()
            cursor.execute('SELECT PASSWORD(%s) as password', db_pass)
            if cursor.fetchone()['password'] != existing_password:
                logger.error('Password does not match.')
            else:
                logger.info('Correct password')
                return db_pass
Example #24
def write_genbank(session, args):
    args.inference = 'alignment:' + args.inference

    header = ('[organism={}] '
              '[moltype={}] '
              '[keywords=AIRR]').format(args.species, args.mol_type)

    for sample in session.query(Sample):
        logger.info('Exporting sample {}'.format(sample.name))
        _write_sample(session, sample.id, args.gene_db, args.inference, header)
Example #25
def _get_user_pass(conn, host, user, existing_password):
    with conn.cursor() as cursor:
        while True:
            db_pass = getpass.getpass()
            cursor.execute('SELECT PASSWORD(%s) as password', db_pass)
            if cursor.fetchone()['password'] != existing_password:
                logger.error('Password does not match.')
            else:
                logger.info('Correct password')
                return db_pass
Example #26
def preprocess_airr(reader):
    logger.info('Collapsing identical sequences')
    seen = {}
    for l in reader:
        if l['sequence_alignment'] in seen:
            seen[l['sequence_alignment']]['copy_number'] += 1
        else:
            l['copy_number'] = 1
            seen[l['sequence_alignment']] = l
    return sorted(seen.values(), key=lambda s: s['sequence_id'])
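
On in-memory rows the collapse behaves as below; the dicts are hypothetical stand-ins for csv.DictReader rows from an AIRR rearrangements file, where the first row seen for each sequence_alignment is kept and later duplicates only increment its copy_number:

rows = [
    {'sequence_id': 'seq1', 'sequence_alignment': 'ACGT'},
    {'sequence_id': 'seq2', 'sequence_alignment': 'ACGT'},
    {'sequence_id': 'seq3', 'sequence_alignment': 'ACGA'},
]
seen = {}
for row in rows:
    if row['sequence_alignment'] in seen:
        seen[row['sequence_alignment']]['copy_number'] += 1
    else:
        row['copy_number'] = 1
        seen[row['sequence_alignment']] = row
print([(r['sequence_id'], r['copy_number'])
       for r in sorted(seen.values(), key=lambda s: s['sequence_id'])])
# [('seq1', 2), ('seq3', 1)]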
Example #27
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = args.subject_ids or [e.id for e in session.query(Subject.id)]
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Example #28
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or [e.id for e in session.query(Subject.id)])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            )
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
                sample.sample_stats = []
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Example #29
def export_genbank(session, args):
    args.inference = 'alignment:' + args.inference

    header = ('[organism={}] '
              '[moltype={}] '
              '[keywords=AIRR]').format(args.species, args.mol_type)
    samples = args.sample_ids or [s.id for s in session.query(Sample.id).all()]

    for sample_id in samples:
        logger.info('Exporting sample {}'.format(sample_id))
        export_sample_genbank(session, sample_id, args.gene_db, args.inference,
                              header)
Example #30
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
Example #31
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
Example #32
def write_sequences(session, sample_ids=None, out_format='changeo',
                    clones_only=False, min_subject_copies=None, zipped=False,
                    **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    with ExportWriter(zipped=zipped) as fh:
        for sample in samples:
            logger.info('Exporting sample {}'.format(sample.name))
            fh.set_filename('{}.{}.tsv'.format(sample.name, out_format))
            fh.write(
                get_sequences(session, sample, out_format, clones_only,
                              min_subject_copies)
            )
        return fh.get_zip_value()
Example #33
def write_clone_overlap(session,
                        sample_ids=None,
                        pool_on=('sample', ),
                        size_metric='copies',
                        sim_func='cosine',
                        agg_func='median',
                        zipped=False,
                        **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values() if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue

            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))
            name = '{}.overlap'.format(subject.identifier)

            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(
                title_fmt.format(subject.identifier, ' & '.join(pool_on),
                                 sim_func, agg_func))

            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')

        return writer.get_zip_value()
Example #34
    def start_job(self, func, **kwargs):
        uid = str(uuid.uuid4())
        logger.info('Starting job with UUID {}'.format(uid))

        job_func = partial(self._job_wrap, func, uid)

        mp.Process(
            target=job_func,
            kwargs=kwargs
        ).start()
        self.files.extend([
            self.get_path(uid, '.log'),
            self.get_path(uid, '.zip'),
        ])

        return uid
Example #35
def delete(main_parser, args):
    try:
        with open(args.db_config) as fh:
            db_config = json.load(fh)
        conn = _get_root_connection(db_config['host'], args.admin_user,
                                    args.admin_pass)
        with conn.cursor() as cursor:
            logger.info('Deleting database {}'.format(db_config['database']))
            cursor.execute('DROP DATABASE `{}`'.format(db_config['database']))
            if args.delete_user:
                logger.info('Deleting user {}'.format(db_config['username']))
                cursor.execute('DROP USER `{}`'.format(db_config['username']))
        return True
    except Exception as e:
        logger.error(e)
        return False
Example #36
def delete(main_parser, args):
    try:
        with open(args.db_config) as fh:
            db_config = json.load(fh)
        conn = _get_root_connection(db_config['host'], args.admin_user,
                                    args.admin_pass)
        with conn.cursor() as cursor:
            logger.info('Deleting database {}'.format(db_config['database']))
            cursor.execute('DROP DATABASE `{}`'.format(db_config['database']))
            if args.delete_user:
                logger.info('Deleting user {}'.format(db_config['username']))
                cursor.execute('DROP USER `{}`'.format(db_config['username']))
        return True
    except Exception as e:
        logger.error(e)
        return False
Example #37
def write_sequences(session, args):
    for subject in session.query(Subject):
        logger.info('Exporting subject {}'.format(subject.identifier))
        seqs = session.query(Sequence).filter(
            Sequence.subject_id == subject.id
        )
        if args.clones_only:
            seqs = seqs.filter(~Sequence.clone_id.is_(None))
        if args.min_subject_copies is not None:
            seqs = seqs.filter(
                SequenceCollapse.copy_number_in_subject >=
                args.min_subject_copies
            )

        fn = '{}.{}.tsv'.format(subject.identifier, args.fmt)
        write_tsv(fn, get_sequences, session, seqs, args.fmt)
Example #38
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info(
            'Generating subclone task queue for subject {}'.format(subject_id))
        buckets = session.query(Clone.subject_id, Clone.v_gene, Clone.j_gene,
                                Clone.cdr3_num_nts).filter(
                                    Clone.subject_id == subject_id).group_by(
                                        Clone.subject_id, Clone.v_gene,
                                        Clone.j_gene, Clone.cdr3_num_nts)
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
Example #39
def aggregate_collapse(aggregate_queue, db_config, sample_id, props):
    seqs_to_add = []
    session = config.init_db(db_config, create=False)
    sample = session.query(Sample).filter(Sample.id == sample_id).one()
    for i, alignment in enumerate(aggregate_queue):
        for seq in alignment:
            seqs_to_add.append(seq)
            if len(seqs_to_add) >= 1000:
                add_sequences(session, seqs_to_add, sample,
                              strip_alleles=not props.genotyping)
                seqs_to_add = []
                session.commit()
    if seqs_to_add:
        add_sequences(session, seqs_to_add, sample,
                      strip_alleles=not props.genotyping)
    logger.info('Finished aggregating sequences')
    session.commit()
    session.close()
Example #40
def aggregate_collapse(aggregate_queue, db_config, sample_id, props):
    seqs_to_add = []
    session = config.init_db(db_config, create=False)
    sample = session.query(Sample).filter(Sample.id == sample_id).one()
    for i, alignment in enumerate(aggregate_queue):
        for seq in alignment:
            seqs_to_add.append(seq)
            if len(seqs_to_add) >= 1000:
                add_sequences(session, seqs_to_add, sample,
                              strip_alleles=not props.genotyping)
                seqs_to_add = []
                session.commit()
    if seqs_to_add:
        add_sequences(session, seqs_to_add, sample,
                      strip_alleles=not props.genotyping)
    logger.info('Finished aggregating sequences')
    session.commit()
    session.close()
Example #41
def write_clone_overlap(session, sample_ids=None, pool_on=('sample',),
                        size_metric='copies', sim_func='cosine',
                        agg_func='median', zipped=False, **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values() if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue

            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))
            name = '{}.overlap'.format(subject.identifier)

            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(title_fmt.format(
                subject.identifier, ' & '.join(pool_on), sim_func, agg_func
            ))

            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')

        return writer.get_zip_value()
Example #42
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating subclone task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config),
                                        args.similarity))
    tasks.start()
Example #43
def read_input(path):
    vdjs = []
    parser = SeqIO.parse(path, 'fasta' if path.endswith('.fasta') else 'fastq')

    # Collapse identical sequences
    logger.info('Parsing input')
    for record in parser:
        try:
            vdjs.append(VDJSequence(
                seq_id=record.description,
                sequence=str(record.seq),
                quality=funcs.ord_to_quality(
                    record.letter_annotations.get('phred_quality')
                )
            ))
        except ValueError:
            continue

    logger.info('There are {} sequences'.format(len(vdjs)))
    return vdjs
Example #44
def _queue_tasks(session, sample_id, force, tasks):
    logger.info('Creating task queue to generate stats for sample {}.'.format(
        sample_id))
    existing_seq = session.query(Sequence).filter(
        Sequence.sample_id == sample_id)
    existing_nores = session.query(NoResult).filter(
        NoResult.sample_id == sample_id)
    if existing_seq.first() is None and existing_nores.first() is None:
        logger.warning('\tSKIPPING since there are no sequences in the '
                       'sample')
        return

    existing = session.query(SampleStats.sample_id).filter(
        SampleStats.sample_id == sample_id).first() is not None
    if force and existing:
        logger.warning('\tFORCING regeneration of stats')
    elif not force and existing:
        logger.warning('\tSKIPPING stats since they already exist and the '
                       '--force flag was not specified.')
        return

    min_cdr3, max_cdr3 = _get_cdr3_bounds(session, sample_id)
    for include_outliers in [True, False]:
        for only_full_reads in [True, False]:
            tasks.add_task({
                'func': 'seq',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
            tasks.add_task({
                'func': 'clone',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
Example #45
def parse_file(fh, sample, session, alignment_func, props, v_germlines,
               j_germlines, nproc, preprocess_func=None):
    start = time.time()
    reader = csv.DictReader(fh, delimiter='\t')
    if preprocess_func:
        reader = preprocess_func(reader)

    alignments = concurrent.process_data(
        reader,
        process_line,
        aggregate_results,
        nproc,
        process_args={
            'alignment_func': alignment_func,
            'props': props,
            'v_germlines': v_germlines,
            'j_germlines': j_germlines
        },
        aggregate_args={
            'session': session,
            'sample': sample
        }
    )

    concurrent.process_data(
        alignments.values(),
        collapse_duplicates,
        add_results,
        nproc,
        aggregate_args={
            'session': session,
            'sample': sample
        }
    )

    logger.info('Completed sample {} in {}m'.format(
        sample.name, round((time.time() - start) / 60., 1)))
Example #46
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
Example #47
def setup_sample(session, meta):
    study, new = funcs.get_or_create(session, Study, name=meta['study_name'])

    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    name = meta['sample_name']
    sample, new = funcs.get_or_create(session, Sample, name=name, study=study)

    if new:
        subject, new = funcs.get_or_create(
            session, Subject, study=study,
            identifier=meta['subject'])
        sample.subject = subject

        for key, value in meta.items():
            if key not in REQUIRED_FIELDS:
                session.add(SampleMetadata(
                    sample=sample, key=key, value=value
                ))

    session.commit()
    return sample
Example #48
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample
    ).order_by(
        Sequence.copy_number.desc()
    )

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]

    session.commit()
Example #49
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {l['name']: l for l in reader}

    # delete existing metadata
    sample_ids = {s.name: s.id for s in session.query(Sample).filter(
        Sample.name.in_(new_meta))}

    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(sample_ids.values())
    ).delete(synchronize_session='fetch')

    ignore_fields = ['name', 'new_name', 'subject', 'file_name']
    for sample_name, row in new_meta.items():
        if sample_name not in sample_ids:
            logger.warning('No sample {} in database.  Ignoring.'.format(
                sample_name))
            continue
        sample_id = sample_ids[sample_name]
        logger.info('Updating metadata for {}'.format(row['name']))
        session.add_all([
            SampleMetadata(sample_id=sample_id, key=k, value=v)
            for k, v in row.items() if k not in ignore_fields and v not in
            NA_VALUES
        ])
        if row['new_name'] != row['name']:
            logger.info('  Updating sample name to {}'.format(row['new_name']))
            session.query(Sample).filter(Sample.name == row['name']).update({
                Sample.name: row['new_name'] + SENTINEL
            })

    logger.info('Verifying uniqueness')
    for sample in session.query(Sample).filter(
            Sample.name.like('%' + SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed.  All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()
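
The SENTINEL suffix exists because renaming samples directly could collide with a name that has not been renamed yet (for example, when two samples swap names). A stand-alone sketch of the same two-phase rename on hypothetical data:

SENTINEL = '__TEMP'
current = {1: 'sampleA', 2: 'sampleB'}
wanted = {1: 'sampleB', 2: 'sampleA'}   # a swap that would collide if done directly

# Phase 1: move every changed sample to a unique temporary name.
for sample_id, new_name in wanted.items():
    if current[sample_id] != new_name:
        current[sample_id] = new_name + SENTINEL

# Phase 2: strip the sentinel once no plain names can clash.
for sample_id, name in current.items():
    if name.endswith(SENTINEL):
        current[sample_id] = name[:-len(SENTINEL)]

print(current)  # {1: 'sampleB', 2: 'sampleA'}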
Example #50
def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    indels = session.query(
        Sequence.ai,
        Sequence.seq_id,
        Sequence.sample_id,
        Sequence.sequence
    ).filter(
        Sequence.sample_id == sample.id,
        Sequence.probable_indel_or_misalign == 1
    ).order_by(Sequence.seq_id)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(
        NoResult.sample_id == sample.id).order_by(NoResult.seq_id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(
            sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
                sample.id, indels.count(), noresults.count()))

    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''),
                            len_bucket)
    sample_v_germlines = get_formatted_ties(v_germlines.all_ties(
            sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(j_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(get_fasta({'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                r.ai, r.sample_id, r.seq_id): r.sequence for r in indels}))
        fh.write(get_fasta({'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
            r.pk, r.sample_id, r.seq_id): r.sequence for r in noresults}))

    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(align_reference(temp, 'v_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        try:
            ref, seq, rem_seqs = create_seqs(
                ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
                min_size=CDR3_OFFSET, **line)
        except KeyError as e:
            logger.warning('bowtie got invalid V: ' + str(e))
            continue
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene], ref,
                                            seq, line['ref_offset'])
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {k: v['v_rem_seq'] for k, v in alignments.items() if
                len(v['v_rem_seq']) > 0}
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(align_reference(temp, 'j_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3, **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        for i in range(j_germlines.upstream_of_cdr3):
            # Walk the last upstream_of_cdr3 bases of the reference; the
            # index must be -(i + 1) since -0 would point at the first base
            if ref[-(i + 1)] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end

        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-'))
        )
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        alignment.v_length = (
            alignments[line['seq_id']]['cdr3_start'] -
            alignment.seq_offset
        ) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             alignment.v_length)
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks
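The insertions and deletions recorded above come from gap_positions, whose
implementation is not part of this example. Below is a minimal sketch of what
such a helper could look like, assuming it reports each contiguous run of gap
characters as a (start, length) pair suitable for serialize_gaps:

def gap_positions(seq, gap_char='-'):
    # Hypothetical sketch, not ImmuneDB's actual implementation: collect each
    # contiguous run of gap characters as a (start, length) tuple.
    positions = []
    start = None
    for i, char in enumerate(seq):
        if char == gap_char:
            if start is None:
                start = i              # a new gap run begins here
        elif start is not None:
            positions.append((start, i - start))
            start = None
    if start is not None:              # trailing gap run
        positions.append((start, len(seq) - start))
    return positions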
Example #51
def add_sequences_from_sample(session, sample, sequences, props):
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']
                ).delete(synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,

                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,

                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),

                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,

                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,

                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,

                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,

                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,

                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),

                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,

                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),

                    'germline': alignment.germline
                }
                # Instantiating a Sequence here doesn't add anything to the
                # DB; it only validates the fields before the update below
                Sequence(**fields)

                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']
                ).update(fields, synchronize_session=False)
        except ValueError:
            continue
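The loop above iterates through periodic_commit, which is also not shown here.
A plausible sketch, assuming it simply yields each item unchanged while
committing the session at a fixed interval:

def periodic_commit(session, iterable, interval=1000):
    # Illustrative sketch only: commit the session every `interval` items so
    # long loops do not accumulate one huge transaction.
    for i, item in enumerate(iterable, start=1):
        yield item
        if i % interval == 0:
            session.commit()
    session.commit()                   # commit whatever remains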
Example #52
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample, result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (
            sum([v.v_length for v in alignments]) /
            len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) /
            len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments),
                                       round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()
        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len,
                          'avg_mut': avg_mut, 'props': props},
        )
        logger.info('Adding noresults')

        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )
        session.expire_all()
        session.commit()

        identified = int(session.query(
            func.sum(Sequence.copy_number)
        ).filter(
            Sequence.sample == sample
        ).scalar() or 0)
        noresults = int(session.query(
            func.count(NoResult.pk)
        ).filter(
            NoResult.sample == sample
        ).scalar() or 0)
        if identified + noresults:
            frac = int(100 * identified / (identified + noresults))
        else:
            frac = 0
        logger.info(
            'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                sample.name,
                round((time.time() - start) / 60., 1),
                identified,
                identified + noresults,
                frac
            )
        )
    session.close()
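Each stage above is driven by concurrent.process_data. Its real implementation
is not shown in these examples; the sketch below only illustrates the
fan-out/aggregate pattern it is used for, with the signature treated as an
assumption (the generator-based variant used for the initial read is omitted):

from functools import partial
import multiprocessing as mp

def process_data(data, process_func, aggregate_func, nproc,
                 process_args=None, aggregate_args=None):
    # Simplified sketch of a fan-out/aggregate helper: run process_func on
    # every work item across nproc processes, then hand the collected
    # results to aggregate_func.
    process_args = process_args or {}
    aggregate_args = aggregate_args or {}
    with mp.Pool(nproc) as pool:
        worker = partial(process_func, **process_args)
        results = pool.map(worker, list(data))
    return aggregate_func(results, **aggregate_args)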
Example #53
def combine_samples(session, args):
    groups = {}

    for meta in session.query(SampleMetadata).filter(
            SampleMetadata.key == args.combine_field):
        groups.setdefault(meta.value, set()).add(meta.sample_id)
    all_subjects = set()
    for group_id, samples in groups.items():
        group_subs = session.query(Sample.subject_id).filter(
            Sample.id.in_(samples)
        ).group_by(Sample.subject_id)
        group_subs = [s.subject_id for s in group_subs]
        all_subjects.update(set(group_subs))
        if len(group_subs) > 1:
            logger.error('Cannot combine samples across subjects '
                         '(group "{}" has {} subjects)'.format(
                             group_id, len(group_subs)))
            sys.exit(1)

    all_samples = [s.id for s in session.query(Sample.id).filter(
        Sample.subject_id.in_(all_subjects))]

    logger.info('Resetting information for {} subjects ({} samples)'.format(
        len(all_subjects), len(all_samples)))
    logger.info('   Resetting collapsing')
    session.query(SequenceCollapse).filter(
        SequenceCollapse.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)
    logger.info('   Resetting clones')
    session.query(Clone).filter(
        Clone.subject_id.in_(all_subjects)
    ).delete(synchronize_session=False)
    logger.info('   Resetting sample statistics')
    session.query(SampleStats).filter(
        SampleStats.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)

    for group_id, samples in groups.items():
        final_sample_id = min(samples)
        logger.info('Combining {} samples into new sample "{}" (ID {})'.format(
            len(samples), group_id, final_sample_id))
        session.query(Sequence).filter(
            Sequence.sample_id.in_(samples)
        ).update({
            Sequence.sample_id: final_sample_id,
        }, synchronize_session=False)

        logger.info('Updating sample name and deleting empty samples')
        # collapse to one sample
        final_sample = session.query(Sample).get(final_sample_id)
        final_sample.name = group_id
        remove_duplicates(session, final_sample)

        logger.info('Moving noresults')
        session.query(NoResult).filter(
            NoResult.sample_id.in_(samples)
        ).update({
            'sample_id': final_sample_id
        }, synchronize_session=False)

        # delete the now-empty samples
        session.query(Sample).filter(
            Sample.id.in_(samples - set([final_sample_id]))
        ).delete(synchronize_session=False)

    session.commit()
    logger.info('Samples successfully combined: please re-run '
                'immunedb_collapse and later pipeline steps.')
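combine_samples expects an argparse-style namespace with a combine_field
attribute. A hypothetical driver is sketched below; the flag name, the way the
database configuration is loaded, and the import path are assumptions for
illustration, not ImmuneDB's actual CLI:

import argparse

from immunedb.common import config  # import path assumed from the examples above

def main():
    # combine_samples is the function defined in the example above.
    parser = argparse.ArgumentParser(
        description='Combine samples that share a metadata value')
    parser.add_argument('db_config',
                        help='database configuration passed to config.init_db')
    parser.add_argument('--combine-field', required=True,
                        help='metadata key whose values define the groups')
    args = parser.parse_args()

    session = config.init_db(args.db_config)
    combine_samples(session, args)

if __name__ == '__main__':
    main()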