Example #1
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    elif args.subject_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))
    else:
        clones = session.query(Clone.id)

    if not args.force:
        clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]
    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()

    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick,
            args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies,
            args.min_seq_samples,
            args.exclude_stops,
            args.full_seq,
            post_tree_hook=minimize_tree))

    tasks.start()
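
All of these pipeline stages share the same fan-out shape: IDs are pushed onto a concurrent.TaskQueue, one worker with its own database session is added per process, and tasks.start() runs the queue to completion. Below is a minimal, hypothetical sketch of that pattern using only the standard library; it assumes nothing about the project's concurrent module beyond the calls visible above.

# Hypothetical stand-in for the TaskQueue/worker pattern used above,
# built only on the standard library (not the project's concurrent module).
import multiprocessing as mp


def worker(queue):
    # Each worker pulls task IDs until it sees the stop marker (None).
    while True:
        task = queue.get()
        if task is None:
            break
        print('processing clone', task)


if __name__ == '__main__':
    queue = mp.Queue()
    nproc = 4

    for clone_id in range(100):   # tasks.add_task(clone_id)
        queue.put(clone_id)
    for _ in range(nproc):        # one stop marker per worker
        queue.put(None)

    workers = [mp.Process(target=worker, args=(queue,)) for _ in range(nproc)]
    for p in workers:             # tasks.start()
        p.start()
    for p in workers:
        p.join()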
Example #2
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
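
The buckets query above groups unassigned sequences by every field that must match exactly before two sequences can belong to the same clone (subject, V gene, J gene, CDR3 length, insertions, deletions); each resulting row becomes one task. A self-contained sketch of the same group_by bucketing against a toy model follows (SQLAlchemy 1.4+; the table and columns are illustrative, not the project's schema).

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Sequence(Base):
    # Toy model: only the columns needed to demonstrate the bucketing query.
    __tablename__ = 'sequences'
    id = Column(Integer, primary_key=True)
    subject_id = Column(Integer)
    v_gene = Column(String)
    j_gene = Column(String)
    cdr3_num_nts = Column(Integer)
    clone_id = Column(Integer, nullable=True)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([
        Sequence(subject_id=1, v_gene='IGHV1-2', j_gene='IGHJ4',
                 cdr3_num_nts=36),
        Sequence(subject_id=1, v_gene='IGHV1-2', j_gene='IGHJ4',
                 cdr3_num_nts=36),
        Sequence(subject_id=1, v_gene='IGHV3-23', j_gene='IGHJ6',
                 cdr3_num_nts=42),
    ])
    session.commit()

    # One row ("bucket") per distinct combination of the grouping fields.
    buckets = session.query(
        Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
        Sequence.cdr3_num_nts
    ).filter(
        Sequence.clone_id.is_(None)
    ).group_by(
        Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
        Sequence.cdr3_num_nts
    )
    for bucket in buckets:
        print(bucket.subject_id, bucket.v_gene, bucket.j_gene,
              bucket.cdr3_num_nts)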
Example #3
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
Example #4
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats', session=session, commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
    session.close()
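
Each stage receives a live database session plus an argparse Namespace, and vars(args) is what mod_log.make_mod records. The real command-line definitions live elsewhere in the project, so the sketch below is only a hypothetical parser exposing the attributes that run_sample_stats actually reads (sample_ids, force, nproc, db_config).

import argparse

# Hypothetical CLI; only the attribute names are taken from the code above.
parser = argparse.ArgumentParser(description='Generate per-sample statistics')
parser.add_argument('db_config', help='path to the database configuration')
parser.add_argument('--sample-ids', nargs='+', type=int, default=None,
                    help='restrict to these sample IDs (default: all samples)')
parser.add_argument('--force', action='store_true',
                    help='recalculate statistics that already exist')
parser.add_argument('--nproc', type=int, default=4,
                    help='number of worker processes')

args = parser.parse_args(['db.json', '--sample-ids', '1', '2', '--nproc', '2'])
print(vars(args))  # the dict that mod_log.make_mod stores as "info"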
Example #5
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
Example #6
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
Example #7
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or [e.id for e in session.query(Subject.id)])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            )
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
                sample.sample_stats = []
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
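
The ~exists().where(...) filter above is a correlated NOT EXISTS: a subject is skipped only when every one of its samples already has rows in SequenceCollapse. A small, self-contained sketch of that anti-join pattern with illustrative models (not the project's schema):

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base
from sqlalchemy.sql import exists

Base = declarative_base()


class Sample(Base):
    __tablename__ = 'samples'
    id = Column(Integer, primary_key=True)
    subject_id = Column(Integer)


class SequenceCollapse(Base):
    __tablename__ = 'sequence_collapse'
    id = Column(Integer, primary_key=True)
    sample_id = Column(Integer)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Sample(id=1, subject_id=1), Sample(id=2, subject_id=1),
                     SequenceCollapse(sample_id=1)])
    session.commit()

    # Samples of subject 1 with no collapse rows yet; only sample 2 matches,
    # so the subject is NOT "already collapsed" and would be reset above.
    pending = session.query(Sample).filter(
        Sample.subject_id == 1,
        ~exists().where(SequenceCollapse.sample_id == Sample.id)
    ).all()
    print([s.id for s in pending])  # [2]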
Example #8
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = args.subject_ids or [e.id for e in session.query(Subject.id)]
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Example #9
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path':
            os.path.join(args.sample_dir, metadata[sample_name]['file_name']),
            'meta':
            metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
Example #10
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines,
                             ties=args.ties and not args.genotyping)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len,
                             ties=args.ties and not args.genotyping)

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        sys.exit(-1)

    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            sys.exit(-1)

    session.close()
    # Create the tasks for each file
    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        process_sample(
            args.db_config, v_germlines, j_germlines,
            os.path.join(
                args.sample_dir,
                metadata[sample_name]['file_name']
            ),
            metadata[sample_name],
            props,
            args.nproc
        )
Example #11
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines, no_ties=args.genotyping)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len,
                             no_ties=args.genotyping)

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        sys.exit(-1)

    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            sys.exit(-1)

    session.close()
    # Create the tasks for each file
    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        process_sample(
            args.db_config, v_germlines, j_germlines,
            os.path.join(
                args.sample_dir,
                metadata[sample_name]['file_name']
            ),
            metadata[sample_name],
            props,
            args.nproc
        )
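
parse_metadata is project code, but the loop that follows it shows the shape it must return: a dict keyed by sample name whose values include at least a file_name entry. A hypothetical reader with the same shape, assuming a tab-separated metadata.tsv with sample_name and file_name columns:

import csv
import io


def read_metadata(fh):
    # Hypothetical stand-in for parse_metadata: the real function also checks
    # for existing samples in the database; this only reproduces the shape.
    return {row['sample_name']: row
            for row in csv.DictReader(fh, delimiter='\t')}


tsv = ('sample_name\tfile_name\tsubject\n'
       'S1\tS1.fastq\tP01\n'
       'S2\tS2.fastq\tP01\n')
metadata = read_metadata(io.StringIO(tsv))
for sample_name in sorted(metadata):
    print(sample_name, metadata[sample_name]['file_name'])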
Example #12
def import_alignments(session, args):
    mod_log.make_mod('import_alignments',
                     session=session,
                     commit=True,
                     info=vars(args))
    parse_funcs = {
        'airr': (parse_airr, preprocess_airr),
    }

    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return
    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            return

    props = IdentificationProps(**args.__dict__)
    v_germlines = raw_germlines(args.v_germlines, 'v')
    j_germlines = raw_germlines(args.j_germlines, 'j')

    for sample_name in sorted(metadata.keys()):
        sample = create_sample(session, metadata[sample_name])
        if sample:
            path = os.path.join(args.sample_dir,
                                metadata[sample_name]['file_name'])
            with open(path) as fh:
                parse_file(fh,
                           sample,
                           session,
                           parse_funcs[args.format][0],
                           props,
                           v_germlines,
                           j_germlines,
                           args.nproc,
                           preprocess_func=parse_funcs[args.format][1])
Example #13
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
Example #14
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
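
Both run_clone_stats variants clear old statistics with delete(synchronize_session=False), which issues one bulk DELETE and skips reconciling objects already loaded into the session; that is acceptable here because the code commits immediately afterwards and does not reuse any loaded CloneStats objects. A compact illustration of the same bulk-delete call on a toy table:

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class CloneStat(Base):
    __tablename__ = 'clone_stats'   # toy table, not the project's CloneStats
    id = Column(Integer, primary_key=True)
    clone_id = Column(Integer)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([CloneStat(clone_id=c) for c in (1, 1, 2, 3)])
    session.commit()

    # Bulk DELETE of stats for clones 1 and 2; returns the row count.
    deleted = session.query(CloneStat).filter(
        CloneStat.clone_id.in_([1, 2])
    ).delete(synchronize_session=False)
    session.commit()
    print(deleted)  # 3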
Example #15
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if not args.skip_regen:
        logger.info('Deleting existing clones')
        q = session.query(Clone).filter(Clone.subject_id.in_(subject_ids))
        if args.gene:
            q = q.filter(Clone.v_gene.like(args.gene + '%'))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    all_buckets = []
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)
        all_buckets.extend(buckets)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'cluster': ClusteringClonalWorker
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](config.init_db(args.db_config),
                                      **args.__dict__)
        tasks.add_worker(worker)
    tasks.start()

    session.commit()
    if args.reduce_difference:
        buckets = session.query(Clone.subject_id, Clone.cdr3_num_nts).filter(
            Clone.subject_id.in_(subject_ids)).group_by(
                Clone.subject_id, Clone.cdr3_num_nts)
        collapse_similar_cdr3s(session, buckets, args.reduce_difference)
    else:
        logger.info('Skipping reduce since --reduce-difference is set to 0')

    push_clone_ids(session)
    session.commit()

    if not args.skip_subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
Example #16
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    sample_names = set([])
    fail = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in the
        # directory
        if args.metadata is None:
            meta_fn = os.path.join(directory, 'metadata.json')
        else:
            meta_fn = args.metadata

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(
                metadata[fn], metadata['all'] if 'all' in metadata else None)
            if session.query(Sample).filter(
                    Sample.name == meta.get('sample_name'),
                    exists().where(
                        Sequence.sample_id == Sample.id)).first() is not None:
                log_f = logger.warning if args.warn_existing else logger.error
                log_f('Sample {} already exists. {}'.format(
                    meta.get('sample_name'),
                    'Skipping.' if args.warn_existing else 'Cannot continue.'))
                fail = True
            elif meta.get('sample_name') in sample_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        meta.get('sample_name')))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                sample_names.add(meta.get('sample_name'))

        if fail and not args.warn_existing:
            logger.error('Encountered errors.  Not running any identification.'
                         ' To skip samples that are already in the database '
                         'use --warn-existing.')
            return

    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))

    tasks.start()
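
The single mp.Lock() above is shared by every IdentificationWorker so that the few steps that must not interleave across processes can be serialized. A stripped-down sketch of sharing one multiprocessing lock among workers, independent of the project's worker classes:

import multiprocessing as mp


def worker(lock, worker_id):
    # Only one process at a time may run its critical section.
    with lock:
        print('worker', worker_id, 'holding the lock')


if __name__ == '__main__':
    lock = mp.Lock()
    procs = [mp.Process(target=worker, args=(lock, i)) for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()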