Example #1
def setup_logging(log_dir, debug):
    logfile = os.path.join(log_dir, 'abstar.log')
    # check the verbosity level before coercing ``debug`` to a boolean,
    # otherwise ``debug == 2`` can never be True
    print_debug = True if debug == 2 else False
    debug = True if debug > 0 else False
    log.setup_logging(logfile, debug=debug)
    global logger
    logger = log.get_logger('abstar')
Example #2
def build_output(vdjs, output_type, pretty, padding):
    logger = log.get_logger()
    try:
        vdjs = [vdj for vdj in vdjs if vdj.rearrangement]
        if output_type.lower() == 'json':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_json_output(vdj, pretty, padding))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'imgt':
            header, firstvals = _imgt_summary_output(vdjs[0], header=True)
            output = [header, firstvals, ]
            for vdj in vdjs[1:]:
                try:
                    output.append(_imgt_summary_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'hadoop':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_hadoop_minimal_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        return output
    except:
        logger.debug('FILE-LEVEL OUTPUT ERROR: sequences {} - {}, output_type = {}'.format(
            vdjs[0].id,
            vdjs[-1].id,
            output_type))
        logger.debug(traceback.format_exc())
Example #3
def run_standalone(args):
    logfile = args.log if args.log else os.path.join(args.output,
                                                     'abcompare.log')
    log.setup_logging(logfile)
    global logger
    logger = log.get_logger('abcompare')
    main(args)
Example #4
File: s3.py Project: briney/abtools
def put(f, s3_path, multipart_chunk_size_mb=500, logger=None):
    '''
    Uploads a single file to S3, using s3cmd.

    Args:

        f (str): Path to a single file.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of ``f``. For example::

                put(f='/path/to/myfile.tar.gz', s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of ``s3://my_bucket/path/to/myfile.tar.gz``
    '''
    if not logger:
        logger = log.get_logger('s3')
    fname = os.path.basename(f)
    target = os.path.join(s3_path, fname)
    s3cmd_cline = 's3cmd put {} {} --multipart-chunk-size-mb {}'.format(f,
                                                                        target,
                                                                        multipart_chunk_size_mb)
    print_put_info(fname, target, logger)
    s3cmd = sp.Popen(s3cmd_cline,
                     stdout=sp.PIPE,
                     stderr=sp.PIPE,
                     shell=True)
    stdout, stderr = s3cmd.communicate()
Example #5
def build_output(vdjs, output_type, pretty, padding):
    logger = log.get_logger()
    try:
        vdjs = [vdj for vdj in vdjs if vdj.rearrangement]
        if output_type.lower() == 'json':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_json_output(vdj, pretty, padding))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'imgt':
            header, firstvals = _imgt_summary_output(vdjs[0], header=True)
            output = [
                header,
                firstvals,
            ]
            for vdj in vdjs[1:]:
                try:
                    output.append(_imgt_summary_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'hadoop':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_hadoop_minimal_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        return output
    except:
        logger.debug(
            'FILE-LEVEL OUTPUT ERROR: sequences {} - {}, output_type = {}'.
            format(vdjs[0].id, vdjs[-1].id, output_type))
        logger.debug(traceback.format_exc())
Example #6
def find_insertions(blast_result):
    '''
    Identifies and annotates/fixes insertions. Frameshift insertions (those with a length
    not evenly divisible by 3) will be removed. Codon-length insertions will be annotated.

    Input is a BlastResult object.

    Output is a list of insertion annotations (an empty list if there were no
    codon-length insertions).
    '''
    logger = log.get_logger(__name__)
    try:
        insertions = []
        o = 0
        for i in re.finditer('-+', blast_result.germline_alignment):
            s = i.start() - o
            e = i.end() - o
            l = e - s
            if l % 3 == 0 or l > 3:
                insertions.append(_annotate_insertion(blast_result, s, e))
            else:
                blast_result = _fix_frameshift_insertion(blast_result, s, e)
                o += l
        return insertions if insertions else []
    except:
        logger.debug('FIND INSERTIONS ERROR: {}, {}'.format(blast_result.id,
                                                             blast_result.input_sequence))
        logger.debug(traceback.format_exc())
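The core of the gap search can be seen in isolation: runs of ``-`` in the gapped germline alignment mark insertions in the query, and the run length decides whether an insertion is annotated (codon-length) or treated as a frameshift. A minimal, self-contained sketch using a made-up alignment string:

import re

# hypothetical gapped germline alignment; '-' runs mark insertions in the query
germline_alignment = 'ATGGTC---AAGCT-TGA'

for match in re.finditer('-+', germline_alignment):
    start, end = match.start(), match.end()
    length = end - start
    kind = 'codon-length (annotate)' if length % 3 == 0 else 'frameshift (fix)'
    print('insertion at {}-{}: length {}, {}'.format(start, end, length, kind))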
Example #7
def get_junction(vdj, germ=False):
    global logger
    logger = log.get_logger(__name__)
    try:
        return Junction(vdj, germ)
    except Exception:
        logger.debug('JUNCTION ERROR: {id}'.format(id=vdj.id))
        logger.debug(traceback.format_exc())
Example #8
def parse_codons(vdj, gapped):
    logger = log.get_logger(__name__)
    try:
        codons = Codons(vdj, gapped)
        return codons
    except:
        logger.debug('PARSE CODONS ERROR: {}, {}'.format(vdj.id, vdj.raw_input))
        logger.debug(traceback.format_exc())
Example #9
def aa_mutations(blast_result):
    global logger
    logger = log.get_logger(__name__)
    try:
        return MutationsAA(blast_result)
    except:
        logger.debug('AA MUTATIONS ERROR: {}, {}\n'.format(blast_result.id,
                                                          blast_result.input_sequence))
        logger.debug(traceback.format_exc())
Example #10
File: s3.py Project: briney/abtools
def compress_and_upload(data, compressed_file, s3_path, multipart_chunk_size_mb=500,
    method='gz', delete=False, access_key=None, secret_key=None):
    '''
    Compresses data and uploads to S3.

    S3 upload uses ``s3cmd``, so you must either:

        1) Manually configure ``s3cmd`` prior to use (typically using ``s3cmd --configure``).

        2) Configure ``s3cmd`` using ``s3.configure()``.

        3) Pass your access key and secret key to ``compress_and_upload``, which will automatically configure s3cmd.

    .. note::

        ``s3cmd`` configuration only needs to be done once per computer,
        which means that relaunching a cloud instance or Docker image will
        require re-configuration of ``s3cmd``.

    Args:

        data: Can be one of three things:

            1) Path to a single file

            2) Path to a directory

            3) A list of one or more paths to files or directories

        compressed_file (str): Path to the compressed file. Required.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
          will be the basename of the ``compressed_file``. For example::

            compress_and_upload(data='/path/to/data',
                                compressed_file='/path/to/compressed.tar.gz',
                                s3_path='s3://my_bucket/path/to/')

          will result in an uploaded S3 path of ``s3://my_bucket/path/to/compressed.tar.gz``

        method (str): Compression method. Options are ``'gz'`` (gzip) or ``'bz2'`` (bzip2).
            Default is ``'gz'``.

        delete (bool): If ``True``, the ``compressed_file`` will be deleted after upload
            to S3. Default is ``False``.

        access_key (str): AWS access key.

        secret_key (str): AWS secret key.
    '''
    logger = log.get_logger('s3')
    if all([access_key, secret_key]):
        configure(access_key=access_key, secret_key=secret_key, logger=logger)
    compress(data, compressed_file, compress=method, logger=logger)
    put(compressed_file, s3_path, multipart_chunk_size_mb=multipart_chunk_size_mb, logger=logger)
    if delete:
        os.unlink(compressed_file)
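A hedged usage sketch (bucket, paths, and keys below are placeholders, and the ``from abtools import s3`` import is an assumption based on the file/project noted above):

from abtools import s3

# compress a results directory to a bzip2 tarball, configure s3cmd with
# explicit (placeholder) credentials, upload, and remove the local archive
s3.compress_and_upload(data='/path/to/results',
                       compressed_file='/path/to/results.tar.bz2',
                       s3_path='s3://my_bucket/results/',
                       method='bz2',
                       access_key='MY_ACCESS_KEY',
                       secret_key='MY_SECRET_KEY',
                       delete=True)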
Example #11
def get_isotype(vdj):
    logger = log.get_logger(__name__)
    try:
        mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        isotype_file = os.path.join(mod_dir, 'ssw/isotypes/{}_isotypes.fasta'.format(vdj.species))
        isotype_seqs = [Sequence(s) for s in SeqIO.parse(open(isotype_file, 'r'), 'fasta')]
        isotype_seqs += [Sequence((s.id, s.reverse_complement)) for s in isotype_seqs]
        return Isotype(vdj, isotype_seqs)
    except:
        logger.debug('ISOTYPE ERROR: {}\t{}'.format(vdj.id, vdj.raw_query))
        logger.debug(traceback.format_exc())
Example #12
File: jobs.py Project: cvisb/abstar
def run_abstar(sequence_file, output_directory, args):
    '''
    Wrapper function to multiprocess (or not) the assignment of V, D and J
    germline genes. Also writes the JSON-formatted output to file.

    Input is a FASTA-formatted file of antibody sequences and the output directory.
    Optional input items include the species (supported species: 'human'); length of
    the unique antibody identifier (UAID); and debug mode (which forces single-threading
    and prints more verbose errors).

    Output is the number of functional antibody sequences identified in the input file.
    '''
    try:
        # setup logging
        global logger
        logger = log.get_logger(__name__)
        assigned_log = ''
        unassigned_log = ''
        # identify output file
        output_filename = os.path.basename(sequence_file)
        if args.output_type == 'json':
            output_file = os.path.join(output_directory, output_filename + '.json')
        elif args.output_type in ['imgt', 'hadoop']:
            output_file = os.path.join(output_directory, output_filename + '.txt')
        # start assignment
        assigner = ASSIGNERS[args.assigner]
        assigner(sequence_file, args.species)
        # process all of the successfully assigned sequences
        assigned = [Antibody(vdj, args.species) for vdj in assigner.assigned]
        for ab in assigned:
            ab.annotate()
            if args.debug:
                assigned_log += ab.format_log()
        results = get_abstar_results(assigned,
                                     pretty=args.pretty,
                                     padding=args.padding,
                                     raw=args.raw)
        write_output(results, output_file, args.output_type)
        # capture the log for all unsuccessful sequences
        for vdj in assigner.unassigned:
            unassigned_log += vdj.format_log()

        return (len(assigned), assigned_log, unassigned_log)

    #     vdj_output = process_sequence_file(seq_file, args)
    #     if not vdj_output:
    #         return None
    #     clean_vdjs = [vdj for vdj in vdj_output if vdj.rearrangement]
    #     output_count = write_output(clean_vdjs, output_file, args.output_type, args.pretty, args.padding)
    #     return (output_file, output_count)
    except:
        logger.debug(traceback.format_exc())
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
Example #13
def cdhit(seqs,
          out_file=None,
          temp_dir=None,
          threshold=0.975,
          make_db=True,
          quiet=False,
          threads=0,
          max_memory=800,
          debug=False):
    # '''
    # Perform CD-HIT clustering on a set of sequences.

    # Inputs are an iterable of sequences, which can be in any format that abtools.sequence.Sequence
    # can handle.

    # Returns the centroid file name and cluster file name (from CD-HIT).
    # If ::make_db:: is True (default), a SQLite3 connection and database path are also returned.
    # '''
    logger = log.get_logger('cluster')
    start_time = time.time()
    seqs = [Sequence(s) for s in seqs]
    if not quiet:
        logger.info('CD-HIT: clustering {} sequences'.format(len(seqs)))
    if out_file is None:
        out_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False)
        ofile = out_file.name
    else:
        ofile = os.path.expanduser(out_file)
    ifile = _make_cdhit_input(seqs, temp_dir)
    cdhit_cmd = 'cd-hit -i {} -o {} -c {} -n 5 -d 0 -T {} -M {}'.format(
        ifile, ofile, threshold, threads, max_memory)
    cluster = sp.Popen(cdhit_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = cluster.communicate()
    if debug:
        print(stdout)
        print(stderr)
    else:
        os.unlink(ifile)
    if not quiet:
        logger.info(
            'CD-HIT: clustering took {:.2f} seconds'.format(time.time() -
                                                            start_time))
    cfile = ofile + '.clstr'
    if make_db:
        if not quiet:
            logger.info('CD-HIT: building a SQLite3 database')
        seq_db, db_path = _build_seq_db(seqs, direc=temp_dir)
        return ofile, cfile, seq_db, db_path
    return ofile, cfile
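A short usage sketch for the return values (the input tuples and threshold are placeholders; as noted in the commented docstring, any input accepted by ``abtools.sequence.Sequence`` works):

# hypothetical input: (id, sequence) tuples are one accepted Sequence format
seqs = [('seq1', 'ATGCAGGTCAAGCTG'), ('seq2', 'ATGCAGGTGAAGCTG')]

# with make_db=True (the default), a SQLite3 connection and database path
# are returned along with the CD-HIT centroid and cluster file names
centroid_file, cluster_file, seq_db, db_path = cdhit(seqs, threshold=0.975)

# with make_db=False, only the two CD-HIT output files are returned
centroid_file, cluster_file = cdhit(seqs, threshold=0.975, make_db=False)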
Example #14
def nt_mutations(blast_result):
    global logger
    logger = log.get_logger(__name__)
    try:
        return MutationsNT(blast_result)
    except:
        logger.debug('NT MUTATIONS ERROR: {}, {}\n'.format(blast_result.id,
                                                          blast_result.input_sequence))
        logger.debug('QUERY ALIGNMENT: {}'.format(blast_result.query_alignment))
        logger.debug('GERMLINE ALIGNMENT: {}'.format(blast_result.germline_alignment))
        logger.debug(blast_result.regions.raw_positions)
        logger.debug(blast_result.regions.adjusted_positions)
        logger.debug(blast_result.regions.nt_seqs)
        logger.debug(blast_result.regions.germline_nt_seqs)
        logger.debug(traceback.format_exc())
Example #15
def check_productivity(vdj):
    logger = log.get_logger(__name__)
    try:
        problems = 0
        problems += stop_codons(vdj)
        problems += ambig_codons(vdj)
        problems += vdj_agreement(vdj)
        problems += conserved_junc_residues(vdj)
        problems += rearrangement(vdj)
        problems += junction_frame(vdj)
        problems += indels(vdj)
        if problems:
            return 'no'
        return 'yes'
    except:
        logger.debug('PRODUCTIVITY CHECK ERROR: {}'.format(vdj.id))
        logger.debug(traceback.format_exc())
Example #16
def cdhit(seqs, out_file=None, temp_dir=None, threshold=0.975, make_db=True, quiet=False, threads=0, max_memory=800, debug=False):
    # '''
    # Perform CD-HIT clustering on a set of sequences.

    # Inputs are an iterable of sequences, which can be in any format that abtools.sequence.Sequence
    # can handle.

    # Returns the centroid file name and cluster file name (from CD-HIT).
    # If ::make_db:: is True (default), a SQLite3 connection and database path are also returned.
    # '''
    logger = log.get_logger('cluster')
    start_time = time.time()
    seqs = [Sequence(s) for s in seqs]
    if not quiet:
        logger.info('CD-HIT: clustering {} sequences'.format(len(seqs)))
    if out_file is None:
        out_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False)
        ofile = out_file.name
    else:
        ofile = os.path.expanduser(out_file)
    ifile = _make_cdhit_input(seqs, temp_dir)
    cdhit_cmd = 'cd-hit -i {} -o {} -c {} -n 5 -d 0 -T {} -M {}'.format(ifile,
                                                                        ofile,
                                                                        threshold,
                                                                        threads,
                                                                        max_memory)
    cluster = sp.Popen(cdhit_cmd,
                       shell=True,
                       stdout=sp.PIPE,
                       stderr=sp.PIPE)
    stdout, stderr = cluster.communicate()
    if debug:
        print(stdout)
        print(stderr)
    else:
        os.unlink(ifile)
    if not quiet:
        logger.info('CD-HIT: clustering took {:.2f} seconds'.format(time.time() - start_time))
    cfile = ofile + '.clstr'
    if make_db:
        if not quiet:
            logger.info('CD-HIT: building a SQLite3 database')
        seq_db, db_path = _build_seq_db(seqs, direc=temp_dir)
        return ofile, cfile, seq_db, db_path
    return ofile, cfile
Example #17
def regions(blast_result):
    '''
    Returns a VarRegions (or JoinRegions) object, containing a variety of information about
    Variable (or Joining) gene regions.  Both Regions objects contain equivalent attributes
    to ease downstream handling.

    Input is a BlastResult object for a variable gene.
    '''
    global logger
    logger = log.get_logger(__name__)
    try:
        if blast_result.gene_type == 'variable':
            return VarRegions(blast_result)
        if blast_result.gene_type == 'joining':
            return JoinRegions(blast_result)
    except Exception:
        logger.debug('REGIONS ERROR: {} {}'.format(blast_result.id, blast_result.seq))
        logger.debug(traceback.format_exc())
Example #18
File: s3.py Project: menis/abtools
def compress(d, output, compress='gz', logger=None):
    '''
    Creates a compressed/uncompressed tar file.

    Args:

        d: Can be one of three things:

            1. the path to a single file, as a string

            2. the path to a single directory, as a string

            3. an iterable of file or directory paths

        output (str): Output file path.

        compress: Compression method. Options are ``'gz'`` (gzip),
            ``'bz2'`` (bzip2) and ``'none'`` (uncompressed). Default is ``'gz'``.
    '''
    if not logger:
        logger = log.get_logger('s3')
    if type(d) not in [list, tuple]:
        d = [d, ]
    d = [os.path.expanduser(_d) for _d in d]
    print_compress_info(d, output, compress, logger)
    if compress.lower() == 'none':
        compress = ''
    elif compress.lower() not in ['gz', 'bz2']:
        logger.info(
            'Compression option ("{}") is invalid.\nFalling back to uncompressed.'
            .format(compress))
        compress = ''
    output = os.path.expanduser(output)
    tar = tarfile.open(output, 'w:{}'.format(compress))
    for obj in d:
        tar.add(obj)
    tar.close()
    return output
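For example, bundling a directory and a single file into one gzip-compressed archive (paths are placeholders; the ``from abtools import s3`` import is an assumption):

from abtools import s3

# creates /path/to/run_data.tar.gz containing both the directory and the CSV
archive = s3.compress(['/path/to/run_data', '/path/to/summary.csv'],
                      '/path/to/run_data.tar.gz',
                      compress='gz')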
Example #19
File: s3.py Project: menis/abtools
def print_compress_info(d, output, compress, logger):
    if not logger:
        logger = log.get_logger('s3')
    dirs = [obj for obj in d if os.path.isdir(obj)]
    files = [obj for obj in d if os.path.isfile(obj)]
    logger.info('')
    logger.info('')
    logger.info('')
    logger.info('-' * 25)
    logger.info('COMPRESSING DATA')
    logger.info('-' * 25)
    logger.info('')
    logger.info('Output file: {}'.format(output))
    logger.info('Compression: {}'.format(compress.lower()))
    if dirs:
        d = 'directories' if len(dirs) > 1 else 'directory'
        logger.info('Found {} {} to compress: {}'.format(
            len(dirs), d, ', '.join(dirs)))
    if files:
        f = 'files' if len(files) > 1 else 'file'
        logger.info('Found {} {} to compress: {}'.format(
            len(files), f, ', '.join(files)))
Example #20
File: s3.py Project: briney/abtools
def print_compress_info(d, output, compress, logger):
    if not logger:
        logger = log.get_logger('s3')
    dirs = [obj for obj in d if os.path.isdir(obj)]
    files = [obj for obj in d if os.path.isfile(obj)]
    logger.info('')
    logger.info('')
    logger.info('')
    logger.info('-' * 25)
    logger.info('COMPRESSING DATA')
    logger.info('-' * 25)
    logger.info('')
    logger.info('Output file: {}'.format(output))
    logger.info('Compression: {}'.format(compress.lower()))
    if dirs:
        d = 'directories' if len(dirs) > 1 else 'directory'
        logger.info('Found {} {} to compress: {}'.format(len(dirs), d,
                                                         ', '.join(dirs)))
    if files:
        f = 'files' if len(files) > 1 else 'file'
        logger.info('Found {} {} to compress: {}'.format(len(files), f,
                                                         ', '.join(files)))
Example #21
File: s3.py Project: briney/abtools
def configure(access_key=None, secret_key=None, logger=None):
    '''
    Configures s3cmd prior to first use.

    If no arguments are provided, you will be prompted to enter
    the access key and secret key interactively.

    Args:

        access_key (str): AWS access key

        secret_key (str): AWS secret key
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not all([access_key, secret_key]):
        logger.info('')
        access_key = raw_input('AWS Access Key: ')
        secret_key = raw_input('AWS Secret Key: ')
    _write_config(access_key, secret_key)
    logger.info('')
    logger.info('Completed writing S3 config file.')
    logger.info('')
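Both configuration modes, as a sketch (keys are placeholders; the ``from abtools import s3`` import is an assumption):

from abtools import s3

# non-interactive: pass both keys and the config file is written directly
s3.configure(access_key='MY_ACCESS_KEY', secret_key='MY_SECRET_KEY')

# interactive: with either key missing, both are requested at the prompt
s3.configure()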
Example #22
File: s3.py Project: menis/abtools
def configure(access_key=None, secret_key=None, logger=None):
    '''
    Configures s3cmd prior to first use.

    If no arguments are provided, you will be prompted to enter
    the access key and secret key interactively.

    Args:

        access_key (str): AWS access key

        secret_key (str): AWS secret key
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not all([access_key, secret_key]):
        logger.info('')
        access_key = raw_input('AWS Access Key: ')
        secret_key = raw_input('AWS Secret Key: ')
    _write_config(access_key, secret_key)
    logger.info('')
    logger.info('Completed writing S3 config file.')
    logger.info('')
Example #23
File: s3.py Project: menis/abtools
def put(f, s3_path, multipart_chunk_size_mb=500, logger=None):
    '''
    Uploads a single file to S3, using s3cmd.

    Args:

        f (str): Path to a single file.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of ``f``. For example::

                put(f='/path/to/myfile.tar.gz', s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of ``s3://my_bucket/path/to/myfile.tar.gz``
    '''
    if not logger:
        logger = log.get_logger('s3')
    fname = os.path.basename(f)
    target = os.path.join(s3_path, fname)
    s3cmd_cline = 's3cmd put {} {} --multipart-chunk-size-mb {}'.format(
        f, target, multipart_chunk_size_mb)
    print_put_info(fname, target, logger)
    s3cmd = sp.Popen(s3cmd_cline, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    stdout, stderr = s3cmd.communicate()
Example #24
File: s3.py Project: briney/abtools
def compress(d, output, compress='gz', logger=None):
    '''
    Creates a compressed/uncompressed tar file.

    Args:

        d: Can be one of three things:

            1. the path to a single file, as a string

            2. the path to a single directory, as a string

            3. an iterable of file or directory paths

        output (str): Output file path.

        compress: Compression method. Options are ``'gz'`` (gzip),
            ``'bz2'`` (bzip2) and ``'none'`` (uncompressed). Default is ``'gz'``.
    '''
    if not logger:
        logger = log.get_logger('s3')
    if type(d) not in [list, tuple]:
        d = [d, ]
    d = [os.path.expanduser(_d) for _d in d]
    print_compress_info(d, output, compress, logger)
    if compress.lower() == 'none':
        compress = ''
    elif compress.lower() not in ['gz', 'bz2']:
        logger.info('Compression option ("{}") is invalid.\nFalling back to uncompressed.'.format(compress))
        compress = ''
    output = os.path.expanduser(output)
    tar = tarfile.open(output, 'w:{}'.format(compress))
    for obj in d:
        tar.add(obj)
    tar.close()
    return output
Example #25
def run_standalone(args):
    global logger
    logger = log.get_logger('demultiplex')
    main(args)
Example #26
def run_standalone(args):
    global logger
    logger = log.get_logger('barcodes')
    main(args)


def main(args):
    for f in list_files(args.input):
        # experiment = get_experiment(f, args)
        wb = load_workbook(f)
        ws = wb[wb.get_sheet_names()[0]]
        plate_blocks = get_plate_blocks(ws, args)
        plural = '' if len(plate_blocks) <= 2 else 's'
        logger.info('\nFound {} plate{} in the input file'.format(
            len(plate_blocks) - 1, plural))
        # logger.info('Experiment name: {}\n'.format(experiment))
        # plates = parse_plates(plate_blocks[1:], args)
        plates = parse_barcodes(plate_blocks[1:], args)
        write_output(plates, args)
        logger.info('')


if __name__ == '__main__':
    args = parse_args()
    if args.log is None:
        args.log = os.path.join(args.output, 'barcodes.log')
    log.setup_logging(args.log)
    logger = log.get_logger('barcodes')
    main(args)
Example #27
def run_standalone(args):
    logfile = args.log if args.log else os.path.join(args.output_dir, 'abfinder.log')
    log.setup_logging(logfile)
    global logger
    logger = log.get_logger('abfinder')
    main(args)
Example #28
def run(**kwargs):
    '''
    Mines NGS datasets for identity to known antibody sequences.

    All of ``db``, ``output``, ``temp`` and ``standard`` are required.


    Args:

        db (str): Name of a MongoDB database to query.

        collection (str): Name of a MongoDB collection. If not provided, all collections
            in ``db`` will be processed iteratively.

        output_dir (str): Path to the output directory, into which identity/divergence
            figures will be deposited.

        temp_dir (str): Path to a temporary directory.

        log (str): Path to a log file. If not provided, log information will not be retained.

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (str): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        standard (path): Path to a FASTA-formatted file containing one or more 'standard'
            sequences, against which the NGS sequences will be compared.

        chain (str): Antibody chain. Choices are 'heavy', 'kappa', 'lambda', and 'light'.
            Default is 'heavy'. Only NGS sequences matching ``chain`` (with 'light' covering
            both 'kappa' and 'lambda') will be compared to the ``standard`` sequences.

        update (bool): If ``True``, the MongoDB record for each NGS sequence will be updated
            with identity information for each standard. If ``False``, the update is skipped.
            Default is ``True``.

        is_aa (bool): If ``True``, the ``standard`` sequences are amino acid sequences. If
            ``False``, they are nucleotide sequences. Default is ``False``.

        x_min (int): Minimum x-axis value on identity/divergence plots.

        x_max (int): Maximum x-axis value on identity/divergence plots.

        y_min (int): Minimum y-axis value on identity/divergence plots.

        y_max (int): Maximum y-axis value on identity/divergence plots.

        gridsize (int): Relative size of hexbin grids.

        mincount (int): Minimum number of sequences in a hexbin for the bin to be colored.
            Default is 3.

        colormap (str, colormap): Colormap to be used for identity/divergence plots.
            Default is ``Blues``.

        debug (bool): If ``True``, more verbose logging.
   '''
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abfinder')
    main(args)
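A usage sketch with placeholder values; the keyword names follow the Args section above and the function is assumed to be importable from the abfinder module:

# hypothetical call; all values are placeholders
run(db='my_database',
    collection='sample01',
    output_dir='/path/to/figures',
    temp_dir='/path/to/temp',
    log='/path/to/abfinder.log',
    standard='/path/to/standards.fasta',
    chain='heavy',
    is_aa=False)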
Example #29
    print_standards_info(standards)
    collections = mongodb.get_collections(db, args.collection, prefix=args.collection_prefix)
    print_collections_info(collections)
    for collection in collections:
        indexed = False
        print_single_collection(collection)
        if args.remove_padding:
            print_remove_padding()
            mongodb.remove_padding(db, collection)
        seq_files = get_sequences(db, collection, args.temp_dir, args)
        for standard in standards:
            print_single_standard(standard)
            scores = run_jobs(seq_files, standard, args)
            if args.output_dir:
                make_figure(standard.id, scores, collection, args)
            if args.update:
                if not indexed:
                    mongodb.index(db, collection, 'seq_id')
                    indexed = True
                update_db(db, standard.id, scores, collection, args)
        clean_up(seq_files)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output_dir, 'abfinder.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abfinder')
    main(args)
Example #30
def run(**kwargs):
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('barcodes')
    main(args)
Example #31
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#


import glob
from multiprocessing import cpu_count
import os
import sys
import subprocess as sp

from abtools import log

logger = log.get_logger('basespace')


def list_files(d):
    return sorted([f for f in glob.glob(d + '/*') if os.path.isfile(f)])


def pair_files(files, nextseq):
    pairs = {}
    for f in files:
        if nextseq:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:3])
        else:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:2])
        if f_prefix in pairs:
            pairs[f_prefix].append(f)
        else:
            pairs[f_prefix] = [f, ]
    return pairs
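The pairing relies on Illumina-style file names: the first two underscore-separated fields (three for NextSeq runs) identify a sample, so R1/R2 files end up under the same key. A minimal sketch with made-up MiSeq file names:

import os

# hypothetical MiSeq read files; both share the prefix 'sample1_S1'
files = ['/data/sample1_S1_L001_R1_001.fastq.gz',
         '/data/sample1_S1_L001_R2_001.fastq.gz']

for f in files:
    prefix = '_'.join(os.path.basename(f).split('_')[:2])
    print(prefix)  # 'sample1_S1' for both reads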
Example #32
File: s3.py Project: menis/abtools
def compress_and_upload(data,
                        compressed_file,
                        s3_path,
                        multipart_chunk_size_mb=500,
                        method='gz',
                        delete=False,
                        access_key=None,
                        secret_key=None):
    '''
    Compresses data and uploads to S3.

    S3 upload uses ``s3cmd``, so you must either:

        1) Manually configure ``s3cmd`` prior to use (typically using ``s3cmd --configure``).

        2) Configure ``s3cmd`` using ``s3.configure()``.

        3) Pass your access key and secret key to ``compress_and_upload``, which will automatically configure s3cmd.

    .. note::

        ``s3cmd`` configuration only needs to be done once per computer,
        which means that relaunching a cloud instance or Docker image will
        require re-configuration of ``s3cmd``.

    Args:

        data: Can be one of three things:

            1) Path to a single file

            2) Path to a directory

            3) A list of one or more paths to files or directories

        compressed_file (str): Path to the compressed file. Required.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
          will be the basename of the ``compressed_file``. For example::

            compress_and_upload(data='/path/to/data',
                                compressed_file='/path/to/compressed.tar.gz',
                                s3_path='s3://my_bucket/path/to/')

          will result in an uploaded S3 path of ``s3://my_bucket/path/to/compressed.tar.gz``

        method (str): Compression method. Options are ``'gz'`` (gzip) or ``'bz2'`` (bzip2).
            Default is ``'gz'``.

        delete (bool): If ``True``, the ``compressed_file`` will be deleted after upload
            to S3. Default is ``False``.

        access_key (str): AWS access key.

        secret_key (str): AWS secret key.
    '''
    logger = log.get_logger('s3')
    if all([access_key, secret_key]):
        configure(access_key=access_key, secret_key=secret_key, logger=logger)
    compress(data, compressed_file, compress=method, logger=logger)
    put(compressed_file,
        s3_path,
        multipart_chunk_size_mb=multipart_chunk_size_mb,
        logger=logger)
    if delete:
        os.unlink(compressed_file)
Example #33
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

import os
import sys
import glob
import subprocess as sp
from multiprocessing import cpu_count

from abtools import log

logger = log.get_logger('basespace')


def list_files(d):
    return sorted([f for f in glob.glob(d + '/*') if os.path.isfile(f)])


def pair_files(files, nextseq):
    pairs = {}
    for f in files:
        if nextseq:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:3])
        else:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:2])
        if f_prefix in pairs:
            pairs[f_prefix].append(f)
        else:
            pairs[f_prefix] = [f, ]
    return pairs
Example #34
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format recognized
    by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp`` are required.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with ``uuid.uuid4()``.
        In either case, when given a single sequence, ``abstar.run()`` will return a single AbTools ``Sequence``
        object. If running multiple sequences, you can either pass each sequence as a positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using sequences parsed from a
        FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools ``Sequence`` objects,
        one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the input file is
        extremely large), you can pass the input file path directly, along with a temp directory and output
        directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the above case,
        ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the directory path
        using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or uncompressed)
        that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to processing with AbStar.
        By default, PANDAseq's 'simple bayesian' read merging algorithm is used, although alternate
        algorithms can be selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the `IMGT Summary file`_.
        This option is provided to minimize the effort needed to convert existing
        IMGT-based pipelines to AbStar. Alternate output is only available when passing an input file or
        directory; passing individual sequences or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

        .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when downloading
            files directly from BaseSpace; all subdirectories will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing
            read merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided,
            the log will be written to ``/path/to/project_dir/abstar.log``. If output is
            provided, log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence region
            downstream of the J-gene. If False, the isotype will not be determined.
            Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode input RNA.
            A positive integer results in the UMID being parsed from the start of the read (or merged
            read), a negative integer results in parsing from the end of the read. Default is 0,
            which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable. If False,
            JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary
            table produced by IMGT High-V/Quest, to maintain a level of compatibility with
            existing IMGT-based pipelines. JSON output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTQ files (gzip compressed or uncompressed)
            which will be merged with PANDAseq prior to processing with AbStar. If ``basespace`` is True,
            ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or 'uparse'. Default is
            'simple_bayesian', which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log is much more verbose,
            and temporary files are not removed. Default is ``False``.


    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence`` object.

        If the input is a list of sequences, ``run`` returns a list of AbTools ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output files.
    '''

    warnings.filterwarnings("ignore")
    # sequences stays None when input is supplied via the input/output/temp kwargs
    sequences = None
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [Sequence(args[0])]
        except:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except:
                print('ERROR: invalid format for sequence input:')
                for a in args:
                    print(a)
                sys.exit(1)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)
    kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    output = main(args)
    # if args.sequences is not None:
    #     output = [Sequence(o) for o in output]
    #     if len(output) == 1:
    #         return output[0]
    return output
Example #35

from multiprocessing import cpu_count
import os
from subprocess import Popen, PIPE
import sys

from Bio import SeqIO

from utils.pandaseq import pair_files

from abtools.log import get_logger
from abtools.pipeline import list_files, make_dir


logger = get_logger('preprocess')


def quality_trim(input_directory=None, output_directory=None,
        quality_cutoff=20, length_cutoff=50,
        quality_type='sanger', compress_output=True, file_pairs=None,
        singles_directory=None, nextseq=False, paired_reads=True,
        allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
Example #36
def mongoimport(json, database,
                ip='localhost', port=27017,
                user=None, password=None,
                delim='_', delim1=None, delim2=None,
                delim_occurance=1, delim1_occurance=1, delim2_occurance=1):
    '''
    Performs mongoimport on one or more json files.

    Args:

        json: Can be one of several things:

            - path to a single JSON file
            - an iterable (list or tuple) of one or more JSON file paths
            - path to a directory containing one or more JSON files

        database (str): Name of the database into which the JSON files
            will be imported

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB database. Default is ``27017``.

        user (str): Username for the MongoDB database, if authentication is enabled.
            Default is ``None``, which results in attempting connection without
            authentication.

        password (str): Password for the MongoDB database, if authentication is enabled.
            Default is ``None``, which results in attempting connection without
            authentication.

        delim (str): Delimiter, when generating collection names using a single delimiter.
            Default is ``_``.

        delim_occurance (int): Occurrence at which to split the filename when using a
            single delimiter. Default is ``1``.

        delim1 (str): Left delimiter when splitting with two delimiters. Default is None.

        delim1_occurance (int): Occurrence of ``delim1`` at which to split the filename.
            Default is ``1``.

        delim2 (str): Right delimiter when splitting with two delimiters. Default is None.

        delim2_occurance (int): Occurrence of ``delim2`` at which to split the filename.
            Default is ``1``.
    '''
    logger = log.get_logger('mongodb')
    _print_mongoimport_info(logger)
    if type(json) in (list, tuple):
        pass
    elif os.path.isdir(json):
        from abtools.utils.pipeline import list_files
        json = list_files(json)
    else:
        json = [json, ]
    jsons = sorted([os.path.expanduser(j) for j in json if j.endswith('.json')])
    collections = _get_import_collections(jsons, delim, delim_occurance,
                                          delim1, delim1_occurance,
                                          delim2, delim2_occurance)
    logger.info('Found {} files to import'.format(len(jsons)))
    logger.info('')
    for i, (json_file, collection) in enumerate(zip(jsons, collections)):
        logger.info('[ {} ] {} --> {}'.format(i + 1, os.path.basename(json_file), collection))
        # logger.info("Performing mongoimport on {}.".format(os.path.basename(json_file)))
        # logger.info("Importing the file into collection {}.".format(collection))
        if all([user, password]):
            host = '--host {} --port {} --username {} --password {}'.format(ip, port, user, password)
        else:
            host = '--host {} --port {}'.format(ip, port)
        mongo_cmd = "mongoimport {} --db {} --collection {} --file {}".format(
            host, database, collection, json_file)
        mongo = sp.Popen(mongo_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        stdout, stderr = mongo.communicate()
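A hedged usage sketch (database name, credentials, and paths are placeholders; the ``from abtools import mongodb`` import is an assumption based on the logger name). With the default delimiter (``_``) and ``delim_occurance=1``, a file named ``donor1_heavy.json`` would presumably be imported into a collection named ``donor1``:

from abtools import mongodb

# import every .json file in a directory of AbStar output into 'my_database'
mongodb.mongoimport(json='/path/to/abstar_output',
                    database='my_database',
                    ip='localhost',
                    port=27017,
                    user='my_user',
                    password='my_password')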
Example #37
def _print_remove_padding():
    logger = log.get_logger('mongodb')
    logger.info('Removing MongoDB padding...')
Example #38
def run(**kwargs):
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('demultiplex')
    main(args)
Example #39
def run(**kwargs):
    '''
    Corrects antibody reads using UAIDs (molecular barcodes) or identity-based clustering.

    Either ``json`` or ``db`` is required.


    Args:

        db (str): Name of a MongoDB database to query.

        collection (str): Name of a MongoDB collection. If not provided, all
            collections in ``db`` will be iteratively processed.

        json: Can be one of two things:

            1. Path to a JSON file, containing sequence data annotated by AbStar.

            2. Path to a directory, containing one or more JSON files of
                AbStar-annotated data.

        output (str): Path to the output directory, into which corrected FASTA
            files will be deposited. If it does not exist, it will be created.

        log (str): Path to the log file. If the parent directory doesn't exist,
            it will be created.

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        min_seqs (int): Minimum number of sequences for a centroid/consensus sequence to be
            calculated. After clustering (either by identity or using UAIDs), clusters with
            at least ``min_seqs`` sequences will be retained for consensus/centroid calculation.
            Default is ``1``.

        uaid (bool): If ``True``, use Unique Antibody IDs (UAIDs) for error correction.
            Sequences will be binned by UAID and the sequences in each bin will be used to
            compute a centroid or consensus sequence. If ``False``, sequences will be clustered
            by identity and each cluster will be used for consensus/centroid determination.

        parse_uaids (int): If UAIDs haven't been pre-parsed by AbStar, indicate the length of the
            UAID sequence (in nucleotides) and the UAIDs will be parsed during correction. If
            ``parse_uaids`` is negative, the UAID will be parsed from the end of the sequence.
            Default is ``0``, which does not parse a UAID sequence.

        consensus (bool): If ``True``, consensus sequences are calculated. If ``False``, centroid
            sequences are calculated. Default is ``True``.

        identity_threshold (float): Identity threshold, if clustering by identity (not UAIDs).
            Must be a float between 0 and 1. Default is 0.975.

        only_largest_cluster (bool): When clustering using UAIDs, there is some probability that
            different sequences get labeled with the same UAID. To limit incorrect consensus/centroid
            calculation, sequences in each UAID bin are clustered using ``identity_threshold`` before
            calculating consensus/centroid sequences. By default, all UAID clusters that meet the
            ``min_seqs`` size threshold are used to generate consensus/centroid sequences. If that
            behavior is not desired, setting ``only_largest_cluster`` to ``True`` results in only
            the largest UAID cluster being used to generate centroid/consensus sequences.

        nr (bool): If ``True``, a non-redundant sequence dataset will be generated using ``sort | uniq``.
            This is much faster than normal sequence clustering with CD-HIT, but can only be performed at an
            identity threshold of 100%.

            .. note::

                Using ``nr`` may produce different results than clustering sequences with ``identity_threshold``
                set to ``1.0``. This is because sequences of different lengths that are otherwise identical
                will not be collapsed when using ``nr`` but will be collapsed using normal clustering.

        germs (str): Path to a file containing germline V-gene sequences. When clustering with ``min_seqs``
            equal to 2, the appropriate germline sequence will be added to the alignment to serve as a
            consensus tiebreaker.

        aa (bool): If ``True``, perform sequence clustering (either using ``identity_threshold`` or ``nr``)
            using amino acid sequences. Default is ``False``, which performs clustering using nucleotide
            sequences.

        debug (bool): If ``True``, logging is more verbose.
    '''
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abcorrect')
    main(args)
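A usage sketch for UAID-based correction with placeholder values; keyword names follow the docstring above and the function is assumed to be importable from the abcorrect module:

# hypothetical call; all values are placeholders
run(db='my_database',
    collection='sample01',
    output='/path/to/corrected_fastas',
    log='/path/to/abcorrect.log',
    uaid=True,
    min_seqs=3,
    consensus=True)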
Example #40

def run_standalone(args):
    global logger
    logger = log.get_logger('demultiplex')
    main(args)


def main(args):
    for f in list_files(args.input):
        experiment = get_experiment(f, args)
        wb = load_workbook(f)
        ws = wb[wb.get_sheet_names()[0]]
        plate_blocks = get_plate_blocks(ws, args)
        plural = '' if len(plate_blocks) <= 2 else 's'
        logger.info('\nFound {} plate{} in the input file'.format(
            len(plate_blocks) - 1, plural))
        logger.info('Experiment name: {}\n'.format(experiment))
        plates = parse_plates(plate_blocks[1:], args)
        write_output(plates, experiment, args)
        logger.info('')


if __name__ == '__main__':
    args = parse_args()
    if args.log is None:
        args.log = os.path.join(args.output, 'platemap.log')
    log.setup_logging(args.log)
    logger = log.get_logger('demultiplex')
    main(args)
Example #41
            s1_all_vgenes = get_vgenes(db, s1, args.chain)
        print_pair_info(s1, s2)
        s1_vgenes, s2_vgenes = get_vgenes(db, s2, args.chain, prev_data=s1_all_vgenes)
        logger.info('')
        logger.info('Calculating similarities...')
        median, counts, bins, similarities = calculate_similarities(s1_vgenes,
                                                                    s2_vgenes,
                                                                    args)
        write_output(s1, s2, median, counts, bins, similarities, args)
        scores = update_scores(s1, s2, median, scores)
        if args.control_similarity:
            logger.info('')
            logger.info('Calculating control similarities...')
            cmedian, ccounts, cbins, csimilarities = calculate_control_similarities(s1_vgenes,
                                                                                    s2_vgenes,
                                                                                    args)
            write_output(s1, s2, cmedian, ccounts, cbins, csimilarities, args)
            cscores = update_scores(s1, s2, cmedian, cscores)
        prev1 = s1
    print_final_results(scores)
    print_final_results(cscores, control=True)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output, 'abcompare.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abcompare')
    main(args)
Example #42
def run_standalone(args):
    global logger
    logger = log.get_logger('barcodes')
    main(args)
Example #43
def run(**kwargs):
    '''
    Performs repertoire-level comparison of antibody sequencing datasets.

    Currently, the only metric for comparison is V-gene usage frequency. Additional measures
    are in the works (such as comparisons based on clonality).

    Args:

        db (str): MongoDB database name.

        collection1 (str): Name of the first MongoDB collection to query for comparison.
            If both ``collection1`` and ``collection2`` are provided, ``collection1`` will
            be compared only to ``collection2``.
            If neither ``collection1`` nor ``collection2`` are provided, all collections in
            ``db`` will be processed iteratively (all pairwise comparisons will be made).
            If ``collection1`` is provided but ``collection2`` is not, ``collection1`` will
            be iteratively compared to all other collections in ``db``.

        collection2 (str): Name of the second MongoDB collection to query for comparison.
            If both ``collection1`` and ``collection2`` are provided, ``collection1`` will
            be compared only to ``collection2``.
            If neither ``collection1`` nor ``collection2`` are provided, all collections in
            ``db`` will be processed iteratively (all pairwise comparisons will be made).

        collection_prefix (str): All collections beginning with ``collection_prefix`` will
            be iteratively compared (all pairwise comparisons will be made).

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either
            of ``user`` or ``password`` is not provided, the connection to the MongoDB
            database will be attempted without authentication.

        chunksize (int): Number of sequences for each iteration. Default is 100,000.

        iterations (int): Number of iterations to perform on each pair of samples.
            Default is 10,000.

        method (str): Similarity/divergence method to be used for comparison. Default is
            ``marisita-horn``. Options are:

            - ``marisita-horn``
            - ``kullback-leibler``
            - ``jensen-shannon``
            - ``jaccard``
            - ``bray-curtis``
            - ``renkonen``
            - ``cosine``

        control_similarity (bool): If ``True``, control similarity/divergence will be
            calculated, in which each sample is also compared to itself. Default is ``False``.

        chain (str): Antibody chain to be used for comparison. Options are ``heavy``, ``kappa``
            and ``lambda``. Default is ``heavy``.
    '''

    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abcompare')
    main(args)
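A usage sketch with placeholder values; keyword names follow the docstring above and the function is assumed to be importable from the abcompare module:

# hypothetical call; all values are placeholders
run(db='my_database',
    collection1='timepoint_01',
    collection2='timepoint_02',
    method='marisita-horn',
    chain='heavy',
    chunksize=100000,
    iterations=10000)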
Example #44
            seqs = get_seqs(db, collection, args, make_seq_db=False)
            unique_file = unix_sort_unique(seqs, args)
            write_nr_output(collection, unique_file, collection_start, args)
        else:
            seq_db_path = get_seqs(db, collection, args)
            initial_clusters = initial_clustering(seq_db_path, args)
            if args.min_seqs == 1:
                singletons = [ic for ic in initial_clusters if ic.size == 1]
                initial_clusters = [ic for ic in initial_clusters if ic.size > 1]
                logger.info('{} clusters contained only a single sequence. Processing singletons...'.format(len(singletons)))
                singleton_consentroids = process_singleton_clusters(singletons, seq_db_path, args)
                logger.info('')
            else:
                singleton_consentroids = []
            consentroids = process_initial_clusters(initial_clusters, seq_db_path, args)
            consentroids += singleton_consentroids
            sequences, sizes = zip(*consentroids)
            write_output(sample_name, sequences, sizes, collection_start, args)
            for ic in initial_clusters:
                ic.cleanup()
            remove_sqlite_db(args)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output, 'abcorrect.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abcorrect')
    main(args)
Example #45
def run_standalone(args):
    logfile = args.log if args.log else os.path.join(args.output, 'abcompare.log')
    log.setup_logging(logfile)
    global logger
    logger = log.get_logger('abcompare')
    main(args)