Пример #1
0
def get_isotype(antibody):
    '''
    Assigns an isotype to an antibody.

    Isotype reference sequences are read from ``isotypes/isotypes.fasta``
    inside the germline database directory for ``antibody.species`` and are
    handed, together with the antibody, to ``Isotype``.

    Args:

        antibody: object exposing a ``species`` attribute and an
            ``exception()`` method used for error reporting.

    Returns:

        The ``Isotype`` built for ``antibody``, or ``None`` if isotyping
        failed (the traceback is reported through ``antibody.exception()``).
    '''
    try:
        germ_dir = get_germline_database_directory(antibody.species)
        isotype_file = os.path.join(germ_dir, 'isotypes/isotypes.fasta')
        # the context manager closes the FASTA handle even if parsing fails
        with open(isotype_file, 'r') as isotype_handle:
            isotype_seqs = [Sequence(s) for s in SeqIO.parse(isotype_handle, 'fasta')]
        return Isotype(antibody, isotype_seqs)
    except Exception:
        # Isotyping is best-effort: record the failure on the antibody instead
        # of aborting. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        antibody.exception('ISOTYPING ERROR', traceback.format_exc())
        return None
Пример #2
0
def get_germline(gene, species):
    '''
    Looks up a germline gene in the bundled germline FASTA files.

    The file searched is ``utils/germline_genes/<species>_<segment>.fasta``
    (relative to this module), where ``<segment>`` is the fourth character of
    the upper-cased gene name.

    Args:

        gene (str): Germline gene name. Matching against FASTA record IDs is
            done on the upper-cased name.

        species (str): Species name, used to build the FASTA file name.

    Returns:

        Sequence: the first FASTA record whose ID equals the gene name.

    Raises:

        IndexError: if the gene name has fewer than four characters or if no
            record in the FASTA file matches.
    '''
    gene = gene.upper()
    segment = gene[3]
    direc = os.path.dirname(os.path.abspath(__file__))
    gg_file = os.path.join(
        direc, 'utils/germline_genes/{}_{}.fasta'.format(species, segment))
    # stop at the first match and always close the handle
    with open(gg_file, 'r') as gg_handle:
        for record in SeqIO.parse(gg_handle, 'fasta'):
            if record.id == gene:
                return Sequence(record)
    # same exception type as the original ``[...][0]`` lookup, but with a
    # message that says what was actually missing
    raise IndexError(
        'germline gene {} was not found in {}'.format(gene, gg_file))
Пример #3
0
def parse_mabs(mabs, delimiter, args):
    '''
    Converts monoclonal antibody input into a list of ``Sequence`` objects
    whose IDs all start with ``mab<delimiter>``.

    Args:

        mabs: One of

            - a list/tuple of items, each converted with
              ``Sequence(item, id_key=args.name_key, seq_key=args.sequence_key)``
            - a single dict, converted with ``Sequence(mabs)``
            - anything else, which is treated as the path to a FASTA file

        delimiter (str): Separator placed between the ``mab`` prefix and the
            original sequence ID.

        args: object providing ``name_key`` and ``sequence_key`` (only read
            for list/tuple input).

    Returns:

        list: ``Sequence`` objects. Any ID whose first ``delimiter``-separated
        field is not ``mab`` is rewritten to ``mab<delimiter><id>``.
    '''
    if isinstance(mabs, (list, tuple)):
        seqs = [
            Sequence(m, id_key=args.name_key, seq_key=args.sequence_key)
            for m in mabs
        ]
    elif isinstance(mabs, dict):
        seqs = [
            Sequence(mabs),
        ]
    else:
        # The original code opened the undefined name ``mabs_file`` here, so
        # this branch always failed with a NameError; ``mabs`` is the path.
        with open(mabs, 'r') as mabs_handle:
            # NOTE(review): other call sites pass the sequence first and the
            # ID via ``id=``; confirm that (id, sequence) is the intended
            # positional order for Sequence() here.
            seqs = [
                Sequence(s.id, str(s.seq))
                for s in SeqIO.parse(mabs_handle, 'fasta')
            ]
    for seq in seqs:
        if seq.id.split(delimiter)[0] != 'mab':
            seq.id = 'mab{}{}'.format(delimiter, seq.id)
    return seqs
Пример #4
0
 def __init__(self, sequence, v=None, d=None, j=None):
     '''
     Wraps the input in a ``Sequence`` and stores the optional gene
     assignments.

     Args:

         sequence: anything accepted by ``Sequence()``.

         v, d, j: stored unchanged on the instance as ``self.v``,
             ``self.d`` and ``self.j``. Default is ``None`` for each.
     '''
     super(VDJ, self).__init__()
     LoggingMixin.__init__(self)
     parsed = Sequence(sequence)
     self.sequence = parsed
     self.id = parsed.id
     # initially the same object as ``self.sequence``
     self.oriented = parsed
     self.v, self.d, self.j = v, d, j
     self.initialize_log()
Пример #5
0
def get_vrc01_class_sequences(chain='heavy',
                              vgene_only=True,
                              only_include=None):
    '''
    Returns reference VRC01-class antibody sequences as ``Sequence`` objects.

    Args:

        chain (str): ``'heavy'`` (default) selects the heavy chain table. Any
            other value selects the light chain table, which is currently
            empty.

        vgene_only (bool): If ``True`` (default), the shorter version of each
            sequence is used (presumably truncated at the end of the V-gene
            region -- confirm). If ``False``, the longer sequences are used.

        only_include (str or iterable of str): Name, or collection of names,
            of the antibodies to return. Default (``None``) returns all of
            them.

    Returns:

        list: one ``Sequence`` per selected ``(name, sequence)`` pair.
    '''
    if vgene_only:
        heavy = [
            ('VRC01',
             'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTR'
             ),
            ('PGV04',
             'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCAR'
             ),
            ('VRC-CH31',
             'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCAR'
             ),
            ('3BNC60',
             'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCAR'
             ),
            ('12A12',
             'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCAR'
             ),
            ('PGV20',
             'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCAR'
             )
        ]
        light = []
    else:
        heavy = [
            ('VRC01',
             'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTRGKNCDYNWDFEHWGRGTPVIVSS'
             ),
            ('PGV04',
             'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCARQKFYTGGQGWYFDLWGRGTLIVVSS'
             ),
            ('VRC-CH31',
             'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCARAQKRGRSEWAYAHWGQGTPVVVSS'
             ),
            ('3BNC60',
             'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCARQRSDFWDFDVWGSGTQVTVSS'
             ),
            ('12A12',
             'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCARDGSGDDTSWHLDPWGQGTLVIVSA'
             ),
            ('PGV20',
             'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCARRMRSQDREWDFQHWGQGTRIIVSS'
             )
        ]
        light = []
    seqs = heavy if chain == 'heavy' else light
    if only_include is not None:
        # ``unicode`` only exists on Python 2; referencing it unconditionally
        # made every filtered call fail with a NameError on Python 3.
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str, )
        if isinstance(only_include, string_types):
            only_include = [
                only_include,
            ]
        seqs = [s for s in seqs if s[0] in only_include]
    return [Sequence(s) for s in seqs]
Пример #6
0
 def _make_consensus(self):
     '''
     Computes a consensus sequence for ``self.sequences``.

     A single sequence is returned as-is (the same object, not a copy).
     Otherwise the sequences are aligned with ``mafft`` and a gap consensus
     (threshold 0.51, ambiguous character 'n') is computed from the
     alignment; gaps are stripped and the result is upper-cased.

     Returns:

         The only element of ``self.sequences`` if there is exactly one,
         otherwise a new ``Sequence`` holding the consensus string.
     '''
     if len(self.sequences) == 1:
         return self.sequences[0]
     _aln = mafft(self.sequences, as_file=True)
     try:
         # the alignment is read fully into memory, so the handle can be
         # closed and the temporary file removed immediately afterwards
         with open(_aln, 'r') as aln_handle:
             aln = AlignIO.read(aln_handle, 'fasta')
     finally:
         # remove the temporary alignment even if parsing failed
         os.unlink(_aln)
     summary_align = AlignInfo.SummaryInfo(aln)
     consensus = summary_align.gap_consensus(threshold=0.51, ambiguous='n')
     consensus_string = str(consensus).replace('-', '')
     return Sequence(consensus_string.upper())
Пример #7
0
def get_vrc01_germline_sequence(vgene_only=True):
    '''
    Returns the 'glVRC01' reference sequence as a ``Sequence`` object.

    Args:

        vgene_only (bool): If true (default), the shorter of the two stored
            sequences is used; otherwise the longer one is used.

    Returns:

        Sequence: built from the tuple ``('glVRC01', <amino acid sequence>)``.
    '''
    if vgene_only:
        residues = 'QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR'
    else:
        residues = 'QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARGKNSDYNWDFQHWGQGTLVTVSS'
    return Sequence(('glVRC01', residues))
Пример #8
0
def retrieve_output_seqs(seq_ids, seq_db_path):
    '''
    Fetches output sequences for the given IDs from a SQLite database.

    Args:

        seq_ids: sequence IDs to look up. They are split into chunks with
            ``chunker`` and each chunk is queried separately.

        seq_db_path (str): path to a SQLite database containing a ``seqs``
            table with ``seq_id`` and ``output_seq`` columns.

    Returns:

        list: one ``Sequence`` per matching row, built from ``output_seq``
        with ``seq_id`` as its ID.
    '''
    conn = sqlite3.connect(seq_db_path)
    try:
        seq_db = conn.cursor()
        seqs = []
        for chunk in chunker(seq_ids):
            # one '?' placeholder per ID keeps the query parameterized
            placeholders = ','.join('?' * len(chunk))
            query = '''SELECT seqs.seq_id, seqs.output_seq
                       FROM seqs
                       WHERE seqs.seq_id IN ({})'''.format(placeholders)
            seqs.extend(seq_db.execute(query, chunk))
    finally:
        # Only SELECTs are issued, so there is nothing to commit; the
        # connection is closed even if a query raises.
        conn.close()
    return [Sequence(s[1], id=s[0]) for s in seqs]
Пример #9
0
 def _get_sequences(self):
     '''
     Returns the sequences for ``self.ids``.

     Without a sequence database (``self._seq_db is None``) the sequences
     are looked up in ``self._seq_dict``, in the order of ``self.ids``.
     Otherwise the ``seqs`` table is queried one chunk of IDs at a time (as
     produced by ``self._chunker``) and every returned row is wrapped in a
     ``Sequence``.
     '''
     if self._seq_db is None:
         return [self._seq_dict[seq_id] for seq_id in self.ids]
     rows = []
     for id_chunk in self._chunker(self.ids):
         # one '?' placeholder per ID in the chunk
         placeholders = ','.join('?' * len(id_chunk))
         query = '''SELECT seqs.id, seqs.sequence
                          FROM seqs
                          WHERE seqs.id IN ({})'''.format(placeholders)
         rows.extend(self._seq_db.execute(query, id_chunk))
     return [Sequence(row) for row in rows]
Пример #10
0
def cdhit(seqs,
          out_file=None,
          temp_dir=None,
          threshold=0.975,
          make_db=True,
          quiet=False,
          threads=0,
          max_memory=800,
          debug=False):
    '''
    Perform CD-HIT clustering on a set of sequences.

    Args:

        seqs: An iterable of sequences, which can be in any format that
            abtools.sequence.Sequence can handle.

        out_file (str): Path to the CD-HIT output file. If not provided, a
            temporary file (created in ``temp_dir``) is used.

        temp_dir (str): Directory used for the temporary output file, the
            CD-HIT input file and the sequence database.

        threshold (float): Passed to CD-HIT as ``-c``. Default is 0.975.

        make_db (bool): If True (default), a SQLite3 database of the input
            sequences is built with ``_build_seq_db``.

        quiet (bool): If True, progress is not logged. Default is False.

        threads (int): Passed to CD-HIT as ``-T``. Default is 0.

        max_memory (int): Passed to CD-HIT as ``-M``. Default is 800.

        debug (bool): If True, CD-HIT's stdout and stderr are printed and the
            CD-HIT input file is not deleted. Default is False.

    Returns:

        The centroid file name and cluster file name (from CD-HIT).
        If ``make_db`` is True (default), a SQLite3 connection and database
        path are also returned.
    '''
    logger = log.get_logger('cluster')
    start_time = time.time()
    seqs = [Sequence(s) for s in seqs]
    if not quiet:
        logger.info('CD-HIT: clustering {} sequences'.format(len(seqs)))
    if out_file is None:
        # Only the file name is needed; closing our handle right away avoids
        # leaking it while CD-HIT writes to the path.
        with tempfile.NamedTemporaryFile(dir=temp_dir, delete=False) as out_handle:
            ofile = out_handle.name
    else:
        ofile = os.path.expanduser(out_file)
    ifile = _make_cdhit_input(seqs, temp_dir)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cdhit_cmd = [
        'cd-hit', '-i', ifile, '-o', ofile, '-c', str(threshold), '-n', '5',
        '-d', '0', '-T', str(threads), '-M', str(max_memory)
    ]
    cluster = sp.Popen(cdhit_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = cluster.communicate()
    if debug:
        print(stdout)
        print(stderr)
    else:
        os.unlink(ifile)
    if not quiet:
        logger.info(
            'CD-HIT: clustering took {:.2f} seconds'.format(time.time() -
                                                            start_time))
    cfile = ofile + '.clstr'
    if make_db:
        if not quiet:
            logger.info('CD-HIT: building a SQLite3 database')
        seq_db, db_path = _build_seq_db(seqs, direc=temp_dir)
        return ofile, cfile, seq_db, db_path
    return ofile, cfile
Пример #11
0
 def run_partis(self, sequence_file, file_format, locus):
     '''
     Runs ``partis run-viterbi`` on a sequence file and parses the result.

     Args:

         sequence_file (str): path to the input sequence file; it is parsed
             here and also passed to partis as ``--infname``.

         file_format (str): format name handed to ``SeqIO.parse``.

         locus (str): ``'ig'`` runs partis with locus ``igh``; any other
             value runs it with locus ``tra``.

     Returns:

         Whatever ``self.parse_partis_output`` returns for the partis output
         file and a dict of input sequences keyed by ID (with ':' replaced
         by 'c').
     '''
     with open(sequence_file, 'r') as sequence_handle:
         seqs = [
             Sequence(s) for s in SeqIO.parse(sequence_handle, file_format)
         ]
     seq_dict = {s.id.replace(':', 'c'): s for s in seqs}
     partis_out = NamedTemporaryFile(delete=False)
     # only the path is needed: partis writes the file itself, so release
     # our handle instead of leaking it
     partis_out.close()
     try:
         germline_dir = os.path.join(self.germline_directory, 'partis/')
         locus = 'igh' if locus == 'ig' else 'tra'
         partis_cmd = [
             'partis', 'run-viterbi', '--infname', sequence_file, '--outfname',
             partis_out.name, '--locus', locus, '--initial-germline-dir',
             germline_dir
         ]
         p = sp.Popen(partis_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
         # communicate() drains both pipes and waits for partis to exit;
         # the captured output itself is not used
         p.communicate()
         return self.parse_partis_output(partis_out.name, seq_dict)
     finally:
         # remove the temporary output even if partis or parsing failed
         os.unlink(partis_out.name)
Пример #12
0
 def _rmp(self, sequences):
     '''
     Builds the 'RMP' sequence for a set of sequences and annotates it with
     ``run_abstar``.

     The ``vdj_nt`` sequences are aligned with ``muscle``. For every column
     of the aligned 'UCA' record, the most common character among the other
     aligned records is used if either its fraction is at least
     ``self.rmp_threshold`` or the number of records carrying a different
     character is at most ``self.rmp_alt_allowed_mismatches``; otherwise the
     UCA character is kept. Gap characters are removed from the result.

     Args:

         sequences: iterable of records providing ``'seq_id'`` and
             ``'vdj_nt'``. One aligned record must have the ID 'UCA'.

     Returns:

         The result of ``run_abstar`` on the assembled sequence (ID 'RMP').
     '''
     id_seq_pairs = [(s['seq_id'], s['vdj_nt']) for s in sequences]
     aln = muscle(id_seq_pairs)
     uca_aln = [rec for rec in aln if rec.id == 'UCA'][0]
     queries = [str(rec.seq) for rec in aln if rec.id != 'UCA']
     chosen = []
     for pos, uca_char in enumerate(uca_aln):
         counts = Counter([q[pos] for q in queries])
         top_char, top_count = counts.most_common(1)[0]
         total = sum(counts.values())
         frequent_enough = float(top_count) / total >= self.rmp_threshold
         few_alternatives = total - top_count <= self.rmp_alt_allowed_mismatches
         if frequent_enough or few_alternatives:
             chosen.append(top_char)
         else:
             chosen.append(uca_char)
     rmp = ''.join(chosen)
     return run_abstar(Sequence(rmp.replace('-', ''), id='RMP'))
Пример #13
0
def initial_clustering(seq_db_path, args):
    '''
    Clusters all sequences in the database with CD-HIT and returns the
    clusters that are large enough.

    Args:

        seq_db_path (str): path to a SQLite database with a ``seqs`` table.

        args: options object. ``uaid`` selects the ``uaid`` column instead
            of ``clustering_seq`` as the text to cluster;
            ``identity_threshold``, ``temp_dir`` and ``debug`` are passed to
            ``cluster``; ``min_seqs`` is the minimum cluster size.

    Returns:

        list: clusters whose ``size`` is at least ``args.min_seqs`` (may be
        empty).
    '''
    logger.info('\n{} clustering with CD-HIT...'.format('Initial UAID' if args.uaid else 'Identity-based'))
    start = time.time()
    conn = sqlite3.connect(seq_db_path)
    try:
        seq_db = conn.cursor()
        if args.uaid:
            rows = seq_db.execute('''SELECT seqs.seq_id, seqs.uaid FROM seqs''')
        else:
            rows = seq_db.execute('''SELECT seqs.seq_id, seqs.clustering_seq FROM seqs''')
        seqs = [Sequence(s[1], id=s[0]) for s in rows]
    finally:
        # All rows are in memory now and only SELECTs were issued, so the
        # connection can be closed (without a commit) before clustering.
        conn.close()
    all_clusters = cluster(seqs, args.identity_threshold, temp_dir=args.temp_dir,
                           quiet=True, max_memory=0, debug=args.debug)
    passed_clusters = [c for c in all_clusters if c.size >= args.min_seqs]
    sizes = [c.size for c in passed_clusters]
    logger.info('{} total clusters identified'.format(len(all_clusters)))
    logger.info('{} clusters meet the minimum size cutoff ({} sequence{})'.format(len(passed_clusters), args.min_seqs, 's' if args.min_seqs > 1 else ''))
    if sizes:
        # With no passing clusters the average (division by zero) and
        # max() of an empty list would both raise, so skip the statistics.
        logger.info('The average cluster contains {} sequences; the largest contains {} sequences'.format(round(1. * sum(sizes) / len(sizes), 2), max(sizes)))
    logger.info('Initial clustering took {} seconds\n'.format(round(time.time() - start, 2)))
    return passed_clusters
Пример #14
0
def vrc01_class_mutation_positions(seqs):
    '''
    Classifies every position of the input sequences relative to the
    'glVRC01' germline sequence and the VRC01-class reference sequences.

    The input ``vdj_aa`` sequences, the VRC01-class sequences and the
    full-length germline sequence are aligned together with ``muscle``.
    Alignment columns where the germline has a gap are skipped. For the
    remaining columns each input sequence gets one code:

        - 0: gap in the input sequence, or same residue as the germline
        - 2: differs from the germline and matches a residue of an aligned
          record named 'minVRC01' or 'min12A21' (this function does not add
          such records itself, so they are only present if the input
          contains sequences with those IDs)
        - 3: differs from the germline and matches a VRC01-class residue
        - 1: differs from the germline and matches none of the above

    Args:

        seqs: iterable of records providing ``'seq_id'`` and ``'vdj_aa'``.

    Returns:

        numpy.ndarray: one row of codes per aligned input sequence.
    '''
    data = []
    input_seqs = [Sequence([s['seq_id'], s['vdj_aa']]) for s in seqs]
    input_names = [s.id for s in input_seqs]
    # get VRC01-class sequences
    hiv_seqs = get_vrc01_class_sequences()
    all_hiv_names = [s.id for s in hiv_seqs]
    # MSA
    seqs_for_alignment = input_seqs + hiv_seqs
    seqs_for_alignment.append(get_vrc01_germline_sequence(vgene_only=False))
    aln = muscle(seqs_for_alignment)
    aln_seqs = [rec for rec in aln if rec.id in input_names]
    aln_gl = [rec for rec in aln if rec.id == 'glVRC01'][0]
    aln_mins = [rec for rec in aln if rec.id in ['minVRC01', 'min12A21']]
    aln_hiv = [rec for rec in aln if rec.id in all_hiv_names]
    # The reference residues of a column do not depend on the query, so
    # collect them once per column. ``None`` marks germline gap columns.
    columns = []
    for i, g in enumerate(str(aln_gl.seq)):
        if g == '-':
            columns.append(None)
        else:
            columns.append((g,
                            [m[i] for m in aln_mins],
                            [h[i] for h in aln_hiv]))
    for aln_seq in aln_seqs:
        seq_data = []
        for s, column in zip(str(aln_seq.seq), columns):
            if column is None:
                continue
            g, min_residues, vrc01_residues = column
            if s == '-' or s == g:
                seq_data.append(0)
            elif s in min_residues:
                seq_data.append(2)
            elif s in vrc01_residues:
                seq_data.append(3)
            else:
                seq_data.append(1)
        data.append(np.asarray(seq_data))
    return np.asarray(data)
Пример #15
0
 def _process_sequence(sequence, aa):
     '''
     Returns ``sequence`` unchanged if it is exactly a ``Sequence``
     instance, otherwise wraps it with ``Sequence(sequence)``.

     ``aa`` is accepted but not used.
     '''
     already_wrapped = type(sequence) == Sequence
     return sequence if already_wrapped else Sequence(sequence)
Пример #16
0
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format recognized
    by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp`` are required.

    If positional arguments are given but cannot be converted to sequences, an error message
    is printed and the process exits with status 1.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with ``uuid.uuid4()``.
        In either case, when given a single sequence, ``abstar.run()`` will return a single AbTools ``Sequence``
        object. If running multiple sequences, you can either pass each sequence as a positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using sequences parsed from a
        FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools ``Sequence`` objects,
        one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the input file is
        extremely large), you can pass the input file path directly, along with a temp directory and output
        directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the above case,
        ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the directory path
        using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or uncompressed)
        that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to processing with AbStar.
        By default, PANDAseq's 'simple bayesian' read merging algorithm is used, although alternate
        algorithms can be selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the `IMGT Summary file`_.
        This option is provided to minimize the effort needed to convert existing
        IMGT-based pipelines to AbStar. Alternate output is only available when passing an input file or
        directory; passing individual sequences or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

        .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when directly downloading
            files from BaseSpace, and all subdirectories will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing
            read merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided,
            the log will be written to ``/path/to/project_dir/abstar.log``. If output is
            provided, log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence region
            downstream of the J-gene. If False, the isotype will not be determined.
            Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode input RNA.
            A positive integer results in the UMID being parsed from the start of the read (or merged
            read), a negative integer results in parsing from the end of the read. Default is 0,
            which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable. If False,
            JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary
            table produced by IMGT High-V/Quest, to maintain a level of compatibility with
            existing IMGT-based pipelines. JSON output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTA files (gzip compressed or uncompressed)
            which will be merged with PANDAseq prior to processing with AbStar. If ``basespace`` is True,
            ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or 'uparse'. Default is
            'simple_bayesian', which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log is much more verbose,
            and temporary files are not removed. Default is ``False``.


    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence`` object.

        If the input is a list of sequences, ``run`` returns a list of AbTools ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output files.
    '''

    warnings.filterwarnings("ignore")
    # Stays None when no sequences are passed positionally (file/directory
    # input via keyword arguments). Previously the name was left unbound in
    # that case and the assignment below raised UnboundLocalError.
    sequences = None
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [
                Sequence(args[0]),
            ]
        except Exception:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except Exception:
                print('ERROR: invalid format for sequence input:')
                for a in args:
                    print(a)
                sys.exit(1)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except Exception:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)
    kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    output = main(args)
    return output
Пример #17
0
def main(args):
    '''
    Runs the annotation pipeline on in-memory sequences or on input files.

    If ``args.sequences`` is not None, the sequences are processed with
    ``process_sequences`` and returned as ``Sequence`` objects: a single
    object if exactly one result was produced, otherwise a list.

    Otherwise the working directories and logging are set up and every input
    file with a recognized format is split, processed with ``run_jobs`` and
    concatenated into output files and log files. Temporary files are
    removed unless ``args.debug`` is set.

    Args:

        args: options object. Attributes read here are ``sequences``,
            ``debug``, ``use_test_data``, ``basespace``, ``merge``,
            ``isotype`` and ``species``; the object is also passed on to
            the helper functions.

    Returns:

        A ``Sequence``, a list of ``Sequence`` objects, or (file mode) the
        list of output files collected from ``concat_outputs``.
    '''
    if args.sequences is not None:
        processed = process_sequences(args.sequences, args)
        if len(processed) == 1:
            return Sequence(dict(processed[0]))
        return [Sequence(dict(p)) for p in processed]
    else:
        input_dir, output_dir, temp_dir, log_dir = make_directories(args)
        setup_logging(log_dir, args.debug)
        log_options(input_dir, output_dir, temp_dir, args)
        if args.use_test_data:
            mod_dir = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            input_files = [
                os.path.join(mod_dir, 'test_data/test_1k.fasta'),
            ]
        else:
            if args.basespace:
                args.merge = True
                download_files(input_dir)
            if args.merge:
                input_dir = merge_reads(input_dir, args)
            if args.isotype:
                # NOTE(review): the boolean flag is replaced by the species
                # name here -- presumably downstream code expects that.
                args.isotype = args.species
            input_files = [
                f for f in list_files(input_dir, log=True)
                if os.stat(f).st_size > 0
            ]
        output_files = []
        for f, fmt in zip(input_files, format_check(input_files)):
            # skip the non-FASTA/Q files
            if fmt is None:
                continue
            start_time = time.time()
            print_input_file_info(f, fmt)
            subfiles, seq_count = split_file(f, fmt, temp_dir, args)
            run_info = run_jobs(subfiles, temp_dir, log_dir, fmt, args)
            # jobs that returned None are ignored
            finished = [r for r in run_info if r is not None]
            temp_output_files = [r[0] for r in finished]
            processed_seq_counts = [r[1] for r in finished]
            annotated_log_files = [r[2] for r in finished]
            failed_log_files = [r[3] for r in finished]
            unassigned_log_files = [r[4] for r in finished]
            vdj_end_time = time.time()
            _output_files = concat_outputs(f, temp_output_files, output_dir,
                                           args)
            # the concatenated log files are written for their side effect;
            # their paths are not needed here
            concat_logs(f, unassigned_log_files, log_dir, 'unassigned')
            concat_logs(f, failed_log_files, log_dir, 'failed')
            if args.debug:
                concat_logs(f, annotated_log_files, log_dir, 'annotated')
            # Collect this file's outputs. The original extended the list
            # with itself, so file mode always returned an empty list.
            output_files.extend(_output_files)
            if not args.debug:
                flat_temp_files = [
                    tf for subl in temp_output_files for tf in subl
                ]
                clear_temp_files(subfiles + flat_temp_files +
                                 annotated_log_files + failed_log_files +
                                 unassigned_log_files)
            print_job_stats(seq_count, processed_seq_counts, start_time,
                            vdj_end_time)
        return output_files
Пример #18
0
    def phylogeny(self,
                  project_dir,
                  aln_file=None,
                  tree_file=None,
                  root=None,
                  seq_field='vdj_nt',
                  aa=False,
                  root_name=None,
                  show_root_name=False,
                  colors=None,
                  color_function=None,
                  orders=None,
                  order_function=None,
                  chain='heavy',
                  filter_function=None,
                  just_pairs=False,
                  color_node_labels=False,
                  label_colors=None,
                  scale=None,
                  branch_vert_margin=None,
                  fontsize=12,
                  show_names=True,
                  name_field='seq_id',
                  show_scale=False,
                  mirror=False,
                  min_order_fraction=0.1,
                  figname_prefix=None,
                  figname_suffix=None,
                  linked_alignment=None,
                  alignment_fontsize=11,
                  scale_factor=1,
                  rename_function=None,
                  alignment_height=50,
                  alignment_width=50,
                  compact_alignment=False,
                  linewidth=1.0,
                  show_output=False):
        '''
        Generates a lineage phylogeny figure.

        Inputs (required)
        -----------------
        project_dir: directory for all phylogeny files,
            including alignment, tree and figure files

        Inputs (optional)
        -----------------
        aln_file: if a multiple sequence alignment has already been calculated,
            passing the path to the alignment file will force Lineage.phylogeny()
            to use the supplied msa instead of computing a new one. A new
            alignment is only computed when both <aln_file> and <tree_file>
            are None.
        tree_file: if a tree file has already been calculated, passing the path
            to the pre-computed tree file will force Lineage.phylogeny() to use
            the supplied tree file instead of computing a new one.
        aa: if True, use amino acid sequences to compute the phylogeny.
            Default is False.
        root: provide a sequence to be used as the tree root. If not provided,
            the UCA will be used to root the tree.
        seq_field: field of the UCA record that is used as the root sequence
            when <root> is not provided. Default is 'vdj_nt'.
        root_name: ID given to the root sequence. Default is 'root'.
        colors: a dictionary with sequence IDs as keys and colors as values. If any
            lineage sequences are not in the dict, they will be colored black. If
            not provided, all leaves will be colored black. Alternately, if provided
            in combination with either <orders> or <order_function>, the dictionary
            keys should be orders (integers) instead of sequence IDs.
        color_function: provide a function that will be called on each sequence. The
            function should accept an AbTools Sequence object and return a color
            (as a hex value). Only used if <colors> is not provided.
        orders: a dictionary with sequence IDs as keys and orders (integers) as values.
            If not provided, only the leaf branches will be colored (if <colors> or
            <color_function> is provided).
        order_function: function called on each sequence to compute its order.
            Only used if <orders> is not provided.
        chain: build a phylogeny using the given chain ('heavy' or 'light').
            Any value other than 'heavy' is treated as 'light'.
            Default is 'heavy'.
        filter_function: function used to filter sequences (identity-based clustering, for
            example). The function should accept a list of Sequence objects and return
            a list of Sequence objects. The returned list is not modified.
        just_pairs: if True, compute the phylogeny using only paired sequences.
            Default (False) will use all sequences of the appropriate chain, paired or not.
        scale: passed to ete2.TreeStyle() to set the scale of the tree figure. Increased
            scale results in a wider tree.
        branch_vert_margin: passed to ete2.TreeStyle() to set the branch_vertical_margin of
            the tree figure. Increased branch_vert_margin results in a taller tree.
        fontsize: size of the leaf labels. Default is 12.
        show_names: show names of leaf nodes. Options are True (show labels for all leaf nodes),
            False (don't show labels for any leaf nodes) or a list of sequence IDs for which
            labels should be shown. Default is True.
        mirror: flip the orientation of the tree. Default is to draw the tree from left to right.
            Setting mirror to True results in the tree being drawn from right to left.
        min_order_fraction: minimum fraction of downstream leaves required to color a branch.
            When coloring non-leaf nodes, the earliest 'order' with at least <min_order_fraction>
            leaf nodes is used. Default is 0.1 (which corresponds to 10%).
        figname_prefix: by default, figures will be named <lineage_id>.pdf. If prefix='prefix_' and
            the lineage ID is 'ABC123', the figure file will be named 'prefix_ABC123.pdf'.
        figname_suffix: by default, figures will be named <lineage_id>.pdf. If suffix='_suffix' and
            the lineage ID is 'ABC123', the figure file will be named 'ABC123_suffix.pdf'.
        show_output: passed to fast_tree() when a tree file is computed.

        All remaining keyword arguments (show_root_name, color_node_labels,
        label_colors, name_field, show_scale, linked_alignment,
        alignment_fontsize, scale_factor, rename_function, alignment_height,
        alignment_width, compact_alignment and linewidth) are forwarded
        unchanged to self._make_tree_figure().
        '''
        project_dir = os.path.abspath(project_dir)
        orientation = 1 if mirror else 0
        root_name = root_name if root_name is not None else 'root'
        # get sequences for phylogeny
        # (any chain other than 'heavy' is handled as the light chain)
        chain_attr = 'heavy' if chain == 'heavy' else 'light'
        if just_pairs:
            seq_pool = self.just_pairs
        elif chain_attr == 'heavy':
            seq_pool = self.heavies
        else:
            seq_pool = self.lights
        seqs = [getattr(p, chain_attr) for p in seq_pool]
        if filter_function is not None:
            seqs = filter_function(seqs)
        if root is None:
            root = getattr(self.uca, chain_attr)[seq_field]
        # build a new list rather than appending in place, so a list owned
        # by the caller's filter_function is never mutated
        seqs = list(seqs) + [Sequence(root, id=root_name)]
        # setup orders
        if orders is None:
            if order_function is not None:
                orders = {seq['seq_id']: order_function(seq) for seq in seqs}
        # setup colors
        if colors is None:
            if color_function is not None:
                colors = {seq['seq_id']: color_function(seq) for seq in seqs}
            else:
                colors = {}
        # make msa
        if all([aln_file is None, tree_file is None]):
            aln_file = os.path.abspath(
                os.path.join(project_dir, '{}.aln'.format(self.name)))
            mafft(seqs, aln_file, as_file=True)
        # make treefile
        if tree_file is None:
            tree_file = os.path.abspath(
                os.path.join(project_dir, '{}.nw'.format(self.name)))
            fast_tree(aln_file, tree_file, is_aa=aa, show_output=show_output)
        # make phylogeny
        prefix = '' if figname_prefix is None else figname_prefix
        suffix = '' if figname_suffix is None else figname_suffix
        fig_file = os.path.join(project_dir,
                                '{}{}{}.pdf'.format(prefix, self.name, suffix))
        self._make_tree_figure(tree_file,
                               fig_file,
                               colors,
                               orders,
                               root_name,
                               rename_function=rename_function,
                               show_names=show_names,
                               name_field=name_field,
                               branch_vert_margin=branch_vert_margin,
                               scale=scale,
                               color_node_labels=color_node_labels,
                               label_colors=label_colors,
                               show_root_name=show_root_name,
                               tree_orientation=orientation,
                               fontsize=fontsize,
                               min_order_fraction=min_order_fraction,
                               linked_alignment=linked_alignment,
                               alignment_fontsize=alignment_fontsize,
                               chain=chain,
                               alignment_height=alignment_height,
                               alignment_width=alignment_width,
                               show_scale=show_scale,
                               compact_alignment=compact_alignment,
                               scale_factor=scale_factor,
                               linewidth=linewidth)
Пример #19
0
 def __init__(self, sequence, species=None):
     '''
     Stores a gapped sequence record together with its ungapped form.

     Args:

         sequence: record providing ``seq`` and ``description``; the
             description becomes the ID of the stored ``Sequence``.

         species: stored unchanged as ``self._species``. Default is
             ``None``.
     '''
     raw = Sequence(str(sequence.seq), id=sequence.description)
     self.raw_sequence = raw
     self._species = species
     gapped = raw.sequence
     self.gapped_nt_sequence = gapped
     # '.' characters are removed to obtain the ungapped sequence
     self.ungapped_nt_sequence = gapped.replace('.', '')
Пример #20
0
def cluster(seqs,
            threshold=0.975,
            out_file=None,
            make_db=True,
            temp_dir=None,
            quiet=False,
            threads=0,
            return_just_seq_ids=False,
            max_memory=800,
            debug=False):
    '''
    Cluster sequences with CD-HIT and parse the result into clusters.

    Args:

        seqs (list): An iterable of sequences, in any format that
            abtools.sequence.Sequence() can handle.

        threshold (float): Clustering identity threshold. Default is 0.975.

        out_file (str): Path to the clustering output file. Default is to use
            tempfile.NamedTemporaryFile to generate an output file name.

        make_db (bool): Whether to build a SQLite database of sequence
            information. Required if you want to calculate consensus/centroid
            sequences for the resulting clusters or if you need to access the
            clustered sequences (not just sequence IDs). When False, an
            in-memory dict mapping sequence IDs to Sequence objects is handed
            to parse_clusters() instead. Default is True.

        temp_dir (str): Path to the temporary directory. If not provided,
            '/tmp' is used.

        quiet (bool): Forwarded unchanged to cdhit(). Default is False.

        threads (int): Forwarded unchanged to cdhit(). Default is 0.

        return_just_seq_ids (bool): Forwarded unchanged to parse_clusters().
            Default is False.

        max_memory (int): Forwarded unchanged to cdhit(). Default is 800.

        debug (bool): Forwarded unchanged to cdhit(). Default is False.

    Returns:

        list: A list of Cluster objects, one per cluster (whatever
            parse_clusters() produces for the CD-HIT output).
    '''
    # options that are passed to cdhit() identically in both modes
    cdhit_options = dict(out_file=out_file,
                         temp_dir=temp_dir,
                         threshold=threshold,
                         quiet=quiet,
                         threads=threads,
                         max_memory=max_memory,
                         debug=debug)

    if not make_db:
        # without a database, clustered sequences are looked up by ID in memory
        sequences = [Sequence(s) for s in seqs]
        sequences_by_id = {s.id: s for s in sequences}
        ofile, cfile = cdhit(sequences, make_db=False, **cdhit_options)
        return parse_clusters(ofile,
                              cfile,
                              seq_dict=sequences_by_id,
                              return_just_seq_ids=return_just_seq_ids)

    ofile, cfile, seq_db, db_path = cdhit(seqs, make_db=True, **cdhit_options)
    return parse_clusters(ofile,
                          cfile,
                          seq_db=seq_db,
                          db_path=db_path,
                          return_just_seq_ids=return_just_seq_ids)
Пример #21
0
def get_expanded_vrc01_class_sequences(chain='heavy',
                                       vgene_only=True,
                                       only_include=None):
    '''
    Return the built-in expanded panel of VRC01-class antibody sequences.

    Args:

        chain (str): Which chain to return. Only 'heavy' has any sequences
            defined; any other value selects the (empty) light chain list.
            Default is 'heavy'.

        vgene_only (bool): If True, use the truncated amino acid sequences
            (presumably the V-gene region only), available for 19 antibodies.
            If False, use the longer sequences that extend past the truncated
            ones, which are only defined for 6 antibodies. Default is True.

        only_include (str or list): A single antibody name, or an iterable of
            names, to which the result is restricted. Names that are not in
            the panel are silently ignored. Default is None, which returns
            every sequence for the selected chain.

    Returns:

        list: A list of Sequence objects, each built from a (name, sequence)
            tuple, in the order they are defined below.
    '''
    if vgene_only:
        heavy = [
            ('VRC01',
             'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTR'
             ),
            ('PGV04',
             'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCAR'
             ),
            ('VRC-CH31',
             'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCAR'
             ),
            ('3BNC60',
             'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCAR'
             ),
            ('12A12',
             'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCAR'
             ),
            ('PGV20',
             'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCAR'
             ),
            ('3BNC117',
             'QVQLLQSGAAVTKPGASVRVSCEASGYNIRDYFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRHASWDFDTFSFYMDLKALRSDDTAVYFCAR'
             ),
            ('12A21',
             'SQHLVQSGTQVKKPGASVRVSCQASGYTFTNYILHWWRQAPGQGLEWMGLIKPVFGAVNYARQFQGRIQLTRDIYREIAFLDLSGLRSDDTAVYYCAR'
             ),
            ('C38-VRC18.02',
             'EVRLVQSGNQVRKPGASVRISCEASGYKFIDHFIHWVRQVPGHGLEWLGWINPRGGGVNYSRSFQGKLSMTMTRDNFEETAYLDLSKLNPGDTAVYFCAR'
             ),
            ('N6',
             'RAHLVQSGTAMKKPGASVRVSCQTSGYTFTAHILFWFRQAPGRGLEWVGWIKPQYGAVNFGGGFRDRVTLTRDVYREIAYMDIRGLKPDDTAVYYCAR'
             ),
            ('N49P7',
             'ADLVQSGAVVKKPGDSVRISCEAQGYRFPDYIIHWIRRAPGQGPEWMGWMNPMGGQVNIPWKFQGRVSMTRDTSIETAFLDLRGLKSDDTAVYYCVR'
             ),
            ('N60P25.1',
             'HVQLVQSGTEVKRPGASVRISCASSGYTFSNYFIHWVRQAPGRGLEWMGWMNPLRGAVNYSGKFQGRVTMTRDIYTETSFMVLSGLRSDDTAIYFCAR'
             ),
            ('NIH45-46',
             'QVRLSQSGGQMKKPGESMRLSCRASGYEFLNCPINWIRLAPGRRPEWMGWLKPRGGAVNYARKFQGRVTMTRDVYSDTAFLELRSLTSDDTAVYFCTR'
             ),
            ('PCIN63-71I',
             'QVQLVQSGVAVKKPGASVWVSCKASGYTFTSCYIHWFRQAPGQGLEWMGWLNPINGARNNPYQFQGRISLTRDTSSETAYLELRNLRSDDTAVYYCAR'
             ),
            ('VRC02',
             'QVQLVQSGGQMKKPGESMRISCQASGYEFIDCTLNWVRLAPGRRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTADDTAVYYCTR'
             ),
            ('VRC07',
             'QVRLSQSGGQMKKPGDSMRISCRASGYEFINCPINWIRLAPGKRPEWMGWMKPRGGAVSYARQLQGRVTMTRDMYSETAFLELRSLTSDDTAVYFCTR'
             ),
            ('VRC08',
             'QVQLVQSGTQMKEPGASVTISCVTSGYEFVEILINWVRQVPGRGLEWMGWMNPRGGGVNYARQFQGKVTMTRDVYRDTAYLTLSGLTSGDTAKYFCVR'
             ),
            ('VRC27',
             'SQRLVQSGPQVRKPGSSVRISCETSGYTFNAYILHWFRQAPGRSFEWMGWIKPKFGAVNYAHSFQGRITLTRDIYRETAFLDLTGLRFDDTAVYYCAR'
             ),
            ('VRC-PG19',
             'EVRLVQSGAEVKKPGASVRVSCAASGYTFTDFDIHWLRQAPGRGLEWMGWVRPLGGGVSYARQFQGRVTMTRDFYIDTAFMDFRNLKMDDTALYFCAR'
             )
        ]
    else:
        heavy = [
            ('VRC01',
             'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTRGKNCDYNWDFEHWGRGTPVIVSS'
             ),
            ('PGV04',
             'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCARQKFYTGGQGWYFDLWGRGTLIVVSS'
             ),
            ('VRC-CH31',
             'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCARAQKRGRSEWAYAHWGQGTPVVVSS'
             ),
            ('3BNC60',
             'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCARQRSDFWDFDVWGSGTQVTVSS'
             ),
            ('12A12',
             'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCARDGSGDDTSWHLDPWGQGTLVIVSA'
             ),
            ('PGV20',
             'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCARRMRSQDREWDFQHWGQGTRIIVSS'
             )
        ]
    # no light chain sequences are defined for either mode
    light = []
    seqs = heavy if chain == 'heavy' else light
    if only_include is not None:
        # ``unicode`` is a builtin only on Python 2; referencing it directly
        # raises NameError on Python 3, so resolve the string types defensively
        try:
            string_types = (str, unicode)  # noqa: F821
        except NameError:
            string_types = (str,)
        # a bare name is wrapped so the membership test below compares whole
        # names instead of doing a substring match against the string
        if isinstance(only_include, string_types):
            only_include = [only_include]
        seqs = [s for s in seqs if s[0] in only_include]
    return [Sequence(s) for s in seqs]