Пример #1
0
def distance_comparison(dataframe, data_dir, test_name, samples=10000):
    """Plot train/train versus train/test distances for each dataframe row.

    For every index label in ``dataframe``, the directories under
    ``join(data_dir, test_name)`` whose names start with that label are
    searched for a ``ref_seqs.fasta`` (training) file and the
    ``query.fasta`` (test) file next to it. The values returned by
    ``distances`` are pooled over all matching directories, sorted
    independently, and drawn as a scatter plot with a diagonal reference
    line. Labels for which no data is found are skipped.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Only the index labels are used. Each label is concatenated with
        ``'*'`` to build a glob pattern, so labels must be strings.
    data_dir : str
        Parent directory of the simulated data.
    test_name : str
        Name of the subdirectory of ``data_dir`` to search.
    samples : int
        Passed through to ``distances`` as its third argument.
    """
    from os.path import dirname, isfile

    simulated_dir = join(data_dir, test_name)
    for index, _ in dataframe.iterrows():
        inner = []
        outer = []
        # Derive the query path from each reference path instead of zipping
        # two independent glob() listings: glob() guarantees no ordering, so
        # the two listings could pair files from different directories.
        pattern = join(simulated_dir, index + '*', 'ref_seqs.fasta')
        for train_fp in sorted(glob(pattern)):
            test_fp = join(dirname(train_fp), 'query.fasta')
            if not isfile(test_fp):
                continue
            train = list(map(str, io.read(train_fp, format='fasta')))
            test = list(map(str, io.read(test_fp, format='fasta')))
            if not train or not test:
                continue
            inner.extend(distances(train, train, samples))
            outer.extend(distances(train, test, samples))
        if not inner or not outer:
            # Nothing to plot for this label; indexing [-1] below would fail.
            continue
        inner.sort()
        outer.sort()
        df = pd.DataFrame({'train/train': inner, 'train/test': outer})

        plt.figure()
        # Keyword arguments work with both old and current seaborn, where
        # x/y are keyword-only.
        ax = regplot(x='train/train', y='train/test', data=df, fit_reg=False)
        ax.set_title(index, fontsize=20)
        maxval = max(inner[-1], outer[-1])
        plt.plot([0, maxval], [0, maxval], linewidth=2)
        plt.show()
def main():
    """Command-line entry point: filter a FASTA file by a list of labels.

    Parses the command line, reads the input FASTA file and the label file,
    and hands both to ``filter_seqs`` together with the output handle.
    All handles are released even if parsing or filtering fails.
    """
    parser = argparse.ArgumentParser(
        description='This script will write out sequences based on \n'
        'sequence identifiers in a label file. ',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-l', '--input_sequence_labels', required=True,
                     action='store',
                     help='File in which the first item in each line is'
                     ' a sequence label / identifier.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d', '--include_description', action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                      'description text.[Default: False]')
    optp.add_argument('-r', '--remove_ids', action='store_true',
                      help='Boolean. Remove sequences with the corresponding '
                      'IDs, rather than keep. [Default: False]')

    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    try:
        # Mode 'U' was removed in Python 3.11; plain text mode already
        # applies universal newlines. The label file stays open while
        # filtering in case parse_labels reads it lazily.
        with open(p.input_sequence_labels, 'r') as input_labels, \
                open(p.output_fasta, 'w') as output_fasta:
            seq_labels = parse_labels(input_labels)
            filter_seqs(input_fasta, output_fasta, seq_labels,
                        remove_ids=p.remove_ids,
                        desc=p.include_description)
    finally:
        input_fasta.close()
Пример #3
0
 def test_write_genes(self):
     """Check the FASTA files and the gene list produced by write_genes."""
     sampled = sample_genes(self.ortho_groups_fp, min_taxa_cutoff=4.0)
     write_genes(sampled, self.ref_faa_dir, self.out_fa_dir,
                 self.out_genes_fp)

     # one output FASTA file is expected per sampled gene family
     observed_files = sorted(listdir(self.out_fa_dir))
     expected_files = sorted('%s.fa' % ogid for ogid in sampled)
     self.assertListEqual(observed_files, expected_files)

     # the gene list must match the stored expectation file
     with open(self.out_genes_fp, 'r') as observed_fh:
         observed_list = observed_fh.read()
     with open(self.write_genes_list, 'r') as expected_fh:
         expected_list = expected_fh.read()
     self.assertEqual(observed_list, expected_list)

     # only the first record of one output file is inspected
     first = next(iter(io.read(join(self.out_fa_dir, 'OG0000017.fa'),
                               format='fasta')), None)
     if first is not None:
         self.assertEqual(first.metadata['id'],
                          'GCF_000160655.1|WP_040356123.1')
         expected_seq = (
             'MYRKHYAADVTETLDGQTVQVAGWVHRRRDHGGVIFIDLRDRSGLVQIVIDPDTADAF'
             'ALAEQVRNEYCLAIEGRVRLRPAGTENPDLASGKIEILGKQLTVLSKSEPLPFQLDED'
             'NVSEEIRLKHRTIDLRRDVMQKNLILRSKVAASLRRYLDEHGFMDIETPMLTKATPEG'
             'ARDYLVPSRTHPGKFFALPQSPQLFKQMLMMSGFDRYYQIVRCFRDEDLRADRQPEFT'
             'QLDIETSFLEEEDILQIMEPMIRGIFKEHLGVELANPFPRMTYREAMRRYASDKPDLR'
             'IPLELVDIDDLVKNSGFKVFASVAAQDNGRVVALKIPGGAKLTRKEIDDYTAYVARYG'
             'AKGLAYIKVNDATNVEGLQSPIVKFLTTEGGAEGAIALDIIKRVDAQNGDLIFFGADK'
             'ASIVNDAIGALRIKVGHDLNMLTCDWAPLWVVDFPMFEYDEKDGRWYSMHHPFTQPKT'
             'ANLDELDTNPGDVLSRAYDMVLNGTEIGGGSIRIHRDDMQQRVFKSLGIGAEEAQEKF'
             'GFLLNALKYGCPPHGGIAFGLDRLIMLMAGAKSIRDVMAFPKTQTAWCPLTDAPSEAS'
             'EAQLRELHIRKRQVEKSE')
         self.assertEqual(str(first), expected_seq)
Пример #4
0
 def test_write_genes(self):
     """Verify write_genes output: file names, gene list and one record."""
     families = sample_genes(self.ortho_groups_fp, min_taxa_cutoff=4.0)
     write_genes(families, self.ref_faa_dir, self.out_fa_dir,
                 self.out_genes_fp)

     # the output directory holds exactly one .fa file per gene family
     self.assertListEqual(sorted(listdir(self.out_fa_dir)),
                          sorted(['%s.fa' % ogid for ogid in families]))

     # compare the written gene list with the reference copy
     with open(self.out_genes_fp, 'r') as handle:
         written = handle.read()
     with open(self.write_genes_list, 'r') as handle:
         reference = handle.read()
     self.assertEqual(written, reference)

     # check ID and sequence of the first record in a single FASTA file
     fasta_fp = join(self.out_fa_dir, 'OG0000017.fa')
     for record in io.read(fasta_fp, format='fasta'):
         self.assertEqual(record.metadata['id'],
                          'GCF_000160655.1|WP_040356123.1')
         self.assertEqual(
             str(record),
             'MYRKHYAADVTETLDGQTVQVAGWVHRRRDHGGVIFIDLRDRSGLVQIVIDPDTADAF'
             'ALAEQVRNEYCLAIEGRVRLRPAGTENPDLASGKIEILGKQLTVLSKSEPLPFQLDED'
             'NVSEEIRLKHRTIDLRRDVMQKNLILRSKVAASLRRYLDEHGFMDIETPMLTKATPEG'
             'ARDYLVPSRTHPGKFFALPQSPQLFKQMLMMSGFDRYYQIVRCFRDEDLRADRQPEFT'
             'QLDIETSFLEEEDILQIMEPMIRGIFKEHLGVELANPFPRMTYREAMRRYASDKPDLR'
             'IPLELVDIDDLVKNSGFKVFASVAAQDNGRVVALKIPGGAKLTRKEIDDYTAYVARYG'
             'AKGLAYIKVNDATNVEGLQSPIVKFLTTEGGAEGAIALDIIKRVDAQNGDLIFFGADK'
             'ASIVNDAIGALRIKVGHDLNMLTCDWAPLWVVDFPMFEYDEKDGRWYSMHHPFTQPKT'
             'ANLDELDTNPGDVLSRAYDMVLNGTEIGGGSIRIHRDDMQQRVFKSLGIGAEEAQEKF'
             'GFLLNALKYGCPPHGGIAFGLDRLIMLMAGAKSIRDVMAFPKTQTAWCPLTDAPSEAS'
             'EAQLRELHIRKRQVEKSE')
         break
Пример #5
0
def count_seq(filename):
    '''Count the sequences in a FASTA file.

    The file can be gzipped.

    Returns the number of records yielded by ``read``; a file without any
    records gives 0 (the loop-variable form raised UnboundLocalError in
    that case because the counter was never bound).
    '''
    return sum(1 for _ in read(filename, format='fasta'))
Пример #6
0
def mv_seq(seq, opath, name_dict):
    """Copy a FASTA file while renaming records and blanking descriptions.

    Parameters
    ----------
    seq : str
        Input passed to ``read`` with ``format='fasta'``.
    opath : str
        Path of the FASTA file to write (overwritten if it exists).
    name_dict : dict
        Maps each current record ID to its replacement; an ID missing from
        the mapping raises ``KeyError``.
    """
    records = read(seq, format='fasta')
    with open(opath, 'w') as out_handle:
        for record in records:
            meta = record.metadata
            meta['id'] = name_dict[meta['id']]
            meta['description'] = ''
            write(record, 'fasta', out_handle)
Пример #7
0
 def test_compute_gene_score(self):
     """Score temporary FASTA files holding the first N test sequences."""
     source_fp = get_data_path('pfam.faa')
     cases = [(1, 0.1), (102, 1)]
     for limit, expected in cases:
         with NamedTemporaryFile() as tmp:
             # copy records until `limit` of them have been written
             records = read(source_fp, format='fasta')
             for position, record in enumerate(records):
                 if position == limit:
                     break
                 write(record, into=tmp, format='fasta')
             # flush so the scorer sees the content when it reopens the file
             tmp.flush()
             observed = compute_gene_score(tmp.name)
             self.assertEqual(observed, expected)
Пример #8
0
def loadRefSeqs(seqsDb, taxRef):
    """Load reference DNA sequences and attach their taxonomy.

    Parameters
    ----------
    seqsDb : str
        FASTA input read with the ``DNA`` constructor.
    taxRef : dict
        Maps a sequence ID to its taxonomy; an ID missing from the mapping
        raises ``KeyError``.

    Returns
    -------
    list
        The sequences without degenerate characters, each with
        ``metadata['taxonomy']`` set.
    """
    kept = []
    for record in read(seqsDb, format='fasta', constructor=DNA):
        # Sequences containing degenerate characters (i.e., characters
        # other than A, C, G, or T) are deliberately ignored.
        if not record.has_degenerates():
            record.metadata['taxonomy'] = taxRef[record.metadata['id']]
            kept.append(record)
    return kept
def main():
    """Command-line entry point: drop PyNAST failures from a table or FASTA.

    Sequence IDs found in the failures FASTA file (``-f``) are removed from
    the input, which is either a ``.biom`` table (written back as JSON) or
    a ``.fasta``/``.fa`` file (written to ``-o``, or to standard output when
    ``-o`` is not given).

    Raises
    ------
    ValueError
        If the input file has none of the supported extensions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input_file",
                        help="location of biom table or fasta file")
    parser.add_argument("-o",
                        "--output_file",
                        help="location of output biom table or fasta file")
    parser.add_argument(
        "-f",
        "--pynast_fasta",
        help="location of pynast failures fasta file to be removed")
    args = parser.parse_args()

    ids_to_toss = set()
    # Read the failures file that was actually supplied; the size check and
    # the read previously referred to two different paths. A missing -f
    # option is treated as "no failures".
    if (args.pynast_fasta is not None
            and os.stat(args.pynast_fasta).st_size != 0):
        ids_to_toss = set(
            i.id for i in read(args.pynast_fasta, format='fasta'))

    if args.input_file.endswith(".biom"):
        table = load_table(args.input_file)
        set_to_toss = set(table.ids(axis="observation")) & ids_to_toss

        table.filter(set_to_toss,
                     invert=True,
                     axis="observation",
                     inplace=True)
        # Context manager so the JSON output is flushed and closed.
        with open(args.output_file, 'w') as out:
            table.to_json("remove_pynast_failures.py", out)

    elif args.input_file.endswith((".fasta", ".fa")):
        # Write to an explicit handle rather than replacing (and closing)
        # sys.stdout, which would break any later printing in the process.
        if args.output_file is not None:
            out = open(args.output_file, 'w')
        else:
            out = sys.stdout
        try:
            for seq in read(args.input_file, format='fasta'):
                if seq.id not in ids_to_toss:
                    out.write('>%s\n%s\n' % (seq.id, str(seq)))
        finally:
            if out is not sys.stdout:
                out.close()
    else:
        raise ValueError("Input file must of type .biom, .fasta or .fa")
Пример #10
0
    def parse(self):
        '''Read the GFF3 annotation file and store it on ``self.result``.

        The file at ``self.files['gff']`` is read in ``gff3`` format. Every
        item produced by the reader is treated as a (key, value) pair and
        collected into a dict; a repeated key keeps its last value.
        '''
        records = read(self.files['gff'], format='gff3')
        self.result = dict(records)
Пример #11
0
def main():
    """Command-line entry point: rewrite a FASTA file via ``parse_seqs``.

    Parses the command line, opens the input and output FASTA files and
    delegates the rewriting to ``parse_seqs``. Both handles are released
    even if the rewriting fails.
    """
    parser = argparse.ArgumentParser(
        description='This script will simply re-write FASTA files '
        'without the description. \nWill also convert all Us to Ts and '
        'optionally convert "." to "-".'
        'That is, this: \n'
        '\t>seq1 H. Sapiens\n'
        '\tACCGGUUGGCCGUUCAGGGUACAGGUUGGCCGUUCAGGGUAA\n'
        'will be output as:\n'
        '\t>seq1\n'
        '\tACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA\n'
        'Expected to be used with SILVA FASTA files.',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i',
                     '--input_fasta',
                     required=True,
                     action='store',
                     help='Input fasta file.')
    req.add_argument('-o',
                     '--output_fasta',
                     required=True,
                     action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d',
                      '--include_description',
                      action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                      'description text.[Default: False]')
    optp.add_argument('-g',
                      '--convert_to_gap',
                      action='store_true',
                      help='Boolean. Convert "." to "-". [Default: False]')

    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    try:
        # The context manager closes (and flushes) the output on any exit.
        with open(p.output_fasta, 'w') as output_fasta:
            parse_seqs(input_fasta,
                       output_fasta,
                       convg=p.convert_to_gap,
                       desc=p.include_description)
    finally:
        input_fasta.close()
Пример #12
0
def load_fasta_ids(path):
    """
    Collect the identifier of every record in a fasta file.

    Parameters
    ----------
    path : str
        fasta file containing contigs and gene identifiers

    Returns
    -------
    list
        The ``metadata['id']`` value of each record, in file order
    """
    identifiers = []
    for record in io.read(path, format='fasta'):
        identifiers.append(record.metadata['id'])
    return identifiers
Пример #13
0
def main():
    """Command-line entry point: degap a FASTA file via ``parse_seqs``.

    Parses the command line, opens the input and output FASTA files and
    delegates the work to ``parse_seqs``. Both handles are released even if
    the processing fails.
    """
    parser = argparse.ArgumentParser(
        description='This script will simply degap FASTA files.\n'
        'Optionally without the description and or converting '
        'Us to Ts.\n'
        'That is, this: \n'
        '\t>seq1 H. Sapiens\n'
        '\t...ACCGGUU---GGCCGUU CAGGGUACAGGUUGGCCGUUCAGGGUAA...\n'
        'will be output as:\n'
        '\t>seq1\n'
        '\tACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA\n',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i',
                     '--input_fasta',
                     required=True,
                     action='store',
                     help='Input fasta file.')
    req.add_argument('-o',
                     '--output_fasta',
                     required=True,
                     action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-d',
                      '--include_description',
                      action='store_true',
                      help='Boolean. Keep the additional FASTA header '
                      'description text.[Default: False]')
    optp.add_argument('-u',
                      '--convert_to_uracil',
                      action='store_true',
                      help='Boolean. Convert "U" to "T". [Default: False]')

    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    try:
        # The context manager closes (and flushes) the output on any exit.
        with open(p.output_fasta, 'w') as output_fasta:
            parse_seqs(input_fasta,
                       output_fasta,
                       convu=p.convert_to_uracil,
                       desc=p.include_description)
    finally:
        input_fasta.close()
Пример #14
0
def align_sequences(seqs):
    """Align sequences with the external ``clustalo`` program.

    The sequences are written to ``rational_designs.fa`` in the current
    working directory, ``clustalo -i`` is run on that file, and its standard
    output is parsed as FASTA.

    Parameters
    ----------
    seqs : iterable
        Values accepted by the ``Sequence`` constructor.

    Returns
    -------
    list of str
        One string per record parsed from the clustalo output.
    """
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    fasta_path = 'rational_designs.fa'
    write((Sequence(item) for item in seqs), format='fasta', into=fasta_path)

    command = ('clustalo', '-i', fasta_path)
    completed = run(command, stdout=PIPE, encoding='utf8')
    aligned = read(io.StringIO(completed.stdout), format='fasta')

    return list(map(str, aligned))
Пример #15
0
def parse_msa_file(infile):
    """Load a multiple sequence alignment (MSA) file as a TabularMSA.

    Parameters
    ----------
    infile : str
        file path to input MSA file in A3M format (like FASTA format, but
        lowercase letters will be dropped)

    Returns
    -------
    skbio TabularMSA
        one Protein per input record, carrying the record's metadata
    """
    lowercase = re.compile('[a-z]')
    return TabularMSA([
        Protein(lowercase.sub('', str(record)), metadata=record.metadata)
        for record in io.read(infile, format='fasta')
    ])
Пример #16
0
def parse_msa_file(infile):
    """Build a TabularMSA from a multiple sequence alignment (MSA) file.

    Parameters
    ----------
    infile : str
        file path to input MSA file in A3M format (like FASTA format, but
        lowercase letters will be dropped)

    Returns
    -------
    skbio TabularMSA
    """
    def _without_lowercase(record):
        # remove every lowercase letter; the metadata is carried over as is
        residues = re.sub('[a-z]', '', str(record))
        return Protein(residues, metadata=record.metadata)

    records = io.read(infile, format='fasta')
    return TabularMSA(list(map(_without_lowercase, records)))
Пример #17
0
def runtime_make_test_data(seqs_in, results_dir, sampling_depths):
    '''Write random subsamples of a fasta file, one file per depth.

    Each depth produces ``<results_dir>/<depth>.fna`` holding that many
    sequences drawn without replacement from the input.

    seqs_in: path
        fasta format reference sequences.
    results_dir: path
        Output directory; created when it does not exist.
    sampling_depths: list of integers
        Number of sequences to subsample from seqs.
    '''
    if not exists(results_dir):
        makedirs(results_dir)

    # load everything once so that each depth samples from the same pool
    pool = list(io.read(seqs_in, format='fasta'))
    for depth in sampling_depths:
        chosen = sample(pool, depth)
        out_fp = join(results_dir, str(depth)) + '.fna'
        with open(out_fp, "w") as handle:
            for record in chosen:
                record.write(handle, format='fasta')
def main():
    """Command-line entry point: filter a FASTA file via ``filter_seqs``.

    Parses the command line, opens the input and output FASTA files and
    passes them to ``filter_seqs`` along with the homopolymer and
    ambiguous-base thresholds. Both handles are released even if the
    filtering fails.
    """
    parser = argparse.ArgumentParser(
        description='This script will read in a FASTA file and remove '
        'any sequences that have homopolymers and ambiguous base calls.'
        'That is, the following sequences would be removed: \n'
        '\t>seq1-homopolymeric\n'
        '\tACCGGTTGGCCGTTTTTTTTTCAGGGMACAGGTTVGCCGTTCAGGGTAA\n'
        '\t>seq2-ambiguos-bases\n'
        '\tACCGGTTGGCCVTGCCGMMTTCVVAGRGTAY\n',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i', '--input_fasta', required=True, action='store',
                     help='Input fasta file.')
    req.add_argument('-o', '--output_fasta', required=True, action='store',
                     help='Output fasta file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument('-p', '--n_homopolymer_length', action='store',
                      type=int, default=8,
                      help='Remove sequences that contain homopolymers of '
                      'greater than or equal to length n. \n'
                      "[Default %(default)s)]")
    optp.add_argument('-a', '--n_ambiguous_bases', action='store', type=int,
                      default=5, help='Remove sequences that contain a '
                      'number of IUPAC ambiguous bases greater than or equal '
                      "to length n. \n[Default %(default)s)]")

    p = parser.parse_args()

    input_fasta = read(p.input_fasta, format='fasta')
    try:
        # The context manager closes (and flushes) the output on any exit.
        with open(p.output_fasta, 'w') as output_fasta:
            filter_seqs(input_fasta, output_fasta,
                        n_homopolymer_length=p.n_homopolymer_length,
                        n_ambiguous_bases=p.n_ambiguous_bases)
    finally:
        input_fasta.close()
Пример #19
0
def extract_sequences(infile, seqidx=0):
    """ Pull one or all sequences out of a multi-sequence FASTA file

    Parameters
    ----------
    infile : str
        file path to input multi-sequence FASTA file
    seqidx : int (optional)
        1-based position of the single sequence to extract
        (default: 0 for all sequences)

    Returns
    -------
    list of skbio Sequence
        empty when seqidx is larger than the number of sequences
    """
    numbered = enumerate(io.read(infile, format='fasta'), 1)
    # a positive seqidx keeps only the record at that position
    return [record for position, record in numbered
            if not (0 < seqidx != position)]
Пример #20
0
def make_tRNA_table(mature_fa, tRNA_structure, tRNA_fa, tablename,
                    prefix='TR'):
    """Write a tab-separated table describing tRNA records.

    Records of ``tRNA_fa`` whose ID starts with ``prefix`` are looked up by
    sequence in the mapping returned by ``tRNA_seq_dict(mature_fa)``; the
    resulting name is looked up in ``anticodon_pos(tRNA_structure)``, whose
    values are comma-separated ``pos,anticodon,aa`` strings. The table is
    written to ``tablename`` and a confirmation line is printed.

    The anticodon column is the second '-'-separated field of the record
    ID; ``predicted_anticodon`` is the slice of the sequence given by the
    1-based, inclusive ``start-end`` range in ``anticodon_pos``.
    """
    seq_to_name = tRNA_seq_dict(mature_fa)
    name_to_anticodon = anticodon_pos(tRNA_structure)

    rows = []
    for record in read(tRNA_fa, 'fasta'):
        record_id = record.metadata['id']
        if not record_id.startswith(prefix):
            continue
        sequence = str(record)
        tRNA_id = seq_to_name[sequence]
        anticodon = record_id.split('-')[1]
        pos, _annotated, aa = name_to_anticodon[tRNA_id].split(',')
        rows.append(
            (record_id, 0, len(record), anticodon, pos, aa, sequence))

    columns = ['tRNA', 'start', 'end', 'anticodon', 'anticodon_pos', 'aa',
               'seq']
    table = pd.DataFrame(rows, columns=columns)
    table['anticodon_start'] = table.anticodon_pos.str \
        .extract('^([0-9]+)-', expand=False).astype(int)
    table['anticodon_end'] = table.anticodon_pos.str \
        .extract('-([0-9]+)$', expand=False).astype(int)
    table['predicted_anticodon'] = [
        seq[(start - 1):end]
        for seq, start, end in zip(table.seq, table.anticodon_start,
                                   table.anticodon_end)
    ]
    table.to_csv(tablename, sep='\t', index=False)
    print('Written %s' % tablename)
Пример #21
0
 def parse(self):
     '''Read the GFF3 file in ``self.files['gff']`` into ``self.result``.'''
     parsed = {}
     for sid, imd in read(self.files['gff'], format='gff3'):
         # a repeated key keeps the value read last
         parsed[sid] = imd
     self.result = parsed
Пример #22
0
def tRNA_seq_dict(mature_fa):
    """Map each sequence in a FASTA file to a name from its description.

    The key is the record's sequence with every 'U' replaced by 'T' and
    'CCAA' appended. The value is the third space-separated field of the
    record description with any leading or trailing ')' stripped.
    """
    mapping = {}
    for record in read(mature_fa, 'fasta'):
        fields = record.metadata['description'].split(' ')
        name = fields[2].strip(')')
        key = str(record).replace('U', 'T') + 'CCAA'
        mapping[key] = name
    return mapping
Пример #23
0
def tax_acc(argv):
    """Computing accuracy of taxonomic classification across all ranks

    Parses ``argv`` (positional: lca, taxonomy, fasta; optional:
    -o/--output), joins the LCA predictions to the sequences listed in the
    Fasta file, and for every level in ``taxlevels[1:]`` computes the
    fraction of sequences with a non-missing prediction ("pclfd") and the
    fraction of those predictions equal to the taxonomy ("accuracy"). Both
    are computed over all sequences, over Bacteria only and over Archaea
    only. The resulting table is printed and, when an output path is given,
    also written as CSV.

    ``taxlevels`` and ``get_logger`` are defined outside this function.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('lca',
                        type=str,
                        help='aggregated ORF LCA output. See agg-orf command')
    parser.add_argument(
        'taxonomy',
        type=str,
        help='the preprocessed GTDB taxonomy file. See prep-meta command')
    parser.add_argument('fasta',
                        type=str,
                        help='the input Fasta file with genomic sequences')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help='the output file to save results to',
                        default=None)

    args = parser.parse_args(argv)

    logger = get_logger()

    # Collect the accession and sequence name of every Fasta record.
    # NOTE: each record ID must split into exactly three '-'-separated
    # fields; any other number of fields raises ValueError on unpacking.
    seqs = list()
    accs = list()
    for seq in skio.read(args.fasta, format='fasta'):
        accession, length, seq_name = seq.metadata['id'].split('-')
        seqs.append(seq_name)
        accs.append(accession)
    input_df = pd.DataFrame({'accession': accs, 'seq_name': seqs})

    # accession,domain,phylum,class,order,family,genus,species,gtdb_genome_representative
    logger.info(f'Reading taxonomy from {args.taxonomy}')
    taxdf = pd.read_csv(args.taxonomy, index_col='accession')

    # Domain masks over the complete taxonomy file; used only for the log
    # message below and recomputed after the taxonomy has been filtered.
    ar122 = (taxdf['domain'] == 'd__Archaea').values
    bac120 = (taxdf['domain'] == 'd__Bacteria').values
    logger.info(
        f' - found {ar122.sum()} Archaea genomes and {bac120.sum()} Bacteria genomes'
    )

    # accession,seq_name,domain,phylum,class,order,family,genus,species
    # GCA_000380905.1,AQYW01000001.1,d__Archaea,p__Nanoarchaeota,c__Nanoarchaeia,o__SCGC-AAA011-G17,f__SCGC-AAA011-G17,g__SCGC-AAA011-G17,s__SCGC-AAA011-G17 sp000402515

    logger.info(f'Reading LCA results from {args.lca}')
    lca_df = pd.read_csv(args.lca)

    # Join on seq_name, keeping one row per Fasta record; the accession
    # column comes from the Fasta IDs, not from the LCA file.
    lca_df = input_df.set_index('seq_name').join(
        lca_df.set_index('seq_name').drop('accession', axis=1))

    # Select taxonomy rows by the accession of each sequence.
    # NOTE(review): get_results below compares the two frames by position,
    # so this presumably yields one taxonomy row per lca_df row in the same
    # order - TODO confirm for accessions missing from the taxonomy file.
    taxdf = taxdf.filter(lca_df['accession'], axis=0)

    results = {
        'accuracy': list(),
        'pclfd': list(),
        'bac_accuracy': list(),
        'bac_pclfd': list(),
        'ar_accuracy': list(),
        'ar_pclfd': list()
    }
    # Domain masks over the filtered taxonomy.
    ar122 = (taxdf['domain'] == 'd__Archaea').values
    bac120 = (taxdf['domain'] == 'd__Bacteria').values

    logger.info(
        f' - found {ar122.sum()} Archaea sequences and {bac120.sum()} Bacteria sequences'
    )

    ar122_tax = taxdf.index[ar122]
    bac120_tax = taxdf.index[bac120]
    logger.info(
        f' - found {len(set(ar122_tax))} Archaea genomes and {len(set(bac120_tax))} Bacteria genomes'
    )

    def get_results(tdf, ldf, col, sub=None):
        # Returns (fraction of rows with a non-missing prediction in `col`,
        # fraction of those predictions equal to the taxonomy value).
        # `sub` is an optional boolean mask applied positionally to both
        # frames before the comparison.
        if sub is not None:
            tdf = tdf.iloc[sub]
            ldf = ldf.iloc[sub]
        mask = ldf[col].notna().values
        true = tdf[col][mask].values
        pred = ldf[col][mask].values
        eq = true == pred
        return mask.mean(), eq.mean()

    for col in taxlevels[1:]:
        logger.info(f'computing results for {col}')

        # all sequences
        pclfd, acc = get_results(taxdf, lca_df, col)
        results['pclfd'].append(pclfd)
        results['accuracy'].append(acc)

        # Bacteria only
        pclfd, acc = get_results(taxdf, lca_df, col, sub=bac120)
        results['bac_pclfd'].append(pclfd)
        results['bac_accuracy'].append(acc)

        # Archaea only
        pclfd, acc = get_results(taxdf, lca_df, col, sub=ar122)
        results['ar_pclfd'].append(pclfd)
        results['ar_accuracy'].append(acc)

    # One row per taxonomic level, one column per metric.
    df = pd.DataFrame(data=results, index=taxlevels[1:])
    if args.output is not None:
        df.to_csv(args.output)
    print(df)
Пример #24
0
def write_genes(genes, input_faa_dir, output_fa_dir, output_genes_fp):
    """ Export protein sequences of selected gene families to FASTA files

    Parameters
    ----------
    genes : dict of dict of set of str
        { ogid : { taxon : set(protein(s)) }
        return value of sample_genes or filter_paralogs
    input_faa_dir : str
        directory of input protein sequences from the query genome and the
        selected taxa for comparison (FASTA format, one taxon per file)
    output_fa_dir : str
        directory to store output protein sequences (FASTA format, one gene
        family per file)
    output_genes_fp : str
        file to which the list of selected gene families and their members
        is appended.
        format: gene1<tab>taxon1|protein1,taxon2|protein2,...

    Raises
    ------
    KeyError
        If a taxon in ``genes`` has no matching .faa file.
    """
    # taxon -> protein -> set of gene families, so that every input file
    # has to be read only once
    prots = {}
    for ogid, taxa in genes.items():
        for taxon, members in taxa.items():
            per_taxon = prots.setdefault(taxon, {})
            for prot in members:
                per_taxon.setdefault(prot, set()).add(ogid)

    # map taxa to .faa filenames by the part of the name before the first
    # dot, since OrthoFinder trims off the version number from an NCBI-style
    # accession (e.g., GCF_012345.1 becomes GCF_012345)
    taxon2file = {}
    for fname in os.listdir(input_faa_dir):
        if not fname.endswith('.faa'):
            continue
        stem = fname.split('.')[0]
        if stem in prots:
            taxon2file[stem] = fname

    # load only the proteins that belong to a selected gene family
    seqs = {}
    for taxon, fname in taxon2file.items():
        wanted = prots[taxon]
        faa_path = os.path.join(input_faa_dir, fname)
        for record in io.read(faa_path, format='fasta'):
            seq_id = record.metadata['id']
            if seq_id in wanted:
                seqs[seq_id] = str(record)

    # one FASTA file per gene family plus one line in the gene list
    for ogid in sorted(genes):
        members = []
        fa_path = os.path.join(output_fa_dir, '%s.fa' % ogid)
        with open(fa_path, 'w') as fa:
            for taxon in sorted(genes[ogid]):
                # full taxon name = filename without the '.faa' extension
                trutax = taxon2file[taxon][:-4]
                for prot in sorted(genes[ogid][taxon]):
                    if prot not in seqs:
                        continue
                    # sequence IDs are like: taxon|protein
                    label = '%s|%s' % (trutax, prot)
                    fa.write('>%s\n%s\n' % (label, seqs[prot]))
                    members.append(label)
        with open(output_genes_fp, 'a') as listing:
            listing.write('%s\t%s\n' % (ogid, ','.join(members)))
Пример #25
0
# Show the usage text and exit with status 1 when run without arguments.
# `parser` is built before this point.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()

# Timestamped DEBUG-level logging to standard output.
logging.basicConfig(stream=sys.stdout,
                    level=logging.DEBUG,
                    format='%(asctime)s - %(message)s')
logger = logging.getLogger()

logger.info('loading data %s' % args.input)

# NOTE: from here on the module-level name `io` is the object returned by
# get_hdf5io, not a module of that name.
io = get_hdf5io(args.input, 'r')
difile = io.read()
difile.set_raw()
# Fields of the sequence-table row selected by args.idx.
# NOTE(review): presumably a taxon/genome ID and a sequence ID - confirm
# against the table layout.
tid = difile.seq_table[args.idx][3][1]
sid = difile.seq_table[args.idx][1]

# Pick the first line of the file-of-filenames that contains `tid`.
# NOTE: `fasta_file` stays undefined when no line matches, so the print
# below would raise NameError.
fofin = open(args.fof, 'r')
for line in map(lambda x: x.strip(), fofin):
    if tid in line:
        fasta_file = line
        break
fofin.close()

print(sid, tid, fasta_file)

for seq in skbio.io.read(fasta_file, constructor=Protein, format='fasta'):
    if sid == seq.metadata['id']:
Пример #26
0
import argparse
import os
import sys

from skbio import DNA
import skbio.io as skio

# Script: print a FASTA file to standard output with every record ID
# rewritten as "<prefix>-<sequence length>-<original id>".
desc = "fasta_path must be a file path from NCBI and have the genome assembly accession in it"
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('fasta_path', type=str, help='the fasta file to append the prefix to')

args = parser.parse_args()
# The prefix is the first 15 characters of the file name.
# NOTE(review): presumably the length of an accession such as
# "GCA_000380905.1" - confirm.
args.prefix = os.path.basename(args.fasta_path)[:15]

seqs = list()
tmp_fa = sys.stdout
w = 100  # number of sequence characters written per output line
for seq in skio.read(args.fasta_path, format='fasta', constructor=DNA):
    seq.metadata['id'] = args.prefix+"-"+str(len(seq))+"-"+seq.metadata['id']
    # every renamed record is also kept in memory in `seqs`
    seqs.append(seq)
    tmp_fa.write('>')
    tmp_fa.write(seq.metadata['id'])
    tmp_fa.write('\n')
    # emit the sequence in chunks of `w` characters, one chunk per line
    for s in range(0, len(seq), w):
        tmp_fa.write(''.join(seq.values[s:s+w].astype('U')))
        tmp_fa.write('\n')
Пример #27
0
def mt_tRNA_tab(mature_fa):
    """Tabulate the records of a FASTA file.

    Returns a DataFrame with one row per record and the columns ``tRNA``
    (record ID), ``seq`` (sequence string), ``start`` (always 0) and
    ``end`` (sequence length).
    """
    def _as_row(rec):
        name = rec.metadata['id']
        sequence = str(rec)
        return name, sequence, 0, len(sequence)

    rows = [_as_row(rec) for rec in read(mature_fa, 'fasta')]
    return pd.DataFrame(rows, columns=['tRNA', 'seq', 'start', 'end'])
Пример #28
0
def write_genes(genes,
                input_faa_dir,
                output_fa_dir,
                output_genes_fp):
    """ Save protein sequences of selected gene families as FASTA files

    Parameters
    ----------
    genes : dict of dict of set of str
        { ogid : { taxon : set(protein(s)) }
        return value of sample_genes or filter_paralogs
    input_faa_dir : str
        directory of input protein sequences from the query genome and the
        selected taxa for comparison (FASTA format, one taxon per file)
    output_fa_dir : str
        directory to store output protein sequences (FASTA format, one gene
        family per file)
    output_genes_fp : str
        file that receives one appended line per gene family listing its
        members.
        format: gene1<tab>taxon1|protein1,taxon2|protein2,...
    """
    # Step 1: invert `genes` into taxon -> protein -> {gene families}, which
    # lets step 3 read each taxon's file a single time.
    prots = {}
    for ogid in genes:
        for taxon in genes[ogid]:
            taxon_prots = prots.setdefault(taxon, {})
            for prot in genes[ogid][taxon]:
                try:
                    taxon_prots[prot].add(ogid)
                except KeyError:
                    taxon_prots[prot] = set([ogid])

    # Step 2: find the .faa file of every taxon. Filenames are matched by
    # their first dot-separated field because OrthoFinder trims off the
    # version number from an NCBI-style accession (e.g., GCF_012345.1
    # becomes GCF_012345).
    taxon2file = {}
    faa_names = [name for name in os.listdir(input_faa_dir)
                 if name.endswith('.faa')]
    for faa_name in faa_names:
        short_name = faa_name.split('.')[0]
        if short_name in prots:
            taxon2file[short_name] = faa_name

    # Step 3: read the sequences of the selected proteins.
    seqs = {}
    for taxon in taxon2file:
        faa_path = os.path.join(input_faa_dir, taxon2file[taxon])
        for entry in io.read(faa_path, format='fasta'):
            prot_id = entry.metadata['id']
            if prot_id in prots[taxon]:
                seqs[prot_id] = str(entry)

    # Step 4: write one FASTA file per gene family and append the family's
    # member list to the gene list file.
    for ogid in sorted(genes):
        family = genes[ogid]
        members = []
        with open(os.path.join(output_fa_dir, '%s.fa' % ogid), 'w') as out:
            for taxon in sorted(family):
                # drop the '.faa' extension to recover the full taxon name
                trutax = taxon2file[taxon][:-4]
                found = [p for p in sorted(family[taxon]) if p in seqs]
                for prot in found:
                    # sequence IDs are like: taxon|protein
                    out.write('>%s|%s\n%s\n' % (trutax, prot, seqs[prot]))
                    members.append('%s|%s' % (trutax, prot))
        line = '%s\t%s\n' % (ogid, ','.join(members))
        with open(output_genes_fp, 'a') as out:
            out.write(line)
def main():
    """Command-line entry point: filter sequences by length per taxon group.

    Parses the command-line options, then hands the input sequences, the
    output file handle, the per-group minimum lengths and the per-sequence
    taxonomy to ``filter_seqs_by_len_and_tax``.

    Command-line options
    --------------------
    -i, --input_sequences : path
        input FASTA file, read with ``read(..., format='fasta')``
    -t, --input_taxonomy : path
        taxonomy file, parsed by ``make_taxonomy_dict``
    -o, --output_sequences : path
        output FASTA file (opened for writing, so it is overwritten)
    -g, --taxonomic_groups : str
        mapping of taxonomic group to minimum length, parsed by
        ``make_tax_group_dict``
    -m, --global_length_min : int
        minimum length forwarded as ``global_length_min``
    """
    parser = argparse.ArgumentParser(
        description='Using a minimum sequence length per taxonomic '
        'group. \nThis script will read in a FASTA file and a taxonomy file, '
        '\nany sequence that does not fit the length criteria for a given '
        '\ntaxonomic group will be discarded. For example, if the following '
        '\ncriteria are specified:\n'
        '\n\t\'{"d__Bacteria":1200, "d__Archaea":900}\'\n'
        '\nThis means, any Bacterial and Eukaryal sequences less than 1200 '
        '\nbases, and any Archaeal sequences less than 900 bases, will be '
        '\ndiscarded.',
        formatter_class=RawTextHelpFormatter)
    req = parser.add_argument_group('REQUIRED')
    req.add_argument('-i',
                     '--input_sequences',
                     required=True,
                     action='store',
                     help='Input fasta file.')
    req.add_argument('-t',
                     '--input_taxonomy',
                     required=True,
                     action='store',
                     help='Input taxonomy file.')
    req.add_argument('-o',
                     '--output_sequences',
                     required=True,
                     action='store',
                     help='Output filtered FASTA file.')
    optp = parser.add_argument_group('OPTIONAL')
    optp.add_argument(
        '-g',
        '--taxonomic_groups',
        action='store',
        default='{"d__Bacteria":1200, "d__Archaea":900, "d__Eukaryota":1400}',
        help='List of taxonomic groups and associated minimum seq '
        '\nlength. Any sequences greater than or equal to length n.'
        '\nTip: set to \'{}\' if you only want to use the '
        '\n\'global_length_min\' option.'
        "\n[Default: \'%(default)s\']")
    optp.add_argument(
        '-m',
        '--global_length_min',
        action='store',
        default=1200,
        type=int,
        help='Any taxonomic groups not specified, will have their '
        '\nsequences discarded if they do not fit this length '
        '\ncriteria. Set to large value if you want to remove all '
        '\nunspecified taxonomic groups.'
        '\n[Default: %(default)s]')

    p = parser.parse_args()

    # Parse the group specification before touching any file, so a malformed
    # value cannot leave a truncated output file behind.
    taxonomic_groups_dict = make_tax_group_dict(p.taxonomic_groups)

    input_sequences = read(p.input_sequences, format='fasta')
    try:
        # The taxonomy file is opened first: if it is missing, the output
        # file is never created or overwritten.  Plain 'r' replaces the 'U'
        # mode, which was removed in Python 3.11; text mode already applies
        # universal newline handling.
        with open(p.input_taxonomy, 'r') as input_taxonomy, \
                open(p.output_sequences, 'w') as output_sequences:
            id_taxonomy_dict = make_taxonomy_dict(input_taxonomy)

            filter_seqs_by_len_and_tax(
                input_sequences,
                output_sequences,
                taxonomic_groups_dict,
                id_taxonomy_dict,
                global_length_min=p.global_length_min)
    finally:
        # Release the sequence reader even when filtering fails.
        input_sequences.close()
Пример #30
0
                current = t.ids('observation')
                updated = map(lambda x: x.upper(), current)
                if len(set(updated)) != len(updated):
                    print('************>', a.id, fp, '<**************')
                if set(current) ^ set(updated):
                    print('Changing biom: ', a.id, fp)
                    t.update_ids({i: i.upper() for i in t.ids('observation')},
                                 axis='observation', inplace=True)
                    with biom_open(fp, 'w') as f:
                        t.to_hdf5(f, t.generated_by)
                    checksum = compute_checksum(fp)
            elif fpt == 'preprocessed_fasta':
                changed = False
                tmp = fp + '.tmp'
                with open(tmp, 'w') as out:
                    for seq in read(fp, format='fasta'):
                        seq = str(seq)
                        sequ = seq.upper()
                        out.write('>%s\n%s\n' % (sequ, sequ))
                        if seq != sequ:
                            changed = True
                if changed:
                    print('Changing biom: ', a.id, fp)
                    rename(tmp, fp)
                    checksum = compute_checksum(fp)
                else:
                    remove(tmp)

            if checksum is not None:
                TRN.add(sql, [checksum, _id])
                TRN.execute()
Пример #31
0
def extract_sequences(infile, identifiers=None):
    """Extract sequence(s) from a multi-sequence FASTA file.

    Parameters
    ----------
    infile : str
        file path to input multi-sequence FASTA file
    identifiers :
        int
            sequence index (n-th sequence in the file)
        str
            sequence ID (name) or index
                numeric str is treated as index instead of ID
            comma-separated sequence IDs or indexes
            file path to sequence list (one ID or index per line)
            sequence index range as "start..end" (both included)
                start must be smaller or equal to end
        list of int
            sequence indexes
        list of str
            sequence IDs or indexes
        tuple of two int's
            sequence index range as (start, end)
        if omitted, all sequences will be extracted

    Returns
    -------
    list of skbio Sequence
        extracted protein sequences, in the order they occur in the file

    Raises
    ------
    ValueError
        if tuple (index range) is not in (start, end) form
        if index range str is not formatted as "start..end"
        if the data type of identifiers (or of a list item) is incorrect

    Notes
    -----
    Indexes are 1-based. IDs and indexes may be mixed in one request; a
    sequence is extracted when it matches either its ID or its index.
    """
    items, ids, indexes = [], set(), set()
    if identifiers:
        # IDs or indexes as list
        if isinstance(identifiers, list):
            items = identifiers
        # start and end indexes as tuple of int
        elif isinstance(identifiers, tuple):
            if (len(identifiers) == 2
                    and all(isinstance(n, int) for n in identifiers)
                    and 0 < identifiers[0] <= identifiers[1]):
                items = range(identifiers[0], identifiers[1] + 1)
            else:
                raise ValueError('Error: Index range must be a tuple of '
                                 '(start, end).')
        elif isinstance(identifiers, str):
            # read from a file
            if os.path.isfile(identifiers):
                with open(identifiers, 'r') as f:
                    items = f.read().splitlines()
            # start and end indexes as str
            elif '..' in identifiers:
                bounds = identifiers.split('..')
                if (len(bounds) == 2
                        and all(n.isdigit() for n in bounds)
                        and 0 < int(bounds[0]) <= int(bounds[1])):
                    items = range(int(bounds[0]), int(bounds[1]) + 1)
                else:
                    raise ValueError('Error: Index range must be formatted as '
                                     '"start..end".')
            # IDs or indexes as str (single or comma-separated list)
            else:
                items = [x.strip() for x in identifiers.split(',')]
        # index as int
        elif isinstance(identifiers, int):
            items = [identifiers]
        else:
            raise ValueError('Error: Incorrect data type of identifiers.')
        for item in items:
            if isinstance(item, int):  # index of this protein in the file
                indexes.add(item)
            elif not isinstance(item, str):
                # e.g. a float inside a list: report it as documented
                # instead of failing on a missing str method
                raise ValueError('Error: Incorrect data type of identifiers.')
            elif item.isdigit():
                indexes.add(int(item))
            else:
                ids.add(item)  # protein ID (name)
    # no selector at all means "extract everything"
    select_all = not (ids or indexes)
    seqs = []
    # indexes start with 1, not 0
    for i, seq in enumerate(io.read(infile, format='fasta'), start=1):
        # IDs and indexes are combined, so a mixed request honors both
        if select_all or i in indexes or seq.metadata['id'] in ids:
            seqs.append(seq)
    return seqs
Пример #32
0
#!/usr/bin/env python

import RNA
from skbio import io
import re
import sys

# Usage: exactly one argument, the FASTA file to classify.
if len(sys.argv) != 2:
    sys.exit('[usage] python %s <fasta file>' % sys.argv[0])
fasta_path = sys.argv[1]
for record in io.read(fasta_path, 'fasta'):
    # Drop 20 characters from each end, then strip any leading/trailing Ns.
    trimmed = str(record)[20:-20]
    core = trimmed.strip('N')
    # Only the first value returned by RNA.fold is used; the second is
    # discarded (presumably the folding energy -- confirm against ViennaRNA).
    structure, _ = RNA.fold(core)
    folded = RNA.b2C(structure)
    # Upper-case letters of the converted structure, in order of appearance.
    letters = ''.join(re.findall('[A-Z]', folded))
    has_three_h = 'HHH' in letters
    ends_paired = folded.startswith('(') and folded.endswith(')')
    if has_three_h and ends_paired:
        shape = 'cloverleaf'
    else:
        shape = 'hairpin'
    print(record.metadata['id'], shape, folded, core)
Пример #33
0
def generate_cross_validated_sequences(read_taxa, simulated_reads_fp, index,
                                       iterations, cv_dir):
    '''Generates simulated community files (fasta and taxonomy) as subsets of
    simulated amplicons/taxa for cross-validated taxonomy assignment. Selects
    duplicated taxa names, evenly allocates these among subsets as query taxa
    (test set), generates ref taxa (training set) that do not match query fasta
    IDs, and creates fasta files to match each of these sets.

    For every fold a directory ``<cv_dir>/<index>-iter<N>`` is created holding
    ``ref_seqs.fasta``, ``ref_taxa.tsv``, ``query.fasta`` and
    ``query_taxa.tsv``, plus ``.qza`` artifacts for the reference sequences,
    query sequences and reference taxonomy.

    read_taxa: list or path
        list or file of taxonomies corresponding to simulated_reads_fp
    simulated_reads_fp: path
        simulated amplicon reads (fasta format file)
    index: str
        reference database name
    iterations: int >= 2
        number of subsets to create
    cv_dir: path
        base output directory to contain simulated datasets

    Raises
    ------
    ValueError
        if fewer than two iterations are requested
    RuntimeError
        if not even the first level of a query taxonomy occurs in the
        training set of the same fold
    '''
    if iterations < 2:
        raise ValueError('Must perform two or more iterations for '
                         'construction of cross-validated datasets.')

    # Stratify the data and form the CV data sets
    simulated_reads = list(io.read(simulated_reads_fp, format='fasta'))
    taxonomy = Artifact.import_data('FeatureData[Taxonomy]',
                                    read_taxa,
                                    view_type='HeaderlessTSVTaxonomyFormat')
    tree = build_tree(taxonomy, simulated_reads)
    strata = get_strata(tree, iterations)
    print(index + ': generating', iterations, 'folds on', len(strata),
          'strata')
    X, y = zip(*[(s, t) for t, ss in strata for s in ss])
    skf = StratifiedKFold(n_splits=iterations, shuffle=True, random_state=0)
    splits = [({X[i] for i in train}, {X[i] for i in test})
              for train, test in skf.split(X, y)]

    # Output the CV data sets in the expected formats
    taxonomy_series = taxonomy.view(pd.Series)
    for iteration, (train, test) in enumerate(splits):
        db_iter_dir = join(cv_dir, '{0}-iter{1}'.format(index, iteration))
        if not exists(db_iter_dir):
            makedirs(db_iter_dir)
        query_taxa_fp = join(db_iter_dir, 'query_taxa.tsv')
        query_fp = join(db_iter_dir, 'query.fasta')
        ref_fp = join(db_iter_dir, 'ref_seqs.fasta')
        ref_taxa_fp = join(db_iter_dir, 'ref_taxa.tsv')

        # Output the taxa files.
        # A sorted list is used as the indexer: pandas >= 2.0 rejects sets,
        # and sorting makes the row order reproducible between runs.
        train_series = taxonomy_series[sorted(train)]
        # header=False is explicit because the file is re-imported below as
        # a *headerless* taxonomy; newer pandas writes a header by default.
        train_series.to_csv(ref_taxa_fp, sep='\t', header=False)
        # If a taxonomy in the test set doesn't exist in the training set, trim
        # it until it does
        train_taxonomies = set()
        for lineage in train_series.values:
            ranks = lineage.split(';')
            for level in range(1, len(ranks) + 1):
                train_taxonomies.add(';'.join(ranks[:level]))
        test_list = []
        for sid in sorted(test):
            ranks = taxonomy_series[sid].split(';')
            # walk from the most specific rank towards the root
            for level in range(len(ranks), 0, -1):
                trimmed = ';'.join(ranks[:level])
                if trimmed in train_taxonomies:
                    test_list.append('\t'.join([sid, trimmed.strip()]))
                    break
            else:
                raise RuntimeError('unknown kingdom in query set')
        export_list_to_file(test_list, query_taxa_fp)
        # Output the reference files: every read whose ID is not in the
        # training set goes to the query file
        with open(ref_fp, 'w') as ref_fasta, \
                open(query_fp, 'w') as query_fasta:
            for seq in simulated_reads:
                if seq.metadata['id'] in train:
                    seq.write(ref_fasta, format='fasta')
                else:
                    seq.write(query_fasta, format='fasta')

        # Encode as Artifacts for convenience
        artifact = Artifact.import_data('FeatureData[Sequence]', ref_fp)
        artifact.save(ref_fp[:-5] + 'qza')
        artifact = Artifact.import_data('FeatureData[Sequence]', query_fp)
        artifact.save(query_fp[:-5] + 'qza')
        artifact = Artifact.import_data(
            'FeatureData[Taxonomy]',
            ref_taxa_fp,
            view_type='HeaderlessTSVTaxonomyFormat')
        artifact.save(ref_taxa_fp[:-3] + 'qza')
Пример #34
0
from skbio.io import read, write

# Read the FASTA file together with its quality-score file, then write the
# records out as FASTQ using the "illumina1.8" variant.
fasta_records = read("example.fna", format="fasta", qual="example.qual")
write(fasta_records, format="fastq", into="example.fastq",
      variant="illumina1.8")
Пример #35
0
def summarize_blast6(filename):
    """Load a "blast+6" formatted file and pass it to ``filter_best``.

    The file is read into a ``pandas.DataFrame`` with the default column
    names.  As written, the result of ``filter_best`` (defined elsewhere) is
    not returned, so the function evaluates to ``None``.

    Parameters
    ----------
    filename : str
        path of the tabular file to read
    """
    hits = read(filename, format="blast+6", into=pd.DataFrame,
                default_columns=True)
    best_hits = filter_best(hits)
Пример #36
0
def loadTree(tree):
    """Read the Newick file at path ``tree`` and return it as a ``TreeNode``."""
    with open(tree, 'r') as handle:
        return read(handle, format="newick", into=TreeNode)
Пример #37
0
def extract_sequences(infile, identifiers=None):
    """Extract sequence(s) from a multi-sequence FASTA file.

    Parameters
    ----------
    infile : str
        file path to input multi-sequence FASTA file
    identifiers :
        int
            sequence index (n-th sequence in the file)
        str
            sequence ID (name) or index
                numeric str is treated as index instead of ID
            comma-separated sequence IDs or indexes
            file path to sequence list (one ID or index per line)
            sequence index range as "start..end" (both included)
                start must be smaller or equal to end
        list of int
            sequence indexes
        list of str
            sequence IDs or indexes
        tuple of two int's
            sequence index range as (start, end)
        if omitted, all sequences will be extracted

    Returns
    -------
    list of skbio Sequence
        extracted protein sequences, in the order they occur in the file

    Raises
    ------
    ValueError
        if tuple (index range) is not in (start, end) form
        if index range str is not formatted as "start..end"
        if the data type of identifiers (or of a list item) is incorrect

    Notes
    -----
    Indexes are 1-based. IDs and indexes may be mixed in one request; a
    sequence is extracted when it matches either its ID or its index.
    """
    items, ids, indexes = [], set(), set()
    if identifiers:
        # IDs or indexes as list
        if isinstance(identifiers, list):
            items = identifiers
        # start and end indexes as tuple of int
        elif isinstance(identifiers, tuple):
            if (len(identifiers) == 2
                    and all(isinstance(n, int) for n in identifiers)
                    and 0 < identifiers[0] <= identifiers[1]):
                items = range(identifiers[0], identifiers[1] + 1)
            else:
                raise ValueError('Error: Index range must be a tuple of '
                                 '(start, end).')
        elif isinstance(identifiers, str):
            # read from a file
            if os.path.isfile(identifiers):
                with open(identifiers, 'r') as f:
                    items = f.read().splitlines()
            # start and end indexes as str
            elif '..' in identifiers:
                bounds = identifiers.split('..')
                if (len(bounds) == 2
                        and all(n.isdigit() for n in bounds)
                        and 0 < int(bounds[0]) <= int(bounds[1])):
                    items = range(int(bounds[0]), int(bounds[1]) + 1)
                else:
                    raise ValueError('Error: Index range must be formatted as '
                                     '"start..end".')
            # IDs or indexes as str (single or comma-separated list)
            else:
                items = [x.strip() for x in identifiers.split(',')]
        # index as int
        elif isinstance(identifiers, int):
            items = [identifiers]
        else:
            raise ValueError('Error: Incorrect data type of identifiers.')
        for item in items:
            if isinstance(item, int):  # index of this protein in the file
                indexes.add(item)
            elif not isinstance(item, str):
                # e.g. a float inside a list: report it as documented
                # instead of failing on a missing str method
                raise ValueError('Error: Incorrect data type of identifiers.')
            elif item.isdigit():
                indexes.add(int(item))
            else:
                ids.add(item)  # protein ID (name)
    # no selector at all means "extract everything"
    select_all = not (ids or indexes)
    seqs = []
    # indexes start with 1, not 0
    for i, seq in enumerate(io.read(infile, format='fasta'), start=1):
        # IDs and indexes are combined, so a mixed request honors both
        if select_all or i in indexes or seq.metadata['id'] in ids:
            seqs.append(seq)
    return seqs