def test_write_fasta_sequence1(nucseq, tmpdir):
    """Round-trip one record through ``fasta.write_fasta_sequence``.

    The first record loaded from ``nucseq`` is written to ``test.fa`` inside
    ``tmpdir`` and loaded back; the (header, sequence) pair read from the new
    file must equal the one that was written.
    """
    seq_id, seq = next(fasta.load_fasta(nucseq))
    file_name = (tmpdir / 'test.fa').strpath
    file_handle = open_file(file_name, 'w')
    try:
        fasta.write_fasta_sequence(file_handle, seq_id, seq)
    finally:
        # Close the handle even if the write raises, so it is never leaked.
        file_handle.close()
    seq_idw, seqw = next(fasta.load_fasta(file_name))
    assert (seq_id, seq) == (seq_idw, seqw)
def test_write_fasta_sequence2(nucseq, tmpdir):
    """Check that writing every record keeps the number of records.

    All records loaded from ``nucseq`` are written to ``test.fa`` inside
    ``tmpdir``; the number of records read back from that file must match the
    number of records in the input.
    """
    file_name = (tmpdir / 'test.fa').strpath
    file_handle = open_file(file_name, 'w')
    try:
        for seq_id, seq in fasta.load_fasta(nucseq):
            fasta.write_fasta_sequence(file_handle, seq_id, seq)
    finally:
        # Close the handle even if one of the writes raises, so it is never
        # leaked.
        file_handle.close()
    count1 = sum(1 for x in fasta.load_fasta(nucseq))
    count2 = sum(1 for x in fasta.load_fasta(file_name))
    assert count1 == count2
def filter_command(verbose, len_gt, len_lt, header_contains, seq_pattern,
                   wrap, fasta_file, output_file):
    """
    .. versionadded:: 0.5.7

    Filters a fasta file

    A record read from *fasta_file* is written to *output_file* only when it
    passes every filter whose option is not `None`:

    * *len_gt*: the sequence length must be greater than this value
    * *len_lt*: the sequence length must be less than this value
    * *header_contains*: this value must be contained in the header
    * *seq_pattern*: this value must be contained in the sequence

    If *wrap* is truthy, ``wrap=60`` is passed to the writer, otherwise
    ``wrap=None``. Logging is set to DEBUG if *verbose* is truthy, INFO
    otherwise.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    line_width = 60 if wrap else None

    for header, record in fasta.load_fasta(fasta_file):
        size = len(record)

        # Each guard skips the record as soon as one active filter fails.
        if len_gt is not None and not (size > len_gt):
            continue
        if len_lt is not None and not (size < len_lt):
            continue
        if header_contains is not None and header_contains not in header:
            continue
        if seq_pattern is not None and seq_pattern not in record:
            continue

        fasta.write_fasta_sequence(
            output_file, header, record, wrap=line_width
        )
def translate_command(verbose, trans_table, one_seq, no_wrap, progress,
                      fasta_file, output_file):
    """
    Translates the sequences of a fasta file and writes them to another one

    With a truthy *one_seq* each sequence is translated once with
    `translate_sequence` and written under its original header. Otherwise
    every (header, sequence) pair yielded by `translate_seq` for the record
    is written. *trans_table* is resolved with `load_trans_table` before
    use.

    If *no_wrap* is truthy, ``wrap=None`` is passed to the writer, otherwise
    ``wrap=60``. A truthy *progress* wraps the input iterator with `tqdm`.
    Logging is set to DEBUG if *verbose* is truthy, INFO otherwise.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    if one_seq:
        LOG.info("Assuming the sequences are in the correct frame")

    out_name = getattr(output_file, 'name', repr(output_file))
    LOG.info('Writing to file (%s)', out_name)

    table = load_trans_table(trans_table)

    records = fasta.load_fasta(fasta_file)
    if progress:
        records = tqdm(records)

    line_width = None if no_wrap else 60

    for header, nuc_seq in records:
        if not one_seq:
            for frame_header, frame_seq in translate_seq(
                    header, nuc_seq, table):
                fasta.write_fasta_sequence(
                    output_file, frame_header, frame_seq, wrap=line_width
                )
            continue

        protein = translate_sequence(nuc_seq, 0, table, False)
        fasta.write_fasta_sequence(
            output_file, header, protein, wrap=line_width
        )
def infer_parameters(file_handle, fastq_bool, progress):
    """
    Scans a sequence file and derives the parameters used for a model

    Arguments:
        file_handle: handle of the file to read; read with `load_fastq`
            (``num_qual=True``) if *fastq_bool* is truthy, with
            `fasta.load_fasta` otherwise
        fastq_bool: if truthy, the third element of each record is collected
            and passed to `sequence.extrapolate_model`
        progress: if truthy, the record iterator is wrapped with `tqdm`

    Returns:
        tuple: ``(length, gc_content, model)``, where *length* is the
        maximum sequence length found (0 if there are no records),
        *gc_content* is the `numpy.mean` of the values returned by
        `sequence.sequence_gc_content` for each record and *model* is the
        result of `sequence.extrapolate_model`, or `None` if *fastq_bool* is
        falsy
    """
    # Not every file-like object has a `name` attribute; fall back to its
    # repr instead of raising AttributeError just to log a message.
    LOG.info(
        "Extrapolating model from file %s",
        getattr(file_handle, 'name', repr(file_handle))
    )

    if fastq_bool:
        it = load_fastq(file_handle, num_qual=True)
    else:
        it = fasta.load_fasta(file_handle)

    if progress:
        it = tqdm(it)

    quals = []
    gc_content = []
    length = 0

    for record in it:
        length = max(length, len(record[1]))
        gc_content.append(sequence.sequence_gc_content(record[1]))
        if fastq_bool:
            quals.append(record[2])

    model = sequence.extrapolate_model(quals) if fastq_bool else None

    return length, numpy.mean(gc_content), model
def sequence_command(verbose, reverse, no_wrap, split, reference, progress,
                     gff_file, fasta_file):
    """
    Writes the sequences extracted for the annotations of a GFF file

    The records of *reference* are loaded into a dictionary keyed by header;
    if *split* is truthy only the part of the header before the first space
    is used as key. The annotations parsed from *gff_file* and that
    dictionary are passed to `gff.extract_nuc_seqs` (with *reverse*) and
    each (name, sequence) pair it yields is written to *fasta_file*.

    `utils.exit_script` is called with code 1 if *reference* is `None`. If
    *no_wrap* is truthy, ``wrap=None`` is passed to the writer, otherwise
    ``wrap=60``. A truthy *progress* wraps the sequence iterator with `tqdm`.
    Logging is set to DEBUG if *verbose* is truthy, INFO otherwise.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    if reference is None:
        utils.exit_script('A fasta reference file is required', 1)

    line_width = None if no_wrap else 60

    seqs = {}
    for seq_id, seq in fasta.load_fasta(reference):
        key = seq_id.split(' ')[0] if split else seq_id
        seqs[key] = seq

    annotations = gff.parse_gff(gff_file, gff_type=gff.from_gff)
    extracted = gff.extract_nuc_seqs(annotations, seqs, reverse=reverse)

    if progress:
        extracted = tqdm(extracted)

    for name, seq in extracted:
        fasta.write_fasta_sequence(fasta_file, name, seq, wrap=line_width)
def get_aa_data(f_handle):
    """
    Load amino acid sequences used by HMMER.

    Returns a dictionary whose keys are the headers read from *f_handle*,
    cut at the first space, and whose values are the sequences.
    """
    return {
        header.split(' ')[0]: aa_seq
        for header, aa_seq in fasta.load_fasta(f_handle)
    }
def test_split_fasta_file2(nucseq, tmpdir):
    """Check that splitting a fasta file keeps the number of records.

    ``nucseq`` is split with ``fasta.split_fasta_file`` using the
    ``test{}.fa`` name template inside ``tmpdir``; the records loaded from
    all the resulting ``*.fa`` files must be as many as those in the input.
    """
    name_template = (tmpdir / 'test{}.fa').strpath
    fasta.split_fasta_file(nucseq, name_template, 3)

    out_dir = pathlib.Path(tmpdir.strpath)
    files = [str(path) for path in out_dir.glob('*.fa')]

    expected = sum(1 for _ in fasta.load_fasta(nucseq))
    observed = sum(1 for _ in fasta.load_fasta_files(files))
    assert expected == observed
def uid_command(verbose, table, fasta_file, output_file):
    """
    Replaces the header of each sequence with a random unique ID

    Every record of *fasta_file* is written to *output_file* with a header
    made of a newly generated `uuid4` string. If *table* is not `None`, a
    tab separated line with the new ID and the original header, encoded as
    ASCII, is written to it for each record. Logging is set to DEBUG if
    *verbose* is truthy, INFO otherwise.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    write_table = table is not None

    if write_table:
        table_name = getattr(table, 'name', repr(table))
        LOG.info('Writing Table to file (%s)', table_name)

    out_name = getattr(output_file, 'name', repr(output_file))
    LOG.info('Writing to file (%s)', out_name)

    for header, seq in fasta.load_fasta(fasta_file):
        new_id = str(uuid4())
        if write_table:
            line = "{}\t{}\n".format(new_id, header)
            table.write(line.encode('ascii'))
        fasta.write_fasta_sequence(output_file, new_id, seq)
def test_load_fasta5(oneseq):
    """The first sequence loaded from ``oneseq`` must be 630 characters."""
    records = fasta.load_fasta(oneseq)
    _header, first_seq = next(records)
    assert len(first_seq) == 630
def test_load_fasta4(oneseq):
    """Loading ``oneseq`` must yield exactly one record."""
    n_records = 0
    for _ in fasta.load_fasta(oneseq):
        n_records += 1
    assert n_records == 1
def test_load_fasta3(nucseq):
    """Loading ``nucseq`` must yield exactly 115 records."""
    n_records = 0
    for _ in fasta.load_fasta(nucseq):
        n_records += 1
    assert n_records == 115
def test_load_fasta2(nucseq):
    """The first header loaded from ``nucseq`` must be the expected one."""
    records = fasta.load_fasta(nucseq)
    first_header, _seq = next(records)
    assert first_header == u'contig-1467318'
def test_load_fasta1(nucseq):
    """The first sequence loaded from ``nucseq`` must be the expected one."""
    expected = u'ATCGATGTCTGCCGCAATGACGGTGGCACCACGCTGGGCAGAGCGTACGACGGCACCCATACCGACCATGCCACAGCCAATCACCATGACGACATCGATATCTGTTACCTGGGCACGACTGACAGCATGGAAACCCACACTCATCGGTTCGATTAATGCACAGGTGCGGGGTGTCAGCAGTCCTGCGGGGATGACTTTCTCCCAGGGCAGGGCGAGATACTCACACATGGCTCCCCAGCGCTGCACACCTAATGTCTGGTTGTGCTCGCAGGCATTGACACGGTCGTTACGGCATGACGCACATTTTCCACAGTTGGTGTAGGGGTTGACGGTGACGGTCATACCGGGCTTCAGTCCCTCAGGTACGTTCTTGCCAATCTTGACAATCTCCGCACCTACCTCATGACCGGGAACCACAGGCATCTTCACCATCGGGTTACCGCCACGGAAGGTATTCAGGTCACTGCCGCAAAAACC'
    records = fasta.load_fasta(nucseq)
    _header, first_seq = next(records)
    assert first_seq == expected
def nucseq(shared_datadir):
    """Return the records of ``test-seq-nuc.fa`` as a dictionary.

    The file is looked up inside ``shared_datadir`` and the dictionary is
    built from the pairs yielded by ``fasta.load_fasta``.
    """
    fasta_path = str(shared_datadir / 'test-seq-nuc.fa')
    records = fasta.load_fasta(fasta_path)
    return dict(records)
def aaseq(shared_datadir):
    """Return the records of ``test-seq-aa.fa`` as a dictionary.

    The file is looked up inside ``shared_datadir`` and the dictionary is
    built from the pairs yielded by ``fasta.load_fasta``.
    """
    fasta_path = str(shared_datadir / 'test-seq-aa.fa')
    records = fasta.load_fasta(fasta_path)
    return dict(records)