Exemplo n.º 1
0
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     num_reads_per_file=5000):
    """Split a FASTA file into numbered part files written under `dest_dir`.

    A new part file is started every `num_reads_per_file` reads. Each part is
    named ``<prefix>-<n>-<8 random alphanumeric characters>.fa`` where ``n``
    starts at 1.

    Args:
        input_file_path: path of the FASTA file to split.
        dest_dir: directory the part files are created in.
        prefix: leading component of every part file name.
        num_reads_per_file: maximum number of reads stored in one part.

    Returns:
        The list of part file paths, in the order they were created.
    """
    alphabet = string.ascii_letters + string.digits
    input_fasta = u.SequenceSource(input_file_path)

    parts = []
    part_obj = None

    try:
        while input_fasta.next():
            # NOTE(review): this relies on `pos` being 1 for the first read so
            # that the very first iteration opens part 1 -- confirm against
            # SequenceSource.
            if (input_fasta.pos - 1) % num_reads_per_file == 0:
                if part_obj is not None:
                    part_obj.close()

                rand_bit = ''.join(random.choice(alphabet) for _ in range(8))
                file_path = os.path.join(
                    dest_dir,
                    '%s-%d-%s.fa' % (prefix, len(parts) + 1, rand_bit))
                parts.append(file_path)
                part_obj = u.FastaOutput(file_path)

            part_obj.store(input_fasta, split=False)
    finally:
        # Release both handles even when reading or writing fails half-way.
        if part_obj is not None:
            part_obj.close()
        input_fasta.close()

    return parts
def main(input_fasta_path, output_fasta_path=None, reverse=False):
    """Pad every read with '-' characters up to the longest read's length.

    The input is read twice: once to find the longest read, once to write
    the padded reads. Progress is reported on stderr every 10000 reads.

    Args:
        input_fasta_path: path of the FASTA file to pad.
        output_fasta_path: destination path; defaults to
            ``input_fasta_path + '-PADDED-WITH-GAPS'`` when empty or None.
        reverse: when true the gaps are prepended to each read, otherwise
            they are appended.
    """
    if not output_fasta_path:
        output_fasta_path = input_fasta_path + '-PADDED-WITH-GAPS'

    fasta = u.SequenceSource(input_fasta_path)
    output = u.FastaOutput(output_fasta_path)

    try:
        # First pass: find the target length.
        longest_read = 0
        while next(fasta):
            longest_read = max(longest_read, len(fasta.seq))

        fasta.reset()

        # Second pass: write each read with its padding.
        while next(fasta):
            if fasta.pos % 10000 == 0:
                sys.stderr.write('\rreads processed so far: %d' % (fasta.pos))
                sys.stderr.flush()

            padding = '-' * (longest_read - len(fasta.seq))

            output.write_id(fasta.id)
            if reverse:
                output.write_seq(padding + fasta.seq, split=False)
            else:
                output.write_seq(fasta.seq + padding, split=False)
    finally:
        fasta.close()
        # The output was previously left open, which can lose buffered reads.
        output.close()

    sys.stderr.write('\n')
Exemplo n.º 3
0
def trim_uninformative_columns_from_alignment(input_file_path):
    """Remove columns that are '-' in every read, rewriting the file in place.

    The alignment width is taken from the first read. The trimmed reads are
    written to a temporary file which then replaces `input_file_path`.

    Args:
        input_file_path: path of the aligned FASTA file to trim; it is
            overwritten with the trimmed content.
    """
    input_fasta = u.SequenceSource(input_file_path, lazy_init=False)
    input_fasta.next()
    fasta_read_len = len(input_fasta.seq)
    input_fasta.reset()

    # Start by assuming every column is gap-only and narrow the set down.
    # Building a fresh set per read avoids removing items from a collection
    # while iterating over it, which silently skipped columns before.
    gap_only_columns = set(range(fasta_read_len))
    while gap_only_columns and input_fasta.next():
        seq = input_fasta.seq
        gap_only_columns = set(i for i in gap_only_columns if seq[i] == '-')

    columns_to_keep = [
        i for i in range(fasta_read_len) if i not in gap_only_columns
    ]

    input_fasta.reset()

    # Reserve a temporary path; FastaOutput reopens it by name below.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file_path = temp_file.name
    temp_file.close()

    trimmed_output = u.FastaOutput(temp_file_path)

    while input_fasta.next():
        seq = input_fasta.seq
        trimmed_output.write_id(input_fasta.id)
        trimmed_output.write_seq(
            ''.join([seq[i] for i in columns_to_keep]), split=False)

    trimmed_output.close()
    # Close the reader before its file is replaced.
    input_fasta.close()

    # overwrite the original file with trimmed content
    shutil.move(temp_file_path, input_file_path)
Exemplo n.º 4
0
def gen_tmpl(taxon,
             otu_id_to_greengenes,
             greengenes_alignment,
             output_file_path=None):
    """Write a template FASTA with the aligned reads that belong to `taxon`.

    Args:
        taxon: substring looked for in each taxonomy string.
        otu_id_to_greengenes: path of a tab-delimited file with two columns
            per line: an id and a taxonomy string.
        greengenes_alignment: path of the FASTA alignment whose entries are
            selected by id.
        output_file_path: destination path; defaults to ``'<taxon>.tmpl'``
            when empty or None.
    """
    ids = set()

    with open(otu_id_to_greengenes) as mapping:
        for line in mapping:
            line = line.strip()
            if not line:
                continue
            otu_id, tax = line.split('\t')
            # `in` also accepts a match at the very start of the taxonomy
            # string, which `find(...) > 0` rejected.
            if taxon in tax:
                ids.add(otu_id)

    print('%d ids found for %s.' % (len(ids), taxon))

    template = u.FastaOutput(
        output_file_path if output_file_path else '%s.tmpl' % taxon)
    fasta = u.SequenceSource(greengenes_alignment)
    while ids and fasta.next():
        if fasta.id in ids:
            template.store(fasta, split=False)
            # Each id is stored once, even if it repeats in the alignment.
            ids.remove(fasta.id)

    fasta.close()
    template.close()
Exemplo n.º 5
0
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     num_reads_per_file=5000):
    """Split a FASTA file into numbered part files written under `dest_dir`.

    A new part file is started every `num_reads_per_file` reads. Parts are
    named ``<prefix>-<n>`` where ``n`` starts at 1.

    Args:
        input_file_path: path of the FASTA file to split.
        dest_dir: directory the part files are created in.
        prefix: leading component of every part file name.
        num_reads_per_file: maximum number of reads stored in one part.

    Returns:
        The list of part file paths, in the order they were created.
    """
    input_fasta = u.SequenceSource(input_file_path)

    parts = []
    part_obj = None

    try:
        while input_fasta.next():
            # NOTE(review): this relies on `pos` being 1 for the first read so
            # that the very first iteration opens part 1 -- confirm against
            # SequenceSource.
            if (input_fasta.pos - 1) % num_reads_per_file == 0:
                if part_obj is not None:
                    part_obj.close()

                file_path = os.path.join(
                    dest_dir, '%s-%d' % (prefix, len(parts) + 1))
                parts.append(file_path)
                part_obj = u.FastaOutput(file_path)

            part_obj.store(input_fasta, split=False)
    finally:
        # Release both handles even when reading or writing fails half-way.
        if part_obj is not None:
            part_obj.close()
        input_fasta.close()

    return parts
Exemplo n.º 6
0
def mask_defline_whitespaces_in_FASTA(fasta_file_path,
                                      defline_white_space_mask='<$!$>'):
    """Replace spaces in every defline with a mask, rewriting the file in place.

    The masked entries are written to ``fasta_file_path + '.tmp'``, which
    then replaces the original file.

    Args:
        fasta_file_path: path of the FASTA file to rewrite.
        defline_white_space_mask: text substituted for each space character
            in an entry id.
    """
    temp_file_path = fasta_file_path + '.tmp'
    fasta = u.SequenceSource(fasta_file_path)
    output = u.FastaOutput(temp_file_path)

    try:
        while fasta.next():
            output.write_id(fasta.id.replace(' ', defline_white_space_mask))
            output.write_seq(fasta.seq, split=False)
    finally:
        # Both handles must be closed before the original is replaced,
        # otherwise buffered output may be missing from the moved file.
        output.close()
        fasta.close()

    shutil.move(temp_file_path, fasta_file_path)
Exemplo n.º 7
0
def main(input_fasta, subsample_to, output_fasta):
    """Write at most `subsample_to` randomly chosen reads per sample.

    All reads are loaded into memory grouped by the sample name derived from
    each defline, every group is shuffled, and the first `subsample_to`
    reads of each group are written in sorted sample order. Progress is
    reported on stderr.

    Args:
        input_fasta: path of the FASTA file to subsample.
        subsample_to: maximum number of reads kept per sample.
        output_fasta: path of the FASTA file to write.
    """
    fasta = u.SequenceSource(input_fasta)

    fasta_content = {}

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write(
                '\r[Reading FASTA into memory] reads processed so far: %d' %
                (fasta.pos))
            sys.stderr.flush()

        sample_name = get_sample_name_from_defline(fasta.id)
        fasta_content.setdefault(sample_name, []).append(
            (fasta.id, fasta.seq))

    fasta.close()

    samples = sorted(fasta_content)
    # Show up to three sample names. The slice already covers the short
    # case, so the names are joined exactly once (joining an already joined
    # string used to interleave ', ' between its characters).
    sys.stderr.write(
        '\n%d samples found in the FASTA file: %s%s\n' %
        (len(samples),
         ', '.join(samples[:3]),
         ' (...)' if len(samples) > 3 else '.'))

    for sample_counter, sample in enumerate(samples, 1):
        sys.stderr.write('\r[Shuffling] Sample %d of %d' %
                         (sample_counter, len(samples)))
        sys.stderr.flush()

        random.shuffle(fasta_content[sample])

    output = u.FastaOutput(output_fasta)

    for sample_counter, sample in enumerate(samples, 1):
        sys.stderr.write('\r[Writing Output] Sample %d of %d' %
                         (sample_counter, len(samples)))
        sys.stderr.flush()

        # The group is already shuffled, so a prefix is a random subsample.
        for read_id, read_seq in fasta_content[sample][0:subsample_to]:
            output.write_id(read_id)
            output.write_seq(read_seq, split=False)

    output.close()

    sys.stderr.write('\n')
    sys.stderr.flush()
Exemplo n.º 8
0
 def store_node_representatives(self,
                                node_ids,
                                output_file_path,
                                store_gaps=False):
     """Write the representative sequence of each given node to a FASTA file.

     Args:
         node_ids: ids of the nodes to store; each id is also used as the
             entry id in the output.
         output_file_path: path of the FASTA file to write.
         store_gaps: when false, '-' characters are removed from each
             representative sequence before it is written.
     """
     writer = u.FastaOutput(output_file_path)
     for current_id in node_ids:
         writer.write_id(current_id)
         representative = self.nodes[current_id].representative_seq
         if not store_gaps:
             representative = representative.replace('-', '')
         writer.write_seq(representative, split=False)
     writer.close()
Exemplo n.º 9
0
def unique_and_store_alignment(alignment_path, output_path):
    """Store the unique reads of an alignment and summarise them.

    Args:
        alignment_path: path of the FASTA alignment, opened with
            ``unique=True``.
        output_path: path of the FASTA file the entries are stored in.

    Returns:
        A tuple ``(read_ids, unique_read_counts, most_abundant_unique_read)``:
        the concatenated ``ids`` of every entry, the number of ids per entry,
        and the sequence of the first entry.
    """
    writer = u.FastaOutput(output_path)
    source = u.SequenceSource(alignment_path, unique=True)

    # Peek at the first entry, then rewind for the full pass.
    # NOTE(review): the name suggests entries come ordered by abundance --
    # confirm against SequenceSource(unique=True).
    source.next()
    first_entry_seq = source.seq
    source.reset()

    collected_ids = []
    ids_per_entry = []
    while source.next():
        entry_ids = source.ids
        collected_ids.extend(entry_ids)
        ids_per_entry.append(len(entry_ids))
        writer.store(source, split=False)

    writer.close()
    source.close()

    return (collected_ids, ids_per_entry, first_entry_seq)
Exemplo n.º 10
0
import sys

import Oligotyping.lib.fastalib as u

# Pads every read of the FASTA file given as the first command line argument
# with trailing '-' characters up to the length of the longest read, writing
# the result next to the input with a '-PADDED-WITH-GAPS' suffix.
fasta = u.SequenceSource(sys.argv[1])
output = u.FastaOutput(sys.argv[1] + '-PADDED-WITH-GAPS')

# first pass: find the length every read is padded to
longest_read = 0
while fasta.next():
    longest_read = max(longest_read, len(fasta.seq))

fasta.reset()

# second pass: write the padded reads
while fasta.next():
    if fasta.pos % 10000 == 0:
        sys.stdout.write('\rreads processed so far: %d' % (fasta.pos))
        sys.stdout.flush()

    gaps = longest_read - len(fasta.seq)

    output.write_id(fasta.id)
    output.write_seq(fasta.seq + '-' * gaps, split=False)

fasta.close()
# the output was previously never closed, which can lose buffered reads
output.close()
# terminate the '\r' progress line; an explicit write behaves the same on
# Python 2 and 3, unlike a bare `print`
sys.stdout.write('\n')
Exemplo n.º 11
0
def main(fasta_file_path, min_percent=95.0, output_file_path=None):
    """Trim the start of an alignment so that enough reads cover it.

    For every column the number of reads that have already started (shown
    their first non-gap character) is counted. The trim location is the
    first column at which at least `min_percent` of the started reads are
    present. Reads whose trimmed sequence begins with '-' are dropped; the
    rest are written trimmed. A short report is printed to stdout and
    progress goes to stderr.

    Args:
        fasta_file_path: path of the aligned FASTA file; the alignment width
            is taken from its first read.
        min_percent: minimum percentage of reads that must have started at
            the trim location.
        output_file_path: destination path; defaults to
            ``fasta_file_path + '-TRIMMED'`` when empty or None.

    Raises:
        ValueError: if `min_percent` exceeds 100, or if no read contains a
            non-gap character within the alignment width.
    """
    if min_percent > 100:
        raise ValueError(
            'min_percent cannot exceed 100, got %r' % (min_percent, ))

    if not output_file_path:
        output_file_path = fasta_file_path + '-TRIMMED'

    fasta = u.SequenceSource(fasta_file_path)

    fasta.next()
    alignment_length = len(fasta.seq)
    fasta.reset()

    # starts[i]: number of reads whose first non-gap character is column i.
    starts = [0] * alignment_length

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rAnalyzing all reads; pos: %d' % fasta.pos)
            sys.stderr.flush()
        seq = fasta.seq
        first_base = len(seq) - len(seq.lstrip('-'))
        # Gap-only reads (first_base == len(seq)) never start, so skip them.
        if first_base < len(seq) and first_base < alignment_length:
            starts[first_base] += 1

    fasta.reset()
    sys.stderr.write('\n')

    # positions[i]: number of reads that started at or before column i.
    # A dense list is used so columns before the earliest start hold 0
    # instead of being missing keys.
    positions = []
    started = 0
    for count in starts:
        started += count
        positions.append(started)

    num_reads = positions[-1] if positions else 0
    if not num_reads:
        fasta.close()
        raise ValueError(
            'no read with a non-gap character found in "%s"' %
            fasta_file_path)

    # None (not 0) marks "not found yet", because column 0 is a valid answer.
    trim_location = None

    for i in range(alignment_length):
        if (trim_location is None
                and positions[i] * 100.0 / num_reads >= min_percent):
            trim_location = i
        if positions[i] == num_reads:
            # Every read has started by column i; report and stop scanning.
            eliminated = num_reads - positions[trim_location]
            print('')
            print('All reads are going to be trimmed from the %dth position.'
                  % trim_location)

            if eliminated:
                print('')
                print('%d reads that do not reach to this locaition will be eliminated.'
                      % eliminated)

            if min_percent < 100:
                print('')
                print('If all reads were to be retained, alignments should have been trimmed from')
                print('the %dth location, however, this would have required all reads to lose %d'
                      % (i, i - trim_location))
                print('bases')
            print('')
            break

    output = u.FastaOutput(output_file_path)

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rStoring trimmed reads; pos: %d' % fasta.pos)
            sys.stderr.flush()

        trimmed = fasta.seq[trim_location:]
        if trimmed.startswith('-'):
            continue

        output.write_id(fasta.id)
        output.write_seq(trimmed, split=False)

    output.close()
    fasta.close()

    sys.stderr.write('\n')
    sys.stderr.write('\n')
    print('Trimmed reads stored: "%s"\n' % output_file_path)
Exemplo n.º 12
0
# removes samples from FASTA file:
#
# ./me FASTA_FILE sample_1,sample_2,[...],sample_N
#

import sys

import Oligotyping.lib.fastalib as u
from Oligotyping.utils.utils import pretty_print as pp

# Copies the FASTA file given as the first argument to
# '<input>-SAMPLES-REMOVED.fa', leaving out every read whose sample name is
# listed in the comma-separated second argument.
fasta = u.SequenceSource(sys.argv[1])
output = u.FastaOutput(sys.argv[1] + '-SAMPLES-REMOVED.fa')
samples_to_be_removed = tuple(
    name.strip() for name in sys.argv[2].split(','))

while fasta.next():
    if not fasta.pos % 1000:
        sys.stderr.write('\rreads processed so far: %s' % (pp(fasta.pos)))
        sys.stderr.flush()

    # the sample name is everything before the last underscore of the id
    # (empty when the id has no underscore)
    sample_name = fasta.id.rpartition('_')[0]

    if sample_name not in samples_to_be_removed:
        output.store(fasta, split=False)

sys.stderr.write('\rNew FASTA file .............: %s\n' % (sys.argv[1] + '-SAMPLES-REMOVED.fa'))
fasta.close()
output.close()
# -*- coding: utf-8 -*-

import sys

import Oligotyping.lib.fastalib as u

# Selects the entries of the FASTA file (first argument) whose id contains
# the taxon string (second argument) and writes each one `abundance` times
# to a file named after the taxon with ';' characters removed.
fasta = u.SequenceSource(sys.argv[1])
taxon = sys.argv[2]

output = u.FastaOutput(sys.argv[2].replace(';', ''))

while fasta.next():
    if taxon not in fasta.id:
        continue

    # ids are '|'-delimited; fields 1, 2 and 7 carry 'key=value' pairs
    fields = fasta.id.split('|')
    acc = fields[0]
    project = fields[1].split('=')[1]
    sample = fields[2].split('=')[1]
    new_id = '_'.join((project, sample, acc))

    abundance = int(fields[7].split('=')[1])

    # replicate the read once per unit of abundance, numbering the copies
    for i in range(abundance):
        output.write_id('%s-%s|%s' % (new_id, i, fasta.id))
        output.write_seq(fasta.seq, split=False)

fasta.close()
output.close()
Exemplo n.º 14
0
# -*- coding: utf-8 -*-

import sys

import Oligotyping.lib.fastalib as u

# Writes every read of the FASTA file given as the first argument to
# '<input>-TRIMMED', keeping only the slice [trim_from:trim_to] of each
# sequence. trim_from is the second argument; trim_to is the optional third.
fasta = u.SequenceSource(sys.argv[1], lazy_init=False)
output = u.FastaOutput(sys.argv[1] + '-TRIMMED')

trim_from = int(sys.argv[2])
trim_to = None
if len(sys.argv) == 4:
    trim_to = int(sys.argv[3])

# a missing or zero trim_to means "keep everything from trim_from onwards"
window = slice(trim_from, trim_to or None)

while fasta.next():
    output.write_id(fasta.id)
    output.write_seq(fasta.seq[window], split=False)

fasta.close()
output.close()