Exemplo n.º 1
0
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     number_of_sequences_per_file=20000):
    """Split a FASTA file into consecutive part files inside `dest_dir`.

    Records are read through `SequenceSource` and written back as two lines
    each ('>id' followed by the sequence). A new part file is started
    whenever `(pos - 1)` of the reader is a multiple of
    `number_of_sequences_per_file`; part files are named
    `<prefix>-<8-digit part number>`, numbered from 1.

    Args:
        input_file_path: path of the FASTA file to split.
        dest_dir: directory in which the part files are created.
        prefix: file name prefix used for every part file.
        number_of_sequences_per_file: number of records written to each
            part (the last part may hold fewer).

    Returns:
        List of the part file paths, in the order they were created. The
        list is empty when the source yields no records.
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    source = SequenceSource(input_file_path)

    parts = []
    next_part = 1
    part_obj = None

    # try/finally guarantees the part file currently being written is closed
    # even when reading the source or writing a record raises.
    try:
        while source.next():
            # NOTE(review): this relies on `pos` being 1 for the first record
            # so that the very first iteration opens part 1 -- confirm
            # against SequenceSource.
            if (source.pos - 1) % number_of_sequences_per_file == 0:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if part_obj is not None:
                    part_obj.close()
                    # Forget the closed handle so a failing open() below does
                    # not leave a stale object for the finally clause.
                    part_obj = None

                file_path = os.path.join(dest_dir,
                                         prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                part_obj = open(file_path, 'w')

            part_obj.write('>%s\n' % source.id)
            part_obj.write('%s\n' % source.seq)
    finally:
        if part_obj is not None:
            part_obj.close()

    # Terminate the '\r'-rewritten progress line.
    sys.stderr.write('\n')
    return parts
Exemplo n.º 2
0
def split_fasta_file(input_file_path, dest_dir, prefix='part', number_of_sequences_per_file=20000):
    """Write the records of a FASTA file into numbered chunk files.

    Every record obtained from `SequenceSource(input_file_path)` is written
    as '>id' plus the sequence on the following line. Chunk files live in
    `dest_dir` and are called `<prefix>-00000001`, `<prefix>-00000002`, ...;
    a new chunk is opened each time the reader's `pos - 1` is divisible by
    `number_of_sequences_per_file`.

    Args:
        input_file_path: FASTA file to read.
        dest_dir: output directory for the chunk files.
        prefix: leading part of each chunk file name.
        number_of_sequences_per_file: records per chunk; the final chunk
            may contain fewer.

    Returns:
        The chunk file paths as a list, in creation order (empty if the
        source produced no records).
    """
    debug('%s; src: %s, dest dir: %s' % (my_name(), input_file_path, dest_dir))

    reader = SequenceSource(input_file_path)

    parts = []
    next_part = 1
    current = None

    try:
        while reader.next():
            # NOTE(review): assumes reader.pos == 1 on the first record, which
            # is what makes the first iteration open a chunk -- verify.
            starts_new_part = \
                (reader.pos - 1) % number_of_sequences_per_file == 0

            if starts_new_part:
                sys.stderr.write('\rCreating part: ~ %s' % (pp(next_part)))
                sys.stderr.flush()

                if current is not None:
                    current.close()
                    current = None

                file_path = os.path.join(dest_dir, prefix + '-%08d' % next_part)
                parts.append(file_path)
                next_part += 1
                current = open(file_path, 'w')

            current.write('>%s\n' % reader.id)
            current.write('%s\n' % reader.seq)
    finally:
        # Close the chunk in progress on both the normal and the error path,
        # so a failure mid-way does not leak an open file handle.
        if current is not None:
            current.close()

    # End the progress line that was being overwritten with '\r'.
    sys.stderr.write('\n')
    return parts
Exemplo n.º 3
0
def split_file(ids_file,
               source_file,
               filtered_dest_file,
               survived_dest_file,
               type='fasta'):
    """Split the reads of `source_file` into two files based on `ids_file`.

    `ids_file` holds one read id per line (surrounding whitespace is
    stripped). Each record of the source goes to:

        filtered_dest_file   if its id is listed in ids_file
        survived_dest_file   otherwise

    A listed id is consumed by its first match, so a later record carrying
    the same id is written to the survived file.

    Args:
        ids_file: path of the file listing the ids to filter out.
        source_file: path of the sequence file to split.
        filtered_dest_file: output path for records whose id is listed.
        survived_dest_file: output path for all remaining records.
        type: format of `source_file`; only 'fasta' is implemented.

    Returns:
        True on success.

    Raises:
        FilterError: if `ids_file` cannot be opened.
        UtilsError: if `type` is not 'fasta'.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file,
             survived_dest_file))

    try:
        ids_handle = open(ids_file)
    except IOError:
        # Report the path that failed to open. The message used to be built
        # from `ids_to_filter`, which is still unbound at this point, so the
        # handler itself crashed instead of raising FilterError.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file))

    try:
        ids_to_filter = set([line.strip() for line in ids_handle])
    finally:
        ids_handle.close()

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type))

    source = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    # Nested try/finally so both outputs are closed even if opening the
    # second file, reading the source or writing a record fails.
    filtered_output = open(filtered_dest_file, 'w')
    try:
        survived_output = open(survived_dest_file, 'w')
        try:
            while source.next():
                if source.pos % 10000 == 0 or source.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' %
                                     (pp(source.pos)))
                    sys.stderr.flush()

                record = '>%s\n%s\n' % (source.id, source.seq)

                if source.id in ids_to_filter:
                    ids_to_filter.remove(source.id)
                    filtered_output.write(record)
                    filtered_count += 1
                else:
                    survived_output.write(record)
                    survived_count += 1

            # Terminate the '\r'-rewritten progress line.
            sys.stderr.write('\n')
        finally:
            survived_output.close()
    finally:
        filtered_output.close()

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True
Exemplo n.º 4
0
def split_file(ids_file, source_file, filtered_dest_file, survived_dest_file, type='fasta'):
    """Partition the reads of a sequence file using a list of read ids.

    The ids are read from `ids_file`, one per line, with surrounding
    whitespace removed. For every record in `source_file`:

        id listed in ids_file  --> filtered_dest_file
        anything else          --> survived_dest_file

    Each listed id matches only once: it is dropped from the lookup set
    when first seen, so a repeated id in the source ends up in the
    survived file.

    Args:
        ids_file: path of the text file containing the ids to filter.
        source_file: path of the sequence file to partition.
        filtered_dest_file: destination for records with a listed id.
        survived_dest_file: destination for every other record.
        type: input format; 'fasta' is the only implemented value.

    Returns:
        True once both destination files have been written.

    Raises:
        FilterError: when `ids_file` cannot be opened.
        UtilsError: when `type` is anything other than 'fasta'.
    """
    debug('%s; src: "%s" (%s), filtered_dest: "%s", survived_dest: "%s"'
          % (my_name(), source_file, type, filtered_dest_file, survived_dest_file))

    try:
        id_lines = open(ids_file)
    except IOError:
        # The original handler interpolated `ids_to_filter`, a name that is
        # not yet bound when open() fails, so it died with a NameError-type
        # failure; the path of the missing file is what belongs here.
        raise FilterError('Hit IDs file missing ("%s").' % (ids_file))

    try:
        ids_to_filter = set([entry.strip() for entry in id_lines])
    finally:
        id_lines.close()

    if type != 'fasta':
        raise UtilsError("type '%s' is not implemented" % (type))

    def store(entry, handle):
        # One record = header line with the id, then the sequence line.
        handle.write('>%s\n%s\n' % (entry.id, entry.seq))

    reads = SequenceSource(source_file)
    filtered_count, survived_count = 0, 0

    filtered_output = open(filtered_dest_file, 'w')
    try:
        survived_output = open(survived_dest_file, 'w')
        try:
            while reads.next():
                if reads.pos % 10000 == 0 or reads.pos == 1:
                    sys.stderr.write('\rSplitting FASTA file: ~ %s' % (pp(reads.pos)))
                    sys.stderr.flush()

                if reads.id in ids_to_filter:
                    ids_to_filter.remove(reads.id)
                    store(reads, filtered_output)
                    filtered_count += 1
                else:
                    store(reads, survived_output)
                    survived_count += 1

            # Finish the progress line that '\r' kept overwriting.
            sys.stderr.write('\n')
        finally:
            # Runs on success and on failure, so the handle never leaks.
            survived_output.close()
    finally:
        filtered_output.close()

    debug('%s; done. of %s total reads, filtered: %s, survived: %s.'
          % (my_name(), pp(filtered_count + survived_count),
             pp(filtered_count), pp(survived_count)))

    return True