예제 #1
0
def add_delta_f_to_file(input_path, output_path, min_delta=0):
    """
    Copy a tab-separated file, appending a ``delta_F`` column to each row.

    The first line of the input is treated as a header and must contain at
    least four columns. For every following row, the values from the fourth
    column onwards are parsed as floats and
    ``delta_F = round(max(values) - min(values), 3)`` is computed.

    * Rows whose delta is strictly greater than ``min_delta`` are written to
      the output with the delta appended as an extra column.
    * Valid rows whose delta is not greater than ``min_delta`` are dropped.
    * Empty rows, rows with fewer columns than the header and rows with
      non-numeric values are reported through ``utils.print_warning`` and
      copied to the output exactly once, without a delta column.

    :param input_path: path of the tab-separated input file.
    :param output_path: path of the output file (overwritten).
    :param min_delta: threshold the delta must exceed (strict ``>``) for the
        row to be included in the output.
    :rtype: None
    """
    # Context managers guarantee both files are closed even if parsing or
    # error reporting raises.
    with open(input_path, 'r') as reader, open(output_path, 'w') as writer:
        header_line = utils.remove_line_delimiter(reader.readline())
        column_number = len(header_line.split(utils.COLUMN_DELIMITER))
        if column_number < 4:
            utils.print_error(['Invalid header line: ', header_line], True)
        writer.write(header_line + utils.COLUMN_DELIMITER + 'delta_F' +
                     utils.LINES_DELIMITER)

        def process_invalid_line(invalid_line, line_index, text):
            """Warn about an invalid row and copy it to the output once."""
            message = ['Line %s: %s' % (line_index, text)]
            if invalid_line.strip():
                message.append(invalid_line)
            utils.print_warning(message)
            # Terminate the row explicitly so it cannot merge with the
            # next one; callers must not write the row again.
            writer.write(invalid_line + utils.LINES_DELIMITER)

        # Line numbers count data rows only (the header is not counted),
        # starting at 1.
        for line_index, raw_line in enumerate(reader, 1):
            line = utils.remove_line_delimiter(raw_line)

            if not line.strip():
                process_invalid_line(line, line_index, 'Empty line')
                continue

            tabs = line.split(utils.COLUMN_DELIMITER)
            if len(tabs) < column_number:
                process_invalid_line(
                    line, line_index,
                    'Invalid number of values: %s (%s expected)' %
                    (len(tabs), column_number))
                continue

            try:
                # A dedicated name is used for the comprehension variable:
                # in Python 2 it leaks into the enclosing scope and would
                # otherwise clobber the line counter.
                column_values = [float(value) for value in tabs[3:]]
            except ValueError as e:
                process_invalid_line(line, line_index,
                                     'Invalid value: %s' % e)
                continue

            delta = round(max(column_values) - min(column_values), 3)
            if delta > min_delta:
                writer.write(line + utils.COLUMN_DELIMITER + str(delta) +
                             utils.LINES_DELIMITER)

    utils.print_result_file(output_path, 'for input file "%s"' % input_path)
예제 #2
0
def process_files(alignment_paths, reference_path, output_path, quiet=False):
    """
    Compare aligned reads with a reference and write per-file mutation rates.

    The first record of ``reference_path`` is used as the reference sequence.
    Every record of every alignment file is compared with it position by
    position (only over the overlapping prefix when the lengths differ, in
    which case a warning is printed). Each differing character is counted as
    a mutation for the file it came from.

    The output is a delimited table with the columns ``pos`` (1-based),
    ``ref``, ``alt`` followed by one column per alignment file (named after
    the file's base name) holding ``mutation count / number of records in
    that file`` rounded to three decimals. A file without any records gets a
    rate of ``0.0``.

    :param alignment_paths: list of fasta files with aligned reads.
    :param reference_path: reference genome fasta file.
    :param output_path: output file.
    :param quiet: not print the progress on standard output
    :type quiet: bool
    :rtype: None
    """
    with open(reference_path) as reader:
        # The next() builtin works on Python 2.6+ and Python 3 alike.
        reference_sequence = next(SeqIO.parse(reader, 'fasta')).seq
    reference_sequence_len = len(reference_sequence)

    def check_seq(seq_record):
        """Warn when a record's length differs from the reference length."""
        fasta_seq_len = len(seq_record.seq)
        if reference_sequence_len != fasta_seq_len:
            utils.print_warning('%s: invalid length %s (%s expected)' %
                                (seq_record.description, fasta_seq_len,
                                 reference_sequence_len))

    col_names = []
    alignment_number_by_groups = {}
    # position -> alternative character -> group id -> number of reads
    all_mutations = {}

    for alignment_path in alignment_paths:
        with open(alignment_path) as reader:
            # currently all the sequences of the same group are in the same
            # file, so the file's base name serves as the group id.
            # NOTE(review): two paths sharing a base name would produce a
            # duplicated column whose read count is overwritten.
            if not quiet:
                print('Processing ' + alignment_path)
            group_id = os.path.split(alignment_path)[1]
            col_names.append(group_id)
            count = 0
            for fasta in SeqIO.parse(reader, 'fasta'):
                count += 1
                check_seq(fasta)
                # zip() stops at the shorter sequence, so only the
                # overlapping prefix is compared.
                for position, (reference_val, alignment_val) in enumerate(
                        zip(reference_sequence, fasta.seq)):
                    if reference_val != alignment_val:
                        group_counts = all_mutations.setdefault(
                            position, {}).setdefault(alignment_val, {})
                        group_counts[group_id] = (
                            group_counts.get(group_id, 0) + 1)
            alignment_number_by_groups[group_id] = count

    # The context manager closes the output even if writing fails.
    with open(output_path, 'w') as writer:

        def write_to_file(cols):
            """Write one delimited row to the output file."""
            writer.write(
                utils.COLUMN_DELIMITER.join(cols) + utils.LINES_DELIMITER)

        write_to_file(['pos', 'ref', 'alt'] + col_names)
        for position in sorted(all_mutations):
            reference_val = reference_sequence[position]
            for mutation, group_counts in all_mutations[position].items():
                # Positions are reported 1-based.
                line = [str(position + 1), reference_val, mutation]
                for group_id in col_names:
                    total_count = alignment_number_by_groups[group_id]
                    mutation_count = group_counts.get(group_id, 0)
                    # A file without records has no reads to mutate; report
                    # 0.0 instead of dividing by zero.
                    if total_count:
                        rate = float(mutation_count) / total_count
                    else:
                        rate = 0.0
                    line.append(str(round(rate, 3)))
                write_to_file(line)

    if not quiet:
        utils.print_result_file(output_path)
예제 #3
0
def process_files(alignment_paths, reference_path, output_path, quiet=False):
    """
    Compare aligned reads with a reference and write per-file mutation rates.

    The first record of ``reference_path`` is used as the reference sequence.
    Every record of every alignment file is compared with it position by
    position (only over the overlapping prefix when the lengths differ, in
    which case a warning is printed). Each differing character is counted as
    a mutation for the file it came from.

    The output is a delimited table with the columns ``pos`` (1-based),
    ``ref``, ``alt`` followed by one column per alignment file (named after
    the file's base name) holding ``mutation count / number of records in
    that file`` rounded to three decimals. A file without any records gets a
    rate of ``0.0``.

    :param alignment_paths: list of fasta files with aligned reads.
    :param reference_path: reference genome fasta file.
    :param output_path: output file.
    :param quiet: not print the progress on standard output
    :type quiet: bool
    :rtype: None
    """
    with open(reference_path) as reader:
        # The next() builtin works on Python 2.6+ and Python 3 alike.
        reference_sequence = next(SeqIO.parse(reader, 'fasta')).seq
    reference_sequence_len = len(reference_sequence)

    def check_seq(seq_record):
        """Warn when a record's length differs from the reference length."""
        fasta_seq_len = len(seq_record.seq)
        if reference_sequence_len != fasta_seq_len:
            utils.print_warning(
                '%s: invalid length %s (%s expected)' % (seq_record.description, fasta_seq_len, reference_sequence_len))

    col_names = []
    alignment_number_by_groups = {}
    # position -> alternative character -> group id -> number of reads
    all_mutations = {}

    for alignment_path in alignment_paths:
        with open(alignment_path) as reader:
            # currently all the sequences of the same group are in the same file,
            # so the file's base name serves as the group id.
            # NOTE(review): two paths sharing a base name would produce a
            # duplicated column whose read count is overwritten.
            if not quiet:
                print('Processing ' + alignment_path)
            group_id = os.path.split(alignment_path)[1]
            col_names.append(group_id)
            count = 0
            for fasta in SeqIO.parse(reader, 'fasta'):
                count += 1
                check_seq(fasta)
                # zip() stops at the shorter sequence, so only the overlapping prefix is compared.
                for position, (reference_val, alignment_val) in enumerate(zip(reference_sequence, fasta.seq)):
                    if reference_val != alignment_val:
                        group_counts = all_mutations.setdefault(position, {}).setdefault(alignment_val, {})
                        group_counts[group_id] = group_counts.get(group_id, 0) + 1
            alignment_number_by_groups[group_id] = count

    # The context manager closes the output even if writing fails.
    with open(output_path, 'w') as writer:

        def write_to_file(cols):
            """Write one delimited row to the output file."""
            writer.write(utils.COLUMN_DELIMITER.join(cols) + utils.LINES_DELIMITER)

        write_to_file(['pos', 'ref', 'alt'] + col_names)
        for position in sorted(all_mutations):
            reference_val = reference_sequence[position]
            for mutation, group_counts in all_mutations[position].items():
                # Positions are reported 1-based.
                line = [str(position + 1), reference_val, mutation]
                for group_id in col_names:
                    total_count = alignment_number_by_groups[group_id]
                    mutation_count = group_counts.get(group_id, 0)
                    # A file without records has no reads to mutate; report 0.0 instead of dividing by zero.
                    if total_count:
                        rate = float(mutation_count) / total_count
                    else:
                        rate = 0.0
                    line.append(str(round(rate, 3)))
                write_to_file(line)

    if not quiet:
        utils.print_result_file(output_path)