def add_delta_f_to_file(input_path, output_path, min_delta=0):
    """
    Processes the input tab-separated file and appends a 'delta_F' column
    holding max - min of the numeric values in columns 4..N of each row.

    Rows that cannot be processed (empty, too few columns, non-numeric
    values) are reported as warnings and copied to the output unchanged.

    :param input_path: path to the tab-separated input file.
    :param output_path: path to the tab-separated output file.
    :param min_delta: minimal deltaF value a row must exceed to be
        included in the output.
    :rtype: None
    """
    # 'with' guarantees both files are closed even if processing fails.
    with open(input_path, 'r') as reader, open(output_path, 'w') as writer:
        header_line = utils.remove_line_delimiter(reader.readline())
        column_number = len(header_line.split(utils.COLUMN_DELIMITER))
        if column_number < 4:
            utils.print_error(['Invalid header line: ', header_line], True)
        writer.write(header_line + utils.COLUMN_DELIMITER + 'delta_F' +
                     utils.LINES_DELIMITER)

        def process_invalid_line(invalid_line, line_index, text):
            # Warn about the problem and copy the offending line through
            # exactly once, terminated like every other output line.
            # (The original wrote such lines twice: once here without a
            # delimiter and once more in the calling branch.)
            messages = ['Line %s: %s' % (line_index, text)]
            if len(invalid_line.strip()) > 0:
                messages.append(invalid_line)
            utils.print_warning(messages)
            writer.write(invalid_line + utils.LINES_DELIMITER)

        for line_index, raw_line in enumerate(reader, 1):
            line = utils.remove_line_delimiter(raw_line)
            if len(line.strip()) == 0:
                process_invalid_line(line, line_index, 'Empty line')
                continue
            tabs = line.split(utils.COLUMN_DELIMITER)
            if len(tabs) < column_number:
                process_invalid_line(
                    line, line_index,
                    'Invalid number of values: %s (%s expected)'
                    % (len(tabs), column_number))
                continue
            try:
                # A dedicated comprehension variable is essential here: the
                # original reused the loop counter, which leaks out of list
                # comprehensions in Python 2 and corrupted the line numbers
                # reported in every subsequent warning.
                column_values = [float(tabs[j]) for j in range(3, len(tabs))]
            except ValueError as e:
                process_invalid_line(line, line_index,
                                     'Invalid value: %s' % e)
                continue
            delta = round(max(column_values) - min(column_values), 3)
            # NOTE(review): strict '>' drops rows whose delta equals
            # min_delta; with the default of 0 this filters out rows with
            # no variation, which appears intentional — confirm against
            # the docstring's "min value ... required to include".
            if delta > min_delta:
                writer.write(line + utils.COLUMN_DELIMITER + str(delta) +
                             utils.LINES_DELIMITER)
    utils.print_result_file(output_path, 'for input file "%s"' % input_path)
# NOTE(review): this definition is dead code — it is immediately shadowed by
# a byte-equivalent redefinition of process_files directly below; one of the
# two copies should be removed.
def process_files(alignment_paths, reference_path, output_path, quiet=False):
    """
    Processes the input fasta files, compares aligned reads with reference
    and writes the mutations with a rate for each file to a tab-separated
    output file.

    :param alignment_paths: list of fasta files with aligned reads.
    :param reference_path: reference genome fasta file.
    :param output_path: output file.
    :param quiet: not print the progress on standard output
    :type quiet: bool
    :rtype: None
    """
    # Only the first record of the reference fasta is used.
    # NOTE(review): .next() is Python-2-only; next(...) would work on 2 and 3.
    with open(reference_path) as reader:
        reference_sequence = SeqIO.parse(reader, 'fasta').next().seq
    reference_sequence_len = len(reference_sequence)

    def check_seq(seq_record):
        # Warn (but do not fail) when a read's length differs from the
        # reference; the comparison loop below only covers the shorter length.
        fasta_seq_len = len(seq_record.seq)
        if reference_sequence_len != fasta_seq_len:
            utils.print_warning('%s: invalid length %s (%s expected)' %
                                (seq_record.description, fasta_seq_len,
                                 reference_sequence_len))

    col_names = []
    # group id (input file name) -> number of reads in that file
    alignment_number_by_groups = {}
    # position -> mutated value -> group id -> occurrence count
    all_mutations = {}
    for alignment_path in alignment_paths:
        with open(alignment_path) as reader:  # IOException?
            # currently all the sequences of the same group are in the same file
            if not quiet:
                print 'Processing ' + alignment_path
            group_id = os.path.split(alignment_path)[1]
            col_names.append(group_id)
            count = 0
            for fasta in SeqIO.parse(reader, 'fasta'):
                count += 1
                check_seq(fasta)
                alignment = fasta.seq
                for position in range(
                        min(reference_sequence_len, len(alignment))):
                    alignment_val = alignment[position]
                    if reference_sequence[position] != alignment_val:
                        # setdefault chain creates the nested dicts on first
                        # sight of this position/value/group combination.
                        all_mutations.setdefault(position, {}).setdefault(
                            alignment_val, {}).setdefault(group_id, 0)
                        all_mutations[position][alignment_val][group_id] += 1
            alignment_number_by_groups[group_id] = count
    # NOTE(review): not closed on an exception during writing; a 'with'
    # block would be safer.
    writer = open(output_path, 'w')

    def write_to_file(cols):
        writer.write(utils.COLUMN_DELIMITER.join(cols) + utils.LINES_DELIMITER)

    header = ['pos', 'ref', 'alt']
    header.extend(col_names)
    write_to_file(header)
    # NOTE(review): keys() + sort() is Python-2-only; sorted(all_mutations)
    # works on both versions.
    positions = all_mutations.keys()
    positions.sort()
    for i in positions:
        reference_val = reference_sequence[i]
        for mutation in all_mutations[i]:
            # Positions are reported 1-based in the output.
            line = [str(i + 1), reference_val, mutation]
            for group_id in col_names:
                total_count = alignment_number_by_groups[group_id]
                mutation_count = all_mutations[i][mutation].get(group_id, 0)
                # NOTE(review): raises ZeroDivisionError if an input fasta
                # file contains no sequences (total_count == 0).
                line.append(str(round(float(mutation_count) / total_count, 3)))
            write_to_file(line)
    writer.close()
    if not quiet:
        utils.print_result_file(output_path)
def process_files(alignment_paths, reference_path, output_path, quiet=False):
    """
    Processes the input fasta files, compares aligned reads with the
    reference genome and writes every observed mutation, with its rate per
    input file, to a tab-separated output file.

    Output columns: pos (1-based), ref, alt, then one rate column per input
    file (mutated reads / total reads of that file, rounded to 3 digits).

    :param alignment_paths: list of fasta files with aligned reads.
    :param reference_path: reference genome fasta file.
    :param output_path: output file.
    :param quiet: not print the progress on standard output
    :type quiet: bool
    :rtype: None
    """
    # Only the first record of the reference fasta is used. next() replaces
    # the Python-2-only .next(), keeping this valid on Python 2.6+ and 3.
    with open(reference_path) as reference_reader:
        reference_sequence = next(
            SeqIO.parse(reference_reader, 'fasta')).seq
    reference_sequence_len = len(reference_sequence)

    def check_seq(seq_record):
        # Warn (but do not fail) when a read's length differs from the
        # reference; the comparison below only covers the shorter length.
        fasta_seq_len = len(seq_record.seq)
        if reference_sequence_len != fasta_seq_len:
            utils.print_warning(
                '%s: invalid length %s (%s expected)' %
                (seq_record.description, fasta_seq_len,
                 reference_sequence_len))

    col_names = []
    # group id (input file name) -> number of reads in that file
    alignment_number_by_groups = {}
    # position -> mutated value -> group id -> occurrence count
    all_mutations = {}
    for alignment_path in alignment_paths:
        # currently all the sequences of the same group are in the same file
        with open(alignment_path) as reader:
            if not quiet:
                print('Processing ' + alignment_path)
            group_id = os.path.split(alignment_path)[1]
            col_names.append(group_id)
            count = 0
            for fasta in SeqIO.parse(reader, 'fasta'):
                count += 1
                check_seq(fasta)
                alignment = fasta.seq
                for position in range(
                        min(reference_sequence_len, len(alignment))):
                    alignment_val = alignment[position]
                    if reference_sequence[position] != alignment_val:
                        all_mutations.setdefault(position, {}).setdefault(
                            alignment_val, {}).setdefault(group_id, 0)
                        all_mutations[position][alignment_val][group_id] += 1
            alignment_number_by_groups[group_id] = count

    # 'with' guarantees the output file is closed even if writing fails
    # (the original leaked the handle on any exception before close()).
    with open(output_path, 'w') as writer:

        def write_to_file(cols):
            writer.write(
                utils.COLUMN_DELIMITER.join(cols) + utils.LINES_DELIMITER)

        header = ['pos', 'ref', 'alt']
        header.extend(col_names)
        write_to_file(header)
        # sorted() over the dict works on Python 2 and 3 alike, unlike the
        # original keys() + list.sort().
        for position in sorted(all_mutations):
            reference_val = reference_sequence[position]
            for mutation in all_mutations[position]:
                line = [str(position + 1), reference_val, mutation]
                for group_id in col_names:
                    total_count = alignment_number_by_groups[group_id]
                    mutation_count = \
                        all_mutations[position][mutation].get(group_id, 0)
                    # Guard against an input fasta file with no sequences
                    # (total_count == 0), which previously raised
                    # ZeroDivisionError; such a group has a rate of 0.
                    rate = (float(mutation_count) / total_count
                            if total_count else 0.0)
                    line.append(str(round(rate, 3)))
                write_to_file(line)
    if not quiet:
        utils.print_result_file(output_path)