def trim_additional_merged_contigs(original_contig, merged_contig):
    """Trim the merged-overlap prefix off *original_contig* in place.

    The merged contig header ends in ``..._os<offset>_<cigar>``; the trim
    length is the offset plus every M/X (match/mismatch) base of the cigar.
    When the trim exceeds 50 bases, *original_contig* is rewritten with that
    prefix removed (and dropped entirely if the remainder is <= 100 bases).

    :param original_contig: path to the fasta file to trim (rewritten in place)
    :param merged_contig: path to the merged-consensus fasta whose header
                          encodes the overlap
    """
    with open(merged_contig) as open_merged:
        fasta_reader = FastaReader(open_merged)
        header, sequence = fasta_reader.next()
    sp_header = header.split('_')
    # second-to-last header field is the offset, e.g. "os12" -> 12
    trim = int(sp_header[-2].strip('os'))
    cigar = sp_header[-1]
    all_cigar = re.findall('(\d+)([MXDI])', cigar)
    for count, op in all_cigar:
        # only aligned (M) and mismatching (X) ops consume the original
        # contig; insertions/deletions (I/D) do not add to the trim
        if op == "M" or op == "X":
            trim += int(count)
    if trim > 50:
        logging.info("trim %s of %s" % (trim, original_contig))
        with open(original_contig) as open_contig:
            fasta_reader = FastaReader(open_contig)
            # BUG FIX: the original used commas ("header, sequence,
            # fasta_reader.next()") which evaluated a throwaway tuple instead
            # of assigning, so header/sequence kept the merged contig's values.
            header, sequence = fasta_reader.next()
        header = header + "trim_%s" % trim
        sequence = sequence[trim:]
        # reopen for writing: truncates even when the trimmed contig is too
        # short to keep (matches the original behaviour)
        with open(original_contig, 'w') as open_contig:
            if len(sequence) > 100:
                open_contig.write(">%s\n%s\n" % (header, sequence))
def correct_contig_file(contig_file, site_name, min_contig_len=101):
    """Filter and rename contigs, writing the survivors next to the input.

    A contig is kept when it is longer than *min_contig_len* and less than
    half of its bases sit in homopolymer runs (7+ identical bases). Kept
    contigs are renamed ``<site_name>_pair_<n>_length_<len>`` and written to
    ``contigs_corrected.fa`` in the input file's directory.

    :param contig_file: path to the input fasta file
    :param site_name: prefix used when renaming kept contigs
    :param min_contig_len: strict lower bound on kept contig length
    :return: tuple (corrected_file_path, number_of_kept_contigs, max_length)
    """
    dir_name = os.path.dirname(contig_file)
    corrected_file = os.path.join(dir_name, 'contigs_corrected.fa')
    nb_seq = 0
    max_len = 0
    # context managers guarantee both handles close even if parsing raises
    with open(contig_file) as open_file, \
            open(corrected_file, 'w') as open_corrected:
        for header, sequence in FastaReader(open_file):
            # total bases inside homopolymer runs of 7 or more, per base;
            # one loop replaces the four copy-pasted A/C/G/T regex blocks
            total_repeat = sum(
                len(run)
                for base in 'ACGT'
                for run in re.findall('[%s]{7,}' % base, sequence)
            )
            if len(sequence) > min_contig_len \
                    and float(total_repeat) / len(sequence) < .5:
                nb_seq += 1
                if len(sequence) > max_len:
                    max_len = len(sequence)
                header = '%s_pair_%s_length_%s' % (site_name, nb_seq,
                                                   len(sequence))
                open_corrected.write('>%s\n%s\n' % (header, sequence))
    return (corrected_file, nb_seq, max_len)
def merge_read1_and_read2_contigs(name, read1_contig, read2_contigs, output_dir):
    """Merge the read1 contig with one or more read2 contigs.

    With a single read2 contig, attempts a direct merge and writes the result
    to ``tmp_merged_consensus.fa``. With several, tries each (sorted with
    compare_fasta_length) until one merges, then concatenates the merged pair
    with the remaining unmerged contigs into ``merged_consensus.fa``.

    Returns the path of the resulting consensus file, or None when nothing
    could be merged.

    NOTE(review): open_read2 is never closed — handle leak; confirm before
    relying on this function in a long-running process.
    """
    open_read2 = open(read2_contigs)
    all_fasta2_entries = []
    read2_reader = FastaReader(open_read2)
    for header, sequence in read2_reader:
        all_fasta2_entries.append((header, sequence))
    if len(all_fasta2_entries) == 1:
        # single read2 contig: one direct merge attempt
        merged_contigs_info = merge_2_contigs(name, read1_contig, read2_contigs, output_dir)
        if merged_contigs_info:
            merged_contig_file = os.path.join(output_dir, 'tmp_merged_consensus.fa')
            with open(merged_contig_file, 'w') as open_output:
                # merged_contigs_info is a (header, sequence) pair
                open_output.write('>%s\n%s\n' % merged_contigs_info)
            return merged_contig_file
        else:
            return None
    else:
        # several read2 contigs: sort (Py2 cmp-style comparator), then try
        # merging each against read1 until one succeeds
        all_fasta2_entries.sort(cmp=compare_fasta_length)
        merged_pair = None
        remaining = []
        for header, sequence in all_fasta2_entries:
            # write each read2 contig to its own single-entry fasta file
            cur_pair = os.path.join(output_dir, header + ".fa")
            open_pair = open(cur_pair, 'w')
            open_pair.write(">%s\n%s\n" % (header, sequence))
            open_pair.close()
            if not merged_pair:
                merged_pair = merge_2_contigs(name, read1_contig, cur_pair, output_dir)
                if not merged_pair:
                    # this contig did not merge; keep it for concatenation
                    remaining.append(cur_pair)
            else:
                # a merge already succeeded; additional merges are attempted
                # but their result is deliberately discarded (see TODO below)
                results = merge_2_contigs(name + "add", read1_contig, cur_pair, output_dir)
                #TODO: fix this as it doesn't seems to trim and the output file is the same as above but in the mean time disable using False
                if False and results:
                    additional_merged_pair = os.path.join(
                        output_dir, 'tmp_merged_consensus.fa')
                    with open(additional_merged_pair, 'w') as open_output:
                        open_output.write('>%s\n%s\n' % results)
                    #trim this contig
                    trim_additional_merged_contigs(cur_pair, additional_merged_pair)
                remaining.append(cur_pair)
        merge_file = os.path.join(output_dir, "merged_consensus.fa")
        if merged_pair:
            merged_pair_file = os.path.join(output_dir, 'tmp_merged_consensus.fa')
            with open(merged_pair_file, 'w') as open_output:
                open_output.write('>%s\n%s\n' % merged_pair)
            # merged pair first, then every unmerged read2 contig
            tmp = [merged_pair_file]
            tmp.extend(remaining)
            concatenate_consensus(tmp, merge_file)
            return merge_file
        else:
            return None
def get_fasta_length(fasta_file):
    """Return the sequence length of the first record in *fasta_file*."""
    with open(fasta_file) as open_fasta:
        first_header, first_sequence = FastaReader(open_fasta).next()
    return len(first_sequence)
def force_merge_consensus(read1_consensus, read2_consensus, output_merge_file):
    """Force-join the read1 consensus with every read2 consensus sequence.

    The output is a single fasta record named
    ``<read1_name>_forced_merged_<read2_name>...`` whose sequence is the read1
    sequence followed by each read2 sequence, each separated by 100 Ns.

    :param read1_consensus: fasta file whose first record is the read1 contig
    :param read2_consensus: fasta file with one or more read2 contigs
    :param output_merge_file: path of the fasta file to write
    """
    # read inputs first so the output is not truncated if an input is missing;
    # `with` guarantees every handle closes even when parsing raises
    with open(read1_consensus) as open_read1:
        read1_name, read1_sequence = FastaReader(open_read1).next()
    name = "%s_forced_merged" % read1_name
    array = [read1_sequence]
    with open(read2_consensus) as open_read2:
        for read2_name, read2_sequence in FastaReader(open_read2):
            name = "%s_%s" % (name, read2_name)
            # 100-N spacer marks the artificial junction between contigs
            array.append("N" * 100)
            array.append(read2_sequence)
    with open(output_merge_file, 'w') as open_output:
        open_output.write(">%s\n%s\n" % (name, ''.join(array)))
def get_basic_stats(contig_file, all_sites):
    """Accumulate per-site contig count and longest-contig length.

    Headers are expected to look like ``<site>_pair_<n>_length_<len>``; the
    site name is everything before ``_pair_``. For each record,
    ``all_sites[site]["number_contig"]`` is incremented and
    ``all_sites[site]["max_length"]`` updated when the sequence is longer.

    :param contig_file: path to the contig fasta file
    :param all_sites: dict of per-site stat dicts, mutated in place
    :return: the (mutated) all_sites dict
    """
    # `with` fixes the original's leaked file handle
    with open(contig_file) as open_file:
        for header, sequence in FastaReader(open_file):
            match = re.match('(.+)_pair_\d+_length_\d+', header)
            site_name = match.group(1)
            all_sites[site_name]["number_contig"] += 1
            # default 0 (not None) keeps the comparison well defined --
            # the original relied on Py2-only "int > None" being True
            if len(sequence) > all_sites[site_name].get("max_length", 0):
                all_sites[site_name]["max_length"] = len(sequence)
    return all_sites
def get_list_of_length(contig_file):
    """Count contigs and find the longest sequence in *contig_file*.

    NOTE(review): despite the name, no list is returned -- presumably a
    leftover from an earlier version (the original had an unused
    ``list_length`` local, removed here).

    :param contig_file: path to a fasta file
    :return: tuple (nb_contig, max_length)
    """
    nb_contig = 0
    max_length = 0
    # `with` fixes the original's leaked file handle
    with open(contig_file) as open_file:
        for header, sequence in FastaReader(open_file):
            nb_contig += 1
            if len(sequence) > max_length:
                max_length = len(sequence)
    return nb_contig, max_length
def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
    """Open *genome_file* and prepare a FastaReader over it.

    :param genome_file: path to the genome fasta (possibly compressed,
                        handled by open_input_file)
    :param keep_in_memory: whether parsed chromosomes are cached
    :param keep_until_done: whether cached chromosomes are retained until
                            processing finishes
    :param prefix: prefix applied to chromosome names
    """
    # plain configuration flags
    self.keep_in_memory = keep_in_memory
    self.keep_until_done = keep_until_done
    self.prefix = prefix
    # cache of chromosomes seen so far
    self.all_chr = {}
    # open the genome last and wrap it in a fasta reader
    self.open_genome_file = open_input_file(genome_file)
    self.reader = FastaReader(self.open_genome_file)
def create_sequence_dict_from_contigs_file(contig_file):
    """Build a SAM-style sequence dictionary from a contig fasta file.

    Each record yields ``{"SN": header, "LN": sequence_length}``, in file
    order.

    :param contig_file: path to the contig fasta file
    :return: list of {"SN": name, "LN": length} dicts
    :raises StandardError: when the same reference name appears twice
    """
    # parenthesized print is valid in both Python 2 and 3
    print("Read %s" % contig_file)
    sequence_dictionary = []
    # set membership replaces the original dict + has_key (Py2-only idiom)
    seen_names = set()
    with open(contig_file) as open_file:
        for header, sequence in FastaReader(open_file):
            # check before appending so a duplicate never enters the result
            if header in seen_names:
                raise StandardError("Duplicated reference name %s in %s" % (header, contig_file))
            seen_names.add(header)
            sequence_dictionary.append({"SN": header, "LN": len(sequence)})
    return sequence_dictionary
def analyse_consensus_file(consensus_file):
    """Summarise a consensus fasta file.

    Classifies each record by header suffix: merged contigs look like
    ``..._merged_os0_71D21M193I`` and read2 contigs like
    ``..._pair_1_length_452``; anything else is a read1 contig and is not
    counted.

    :param consensus_file: path to the consensus fasta file
    :return: tuple (number_read2_contig, longuest_contigs, is_merged) where
             the count and max length cover merged + read2 contigs only
    """
    number_read2_contig = 0
    longuest_contigs = 0
    is_merged = False
    with open(consensus_file) as open_file:
        for header, sequence in FastaReader(open_file):
            #_merged_os0_71D21M193I or _pair_1_length_452
            sp_header = header.split('_')
            # the original assigned an unused local `type` (shadowing the
            # builtin) in each branch; removed in favour of comments
            if len(sp_header) > 2 and sp_header[-3] == 'merged':
                # merged read1+read2 contig
                number_read2_contig += 1
                if len(sequence) > longuest_contigs:
                    longuest_contigs = len(sequence)
                is_merged = True
            elif len(sp_header) > 3 and sp_header[-4] == 'pair':
                # stand-alone read2 contig
                number_read2_contig += 1
                if len(sequence) > longuest_contigs:
                    longuest_contigs = len(sequence)
            # else: read1 contig -- ignored by the stats
    return number_read2_contig, longuest_contigs, is_merged