def write_misasm_broken_ctgs(contigs_file, breaks, out_prefix, in_gff=None, in_gff_name=None):
    """
    Write the contigs to a new FASTA file, splitting contigs at their break points.

    All output is written from inside the 'ctg_alignments' directory. The
    working directory in effect at call time is restored before returning,
    including when an error is raised part-way through.

    :param contigs_file: path of the contigs FASTA file. It is opened as
        "../../" + contigs_file after changing into 'ctg_alignments'.
    :param breaks: mapping from contig header (without the leading '>') to an
        iterable of break coordinates within that contig. Contigs that are not
        keys of this mapping are written unchanged.
    :param out_prefix: prefix of the output file; the broken contigs are
        written to out_prefix + ".misasm.break.fa".
    :param in_gff: optional mapping whose values are iterables of records.
        Each record is written as str(record) on its own line.
    :param in_gff_name: name of the file the in_gff records are written to.
        Nothing is written unless both in_gff and in_gff_name are truthy.
    :return: None
    """
    current_path = os.getcwd()
    os.chdir('ctg_alignments')
    try:
        if in_gff and in_gff_name:
            with open(in_gff_name, 'w') as gff_out:
                for key in in_gff.keys():
                    for record in in_gff[key]:
                        gff_out.write(str(record) + '\n')

        reader = SeqReader("../../" + contigs_file)
        # The context manager guarantees the FASTA is flushed and closed.
        with open(out_prefix + ".misasm.break.fa", 'w') as fasta_out:
            for header, seq in reader.parse_fasta():
                # Drop the leading '>' so the header matches the keys of breaks.
                header = header[1:]
                if header not in breaks:
                    fasta_out.write(">" + header + "\n")
                    fasta_out.write(seq + "\n")
                    continue

                # Break the contig: consecutive boundaries delimit each piece.
                boundaries = [0] + sorted(breaks[header]) + [len(seq)]
                for start, end in zip(boundaries, boundaries[1:]):
                    fasta_out.write(">" + header + "_misasm_break:" + str(start) + "-" + str(end) + "\n")
                    fasta_out.write(seq[start:end] + "\n")
    finally:
        # Always return to where the caller was, even on failure.
        os.chdir(current_path)
def make_gaps_tree(in_file):
    """
    Build an interval tree of gap coordinates for every sequence in a FASTA file.

    :param in_file: path of the FASTA file. If the name ends with ".gz" it is
        read with SeqReader.parse_gzip_fasta(), otherwise with
        SeqReader.parse_fasta().
    :return: dict mapping each sequence header (leading '>' removed, truncated
        at the first space) to an IntervalTree. Every (start, end) span yielded
        by GapSequence.get_gap_coords() is inserted as tree[start:end] with the
        (start, end) tuple as its value.
    """
    # A dictionary to store an interval tree for each chromosome header.
    trees_by_header = dict()
    reader = SeqReader(in_file)

    # Both parsers are consumed the same way; only the choice depends on the
    # file extension.
    if in_file.endswith(".gz"):
        records = reader.parse_gzip_fasta()
    else:
        records = reader.parse_fasta()

    for raw_header, sequence in records:
        # Remove the greater than sign and only get first token if delimited by spaces
        name = raw_header[1:].split(' ')[0]
        tree = IntervalTree()
        trees_by_header[name] = tree

        gaps = GapSequence(sequence)
        spans = [(match.start(0), match.end(0)) for match in gaps.get_gap_coords()]
        for span in spans:
            begin, end = span
            tree[begin:end] = span

    return trees_by_header
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size):
    """
    Concatenate ordered contigs into pseudomolecules and write them to 'ragoo.fasta'.

    For each reference chromosome, 'orderings/<chrom>_orderings.txt' is read
    with get_orderings() and the listed contigs are concatenated in that order
    ('-' entries are reverse complemented). Every contig, including the last
    one, is followed by gap_size 'N' characters. Contigs that appear in no
    orderings file are concatenated the same way into 'Chr0', which is written
    first in the output.

    Need to make a translation table for easy lift-over.

    NOTE(review): a second create_pseudomolecules (with a chr0 parameter)
    appears later in this file; if both live in the same module the later
    definition replaces this one - confirm which is intended.

    :param in_contigs_file: path of the contigs FASTA file; it is opened as
        '../' + in_contigs_file.
    :param in_unique_contigs: mapping whose values expose a ref_chrom
        attribute naming the reference chromosome of that contig.
    :param gap_size: number of 'N' characters appended after each contig.
    :return: None
    :raises ValueError: if an orderings entry names a contig that is not
        (or is no longer) among the unplaced contigs, or if its orientation
        is neither '+' nor '-'.
    """
    # First, read all of the contigs into memory
    remaining_contig_headers = []
    all_seqs = dict()
    reader = SeqReader('../' + in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        remaining_contig_headers.append(name)
        all_seqs[name] = seq

    # Get all reference chromosomes
    all_chroms = sorted({in_unique_contigs[key].ref_chrom for key in in_unique_contigs.keys()})

    # The gap is identical for every contig, so build it once.
    pad = 'N' * gap_size

    # Iterate through each orderings file and store sequence in a dictionary
    all_pms = dict()
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        # Collect the pieces and join once; repeated += on chromosome-sized
        # strings can degrade to quadratic time.
        pieces = []
        for line in orderings:
            contig = '>' + line[0]
            orientation = line[1]

            # Mark that we have seen this contig
            try:
                remaining_contig_headers.remove(contig)
            except ValueError:
                raise ValueError(
                    "contig {!r} in the orderings of {!r} is not among the "
                    "unplaced contigs".format(line[0], this_chrom))

            if orientation == '+':
                pieces.append(all_seqs[contig])
            elif orientation == '-':
                pieces.append(reverse_complement(all_seqs[contig]))
            else:
                # An explicit raise is not stripped under `python -O`.
                raise ValueError(
                    "unexpected orientation {!r} for contig {!r}".format(orientation, line[0]))
            pieces.append(pad)
        pieces.append('\n')
        all_pms[this_chrom] = ''.join(pieces)

    # Get unincorporated sequences and place them in Chr0
    chr0_pieces = []
    for header in remaining_contig_headers:
        chr0_pieces.append(all_seqs[header])
        chr0_pieces.append(pad)
    chr0_pieces.append('\n')
    all_pms['Chr0'] = ''.join(chr0_pieces)

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        f.write('>Chr0_RaGOO\n')
        f.write(all_pms['Chr0'])
        for header in all_chroms:
            f.write('>' + header + '_RaGOO\n')
            f.write(all_pms[header])
def create_pseudomolecules(in_contigs_file, in_unique_contigs, gap_size, chr0=True):
    """
    Concatenate ordered contigs into pseudomolecules and write them to 'ragoo.fasta'.

    For each reference chromosome, 'orderings/<chrom>_orderings.txt' is read
    with get_orderings(). If it yields any entries, the listed contigs are
    joined in that order ('-' entries are reverse complemented) with gap_size
    'N' characters between neighbouring contigs.

    Contigs that appear in no orderings file are handled according to chr0,
    and matching 'groupings/' and 'orderings/' files are written for them.

    Need to make a translation table for easy lift-over.

    :param in_contigs_file: path of the contigs FASTA file.
    :param in_unique_contigs: mapping whose values expose a ref_chrom
        attribute naming the reference chromosome of that contig.
    :param gap_size: number of 'N' characters placed between adjacent contigs.
    :param chr0: if True, all unplaced contigs are joined (with the same gap)
        into a single 'Chr0' sequence; if False, each unplaced contig is
        written as its own sequence named after its header.
    :return: None
    :raises ValueError: if an orderings entry names a contig that is not
        (or is no longer) among the unplaced contigs, or if its orientation
        is neither '+' nor '-'.
    """
    # First, read all of the contigs into memory
    remaining_contig_headers = []
    all_seqs = OrderedDict()
    reader = SeqReader(in_contigs_file)
    for header, seq in reader.parse_fasta():
        name = header.split(' ')[0]
        remaining_contig_headers.append(name)
        all_seqs[name] = seq

    # Get all reference chromosomes
    all_chroms = sorted({in_unique_contigs[key].ref_chrom for key in in_unique_contigs.keys()})

    # Iterate through each orderings file and store sequence in a dictionary
    all_pms = dict()
    pad = 'N' * gap_size
    for this_chrom in all_chroms:
        orderings = get_orderings('orderings/' + this_chrom + '_orderings.txt')
        if not orderings:
            continue

        seq_list = []
        for line in orderings:
            contig = '>' + line[0]
            orientation = line[1]

            # Mark that we have seen this contig
            try:
                remaining_contig_headers.remove(contig)
            except ValueError:
                raise ValueError(
                    "contig {!r} in the orderings of {!r} is not among the "
                    "unplaced contigs".format(line[0], this_chrom))

            if orientation == '+':
                seq_list.append(all_seqs[contig])
            elif orientation == '-':
                seq_list.append(reverse_complement(all_seqs[contig]))
            else:
                # An explicit raise is not stripped under `python -O`.
                raise ValueError(
                    "unexpected orientation {!r} for contig {!r}".format(orientation, line[0]))
        all_pms[this_chrom] = pad.join(seq_list) + '\n'

    # Get unincorporated sequences and place them in Chr0
    if remaining_contig_headers:
        if chr0:
            all_pms['Chr0'] = pad.join(all_seqs[header] for header in remaining_contig_headers) + '\n'

            # Write out the list of chr0 headers. The context manager closes
            # both files even if a write fails.
            with open('groupings/Chr0_contigs.txt', 'w') as f_chr0_g, \
                    open('orderings/Chr0_orderings.txt', 'w') as f_chr0_o:
                for header in remaining_contig_headers:
                    # header[1:] drops the leading '>'.
                    f_chr0_g.write(header[1:] + "\t" + "0" + '\n')
                    f_chr0_o.write(header[1:] + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')
        else:
            # Instead of making a chromosome 0, add the unplaced sequences as is.
            for header in remaining_contig_headers:
                name = header[1:]
                all_pms[name] = all_seqs[header] + "\n"
                with open('groupings/' + name + '_contigs.txt', 'w') as f_chr0_g, \
                        open('orderings/' + name + '_orderings.txt', 'w') as f_chr0_o:
                    f_chr0_g.write(name + "\t" + "0" + '\n')
                    f_chr0_o.write(name + '\t' + "+" + '\t' + "0" + '\t' + "0" + '\n')

    # Write the final sequences out to a file
    with open('ragoo.fasta', 'w') as f:
        for out_header in all_pms:
            f.write(">" + out_header + "_RaGOO\n")
            f.write(all_pms[out_header])
def read_gz_contigs(in_file):
    """
    Load the sequences of a gzipped FASTA file into a dictionary.

    :param in_file: path handed to SeqReader; the records are read with
        parse_gzip_fasta().
    :return: dict mapping each header (every '>' removed, truncated at the
        first space) to its sequence. If two records share the same key,
        the later record wins.
    """
    reader = SeqReader(in_file)
    return {
        raw_header.replace('>', '').split(' ')[0]: sequence
        for raw_header, sequence in reader.parse_gzip_fasta()
    }