def parse_cdhit_output_100p(unclustered_fasta, cdhit_output):
    """Map every unclustered record id to the id of its 100%-identity cluster.

    Both FASTA files are keyed by a strand-independent key: the sorted pair
    of the record's sequence and ``misc.revcomp`` of it (presumably the
    reverse complement), so a sequence and its opposite strand share a key.

    Args:
        unclustered_fasta: Path/handle of the FASTA file holding every
            original record.
        cdhit_output: Path/handle of the FASTA file holding one record per
            cluster.

    Returns:
        dict mapping each record id in ``unclustered_fasta`` to the id of the
        record in ``cdhit_output`` that has the same strand-independent key.

    Raises:
        KeyError: If a record of ``unclustered_fasta`` has no counterpart in
            ``cdhit_output``.
    """

    def strand_key(record):
        forward = str(record.seq)
        return tuple(sorted([forward, misc.revcomp(forward)]))

    # When several cluster records share a key, the last one parsed wins.
    representatives = {
        strand_key(record): record
        for record in SeqIO.parse(cdhit_output, 'fasta')
    }

    return {
        record.id: representatives[strand_key(record)].id
        for record in SeqIO.parse(unclustered_fasta, 'fasta')
    }
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Reconstruct the sequence spanned by each read pair from the reference.

    For every ``(read1, read2)`` pair the reference slice from
    ``read1.reference_start`` to ``read2.reference_end`` is taken from
    ``genome_dict[read1.reference_name]``.  When ``read1.query_name``
    contains exactly two underscores, its second-to-last ``_``-separated
    field is parsed as an integer context width; that many characters are
    trimmed from both ends of the sequence and the reported coordinates are
    shifted inwards accordingly.  If the last ``_``-separated field of the
    query name is ``'2'`` the sequence is passed through ``misc.revcomp``.

    Args:
        pairs: Iterable of ``(read1, read2)`` aligned-read pairs.
        genome_dict: Mapping from reference name to its sequence.
        add_softclipped_bases: When true, the results of
            ``sctools.left_softclipped_sequence_strict(read1)`` and
            ``sctools.right_softclipped_sequence_strict(read2)`` are added
            around the reference slice before trimming.

    Returns:
        list of ``(name, length, contig_edge, sequence)`` tuples, where
        ``name`` is ``"<contig>:<start>-<end>"`` and ``contig_edge`` is True
        when a strict soft clip extends past either end of the contig.

    Raises:
        ValueError: If the context-width field of a query name is not an
            integer.
    """
    inferred_sequences = []
    for read1, read2 in pairs:
        contig = read1.reference_name
        start = read1.reference_start
        end = read2.reference_end

        # Only names of the form "<x>_<width>_<y>" carry a context width.
        if read1.query_name.count('_') == 2:
            context_width = int(read1.query_name.split('_')[-2])
        else:
            context_width = 0

        name = (contig + ':' + str(start + context_width) + '-' +
                str(end - context_width))

        inferred_sequence = genome_dict[contig][start:end]
        if add_softclipped_bases:
            inferred_sequence = (
                sctools.left_softclipped_sequence_strict(read1) +
                inferred_sequence +
                sctools.right_softclipped_sequence_strict(read2))

        # seq[w:-w] collapses to an empty sequence when w == 0 because
        # -0 == 0, so trimming is skipped for a zero context width.
        if context_width:
            inferred_sequence = inferred_sequence[context_width:-context_width]

        if read1.query_name.split('_')[-1] == '2':
            inferred_sequence = misc.revcomp(inferred_sequence)

        # The contig length is looked up lazily, only when read2 is actually
        # right-soft-clipped, exactly as the right-hand test needs it.
        contig_edge = bool(
            (sctools.is_left_softclipped_strict(read1) and
             sctools.left_softclipped_position(read1) < 0) or
            (sctools.is_right_softclipped_strict(read2) and
             sctools.right_softclipped_position(read2) >=
             len(genome_dict[read2.reference_name])))

        inferred_sequences.append(
            (name, len(inferred_sequence), contig_edge, inferred_sequence))

    return inferred_sequences
def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
    """Rebuild the sequence spanned by a read pair and flag contig edges.

    The reference slice between ``forward_read.reference_start`` and
    ``reverse_read.reference_end`` is joined into a string, wrapped with the
    results of ``sctools.left_softclipped_sequence_strict(forward_read)`` and
    ``sctools.right_softclipped_sequence_strict(reverse_read)``, and then
    trimmed by ``self.context_width`` characters on both ends.

    Args:
        forward_read: Aligned read providing the contig name, the start
            coordinate and the left soft clip.
        reverse_read: Aligned read providing the end coordinate and the
            right soft clip.
        is_reverse: When true, the result is passed through ``misc.revcomp``.

    Returns:
        ``(inferred_sequence, contig_edge)`` where ``contig_edge`` is True
        when a strict soft clip extends past either end of the contig.
    """
    contig = forward_read.reference_name
    start = forward_read.reference_start
    end = reverse_read.reference_end

    inferred_sequence = ''.join(self.genome_dict[contig][start:end])
    inferred_sequence = (
        sctools.left_softclipped_sequence_strict(forward_read) +
        inferred_sequence +
        sctools.right_softclipped_sequence_strict(reverse_read))

    # seq[w:-w] yields '' for w == 0 (since -0 == 0); skip trimming then so
    # a zero context width keeps the full sequence.
    if self.context_width:
        inferred_sequence = inferred_sequence[
            self.context_width:-self.context_width]

    if is_reverse:
        inferred_sequence = misc.revcomp(inferred_sequence)

    contig_edge = False
    if sctools.is_left_softclipped_strict(forward_read) and \
            sctools.left_softclipped_position(forward_read) < 0:
        # Left soft clip starts before the first base of the contig.
        contig_edge = True
    elif sctools.is_right_softclipped_strict(reverse_read) and \
            sctools.right_softclipped_position(reverse_read) >= \
            len(self.genome_dict[contig]):
        # Right soft clip reaches past the last base of the contig.
        contig_edge = True

    return inferred_sequence, contig_edge
def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
    """Rebuild the sequence spanned by a read pair, soft clips included.

    Args:
        forward_read: Aligned read providing the contig name, the start
            coordinate and the left soft clip.
        reverse_read: Aligned read providing the end coordinate and the
            right soft clip.
        is_reverse: When true, the result is passed through ``misc.revcomp``.

    Returns:
        The reference slice between the two reads, joined into a string and
        flanked by the results of the strict left/right soft-clip helpers
        from ``sctools``.
    """
    contig = forward_read.reference_name
    span_start = forward_read.reference_start
    span_end = reverse_read.reference_end

    reference_part = ''.join(self.genome_dict[contig][span_start:span_end])
    left_clip = sctools.left_softclipped_sequence_strict(forward_read)
    right_clip = sctools.right_softclipped_sequence_strict(reverse_read)
    full_sequence = left_clip + reference_part + right_clip

    return misc.revcomp(full_sequence) if is_reverse else full_sequence
def add_sequence_to_secondary_alignment(sam_file_in, sam_file_out):
    """Copy a SAM file, filling in sequences for records that lack one.

    Records are processed in file order.  A record whose ``query_sequence``
    is not None becomes the current template; a later record without a
    sequence receives the template's sequence, passed through
    ``misc.revcomp`` when its ``is_reverse`` flag differs from the
    template's.  A sequence-less record that appears before any template has
    been seen is written unchanged.

    Args:
        sam_file_in: Path of the alignment file to read.
        sam_file_out: Path of the alignment file to write; its header is
            copied from the input.
    """
    # Both files are managed by ``with`` so they are closed even on error.
    # The output is opened once, by pysam only: a separate plain ``open()``
    # on the same path would just leak a file handle.
    with pysam.AlignmentFile(sam_file_in, 'r') as infile, \
            pysam.AlignmentFile(sam_file_out, "w", template=infile) as outfile:
        current_seq = None
        current_seq_reverse = None

        for read in infile:
            if read.query_sequence is not None:
                current_seq = read.query_sequence
                current_seq_reverse = read.is_reverse
            elif current_seq is not None:
                if current_seq_reverse == read.is_reverse:
                    read.query_sequence = current_seq
                else:
                    read.query_sequence = misc.revcomp(current_seq)
            # else: no template sequence seen yet, nothing to fill in.

            outfile.write(read)
def get_seq_lengths(clusters, seqs, header=[
        'cluster', 'num_unique_seqs', 'mean_length', 'min_length',
        'max_length'
]):
    """Summarise the lengths of the unique sequences in each cluster.

    Sequences are de-duplicated per cluster using a strand-independent key:
    the sorted pair of the sequence and ``misc.revcomp`` of it.  Lengths are
    measured on the first element of that sorted pair.

    Args:
        clusters: Iterable of cluster labels, parallel to ``seqs``.
        seqs: Iterable of sequences, parallel to ``clusters``.
        header: Column names of the result.  The first entry labels the
            cluster column; each remaining entry must be one of the computed
            statistics (``'unique_seqs'``, ``'num_unique_seqs'``,
            ``'mean_length'``, ``'min_length'``, ``'max_length'``).  The
            list is only read, never modified.

    Returns:
        pandas.DataFrame with one row per cluster, in order of first
        appearance, and ``header`` as its columns.
    """
    members_by_cluster = defaultdict(set)
    for cluster, seq in zip(clusters, seqs):
        members_by_cluster[cluster].add(seq)

    rows = []
    for cluster, members in members_by_cluster.items():
        canonical = {
            tuple(sorted([member, misc.revcomp(member)]))
            for member in members
        }
        lengths = [len(pair[0]) for pair in canonical]
        stats = {
            'unique_seqs': canonical,
            'num_unique_seqs': len(lengths),
            'mean_length': np.mean(lengths),
            'min_length': np.min(lengths),
            'max_length': np.max(lengths),
        }
        rows.append(tuple([cluster] + [stats[label] for label in header[1:]]))

    return pd.DataFrame(rows, columns=header)
def cluster_100p(infile, outfile):
    """Write one FASTA record per distinct sequence, ignoring strand.

    Two records are treated as duplicates when they share the same
    strand-independent key: the sorted pair of the sequence and
    ``misc.revcomp`` of it.  The first record encountered for each key is
    the one that is kept, and output order follows first appearance.

    Args:
        infile: FASTA file to read.
        outfile: Path of the FASTA file to write.
    """
    first_by_key = {}
    for record in SeqIO.parse(infile, 'fasta'):
        forward = str(record.seq)
        strand_key = tuple(sorted([forward, misc.revcomp(forward)]))
        # setdefault keeps the earliest record seen for this key.
        first_by_key.setdefault(strand_key, record)

    with open(outfile, "w") as handle:
        SeqIO.write(first_by_key.values(), handle, "fasta")
def write_termini_to_unpaired_fasta(pairs, fasta_prefix):
    """Write the 5' and 3' terminus of every pair to ``<fasta_prefix>.fasta``.

    All 5' records are written first, followed by all 3' records.  Each 3'
    sequence is passed through ``revcomp`` before being written.

    Args:
        pairs: Iterable of mappings with the keys ``'pair_id'``,
            ``'seq_5p'`` and ``'seq_3p'``.
        fasta_prefix: Output path without the ``.fasta`` extension.
    """
    fiveprime_outseqs = []
    threeprime_outseqs = []

    for p in pairs:
        name5p = str(p['pair_id'] + '_1')
        name3p = str(p['pair_id'] + '_2')

        fiveprime_outseqs.append(
            SeqRecord(Seq(p['seq_5p'], IUPAC.IUPACAmbiguousDNA),
                      id=name5p, description=name5p + '_5p'))
        threeprime_outseqs.append(
            SeqRecord(Seq(revcomp(p['seq_3p']), IUPAC.IUPACAmbiguousDNA),
                      id=name3p, description=name3p + '_3p'))

    # Both batches go through a single handle; reopening the same file in
    # append mode for the second batch is unnecessary.
    with open(fasta_prefix + '.fasta', 'w') as output_handle:
        SeqIO.write(fiveprime_outseqs + threeprime_outseqs, output_handle,
                    'fasta')
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Reconstruct the sequence spanned by each read pair from the reference.

    Args:
        pairs: Iterable of ``(read1, read2)`` aligned-read pairs.
        genome_dict: Mapping from reference name to its sequence.
        add_softclipped_bases: When true, the results of
            ``sctools.left_softclipped_sequence_strict(read1)`` and
            ``sctools.right_softclipped_sequence_strict(read2)`` are added
            around the reference slice.

    Returns:
        list of ``(name, length, sequence)`` tuples, where ``name`` is
        ``"<contig>:<start>-<end>"``.  The sequence is passed through
        ``misc.revcomp`` when ``read1.is_read2`` is set.
    """
    results = []
    for left_read, right_read in pairs:
        contig = left_read.reference_name
        span_start = left_read.reference_start
        span_end = right_read.reference_end

        label = contig + f":{span_start}-{span_end}"
        sequence = genome_dict[contig][span_start:span_end]

        if add_softclipped_bases:
            left_clip = sctools.left_softclipped_sequence_strict(left_read)
            right_clip = sctools.right_softclipped_sequence_strict(right_read)
            sequence = left_clip + sequence + right_clip

        if left_read.is_read2:
            sequence = misc.revcomp(sequence)

        results.append((label, len(sequence), sequence))

    return results