def assess_alignment(alignment: pysam.AlignedSegment, alignment_info: Dict):
    """Compare an alignment against the expected (reference) alignment.

    Args:
        alignment: Aligned read to assess. Only ``reference_name`` and
            ``get_reference_positions()`` are used.
        alignment_info: Description of the expected alignment. The keys read
            here are ``'chrom'``, ``'start'``, ``'end'`` and ``'cigar'``.

    Returns:
        A ``(chrom_match, matching_prop)`` tuple.

        ``chrom_match`` is whether ``alignment.reference_name`` equals
        ``alignment_info['chrom']``.

        ``matching_prop`` is the number of reference positions covered by
        ``alignment`` that fall inside the inclusive range
        ``[alignment_info['start'], alignment_info['end']]``, divided by
        ``len(alignment_info['cigar'])``. It is ``0.0`` when that length is
        zero. The positions are counted regardless of ``chrom_match``.
    """
    chrom_match = alignment.reference_name == alignment_info['chrom']

    # Reference positions covered by this read that overlap the expected span.
    matching_pos = np.asarray(
        alignment.get_reference_positions(full_length=False))
    in_range = ((matching_pos >= alignment_info['start'])
                & (matching_pos <= alignment_info['end']))
    # Count in NumPy rather than iterating the boolean array with builtin sum().
    n_matching = int(np.count_nonzero(in_range))

    # NOTE(review): the denominator is len() of the 'cigar' entry, which only
    # yields a proportion if that entry has one element per base -- confirm
    # against whatever builds alignment_info.
    cigar_length = len(alignment_info['cigar'])
    # Guard the empty case instead of raising ZeroDivisionError.
    matching_prop = n_matching / cigar_length if cigar_length else 0.0

    return chrom_match, matching_prop
referenceid = str(reference) + ' ' refnucpos1 = 0 + (3 * refcodonpos) refnucpos2 = 1 + (3 * refcodonpos) refnucpos3 = 2 + (3 * refcodonpos) if 1 <= (refcodonpos + 1) <= 9: refcodonposid = str(refcodonpos + 1) + " " else: refcodonposid = str(refcodonpos + 1) refAAid = str(Seq(codon).translate()[0]) # loop over all reads to find AAs marker_list = [] for read in samfile.fetch(): read_codon = [] #loop through base and its pos at the same time for seq, pos in zip(read.seq, AlignedSegment.get_reference_positions(read)): if pos == refnucpos1: read_codon.append(seq) if pos == refnucpos2: read_codon.append(seq) if pos == refnucpos3: read_codon.append(seq) if any(read_codon) is True: if len(read_codon) == 3: counter += 1 if ''.join(read_codon) == codon: marker_list.append('.') else: marker_list.append( str(Seq("".join(read_codon)).translate()[0])) print(referenceid, refcodonposid, refAAid, counter,
def __call__(self):
    """Tabulate per-codon amino-acid calls from a BAM file into MySQL.

    Reads the reference FASTA (``self.args.fastainput``) and the BAM file
    (``self.args.BAMinput``), connects to MySQL using ``self.args.server``,
    ``user``, ``password``, ``database`` and ``sslpath``, and then:

    1. inserts one ``templates(protein, length)`` row per reference name
       that has at least one read in the BAM file;
    2. for each reference, walks the reference sequence codon by codon and
       inserts one ``sites(template_id, position, wild_type_AA)`` row per
       codon (``position`` is the 1-based codon number);
    3. for every read that covers all three bases of a codon, records ``'.'``
       when the read codon equals the reference codon, otherwise the
       translated amino acid, and inserts one
       ``substitutions(site_id, substitution, count)`` row per
       (amino acid, position) that was seen at least once.

    Progress is printed to stdout. Nothing is returned.
    """
    def batch_gen(data, batch_size):
        """Yield consecutive ``batch_size``-long slices of ``data``."""
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    def codon_from_read(read, first_pos):
        """Return the read bases aligned to first_pos .. first_pos + 2."""
        sequence = read.seq
        if sequence is None:
            return ''
        wanted = (first_pos, first_pos + 1, first_pos + 2)
        # full_length=True gives one entry per read base (None for inserted
        # or soft-clipped bases), so the zip stays in register with the
        # read sequence. The default list omits those bases and shifts
        # every later position.
        positions = read.get_reference_positions(full_length=True)
        return ''.join(base for base, pos in zip(sequence, positions)
                       if pos in wanted)

    # '.' (wild type) is deliberately absent: only substitutions are stored.
    amino_acids = ('A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L',
                   'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*')

    fastaFile = pysam.FastaFile(self.args.fastainput)
    bamFile = None
    try:
        bamFile = pysam.AlignmentFile(self.args.BAMinput, "rb")
        ssl_settings = {'ca': self.args.sslpath}
        con = MySQLdb.connect(self.args.server, self.args.user,
                              self.args.password, self.args.database,
                              ssl=ssl_settings)
        with con:
            cur = con.cursor()
            # An identifier cannot be bound as a query parameter; the name
            # comes from this program's own arguments.
            cur.execute("USE " + self.args.database)

            references = sorted(set(
                bamFile.getrname(read.tid) for read in bamFile.fetch()))

            # Look each length up by name. Sorting (and de-duplicating) the
            # lengths separately would pair them with the wrong reference.
            for ref in references:
                leng = len(fastaFile.fetch(reference=str(ref)))
                print("%s %s" % (ref, leng))
                cur.execute(
                    'INSERT INTO templates(protein, length) VALUES(%s, %s)',
                    (ref, leng))

            for reference in references:
                returned_position_lines = []
                referenceid = str(reference) + ' '
                ref_sequence = fastaFile.fetch(reference=str(reference))

                for refcodonpos, codon in enumerate(batch_gen(ref_sequence, 3)):
                    if len(codon) < 3:
                        # Trailing partial codon: nothing to translate.
                        break
                    position = refcodonpos + 1
                    first_nuc = 3 * refcodonpos
                    # Padded to two characters for aligned console output only.
                    refcodonposid = str(position).ljust(2)
                    refAAid = str(Seq(codon).translate()[0])

                    marker_list = []
                    # Only reads overlapping this codon on this reference can
                    # contribute, so restrict the fetch to that window.
                    for read in bamFile.fetch(str(reference), first_nuc,
                                              first_nuc + 3):
                        read_codon = codon_from_read(read, first_nuc)
                        if len(read_codon) != 3:
                            continue
                        if read_codon == codon:
                            marker_list.append('.')
                        else:
                            marker_list.append(
                                str(Seq(read_codon).translate()[0]))

                    markers = ''.join(marker_list)
                    print("%s %s %s %s %s" % (referenceid, refcodonposid,
                                              refAAid, len(marker_list),
                                              markers))
                    returned_position_lines.append(markers)
                    # Store the integer position, not the space-padded
                    # display string.
                    cur.execute(
                        "INSERT INTO sites(template_id, position, wild_type_AA) VALUES((SELECT id from templates WHERE protein=%s), %s, %s)",
                        (reference, position, refAAid))

                print(returned_position_lines)
                for AA in amino_acids:
                    for position, line in enumerate(returned_position_lines, 1):
                        count = line.count(AA)
                        if count >= 1:
                            print("%s %s %s" % (count, AA, position))
                            cur.execute(
                                "INSERT INTO substitutions(site_id, substitution, count) VALUES((SELECT id from sites WHERE position=%s AND template_id=(SELECT id from templates WHERE protein=%s)), %s, %s)",
                                (position, reference, AA, count))

            con.commit()
    finally:
        # Release the file handles even if the database work fails.
        if bamFile is not None:
            bamFile.close()
        fastaFile.close()