QC = str(case_SeqInfo['QC']) # PS1 PS_set = ['PS1', 'PS2'] for PS in PS_set: HLATyping = case_SeqInfo[PS]['HLATyping'] Donor = case_SeqInfo[PS]['Donor'] Recipient = case_SeqInfo[PS]['Recipient'] record = (BMT_caseID, Audit, Active, Comment, QC, HLATyping, PS, Donor, Recipient, ) cursor.execute('INSERT INTO OriginalSeqs VALUES (?,?,?,?,?,?,?,?,?)', record) conn.commit() conn.close() fname = output + 'SG41_52_HLA_' + locus + '_paired' IMGTdbIO.save_dict2pickle(available_records, fname) #aa = IMGTdbIO.load_pickle2dict(fname, output) ################# # Class II ################# all_DB_files = glob.glob("../Output/SG41_52/2018/IMGTv3310/AvailDB/*.db") db_file = all_DB_files[4] ## 0: DPB1 2:DRB1 5:DQB1 locus = db_file.split('_')[4] conn = sql.connect(db_file) # automatically creates a file if doesn't exist conn.row_factory = sql.Row # Each row is a dictionary: {colNames: Value} cursor = conn.cursor()
@author: hhuang2 """ # import glob import sqlite3 as sql # from utils import phase_block_check as ps from utils import IMGTdbIO, CompareSeq import os import re locus = 'DQB1' #pkl_fp = '../Output/SG39_DRpairs/SG39_HLA_'+ locus +'_paired.pkl' pkl_fp = '../Output/SG39/2018/SG39_DRpairs/SG39_HLA_' + locus + '_paired.pkl' DRpair_seqInfo = IMGTdbIO.load_pickle2dict(pkl_fp) case_count = len(DRpair_seqInfo) print('Locus ' + locus + ' has ' + str(case_count) + ' paired cases.') DB_fp = "../Output/SG39/2018/SG39_DRpairs/SG39_HLA_" + locus + "_paired.db" conn = sql.connect(DB_fp) cursor = conn.cursor() cursor.execute('''CREATE TABLE IF NOT EXISTS DR_pair_comparison (BMT_caseID text, QC text, PS1_HLATyping text, PS1_GLstringM text, PS1_SeqM text, PS2_HLATyping text, PS2_GLstringM text, PS2_SeqM text, Audit text, Active text, Comment text)''')
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Oct 8 13:24:16 2017 @author: hhuang2 """ from utils import IMGTdbIO from Bio.Seq import Seq from Bio.Alphabet import generic_dna typing1 = 'A*23:17' Refseq1 = IMGTdbIO.readIMGTsql(typing1, db_fp= '../Database/', field = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8') typing2 = 'A*23:01:01' Refseq2 = IMGTdbIO.readIMGTsql(typing2, db_fp= '../Database/', field = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8') coding_dna = Seq(Refseq1, generic_dna) coding_dna.translate() str(coding_dna.translate()) seq = '' for i in range(len(Refseq1)): seq += Refseq1[i] typing1 = 'A*23:17' typing2 = 'A*23:01:01' HLAtyping = typing1+'_'+typing2 Exons = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8'
""" import glob import csv from utils import IMGTdbIO, CompareSeq groupType = 'fiveLoci_paired' # groupType = 'ClassI_paired' # groupType = 'All_paired' ; 'fiveLoci_paired' All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'] Five_loci = ['A', 'B', 'C', 'DRB1', 'DQB1'] ClassI_loci = ['A', 'B', 'C'] ClassII_loci = ['DRB1', 'DQB1', 'DPB1'] Group_fname = '../Output/SG39/2018/SG39_Stats/fiveLoci_paired_Stats_0125_' + groupType + '.pkl' Stats_Dict = IMGTdbIO.load_pickle2dict(Group_fname) CaseStats = Stats_Dict['CaseStats'] LocusStats = Stats_Dict['LocusStats'] db_fp = '../Database/' #key = '84571' #CaseStats[key] #key in group_caseIDs ## : paired cases HLA typing stats fname = '../Output/SG39/2018/SG39_DRpairs/SG39_pairedCases_Stats.pkl' Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname) CaseMatchTable = {} for locus in All_loci:
""" Created on Tue Oct 3 14:42:06 2017 @author: hhuang2 """ import glob import sqlite3 as sql # from utils import phase_block_check as ps from utils import IMGTdbIO, CompareSeq import os import re fname = '../Output/SG41_52/2018/IMGTv3310/SG41_52_DRpair_Stats/SG41_52_pairedCases_Stats.pkl' Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname) ## 'All_paired' groupType = 'fiveLoci_paired' # groupType = 'ClassI_paired' # groupType = 'All_paired' group_caseIDs = Matching_cases_stats[groupType] All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1']#, 'DPB1'] ClassI_loci = ['A', 'B', 'C'] ClassII_loci = ['DRB1', 'DQB1'] CaseStats = {} LocusStats = {} #MatchStats = {} for caseID in group_caseIDs: # for locus in ClassI_loci: ARSregion = ['Exon2', 'Exon3']
def _dqb102_store_block(QueryTyping, phase, glstring, seq_item, exon_index,
                        two_blocks):
    """Record ``seq_item`` under ``phase`` ("PS1" or "PS2") in ``QueryTyping``.

    Args:
        QueryTyping: result dictionary, updated in place.
        phase: key of the phase set being filled.
        glstring: value stored under "GLstring" when an entry is (re)created.
        seq_item: the query sequence that matched.
        exon_index: 0 if the Exon2 reference matched, 1 if Exon3 matched.
        two_blocks: True when the typing of this phase is a DQB1*02 type
            (handled as two blocks), False otherwise (single block).
    """
    entry = QueryTyping.get(phase)
    if entry is None:
        # Only a two-block typing whose Exon3 block shows up first starts
        # with block ID 2; every other first hit is block 1.
        first_block = 2 if (two_blocks and exon_index == 1) else 1
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [first_block],
        }
    elif two_blocks and exon_index == 0:
        # Altered block order: the Exon2 block arrived after another block,
        # so it is placed in front of the sequence stored first.
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item, entry["Sequence"][0]],
            "blockIDs": [1, 2],
        }
    else:
        entry["Sequence"].append(seq_item)
        entry["blockIDs"].append(2)


def check_DQB102_Block_seq(seq_count, tplist, unique_Query,
                           unique_HLATyping_list, ID, version="3310"):
    """Assign query sequences to phase sets when DQB1*02 typings are involved.

    Each sequence in ``unique_Query`` is compared, by substring search,
    against the Exon2 and Exon3 reference sequences of the two typings in
    ``tplist``. The first reference that is contained in the sequence decides
    the phase set, in the order: typing 1 Exon2, typing 1 Exon3,
    typing 2 Exon2, typing 2 Exon3. A typing whose first field is "DQB1*02"
    is handled as two blocks; any other typing as one block.

    Args:
        seq_count: number of sequences reported for the case; a warning is
            printed when it is greater than 3.
        tplist: the two typings, e.g. ``["DQB1*02:01", "DQB1*03:01"]``.
        unique_Query: iterable of query sequences (strings).
        unique_HLATyping_list: values stored as "GLstring"; index 0 goes to
            PS1 and index 1 to PS2, the whole object goes to PS3.
        ID: case identifier used in printed messages; floats are truncated
            to an integer before being rendered.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict: maps "PS1"/"PS2"/"PS3" to a dict with the keys "GLstring",
        "Sequence" (list) and "blockIDs" (list). When only PS1 was found,
        "PS2" refers to the very same dict object (homozygous case).
    """
    # isinstance also covers float subclasses; str() keeps the message
    # concatenation below from failing on integer IDs.
    if isinstance(ID, float):
        ID = int(ID)
    ID = str(ID)

    Locus = tplist[0].split("*")[0]
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0], field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1], field='Exon2, Exon3',
                                   version=version)
    # One flag per typing. The PS2 decision uses the flag of the SECOND
    # typing; the previous code consulted the first typing for both phases.
    two_blocks = [tp.split(":")[0] == "DQB1*02" for tp in tplist]

    if seq_count > 3:
        print(
            "Please check the ID: " + ID + " Locus " + Locus +
            ", have heterozygotic DQB1*02 types or have more sequences than expected."
        )

    # (phase key, typing index, reference exons, exon index); order matters
    # because the first matching probe wins.
    probes = (
        ("PS1", 0, ARS0seq, 0),
        ("PS1", 0, ARS0seq, 1),
        ("PS2", 1, ARS1seq, 0),
        ("PS2", 1, ARS1seq, 1),
    )

    QueryTyping = {}
    for seq_item in unique_Query:
        for phase, typing_index, reference, exon_index in probes:
            if reference[exon_index] in seq_item:
                _dqb102_store_block(
                    QueryTyping, phase, unique_HLATyping_list[typing_index],
                    seq_item, exon_index, two_blocks[typing_index])
                break
        else:
            # NOTE(review): a later unmatched sequence replaces an earlier
            # one here (unchanged behaviour) - confirm this is intended.
            QueryTyping["PS3"] = {
                "GLstring": unique_HLATyping_list,
                "Sequence": [seq_item],
                "blockIDs": [1],
            }
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:
        # Homozygous: PS2 shares the PS1 entry.
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
def _twoblock_store_block(QueryTyping, phase, glstring, seq_item, exon_index):
    """Record ``seq_item`` as one block of ``phase`` in ``QueryTyping``.

    Args:
        QueryTyping: result dictionary, updated in place.
        phase: key of the phase set being filled ("PS1" or "PS2").
        glstring: value stored under "GLstring" when an entry is (re)created.
        seq_item: the query sequence that matched.
        exon_index: 0 if the Exon2 reference matched (block 1), 1 if the
            Exon3 reference matched (block 2).
    """
    entry = QueryTyping.get(phase)
    if entry is None:
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [exon_index + 1],
        }
    elif exon_index == 0:
        # Altered block order: block 1 arrived after another block, so it is
        # placed in front of the sequence stored first.
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item, entry["Sequence"][0]],
            "blockIDs": [1, 2],
        }
    else:
        entry["Sequence"].append(seq_item)
        entry["blockIDs"].append(2)


def check_twoBlock_seq(seq_count, tplist, unique_Query, unique_HLATyping_list,
                       ID, version="3310"):
    """Assign two-block query sequences to the phase sets of two typings.

    Each sequence in ``unique_Query`` is compared, by substring search,
    against the Exon2 and Exon3 reference sequences of the two typings in
    ``tplist``. The first reference contained in the sequence decides where
    it goes, in the order: typing 1 Exon2 (PS1, block 1), typing 1 Exon3
    (PS1, block 2), typing 2 Exon2 (PS2, block 1), typing 2 Exon3
    (PS2, block 2). Sequences matching none of them are stored as "PS3".

    Args:
        seq_count: number of sequences reported for the case; a warning is
            printed when it is greater than 4.
        tplist: the two typings to compare against.
        unique_Query: iterable of query sequences (strings).
        unique_HLATyping_list: values stored as "GLstring"; index 0 goes to
            PS1 and index 1 to PS2, the whole object goes to PS3.
        ID: case identifier used in printed messages; floats are truncated
            to an integer before being rendered.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict: maps "PS1"/"PS2"/"PS3" to a dict with the keys "GLstring",
        "Sequence" (list) and "blockIDs" (list). When only PS1 was found,
        "PS2" refers to the very same dict object (homozygous case).
    """
    # isinstance also covers float subclasses; str() keeps the message
    # concatenation below from failing on integer IDs.
    if isinstance(ID, float):
        ID = int(ID)
    ID = str(ID)

    Locus = tplist[0].split("*")[0]
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0], field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1], field='Exon2, Exon3',
                                   version=version)

    if seq_count > 4:
        print("Please check the ID: " + ID + " Locus " + Locus +
              "! More sequences than expected.")

    # (phase key, typing index, reference exons, exon index); order matters
    # because the first matching probe wins.
    probes = (
        ("PS1", 0, ARS0seq, 0),
        ("PS1", 0, ARS0seq, 1),
        ("PS2", 1, ARS1seq, 0),
        ("PS2", 1, ARS1seq, 1),
    )

    QueryTyping = {}
    for seq_item in unique_Query:
        for phase, typing_index, reference, exon_index in probes:
            if reference[exon_index] in seq_item:
                _twoblock_store_block(
                    QueryTyping, phase, unique_HLATyping_list[typing_index],
                    seq_item, exon_index)
                break
        else:
            # NOTE(review): a later unmatched sequence replaces an earlier
            # one here (unchanged behaviour) - confirm this is intended.
            QueryTyping["PS3"] = {
                "GLstring": unique_HLATyping_list,
                "Sequence": [seq_item],
                "blockIDs": [1],
            }
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:
        # Homozygous: PS2 shares the PS1 entry.
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
def _oneblock_store(QueryTyping, phase, glstring, seq_item):
    """Create the ``phase`` entry for ``seq_item`` or append to an existing one.

    A new entry gets block ID 1; every further sequence is appended with
    block ID 2.
    """
    entry = QueryTyping.get(phase)
    if entry is None:
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [1],
        }
    else:
        entry["Sequence"].append(seq_item)
        entry["blockIDs"].append(2)


def _oneblock_reference_exons(tplist, ARS0seq, ARS1seq, version):
    """Pick the first exon set in which the two typings differ.

    Tried in order: Exon2+Exon3 (already loaded by the caller),
    Exon1+Exon4+Exon5+Exon6, then Exon7. Database lookups are only made for
    a set when all previous sets were identical.

    Returns:
        ``(reference_0, reference_1, exon_count)`` for the first differing
        set, or ``None`` when all compared exons are identical.
    """
    if ARS0seq != ARS1seq:
        return ARS0seq, ARS1seq, 2
    for field, exon_count in (('Exon1, Exon4, Exon5, Exon6', 4), ('Exon7', 1)):
        reference_0 = IMGTdbIO.readIMGTsql(tplist[0], field=field,
                                           version=version)
        reference_1 = IMGTdbIO.readIMGTsql(tplist[1], field=field,
                                           version=version)
        if reference_0 != reference_1:
            return reference_0, reference_1, exon_count
    return None


def _oneblock_contains_exons(seq_item, reference, exon_count):
    """Return True when the first ``exon_count`` reference exons all occur in
    ``seq_item`` (substring search, stops at the first miss)."""
    return all(reference[i] in seq_item for i in range(exon_count))


def check_oneBlock_seq(seq_count, tplist, unique_Query, unique_HLATyping_list,
                       ID, version="3310"):
    """Assign one-block query sequences to the phase sets of two typings.

    The two typings in ``tplist`` are told apart by the first exon set in
    which their reference sequences differ: Exon2+Exon3, otherwise
    Exon1+Exon4+Exon5+Exon6, otherwise Exon7. A query sequence containing
    all exons of that set for typing 1 goes to "PS1", for typing 2 to "PS2",
    and to "PS3" (with a printed message) when it matches neither. If the
    typings are identical in every compared exon, sequences are distributed
    as PS1, then PS2, then appended to PS1, and a message is printed for
    each sequence.

    The reference lookups are made once per call rather than once per query
    sequence, and unmatched sequences are always accumulated in "PS3"
    (previously the Exon7 case kept only the last unmatched sequence while
    the other two cases accumulated them).

    Args:
        seq_count: number of sequences reported for the case; a warning is
            printed when it is greater than 2.
        tplist: the two typings to compare against.
        unique_Query: iterable of query sequences (strings).
        unique_HLATyping_list: values stored as "GLstring"; index 0 goes to
            PS1 and index 1 to PS2, the whole object goes to PS3.
        ID: case identifier used in printed messages; floats are truncated
            to an integer before being rendered.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict: maps "PS1"/"PS2"/"PS3" to a dict with the keys "GLstring",
        "Sequence" (list) and "blockIDs" (list). When only PS1 was found,
        "PS2" refers to the very same dict object (homozygous case).
    """
    # isinstance also covers float subclasses; str() keeps the message
    # concatenation below from failing on integer IDs.
    if isinstance(ID, float):
        ID = int(ID)
    ID = str(ID)

    Locus = tplist[0].split("*")[0]
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0], field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1], field='Exon2, Exon3',
                                   version=version)

    if seq_count > 2:
        print("Please check the ID: " + ID + " Locus " + Locus +
              "! More sequences than expected.")

    queries = list(unique_Query)
    # As before, no additional lookups are made when there is nothing to
    # classify.
    reference = None
    if queries:
        reference = _oneblock_reference_exons(tplist, ARS0seq, ARS1seq,
                                              version)

    QueryTyping = {}
    for seq_item in queries:
        if reference is None:
            # All compared exons are identical: fill PS1 first, then PS2,
            # and put any further sequence into PS1.
            if "PS1" in QueryTyping and "PS2" not in QueryTyping:
                _oneblock_store(QueryTyping, "PS2", unique_HLATyping_list[1],
                                seq_item)
            else:
                _oneblock_store(QueryTyping, "PS1", unique_HLATyping_list[0],
                                seq_item)
            print(
                ID + ": The sequence at Locus " + Locus +
                " two typings have exactly the same Exon sequences. Cannot distinguish by Exons."
            )
            continue

        reference_0, reference_1, exon_count = reference
        if _oneblock_contains_exons(seq_item, reference_0, exon_count):
            _oneblock_store(QueryTyping, "PS1", unique_HLATyping_list[0],
                            seq_item)
        elif _oneblock_contains_exons(seq_item, reference_1, exon_count):
            _oneblock_store(QueryTyping, "PS2", unique_HLATyping_list[1],
                            seq_item)
        else:
            _oneblock_store(QueryTyping, "PS3", unique_HLATyping_list,
                            seq_item)
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:
        # Homozygous: PS2 shares the PS1 entry.
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sat Feb 3 12:28:22 2018 @author: hhuang2 """ import sqlite3 as sql from utils import IMGTdbIO version = '3310' IMGTdbIO.buildIMGTsql('DQB1', version=version, output_fp="../Database/") locus = 'DQB1' db_fp = '../Database/' filename = db_fp + "IMGT-" + version + "_HLA-" + locus + ".db" con = sql.connect(filename) cur = con.cursor() field1 = 'HLATyping' cur.execute('SELECT ' + field1 + ' FROM Sequences') Typings_temp = cur.fetchall() count = 0 field2 = 'AlignedGenomSeq' for tp in Typings_temp: cur.execute('SELECT ' + field2 + ' FROM Sequences WHERE HLATyping = ?', tp) sequences_temp = cur.fetchone() if sequences_temp[0] != '': count += 1
@author: hhuang2 """ from utils import IMGTdbIO #, CompareSeq from collections import Counter groupType = 'All_paired' # groupType = 'ClassI_paired' # groupType = 'All_paired' All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'] Five_loci = ['A', 'B', 'C', 'DRB1', 'DQB1'] ClassI_loci = ['A', 'B', 'C'] ClassII_loci = ['DRB1', 'DQB1', 'DPB1'] Group_fname = '../Output/Stats/ClassI_Stats_1003_' + groupType + '.pkl' Stats_Dict = IMGTdbIO.load_pickle2dict(Group_fname) CaseStats = Stats_Dict['CaseStats'] LocusStats = Stats_Dict['LocusStats'] ## TODO1: paired cases HLA typing stats fname = '../Output/SG39_DRpairs/SG39_pairedCases_Stats.pkl' Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname) AlleleStats = {} for locus in All_loci: AlleleStats[locus] = {} DRpaired_file = '../Output/SG39_DRpairs/SG39_HLA_' + locus + '_paired.pkl' DRpaired_table = IMGTdbIO.load_pickle2dict(DRpaired_file) num_total = len(DRpaired_table)