def __init__(self, strSample, strRef, options, InstInitFolder):
    """
    Set up the per-sample parameters used by the alignment step.

    strSample      : sample name; stored on the instance and used in the output path.
    strRef         : reference name; stored on the instance and passed to the base class.
    options        : parsed option object; the attributes read below must exist on it.
    InstInitFolder : object providing strLogPath for the base-class initialiser.

    Side effects: calls self._RemoveTmpBeforStart() and self.MakeSampleFolder(),
    and creates the './Output/<user>/<project>/<sample>/Tmp/Alignment' folder.
    """
    UserFolderAdmin.__init__(self, strSample, strRef, options,
                             InstInitFolder.strLogPath)

    ## The sample name is stored before the clean-up and folder set-up calls,
    ## exactly as in the original statement order.
    self.strSample = strSample
    self._RemoveTmpBeforStart()
    self.MakeSampleFolder()  ## inheritance

    self.strRef = strRef

    ## Values copied one-to-one from the option object.
    self.intCore = options.multicore
    self.strGapOpen = options.gap_open
    self.strGapExtend = options.gap_extend
    self.strTargetWindow = options.target_window
    self.strIndelCheckPos = options.indel_check_pos
    self.strTargetRefAlt = options.target_ref_alt

    ## Reference files inside self.strRefDir (strRefDir is not set in this
    ## method; presumably it is prepared by the base class -- TODO confirm).
    strReferenceFolder = self.strRefDir
    self.strBarcodeFile = os.path.join(strReferenceFolder, 'Barcode.txt')
    self.strReferenceSeqFile = os.path.join(strReferenceFolder, 'Reference.txt')
    self.strRefFile = os.path.join(strReferenceFolder, 'Reference.fa')

    self.strPamSeq = options.PAM_seq
    self.strPamPos = options.PAM_pos
    self.strGuidePos = options.Guide_pos

    ## Temporary alignment folder for this sample.
    strAlignmentTmpDir = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
        user=self.strUser,
        project=self.strProject,
        sample=self.strSample)
    Helper.MakeFolderIfNot(strAlignmentTmpDir)
def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):
    """
    Write one '<sample>_D0SubResult.txt' table per sample with the D0 background
    subtracted from the experimental indel proportion.

    dictD0IndelMerge: indel proportion - dictExpIndel: indel proportion

    dictD0IndelMerge : {barcode: {'Total': int, indel_seq: {'IndelCount': int}, ...}}
    dictExpIndel     : {sample: {barcode: {'Total': int, indel_seq: {'IndelCount': int}, ...}}}

    For every experimental indel the row holds the D0 total/proportion, the
    experimental total/proportion and max(exp - D0, 0).  When the barcode or the
    indel is absent from the D0 data, the D0 columns are written as 'None' and
    the experimental proportion is reported unchanged.

    The output folder is built from strUserName / strProjectName, which are not
    defined in this function (module-level names -- defined elsewhere).

    Raises KeyError if an experimental entry lacks 'Total' or 'IndelCount', and
    ZeroDivisionError if a total is zero (same as before).
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(
        user=strUserName, project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    for strSample, dictBarcode in dictExpIndel.items():
        ## Format only the file name: formatting the joined path would also
        ## interpret any '{' / '}' that happens to be in the directory part.
        strOutputFile = os.path.join(
            strD0SubResultDir,
            '{sample}_D0SubResult.txt'.format(sample=strSample))

        with open(strOutputFile, 'w') as Output:
            Output.write(
                'Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n'
            )

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():
                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    if strIndelSeq == 'Total':
                        continue

                    intExpCount = dictCount['IndelCount']
                    floExpProp = round(intExpCount / float(intExpTotal), 6)

                    ## Only the D0 lookups may legitimately miss; keeping the
                    ## try this narrow stops it from masking other KeyErrors.
                    try:
                        dictD0Barcode = dictD0IndelMerge[strBarcode]
                        intD0Total = dictD0Barcode['Total']
                        intD0Count = dictD0Barcode[strIndelSeq]['IndelCount']
                    except KeyError:
                        listRow = [
                            strIndelSeq, 'None', 'None', intExpTotal,
                            floExpProp, floExpProp
                        ]
                    else:
                        floD0Prop = round(intD0Count / float(intD0Total), 6)
                        floSubExpIndel = floExpProp - floD0Prop
                        if floSubExpIndel < 0:
                            ## A background larger than the signal is clamped to zero.
                            floSubExpIndel = 0
                        listRow = [
                            strIndelSeq, intD0Total, floD0Prop, intExpTotal,
                            floExpProp, floSubExpIndel
                        ]

                    Output.write('\t'.join(map(str, listRow)) + '\n')
def CountGroup(InstParameters):
    """
    Sum the random-barcode read counts of all samples that share a group and
    write two summary tables per group.

    InstParameters must provide:
        strSampleList : path of a tab-separated file, columns sample / reference / group
        strUser, strProject : used to build './Output/<user>/<project>'

    Input rows ('<project_dir>/<sample>_all_random_barcode.txt', header + data):

    Sorting_barcode         Unique_RandomBarcodeNumber_In_SortingBarcode    RandomBarcode   Each_RandomBarcode_read_count
    TATATCATAGCGTACTCATC    8                                               TGCGTTTG        3
    TATATCATAGCGTACTCATC    8                                               CGCGTTTG        3
    TATATCATAGCGTACTCATC    8                                               TAGTTTTG        1
    TATATCATAGCGTACTCATC    8                                               ATAGTTTG        1

    Group names are compared case-insensitively and the 'CTRL' group is skipped.
    Output goes to './Output/Group_result/<GROUP>/'.

    Fixes relative to the previous version:
      * the group of a row was compared before the row had been parsed
        (UnboundLocalError on the first row);
      * the row's group was compared without upper-casing although the group
        set is upper-cased;
      * the sorting barcode was recovered with split('_')[0], which truncates
        sorting barcodes that themselves contain '_'.
    """
    with open(InstParameters.strSampleList) as Sample:  ## tmp input
        ## Blank lines are ignored; '\r' is stripped so Windows-edited lists work.
        listSample = [
            strRow.rstrip('\r\n').split('\t') for strRow in Sample
            if strRow.strip()
        ]

    setGroup = set(listCol[2].upper() for listCol in listSample)

    strProjectDir = './Output/{user}/{project}'.format(
        user=InstParameters.strUser, project=InstParameters.strProject)

    ## Sorted only to make the processing order deterministic.
    for strGroup in sorted(setGroup):
        if strGroup == 'CTRL':
            continue

        ## NOTE(review): this project-level folder is created as before, but the
        ## summaries are written under './Output/Group_result/<group>' -- confirm
        ## which location is intended.
        strGroupDir = os.path.join(strProjectDir, 'Group_result')
        Helper.MakeFolderIfNot(strGroupDir)

        ## matched group names -> Sum the counts
        listGroupSample = [
            listCol[0] for listCol in listSample
            if listCol[2].upper() == strGroup
        ]

        sHeader, dTotal_RandomBarcode_cnt = _SumRandomBarcodeCounts(
            strProjectDir, listGroupSample)
        dRecal_total_kind_of_RandomBarcode = _GroupBySortingBarcode(
            dTotal_RandomBarcode_cnt)
        _WriteGroupSummary(strGroup, sHeader,
                           dRecal_total_kind_of_RandomBarcode)


def _SumRandomBarcodeCounts(strProjectDir, listSampleName):
    """Return (header line, {(sorting barcode, random barcode): row}) summed over the samples."""
    sHeader = ''
    dTotal = OrderedDict()

    for strSample in listSampleName:
        with open('{project_dir}/{sample}_all_random_barcode.txt'.format(
                project_dir=strProjectDir,
                sample=strSample)) as RandomBarcode_SeqFreq:
            sHeader = RandomBarcode_SeqFreq.readline()

            for sRow in RandomBarcode_SeqFreq:
                if not sRow.strip():
                    continue
                lCol = sRow.replace('\n', '').split('\t')
                lCol[3] = int(lCol[3])

                ## A tuple key stays unique even when a barcode contains '_'.
                tKey = (lCol[0], lCol[2])
                if tKey in dTotal:
                    dTotal[tKey][3] += lCol[3]
                else:
                    dTotal[tKey] = lCol  ## initial assignment

    return sHeader, dTotal


def _GroupBySortingBarcode(dTotal):
    """Group the summed rows by sorting barcode, refresh column 1 and sort by read count."""
    dRecal = OrderedDict()
    for (sSortBarcode, _sRandomBarcode), lCol in dTotal.items():
        dRecal.setdefault(sSortBarcode, []).append(lCol)

    for llCol in dRecal.values():
        ## Column 1 becomes the number of distinct random barcodes seen for
        ## this sorting barcode across the whole group.
        iKind_of_RandomBarcode = len(llCol)
        for lCol in llCol:
            lCol[1] = iKind_of_RandomBarcode
        llCol.sort(key=lambda x: x[3], reverse=True)

    return dRecal


def _WriteGroupSummary(strGroup, sHeader, dRecal):
    """Write the two per-group summary tables."""
    strEachGroup = './Output/Group_result/%s' % strGroup
    Helper.MakeFolderIfNot(strEachGroup)

    with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\
            open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:

        Sort_Random_cnt.write(sHeader)
        Uniq_random_cnt.write(
            'Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')

        for sSortBarcode, llCol in dRecal.items():
            Uniq_random_cnt.write(
                '\t'.join(map(str, [sSortBarcode, len(llCol)])) + '\n')
            for lCol in llCol:
                Sort_Random_cnt.write('\t'.join(map(str, lCol)) + '\n')
def __init__(self, strSample, strRef, options, InstInitFolder):
    """
    Set up the per-sample parameters and input paths of the indel searcher.

    strSample      : sample name (passed to the base class; self.strSample is read below).
    strRef         : reference name (passed to the base class).
    options        : parsed option object; the attributes read below must exist on it.
    InstInitFolder : object providing strLogPath and strProjectFile.

    Side effects: calls self.MakeSampleFolder(), lists the sample's FASTQ
    folder and creates its 'Split_files' sub-folder.  Missing reference files
    are reported with logging.error; they do not raise.

    Fix: the lower-case fallback names were appended to self.strRefDir without
    a path separator, so the fallback could only match when strRefDir ended
    with '/'.  They are now joined with os.path.join like the primary names.
    """
    UserFolderAdmin.__init__(self, strSample, strRef, options,
                             InstInitFolder.strLogPath)
    self.MakeSampleFolder()

    self.strProjectFile = InstInitFolder.strProjectFile
    self.intChunkSize = options.chunk_number
    self.strQualCutoff = options.base_quality
    self.intInsertionWin = options.insertion_window  # Insertion window 0,1,2,3,4
    self.intDeletionWin = options.deletion_window  # Deletion window 0,1,2,3,4
    self.strPamType = options.pam_type  # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage)
    self.strPamPos = options.pam_pos  # Barcode target position : Forward (barcode + target), Reverse (target + barcode)
    self.strPickle = options.pickle
    self.strClassFASTQ = options.class_fastq
    self.strSplit = options.split
    self.strLogPath = InstInitFolder.strLogPath

    def _ResolveRefFile(strFileName, strErrorMessage):
        ## The file name required for the user is e.g. 'B'arcode.txt but it may be
        ## written as 'b'arcode.txt by mistake; fall back to that spelling.
        strExpected = os.path.join(self.strRefDir, strFileName)
        if os.path.isfile(strExpected):
            return strExpected
        strLowerCase = os.path.join(self.strRefDir,
                                    strFileName[0].lower() + strFileName[1:])
        if os.path.isfile(strLowerCase):
            return strLowerCase
        logging.error(strErrorMessage)
        ## Keep the expected path so later steps fail on the documented name.
        return strExpected

    self.strBarcodeFile = _ResolveRefFile(
        'Barcode.txt',
        'Barcode path is not correct, please make sure the path correctly.')
    self.strReferenceSeqFile = _ResolveRefFile(
        'Reference_sequence.txt',
        'Reference path is not correct, please make sure the path correctly.')
    self.strTargetSeqFile = _ResolveRefFile(
        'Target_region.txt',
        'Target path is not correct, please make sure the path correctly.')
    self.strRefFile = os.path.join(self.strRefDir, 'Reference.fa')

    self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(
        user=self.strUser, project=self.strProject)
    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1'
    self.strSampleDir = os.path.join(self.strFastqDir, self.strSample)

    ## Base name (without '.fastq') of the FASTQ file in the sample folder.
    ## If several are present, the last one returned by os.listdir is used;
    ## if none is present the name stays empty.
    self.strFastq_name = ''
    for strFile in os.listdir(self.strSampleDir):
        listNamePart = strFile.split('.')
        if listNamePart[-1] != 'fastq':
            continue
        if os.path.isfile(os.path.join(self.strSampleDir, strFile)):
            self.strFastq_name = '.'.join(listNamePart[:-1])
    logging.info('File name : %s' % self.strFastq_name)

    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq'
    self.strInputFile = os.path.join(self.strSampleDir,
                                     self.strFastq_name + '.fastq')
    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt'
    self.strInputList = os.path.join(self.strSampleDir,
                                     self.strFastq_name + '.txt')

    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files'
    self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files')
    Helper.MakeFolderIfNot(self.strSplitPath)

    self.strPair = 'False'  # FASTQ pair: True, False
def Convert_Indelsearcher_output(strSampleRefGroup):
    """
    Convert the indel-searcher reference and classified reads of one sample
    into the input layout used under '../Base_edit_2/Input'.

    strSampleRefGroup : one tab-separated row; column 0 is the sample name and
                        column 1 the reference name (further columns are ignored).

    Written files:
      * .../Reference/<project>/<ref>/Barcode.txt    lines 'barcode:barcode'
      * .../Reference/<project>/<ref>/Reference.txt  lines 'barcode:reference'
      * .../Query/<project>/<sample>/<barcode>.txt   the sequence lines of that barcode

    strUser / strProject are not defined in this function (module-level names
    defined elsewhere).

    Raises KeyError when a read carries a barcode that is not listed in
    Barcode.txt, and IndexError when a FASTQ header has no 'Barcode_' tag
    (both unchanged).

    Fix: every file is now opened in a 'with' block.  Previously the converted
    Barcode.txt handle was never closed and the others leaked on any error.
    """
    listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace(
        '\r', '').split('\t')
    strSample = listSampleRefGroup[0]
    strRef = listSampleRefGroup[1]
    print('Processing: %s, %s' % (strSample, strRef))

    strBaseEditRefFolder = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(
        user=strUser, project=strProject, sample=strSample)

    ## Best effort, as before: a folder problem is reported and the conversion
    ## continues (the open() calls below fail loudly if the folder is missing).
    try:
        Helper.MakeFolderIfNot(strBaseEditRefFolder)
        Helper.MakeFolderIfNot(strBaseEditQueryFolder)
    except OSError as e:
        print(e)

    ## conversion target to barcode:refseq
    dictBarcodeSeq = {}

    ## BaseEdit refer format : filename, barcode, reference
    with open('./Input/{user}/Reference/{project}/{ref}/Reference_sequence.txt'.format(
            user=strUser, project=strProject, ref=strRef)) as ReferenceFile_in_IndelSearcher, \
            open('./Input/{user}/Reference/{project}/{ref}/Barcode.txt'.format(
                user=strUser, project=strProject, ref=strRef)) as BarcodeFile_in_IndelSearcher, \
            open('../Base_edit_2/Input/{user}/Reference/{project}/{ref}/Barcode.txt'.format(
                user=strUser, project=strProject, ref=strRef), 'w') as BarcodeFile_for_BaseEdit, \
            open('../Base_edit_2/Input/{user}/Reference/{project}/{ref}/Reference.txt'.format(
                user=strUser, ref=strRef, project=strProject), 'w') as Reference_for_BaseEdit:

        ## Barcode.txt and Reference_sequence.txt are paired line by line.
        for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(
                BarcodeFile_in_IndelSearcher, ReferenceFile_in_IndelSearcher):
            strBarcodeIndelSearcher = strBarcodeIndelSearcher.replace(
                '\n', '').strip()
            strReferenceIndelSearcher = strReferenceIndelSearcher.replace(
                '\n', '').strip()

            dictBarcodeSeq[strBarcodeIndelSearcher] = []

            ## first is filename, second is barcode. BaseEdit barcode format
            BarcodeFile_for_BaseEdit.write(strBarcodeIndelSearcher + ':' +
                                           strBarcodeIndelSearcher + '\n')
            Reference_for_BaseEdit.write(strBarcodeIndelSearcher + ':' +
                                         strReferenceIndelSearcher + '\n')

    with open('./Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'
              .format(user=strUser, project=strProject,
                      sample=strSample)) as Total_result_file:
        ## Classified_Indel_barcode has all total sequence.
        ## Records are four lines long: line 0 is the header carrying
        ## 'Barcode_<barcode>:', line 1 is the read sequence that is kept.
        strBarcode = None
        for i, strRow in enumerate(Total_result_file):  ## for query reads
            intLineInRecord = i % 4
            if intLineInRecord == 0:
                strBarcode = strRow.split('Barcode_')[1].split(':')[0]
            elif intLineInRecord == 1:
                dictBarcodeSeq[strBarcode].append(strRow)

    for strBarcode, listSeq in dictBarcodeSeq.items():
        with open('../Base_edit_2/Input/{user}/Query/{project}/{sample}/{barcode}.txt'
                  .format(user=strUser, project=strProject, sample=strSample,
                          barcode=strBarcode), 'w') as Output:
            Output.write(''.join(listSeq))