def __init__(self, strSample, strRef, options, InstInitFolder):
        """
        Prepare the per-sample settings and folders used by this instance.

        strSample      : sample name; stored as self.strSample.
        strRef         : reference name; stored as self.strRef.
        options        : option object; the attributes read here are
                         multicore, gap_open, gap_extend, target_window,
                         indel_check_pos, target_ref_alt, PAM_seq, PAM_pos
                         and Guide_pos.
        InstInitFolder : object whose strLogPath is forwarded to
                         UserFolderAdmin.__init__.

        self.strRefDir, self.strUser and self.strProject are read but never
        assigned in this method -- presumably UserFolderAdmin.__init__
        provides them (TODO confirm).
        """
        UserFolderAdmin.__init__(
            self, strSample, strRef, options, InstInitFolder.strLogPath)

        ## Same order as before: sample name first, then the temp clean-up,
        ## then the sample folder.
        self.strSample = strSample
        self._RemoveTmpBeforStart()
        self.MakeSampleFolder()  ## inherited method (per the original note)

        self.strRef = strRef

        ## Values copied straight from the option object.
        self.intCore = options.multicore
        self.strGapOpen = options.gap_open
        self.strGapExtend = options.gap_extend
        self.strTargetWindow = options.target_window
        self.strIndelCheckPos = options.indel_check_pos
        self.strTargetRefAlt = options.target_ref_alt

        ## Input files that live in the reference directory.
        strRefDir = self.strRefDir
        self.strBarcodeFile = os.path.join(strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(strRefDir, 'Reference.txt')
        self.strRefFile = os.path.join(strRefDir, 'Reference.fa')

        self.strPamSeq = options.PAM_seq
        self.strPamPos = options.PAM_pos
        self.strGuidePos = options.Guide_pos

        ## Make sure the temporary alignment folder of this sample exists.
        strAlignmentDir = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
            user=self.strUser,
            project=self.strProject,
            sample=self.strSample)
        Helper.MakeFolderIfNot(strAlignmentDir)
def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):
    """
    Write one '<sample>_D0SubResult.txt' report per sample in dictExpIndel.

    For every indel of every barcode the report holds the D0 proportion, the
    experiment proportion and the experiment proportion minus the D0
    proportion (clamped at 0).

    dictD0IndelMerge : {barcode: {'Total': int, indel_seq: {'IndelCount': int}}}
    dictExpIndel     : {sample: {barcode: {'Total': int,
                                           indel_seq: {'IndelCount': int}}}}

    When the barcode or the indel has no D0 entry, the D0 columns are written
    as 'None' and the experiment proportion is reported unchanged.

    The output directory is built from the module-level names strUserName and
    strProjectName, which are defined outside this function.
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(
        user=strUserName, project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    strHeader = 'Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n'

    for strSample, dictBarcode in dictExpIndel.items():
        ## Format the file name BEFORE joining, so braces in the directory
        ## part can never be mistaken for format fields.
        strOutputPath = os.path.join(
            strD0SubResultDir,
            '{sample}_D0SubResult.txt'.format(sample=strSample))

        with open(strOutputPath, 'w') as Output:
            Output.write(strHeader)

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():

                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    if strIndelSeq == 'Total':
                        continue

                    intExpCount = dictCount['IndelCount']
                    floExpProp = round(intExpCount / float(intExpTotal), 6)

                    ## Only the D0 lookups are guarded: a KeyError here means
                    ## "no D0 data for this barcode/indel", nothing else.
                    try:
                        dictD0Barcode = dictD0IndelMerge[strBarcode]
                        intD0Total = dictD0Barcode['Total']
                        intD0Count = dictD0Barcode[strIndelSeq]['IndelCount']
                    except KeyError:
                        listRow = [
                            strIndelSeq, 'None', 'None', intExpTotal,
                            floExpProp, floExpProp
                        ]
                    else:
                        floD0Prop = round(intD0Count / float(intD0Total), 6)

                        ## Round again: subtracting two rounded floats can
                        ## leave noise such as 0.19999999999999998.
                        floSubExpIndel = round(floExpProp - floD0Prop, 6)
                        if floSubExpIndel < 0:
                            floSubExpIndel = 0

                        listRow = [
                            strIndelSeq, intD0Total, floD0Prop, intExpTotal,
                            floExpProp, floSubExpIndel
                        ]

                    Output.write('\t'.join(map(str, listRow)) + '\n')
Пример #3
0
def CountGroup(InstParameters):
    """
    Sum the random-barcode counts of all samples that share a group and write
    two summary files per group.

    InstParameters must provide strSampleList (path of a tab-separated file
    whose columns are sample, reference, group), strUser and strProject.
    Group names are compared case-insensitively; the 'CTRL' group is skipped.

    Each '<project_dir>/<sample>_all_random_barcode.txt' input is read as a
    header line followed by rows such as:

    Sorting_barcode Unique_RandomBarcodeNumber_In_SortingBarcode    RandomBarcode   Each_RandomBarcode_read_count
    TATATCATAGCGTACTCATC    8       TGCGTTTG        3
    TATATCATAGCGTACTCATC    8       CGCGTTTG        3
    TATATCATAGCGTACTCATC    8       TAGTTTTG        1
    TATATCATAGCGTACTCATC    8       ATAGTTTG        1

    Outputs, written under './Output/Group_result/<GROUP>':
      Summary_all_random_barcode_in_group.txt
          merged rows; column 2 is recalculated as the number of distinct
          random barcodes of the sorting barcode within the group, rows are
          sorted by read count (descending) inside each sorting barcode.
      Summary_Unique_RandomBarcodeNumber_in_group.txt
          sorting barcode and its number of distinct random barcodes.
    """

    sHeader = ''

    with open(InstParameters.strSampleList) as Sample:  ## tmp input
        ## Blank rows (e.g. a trailing empty line) carry no sample.
        listSampleCol = [
            strRow.replace('\n', '').split('\t') for strRow in Sample
            if strRow.strip()
        ]

    setGroup = set(listCol[2].upper() for listCol in listSampleCol)

    strProjectDir = './Output/{user}/{project}'.format(
        user=InstParameters.strUser, project=InstParameters.strProject)
    strGroupDir = os.path.join(strProjectDir, 'Group_result')

    for strGroup in setGroup:
        if strGroup == 'CTRL':
            continue

        Helper.MakeFolderIfNot(strGroupDir)

        ## One accumulator per GROUP, so counts of all its samples add up.
        ## key  : '<sorting barcode>_<random barcode>', e.g. 'GECKO_6367_GATCTGCTC'
        ## value: the row as a list, read count (index 3) stored as int.
        dTotal_RandomBarcode_cnt_in_SortingBarcode = OrderedDict()

        for listCol in listSampleCol:
            ## The set above holds upper-cased names, so compare upper-cased.
            if listCol[2].upper() != strGroup:
                continue

            strSample = listCol[0]

            with open('{project_dir}/{sample}_all_random_barcode.txt'.format(
                    project_dir=strProjectDir,
                    sample=strSample)) as RandomBarcode_SeqFreq:
                sHeader = RandomBarcode_SeqFreq.readline()

                for sRow in RandomBarcode_SeqFreq:
                    lCol = sRow.replace('\n', '').split('\t')

                    ## Unique name : Doench2014_1000_CTCTGGGGT
                    sSorting_and_Random_barcode_seq = lCol[0] + '_' + lCol[2]
                    lCol[3] = int(lCol[3])

                    if sSorting_and_Random_barcode_seq in dTotal_RandomBarcode_cnt_in_SortingBarcode:
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq][3] += lCol[3]
                    else:
                        ## initial assignment
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq] = lCol

        ## Regroup the merged rows by sorting barcode.  The barcode is taken
        ## from the row itself (column 0): splitting the combined key on '_'
        ## would truncate sorting barcodes that contain an underscore.
        dRecal_total_kind_of_RandomBarcode = OrderedDict()
        for lValue in dTotal_RandomBarcode_cnt_in_SortingBarcode.values():
            dRecal_total_kind_of_RandomBarcode.setdefault(lValue[0],
                                                          []).append(lValue)

        for llValue in dRecal_total_kind_of_RandomBarcode.values():
            ## llValue : [[TATATCATAGCGTACTCATC, 8, TGCGTTTG, 3], [...], ...]
            iKind_of_RandomBarcode = len(llValue)
            for lValue in llValue:
                lValue[1] = iKind_of_RandomBarcode  ## Recal using group total cnt.

            llValue.sort(key=lambda x: x[3], reverse=True)

        ## NOTE(review): this path is unchanged from the original and does not
        ## use strGroupDir created above -- confirm which location is intended.
        strEachGroup = './Output/Group_result/%s' % strGroup
        Helper.MakeFolderIfNot(strEachGroup)

        with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\
            open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:

            Sort_Random_cnt.write(sHeader)
            Uniq_random_cnt.write(
                'Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n'
            )

            for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items():
                Uniq_random_cnt.write(
                    '\t'.join(map(str, [sSortBarcode, len(llCol)])) + '\n')
                for lCol in llCol:
                    Sort_Random_cnt.write('\t'.join(map(str, lCol)) + '\n')
    def __init__(self, strSample, strRef, options, InstInitFolder):
        """
        Collect the options and input paths of one sample.

        strSample      : sample name (forwarded to UserFolderAdmin.__init__).
        strRef         : reference name (forwarded to UserFolderAdmin.__init__).
        options        : option object; the attributes read here are
                         chunk_number, base_quality, insertion_window,
                         deletion_window, pam_type, pam_pos, pickle,
                         class_fastq and split.
        InstInitFolder : object providing strLogPath and strProjectFile.

        self.strRefDir, self.strUser, self.strProject and self.strSample are
        read but not assigned here -- presumably UserFolderAdmin.__init__
        provides them (TODO confirm).

        Side effects: calls self.MakeSampleFolder(), creates the
        'Split_files' folder, lists the sample's FASTQ directory and logs an
        error for every reference input that cannot be found.
        """
        UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)
        self.MakeSampleFolder()

        self.strProjectFile    = InstInitFolder.strProjectFile
        self.intChunkSize      = options.chunk_number
        self.strQualCutoff     = options.base_quality
        self.intInsertionWin   = options.insertion_window  # Insertion window 0,1,2,3,4
        self.intDeletionWin    = options.deletion_window  # Deletion window 0,1,2,3,4
        self.strPamType        = options.pam_type  # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage)
        self.strPamPos         = options.pam_pos  # Barcode target position : Forward (barcode + target), Reverse (target + barcode)
        self.strPickle         = options.pickle
        self.strClassFASTQ     = options.class_fastq
        self.strSplit          = options.split
        self.strLogPath        = InstInitFolder.strLogPath

        self.strBarcodeFile      = os.path.join(self.strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference_sequence.txt')
        self.strTargetSeqFile    = os.path.join(self.strRefDir, 'Target_region.txt')
        self.strRefFile          = os.path.join(self.strRefDir, 'Reference.fa')

        ## The file name required for the user is 'B'arcode.txt but it may be written as 'b'arcode.txt by mistake.
        ## This part is to fix the situation as mentioned above.
        ## The fallback is built with os.path.join like the primary path, so it
        ## works whether or not strRefDir ends with a path separator.
        for strAttr, strLowerName, strLabel in (
                ('strBarcodeFile',      'barcode.txt',            'Barcode'),
                ('strReferenceSeqFile', 'reference_sequence.txt', 'Reference'),
                ('strTargetSeqFile',    'target_region.txt',      'Target')):
            if os.path.isfile(getattr(self, strAttr)):
                continue

            strLowerPath = os.path.join(self.strRefDir, strLowerName)
            if os.path.isfile(strLowerPath):
                setattr(self, strAttr, strLowerPath)
            else:
                logging.error('%s path is not correct, please make sure the path correctly.' % strLabel)

        self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(user=self.strUser,
                                                                     project=self.strProject)
        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1'
        self.strSampleDir  = os.path.join(self.strFastqDir, self.strSample)

        ## Take the base name of the '.fastq' file in the sample folder.
        ## If several exist, the last one returned by os.listdir wins;
        ## if none exists, the name stays empty.
        self.strFastq_name = ''
        for strFile in os.listdir(self.strSampleDir):
            listNamePart = strFile.split('.')
            if os.path.isfile(os.path.join(self.strSampleDir, strFile)) and listNamePart[-1] == 'fastq':
                self.strFastq_name = '.'.join(listNamePart[:-1])
        logging.info('File name : %s' % self.strFastq_name)

        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq'
        self.strInputFile = os.path.join(self.strSampleDir, self.strFastq_name+'.fastq')
        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt'
        self.strInputList = os.path.join(self.strSampleDir, self.strFastq_name+'.txt')

        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files'
        self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files')
        Helper.MakeFolderIfNot(self.strSplitPath)

        self.strPair = 'False'  # FASTQ pair: True, False
Пример #5
0
def Convert_Indelsearcher_output(strSampleRefGroup):
    """
    Convert the Indel searcher inputs/outputs of one sample into the file
    layout read from '../Base_edit_2/Input'.

    strSampleRefGroup : one tab-separated row; column 0 is the sample name and
                        column 1 the reference name (further columns are
                        ignored here).

    Files written:
      ../Base_edit_2/Input/<user>/Reference/<project>/<ref>/Barcode.txt
          one '<barcode>:<barcode>' line per input barcode.
      ../Base_edit_2/Input/<user>/Reference/<project>/<ref>/Reference.txt
          one '<barcode>:<reference sequence>' line per input barcode.
      ../Base_edit_2/Input/<user>/Query/<project>/<sample>/<barcode>.txt
          the sequence lines of all reads classified to that barcode.

    The user and project come from the module-level names strUser and
    strProject, which are defined outside this function.

    Raises KeyError when a read header names a barcode that is not listed in
    the Indel searcher Barcode.txt.
    """
    listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace(
        '\r', '').split('\t')

    strSample = listSampleRefGroup[0]
    strRef = listSampleRefGroup[1]

    print('Processing: %s, %s' % (strSample, strRef))

    strIndelSearcherRefFolder = './Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditRefFolder = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(
        user=strUser, project=strProject, sample=strSample)

    ## Best effort, as before: a folder problem is reported, and the open()
    ## calls below fail loudly if the folders are really unusable.
    try:
        Helper.MakeFolderIfNot(strBaseEditRefFolder)
        Helper.MakeFolderIfNot(strBaseEditQueryFolder)
    except OSError as e:
        print(e)

    dictBarcodeSeq = {}

    ## BaseEdit refer format : filename, barcode, reference
    ## All four handles are closed by the with-block, also on errors.
    with open(strIndelSearcherRefFolder + '/Reference_sequence.txt') as ReferenceFile_in_IndelSearcher, \
            open(strIndelSearcherRefFolder + '/Barcode.txt') as BarcodeFile_in_IndelSearcher, \
            open(strBaseEditRefFolder + '/Barcode.txt', 'w') as BarcodeFile_for_BaseEdit, \
            open(strBaseEditRefFolder + '/Reference.txt', 'w') as Reference_for_BaseEdit:  ## conversion target to barcode:refseq

        ## The two inputs are paired line by line; zip stops at the shorter.
        for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(
                BarcodeFile_in_IndelSearcher, ReferenceFile_in_IndelSearcher):

            strBarcodeIndelSearcher = strBarcodeIndelSearcher.strip()
            strReferenceIndelSearcher = strReferenceIndelSearcher.strip()

            dictBarcodeSeq[strBarcodeIndelSearcher] = []

            ## first is filename, second is barcode. BaseEdit barcode format
            BarcodeFile_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strBarcodeIndelSearcher + '\n')
            Reference_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strReferenceIndelSearcher + '\n')

    strClassifiedFastq = './Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(
        user=strUser, project=strProject, sample=strSample)

    with open(strClassifiedFastq) as Total_result_file:
        for i, strRow in enumerate(Total_result_file):  ## for query reads
            intLineInRecord = i % 4  ## the file is read as 4-line records

            if intLineInRecord == 0:
                ## Header line: the barcode sits between 'Barcode_' and ':'.
                strBarcode = strRow.split('Barcode_')[1].split(':')[0]
            elif intLineInRecord == 1:
                ## Line right after the header; kept with its newline.
                dictBarcodeSeq[strBarcode].append(strRow)

    for strBarcode, listSeq in dictBarcodeSeq.items():
        with open(
                '../Base_edit_2/Input/{user}/Query/{project}/{sample}/{barcode}.txt'
                .format(user=strUser,
                        project=strProject,
                        sample=strSample,
                        barcode=strBarcode), 'w') as Output:
            Output.write(''.join(listSeq))