Exemplo n.º 1
0
def LoadFrame(ori_file, mode='b'):
    """Load a mutation table from CSV and prepare it for the statistics step.

    Parameters:
        |-> ori_file: path or buffer handed straight to ``pd.read_csv``.
        |-> mode: ``'b'`` (default) runs the base-substitution pipeline:
            the reference and tumour alleles are concatenated into a
            two-letter code, purine-reference codes are rewritten to their
            complementary-strand form, then ``GetSurvTime``, ``NeuPreteate``
            and ``Classifier`` are applied and rows whose ``class_result``
            is 0 are dropped. Any other value skips that pipeline and uses
            the stringified table as read.

    Returns:
        The frame produced by ``SATA_PRETREAT.DropEmptyData`` for the
        ``to_last_known_alive`` column.
    """
    # Every column is handled as text from here on (missing values become
    # the literal string 'nan').
    test_frame = pd.read_csv(ori_file).astype('str')

    if mode == 'b':
        substitution = (test_frame['Reference_Allele'] +
                        test_frame['Tumor_Seq_Allele2'])
        # Exact whole-value replacement; codes that are not keys are left
        # untouched. The last-but-one rule is 'AG' -> 'TC', matching
        # VisiableSeq and the direction of the other five rules.
        complement_form = {
            'GT': 'CA',
            'GC': 'CG',
            'GA': 'CT',
            'AT': 'TA',
            'AG': 'TC',
            'AC': 'TG',
        }
        # Assign the whole column at once instead of writing through
        # frame[col].loc[mask], which is chained indexing and is not
        # guaranteed to modify the frame.
        test_frame['Reference_Allele'] = substitution.replace(complement_form)

        test_frame = SATA_PRETREAT.GetSurvTime(test_frame)
        test_frame = SATA_PRETREAT.NeuPreteate(test_frame)
        classed_frame = SATA_PRETREAT.Classifier(test_frame)
        unclassified = classed_frame[classed_frame['class_result'] == 0].index
        classed_frame.drop(unclassified, inplace=True)
    else:
        # NOTE(review): previously this path failed with UnboundLocalError
        # because classed_frame was never bound; passing the raw table on is
        # the presumed intent -- confirm against callers.
        classed_frame = test_frame

    classed_frame = SATA_PRETREAT.DropEmptyData(classed_frame,
                                                column='to_last_known_alive')
    return classed_frame
Exemplo n.º 2
0
def VisiableSeq(test_frame):
    """Classify mutations and attach the flanking reference sequence.

    The frame is converted to strings, the reference and tumour alleles are
    merged into a two-letter substitution code (purine-reference codes are
    rewritten to their complementary-strand form), and the frame is passed
    through ``GetSurvTime``, ``NeuPreteate`` and ``Classifier``. Rows whose
    ``class_result`` is 0 are dropped and the index is reset. For every
    remaining row the ``EXT`` positions before and after ``Start_Position``
    are fetched from ``WHOLE_SEQUENCE`` and stored lower-cased.

    Parameters:
        |-> test_frame: DataFrame with at least ``Reference_Allele`` and
            ``Tumor_Seq_Allele2``; ``Chromosome`` and ``Start_Position`` are
            read from the classified frame.

    Returns:
        The classified frame with two extra columns, ``front_seq`` and
        ``behind_seq``. A row whose lookup raises ``ValueError`` gets the
        string ``'nan'`` in the affected column.
    """
    print('File_Loaded')
    test_frame = test_frame.astype('str')

    substitution = (test_frame['Reference_Allele'] +
                    test_frame['Tumor_Seq_Allele2'])
    complement_form = {
        'GT': 'CA',
        'GC': 'CG',
        'GA': 'CT',
        'AT': 'TA',
        'AG': 'TC',
        'AC': 'TG',
    }
    # Whole-column assignment replaces the chained frame[col].loc[mask] = v
    # writes, which are not guaranteed to reach the frame.
    test_frame['Reference_Allele'] = substitution.replace(complement_form)
    print("********************************")

    test_frame = SATA_PRETREAT.GetSurvTime(test_frame)
    test_frame = SATA_PRETREAT.NeuPreteate(test_frame)
    classed_frame = SATA_PRETREAT.Classifier(test_frame)
    classed_frame.drop(
        classed_frame[classed_frame['class_result'] == 0].index, inplace=True)
    classed_frame.reset_index(drop=True, inplace=True)
    print('visiable frame loaded')

    pre_list = []
    post_list = []
    # After reset_index the positional counter equals the row label, so the
    # progress/error messages report the same numbers as label lookups would.
    loci = zip(classed_frame['Chromosome'], classed_frame['Start_Position'])
    for i, (chromosome, position) in enumerate(loci):
        # int() stays inside each try: a non-numeric position must be
        # reported and recorded as 'nan' for both flanks.
        try:
            start = int(position)
            upstream = WHOLE_SEQUENCE.get_seq(chromosome,
                                              start - EXT,
                                              start - 1)
            pre_list.append(str(upstream).lower())
        except ValueError:
            print('!!!!!!!', i, ' wrong.')
            pre_list.append('nan')
        try:
            start = int(position)
            downstream = WHOLE_SEQUENCE.get_seq(chromosome,
                                                start + 1,
                                                start + EXT)
            post_list.append(str(downstream).lower())
        except ValueError:
            print('!!!!!!!', i, ' wrong.')
            post_list.append('nan')
        if i % 10000 == 0:
            print(i, ' has finished.')

    # Plain lists are assigned directly; the former reshape(len(frame), -1)
    # raised ValueError when no row survived classification.
    classed_frame['front_seq'] = pre_list
    classed_frame['behind_seq'] = post_list
    print(pd.crosstab(classed_frame['class_result'],
                      classed_frame['Reference_Allele']))
    return classed_frame
Exemplo n.º 3
0
 def _DropEmptyCol(self, drop_col=None):
     """
     Apply SATA_PRETREAT.DropEmptyData to self.dataframe once per column.

     parameter:
         |-> drop_col: iterable of column names to pass, one at a time and
             in order, to SATA_PRETREAT.DropEmptyData; None (the default)
             or an empty iterable means no call is made
     return:
         the result of the last DropEmptyData call, or self.dataframe
         itself when drop_col is empty
     """
     # None replaces the former mutable ``[]`` default.
     if drop_col is None:
         drop_col = ()
     new_frame = self.dataframe
     for c in drop_col:
         new_frame = SATA_PRETREAT.DropEmptyData(new_frame, column=c)
     return new_frame
Exemplo n.º 4
0
 def ChiSeqTest(self, chisq_col='', recomb=None):
     """
     Run a chi-square contingency test on the table built from
     self.dataframe by SATA_PRETREAT.ChiSquarePretreat.

     parameter:
         |-> chisq_col: column passed to ChiSquarePretreat as ``column``;
             self.class_col is passed as ``class_column``
         |-> recomb: optional list of column labels; when non-empty only
             those columns of the pretreated table are tested. None (the
             default) or an empty list tests every column
     return:
         the pretreated table (NaN replaced by 0) with three extra columns
         'chi2', 'P' and 'df', each holding the same scalar in every row
     """
     # NOTE(review): the frame returned by this call was bound to a local
     # but never used -- the table below is built from self.dataframe. The
     # call is kept in case DropEmptyData has side effects; confirm whether
     # its result was meant to be tested instead.
     self._DropEmptyCol(drop_col=[chisq_col])
     chiseq_frame = SATA_PRETREAT.ChiSquarePretreat(
         self.dataframe, column=chisq_col, class_column=self.class_col)
     chiseq_frame = chiseq_frame.fillna(0)
     # ``recomb == []`` was False for None and for an empty tuple, which then
     # failed inside the column selection; test for "nothing selected".
     if recomb is None or len(recomb) == 0:
         chiseq_array = np.array(chiseq_frame)
     else:
         chiseq_array = np.array(chiseq_frame[recomb])
     print(chiseq_array)
     chiseq_value = scipy.stats.chi2_contingency(chiseq_array)
     # Positions 0, 1 and 2 are the statistic, the p-value and the degrees
     # of freedom; each scalar is broadcast over the whole column.
     chiseq_frame['chi2'] = chiseq_value[0]
     chiseq_frame['P'] = chiseq_value[1]
     chiseq_frame['df'] = chiseq_value[2]
     return chiseq_frame
Exemplo n.º 5
0
def CalSataItem(dataframe, outtxtfile, WITH_TYPICAL, output_f, label='base'):
    """Run the configured statistics for every item in ``SATA_LIST``.

    For each item the rows are first passed through
    ``SATA_PRETREAT.DropEmptyData`` for that column; the entry in
    ``cfg.clicfeat_dict`` then selects the analysis:

    * ``'ori'``  -- two chi-square tests on the column as it is (item by
      class, then class by item).
    * ``'mid'``  -- optional mean/std error-bar figure per class (when
      ``DRAW_BOX`` is set), then ``Describe`` and ``CalTTest``; the item
      ``'to_last_known_alive'`` additionally gets a log-rank test.
    * anything else -- treated as a ``{new_label: [old_values]}`` mapping:
      values are regrouped, rows regrouped to ``'delete'`` are removed, and
      the two chi-square tests are run on the result.

    Parameters:
        |-> dataframe: frame holding ``class_result`` and the item columns.
        |-> outtxtfile: open text stream that receives the printed report.
        |-> WITH_TYPICAL: tag used as the figure file-name prefix; it is
            converted with ``str()``.
        |-> output_f: output name; the part before the first ``'.'`` names
            the result directory, and it is passed to ``OutputFrame``.
        |-> label: suffix added to headings and exported file names.

    Returns:
        None. Results are printed and exported through ``OutputFrame``.
    """
    sata_dict = cfg.clicfeat_dict

    def _report_chisq(frame, item, class_col, chisq_col, suffix, loss_note):
        """Run one chi-square test, print it to the report and export it."""
        method = SATA_METHODS.SataMethod(dataframe=frame, class_col=class_col)
        try:
            chi_frame = method.ChiSeqTest(chisq_col=chisq_col)
            print(chi_frame, file=outtxtfile)
            OutputFrame(chi_frame,
                        in_f=output_f,
                        filename=item + '_' + label + suffix)
        except UnboundLocalError:
            # The existing convention: this error marks an item the
            # pipeline could not tabulate, which is logged and skipped.
            print(item, loss_note, file=outtxtfile)

    for cal_item in SATA_LIST:
        calcframe = SATA_PRETREAT.DropEmptyData(dataframe, column=cal_item)
        heading = '{:*^120}'.format(cal_item + '_' + label)
        print(heading, file=outtxtfile)
        print(heading)

        if sata_dict[cal_item] == 'ori':
            _report_chisq(calcframe, cal_item, 'class_result', cal_item,
                          '_as_class', ' is loss')
            _report_chisq(calcframe, cal_item, cal_item, 'class_result',
                          '_as_item', ' is loss')

        elif sata_dict[cal_item] == 'mid':
            # Equality test kept on purpose so truthy non-True settings
            # keep their current meaning.
            if DRAW_BOX == True:  # noqa: E712
                mean_col = []
                std_col = []
                col_col = []
                for cl in cfg.class_list_costom:
                    kk = calcframe[cal_item][calcframe['class_result'] == cl]
                    col_col.append(cl + ': Num=' + str(kk.count()))
                    # Values arrive as text: parse, truncate to integers
                    # and take the magnitude before summarising.
                    kk = abs(np.array(kk).astype(float).astype(int))
                    if kk.size == 0:
                        std_col.append(0)
                        mean_col.append(0)
                    else:
                        mean_col.append(np.mean(kk))
                        std_col.append(np.std(kk))
                print(std_col)
                plt.errorbar(range(len(col_col)),
                             mean_col,
                             yerr=std_col,
                             fmt='o')
                plt.xticks(range(len(col_col)), col_col, rotation=90)
                # Create the directory up front. The former retry-after-
                # mkdir path concatenated WITH_TYPICAL without str() and
                # failed for non-string tags; mkdir also failed when a
                # parent directory was missing.
                figure_dir = OUTPUT_PATH + output_f.split('.')[0] + '/'
                os.makedirs(figure_dir, exist_ok=True)
                plt.savefig(figure_dir + str(WITH_TYPICAL) + '-' + cal_item +
                            '.tif')
                plt.close()

            sataasmid = SATA_METHODS.SataMethod(dataframe=calcframe)
            itemdescribe = sataasmid.Describe(descb_col=cal_item)
            itemttest = sataasmid.CalTTest(describeframe=itemdescribe)
            print(itemdescribe, file=outtxtfile)
            print(itemttest, file=outtxtfile)
            OutputFrame(itemdescribe,
                        in_f=output_f,
                        filename=cal_item + '_' + label + '_describe')
            OutputFrame(itemttest,
                        in_f=output_f,
                        filename=cal_item + '_' + label + '_ttest')
            if cal_item == 'to_last_known_alive':
                print('{:*^120}'.format('logrank'), file=outtxtfile)
                lr_frame = sataasmid.LogRankTest(PATH +
                                                 output_f.split('.')[0] + '/' +
                                                 label + '_logrank')
                OutputFrame(lr_frame,
                            in_f=output_f,
                            filename=label + '_logrank')
                print(lr_frame, file=outtxtfile)
                print(lr_frame)

        else:
            # Regroup raw values one rule at a time, in configuration order.
            # .loc with a row mask and column label writes to calcframe
            # itself, unlike the chained frame[col][mask] = v form.
            for new_label, old_values in sata_dict[cal_item].items():
                for old_value in old_values:
                    calcframe.loc[calcframe[cal_item] == old_value,
                                  cal_item] = new_label
            calcframe.drop(calcframe[calcframe[cal_item] == 'delete'].index,
                           inplace=True)
            _report_chisq(calcframe, cal_item, 'class_result', cal_item,
                          '_as_class', ' is loss')
            _report_chisq(calcframe, cal_item, cal_item, 'class_result',
                          '_as_item', ' is loss.')

    print("{:=^120}".format('finish'))
    print("{:=^120}".format('finish'), file=outtxtfile)
Exemplo n.º 6
0
def CalSataItem(dataframe, outtxtfile, output_f, label='base'):
    """Run the configured statistics for every item in ``SATA_LIST``.

    For each item the rows are first passed through
    ``SATA_PRETREAT.DropEmptyData`` for that column; the entry in
    ``cfg.clicfeat_dict`` then selects the analysis:

    * ``'ori'``  -- two chi-square tests on the column as it is (item by
      class, then class by item).
    * ``'mid'``  -- ``Describe`` and ``CalTTest``; the item
      ``'to_last_known_alive'`` additionally gets a log-rank test.
    * anything else -- treated as a ``{new_label: [old_values]}`` mapping:
      values are regrouped, rows regrouped to ``'delete'`` are removed, and
      the two chi-square tests are run on the result.

    Parameters:
        |-> dataframe: frame holding ``class_result`` and the item columns.
        |-> outtxtfile: open text stream that receives the printed report.
            It is closed by this function once all items are processed.
        |-> output_f: output name; the part before the first ``'.'`` names
            the result directory, and it is passed to ``OutputFrame``.
        |-> label: suffix added to headings and exported file names.

    Returns:
        None. Results are printed and exported through ``OutputFrame``.
    """
    sata_dict = cfg.clicfeat_dict

    def _report_chisq(frame, item, class_col, chisq_col, suffix, loss_note):
        """Run one chi-square test, print it to the report and export it."""
        method = SATA_METHODS.SataMethod(dataframe=frame, class_col=class_col)
        try:
            chi_frame = method.ChiSeqTest(chisq_col=chisq_col)
            print(chi_frame, file=outtxtfile)
            OutputFrame(chi_frame,
                        in_f=output_f,
                        filename=item + '_' + label + suffix)
        except UnboundLocalError:
            # The existing convention: this error marks an item the
            # pipeline could not tabulate, which is logged and skipped.
            print(item, loss_note, file=outtxtfile)

    for cal_item in SATA_LIST:
        calcframe = SATA_PRETREAT.DropEmptyData(dataframe, column=cal_item)
        heading = '{:*^120}'.format(cal_item + '_' + label)
        print(heading, file=outtxtfile)
        print(heading)

        if sata_dict[cal_item] == 'ori':
            _report_chisq(calcframe, cal_item, 'class_result', cal_item,
                          '_as_class', ' is loss')
            _report_chisq(calcframe, cal_item, cal_item, 'class_result',
                          '_as_item', ' is loss')

        elif sata_dict[cal_item] == 'mid':
            sataasmid = SATA_METHODS.SataMethod(dataframe=calcframe)
            itemdescribe = sataasmid.Describe(descb_col=cal_item)
            itemttest = sataasmid.CalTTest(describeframe=itemdescribe)
            print(itemdescribe, file=outtxtfile)
            print(itemttest, file=outtxtfile)
            OutputFrame(itemdescribe,
                        in_f=output_f,
                        filename=cal_item + '_' + label + '_describe')
            OutputFrame(itemttest,
                        in_f=output_f,
                        filename=cal_item + '_' + label + '_ttest')
            if cal_item == 'to_last_known_alive':
                print('{:*^120}'.format('logrank'), file=outtxtfile)
                lr_frame = sataasmid.LogRankTest(PATH +
                                                 output_f.split('.')[0] + '/' +
                                                 label + '_logrank')
                OutputFrame(lr_frame,
                            in_f=output_f,
                            filename=label + '_logrank')
                print(lr_frame, file=outtxtfile)
                print(lr_frame)

        else:
            # Regroup raw values one rule at a time, in configuration order.
            # .loc with a row mask and column label writes to calcframe
            # itself, unlike the chained frame[col][mask] = v form.
            for new_label, old_values in sata_dict[cal_item].items():
                for old_value in old_values:
                    calcframe.loc[calcframe[cal_item] == old_value,
                                  cal_item] = new_label
            calcframe.drop(calcframe[calcframe[cal_item] == 'delete'].index,
                           inplace=True)
            _report_chisq(calcframe, cal_item, 'class_result', cal_item,
                          '_as_class', ' is loss')
            _report_chisq(calcframe, cal_item, cal_item, 'class_result',
                          '_as_item', ' is loss.')

    print("{:=^120}".format('finish'))
    print("{:=^120}".format('finish'), file=outtxtfile)
    outtxtfile.close()