Example #1
def _5train(f1out,
            eachfile,
            train,
            dirout_feature,
            f2resultOut,
            fin_model=None):
    '''
    training the model
    '''
    print('training the model')

    dir_in = dirout_feature
    dirout = os.path.join(f2resultOut, eachfile)
    check_path(dirout)

    validate = {}
    validate['fin_pair'] = os.path.join(f1out, eachfile, 'validate.txt')
    validate['dir_in'] = dir_in
    onehot = True

    entry(
        dirout,
        train,
        dir_in,
        model_type=Param.CNN1D_OH,
        limit=0,
        onehot=onehot,
        kernel_size=90,
        epochs=80,
        # epochs=30,
        filters=300,
        batch_size=500,
        validate=validate,
        fin_model=fin_model)
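A minimal usage sketch of _5train; the directory layout and the two absolute paths below are assumptions for illustration, not taken from the source:

# f1out is assumed to hold one subdirectory per dataset, each containing
# train.txt and validate.txt (as _5train expects)
f1out = 'file/10humanTrain/4train/group'
dirout_feature = '/data/feature/onehot/'   # hypothetical feature directory
f2resultOut = '/data/result/group/'        # hypothetical result directory
for eachfile in os.listdir(f1out):
    train = os.path.join(f1out, eachfile, 'train.txt')
    _5train(f1out, eachfile, train, dirout_feature, f2resultOut)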
Example #2
def _1posiPair(dirDIP, dirout):
    '''
    positive TMP-nonTMP pairs

    for each species directory, e.g. Ecoli:
    save 1000 pair
    save 880 protein fasta file/8DIPPredict/data/Ecoli/2pair.fasta
    save 369 tmp  file/8DIPPredict/data/Ecoli/2tmp.fasta
    save 511 nontmp  file/8DIPPredict/data/Ecoli/2nontmp.fasta
    # fin = '/home/jjhnenu/data/PPI/release/otherdata/DIP/Ecoli/2Ecoli20170205_id_pair_12246.txt'
    # foutdir = 'file/8DIPPredict/data/Ecoli'
    # check_path(foutdir)
    # handlePair(foutdir,sep='\t',fin=fin,jumpStep=[5],keepOne=True)
    '''
    # dirDIP = '/home/jjhnenu/data/PPI/release/otherdata/DIP/'
    # dirout = 'file/8DIPPredict/data/'
    for eachdir in os.listdir(dirDIP):
        if eachdir not in ['Ecoli', 'Mus', 'Human', 'SC', 'HP']: continue
        currentdir = os.path.join(dirDIP, eachdir)
        for eachfile in os.listdir(currentdir):
            if 'id_pair' not in eachfile: continue
            fin = os.path.join(currentdir, eachfile)
            foutdir = os.path.join(dirout, eachdir)
            check_path(foutdir)
            handlePair(foutdir, sep='\t', fin=fin, jumpStep=[5], keepOne=True)
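A minimal sketch of a call, reusing the commented paths from the function body (the DIP directory is machine-specific):

_1posiPair('/home/jjhnenu/data/PPI/release/otherdata/DIP/', 'file/8DIPPredict/data/')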
Example #3
def composeNegaPair(currentdir, fpositive, foutdir):
    ftmp = os.path.join(currentdir, '2tmp.list')
    fnontmp = os.path.join(currentdir, '2nontmp.list')
    fposi = os.path.join(currentdir, '2pair.tsv')

    f1pair = os.path.join(foutdir, '1pair.tsv')
    f2pair = os.path.join(foutdir, '2pair.tsv')
    f2pairInfo = os.path.join(foutdir, '2pairInfo.tsv')
    f3pairInfo = os.path.join(foutdir, '3pairInfo.tsv')
    f4pairInfo_subcell = os.path.join(foutdir, '4pairInfo_subcell.tsv')
    f4pairInfo_subcell_differ = os.path.join(foutdir,
                                             '4pairInfo_subcell_differ.tsv')
    dirout_related = os.path.join(foutdir, '4pairInfo_subcell_differ_related')
    check_path(dirout_related)

    df = pd.read_table(fposi, header=None)
    composeTMP_nonTMP(ftmp, fnontmp, f1pair, int(df.shape[0] * 1.5))

    dropPositiveAndRepeate(f1pair, fpositive, f2pair)
    dropPositiveAndRepeate(f1pair, fposi, f2pair)
    getPairInfo_TMP_nonTMP(f2pair,
                           f2pairInfo,
                           sep='\t',
                           checkTMP=False,
                           keepOne=True)
    saveQualified(f2pairInfo, f3pairInfo)
    handleRow(f3pairInfo, f4pairInfo_subcell, calcuSubcell)
    saveDifferSubcell(f4pairInfo_subcell, f4pairInfo_subcell_differ)
    saveRelated(f4pairInfo_subcell_differ, dirout_related)
Example #4
def saveRelated(fin_info,dirout):
    '''
    :param fin_info:
    :param dirout:
    :return:
     # fin_info = os.path.join(dirout, '2subcellular_differ.tsv')
    '''
    check_path(dirout)
    print('save related to',dirout)
    fout_fasta = os.path.join(dirout, '2pair.fasta')
    fout_tmp_fasta = os.path.join(dirout, '2tmp.fasta')
    fout_nontmp_fasta = os.path.join(dirout, '2nontmp.fasta')
    f2positive = os.path.join(dirout, '2pair.tsv')
    f2tmp = os.path.join(dirout, '2tmp.list')
    f2nontmp = os.path.join(dirout, '2nontmp.list')
    f2all = os.path.join(dirout, '2all.list')
    f2tmp_info = os.path.join(dirout, '2tmp_info.tsv')
    f2nontmp_info = os.path.join(dirout, '2nontmp_info.tsv')
    f2all_info = os.path.join(dirout, '2all_info.tsv')

    simplifyTable(fin_info, f2positive)

    extractPairAndFasta(fin_info, fout_fasta, fout_tmp_fasta=fout_tmp_fasta,
                        fout_nontmp_fasta=fout_nontmp_fasta)
    getproteinlist(fin_info,
                   ftmp=f2tmp, fnontmp=f2nontmp, fall=f2all,
                   ftmp_info=f2tmp_info, ftmp_nontmp_info=f2nontmp_info, fall_info=f2all_info)
Example #5
def _2_1combineFasta(fposiInfo,fnegaInfo,dirout):
    '''

    :param fposiInfo:
    :param fnegaInfo:
    :param dirout:
    :return:
    fin_pair = '%s/dirRelated/2pair.tsv'%dirout
    fasta = '%s/dirRelated/2pair.fasta'%dirout
    '''
    '''
    config path
    '''

    # fposiInfo = 'file/10humanTrain/1positive/9human_human.tsv'
    # fnegaInfo = 'file/10humanTrain/2negative/4subcellular/2subcellular_differ.tsv'
    #
    # dirout = 'file/10humanTrain/3cluster/all'
    check_path(dirout)

    fpairinfo = os.path.join(dirout, '1pairinfo.tsv')
    dirRelated = os.path.join(dirout, 'dirRelated')
    '''
    concat positive and negative with info
    '''
    df1 = pd.read_table(fposiInfo,header=None)
    df2 = pd.read_table(fnegaInfo,header=None)
    df3 = pd.concat([df1,df2])
    df3.to_csv(fpairinfo,header=None,index=None,sep='\t')
    '''
    save 19724 protein fasta file/10humanTrain/3cluster/all/dirRelated/2pair.fasta
    save 5089 tmp  file/10humanTrain/3cluster/all/dirRelated/2tmp.fasta
    save 14635 nontmp  file/10humanTrain/3cluster/all/dirRelated/2nontmp.fasta
    '''
    saveRelated(fpairinfo, dirRelated)
Example #6
def groupCalculate(dirin, filetype='all'):
    """
    /home/jjhnenu/data/PPI/release/result/group/p_fp_1_1/1/all/_evaluate.txt
    :param dirin: /home/jjhnenu/data/PPI/release/result/group
    :return:
    """
    for eachdir in os.listdir(dirin):
        subdir = os.path.join(
            dirin,
            eachdir)  # /home/jjhnenu/data/PPI/release/result/group/p_fp_1_1/
        data = []
        columns = ['Loss', 'Acc', 'Precision', 'Recall', 'F1score', 'MCC']
        # columns = ['loss', 'acc', 'metric_precision', 'metric_recall', 'metric_F1score', 'matthews_correlation']
        print(columns)
        for eachsubdir in os.listdir(subdir):  # 0 1 2 3 4 5
            fin = os.path.join(subdir, eachsubdir, filetype, '_evaluate.txt')
            # fin = os.path.join(subdir,eachsubdir,filetype,'_history_dict.txt')
            if not os.access(fin, os.F_OK): continue
            with open(fin, 'r') as fi:
                line = fi.readline()[:-1]
                # sum += np.array(line.split(':')[-1][1:-1].split(','))
                data.append(line.split(':')[-1][1:-1].split(','))
                print(str(line.split(':')[-1][1:-1].split(','))[1:-1])
        mydata = pd.DataFrame(data)
        t = mydata.apply(pd.to_numeric)
        t.columns = columns
        t.loc['mean'] = t.apply(lambda x: x.mean())
        dirout = subdir.replace('result', 'statistic')
        check_path(dirout)
        # float_format = '%.3f'
        t.sort_index(inplace=True)
        t.to_csv(os.path.join(dirout, 'result.csv'), index=True, header=True)
        print(dirout)
Example #7
def load_ed_models(model_paths, pc):
    ed_models = []
    ed_model_params = []
    for i, path in enumerate(model_paths):
        print('...Loading nmt model {}'.format(i))
        ed_model_folder = check_path(path,
                                     'ED_MODEL_FOLDER_{}'.format(i),
                                     is_data_path=False)
        best_model_path = ed_model_folder + '/bestmodel.txt'
        with codecs.open(ed_model_folder + '/best.dev', 'r',
                         'utf-8') as hyperparams_file:
            hyperparams_dict = dict(
                line.strip().split(' = ') for line in hyperparams_file)
        model_hyperparams = {
            'INPUT_DIM': int(hyperparams_dict['INPUT_DIM']),
            'HIDDEN_DIM': int(hyperparams_dict['HIDDEN_DIM']),
            'LAYERS': int(hyperparams_dict['LAYERS']),
            'VOCAB_PATH': hyperparams_dict['VOCAB_PATH']
        }
        # a fix for vocab path when transferring files b/n vm
        model_hyperparams['VOCAB_PATH'] = check_path(path + '/vocab.txt',
                                                     'vocab_path',
                                                     is_data_path=False)
        ed_model_params.append(pc.add_subcollection('ed{}'.format(i)))
        ed_model = SoftAttention(ed_model_params[i], model_hyperparams,
                                 best_model_path)

        ed_models.append(ed_model)
    return ed_models
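A minimal usage sketch, assuming the DyNet toolkit (implied by pc.add_subcollection above); the model folders are hypothetical:

import dynet as dy

pc = dy.ParameterCollection()
model_paths = ['/models/ed0', '/models/ed1']  # hypothetical model folders
ed_models = load_ed_models(model_paths, pc)   # one SoftAttention model per path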
Example #8
def _7trainAndTest(dirout_feature, fin_train, fin_validate, dirout):
    # runtime 664909.4274818897 s ~ 7.6 days
    '''
    training the model
    '''
    print('training the model')
    check_path(dirout)
    validate = {}
    validate['fin_pair'] = fin_validate
    validate['dir_in'] = dirout_feature
    onehot = True
    entry(
        dirout,
        fin_train,
        dirout_feature,
        model_type=Param.CNN1D_OH,
        limit=0,
        onehot=onehot,
        kernel_size=90,
        epochs=80,
        # epochs=30,
        filters=300,
        batch_size=50,
        validate=validate)
    '''
    testing the model
    '''
    print('testing the model')
Example #9
def calculateResults(dirout,
                     dirin,
                     filename='_evaluate.txt',
                     row=0,
                     resultfilename='result.csv'):
    """
    %s\%s\_evaluate.txt
    :param dirin: contains a list of \%s\_evaluate.txt
    :return:
    dirin = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_v1_train_validate_v2_fixpositive_2/2/test_DIP/'
    dirout = dirin
    calculateResults(dirout,dirin,filename='log.txt',row = 2,resultfilename = 'result.csv')

    """
    # dirin = r'E:\githubCode\BioComputeCodeStore\JiangJiuhong\data\PPI\stage2\processPair2445\pair\positiveV1\onehot\result'
    check_path(dirout)
    count = 0
    data = []
    # columns = ['loss', 'acc', 'metric_precision', 'metric_recall', 'metric_F1score', 'matthews_correlation']
    columns = ['Loss', 'Acc', 'Precision', 'Recall', 'F1score', 'MCC']
    indexs = []
    print(columns)
    print(dirin)
    for eachdir in os.listdir(dirin):
        print(eachdir)
        if '.' in eachdir: continue
        fin = os.path.join(dirin, eachdir)
        sep = '\\' if '\\' in filename else '/'
        if sep in filename:
            for f in filename.split(sep):
                fin = os.path.join(fin, f)
        else:
            fin = os.path.join(fin, filename)
        # fin = '%s\%s\_evaluate.txt' % (dirin, eachdir)
        if not os.access(fin, os.F_OK):
            print('not access to:', fin)
            continue
        with open(fin, 'r') as fi:
            for _ in range(row):  # skip the first `row` lines
                fi.readline()
            line = fi.readline()[:-1]
            # sum += np.array(line.split(':')[-1][1:-1].split(','))
            line = line.replace('nan', '0')
            print('****************', line, '********************')
            data.append(line.split(':')[-1][1:-1].split(','))
            indexs.append(eachdir)
            count = count + 1
            print(str(line.split(':')[-1][1:-1].split(','))[1:-1])
    mydata = pd.DataFrame(data)
    mydata.replace('nan', 0, inplace=True)
    t = mydata.apply(pd.to_numeric)
    t.loc['mean'] = t.apply(lambda x: x.mean())
    indexs.append('mean')
    t.index = indexs
    t.columns = columns
    t.sort_index(inplace=True)
    t.to_csv(os.path.join(dirout, resultfilename), index=True, header=True)
Example #10
 def getNpy(self, fin_fasta, out_dir, multi=True):
     check_path(out_dir)
      p = Protein()  # sequence validator, as in getPhsi_Blos (Example #11)
     for ID, seq in self.getYield(fin_fasta, multi=multi):
         if p.checkProtein(seq, 50, 2000, uncomm=True):
             filename = os.path.join(out_dir, "%s.npy" % ID)
             result = self.seq2num(seq)
             if len(result) != 0:
                 np.save(filename, result)
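A minimal usage sketch, assuming getNpy is a FastaDealer method as used in Example #20; the paths are hypothetical:

fd = FastaDealer()
# writes one <ID>.npy per sequence that passes the 50-2000 length check
fd.getNpy('/data/proteins.fasta', '/data/featuredb')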
Example #11
 def getPhsi_Blos(self, fin_fasta, out_dir, multi=True, checkprotein=True):
     check_path(out_dir)
     p = Protein()
     for ID, seq in self.getYield(fin_fasta, multi=multi):
         if not checkprotein or p.checkProtein(seq, 50, 2000, uncomm=True):
             filename = os.path.join(out_dir, "%s.npy" % ID)
             result = self.phsi_blos(seq)
             if len(result) != 0:
                 np.save(filename, result)
Example #12
def savepredict(fin_pair, dir_in, fin_model, dirout_result):
    # fin_pair = '/home/19jiangjh/data/PPI/release/pairdata/p_fw/1/0/test.txt'
    # dir_in = '/home/19jiangjh/data/PPI/release/feature/p_fp_fw_19471'
    # fin_model = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/_my_model.h5'
    # dirout_result = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/test'
    check_path(dirout_result)
    onehot = True
    dataarray = BaseData().loadTest(fin_pair,
                                    dir_in,
                                    onehot=onehot,
                                    is_shuffle=False)
    x_test, y_test = dataarray
    model = models.load_model(fin_model, custom_objects=MyEvaluate.metric_json)
    result = model.evaluate(x_test, y_test, verbose=False, batch_size=90)

    result_predict = model.predict(x_test, batch_size=90)
    result_predict = result_predict.reshape(-1)

    result_class = model.predict_classes(x_test, batch_size=90)
    result_class = result_class.reshape(-1)

    y_test = y_test.reshape(-1)

    print('Loss:%f,ACC:%f' % (result[0], result[1]))

    df = pd.read_table(fin_pair, header=None)
    # df.columns = ['tmp', 'nontmp']
    df.rename(columns={0: 'tmp', 1: 'nontmp'}, inplace=True)
    df['real_label'] = list(y_test)
    df['predict_label'] = result_class
    df['predict'] = result_predict
    df.to_csv(os.path.join(dirout_result, 'result.csv'), index=False)

    result_manual = MyEvaluate().evaluate_manual(y_test, result_predict)
    print(
        '[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]'
    )
    print(result_manual)
    print('[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]')
    result_manual2 = calculate_performance(len(x_test), y_test, result_class,
                                           result_predict)
    print(result_manual2)
    with open(os.path.join(dirout_result, 'log.txt'), 'w') as fo:
        fo.write('test dataset %s\n' % fin_pair)
        fo.write('Loss:%f,ACC:%f\n' % (result[0], result[1]))
        fo.write('evaluate result:' + str(result) + '\n')
        fo.write(
            'manual result:[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]\n'
        )
        fo.write('manual result:' + str(result_manual) + '\n')
        fo.write(
            'manual result2:[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]\n'
        )
        fo.write('manual result2:' + str(result_manual2) + '\n')
        fo.flush()
Example #13
def entry(dirout,
          fin_pair,
          dir_in,
          model_type=Param.CNN1D,
          limit=0,
          onehot=False,
          kernel_size=3,
          epochs=60,
          filters=250,
          batch_size=100,
          validate=None,
          fin_model=None):  # optional path to a previously trained model (see Examples #1 and #26)
    # model_type = Param.CNN1D
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # if 'all' not in dirout:return
    check_path(dirout)
    print('dirout:', dirout)
    # dir_in = dirout.replace(des, 'feature')
    print('dir_in:', dir_in)
    print('fin_pair', fin_pair)
    bd = BaseData()
    if validate is None:
        (x_train, y_train), (x_test, y_test) = bd.load(fin_pair,
                                                       dir_in,
                                                       test_size=0.1,
                                                       limit=limit,
                                                       onehot=onehot)
    else:
        x_test, y_test = bd.loadTest(validate['fin_pair'],
                                     validate['dir_in'],
                                     onehot=onehot,
                                     is_shuffle=True)
        x_train, y_train = bd.loadTest(fin_pair,
                                       dir_in,
                                       onehot=onehot,
                                       is_shuffle=True)

    print('Build and fit model...')
    print('x_train.shape[1:]', x_train.shape[1:])
    mm = MyModel(
        model_type=model_type,
        input_shape=x_train.shape[1:],
        filters=filters,
        kernel_size=kernel_size,
        pool_size=2,
        hidden_dims=250,
        batch_size=batch_size,
        epochs=epochs,
    )
    mm.process(dirout, x_train, y_train, x_test, y_test)
    print('save result to %s' % dirout)
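A minimal sketch of a direct call, mirroring the hyperparameters used in Example #1 (all paths are hypothetical):

validate = {'fin_pair': '/data/pairs/validate.txt',
            'dir_in': '/data/feature/onehot/'}
entry('/data/result/run0/',
      '/data/pairs/train.txt',
      '/data/feature/onehot/',
      model_type=Param.CNN1D_OH,
      onehot=True,
      kernel_size=90,
      epochs=80,
      filters=300,
      batch_size=500,
      validate=validate)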
Example #14
 def base_compose(self,
                  dirout_feature,
                  fin_pair,
                  dir_feature_db,
                  feature_type='V_PSSM',
                  fout_pair=''):
     check_path(dirout_feature)
     fo = open(fout_pair, 'w') if fout_pair != '' else None
     row = 0
     for pairs in getPairs(fin_pair):
         a = pairs[0]
         b = pairs[1]
         # print(pairs)  # ['O35668', 'P00516']
         fa = os.path.join(dir_feature_db, a + '.npy')
         fb = os.path.join(dir_feature_db, b + '.npy')
         row = row + 1
         print('loading %d th feature pair' % row)
         if not (os.access(fa, os.F_OK) and os.access(fb, os.F_OK)):
             print(
                 '===============features of pairs not found %s %s================'
                 % (a, b), os.access(fa, os.F_OK), os.access(fb, os.F_OK))
             continue
         pa = np.load(fa, allow_pickle=True)
         pb = np.load(fb, allow_pickle=True)
         if (len(pa) < 50 or len(pa) > 2000
                 or max(pa) > 20) or (len(pb) < 50 or len(pb) > 2000
                                      or max(pb) > 20):
             print('wrong length or x')
             continue
          if fo is not None:
             fo.write('%s\t%s\n' % (a, b))
             fo.flush()
         # padding
         if feature_type == Feature_type.V_PSSM:
             pc = self.padding_PSSM(pa, pb, vstack=True)
         elif feature_type == Feature_type.H_PSSM:
             pc = self.padding_PSSM(pa, pb, vstack=False)
         elif feature_type == Feature_type.SEQ_1D:
             pc = self.padding_seq1D(pa, pb, vstack=False)
             # elif feature_type == Feature_type.SEQ_1D_OH:pc = self.padding_seq1D(pa,pb,vstack=False)
         elif feature_type == Feature_type.SEQ_2D:
             pc = self.padding_seq2D(pa, pb)
         else:
              print('incorrect feature_type')
             return
          # save the padded pair features
         fout = os.path.join(dirout_feature, "%s_%s.npy" % (a, b))
         np.save(fout, pc)
         del pc, pa, pb
      if fo is not None:
         fo.close()
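A minimal usage sketch, following the call in Example #20 (all paths are hypothetical):

BaseFeature().base_compose('/data/feature/run0/',
                           '/data/pairs/all.txt',
                           '/data/featuredb/',
                           feature_type=Feature_type.SEQ_1D,
                           fout_pair='/data/pairs/all_qualified.txt')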
Example #15
    def build_img(self,
                  font_name='',
                  bg_color=None,
                  b_color_img_name="",
                  txt_freq=None):
        try:
            if font_name.strip():
                self.font_name = font_name
            self.font_path = os.path.join(my_dirpath,
                                          "../font/%s.ttf" % self.font_name)
            self.bg_color = bg_color
            self.back_color_img = imread(
                os.path.join(my_dirpath, "../image/%s" % b_color_img_name))
            my_log.logger.info("Start creating word clouds...")
            wc = WordCloud(
                font_path=self.font_path,  # font
                mode="RGBA",
                background_color=self.bg_color,  # background color
                max_words=self.max_words,  # maximum number of words displayed
                mask=self.back_color_img,  # background/mask image
                max_font_size=self.max_font_size,  # largest font size
                random_state=self.random_state,  # number of random states, i.e. color schemes
                colormap="viridis",  # color map
                relative_scaling=1,
                scale=1.2)
            wc.generate_from_frequencies(txt_freq)
            # color the cloud according to the image's color layout
            # image_colors = ImageColorGenerator(self.back_color_img)
            # wc.recolor(color_func=image_colors)

            # render with the given color values
            # grouped_color_func = GroupedColorFunc(color_to_words, default_color)
            # wc.recolor(color_func=grouped_color_func)

            # draw the word cloud and save the image
            save_img_path = os.path.join(my_dirpath, "./image/build_img/")
            check_path(save_img_path)
            img_path = save_img_path + str(time.time()) + ".png"
            wc.to_file(img_path)
            my_log.logger.info("build img success.img_path:%s" % img_path)
        except Exception as e:
            my_log.logger.error("build img error...")
            my_log.logger.error(e)
Example #16
def _3clusters(fin_posi,fin_nega,fin_tmp,fin_nontmp,dirout):
    '''
    :param fin_posi:
    :param fin_nega:
    :param fin_tmp:
    :param fin_nontmp:
    :param dirout:
    :return:

    f4posi 'file/10humanTrain/3cluster/4posi.tsv'
    f4nega 'file/10humanTrain/3cluster/4nega.tsv'

    cluster at 0.4 identity with the cd-hit tool: http://weizhong-lab.ucsd.edu/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit
    to get the *.clstr files
    '''
    # fin_posi = 'file/10humanTrain/1positive/2pair.tsv' # (44210, 2)
    # fin_nega = 'file/10humanTrain/2negative/4subcellular/dirRelated/2pair.tsv' # (61323, 2)
    # fin_tmp = 'file/10humanTrain/3cluster/1tmp.clstr'
    # fin_nontmp = 'file/10humanTrain/3cluster/1nontmp.clstr'
    # dirout = 'file/10humanTrain/3cluster/'

    check_path(dirout)
    f2pair = os.path.join(dirout,'1pair.tsv')
    f3out_tmp = os.path.join(dirout,'3tmp.tsv')
    f3out_nontmp = os.path.join(dirout,'3nontmp.tsv')
    f3pair = os.path.join(dirout,'3pair.tsv')
    f3pair_clstr = os.path.join(dirout,'3pair_clstr.tsv')
    f4pair = os.path.join(dirout,'4pair.tsv')
    f4posi = os.path.join(dirout,'4posi.tsv')
    f4nega = os.path.join(dirout,'4nega.tsv')  # (60697, 3)

    '''
    concat positive and negative pair
    '''
    concatPAN(fin_posi, fin_nega, f2pair)

    cluster2Table(fin_tmp,f3out_tmp)
    cluster2Table(fin_nontmp, f3out_nontmp)

    pairWithClusterLable(f2pair,f3out_tmp,f3out_nontmp,fout_clus=f3pair_clstr,fout=f3pair)

    '''
    extract positive,negative
    '''
    saveRelated_posi_nega(f3pair,f4pair,f4posi,f4nega)
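A minimal sketch of a call, reusing the commented paths from the docstring:

_3clusters('file/10humanTrain/1positive/2pair.tsv',
           'file/10humanTrain/2negative/4subcellular/dirRelated/2pair.tsv',
           'file/10humanTrain/3cluster/1tmp.clstr',
           'file/10humanTrain/3cluster/1nontmp.clstr',
           'file/10humanTrain/3cluster/')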
Example #17
def _1posiSampleHumanPair(fin_1posiInfo,dirout):
    '''
    positive samples human-human
    '''
    # fin_1posiInfo = 'file/1positive/2tmp_nontmp_info_qualified.tsv'
    # dirout = 'file/10humanTrain/1positive'
    check_path(dirout)

    f8tmp_species = os.path.join(dirout, '8tmp_species.tsv')
    f8nontmp_species = os.path.join(dirout, '8nontmp_species.tsv')
    f8species = os.path.join(dirout, '8species.tsv')
    f8sameSpecies = os.path.join(dirout, '8sameSpecies.tsv')
    f8posiSpecies = os.path.join(dirout, '8posiSpecies.tsv')

    f9human_related = os.path.join(dirout, '9human_related.tsv')
    f9human_human = os.path.join(dirout, '9human_human.tsv')  # 44210 pairinfo

    findSpecies(fin_1posiInfo,f8species, f8tmp_species, f8nontmp_species, f8sameSpecies,f8posiSpecies,col=[1,8])

    species = 'HUMAN'
    relatedSpecies(f8posiSpecies, species, f9human_related, f9human_human, col=[0,7,14,15])

    fout_fasta = os.path.join(dirout, '2pair.fasta')
    fout_tmp_fasta = os.path.join(dirout, '2tmp.fasta')
    fout_nontmp_fasta = os.path.join(dirout, '2nontmp.fasta')
    f2positive = os.path.join(dirout, '2pair.tsv')
    f2tmp = os.path.join(dirout, '2tmp.list')
    f2nontmp = os.path.join(dirout, '2nontmp.list')
    f2all = os.path.join(dirout, '2all.list')
    f2tmp_info = os.path.join(dirout, '2tmp_info.tsv')
    f2nontmp_info = os.path.join(dirout, '2nontmp_info.tsv')
    f2all_info = os.path.join(dirout, '2all_info.tsv')

    # save 11995 protein fasta file/10humanTrain/1positive/2pair.fasta
    # save 3513 tmp  file/10humanTrain/1positive/2tmp.fasta
    # save 8482 nontmp  file/10humanTrain/1positive/2nontmp.fasta

    simplifyTable(f9human_human, f2positive)

    extractPairAndFasta(f9human_human, fout_fasta, fout_tmp_fasta=fout_tmp_fasta,
                        fout_nontmp_fasta=fout_nontmp_fasta)
    getproteinlist(f9human_human,
                   ftmp=f2tmp, fnontmp=f2nontmp, fall=f2all,
                   ftmp_info=f2tmp_info, ftmp_nontmp_info=f2nontmp_info, fall_info=f2all_info)
Example #18
 def save(self,
          dirout,
          flist,
          ratios,
          limit,
          labels=None,
          sep='\t',
          filename='all.txt',
          groupcount=1,
          repeate=True):
     """
     same length of flist,ratios,labels if not None
     :param fout:
     :param flist:
     :param ratios:
     :param limit:
      :param labels: [1,0,0]; must be provided when the samples carry no labels themselves
     :param sep:
     :return:
     case
     dirin = r'E:\githubCode\BioComputeCodeStore\JiangJiuhong\data\PPI\release\data'
     fin_p = r'%s\positive_2049.txt' % dirin
     fin_fp = r'%s\negative_fpositive_10245.txt' % dirin
     fin_fw = r'%s\negative_fswissprot_7781.txt' % dirin
     fout = r'%s\p_fp_fw_2_1_1\all.txt' % dirin
     flist = [fin_p, fin_fp,fin_fw]
     ratios = [0.5,0.25,0.25]
     labels = [1,0,0]
     limit = 2049
     ComposeData().save(fout, flist, ratios, limit, labels=labels)
     """
     for idx, elem in enumerate(
             self.doCompose(flist,
                            ratios,
                            limit,
                            labels=labels,
                            groupcount=groupcount,
                            repeate=repeate)):
         data = pd.concat(elem)
         data = data.sample(frac=1)
         fout = os.path.join(dirout, str(idx))
         check_path(fout)
         fout = os.path.join(fout, filename)
         quick_save(data, fout, sep=sep)
Example #19
 def save(self):
     for i in range(len(self.fin)):
         dirout = os.path.join(self.dirin, self.type[i])
         check_path(dirout)
         ComposeData().save(dirout,
                            self.fin[i],
                            self.ratio[i],
                            self.limit,
                            labels=self.label[i],
                            filename='all.txt')
         print('divided to train and test')
         for eachdir in os.listdir(dirout):
             if '.' in eachdir: continue
             fin = os.path.join(dirout, eachdir, 'all.txt')
             train = os.path.join(dirout, eachdir, 'train.txt')
             test = os.path.join(dirout, eachdir, 'test.txt')
             fouts = [train, test]
             ratios = [0.8, 0.2]
             PairDealer().part(fin, ratios, fouts)
Example #20
def process(fin_fasta, dir_feature_db, dir_feature, dir_pair):
    '''
    generate featuredb and feature(feature pair)
    :param fin_fasta: fasta file, >ID\nseq\n
    :param dir_feature_db:
    :param dir_feature:
    :param dir_pair: contains  several protein file
            # ID pair proteinA\tproteinB\n
    :return:
    '''
    check_path(dir_feature_db)
    check_path(dir_feature)
    # fasta to feature
    fd = FastaDealer()
    fd.getNpy(fin_fasta, dir_feature_db)

    for eachfile in os.listdir(dir_pair):
        print(eachfile)
        fin_pair = os.path.join(dir_pair, eachfile)
        BaseFeature().base_compose(dir_feature,
                                   fin_pair,
                                   dir_feature_db,
                                   feature_type=Feature_type.SEQ_1D)
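A minimal sketch of a call, with hypothetical paths that follow the docstring's conventions:

process('/data/proteins.fasta',  # >ID\nseq\n
        '/data/featuredb',       # per-protein .npy features
        '/data/feature',         # padded pair features
        '/data/pairs')           # one proteinA\tproteinB file per dataset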
Example #21
 def get_city(self):
     """
         获取城市列表
     """
     if check_path(self.city_names):
         try:
             with pathlib.Path(self.city_names).open("r") as city_file:
                 self.citys = json.loads(city_file.read())
                 return
         except Exception as e:
             error(f"城市文件读取错误: {e}")
     try:
         resp = self.menu_session.get(self.url_list["citys"])
         self.save_debug_file(resp.text, "get_city.html")
         res = self.reg_list["citys"].findall(resp.text)
         addsucess()
         self.citys = json.loads(res[0])
         create_json(self.citys, self.city_names)
     except Exception as e:
         error(f"获取城市信息失败: {e}")
         addfailed()
         exit()
     success(f"{len(self.citys.keys())} citys")
Example #22
def savepredict(fin_pair,
                dir_in,
                fin_model,
                dirout_result,
                batch_size=90,
                limit=0,
                posi=False,
                onehot=True):
    # fin_pair = '/home/19jiangjh/data/PPI/release/pairdata/p_fw/1/0/test.txt'
    # dir_in = '/home/19jiangjh/data/PPI/release/feature/p_fp_fw_19471'
    # fin_model = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/_my_model.h5'
    # dirout_result = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/test'
    check_path(dirout_result)
    print('predict ', fin_pair, '...')
    print('using feature ', dir_in, '...')
    print('save result in ', dirout_result)
    df = pd.read_table(fin_pair, header=None)
    if df.shape[1] != 3:
        df[2] = 1 if posi else 0

    dataarray = BaseData().loadTest(fin_pair,
                                    dir_in,
                                    onehot=onehot,
                                    is_shuffle=False,
                                    limit=limit)
    x_test, y_test = dataarray
    print('load model...', fin_model)
    # from tensorflow.keras.models import load_model
    # model = load_model(fin_model, custom_objects=MyEvaluate.metric_json)
    model = models.load_model(fin_model, custom_objects=MyEvaluate.metric_json)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=MyEvaluate.metric)
    result = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)

    result_predict = model.predict(x_test, batch_size=batch_size)
    result_class = (result_predict > 0.5).astype("int32")
    result_predict = result_predict.reshape(-1)

    # result_class = model.predict_classes(x_test,batch_size=batch_size)
    # UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01.

    result_class = result_class.reshape(-1)

    # y_test = y_test.reshape(-1)

    print('Loss:%f,ACC:%f' % (result[0], result[1]))

    if limit != 0: df = df[:limit]

    df.columns = ['tmp', 'nontmp', 'real_label']
    # df.rename(columns={0: 'tmp', 1: 'nontmp'}, inplace=True)
    # df['real_label'] = list(y_test)
    df['predict_label'] = result_class
    df['predict'] = result_predict
    df.sort_values(by=['predict'], ascending=False).to_csv(os.path.join(
        dirout_result, 'result.csv'),
                                                           index=False)

    # result_manual = MyEvaluate().evaluate_manual(y_test, result_predict)
    # print('[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]')
    # print(result_manual)
    # print('[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]')
    # result_manual2 =calculate_performance(len(x_test), y_test, result_class, result_predict)
    # print(result_manual2)

    with open(os.path.join(dirout_result, 'log.txt'), 'w') as fo:
        fo.write('test dataset %s\n' % fin_pair)
        fo.write('Loss:%f,ACC:%f\n' % (result[0], result[1]))
        fo.write('evaluate result:' + str(result) + '\n')
        fo.write(
            'manual result:[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]\n'
        )
        # fo.write('manual result:' + str(result_manual) + '\n')
        # fo.write('manual result2:[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]\n')
        # fo.write('manual result2:'+str(result_manual2)+'\n')
        fo.flush()
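A minimal sketch of a call, echoing the commented example paths at the top of the function (all paths hypothetical):

savepredict('/data/pairs/test.txt',
            '/data/feature/onehot/',
            '/results/run0/_my_model.h5',
            '/results/run0/test',
            batch_size=500,
            onehot=True)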
Example #23
def getallProtein():
    projection = {'_id': True, 'UNIPROID': True}
    docs = do.GetALL(projection=projection, limit=0)
    for protein in docs:
        if protein['UNIPROID'] == '': continue
        for pro in multiSplit(protein['UNIPROID']):
            yield pro.strip()
def writeProtins(fout):
    proteins = []
    for protein in getallProtein():
        proteins.append(protein)
    saveList(proteins, fout)


if __name__ == '__main__':
    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    db_name = 'ttd'
    table_target = 'target'

    fout = 'file/otherfile/2ttd_target_protein.list'
    check_path('file/otherfile/')
    writeProtins(fout)
    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
Example #24
    f2nontmpInfo = os.path.join(dirout,'2nontmpInfo.tsv')

    f2tmp = os.path.join(dirout, '2tmp.list')
    f2nontmp = os.path.join(dirout, '2nontmp.list')

    f3pair = os.path.join(dirout,'3pair.tsv')

    f3pair_norepeat = os.path.join(dirout,'3pair_norepeat.tsv')

    f3pairInfo = os.path.join(dirout,'2pairInfo.tsv')
    f3tmp_fasta = os.path.join(dirout,'3tmp.fasta')
    f3nontmp_fasta = os.path.join(dirout,'3nontmp.fasta')
    f3all_fasta = os.path.join(dirout,'3all.fasta')

    dir3bathData = os.path.join(dirout,'3batchData')
    check_path(dir3bathData)
    f4sample1k = os.path.join(dir3bathData,'3batchData')


    '''
    get TMP, nonTMP list
    query 5208 tmp and 15186 nontmp
    '''
    # generateHumanLists(f1tmp, f1nontmp)
    '''
    get qualified single info
    '''
    # getSingleInfo(f1tmp,f2tmpInfo,fin_type='single')
    # getSingleInfo(f1nontmp,f2nontmpInfo,fin_type='single')
    '''
    get qualified protein
Example #25
  '''
  input files:
      positive pairs and negative pairs
      AC pairs of TMP-nonTMP
      fasta
  generate features from the fasta files
  '''
  print()
  '''
 fasta to feature
 '''
  # >ID\nseq\n
  fin_fasta = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_DIP_fasta20170301_simple.seq'
  dir_feature_db = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_featuredb'
  check_path(dir_feature_db)
  fd = FastaDealer()
  fd.getNpy(fin_fasta, dir_feature_db)
  '''
  generate feature pair
  50-2000 no X
  '''
  dir_feature_db = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_featuredb'
  dir_feature = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_feature/'
  # ID pair proteinA\tproteinB\n
  dir_pair = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_TMP_nonTMP/'
  dirout_pair = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_TMP_nonTMP_qualified/'
  check_path(dir_feature)
  check_path(dirout_pair)
  for eachfile in os.listdir(dir_pair):
      print(eachfile)
Example #26
def crosshumanTrain(modelreuse=False):
    f2all = 'file/10humanTrain/3cluster/4pair.tsv'  # positive:negative samples are no longer 1:1

    f1out = 'file/10humanTrain/4train/group'
    f2out = 'file/10humanTrain/4train/cross/group'
    f3out = 'file/10humanTrain/4train/cross'
    # dirout_feature = '/home/19jjhnenu/Data/SeqTMPPI2W/feature/129878/'
    # f2resultOut = '/home/19jjhnenu/Data/SeqTMPPI2W/result/10humanTrain_80epoch/group_reusemodel_5CV'
    dirout_feature = '/root/19jjhnenu/Data/SeqTMPPI2W/feature/129878/'
    f2resultOut = '/root/19jjhnenu/Data/SeqTMPPI2W/result/10humanTrain_80epoch/group_reusemodel_5CV'

    check_path(f2resultOut)
    check_path(f2out)

    # flist = [os.path.join(f1out,x,'all.txt') for x in os.listdir(f1out)]
    # concatFile(flist,f2all)
    '''
    train:test = 5:1
    '''
    train = os.path.join(f3out, 'train_vali.txt')
    test = os.path.join(f3out, 'test.txt')
    ratios_tvt = [5, 1]
    f3outs = [train, test]
    # PairDealer().part(os.path.join(f3out,'all.txt'),ratios_tvt,f3outs)
    # PairDealer().part(f2all,ratios_tvt,f3outs)
    '''
    5cv
    '''
    ratios_tvt = [1] * 5
    f3outs = [os.path.join(f2out, '%d.txt' % x) for x in range(5)]
    # PairDealer().part(os.path.join(f3out,'all.txt'),ratios_tvt,f3outs)
    # PairDealer().part(os.path.join(f3out,'train_vali.txt'),ratios_tvt,f3outs)
    '''
    cross train
    '''
    for cv in range(5):
        # oldfile = '-1'
        oldfile = '2'
        for elem in range(5):
            if elem == cv: continue
            f2dirout = os.path.join(f2resultOut, str(cv))
            f3dirout = os.path.join(f2dirout, str(elem))
            fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
            if not os.access(fin_model, os.F_OK) or not modelreuse:
                fin_model = None
            train = os.path.join(f2out, '%d.txt' % elem)
            validate = {}
            validate['fin_pair'] = os.path.join(f2out, '%d.txt' % cv)
            validate['dir_in'] = dirout_feature
            onehot = True

            entry(
                f3dirout,
                train,
                dirout_feature,
                model_type=Param.CNN1D_OH,
                limit=0,
                onehot=onehot,
                kernel_size=90,
                epochs=80,
                # epochs=2,
                filters=300,
                batch_size=500,
                validate=validate,
                fin_model=fin_model)
            oldfile = str(elem)
Example #27
def crossTrain(f2out, dirout_feature, f2resultOut, modelreuse=True):
    '''
    cross train and test
    '''
    f1out = 'file/4train/'
    f2outdir = os.path.join(f1out, str(0))
    fin_pair = os.path.join(f2outdir, 'all.txt')

    f2outdir = os.path.join(f1out, '5CV', 'data')
    check_path(f2outdir)
    train = os.path.join(f2outdir, 'train_vali.txt')
    test = os.path.join(f2outdir, 'test.txt')
    ratios_tvt = [5, 1]
    f3outs = [train, test]
    # PairDealer().part(fin_pair,ratios_tvt,f3outs)
    '''
    train model
    '''
    f2outdir = os.path.join(f1out, '5CV', 'data')
    check_path(f2outdir)
    train = os.path.join(f2outdir, 'train_vali.txt')
    # f2out = 'file/4train/5CV/elem'
    ratios_tvt = [1] * 5
    f3outs = [os.path.join(f2out, '%d.txt' % x) for x in range(5)]
    # PairDealer().part(train,ratios_tvt,f3outs)
    limit = 0
    # eachdir = 'benchmark'
    # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    # f2resultOut = '/home/19jjhnenu/Data/SeqTMPPI2W/result/5CV_1'
    '''
    cross train
    '''
    for cv in range(5):
        oldfile = '-1'
        f2dirout = os.path.join(f2resultOut, str(cv))
        fin_model = ''
        f3dirout = ''
        for elem in range(5):
            if cv == elem: continue
            f3dirout = os.path.join(f2dirout, str(elem))
            fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
            if not os.access(fin_model, os.F_OK) or not modelreuse:
                fin_model = None
            train = os.path.join(f2out, '%d.txt' % elem)
            validate = {}
            validate['fin_pair'] = os.path.join(f2out, '%d.txt' % cv)
            validate['dir_in'] = dirout_feature
            onehot = False

            # entry(f3dirout, train, dirout_feature, model_type=Param.CNN1D_OH, limit=0, onehot=onehot, kernel_size=90,
            #       epochs=80,
            #       # epochs=2,
            #       filters=300, batch_size=500, validate=validate,
            #       fin_model=fin_model)

            # entry(f3dirout, train, dirout_feature, model_type=Param.TRANSFORMER, limit=10, onehot=onehot, kernel_size=90,
            #       # epochs=80,
            #       epochs=2,
            #       filters=300, batch_size=500, validate=validate,
            #       fin_model=fin_model)
            #
            #
            oldfile = str(elem)
        #     # print(f3dirout)
        # calculateResults(f2dirout, f2dirout, resultfilename='result.csv')

        # eachdir = 'benchmark'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
        # print('testing the model on test dataset')
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # fin_test = 'file/4train/5CV/data/test.txt'
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_test/%d'%cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test,batch_size=500,limit=2000,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/predict
        '''
        # fin_test = 'file/8DIPPredict/predict/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/data
        '''
        # fin_test = 'file/8DIPPredict/data/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP_posi/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on  all.txt in Imex
        '''
        # fin_test = 'file/8ImexPredict/4pair.tsv'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'IMEx'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_IMEx_posi/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        IMEx + - 
        '''
        # fin_test = 'file/8ImexPredict/predict/0/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'IMEx'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_IMEx/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/data/Human
        '''
        # for eachfile in ['Ecoli', 'Human', 'Mus', 'SC']:
        #     fin_test = 'file/8DIPPredict/data/%s/2pair.tsv'%eachfile
        #     dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        #     fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        #     dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP_%s/%d' % (eachfile,cv)
        #     check_path(dirout_result_test)
        #     savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
Example #28
    #
    # fin_pair = 'file/8ImexPredict/data/2pair.tsv'
    # fin_fasta = 'file/8ImexPredict/data/2pair.fasta'
    # eachdir = 'IMEx'
    # dir_feature_db = '/home/19jjhnenu/Data/Phsi_Blos/featuredb/%s/' % eachdir
    # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    # # getFeature(fin_pair, fin_fasta, dir_feature_db, dirout_feature)
    '''
    train
    '''
    eachdir = 'benchmark'
    dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    dir_in = dirout_feature
    dirout = '/home/19jjhnenu/Data/Phsi_Blos/result/%s/' % eachdir
    check_path(dirout)
    validate = {}
    validate['fin_pair'] = 'file/4train/0/validate.txt'
    validate['dir_in'] = dir_in
    onehot = False

    import os
    import tensorflow as tf

    gpu_id = '0,1,2,3'
    # gpu_id = '4,5,6,7'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    os.system('echo $CUDA_VISIBLE_DEVICES')

    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
Example #29
    SC
    uniprotPairFromImex
    '''
    # # mpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0472_189043.list'
    # tmpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0812_131609.list'
    # spf = '/home/jjhnenu/data/PPI/release/criteria/allcession_soluble_614454.list'
    # finPair = '/home/jjhnenu/data/PPI/release/otherdata/Ecoli/2Ecoli20170205_id_pair_12246.txt'
    # foutPair = '/home/jjhnenu/data/PPI/release/otherdata/Ecoli/TMP_SP/TMP_SP.txt'
    # _path,_fname = os.path.split(foutPair)
    # check_path(_path)
    # getTmp_SpPair(tmpf,spf,finPair, foutPair,type1='TMP',type2='SP',crossover = True)

    '''
    count pair
    '''
    # dirout_pair = r'E:\githubCode\BioComputeCodeStore\JiangJiuhong\data\PPI\release\otherdata\DIP\_TMP_nonTMP_qualified_drop_positive'
    # check_path(dirout_pair)
    # for eachfile in os.listdir(dirout_pair):
    #     fout_pair = os.path.join(dirout_pair, eachfile)
    #     countpair(fout_pair)
    '''
    imex 2020 0708
    '''

    tmpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0812_131609.list'
    spf = '/home/jjhnenu/data/PPI/release/criteria/allcession_soluble_614454.list'
    finPair = '/home/jjhnenu/data/PPI/release/otherdata/uniprotPiarFromImex20200709/getpair15955.csv'
    foutPair = '/home/jjhnenu/data/PPI/release/otherdata/uniprotPiarFromImex20200709/TMP_SP/TMP_SP.txt'
    _path,_fname = os.path.split(foutPair)
    check_path(_path)
    getTmp_SpPair(tmpf,spf,finPair, foutPair,type1='TMP',type2='SP',crossover = True)
Example #30
def main(limit=5):
    # if __name__ == '__main__':
    #     limit = 0
    # limit = 5
    onehot = False
    '''
    load feature
    '''
    print('Loading feature...')
    '''
    pssm
    '''
    # model_type = Param.CNN2D
    # tpssmd = TmpPSSMData()
    # # (x_train, y_train), (x_test, y_test) = tpssmd.loadPAN(inPDir, inNDir, limit=limit)
    #
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/pssm_feature_2D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm_feature_2D/p_fp_fw_2_1_1/all'
    # check_path(fout)
    # (x_train, y_train), (x_test, y_test) = tpssmd.load(fin_pair,dir_in,limit=limit)
    '''
    seq1D
    '''
    # model_type = Param.CNN1D
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/seq_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_1D/p_fp_fw_2_1_1/all/'
    '''
    seq1D onehot
    '''
    # model_type = Param.CNN1D_OH
    # onehot = True
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/seq_feature_1D/p_fp_fw_2_1_1/all'
    # fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_1D_onehot/p_fp_fw_2_1_1/all/'
    '''
    test seq2D
    Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
    2020-04-30 21:41:50.315008: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
    2020-04-30 21:41:50.315029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly
    '''
    onehot = True
    model_type = Param.CNN2D
    fin_pair = '/home/jjhnenu/data/PPI/release/pairdata/p_fp_fw_2_1_1/all.txt'
    dir_in = '/home/jjhnenu/data/PPI/release/feature/seq_feature_2D/p_fp_fw_2_1_1/all/'
    fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_2D/p_fp_fw_2_1_1/all/'
    '''
    pssm hstack
    '''
    # model_type = Param.CNN2D
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/pssm_feature_2D_hstack/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm_feature_2D_hstack/p_fp_fw_2_1_1/all/'
    '''
    pssm 400
    '''
    # model_type = Param.CNN1D
    # fin_pair =  '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in =  '/home/jjhnenu/data/PPI/release/feature/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm400_feature_1D/p_fp_fw_2_1_1/all/'

    check_path(fout)
    (x_train, y_train), (x_test, y_test) = BaseData().load(fin_pair,
                                                           dir_in,
                                                           limit=limit,
                                                           onehot=onehot)
    print('Build and fit model...')
    print('x_train.shape[1:]', x_train.shape[1:])
    mm = MyModel(
        model_type=model_type,
        input_shape=x_train.shape[1:],
        filters=250,
        kernel_size=3,
        pool_size=2,
        hidden_dims=250,
        batch_size=100,
        epochs=80,
    )
    mm.process(fout, x_train, y_train, x_test, y_test)
    print('save result to %s' % fout)