Example #1
    if args.loc_out != '':
        if not (os.path.isdir(args.loc_out)):
            os.makedirs(args.loc_out)
            log.critical("created output folder  %r", args.loc_out)

    config = configparser.RawConfigParser()
    config.read(os.path.join(os.path.dirname(
        os.path.realpath(sys.argv[0])), 'moff_setting.properties'))

    # For Galaxy input only: it is possible to use one big input file plus a list of raw files.
    # The big file must contain the results for each raw file, and the column 'Spectrum File' must be available.
    # This option works only with PS reports, using only --tsv_list and --raw_list.
    if (args.tsv_list is not None) and (args.raw_list is not None) and (len(args.tsv_list) == 1):
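        # read the single PS report; below it is split into one file per raw run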
        data_temp = pd.read_csv(args.tsv_list[0], sep="\t")
        if moff.check_ps_input_data(data_temp.columns.tolist(), ast.literal_eval(config.get('moFF', 'ps_default_export_v1'))) == 1:
            # split the input file only if there is more than ONE raw file and it contains identifications for more than ONE run
            if len(data_temp['Spectrum File'].unique()) > 1 and len(args.raw_list) > 1:

                output_list_loc = []
                for file in data_temp['Spectrum File'].unique():
                    out_loc = os.path.join(os.path.split(args.tsv_list[0])[0], file.split('.')[0] + '.txt')
                    data_temp[data_temp['Spectrum File'] == file].to_csv(out_loc, sep='\t', index=False)
                    output_list_loc.append(out_loc)

                if len(args.raw_list) != len(output_list_loc):
                    exit('-- Number of raw files differs from the number of input sources detected in your single input file --')
                # sort them to ensure the association between input and raw files
                args.raw_list = sorted(args.raw_list)
                args.tsv_list = sorted(output_list_loc)
                # clean up the dataset that is no longer used
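
A toy illustration (not from the source) of the per-run split performed above, assuming a PS-style report with a 'Spectrum File' column:

import pandas as pd

df = pd.DataFrame({'Spectrum File': ['a.raw', 'a.raw', 'b.raw'],
                   'rt': [10.1, 12.3, 11.0]})
for f in df['Spectrum File'].unique():
    # one identification file per raw run, named after the run
    df[df['Spectrum File'] == f].to_csv(f.split('.')[0] + '.txt', sep='\t', index=False)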
Example #2
def run_mbr(args):
    """
    Matching Between Run module.
    :param args:
    :return:
    """
    ch = logging.StreamHandler()
    ch.setLevel(logging.ERROR)
    log.addHandler(ch)

    if args.loc_in is None:
        # the user used the --inputtsv option
        if not (args.loc_out is None):
            # if the user uses --output_folder, the mbr folder is created there
            output_dir = os.path.join(args.loc_out, 'mbr_output')
        else:
            # otherwise the mbr folder is created in the moFF path location
            output_dir = os.path.join('mbr_output')
            print(os.path.abspath(output_dir))

    else:
        # the user used the --inputF option
        if os.path.exists(os.path.join(args.loc_in)):
            output_dir = os.path.join(args.loc_in, 'mbr_output')
        else:
            exit(
                os.path.join(args.loc_in) +
                ' EXIT: input folder path is not well specified --> / missing or wrong path'
            )

    if not os.path.isdir(output_dir):
        log.critical("Created MBR output folder in: %s",
                     os.path.abspath(output_dir))
        os.makedirs(output_dir)
    else:
        log.critical("MBR output folder in: %s", os.path.abspath(output_dir))
    # set log to file
    w_mbr = logging.FileHandler(os.path.join(output_dir,
                                             args.log_label + '_mbr_.log'),
                                mode='w')
    w_mbr.setLevel(logging.INFO)
    log.addHandler(w_mbr)
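    # from here on, INFO-level (and above) records are also written to the per-run log file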

    moff_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    config = configparser.RawConfigParser()
    config.read(os.path.join(moff_path, 'moff_setting.properties'))

    # the settings file is always placed in the same folder as moff_mbr.py
    # read the input
    # names of the input files
    exp_set = []
    # list of the input dataframes
    exp_t = []
    # list of the output dataframes
    exp_out = []
    # list of input dataframes used as helpers
    exp_subset = []
    # list of the names of the mbr outputs
    exp_out_name = []

    if args.loc_in is None:
        for id_name in args.tsv_list:
            exp_set.append(id_name)
    else:
        for item in os.listdir(args.loc_in):
            if os.path.isfile(os.path.join(args.loc_in, item)):
                if os.path.join(args.loc_in, item).endswith('.' + args.ext):
                    log.critical(item)
                    exp_set.append(os.path.join(args.loc_in, item))

    # the sample option is valid only if the input folder option is valid
    if (args.sample is not None) and (args.loc_in is not None):
        exp_set_app = copy.deepcopy(exp_set)
        for a in exp_set:
            if re.search(args.sample, a) is None:
                exp_set_app.remove(a)
        exp_set = exp_set_app

    if (exp_set == []) or (len(exp_set) == 1):
        exit('ERROR: input files not found or only one input file selected. Check the folder or the extension given in input')

    for a in exp_set:
        log.critical('Reading file: %s ', a)
        exp_subset.append(a)
        data_moff = pd.read_csv(a, sep="\t", header=0)
        list_name = data_moff.columns.values.tolist()
        # get the list of PS default columns from the properties file
        list_ps_def = ast.literal_eval(
            config.get('moFF', 'ps_default_export_v1'))
        # check whether the input file is a PS export; if so, map it to the moFF column names
        if moff.check_ps_input_data(list_name, list_ps_def) == 1:
            log.critical(
                'Detected input file from PeptideShaker export: %s', a)
            # map the column names according to moFF input requirements
            data_moff, list_name = moff.map_ps2moff(data_moff,
                                                    'col_must_have_mbr')
            log.critical(
                'Mapping column names to the moFF requested column names: %s',
                a)
        if moff.check_columns_name(
                list_name,
                ast.literal_eval(config.get('moFF', 'col_must_have_mbr')),
                log) == 1:
            exit('ERROR: minimal requested fields are missing or wrong')
        data_moff['matched'] = 0
        data_moff['mass'] = data_moff['mass'].map('{:.4f}'.format)

        data_moff['code_unique'] = data_moff['mod_peptide'].astype(str)  # + '_' + data_moff['mass'].astype(str)
        data_moff = data_moff.sort_values(by='rt')
        exp_t.append(data_moff)
        exp_out.append(data_moff)

    log.critical('Read input --> done')
    # number of replicates
    # list of mandatory fields:
    # ['matched','peptide','mass','mz','charge','prot','rt']
    n_replicates = len(exp_t)
    exp_set = exp_subset
    aa = range(0, n_replicates)
    out = list(itertools.product(aa, repeat=2))
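    # all ordered replicate pairs (i, j); for each jj, only pairs (jj, k) with k != jj are used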
    # add the matched column
    list_name.append('matched')
    # final status: -1 if one of the outputs is empty
    out_flag = 1
    # extra fields that are not part of the core moFF input
    diff_field = np.setdiff1d(exp_t[0].columns, [
        'matched', 'mod_peptide', 'peptide', 'mass', 'mz', 'charge', 'prot',
        'rt'
    ])

    log.info('Outlier filtering is %s',
             'active' if args.out_flag else 'not active')
    log.info('Number of replicates %i', n_replicates)
    log.info('Pairwise model computation ----')

    if args.rt_feat_file is not None:
        log.critical('Custom peptide list provided by the user in %s',
                     args.rt_feat_file)
        shared_pep_list = pd.read_csv(args.rt_feat_file, sep='\t')
        shared_pep_list['mass'] = shared_pep_list['mass'].map('{:.4f}'.format)
        shared_pep_list['code'] = shared_pep_list['peptide'].astype(
            str) + '_' + shared_pep_list['mass'].astype(str)
        list_shared_pep = shared_pep_list['code']
        log.info('Custom peptide list contains %i peptides',
                 list_shared_pep.shape[0])

    for jj in aa:
        # list of the saved models
        model_save = []
        # list of the model errors in minutes (or seconds)
        model_err = []
        # model statuses; -1 means no model could be built because of too few training points
        model_status = []
        c_rt = 0
        pre_pep_save = []
        log.info('matching in %s', exp_set[jj])
        # keep only the pairs with i[0] == jj and i[1] != jj
        result = itertools.filterfalse(lambda x: x[0] != jj or x[1] == jj, out)
        for i in result:
            if args.rt_feat_file is not None:
                # use the custom peptide list
                comA = exp_t[i[0]][exp_t[i[0]]['code_unique'].isin(
                    list_shared_pep)][['code_unique', 'peptide', 'prot', 'rt']]
                comB = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(
                    list_shared_pep)][['code_unique', 'peptide', 'prot', 'rt']]
                comA = comA.groupby('code_unique', as_index=False).mean()
                comB = comB.groupby('code_unique', as_index=False).mean()
                common = pd.merge(comA, comB, on=['code_unique'], how='inner')
            else:
                # use the shared peptides
                log.info('Matching %s peptides, searching in %s',
                         exp_set[i[0]], exp_set[i[1]])
                list_pep_repA = exp_t[i[0]]['code_unique'].unique()
                list_pep_repB = exp_t[i[1]]['code_unique'].unique()
                log.info('Peptide unique (mass + sequence) %i , %i ',
                         list_pep_repA.shape[0], list_pep_repB.shape[0])
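                # peptides identified in replicate B but not in A: candidates to transfer into A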
                set_dif_s_in_1 = np.setdiff1d(list_pep_repB, list_pep_repA)
                add_pep_frame = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(
                    set_dif_s_in_1)].copy()
                # -- prepare the testing set
                add_pep_frame = add_pep_frame[[
                    'peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot',
                    'rt'
                ]]
                # add_pep_frame['code_unique'] = '_'.join([add_pep_frame['peptide'], add_pep_frame['prot'], add_pep_frame['mass'].astype(str), add_pep_frame['charge'].astype(str)])
                add_pep_frame['code_unique'] = add_pep_frame['mod_peptide'] + '_' + \
                                               add_pep_frame['prot'] + '_' + '_' + \
                                               add_pep_frame['charge'].astype(str)
                add_pep_frame = add_pep_frame.groupby(
                    'code_unique', as_index=False)[[
                        'peptide', 'mod_peptide', 'mass', 'charge', 'mz',
                        'prot', 'rt'
                    ]].aggregate(max)
                add_pep_frame = add_pep_frame[[
                    'code_unique', 'peptide', 'mod_peptide', 'mass', 'mz',
                    'charge', 'prot', 'rt'
                ]]
                list_name = add_pep_frame.columns.tolist()
                list_name = [
                    w.replace('rt', 'rt_' + str(c_rt)) for w in list_name
                ]
                add_pep_frame.columns = list_name
                pre_pep_save.append(add_pep_frame)
                c_rt += 1
                pep_shared = np.intersect1d(list_pep_repA, list_pep_repB)
                log.info('  Peptides (mass + sequence) added: %i',
                         add_pep_frame.shape[0])
                log.info('  Peptides (mass + sequence) shared: %i',
                         pep_shared.shape[0])
                comA = exp_t[i[0]][exp_t[i[0]]['code_unique'].isin(
                    pep_shared)][['code_unique', 'peptide', 'prot', 'rt']]
                comB = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(
                    pep_shared)][['code_unique', 'peptide', 'prot', 'rt']]
                # filtering using the variance (added 17_08)
                flag_var_filt = False
                if flag_var_filt:
                    dd = comA.groupby('code_unique', as_index=False)
                    top_res = dd.agg(['std', 'mean', 'count'])
                    th = np.nanpercentile(top_res['rt']['std'].values, 60)
                    comA = comA[~comA['code_unique'].isin(top_res[
                        top_res['rt']['std'] > th].index)]
                    # data B
                    dd = comB.groupby('code_unique', as_index=False)
                    top_res = dd.agg(['std', 'mean', 'count'])
                    th = np.nanpercentile(top_res['rt']['std'].values, 60)
                    comB = comB[~comB['code_unique'].isin(top_res[
                        top_res['rt']['std'] > th].index)]

                comA = comA.groupby('code_unique', as_index=False).mean()
                comB = comB.groupby('code_unique', as_index=False).mean()
                common = pd.merge(comA, comB, on=['code_unique'], how='inner')
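                # after the merge, rt_x / rt_y hold the mean RT of each shared peptide in replicates A and B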
            if common.shape[0] <= 10 and args.rt_feat_file is not None:
                model_status.append(-1)
                continue
            else:
                # outlier filtering option
                if args.out_flag:
                    filt_x, filt_y, pos_out = MD_removeOutliers(
                        common['rt_y'].values, common['rt_x'].values,
                        args.w_filt)
                    data_B = filt_x
                    data_A = filt_y
                    data_B = np.reshape(data_B, [filt_x.shape[0], 1])
                    data_A = np.reshape(data_A, [filt_y.shape[0], 1])
                    log.info('Outliers found: %i out of %i', pos_out.shape[0],
                             common['rt_y'].shape[0])
                else:
                    data_B = common['rt_y'].values
                    data_A = common['rt_x'].values
                    data_B = np.reshape(data_B, [common.shape[0], 1])
                    data_A = np.reshape(data_A, [common.shape[0], 1])

                log.info('Training size of shared peptides: %i %i',
                         data_A.shape[0], data_B.shape[0])
                clf = linear_model.RidgeCV(alphas=np.power(
                    2, np.linspace(-30, 30)),
                                           scoring='neg_mean_absolute_error')
                clf.fit(data_B, data_A)
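                # refit a plain Ridge using the alpha selected by RidgeCV cross-validation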
                clf_final = linear_model.Ridge(alpha=clf.alpha_)
                clf_final.fit(data_B, data_A)
                # save the model
                model_save.append(clf_final)
                model_err.append(
                    mean_absolute_error(data_A, clf_final.predict(data_B)))
                log.info(
                    'Mean absolute error on training: %4.4f sec',
                    mean_absolute_error(data_A, clf_final.predict(data_B)))
                model_status.append(1)
                '''
                # GP version (alternative model, kept for reference)
                model_gp, predicted_train, error = train_gp(data_A, data_B, c=str(i[0]) + '_' + str(i[1]))
                model_err.append(error)
                model_save.append(model_gp)
                model_status.append(1)
                '''
        if np.where(np.array(model_status) == -1)[0].shape[0] >= (len(aa) / 2):
            log.error(
                'MBR aborted: mbr cannot be run, not enough shared peptides among the replicates'
            )
            exit(
                'ERROR: mbr cannot be run, not enough shared peptides among the replicates'
            )

        log.info('Combination of the models --------')
        log.info('Model combination: %s',
                 'Weighted' if args.w_comb else 'Unweighted')
        if n_replicates == 2:
            test = pre_pep_save[0]
        else:
            test = reduce(
                lambda left, right: pd.merge(
                    left,
                    right,
                    on=[
                        'code_unique', 'peptide', 'mod_peptide', 'mass', 'mz',
                        'charge', 'prot'
                    ],
                    how='outer'), pre_pep_save)
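        # collapse duplicate code_unique rows, keeping the max of each column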
        test = test.groupby('code_unique', as_index=False).aggregate(max)
        test.drop('code_unique', axis=1, inplace=True)
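        # columns 6 onward hold rt_0 .. rt_(n_replicates - 2), the RTs observed in the other replicates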
        test['time_pred'] = test.iloc[:, 6:(6 + (n_replicates - 1))].apply(
            lambda x: combine_model(x, model_save, model_err, args.w_comb),
            axis=1)
        #test['time_pred'] = test.iloc[:, 6: (6 + (n_replicates - 1))].apply(
        #    lambda x: combine_model(x, model_save[(jj * (n_replicates - 1)):((jj + 1) * (n_replicates - 1))],
        #                            model_err[(jj * (n_replicates - 1)):((jj + 1) * (n_replicates - 1))], args.w_comb),
        #    axis=1)
        test['matched'] = 1
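        # transferred peptides are flagged matched == 1; MS2-identified rows keep matched == 0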

        # TODO: check this more carefully
        if test[test['time_pred'] <= 0].shape[0] >= 1:
            log.info(' -- Predicted negative RTs: those peptides will be deleted')
            test = test[test['time_pred'] > 0]

        list_name = test.columns.tolist()
        list_name = [w.replace('time_pred', 'rt') for w in list_name]
        test.columns = list_name

        test = test[[
            'peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot', 'rt',
            'matched'
        ]]
        # fill the missing fields with NaN
        for field in diff_field.tolist():
            test[field] = np.nan
        log.info('Before adding, %s contains %i peptides', exp_set[jj],
                 exp_t[jj].shape[0])
        exp_out[jj] = pd.concat([exp_t[jj], test],
                                join='outer',
                                axis=0,
                                sort=False)
        log.info('After MBR, %s contains %i peptides', exp_set[jj],
                 exp_out[jj].shape[0])
        log.critical('matched features %i, MS2 features %i',
                     exp_out[jj][exp_out[jj]['matched'] == 1].shape[0],
                     exp_out[jj][exp_out[jj]['matched'] == 0].shape[0])
        exp_out[jj].to_csv(path_or_buf=os.path.join(
            output_dir,
            os.path.split(exp_set[jj])[1].split('.')[0] + '_match.txt'),
                           sep='\t',
                           index=False)
        exp_out_name.append(
            os.path.join(
                output_dir,
                os.path.split(exp_set[jj])[1].split('.')[0] + '_match.txt'))
        if exp_out[jj].shape[0] > 0:
            out_flag = 1 * out_flag
        else:
            out_flag = -1 * out_flag

    w_mbr.close()
    log.removeHandler(w_mbr)
    return out_flag, exp_out_name
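
A minimal driver sketch (not from the source), assuming an argparse namespace carrying the fields run_mbr reads; the values below are hypothetical:

import argparse

args = argparse.Namespace(
    loc_in='input_folder',        # folder containing the .txt identification files
    loc_out=None, tsv_list=None,
    ext='txt', sample=None, log_label='run1',
    rt_feat_file=None,            # no custom peptide list
    out_flag=True, w_filt=2.0,    # outlier filtering on; hypothetical filter width
    w_comb=False,                 # unweighted model combination
)
status, match_files = run_mbr(args)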
Example #4
        else:
            raw_list = None

        loc_raw = args.raw_repo if args.raw_repo is not None else raw_list  # fall back to the explicit raw-file list
        loc_output = args.loc_out

        config = configparser.RawConfigParser()
        config.read(
            os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                         'moff_setting.properties'))
        df = pd.read_csv(file_name, sep="\t")
        # TODO: add some safety checks (len > 1)
        # flag for the PRIDE pipeline, or to switch the input RT time scale from seconds to minutes
        moff_pride_flag = False
        if moff.check_ps_input_data(
                df.columns.tolist(),
                ast.literal_eval(config.get('moFF', 'moffpride_format'))) == 1:
            # if it is moffPride data, no other requirement is checked
            log.critical('moffPride input detected')
            moff_pride_flag = True
        else:
            if 'matched' not in df.columns:
                # check if it is a PS file
                list_name = df.columns.values.tolist()
                # get the list of PS default columns from the properties file
                list_ps_def = ast.literal_eval(
                    config.get('moFF', 'ps_default_export_v1'))
                # check whether the input file is a PS export; if so, map it to the moFF column names
                if moff.check_ps_input_data(list_name, list_ps_def) == 1:
                    # map the column names according to moFF input requirements
                    if not args.peptide_summary: