def expmat_construction(exp_file, exp_paramlist, charge_list):
    """Build the smoothed, rescaled m/z-by-time intensity matrix of one run.

    The pickled scan table is loaded, restricted to scans whose ``starttime``
    lies in ``[gradient_starttime, gradient_endtime)``, and every scan's
    peaks (base peak prepended to the peak arrays) are binned with
    ``mz_index`` / ``time_index``.  The per-scan intensity histograms are
    assembled into a sparse ``(MZ_SCALE, TIME_SCALE)`` matrix which is then
    passed through ``smoothingtime_mat`` and ``rescale_mat``.

    Args:
        exp_file: Path of a pickled DataFrame providing the columns ``ind``,
            ``mslev``, ``bpmz``, ``bpint``, ``starttime``, ``mzarray`` and
            ``intarray``.
        exp_paramlist: Parameter table; entry ``[1]`` must unpack into
            ``(mm, mrange_min, mrange_max, mz_range, MZ_SCALE, tt,
            gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE,
            window, shift)``.
        charge_list: Not used by this function; kept for call compatibility.

    Returns:
        Tuple ``(exp_mat, exp_paramlist, mat_mean)`` where ``exp_mat`` and
        ``mat_mean`` are the two values returned by ``rescale_mat`` and
        ``exp_paramlist`` is handed back unchanged.
    """
    mslev = 1
    # The MS1 parameter row only needs to be unpacked once.
    (mm, mrange_min, mrange_max, mz_range, MZ_SCALE,
     tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE,
     window, shift) = exp_paramlist[mslev]
    # Single pre-formatted string: prints the same text on Python 2 and 3.
    print(' from: %s  using mass spectrogram from %s to %s minutes'
          % (exp_file, gradient_starttime, gradient_endtime))

    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files from a trusted source.
    exp_df = pd.read_pickle(exp_file)
    # transform the header columns from str to numeric
    for each in ['ind', 'mslev', 'bpmz', 'bpint', 'starttime']:
        exp_df[each] = pd.to_numeric(exp_df[each])

    # drop out of range time; copy so later column assignments do not write
    # onto a slice of the loaded frame
    in_window = ((exp_df['starttime'] >= gradient_starttime)
                 & (exp_df['starttime'] < gradient_endtime))
    exp_df = exp_df[in_window].copy()

    # prepend the base peak to its array: one list of values per scan
    for bp, ar, combine in zip(['bpmz', 'bpint'], ['mzarray', 'intarray'],
                               ['allmz', 'allint']):
        exp_df[combine] = exp_df[bp].apply(lambda x: [x]) + exp_df[ar]

    ## Create index
    exp_df['starttime'] = time_index(exp_df['starttime'], gradient_starttime,
                                     tt)
    exp_df['allmz'] = mz_index(exp_df['allmz'].values, mrange_min, mrange_max,
                               mm)

    time_col = []
    for mz_idx, intensities in zip(exp_df['allmz'], exp_df['allint']):
        # remove peaks whose m/z bin falls outside [0, MZ_SCALE)
        kept_int = [
            i for m, i in zip(mz_idx, intensities) if 0 <= m < MZ_SCALE
        ]
        kept_mz = [m for m in mz_idx if 0 <= m < MZ_SCALE]
        # bincount sums the intensities landing in the same m/z bin and
        # yields one column of length MZ_SCALE for this scan
        timecol_array = np.bincount(kept_mz, kept_int, minlength=MZ_SCALE)
        # summed intensities below 1 are zeroed out
        timecol_array[timecol_array < 1] = 0
        time_col.append(timecol_array)

    # Dense coordinate lists: every scan contributes MZ_SCALE entries, all in
    # the column given by its time index.
    expdf_row = np.tile(np.arange(MZ_SCALE), len(time_col))
    expdf_col = np.repeat(exp_df['starttime'].values, MZ_SCALE)
    expdf_value = np.concatenate(time_col)

    exp_mat = sparse.coo_matrix((expdf_value, (expdf_row, expdf_col)),
                                shape=(MZ_SCALE, TIME_SCALE))

    exp_mat = smoothingtime_mat(exp_mat, window, shift)
    exp_mat, mat_mean = rescale_mat(exp_mat)
    return exp_mat, exp_paramlist, mat_mean
def refMS1_construction(refms1_df, M_header, iso_header, charge_list,
                        iso_maxnumber, globalparam_list, eps):
    """Build the sparse MS1 reference matrix (m/z bins x peptide ions).

    Each row of ``refms1_df`` is expanded into one row per charge-state
    column listed in ``M_header``; for every resulting peptide ion the m/z of
    each isotope in ``iso_header`` is binned with ``mz_index`` and its
    abundance is written into that ion's column.  Columns are L1-normalised
    through ``normalize(..., norm='l1', axis=0)``.

    Args:
        refms1_df: DataFrame sorted by ``prot`` with the columns ``prot``,
            ``pept``, ``mod``, ``modpept``, ``rtpeak``, every name in
            ``iso_header`` and every name in ``M_header``.
        M_header: List of the per-charge m/z column names; the last character
            of each name is read as the charge.
        iso_header: List of the isotope-abundance column names.
        charge_list: Not used by this function; kept for call compatibility.
        iso_maxnumber: Isotopes per ion.  When ``> 1`` all ``iso_header``
            columns are used, otherwise only ``isoab0``.
        globalparam_list: ``[0]`` holds the parameter names, ``[1]`` the MS1
            values at the same positions; ``mm``, ``mrange_min``,
            ``mrange_max`` and ``MZ_SCALE`` are read.
        eps: Not used by this function; kept for call compatibility.

    Returns:
        Tuple ``(sreference, mziso_df, prot_peptcount, pept_ioncount)``: the
        normalised reference matrix, the per-ion table, and the ion counts
        per ``prot`` and per ``pept`` (column ``ms1count``).

    Raises:
        SystemExit: If ``prot`` is not sorted, or if the rebuilt row index is
            not ``0..n-1``.
    """
    mslev = 1
    param_names = globalparam_list[0]
    param_values = globalparam_list[mslev]
    mm = param_values[param_names.index('mm')]
    mrange_min = param_values[param_names.index('mrange_min')]
    mrange_max = param_values[param_names.index('mrange_max')]
    MZ_SCALE = param_values[param_names.index('MZ_SCALE')]

    # A single out-of-place row already means the table is unsorted, so the
    # test has to be ``any`` (``all`` only fired when every position differed).
    if np.any(refms1_df.prot.values
              != refms1_df.sort_values('prot').prot.values):
        print('Warning: Prot is not alphabetically sorted')
        # Still SystemExit, but with a failing status instead of exit()'s 0.
        raise SystemExit(1)

    mziso_df = refms1_df.rename_axis('pept_id')
    print(mziso_df.columns)
    # melt/pivot Mheader (all charge) into 'variable' col and its mz value
    # into 'value' col, so every line is one peptide ion (one charge state)
    # carrying its isotope abundances: .melt([columns to keep], [to unpivot])
    mziso_df = mziso_df.reset_index().melt(
        ['prot', 'pept_id', 'pept', 'mod', 'modpept', 'rtpeak'] + iso_header,
        M_header)
    mziso_df = mziso_df.rename(columns={'value': 'mz', 'variable': 'charge'})
    # NOTE(review): only the last character is read, so a header naming a
    # charge >= 10 would be truncated -- confirm against the header format.
    mziso_df['charge'] = mziso_df['charge'].str[-1].astype(int)
    mziso_df['mod'] = mziso_df['mod'].fillna('')
    # drop NA, sort same pept_id (same prot) up from small charge first
    mziso_df = mziso_df.dropna().sort_values(['pept_id', 'charge'])
    # renumber rows 0..n-1 in sorted order; the index becomes the matrix
    # column of each ion
    mziso_df = mziso_df.reset_index(drop=True)

    if not np.array_equal(mziso_df.index.values, np.arange(len(mziso_df))):
        print('Warning mziso_df reindex is wrong')
        print(mziso_df.index.values)
        raise SystemExit(1)

    prot_peptcount = mziso_df['prot'].value_counts().sort_index().to_frame(
        name='ms1count')
    peptcount = prot_peptcount.values.sum()
    pept_ioncount = mziso_df['pept'].value_counts().sort_index().to_frame(
        name='ms1count')

    charge = mziso_df['charge'].values
    mzidx_header = []
    for idx, iso_head in enumerate(iso_header):
        # float(idx) keeps the isotope spacing a true division even where
        # int / int-array would truncate (Python 2 without __future__).
        mziso_df['mz_' + iso_head] = mziso_df['mz'].values + (
            float(idx) / charge)
        mziso_df['mzidx_' + iso_head] = mz_index(
            mziso_df['mz_' + iso_head].values, mrange_min, mrange_max, mm)
        mzidx_header.append('mzidx_' + iso_head)

    mziso_df['final_mzidx'] = mziso_df[mzidx_header].values.tolist()
    mziso_df['final_normisoab'] = mziso_df[iso_header].values.tolist()

    if iso_maxnumber > 1:
        mziso_df_col = np.repeat(mziso_df.index.values, iso_maxnumber)
        mziso_df_row = np.concatenate(mziso_df['final_mzidx'].values)
        mziso_df_value = np.concatenate(mziso_df['final_normisoab'].values)
    else:  # if no isotope
        mziso_df_col = np.array(mziso_df.index.values)
        mziso_df_row = np.array(mziso_df['mzidx_isoab0'].values)
        mziso_df_value = np.array(mziso_df['isoab0'].values)

    # Keep entries with a positive abundance and an m/z bin inside
    # [0, MZ_SCALE).  The upper bound mirrors the range filter that
    # expmat_construction applies to mz_index output; without it coo_matrix
    # rejects any index beyond the declared shape.
    keep = ((mziso_df_row >= 0) & (mziso_df_row < MZ_SCALE)
            & (mziso_df_value > 0))
    sreference = sparse.coo_matrix(
        (mziso_df_value[keep], (mziso_df_row[keep], mziso_df_col[keep])),
        shape=(MZ_SCALE, peptcount))

    sreference = normalize(sreference, norm='l1', axis=0)
    return sreference, mziso_df, prot_peptcount, pept_ioncount