# NOTE(review): the line below is a whitespace-collapsed script fragment. Its
# original line breaks and indentation are lost, and it opens part-way through
# an if/elif chain whose head is not visible here.
# Steps visible in the text, in order:
#   1. Load `file` into `df` with scimpute.read_csv / scimpute.read_hd5. The
#      leading '.hd5' branch transposes the result; the 'gene_row' branch does
#      not. An unsupported extension or an unknown `matrix_mode` raises
#      Exception.
#   2. Print df.shape, scimpute.nnzero_rate_df(df) rounded to 3 places, and a
#      small corner of the matrix.
#   3. Keep rows whose row sum is >= gene_min and columns whose column sum is
#      >= cell_min, print the same summary for the result, and write it with
#      scimpute.save_hd5 to tag0 + '.hd5'.
# NOTE(review): `read_per_gene` directly follows a '#' in the collapsed text
# ("# filter # read_per_gene = ...") but is used by the `.loc` filter;
# presumably the assignment was live code on its own line -- confirm.
# NOTE(review): `DataFrame.ix` was removed in pandas 1.0; these previews will
# need `.iloc` (or `.loc`) on current pandas -- confirm the intended slice.
elif file.endswith('.hd5'): df = scimpute.read_hd5(file).transpose() else: raise Exception('file extension error: not hd5/csv') elif matrix_mode == 'gene_row': if file.endswith('.csv'): df = scimpute.read_csv(file) elif file.endswith('.hd5'): df = scimpute.read_hd5(file) else: raise Exception('file extension error: not hd5/csv') else: raise Exception('cmd err in the argv[2]') # summary nz_rate_df = scimpute.nnzero_rate_df(df) print('input matrix.shape:', df.shape) print('nz_rate: {}'.format(round(nz_rate_df, 3))) print(df.ix[0:3, 0:3]) # filter # read_per_gene = df.sum(axis=1) read_per_cell = df.sum(axis=0) df_filtered = df.loc[(read_per_gene >= gene_min), (read_per_cell >= cell_min)] nz_rate_filtered = scimpute.nnzero_rate_df(df_filtered) print('filtered matrix : ', df_filtered.shape) print('nz_rate:', nz_rate_filtered) print(df_filtered.ix[0:3, 0:3]) scimpute.save_hd5(df_filtered, tag0 + '.hd5') # histogram of filtered data
# NOTE(review): the line below is a whitespace-collapsed script fragment. Its
# original line breaks and indentation are lost, and it opens part-way through
# an if/elif chain whose head is not visible here.
# Steps visible in the text, in order:
#   1. Load `file` into `df` with scimpute.read_csv / scimpute.read_hd5. The
#      leading '.hd5' branch transposes the result; the 'gene_row' branch does
#      not. An unsupported extension or an unknown `matrix_mode` raises
#      Exception.
#   2. Print df.shape (labelled "[gene, cell]"), scimpute.nnzero_rate_df(df)
#      rounded to 3 places, and a small corner of the matrix.
#   3. Replace df with 10 ** df - 1 via np.power and print the top-left 3x3.
#      Presumably this undoes a log10(x + 1) transform -- confirm upstream.
#   4. Pass df through scimpute.df_normalization (described in the text as
#      "lib-size per million normalization"; the helper is not visible here)
#      and print a small corner again.
#   5. Compute row sums (read_per_gene) and column sums (read_per_cell).
# NOTE(review): `DataFrame.ix` was removed in pandas 1.0; the fragment already
# uses `.iloc[:3, :3]` for one preview, so the `.ix` previews presumably
# should match it -- confirm the intended slice.
elif file.endswith('.hd5'): df = scimpute.read_hd5(file).transpose() else: raise Exception('file extension error: not hd5/csv') elif matrix_mode == 'gene_row': if file.endswith('.csv'): df = scimpute.read_csv(file) elif file.endswith('.hd5'): df = scimpute.read_hd5(file) else: raise Exception('file extension error: not hd5/csv') else: raise Exception('cmd err in the argv[2]') # summary nz_rate_df = scimpute.nnzero_rate_df(df) print('df.shape, [gene, cell]:', df.shape) print('nz_rate: {}'.format(round(nz_rate_df, 3))) print(df.ix[0:3, 0:3]) # exp df = np.power(10, df) -1 print('after exp(value) - 1') print(df.iloc[:3, :3]) # lib-size per million normalization df = scimpute.df_normalization(df) print('after normalization') print(df.ix[0:3, 0:3]) read_per_gene = df.sum(axis=1) read_per_cell = df.sum(axis=0)
# Argument handling and input loading for the script that the usage text calls
# data_select_genes.py. Exactly three arguments are expected after the script
# name: the big matrix, the small matrix and an output name.
print('usage: python data_select_genes.py df_big.hd5 df_small.hd5 outname')
print('assume gene(row) cell(columns), also gene_row inside code')

# Compare the argument count by value. The previous `is not 4` tested object
# identity, which only works by accident of CPython's small-int caching and
# emits a SyntaxWarning on Python 3.8+.
if len(sys.argv) != 4:
    raise Exception("error: the num of arguments not correct")

print('running:')
print(sys.argv)
big_name = str(sys.argv[1])  # big.hd5
small_name = str(sys.argv[2])  # small.hd5
out_name = str(sys.argv[3])  # big.small.hd5

# read
print('read big-df..')
# NOTE(review): the big frame is read with pd.read_hdf while the small one
# goes through scimpute.read_hd5 -- confirm the difference is intentional.
df_big = pd.read_hdf(big_name)
nz_big = scimpute.nnzero_rate_df(df_big)
print('nz_rate big-df: ', nz_big)

print('read small-df..')
df_small = scimpute.read_hd5(small_name)
nz_small = scimpute.nnzero_rate_df(df_small)
print('nz_rate small_df: ', nz_small)

# Remove .x from ID (disabled; kept for reference)
# df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*','').astype(str)
# print('because the index is different, remove the appendix')
# print('big df after changing index', df_big.ix[0:5, 0:5])

# Report whether each frame's index is unique.
print('df_big index is unique? {}'.format(df_big.index.is_unique))
print('df_small index is unique? {}'.format(df_small.index.is_unique))