Пример #1
0
    elif file.endswith('.hd5'):
        df = scimpute.read_hd5(file).transpose()
    else:
        raise Exception('file extension error: not hd5/csv')
elif matrix_mode == 'gene_row':
    if file.endswith('.csv'):
        df = scimpute.read_csv(file)
    elif file.endswith('.hd5'):
        df = scimpute.read_hd5(file)
    else:
        raise Exception('file extension error: not hd5/csv')
else:
    raise Exception('cmd err in the argv[2]')

# summary
# Report the shape and non-zero rate of the matrix as loaded, before filtering.
nz_rate_df = scimpute.nnzero_rate_df(df)
print('input matrix.shape:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
# Positional preview of the top-left 3x3 corner.  `.iloc` replaces `.ix`,
# which was deprecated in pandas 0.20 and removed in pandas 1.0.
print(df.iloc[0:3, 0:3])

# filter #
# Row sums are the per-gene totals and column sums the per-cell totals
# (the variable names imply rows are genes and columns are cells here).
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
# Keep genes whose total is >= gene_min and cells whose total is >= cell_min.
# Both masks come from the unfiltered matrix, so a cell's total still
# includes reads from genes that this same step removes (and vice versa).
df_filtered = df.loc[(read_per_gene >= gene_min), (read_per_cell >= cell_min)]
nz_rate_filtered = scimpute.nnzero_rate_df(df_filtered)
print('filtered matrix : ', df_filtered.shape)
print('nz_rate:', nz_rate_filtered)
print(df_filtered.iloc[0:3, 0:3])
# Persist the filtered matrix under the output tag.
# NOTE(review): save_hd5 is a project helper not visible here; presumably it
# writes an HDF5 file -- confirm against scimpute.
scimpute.save_hd5(df_filtered, tag0 + '.hd5')

# histogram of filtered data
Пример #2
0
    elif file.endswith('.hd5'):
        df = scimpute.read_hd5(file).transpose()
    else:
        raise Exception('file extension error: not hd5/csv')
elif matrix_mode == 'gene_row':
    if file.endswith('.csv'):
        df = scimpute.read_csv(file)
    elif file.endswith('.hd5'):
        df = scimpute.read_hd5(file)
    else:
        raise Exception('file extension error: not hd5/csv')
else:
    raise Exception('cmd err in the argv[2]')

# summary
# Report the shape and non-zero rate of the matrix as loaded.
nz_rate_df = scimpute.nnzero_rate_df(df)
print('df.shape, [gene, cell]:', df.shape)
print('nz_rate: {}'.format(round(nz_rate_df, 3)))
# Positional preview of the top-left 3x3 corner.  `.iloc` replaces `.ix`,
# which was deprecated in pandas 0.20 and removed in pandas 1.0; it also
# matches the `.iloc` preview already used after the exponentiation below.
print(df.iloc[0:3, 0:3])

# exp
# Element-wise 10**x - 1, i.e. the inverse of log10(x + 1).
# NOTE(review): this presumes the input values are log10(count + 1)
# transformed -- confirm against how the input file was produced.
df = np.power(10, df) - 1
print('after exp(value) - 1')
print(df.iloc[:3, :3])

# lib-size per million normalization
# NOTE(review): df_normalization is a project helper not visible here; the
# heading above suggests it rescales each cell to a per-million library
# size -- confirm against scimpute.
df = scimpute.df_normalization(df)
print('after normalization')
print(df.iloc[0:3, 0:3])
# Totals of the normalized matrix: row sums per gene, column sums per cell.
read_per_gene = df.sum(axis=1)
read_per_cell = df.sum(axis=0)
Пример #3
0
# The usage banner is printed on every run, before the arguments are checked.
print('usage: python data_select_genes.py df_big.hd5  df_small.hd5 outname')
print('assume gene(row) cell(columns), also gene_row inside code')

# Exactly three user arguments are required (argv[0] is the script itself).
# Compare by value: `is not 4` tested object identity, which only happened to
# work because CPython caches small ints, and triggers a SyntaxWarning on
# Python 3.8+.
if len(sys.argv) != 4:
    raise Exception("error: the num of arguments not correct")

print('running:')
print(sys.argv)
# sys.argv entries are already str, so no conversion is needed.
big_name = sys.argv[1]  # big.hd5
small_name = sys.argv[2]  # small.hd5
out_name = sys.argv[3]  # big.small.hd5

# read
# NOTE(review): the big matrix is read with pd.read_hdf while the small one
# goes through scimpute.read_hd5; the helper is not visible here, so it is
# unclear whether the two are interchangeable -- confirm before unifying.
print('read big-df..')
df_big = pd.read_hdf(big_name)
nz_big = scimpute.nnzero_rate_df(df_big)
print('nz_rate big-df: ', nz_big)

print('read small-df..')
df_small = scimpute.read_hd5(small_name)
nz_small = scimpute.nnzero_rate_df(df_small)
print('nz_rate small_df: ', nz_small)

# Remove .x from ID
# (disabled workaround, kept for reference: strips a trailing ".<digits>"
# suffix from the big matrix's index for when the two indexes differ)
# df_big.index = df_big.index.to_series().astype(str).str.replace(r'\.[0-9]*','').astype(str)
# print('because the index is different, remove the appendix')
# print('big df after changing index', df_big.iloc[0:5, 0:5])

# Report whether each index is free of duplicate labels.
print('df_big index is unique? {}'.format(df_big.index.is_unique))
print('df_small index is unique? {}'.format(df_small.index.is_unique))