forked from tanaes/script_bin
/
split_biom.py
58 lines (37 loc) · 1.54 KB
/
split_biom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python
# Split a BIOM-format feature table into n smaller tables along the
# observation axis, writing chunk1.biom .. chunkN.biom to the working dir.
import argparse
from biom import load_table
# NOTE(review): `File` appears unused below -- importing h5py may only be
# ensuring HDF5 support is present; confirm before removing.
from h5py import File
from biom.util import biom_open

# Command-line interface: -n/--n chunk count, -i/--input_fp input BIOM file.
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--n", type=int,
                    help="split biom file into n biom files")
parser.add_argument("-i", "--input_fp",
                    help="biom file to split")
def main():
    """Split the input BIOM table into n chunks along the observation axis.

    Reads -n (number of chunks) and -i (input BIOM path) from the
    module-level argparse ``parser``, then writes chunk1.biom .. chunkN.biom
    (HDF5 format) to the current directory. Each chunk gets
    ``total // n`` observations; the last chunk absorbs the remainder.
    """
    args = parser.parse_args()
    n = args.n
    input_fp = args.input_fp

    biom_table = load_table(input_fp)
    obs_ids = biom_table.ids(axis='observation')
    print("{0} total ids\n".format(len(obs_ids)))

    # Floor division (// works identically on Python 2 and 3).
    chunk_size = len(obs_ids) // n
    begin = 0
    # One uniform loop over all n chunks also fixes the n=1 case, where the
    # original's range(1, n) loop never ran and `chunk += 1` raised NameError.
    for chunk in range(1, n + 1):
        # Bug fix: the original set begin = previous_end + 1 even though
        # slice ends are exclusive, silently dropping the observation at
        # every chunk boundary. Contiguous [begin, end) slices keep them all.
        end = len(obs_ids) if chunk == n else begin + chunk_size
        print("chunk: {0} begin: {1} end: {2}\n".format(chunk, begin, end))
        # A set gives O(1) membership tests in the filter callback instead
        # of an O(k) scan of a NumPy array per observation.
        sub_ids = set(obs_ids[begin:end])
        sub_table = biom_table.filter(
            lambda val, id_, md: id_ in sub_ids,
            axis='observation', invert=False, inplace=False)
        with biom_open('chunk{0}.biom'.format(chunk), 'w') as out_f:
            sub_table.to_hdf5(out_f, "split_biom.py")
        begin = end
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()