	# Excerpt from the body of a per-file job in a hashed-reads pipeline (Python 2: note the
	# print statement and integer division below).  Assumed context: glob, gzip, os,
	# numpy (as np) and Fastq_Reader are imported earlier, and inputdir, outputdir,
	# tmpdir and fr (this job's file rank) have been set, e.g. from the command line.
	tmpdir += '/'
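	# Set up the Fastq_Reader for this input/output pair and load the per-cluster probabilities (index i -> probability for cluster i).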
	hashobject = Fastq_Reader(inputdir,outputdir)
	cp = np.load(hashobject.output_path+'cluster_probs.npy')
	cluster_probs = dict(enumerate(cp))
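	# Find this sample's hashed-read ("hashq") chunks, skip unfinished .tmp files, and pick the chunk for this job's file rank; sample_id and the part suffix are recovered from the file name.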
	Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*'))
	Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp]
	Hashq_Files.sort()
	infile = Hashq_Files[fr]
	outpart = infile[-6:-3]
	sample_id = infile[infile.rfind('/')+1:infile.index('.hashq')]
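	# Create a per-job tmp directory and open one "cols" bucket file per 1/50 slice of the hash space.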
	tmpdir += str(fr) + '/'
	os.system('mkdir '+tmpdir)
	G = [open('%s%s.%s.cols.%d' % (tmpdir,sample_id,outpart,i),'w') for i in range(0,2**hashobject.hash_size,2**hashobject.hash_size/50)]
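	# Stream the hashed reads; for each hashed value x in a read, append an "x<TAB>read_id" row to the bucket file covering x's slice of the hash space.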
	f = gzip.open(infile)
	r_id = 0
	for a in hashobject.hash_read_generator(f):
		for x in a[2]:
			G[int(x*50/2**hashobject.hash_size)].write('%d\t%d\n' % (x,r_id))
		r_id += 1
	R = r_id
	f.close()
	for g in G:
		g.close()
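	# R/50 is used as a range step below, so chunks with fewer than 50 reads are skipped (a zero step would be an error).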
	if R < 50:
		print 'Fewer than 50 reads...doing nothing'
	else:
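		# Global cluster columns/values written by an earlier pipeline step, plus one "ids" bucket file per 1/50 slice of this chunk's read ids.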
		ClusterFile = open(hashobject.output_path+'cluster_cols.npy')
		ValueFile = open(hashobject.output_path+'cluster_vals.npy')
		G = [open('%s%s.%s.ids.%d' % (tmpdir,sample_id,outpart,i),'w') for i in range(0,R,R/50)]
		# If sharing ClusterFile among many jobs is not practical, the jobs below could instead be aggregated across samples by 1/50 fractions of ClusterFile, so that each job reads only one fraction.
		for i in range(0,2**hashobject.hash_size,2**hashobject.hash_size/50):
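
# --- Illustrative sketch, not part of the script above ---
# Each "cols" bucket written above holds one "hashed_value<TAB>read_id" pair per line
# (see the G[...].write(...) call).  A hypothetical helper, load_cols_bucket, for reading
# one bucket back into parallel lists before downstream processing might look like this:
def load_cols_bucket(path):
	cols = []
	read_ids = []
	with open(path) as f:
		for line in f:
			x, r = line.split('\t')
			cols.append(int(x))
			read_ids.append(int(r))
	return cols, read_ids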