예제 #1
0
def split_file(file2, ncpu, min_num_lines=100):
    print 'splitting [%s] into %d pieces' % (file2, ncpu)
    D, fname, fbase, fext = lib_utils.separateDirFn2(file2)
    out_prefix = '%s/%s_parts' % (D, fbase)

    #compute the number of lines per a splitted file
    N = lib_utils.count_num_lines(file2)
    L = round(1. * N / ncpu) + 1
    if L <= min_num_lines:
        file2b = '%s_00' % out_prefix
        print 'the input file size is too small to split.'
        shutil.copyfile(file2, file2b)
        return [file2b]
    else:
        cmd = 'split -d -l %d %s %s' % (L, file2, out_prefix)
        lib_utils.gen_msg_time('cmd', cmd, 'split_file')  #debug
        upair_fns = []
        for j in range(ncpu):
            part_fn = '%s%02d' % (out_prefix, j)
            if lib_utils.check_if_file_valid(part_fn):
                upair_fns.append(part_fn)
            else:
                lib_utils.gen_msg_time('error',
                                       'file[%s] does not exist' % part_fn,
                                       'split_file')
        print 'done.'
        return upair_fns
예제 #2
0
	def __init__(self,resource_dir,work_dir='/tmp',go_bin=os.path.join(os.environ['DIVINE'],'python_libs','bin','fastsemsim'),debug=False):
		self.resource_dir = resource_dir
		
		self.obo_fn = '%s/go.obo'%self.resource_dir
		if not lib_utils.check_if_file_valid(self.obo_fn):
			lib_utils.gen_msg_time('error','check if the file[%s] exists'%self.obo_fn,'lib_geneontology.py')
		
		self.goa_fn = '%s/gene_association.goa_human'%self.resource_dir
		if not lib_utils.check_if_file_valid(self.goa_fn):
			lib_utils.gen_msg_time('error','check if the file[%s] exists'%self.goa_fn,'lib_geneontology.py')
			
		self.work_dir = work_dir
		
		#rfix = random.randint(1,1e7)
		#rfix = 0
		
		#to prepare input_fn
		self.query_fn = None
		
		self.gobin = go_bin
		
		self.go_categories = ['biological_process','molecular_function','cellular_component']
		self.gosim_fns = {}
		self.uniprot2RefGenes = {}
		self.debug = debug
예제 #3
0
	def gen_all_pairs_file(self,upair_file):
		uniprot_list_fn = '%s/go_%d.tmp'%(self.work_dir,random.randint(1,1e7))
		cmd = "cut -f2 %s | grep -vP '^!' | sort | uniq > %s"%(self.goa_fn,uniprot_list_fn)
		lib_utils.gen_msg_time('cmd',cmd,'lib_geneontology.run_sim')
		
		Uniprot = lib_utils.file2list(uniprot_list_fn)
		fp2 = lib_utils.open2(upair_file,'w')
		U = len(Uniprot)
		print 'total number of queries[%d]'%U
		for i,u1 in enumerate(Uniprot):
			print '[%d/%d]'%(i,U)
			for u2 in Uniprot[i+1:]:
				fp2.write('%s\t%s\n'%(u1,u2))
		fp2.close()
		print 'done.'
예제 #4
0
	def run_sim(self,out_dir,sim_method='SimRel'):
		
		for root_node in self.go_categories:
			gocmd = self.gobin
			gocmd += " --ontology_file %s"%self.obo_fn
			gocmd += " --ontology_file_format obo"
			gocmd += " --ac_file %s"%self.goa_fn
			gocmd += " --ac_sep '\\t'"
			gocmd += " --task SS"
			gocmd += " -o GeneOntology"
			
			if root_node not in self.go_categories:
				lib_utils.gen_msg_time('error','%s not defined'%root_node,'lib_geneontology.py')
			gocmd += " --root %s"%root_node
			
			gocmd += " --ac fly"
			gocmd += " --tss %s"%sim_method
			gocmd += " --tmix max"
			gocmd += " --remove_nan"
			gocmd += " --ignore_EC TAS"
			gocmd += " --query_ss_type obj"
			gocmd += " --query_mode pairs"
			gocmd += " --query_input file"
			gocmd += " --query_file %s" % self.query_fn
			
			gosim_fn = "%s/%s_%s.tsv" % (out_dir,self.query_fn.split('/')[-1],root_node)
			gocmd += " --output_file %s"%gosim_fn
			
			if self.debug and lib_utils.check_if_file_valid(gosim_fn):
				lib_utils.gen_msg_time('notice','reuse %s'%gosim_fn,'lib_geneontology.run_sim')
			else:
				lib_utils.gen_msg_time('cmd',gocmd,'lib_geneontology.run_sim')
				#print 'reuse...' #debug

			self.gosim_fns[root_node]=gosim_fn
예제 #5
0
	def map_uniprot_to_refgene(self):
		tmp_fn = '%s_tmp%d'%(self.query_fn,random.randint(1,1e7))
		cmd = 'cut -f1 %s | sort | uniq > %s'%(self.query_fn,tmp_fn)
		lib_utils.gen_msg_time('cmd',cmd,'map_uniprot_to_refgene')
		Query = lib_utils.file2list(tmp_fn,False)
		os.unlink(tmp_fn)
		
		tmp_fn = '%s_tmp%d'%(self.query_fn,random.randint(1,1e7))
		cmd = 'cut -f2 %s | sort | uniq > %s'%(self.query_fn,tmp_fn)
		lib_utils.gen_msg_time('cmd',cmd,'map_uniprot_to_refgene')
		Query.extend(lib_utils.file2list(tmp_fn,False))
		os.unlink(tmp_fn)

		fp = lib_utils.open2(self.goa_fn,'r')
		for i in fp:
			if i[0]=='!':continue
			j=i.split('\t')
			refGene = j[2]
			uniprot = j[1]
			if uniprot in Query:
				if refGene not in self.uniprot2RefGenes:
					self.uniprot2RefGenes[uniprot]=refGene
		fp.close()