Пример #1
0
    def class_parse(self,
                    curs,
                    inputfile,
                    output_table,
                    organism_given,
                    sequence_type,
                    run_type=10):
        """
        Parse inputfile block by block and submit one class row per block.

        09-15-05

        Arguments:
            curs: handed unchanged to self.submit2class (presumably a
                database cursor -- confirm against the caller).
            inputfile: path of the flat file to read.
            output_table: handed unchanged to self.submit2class; also echoed
                in the progress message.
            organism_given: not used by this method.
            sequence_type: not used by this method.
            run_type: handed unchanged to self.transfac_embl_parse.

        The first block produced by unigene_data_block_iterator is skipped
        (presumably a file header -- confirm against the input format).
        The input file is closed even when parsing or submission raises.
        """
        sys.stderr.write("Parsing for %s...\n" % output_table)
        inf = open(inputfile, 'r')
        try:
            block_no = 0
            for block in unigene_data_block_iterator(inf):
                block_no += 1
                if block_no == 1:  # skip the first block
                    continue

                unit = self.transfac_embl_parse(block, run_type)
                # unit.comment serves as the description (original note).
                row = [
                    unit.acc,
                    unit.id,
                    unit.class_struc,
                    unit.comment,
                    unit.factors,
                    unit.external_database_links,
                    unit.reference,
                ]
                self.submit2class(curs, output_table, row)

                # Progress counter: 20 backspace characters, then the count.
                if self.report and block_no % 500 == 0:
                    sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
Пример #2
0
    def factor_parse(self,
                     curs,
                     inputfile,
                     output_table,
                     organism_given,
                     sequence_type,
                     run_type=2):
        """
        Parse inputfile block by block and submit one factor row per block.

        09-15-05
            reference is a list of ref_acc

        Arguments:
            curs: handed unchanged to self.submit2factor (presumably a
                database cursor -- confirm against the caller).
            inputfile: path of the flat file to read.
            output_table: handed unchanged to self.submit2factor; also
                echoed in the progress message.
            organism_given: not used by this method.
            sequence_type: not used by this method.
            run_type: handed unchanged to self.transfac_embl_parse.

        The first block produced by unigene_data_block_iterator is skipped
        (presumably a file header -- confirm against the input format).
        The input file is closed even when parsing or submission raises.
        """
        sys.stderr.write("Parsing for %s...\n" % output_table)
        inf = open(inputfile, 'r')
        try:
            block_no = 0
            for block in unigene_data_block_iterator(inf):
                block_no += 1
                if block_no == 1:  # skip the first block
                    continue

                unit = self.transfac_embl_parse(block, run_type)
                # Column order must match what submit2factor expects.
                row = [
                    unit.acc,
                    unit.id,
                    unit.tf_name,
                    unit.synonyms,
                    unit.organism,
                    unit.gene_acc,
                    unit.homolog,
                    unit.class_acc,
                    unit.class_tree_id,
                    unit.size,
                    unit.sequence,
                    unit.sequence_source,
                    unit.feature,
                    unit.struc_feature,
                    unit.cell_positive,
                    unit.cell_negative,
                    unit.expr_pattern,
                    unit.func_feature,
                    unit.interacting_partners,
                    unit.matrices,
                    unit.sites,
                    unit.binding_region,
                    unit.external_database_links,
                    unit.reference,
                ]
                self.submit2factor(curs, output_table, row)

                # Progress counter: 20 backspace characters, then the count.
                if self.report and block_no % 50 == 0:
                    sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
Пример #3
0
    def site_parse(self,
                   curs,
                   inputfile,
                   output_table,
                   organism_given,
                   sequence_type,
                   run_type=6):
        """
        Parse inputfile block by block and submit one site row per block.

        09-15-05

        Arguments:
            curs: handed unchanged to self.submit2site (presumably a
                database cursor -- confirm against the caller).
            inputfile: path of the flat file to read.
            output_table: handed unchanged to self.submit2site; also echoed
                in the progress message.
            organism_given: not used by this method.
            sequence_type: not used by this method.
            run_type: handed unchanged to self.transfac_embl_parse.

        The first block produced by unigene_data_block_iterator is skipped
        (presumably a file header -- confirm against the input format).
        The input file is closed even when parsing or submission raises.
        """
        sys.stderr.write("Parsing for %s...\n" % output_table)
        inf = open(inputfile, 'r')
        try:
            block_no = 0
            for block in unigene_data_block_iterator(inf):
                block_no += 1
                if block_no == 1:  # skip the first block
                    continue

                unit = self.transfac_embl_parse(block, run_type)
                # Column order must match what submit2site expects.
                row = [
                    unit.acc,
                    unit.id,
                    unit.type,
                    unit.description,
                    unit.gene_acc,
                    unit.organism,
                    unit.gene_region,
                    unit.sequence,
                    unit.element,
                    unit.position_ref_point,
                    unit.position_start,
                    unit.position_end,
                    unit.cell_source,
                    unit.method,
                    unit.comment,
                    unit.external_database_links,
                    unit.reference,
                ]
                self.submit2site(curs, output_table, row)

                # Progress counter: 20 backspace characters, then the count.
                if self.report and block_no % 500 == 0:
                    sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
Пример #4
0
	def get_matrix_id2acc(self, matrix_file):
		"""
		Build a dictionary mapping matrix id to accession from matrix_file.

		09-10-05

		Each block produced by unigene_data_block_iterator is scanned for a
		line starting with 'AC  ' and a line starting with 'ID  '; the text
		after the four-character tag (without the line's newline) is the value.
		If a tag occurs several times in a block, the last occurrence wins.
		Blocks lacking either value are ignored. Scanning stops at the
		first empty block.

		Arguments:
			matrix_file: path of the matrix flat file to read.

		Returns:
			dict of {id: acc}.
		"""
		sys.stderr.write("Setting up matrix_id2acc from %s..."%matrix_file)
		matrix_id2acc = {}
		inf = open(matrix_file,'r')
		try:
			for block in unigene_data_block_iterator(inf):
				if block=='':	#the last nothing block
					break
				acc = None
				matrix_id = None
				for line in cStringIO.StringIO(block):
					# rstrip('\n') drops only the newline, so a final line
					# without one is not truncated.
					if line.startswith('AC  '):
						acc = line[4:].rstrip('\n')
					if line.startswith('ID  '):
						matrix_id = line[4:].rstrip('\n')
				if acc and matrix_id:	#the first block of the matrix_file has no matrix
					matrix_id2acc[matrix_id] = acc
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")
		return matrix_id2acc
Пример #5
0
    def get_matrix_id2acc(self, matrix_file):
        """
        Build a dictionary mapping matrix id to accession from matrix_file.

        09-10-05

        Each block produced by unigene_data_block_iterator is scanned for a
        line starting with 'AC  ' and a line starting with 'ID  '; the text
        after the four-character tag (without the line's newline) is the
        value. If a tag occurs several times in a block, the last occurrence
        wins. Blocks lacking either value are ignored. Scanning stops at the
        first empty block.

        Arguments:
            matrix_file: path of the matrix flat file to read.

        Returns:
            dict of {id: acc}.
        """
        sys.stderr.write("Setting up matrix_id2acc from %s..." % matrix_file)
        matrix_id2acc = {}
        inf = open(matrix_file, 'r')
        try:
            for block in unigene_data_block_iterator(inf):
                if block == '':  # the last nothing block
                    break
                acc = None
                matrix_id = None
                for line in cStringIO.StringIO(block):
                    # rstrip('\n') drops only the newline, so a final line
                    # without one is not truncated.
                    if line.startswith('AC  '):
                        acc = line[4:].rstrip('\n')
                    if line.startswith('ID  '):
                        matrix_id = line[4:].rstrip('\n')
                # the first block of the matrix_file has no matrix
                if acc and matrix_id:
                    matrix_id2acc[matrix_id] = acc
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
        return matrix_id2acc
Пример #6
0
	def reference_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=5):
		"""
		Parse inputfile block by block and submit one reference row per block.

		For every block after the first, the values of the 'AC  ', 'RX  ',
		'RA  ', 'RT  ' and 'RL  ' lines are collected (text after the
		four-character tag, without the newline) and submitted in that order
		as [acc, external link, authors, title, journal]. A tag missing from
		the block yields ''. If a tag occurs on several lines, the last one wins.

		Arguments:
			curs: handed unchanged to self.submit2reference (presumably a
				database cursor -- confirm against the caller).
			inputfile: path of the flat file to read.
			output_table: handed unchanged to self.submit2reference; also
				echoed in the progress message.
			organism_given: not used by this method.
			sequence_type: not used by this method.
			run_type: not used by this method.

		The input file is closed even when parsing or submission raises.
		"""
		sys.stderr.write("Parsing for %s...\n"%output_table)
		# Tags in the column order of the submitted row.
		tags = ('AC  ', 'RX  ', 'RA  ', 'RT  ', 'RL  ')
		inf = open(inputfile,'r')
		try:
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no == 1:	#skip the first block
					continue
				values = dict.fromkeys(tags, '')
				for line in cStringIO.StringIO(block):
					# replace the ' to avoid database-submit error
					# NOTE(review): proper parameter binding in submit2reference
					# would make this substitution unnecessary; kept because
					# stored data already uses it.
					line = line.replace("'", 'PRIME')
					tag = line[:4]
					if tag in values:
						# rstrip('\n') drops only the newline, so a final line
						# without one is not truncated.
						values[tag] = line[4:].rstrip('\n')
				row = [values[tag] for tag in tags]
				self.submit2reference(curs, output_table, row)

				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%500==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")
Пример #7
0
	def matrix_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=3):
		"""
		Parse inputfile block by block and submit one matrix row per block.

		09-15-05
			reference is a list of ref_acc
			add pwm and consensus
		09-16-05
			add site_in_matrix_accs to output_table('matrix')
			add site_in_matrix_list to submit to site_in_matrix

		Arguments:
			curs: handed unchanged to self.submit2matrix (presumably a
				database cursor -- confirm against the caller).
			inputfile: path of the flat file to read.
			output_table: handed unchanged to self.submit2matrix; also
				echoed in the progress message.
			organism_given: not used by this method.
			sequence_type: not used by this method.
			run_type: handed unchanged to self.transfac_embl_parse.

		The first block produced by unigene_data_block_iterator is skipped
		(presumably a file header -- confirm against the input format).
		The input file is closed even when parsing or submission raises.
		"""
		sys.stderr.write("Parsing for %s...\n"%output_table)
		inf = open(inputfile,'r')
		try:
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no == 1:	#skip the first block
					continue
				unit = self.transfac_embl_parse(block, run_type)
				# Column order must match what submit2matrix expects.
				row = [
					unit.acc,
					unit.id,
					unit.tf_name,
					unit.description,
					unit.factors,
					unit.pwm,
					unit.consensus,
					unit.basis,
					unit.sites,
					unit.site_in_matrix_accs,
					unit.comment,
					unit.reference,
				]
				# site_in_matrix_list is passed separately from the row.
				self.submit2matrix(curs, output_table, row, unit.site_in_matrix_list)

				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%500==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")
Пример #8
0
	def factor_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=2):
		"""
		Parse inputfile block by block and submit one factor row per block.

		09-15-05
			reference is a list of ref_acc

		Arguments:
			curs: handed unchanged to self.submit2factor (presumably a
				database cursor -- confirm against the caller).
			inputfile: path of the flat file to read.
			output_table: handed unchanged to self.submit2factor; also
				echoed in the progress message.
			organism_given: not used by this method.
			sequence_type: not used by this method.
			run_type: handed unchanged to self.transfac_embl_parse.

		The first block produced by unigene_data_block_iterator is skipped
		(presumably a file header -- confirm against the input format).
		The input file is closed even when parsing or submission raises.
		"""
		sys.stderr.write("Parsing for %s...\n"%output_table)
		inf = open(inputfile,'r')
		try:
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no==1:	#skip the first block
					continue

				unit = self.transfac_embl_parse(block, run_type)
				# Column order must match what submit2factor expects.
				row = [
					unit.acc,
					unit.id,
					unit.tf_name,
					unit.synonyms,
					unit.organism,
					unit.gene_acc,
					unit.homolog,
					unit.class_acc,
					unit.class_tree_id,
					unit.size,
					unit.sequence,
					unit.sequence_source,
					unit.feature,
					unit.struc_feature,
					unit.cell_positive,
					unit.cell_negative,
					unit.expr_pattern,
					unit.func_feature,
					unit.interacting_partners,
					unit.matrices,
					unit.sites,
					unit.binding_region,
					unit.external_database_links,
					unit.reference,
				]
				self.submit2factor(curs, output_table, row)

				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%50==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")
Пример #9
0
	def transformMatrix(self, matrix_fname, output_fname, profile_id_set):
		"""
		Write row-normalised matrices of the selected profiles to output_fname.

		2006-08-14
			similar to transfacdb.py's transfac_embl_parse

		matrix_fname is read block by block; the first block is skipped.
		Within a block, every line that starts with two digits and two
		spaces is a matrix row: the first and the last whitespace-separated
		fields are dropped, the remaining ones are converted to float and
		each is divided by the row sum. The block's id is the text after the
		'ID  ' tag.

		For every block whose id is in profile_id_set:
			- '<running index starting at 1>\t<id>' is written to
			  output_fname + '.id_mapping'
			- '><number of rows>' is written to output_fname, followed by
			  the first four values of each row, tab-separated, three decimals.

		A block without an 'ID  ' line is never written; its id is reset for
		each block so it cannot inherit the id of the preceding block.

		Arguments:
			matrix_fname: path of the matrix flat file to read.
			output_fname: path of the matrix output file.
			profile_id_set: container supporting 'in'; the ids to keep.

		If a matrix row has a non-numeric field or sums to zero, the
		exception type and the offending line are written to stdout and the
		process exits with status 2. All three files are closed on every
		exit path.
		"""
		sys.stderr.write("Transforming matrix...")
		pwm_line_pattern = re.compile(r'\d\d  ')	#used to identify pwm lines, first two characters are numbers
		inf = open(matrix_fname,'r')
		outf = None
		profile_id_outf = None
		try:
			outf = open(output_fname, 'w')
			profile_id_outf = open('%s.id_mapping'%output_fname, 'w')
			no_of_profiles = 0
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no == 1:	#skip the first block
					continue
				# Reset per block so a block lacking an ID line cannot reuse
				# the id of the previous block.
				profile_id = None
				pwm_list = []
				for line in cStringIO.StringIO(block):
					if pwm_line_pattern.match(line):
						try:
							counts = [float(field) for field in line.split()[1:-1]]
							row_total = sum(counts)
							pwm_list.append([count/row_total for count in counts])
						except (ValueError, ZeroDivisionError):
							# Same stdout output as before: exception type, then
							# the raw line followed by a newline.
							sys.stdout.write('Except: %s\n'%repr(sys.exc_info()[0]))
							sys.stdout.write('%s\n'%line)
							sys.exit(2)
					if line[:4] == 'ID  ':
						# rstrip('\n') drops only the newline, so a final line
						# without one is not truncated.
						profile_id = line[4:].rstrip('\n')
				#output it if it's in profile_id_set
				if profile_id is not None and profile_id in profile_id_set:
					no_of_profiles += 1
					profile_id_outf.write('%s\t%s\n'%(no_of_profiles, profile_id))
					outf.write('>%s\n'%(len(pwm_list)))
					for pwm_row in pwm_list:
						outf.write('%.3f\t%.3f\t%.3f\t%.3f\n'%(pwm_row[0], pwm_row[1], pwm_row[2], pwm_row[3]))
				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%500==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Close (and thereby flush) whatever was opened, even on sys.exit.
			for handle in (profile_id_outf, outf, inf):
				if handle is not None:
					handle.close()
		sys.stderr.write("Done\n")
Пример #10
0
    def reference_parse(self,
                        curs,
                        inputfile,
                        output_table,
                        organism_given,
                        sequence_type,
                        run_type=5):
        """
        Parse inputfile block by block and submit one reference row per block.

        For every block after the first, the values of the 'AC  ', 'RX  ',
        'RA  ', 'RT  ' and 'RL  ' lines are collected (text after the
        four-character tag, without the newline) and submitted in that order
        as [acc, external link, authors, title, journal]. A tag missing from
        the block yields ''. If a tag occurs on several lines, the last one
        wins.

        Arguments:
            curs: handed unchanged to self.submit2reference (presumably a
                database cursor -- confirm against the caller).
            inputfile: path of the flat file to read.
            output_table: handed unchanged to self.submit2reference; also
                echoed in the progress message.
            organism_given: not used by this method.
            sequence_type: not used by this method.
            run_type: not used by this method.

        The input file is closed even when parsing or submission raises.
        """
        sys.stderr.write("Parsing for %s...\n" % output_table)
        # Tags in the column order of the submitted row.
        tags = ('AC  ', 'RX  ', 'RA  ', 'RT  ', 'RL  ')
        inf = open(inputfile, 'r')
        try:
            block_no = 0
            for block in unigene_data_block_iterator(inf):
                block_no += 1
                if block_no == 1:  # skip the first block
                    continue
                values = dict.fromkeys(tags, '')
                for line in cStringIO.StringIO(block):
                    # replace the ' to avoid database-submit error
                    # NOTE(review): proper parameter binding in
                    # submit2reference would make this substitution
                    # unnecessary; kept because stored data already uses it.
                    line = line.replace("'", 'PRIME')
                    tag = line[:4]
                    if tag in values:
                        # rstrip('\n') drops only the newline, so a final
                        # line without one is not truncated.
                        values[tag] = line[4:].rstrip('\n')
                row = [values[tag] for tag in tags]
                self.submit2reference(curs, output_table, row)

                # Progress counter: 20 backspace characters, then the count.
                if self.report and block_no % 500 == 0:
                    sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
Пример #11
0
    def parse_embl_profile(self, embl_profile, output_file, matrix_id2acc,
                           mc_given, cc_given):
        """
        Convert the profile file embl_profile into output_file.

        09-10-05
        09-11-05
            the firstline of output_file changed to the basename of the
            output_file, not the full path.
            The format is based on internal profile(minSUM92.prf), which is
            a little bit different from what is said in the documentation
            file from Kangyu(No first blank).

        A four-line header is written first. Then, for every block up to
        the first empty one, the values following the 'ID ', 'MC ' and 'CC '
        tags are read (last occurrence wins) and one line
        ' 1.000000 <cc> <mc> <acc> <id>' is written, provided id, mc and cc
        are all non-empty. A final '//' line closes the output.

        Arguments:
            embl_profile: path of the profile file to read.
            output_file: path of the file to write.
            matrix_id2acc: mapping from id to accession, indexed with the
                id read from each block.
            mc_given: if true, replaces the mc value read from every block.
            cc_given: if true, replaces the cc value read from every block.

        Raises:
            KeyError: a block's id is missing from matrix_id2acc.

        Both files are closed on every exit path.
        """
        sys.stderr.write("Parsing from %s to %s..." %
                         (embl_profile, output_file))
        inf = open(embl_profile, 'r')
        of = None
        try:
            of = open(output_file, 'w')
            # write some header information
            of.write("%s\n" % os.path.basename(output_file))
            of.write("From %s.\n" % (os.path.basename(embl_profile)))
            of.write(" MIN_LENGTH 300\n")
            of.write("0.0\n")
            for block in unigene_data_block_iterator(inf):
                if block == '':  # the last nothing block
                    break
                profile_id = None
                mc = None
                cc = None
                for line in cStringIO.StringIO(block):
                    # rstrip('\n') drops only the newline, so a final line
                    # without one is not truncated.
                    if line.startswith('ID '):
                        profile_id = line[3:].rstrip('\n')
                    if line.startswith('MC '):
                        mc = line[3:].rstrip('\n')
                    if line.startswith('CC '):
                        cc = line[3:].rstrip('\n')
                # Caller-supplied values take precedence over the file's.
                if mc_given:
                    mc = mc_given
                if cc_given:
                    cc = cc_given
                if profile_id and mc and cc:
                    acc = matrix_id2acc[profile_id]
                    # first character is blank
                    of.write(' 1.000000 %s %s %s %s\n' %
                             (cc, mc, acc, profile_id))
            of.write('//\n')
        finally:
            # Close (and thereby flush) the output as well as the input.
            if of is not None:
                of.close()
            inf.close()
        sys.stderr.write("Done\n")
Пример #12
0
	def parse_embl_profile(self, embl_profile, output_file, matrix_id2acc, mc_given, cc_given):
		"""
		Convert the profile file embl_profile into output_file.

		09-10-05
		09-11-05
			the firstline of output_file changed to the basename of the output_file, not the full path.
			The format is based on internal profile(minSUM92.prf), which is a little bit different from
				what is said in the documentation file from Kangyu(No first blank).

		A four-line header is written first. Then, for every block up to the
		first empty one, the values following the 'ID ', 'MC ' and 'CC ' tags
		are read (last occurrence wins) and one line
		' 1.000000 <cc> <mc> <acc> <id>' is written, provided id, mc and cc
		are all non-empty. A final '//' line closes the output.

		Arguments:
			embl_profile: path of the profile file to read.
			output_file: path of the file to write.
			matrix_id2acc: mapping from id to accession, indexed with the id
				read from each block.
			mc_given: if true, replaces the mc value read from every block.
			cc_given: if true, replaces the cc value read from every block.

		Raises:
			KeyError: a block's id is missing from matrix_id2acc.

		Both files are closed on every exit path.
		"""
		sys.stderr.write("Parsing from %s to %s..."%(embl_profile, output_file))
		inf = open(embl_profile,'r')
		of = None
		try:
			of = open(output_file, 'w')
			#write some header information
			of.write("%s\n"%os.path.basename(output_file))
			of.write("From %s.\n"%(os.path.basename(embl_profile)))
			of.write(" MIN_LENGTH 300\n")
			of.write("0.0\n")
			for block in unigene_data_block_iterator(inf):
				if block=='':	#the last nothing block
					break
				profile_id = None
				mc = None
				cc = None
				for line in cStringIO.StringIO(block):
					# rstrip('\n') drops only the newline, so a final line
					# without one is not truncated.
					if line.startswith('ID '):
						profile_id = line[3:].rstrip('\n')
					if line.startswith('MC '):
						mc = line[3:].rstrip('\n')
					if line.startswith('CC '):
						cc = line[3:].rstrip('\n')
				# Caller-supplied values take precedence over the file's.
				if mc_given:
					mc = mc_given
				if cc_given:
					cc = cc_given
				if profile_id and mc and cc:
					acc = matrix_id2acc[profile_id]
					of.write(' 1.000000 %s %s %s %s\n'%(cc, mc, acc, profile_id))	#first character is blank
			of.write('//\n')
		finally:
			# Close (and thereby flush) the output as well as the input.
			if of is not None:
				of.close()
			inf.close()
		sys.stderr.write("Done\n")
Пример #13
0
	def class_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=10):
		"""
		Parse inputfile block by block and submit one class row per block.

		09-15-05

		Arguments:
			curs: handed unchanged to self.submit2class (presumably a
				database cursor -- confirm against the caller).
			inputfile: path of the flat file to read.
			output_table: handed unchanged to self.submit2class; also echoed
				in the progress message.
			organism_given: not used by this method.
			sequence_type: not used by this method.
			run_type: handed unchanged to self.transfac_embl_parse.

		The first block produced by unigene_data_block_iterator is skipped
		(presumably a file header -- confirm against the input format).
		The input file is closed even when parsing or submission raises.
		"""
		sys.stderr.write("Parsing for %s...\n"%output_table)
		inf = open(inputfile,'r')
		try:
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no == 1:	#skip the first block
					continue

				unit = self.transfac_embl_parse(block, run_type)
				# unit.comment serves as the description (original note).
				row = [
					unit.acc,
					unit.id,
					unit.class_struc,
					unit.comment,
					unit.factors,
					unit.external_database_links,
					unit.reference,
				]
				self.submit2class(curs, output_table, row)

				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%500==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")
Пример #14
0
    def matrix_parse(self,
                     curs,
                     inputfile,
                     output_table,
                     organism_given,
                     sequence_type,
                     run_type=3):
        """
        Parse inputfile block by block and submit one matrix row per block.

        09-15-05
            reference is a list of ref_acc
            add pwm and consensus
        09-16-05
            add site_in_matrix_accs to output_table('matrix')
            add site_in_matrix_list to submit to site_in_matrix

        Arguments:
            curs: handed unchanged to self.submit2matrix (presumably a
                database cursor -- confirm against the caller).
            inputfile: path of the flat file to read.
            output_table: handed unchanged to self.submit2matrix; also
                echoed in the progress message.
            organism_given: not used by this method.
            sequence_type: not used by this method.
            run_type: handed unchanged to self.transfac_embl_parse.

        The first block produced by unigene_data_block_iterator is skipped
        (presumably a file header -- confirm against the input format).
        The input file is closed even when parsing or submission raises.
        """
        sys.stderr.write("Parsing for %s...\n" % output_table)
        inf = open(inputfile, 'r')
        try:
            block_no = 0
            for block in unigene_data_block_iterator(inf):
                block_no += 1
                if block_no == 1:  # skip the first block
                    continue
                unit = self.transfac_embl_parse(block, run_type)
                # Column order must match what submit2matrix expects.
                row = [
                    unit.acc,
                    unit.id,
                    unit.tf_name,
                    unit.description,
                    unit.factors,
                    unit.pwm,
                    unit.consensus,
                    unit.basis,
                    unit.sites,
                    unit.site_in_matrix_accs,
                    unit.comment,
                    unit.reference,
                ]
                # site_in_matrix_list is passed separately from the row.
                self.submit2matrix(curs, output_table, row,
                                   unit.site_in_matrix_list)

                # Progress counter: 20 backspace characters, then the count.
                if self.report and block_no % 500 == 0:
                    sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
        finally:
            # Deterministic cleanup instead of relying on "del" and the GC.
            inf.close()
        sys.stderr.write("Done\n")
Пример #15
0
	def site_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=6):
		"""
		Parse inputfile block by block and submit one site row per block.

		09-15-05

		Arguments:
			curs: handed unchanged to self.submit2site (presumably a
				database cursor -- confirm against the caller).
			inputfile: path of the flat file to read.
			output_table: handed unchanged to self.submit2site; also echoed
				in the progress message.
			organism_given: not used by this method.
			sequence_type: not used by this method.
			run_type: handed unchanged to self.transfac_embl_parse.

		The first block produced by unigene_data_block_iterator is skipped
		(presumably a file header -- confirm against the input format).
		The input file is closed even when parsing or submission raises.
		"""
		sys.stderr.write("Parsing for %s...\n"%output_table)
		inf = open(inputfile,'r')
		try:
			block_no = 0
			for block in unigene_data_block_iterator(inf):
				block_no += 1
				if block_no == 1:	#skip the first block
					continue

				unit = self.transfac_embl_parse(block, run_type)
				# Column order must match what submit2site expects.
				row = [
					unit.acc,
					unit.id,
					unit.type,
					unit.description,
					unit.gene_acc,
					unit.organism,
					unit.gene_region,
					unit.sequence,
					unit.element,
					unit.position_ref_point,
					unit.position_start,
					unit.position_end,
					unit.cell_source,
					unit.method,
					unit.comment,
					unit.external_database_links,
					unit.reference,
				]
				self.submit2site(curs, output_table, row)

				# Progress counter: 20 backspace characters, then the count.
				if self.report and block_no%500==0:
					sys.stderr.write('%s%s'%('\x08'*20, block_no))
		finally:
			# Deterministic cleanup instead of relying on "del" and the GC.
			inf.close()
		sys.stderr.write("Done\n")