def class_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=10):
    """
    09-15-05
        Read inputfile block by block and submit one row per block through
        self.submit2class().

    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type); the parsed unit's
    acc, id, class_struc, comment, factors, external_database_links and
    reference become the row.

    curs: passed through to self.submit2class()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            #comment is description
            row = [unit.acc, unit.id, unit.class_struc, unit.comment,
                   unit.factors, unit.external_database_links, unit.reference]
            self.submit2class(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def factor_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=2):
    """
    09-15-05
        reference is a list of ref_acc

    Read inputfile block by block and submit one row per block through
    self.submit2factor(). Each block yielded by unigene_data_block_iterator()
    except the first is parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2factor()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.tf_name, unit.synonyms, unit.organism,
                   unit.gene_acc, unit.homolog, unit.class_acc, unit.class_tree_id,
                   unit.size, unit.sequence, unit.sequence_source, unit.feature,
                   unit.struc_feature, unit.cell_positive, unit.cell_negative,
                   unit.expr_pattern, unit.func_feature, unit.interacting_partners,
                   unit.matrices, unit.sites, unit.binding_region,
                   unit.external_database_links, unit.reference]
            self.submit2factor(curs, output_table, row)
            if self.report and block_no % 50 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def site_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=6):
    """
    09-15-05
        Read inputfile block by block and submit one row per block through
        self.submit2site().

    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2site()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.type, unit.description, unit.gene_acc,
                   unit.organism, unit.gene_region, unit.sequence, unit.element,
                   unit.position_ref_point, unit.position_start, unit.position_end,
                   unit.cell_source, unit.method, unit.comment,
                   unit.external_database_links, unit.reference]
            self.submit2site(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def get_matrix_id2acc(self, matrix_file):
    """
    09-10-05
        Build a dictionary mapping each matrix id to its accession.

    matrix_file is read block by block through unigene_data_block_iterator().
    Within a block, the text after an 'AC' tag (two-letter code plus two
    blanks) is the accession and the text after an 'ID' tag is the id; a
    block contributes an entry only when both were found.

    matrix_file: path of the file to read
    Returns: dict of id -> accession
    """
    sys.stderr.write("Setting up matrix_id2acc from %s..." % matrix_file)
    acc_prefix = 'AC' + ' ' * 2
    id_prefix = 'ID' + ' ' * 2
    matrix_id2acc = {}
    inf = open(matrix_file, 'r')
    try:
        for block in unigene_data_block_iterator(inf):
            if block == '':    #the last nothing block
                break
            acc = None
            matrix_id = None
            for line in cStringIO.StringIO(block):
                #line[4:-1] drops the 4-character tag and the trailing newline
                if line.startswith(acc_prefix):
                    acc = line[4:-1]
                if line.startswith(id_prefix):
                    matrix_id = line[4:-1]
            if acc and matrix_id:    #the first block of the matrix_file has no matrix
                matrix_id2acc[matrix_id] = acc
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
    return matrix_id2acc
def get_matrix_id2acc(self, matrix_file):
    """
    09-10-05
        Build a dictionary mapping each matrix id to its accession.

    matrix_file is read block by block through unigene_data_block_iterator().
    Within a block, the text after an 'AC' tag (two-letter code plus two
    blanks) is the accession and the text after an 'ID' tag is the id; a
    block contributes an entry only when both were found.

    matrix_file: path of the file to read
    Returns: dict of id -> accession
    """
    sys.stderr.write("Setting up matrix_id2acc from %s..." % matrix_file)
    acc_prefix = 'AC' + ' ' * 2
    id_prefix = 'ID' + ' ' * 2
    matrix_id2acc = {}
    inf = open(matrix_file, 'r')
    try:
        for block in unigene_data_block_iterator(inf):
            if block == '':    #the last nothing block
                break
            acc = None
            matrix_id = None
            for line in cStringIO.StringIO(block):
                #line[4:-1] drops the 4-character tag and the trailing newline
                if line.startswith(acc_prefix):
                    acc = line[4:-1]
                if line.startswith(id_prefix):
                    matrix_id = line[4:-1]
            if acc and matrix_id:    #the first block of the matrix_file has no matrix
                matrix_id2acc[matrix_id] = acc
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
    return matrix_id2acc
def reference_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=5):
    """
    Read inputfile block by block and submit one reference row per block
    through self.submit2reference().

    Every block yielded by unigene_data_block_iterator() except the first is
    scanned line by line. A line whose first four characters are a two-letter
    tag followed by two blanks fills the matching field with the rest of the
    line (trailing newline dropped):
        AC -> accession, RX -> external link, RA -> authors,
        RT -> title, RL -> journal
    A field that never appears stays ''; if a tag repeats, the last line wins.

    curs: passed through to self.submit2reference()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type, run_type: not used by this method
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    #position of each tag's value inside the submitted row
    field_index = {'AC': 0, 'RX': 1, 'RA': 2, 'RT': 3, 'RL': 4}
    tag_padding = ' ' * 2
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            #[ref_acc, ref_external_link, ref_authors, ref_title, ref_journal]
            row = [''] * 5
            for line in cStringIO.StringIO(block):
                line = line.replace("'", 'PRIME')    #replace the ' to avoid database-submit error
                #the tag occupies exactly four characters; comparing the
                #4-character slice against a shorter literal would never match
                if line[2:4] == tag_padding and line[:2] in field_index:
                    row[field_index[line[:2]]] = line[4:-1]
            self.submit2reference(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def matrix_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=3):
    """
    09-15-05
        reference is a list of ref_acc
        add pwm and consensus
    09-16-05
        add site_in_matrix_accs to output_table('matrix')
        add site_in_matrix_list to submit to site_in_matrix

    Read inputfile block by block and submit one row per block through
    self.submit2matrix(), together with the unit's site_in_matrix_list.
    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2matrix()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.tf_name, unit.description, unit.factors,
                   unit.pwm, unit.consensus, unit.basis, unit.sites,
                   unit.site_in_matrix_accs, unit.comment, unit.reference]
            self.submit2matrix(curs, output_table, row, unit.site_in_matrix_list)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def factor_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=2):
    """
    09-15-05
        reference is a list of ref_acc

    Read inputfile block by block and submit one row per block through
    self.submit2factor(). Each block yielded by unigene_data_block_iterator()
    except the first is parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2factor()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.tf_name, unit.synonyms, unit.organism,
                   unit.gene_acc, unit.homolog, unit.class_acc, unit.class_tree_id,
                   unit.size, unit.sequence, unit.sequence_source, unit.feature,
                   unit.struc_feature, unit.cell_positive, unit.cell_negative,
                   unit.expr_pattern, unit.func_feature, unit.interacting_partners,
                   unit.matrices, unit.sites, unit.binding_region,
                   unit.external_database_links, unit.reference]
            self.submit2factor(curs, output_table, row)
            if self.report and block_no % 50 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def transformMatrix(self, matrix_fname, output_fname, profile_id_set):
    """
    2006-08-14
        similar to transfacdb.py's transfac_embl_parse

    Read matrix_fname block by block (the first block is skipped), collect
    the numeric rows of each block, normalise every row so it sums to 1,
    and write the matrices whose id is in profile_id_set.

    Two files are written:
        output_fname: for each selected matrix a '>' line carrying the
            number of rows, followed by one tab-separated line per row
            holding its first four normalised values with 3 decimals
        output_fname + '.id_mapping': one 'serial<TAB>id' line per selected
            matrix, serial counting from 1 in output order

    matrix_fname: path of the file to read
    output_fname: path of the file to write
    profile_id_set: container of ids to keep (only tested with 'in')

    A row line that cannot be converted, or whose values sum to zero, is
    reported on stdout and the process exits with status 2.
    """
    sys.stderr.write("Transforming matrix...")
    pwm_line_pattern = re.compile(r'\d\d ')    #used to identify pwm lines, first two characters are numbers
    id_prefix = 'ID' + ' ' * 2    #two-letter tag plus two blanks, four characters in total
    profile_id_list = []
    block_no = 0
    inf = outf = profile_id_outf = None
    try:
        inf = open(matrix_fname, 'r')
        blocks = unigene_data_block_iterator(inf)
        outf = open(output_fname, 'w')
        profile_id_outf = open('%s.id_mapping' % output_fname, 'w')
        for block in blocks:
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            #reset per block so an id can never leak in from the previous one
            matrix_id = None
            pwm_list = []
            for line in cStringIO.StringIO(block):
                try:
                    if pwm_line_pattern.match(line):
                        #first field is the row label and the last one is dropped too
                        counts = [float(field) for field in line.split()[1:-1]]
                        total = sum(counts)
                        pwm_list.append([count / total for count in counts])
                    if line.startswith(id_prefix):
                        matrix_id = line[4:-1]
                except (ValueError, ZeroDivisionError):
                    #single-argument print() behaves the same as the print statement
                    print('Except: %s' % repr(sys.exc_info()[0]))
                    print(line)
                    sys.exit(2)
            #output it if it's in profile_id_set
            if matrix_id is not None and matrix_id in profile_id_set:
                profile_id_list.append(matrix_id)
                profile_id_outf.write('%s\t%s\n' % (len(profile_id_list), matrix_id))
                outf.write('>%s\n' % (len(pwm_list)))
                for pwm_row in pwm_list:
                    outf.write('%.3f\t%.3f\t%.3f\t%.3f\n' % (pwm_row[0], pwm_row[1], pwm_row[2], pwm_row[3]))
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close whatever was opened, which also flushes the two output files
        for handle in (profile_id_outf, outf, inf):
            if handle is not None:
                handle.close()
    sys.stderr.write("Done\n")
def reference_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=5):
    """
    Read inputfile block by block and submit one reference row per block
    through self.submit2reference().

    Every block yielded by unigene_data_block_iterator() except the first is
    scanned line by line. A line whose first four characters are a two-letter
    tag followed by two blanks fills the matching field with the rest of the
    line (trailing newline dropped):
        AC -> accession, RX -> external link, RA -> authors,
        RT -> title, RL -> journal
    A field that never appears stays ''; if a tag repeats, the last line wins.

    curs: passed through to self.submit2reference()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type, run_type: not used by this method
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    #position of each tag's value inside the submitted row
    field_index = {'AC': 0, 'RX': 1, 'RA': 2, 'RT': 3, 'RL': 4}
    tag_padding = ' ' * 2
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            #[ref_acc, ref_external_link, ref_authors, ref_title, ref_journal]
            row = [''] * 5
            for line in cStringIO.StringIO(block):
                line = line.replace("'", 'PRIME')    #replace the ' to avoid database-submit error
                #the tag occupies exactly four characters; comparing the
                #4-character slice against a shorter literal would never match
                if line[2:4] == tag_padding and line[:2] in field_index:
                    row[field_index[line[:2]]] = line[4:-1]
            self.submit2reference(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def parse_embl_profile(self, embl_profile, output_file, matrix_id2acc, mc_given, cc_given):
    """
    09-10-05
    09-11-05
        the firstline of output_file changed to the basename of the
        output_file, not the full path. The format is based on internal
        profile(minSUM92.prf), which is a little bit different from what is
        said in the documentation file from Kangyu(No first blank).

    Read embl_profile block by block and write output_file: four header
    lines, then one line per block for which an id, an mc and a cc value
    are all available, then a closing '//' line.

    embl_profile: path of the file to read
    output_file: path of the file to write
    matrix_id2acc: mapping id -> accession, indexed with every written id
    mc_given: if true, replaces the MC value read from each block
    cc_given: if true, replaces the CC value read from each block

    Raises KeyError when a block's id is missing from matrix_id2acc.
    """
    sys.stderr.write("Parsing from %s to %s..." % (embl_profile, output_file))
    id_prefix = 'ID' + ' ' * 1
    mc_prefix = 'MC' + ' ' * 1
    cc_prefix = 'CC' + ' ' * 1
    inf = of = None
    try:
        inf = open(embl_profile, 'r')
        blocks = unigene_data_block_iterator(inf)
        of = open(output_file, 'w')
        #write some header information
        of.write("%s\n" % os.path.basename(output_file))
        of.write("From %s.\n" % (os.path.basename(embl_profile)))
        of.write(" MIN_LENGTH 300\n")
        of.write("0.0\n")
        for block in blocks:
            if block == '':    #the last nothing block
                break
            matrix_id = None
            mc = None
            cc = None
            for line in cStringIO.StringIO(block):
                #line[3:-1] drops the 3-character tag and the trailing newline
                if line.startswith(id_prefix):
                    matrix_id = line[3:-1]
                if line.startswith(mc_prefix):
                    mc = line[3:-1]
                if line.startswith(cc_prefix):
                    cc = line[3:-1]
            #caller-supplied values take precedence over the ones in the file
            if mc_given:
                mc = mc_given
            if cc_given:
                cc = cc_given
            if matrix_id and mc and cc:
                acc = matrix_id2acc[matrix_id]
                of.write(' 1.000000 %s %s %s %s\n' % (cc, mc, acc, matrix_id))    #first character is blank
        of.write('//\n')
    finally:
        #closing also flushes the output; the original never closed it
        for handle in (of, inf):
            if handle is not None:
                handle.close()
    sys.stderr.write("Done\n")
def parse_embl_profile(self, embl_profile, output_file, matrix_id2acc, mc_given, cc_given):
    """
    09-10-05
    09-11-05
        the firstline of output_file changed to the basename of the
        output_file, not the full path. The format is based on internal
        profile(minSUM92.prf), which is a little bit different from what is
        said in the documentation file from Kangyu(No first blank).

    Read embl_profile block by block and write output_file: four header
    lines, then one line per block for which an id, an mc and a cc value
    are all available, then a closing '//' line.

    embl_profile: path of the file to read
    output_file: path of the file to write
    matrix_id2acc: mapping id -> accession, indexed with every written id
    mc_given: if true, replaces the MC value read from each block
    cc_given: if true, replaces the CC value read from each block

    Raises KeyError when a block's id is missing from matrix_id2acc.
    """
    sys.stderr.write("Parsing from %s to %s..." % (embl_profile, output_file))
    id_prefix = 'ID' + ' ' * 1
    mc_prefix = 'MC' + ' ' * 1
    cc_prefix = 'CC' + ' ' * 1
    inf = of = None
    try:
        inf = open(embl_profile, 'r')
        blocks = unigene_data_block_iterator(inf)
        of = open(output_file, 'w')
        #write some header information
        of.write("%s\n" % os.path.basename(output_file))
        of.write("From %s.\n" % (os.path.basename(embl_profile)))
        of.write(" MIN_LENGTH 300\n")
        of.write("0.0\n")
        for block in blocks:
            if block == '':    #the last nothing block
                break
            matrix_id = None
            mc = None
            cc = None
            for line in cStringIO.StringIO(block):
                #line[3:-1] drops the 3-character tag and the trailing newline
                if line.startswith(id_prefix):
                    matrix_id = line[3:-1]
                if line.startswith(mc_prefix):
                    mc = line[3:-1]
                if line.startswith(cc_prefix):
                    cc = line[3:-1]
            #caller-supplied values take precedence over the ones in the file
            if mc_given:
                mc = mc_given
            if cc_given:
                cc = cc_given
            if matrix_id and mc and cc:
                acc = matrix_id2acc[matrix_id]
                of.write(' 1.000000 %s %s %s %s\n' % (cc, mc, acc, matrix_id))    #first character is blank
        of.write('//\n')
    finally:
        #closing also flushes the output; the original never closed it
        for handle in (of, inf):
            if handle is not None:
                handle.close()
    sys.stderr.write("Done\n")
def class_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=10):
    """
    09-15-05
        Read inputfile block by block and submit one row per block through
        self.submit2class().

    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type); the parsed unit's
    acc, id, class_struc, comment, factors, external_database_links and
    reference become the row.

    curs: passed through to self.submit2class()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            #comment is description
            row = [unit.acc, unit.id, unit.class_struc, unit.comment,
                   unit.factors, unit.external_database_links, unit.reference]
            self.submit2class(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def matrix_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=3):
    """
    09-15-05
        reference is a list of ref_acc
        add pwm and consensus
    09-16-05
        add site_in_matrix_accs to output_table('matrix')
        add site_in_matrix_list to submit to site_in_matrix

    Read inputfile block by block and submit one row per block through
    self.submit2matrix(), together with the unit's site_in_matrix_list.
    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2matrix()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.tf_name, unit.description, unit.factors,
                   unit.pwm, unit.consensus, unit.basis, unit.sites,
                   unit.site_in_matrix_accs, unit.comment, unit.reference]
            self.submit2matrix(curs, output_table, row, unit.site_in_matrix_list)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")
def site_parse(self, curs, inputfile, output_table, organism_given, sequence_type, run_type=6):
    """
    09-15-05
        Read inputfile block by block and submit one row per block through
        self.submit2site().

    Each block yielded by unigene_data_block_iterator() except the first is
    parsed with self.transfac_embl_parse(block, run_type).

    curs: passed through to self.submit2site()
    inputfile: path of the file to read
    output_table: destination table name (also echoed in the progress message)
    organism_given, sequence_type: not used by this method
    run_type: forwarded to self.transfac_embl_parse()
    """
    sys.stderr.write("Parsing for %s...\n" % output_table)
    inf = open(inputfile, 'r')
    try:
        block_no = 0
        for block in unigene_data_block_iterator(inf):
            block_no += 1
            if block_no == 1:    #skip the first block
                continue
            unit = self.transfac_embl_parse(block, run_type)
            row = [unit.acc, unit.id, unit.type, unit.description, unit.gene_acc,
                   unit.organism, unit.gene_region, unit.sequence, unit.element,
                   unit.position_ref_point, unit.position_start, unit.position_end,
                   unit.cell_source, unit.method, unit.comment,
                   unit.external_database_links, unit.reference]
            self.submit2site(curs, output_table, row)
            if self.report and block_no % 500 == 0:
                #backspaces rewind the cursor so the counter overwrites itself
                sys.stderr.write('%s%s' % ('\x08' * 20, block_no))
    finally:
        #close explicitly so the handle is released even if parsing fails
        inf.close()
    sys.stderr.write("Done\n")