# Imports needed by the functions in this section. The standard-library
# imports are certain; the SBI module paths below are assumed from the
# SBI package layout and may differ between versions. Arch, SSpair,
# Source, TM, Cclass, Subclass, Loop, readlist, start_transaction and
# end_transaction are provided elsewhere in this package.
import os
import re
import sys
import gzip
import urllib

from SBI import SBIglobals
from SBI.beans import File, Path


def sortarchs(inputdir, outputdir):
    archsdir = outputdir
    Path.mkdir(archsdir)
    sorted_archs = {}
    loop_file_name = os.path.join(archsdir, 'ArchDB.{0}.db')
    loop_split_file_name = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]
    # Bucket the .archObj files by arch type and loop length; file names
    # are expected to look like '<length>_<archtype>_...'.
    for archfile in Path.list_files(root=inputdir, pattern='*.archObj'):
        filename = os.path.basename(archfile)
        data = filename.split('_')
        length = int(data[0])
        archtype = data[1]
        sorted_archs.setdefault(archtype, {}).setdefault(length, [])
        sorted_archs[archtype][length].append(archfile)

    for archtype in sorted_archs:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        # One global file per arch type plus one file per length section.
        fd = File(loop_file_name.format(archtype), 'w')
        fdp = []
        for x in range(len(sections_ini)):
            fdp.append(File(loop_split_file_name.format(
                archtype, sections_ini[x], sections_end[x]), 'w'))
        for length in sorted(sorted_archs[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in sorted_archs[archtype][length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                fd.descriptor.write(nsp.archtype_format() + "\n")
                # An end of 0 marks the open-ended last section.
                for x in range(len(fdp)):
                    if length >= sections_ini[x] and \
                            (sections_end[x] == 0 or length <= sections_end[x]):
                        fdp[x].descriptor.write(nsp.archtype_format() + "\n")
        fd.close()
        for x in range(len(fdp)):
            fdp[x].close()
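# The section split above assigns a loop to every [ini, end] length band
# it falls in, where end == 0 marks the open-ended (21+) band. Below is a
# minimal standalone sketch of that rule, assuming the same band
# boundaries (the helper name is illustrative, not part of the module):
def _sections_for_length(length,
                         sections_ini=(0, 4, 7, 14, 21),
                         sections_end=(4, 6, 13, 20, 0)):
    """Return the indices of the length bands a loop of `length` belongs to."""
    return [i for i, (ini, end) in enumerate(zip(sections_ini, sections_end))
            if length >= ini and (end == 0 or length <= end)]

# _sections_for_length(4)  -> [0, 1]   (bands 0-4 and 4-6 overlap at 4)
# _sections_for_length(25) -> [4]      (open-ended 21+ band)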
def Arch2SQL(database, sqlfile, verbose):
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(database))

    # Write the data source definition into its own SQL dump.
    newsource = Source(name='PDBarch', source='http://www-pdb.org/')
    outdir = os.path.join(os.path.abspath(sqlfile), '00')
    Path.mkdir(outdir)
    sql_fd = gzip.open(os.path.join(outdir, '0000.sql.gz'), 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(newsource.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()

    # Index every .archObj file by the identifier encoded in its name.
    files_list_by_pdb = {}
    subdirs = ['archobj', 'superobj']
    for subdir in subdirs:
        for archobjfile in Path.list_files(os.path.join(database, subdir)):
            if archobjfile.endswith('.archObj'):
                data = tuple(os.path.splitext(
                    os.path.split(archobjfile)[-1])[0].split('_')[2:])
                files_list_by_pdb[data] = archobjfile

    old_pdb = None
    newArchSet = None
    for dofdata in sorted(files_list_by_pdb):
        pdb = dofdata[0] + '_' + dofdata[1]
        if pdb != old_pdb:
            # Flush the previous entry before starting a new per-PDB dump.
            if old_pdb is not None:
                sql_fd.write(newArchSet.toSQL())
                sql_fd.write(end_transaction())
                sql_fd.close()
            outdir = os.path.join(os.path.abspath(sqlfile),
                                  dofdata[0][1:3].lower())
            Path.mkdir(outdir)
            if verbose:
                sys.stderr.write("Retrieving loops from {0} ...\n".format(pdb))
            sql_fd = gzip.open(os.path.join(outdir, pdb + '.sql.gz'), 'wb')
            sql_fd.write(start_transaction())
            if verbose:
                sys.stderr.write("Printing data from {0} ...\n".format(pdb))
            old_pdb = pdb
            newArchSet = Arch(pdb)
        newArchSet.archs = SSpair.load(files_list_by_pdb[dofdata])

    # Flush the last open dump.
    sql_fd.write(newArchSet.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
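# start_transaction() and end_transaction() are imported from elsewhere
# in the package; a minimal sketch of the shape Arch2SQL assumes (plain
# SQL strings wrapping each gzipped per-PDB dump). The _sketch names
# mark these as assumptions, not the package's actual implementations:
def _start_transaction_sketch():
    return "BEGIN TRANSACTION;\n"

def _end_transaction_sketch():
    return "COMMIT;\n"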
def get_PDB(self, pdbID):
    if self.has_local:
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                return pdb_file
    # If the entry is not found locally (or no local mirror is set),
    # fall back to the PDB FTP server.
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source = 'ftp://' + PDBftp['address'] + os.path.join(
        PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except Exception:
        return False
    return os.path.abspath(pdb_file)
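# get_PDB assumes a PDBftp configuration mapping; the sketch below is
# illustrative of the public wwPDB layout (the real values live in the
# package configuration and may differ):
#
#   PDBftp = {'address':    'ftp.wwpdb.org',
#             'structures': '/pub/pdb/data/structures/divided/pdb'}
#
# With pdbID='1abc' the constructed URL would then be:
#   ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/ab/pdb1abc.ent.gz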
def _process(self):
    tmoFile = File(self._pdbtmfile, 'w', True)
    for xmlfile in Path.list_files(
            os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
        xmldata = TM(
            pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
        skip_chains = set()
        read = False
        fdxml = open(xmlfile)
        for line in fdxml:
            if line.startswith(' <TMRES>'):
                xmldata.tmres = line
            elif line.startswith(' <TMTYPE'):
                xmldata.tmtype = line
            elif line.startswith(' <PDBKWRES'):
                xmldata.kwres = line
            elif line.startswith(' <SIDEDEFINITION'):
                m = re.search(r'Side1="(\S+)"', line)
                xmldata.side = m.group(1)
            elif line.startswith(' <APPLY_TO_CHAIN'):
                # Chains renamed via APPLY_TO_CHAIN are marked to be skipped.
                m = re.search(r'NEW_CHAINID="(\S{1})"', line)
                if m:
                    skip_chains.add(m.group(1))
            elif line.startswith(' <CHAIN '):
                m = re.search(
                    r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"', line)
                if m:
                    chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                    if chain not in skip_chains:
                        cdata = tuple([chain, num, tmtype])
                        xmldata.set_chain(cdata)
                        read = True
            elif line.startswith(' <REGION ') and read:
                # Membrane regions are attached to the current chain (cdata).
                m = re.search(
                    r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"\s+type="(\w{1})"',
                    line)
                ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
            elif line.startswith(' </CHAIN>'):
                read = False
        fdxml.close()
        if len(xmldata.chains) > 0:
            tmoFile.write(str(xmldata) + "\n")
    tmoFile.close()
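# A worked example of the <CHAIN> regex above (the sample line is
# illustrative of the PDBTM XML layout, not taken from a real entry):
#
#   line = '  <CHAIN CHAINID="A" NUM_TM="7" TYPE="alpha">'
#   m = re.search(r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"', line)
#   m.groups()  ->  ('A', '7', 'alpha')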
def DS2SQL(database, looplist, sqlfile, verbose):
    Path.mkdir(sqlfile)
    for dsfile in Path.list_files(database):
        subclasstype = os.path.split(dsfile)[-1].split('.')[1]
        classification = Cclass(subclasstype)
        if verbose:
            sys.stderr.write(
                "Retrieving data for subclass {0} ...\n".format(subclasstype))
        loops = readlist(looplist, subclasstype)
        sql_fd = gzip.open(os.path.join(sqlfile, subclasstype + '.sql.gz'), 'wb')
        sql_fd.write(start_transaction())
        sql_in = open(dsfile)
        read = False
        for line in sql_in:
            dataline = line.rstrip('\n')
            # Skip separators, blank lines and headers.
            if (line.startswith('==') or line.startswith('***')
                    or len(line.strip()) == 0
                    or line.startswith('---- P R O T E I N C O D E ----')):
                continue
            # A new subclass block starts here. Note that 'ALIGNEMENT' and
            # the other misspellings below match the DS file format verbatim.
            if line.startswith('CONSENSUS & MULTIPLE ALIGNEMENT IN THE'):
                data = line.split(':')[-1].strip().split()
                classification.subclasses = Subclass(
                    tuple([data[0].strip(), data[3].strip()]), data[4])
                workscls = classification.lastsubclass
                read = True
                continue
            if line.startswith('GLOBAL STATISTICS'):
                read = False
                continue
            if read:
                if line.startswith(' SEQUENCE ALIGNEMENT :'):
                    parse_mode, counter = 'P', 0
                elif line.startswith(' ACCESSIBLE SURFACE ALIGNEMENT :'):
                    parse_mode, counter = 'E', 0
                elif line.startswith(' RAMACHANDRAN :'):
                    parse_mode, counter = 'R', 0
                elif line.startswith(' SECONDARY STRUCTURE :'):
                    parse_mode, counter = 'S', 0
                elif line.startswith('--------- CONSENSUS THORNTON :'):
                    workscls.add_consensus(dataline, 'DS', loops)
                elif line.startswith('--------- CONSENSUS TOPOLOGY'):
                    workscls.add_topology(dataline, 'DS')
                elif line.startswith('CENTROIDE POLAR COORD. :'):
                    workscls.add_coordinates(dataline)
                elif line.startswith('--------- RAMACHANDRAN PATTERN :'):
                    workscls.ram_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- SEQUENCE PATTERN :'):
                    workscls.seq_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- BURIAL PATTERN :'):
                    workscls.exp_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith(' ') and len(dataline) < 400:
                    # Indented data rows belong to the block selected by
                    # the last header (parse_mode).
                    if parse_mode == 'P':
                        workscls.loops = Loop(info=dataline)
                    if parse_mode == 'E':
                        workscls.loops[counter].add_surface(info=dataline)
                        counter += 1
                    if parse_mode == 'R':
                        workscls.loops[counter].add_ramachandran(info=dataline)
                        counter += 1
                    if parse_mode == 'S':
                        workscls.loops[counter].add_secondary_str(info=dataline)
                        counter += 1
        sql_fd.write(classification.toSQL('DS'))
        sql_in.close()
        sql_fd.write(end_transaction())
        sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
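# The reader above is a small state machine: each block header sets
# parse_mode ('P' sequence, 'E' surface, 'R' ramachandran, 'S' secondary
# structure) and resets counter, and each indented data row is dispatched
# to the loop at position `counter`. A toy reduction of that pattern,
# with illustrative names:
def _parse_blocks(lines, headers=('SEQ', 'SURF')):
    mode, rows = None, dict((h, []) for h in headers)
    for ln in lines:
        if ln in headers:        # a header line switches the active block
            mode = ln
        elif mode is not None:   # data rows accumulate under that block
            rows[mode].append(ln)
    return rows

# _parse_blocks(['SEQ', 'a', 'b', 'SURF', 'c'])
#   -> {'SEQ': ['a', 'b'], 'SURF': ['c']}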
def localPDBeChems(self):
    for chem_file in Path.list_files(root=self.local, pattern='*.cif'):
        yield chem_file
def localPDBs(self):
    for pdb_file in Path.list_files(root=self.local, pattern='*.ent.gz'):
        yield pdb_file
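# Usage sketch for both generators, assuming an instance `db` of the
# class that defines them (the class itself is not shown in this excerpt):
#
#   n_structures = sum(1 for _ in db.localPDBs())
#   n_chemicals  = sum(1 for _ in db.localPDBeChems())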