def open_molecule_file(uploadedfile, logfile=os.devnull, filetype=None):
    #charset = 'utf-8'
    #if "charset" in uploadedfile and uploadedfile.charset is not None:
    #    charset = uploadedfile.charset
    if filetype is None:
        if not hasattr(uploadedfile, "filetype") or uploadedfile.filetype is None:
            # Infer the file type from the file extension.
            basename, ext = os.path.splitext(uploadedfile.name)
            ext = ext.lower().strip('.')
            if ext in MOLECULE_EXTENSION_TYPES.keys():
                filetype = MOLECULE_EXTENSION_TYPES[ext]
                uploadedfile.filetype = filetype
            else:
                raise InvalidMoleculeFileExtension(ext=ext)
        else:
            filetype = uploadedfile.filetype
    # Redirect stdout and stderr to the log file while RDKit parses the molecule.
    with stdout_redirected(to=logfile, stdout=sys.stderr):
        with stdout_redirected(to=logfile, stdout=sys.stdout):
            print('Loading molecule...')
            uploadedfile.seek(0)
            if filetype == 'sdf' or filetype == 'mol':
                suppl = ForwardSDMolSupplier(uploadedfile, removeHs=False)
                mol = next(suppl)
                try:
                    # The file must contain exactly one molecule.
                    next(suppl)
                except StopIteration:
                    pass
                except:
                    raise
                else:
                    raise MultipleMoleculesinSDF()
                finally:
                    del suppl
                if mol is None:
                    if filetype == 'sdf':
                        raise ParsingError("Invalid SDF file.")
                    else:
                        raise ParsingError("Invalid MDL Mol file.")
            print('Assigning chirality from structure...')
            AssignAtomChiralTagsFromStructure(mol, replaceExistingTags=False)
            print('Finished loading molecule.')
    return mol
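# Usage sketch (illustrative only, not part of the original code): loading a
# molecule from an uploaded SD/Mol file, e.g. in a Django view.
# "request.FILES['molfile']" and the log file name are hypothetical; the
# exception types are the ones raised above.
#
#   try:
#       mol = open_molecule_file(request.FILES['molfile'], logfile='rdkit_load.log')
#   except (InvalidMoleculeFileExtension, MultipleMoleculesinSDF, ParsingError) as err:
#       ...  # report the problem to the user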
def get_chembl_molecule_ids(datachembl, parents=False):
    ids = []
    try:
        for molecule in datachembl["molecules"]:
            if (parents and "molecule_hierarchy" in molecule.keys()
                    and molecule["molecule_hierarchy"]):
                if "parent_chembl_id" in molecule["molecule_hierarchy"].keys():
                    chembl_id = molecule["molecule_hierarchy"]["parent_chembl_id"]
                else:
                    chembl_id = molecule["molecule_chembl_id"]
            else:
                # Fall back to the molecule's own ID when parents are not
                # requested or no hierarchy information is available.
                chembl_id = molecule["molecule_chembl_id"]
            ids.append(int(chembl_id.replace('CHEMBL', '')))
        ids = list_unique(ids)
    except:
        raise ParsingError("Cannot parse ChEMBL molecule information.")
    return ids
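# Usage sketch (illustrative, assumes the ChEMBL web-services JSON layout with
# a top-level "molecules" list): extracting parent compound IDs from a
# previously fetched response. "resp" is hypothetical.
#
#   datachembl = resp.json()
#   parent_ids = get_chembl_molecule_ids(datachembl, parents=True)
#   # -> unique integer IDs with the 'CHEMBL' prefix stripped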
def chembl_get_molregno_from_html(chemblid,
                                  getmol_url='/chembl/download_helper/getmol/'):
    URL = "https://www.ebi.ac.uk/chembl/compound/inspect/"
    molregno = None
    do_not_skip_on_debug = False
    SIZE_LIMIT = 512000
    RECIEVE_TIMEOUT = 120
    errdata = dict()
    try:
        response = requests.get(URL + chemblid, timeout=30, stream=False,
                                verify=True)
        response.raise_for_status()
        encoding = response.encoding
        chunks = response.iter_content(chunk_size=524288)
        parser = ChemblUrlInspectCompoundDownloadMolHTMLParser(
            getmol_url=getmol_url)
        size = 0
        start = time.time()
        # Feed the HTML to the parser in chunks, enforcing size and time limits.
        for chunk in chunks:
            size += len(chunk)
            if size > SIZE_LIMIT:
                raise StreamSizeLimitError('response too large')
            if time.time() - start > RECIEVE_TIMEOUT:
                raise StreamTimeoutError('timeout reached')
            chunk = chunk.decode(encoding)
            parser.feed(chunk)
            if not parser.keep_parsing:
                break
        parser.close()
        href_url = parser.url
        # The molregno is the numeric suffix of the "getmol" download link.
        m = re.search(re.escape(getmol_url) + r'(\d+)', href_url)
        if m:
            molregno = int(m.group(1))
        if molregno is None:
            raise ParsingError("Molecule structure molregno not found.")
        return (molregno, errdata)
    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        errdata['reason'] = ''
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            parser.close()
        except:
            pass
        try:
            response.close()
        except:
            pass
        # Unexpected internal errors are re-raised only in DEBUG mode;
        # otherwise the error is reported through errdata.
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (molregno, errdata)
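# Usage sketch (illustrative, 'CHEMBL25' is just a placeholder identifier): the
# network helpers in this module share a (value, errdata) contract, where
# errdata is empty on success and carries 'Error'/'ErrorType'/'reason' keys on
# failure.
#
#   molregno, errdata = chembl_get_molregno_from_html('CHEMBL25')
#   if errdata.get('Error'):
#       ...  # inspect errdata['ErrorType'] and errdata['reason']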
def chembl_get_compound_id_query_result_url(
        postdata,
        chembl_submission_url='https://www.ebi.ac.uk/chembl/compound/ids'):
    results_url = None
    do_not_skip_on_debug = False
    SIZE_LIMIT = 512000
    RECIEVE_TIMEOUT = 120
    errdata = dict()
    try:
        response = requests.post(chembl_submission_url, data=postdata,
                                 timeout=30, stream=False, verify=True)
        response.raise_for_status()
        encoding = response.encoding
        chunks = response.iter_content(chunk_size=524288)
        target = ChemblResultsUrl()
        parser = xmlparser(target=target)
        size = 0
        start = time.time()
        # Feed the XML response to the parser in chunks, enforcing size and
        # time limits.
        for chunk in chunks:
            size += len(chunk)
            if size > SIZE_LIMIT:
                raise StreamSizeLimitError('response too large')
            if time.time() - start > RECIEVE_TIMEOUT:
                raise StreamTimeoutError('timeout reached')
            chunk = chunk.decode(encoding)
            parser.feed(chunk)
            if not target.keep_parsing:
                break
        results_url = parser.close()
        if results_url is None:
            raise ParsingError("No query result url found.")
        results_url = results_url.replace(':', '/')
        return (results_url, errdata)
    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        errdata['reason'] = ''
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            parser.close()
        except:
            pass
        try:
            response.close()
        except:
            pass
        # Unexpected internal errors are re-raised only in DEBUG mode;
        # otherwise the error is reported through errdata.
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (results_url, errdata)
def retreive_isoform_data_uniprot(acnum, size_limit=512000, buffer_size=512000,
                                  recieve_timeout=120, connect_timeout=30):
    COLUMNS = 'comment(ALTERNATIVE PRODUCTS)'
    KEYS = set(('Event', 'Named isoforms', 'Comment', 'Name', 'Synonyms',
                'IsoId', 'Sequence', 'Note'))
    MANDATORY_KEYS = set(('Event', 'IsoId', 'Sequence'))
    do_not_skip_on_debug = False
    data, errdata = retreive_data_uniprot(acnum, columns=COLUMNS,
                                          size_limit=size_limit,
                                          buffer_size=buffer_size,
                                          recieve_timeout=recieve_timeout,
                                          connect_timeout=connect_timeout)
    try:
        if data == dict():
            print('No isoform data returned.')
            return
        elif 'Alternative products (isoforms)' not in data.keys():
            raise ParsingError('Cannot parse isoform data.')
        rawdata = data.pop('Alternative products (isoforms)')
        if rawdata.find('ALTERNATIVE PRODUCTS:') == 0:
            rawdata = rawdata[22:].strip()
            rows = rawdata.split(';')
            if rawdata[-1] == ';':
                rows.pop()
            # Each row is a "Key=Value" pair of the ALTERNATIVE PRODUCTS comment.
            for row in rows:
                row = row.strip()
                keyval = row.split('=')
                key = keyval[0]
                val = keyval[1].strip()
                if key == 'Named isoforms':
                    try:
                        data[key] = int(val)
                    except ValueError:
                        raise ParsingError(
                            'Cannot parse isoform data, invalid format.')
                    except:
                        raise
                else:
                    if key not in data.keys():
                        data[key] = []
                    if key not in set(('Name', 'Note', 'Comment')):
                        val = [i.strip() for i in val.split(',')]
                    data[key].append(val)
            datakeys = set(data.keys())
            if datakeys.issubset(KEYS) and MANDATORY_KEYS.issubset(datakeys):
                # Record which isoform carries the displayed (canonical) sequence.
                for acnlist, seqlist in zip(data['IsoId'], data['Sequence']):
                    for seq in seqlist:
                        if seq == 'Displayed':
                            data['Displayed'] = acnlist[0]
            else:
                raise ParsingError('Cannot parse isoform data, invalid format.')
        elif rawdata != '':
            raise ParsingError('Cannot parse isoform data.')
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
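# Usage sketch (illustrative, 'P04637' is just a placeholder accession): the
# returned dictionary mirrors the UniProt "ALTERNATIVE PRODUCTS" comment, e.g.
# data['Named isoforms'], data['IsoId'], data['Sequence'] and, when present,
# data['Displayed'] holding the accession of the canonical isoform.
#
#   isodata, errdata = retreive_isoform_data_uniprot('P04637')
#   if not errdata.get('Error'):
#       print(isodata.get('Named isoforms'), isodata.get('Displayed'))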
def retreive_fasta_seq_uniprot(acnum, size_limit=102400, buffer_size=512000,
                               recieve_timeout=120, connect_timeout=30):
    URL = 'http://www.uniprot.org/uniprot/'
    data = dict()
    errdata = dict()
    do_not_skip_on_debug = False
    try:
        response = requests.get(URL + str(acnum) + '.fasta',
                                timeout=connect_timeout, stream=True)
        response.raise_for_status()
        encoding = response.encoding
        sequencere = re.compile('^[A-Z]+$')
        header = ''
        sequence = ''
        size = 0
        headerread = False
        chunkend = False
        remain = ''
        start = time.time()
        chunks = response.iter_content(chunk_size=buffer_size)
        while True:
            try:
                chunk = next(chunks)
                size += len(chunk)
                if size > size_limit:
                    raise StreamSizeLimitError('Response too large.')
                if time.time() - start > recieve_timeout:
                    raise StreamTimeoutError(
                        'Stream download time limit reached.')
                chunk = chunk.decode(encoding)
                # Prepend the incomplete line left over from the previous chunk.
                chunk = remain + chunk
            except StopIteration:
                if chunkend:
                    break
                else:
                    # Process the final partial line, then leave the loop on the
                    # next iteration.
                    lines = [remain]
                    chunkend = True
            except:
                raise
            else:
                lines = chunk.split('\n')
                remain = lines.pop()
            for line in lines:
                if line != '':
                    isheader = (line.find('>sp|') == 0 or line.find('>tr|') == 0)
                    if headerread or not isheader:
                        if sequencere.search(line):
                            sequence += line.replace('-', 'X')
                        elif line.find('*') >= 0:
                            raise ParsingError(
                                'Cannot parse fasta line:\n'
                                ' translation stop character ("*") not accepted.')
                        elif isheader:
                            # A second FASTA record starts here: stop parsing.
                            return
                        else:
                            raise ParsingError('Cannot parse fasta line:\n'
                                               '"' + line + '"')
                    else:
                        # do only for header
                        header = line
                        headerread = True
        data['header'] = header
        data['sequence'] = sequence
    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
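# Usage sketch (illustrative, placeholder accession): fetching a single FASTA
# record; 'sequence' is the concatenated residue string with '-' replaced by 'X'.
#
#   fasta, errdata = retreive_fasta_seq_uniprot('P04637')
#   if not errdata.get('Error'):
#       print(fasta['header'])
#       print(len(fasta['sequence']), 'residues')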
def retreive_data_uniprot(acnum, isoform=None,
                          columns='id,entry name,reviewed,protein names,organism,length',
                          size_limit=512000, buffer_size=512000,
                          recieve_timeout=120, connect_timeout=30):
    """Return a dictionary with the selected columns as keys. 'id' --> 'entry'."""
    URL = 'http://www.uniprot.org/uniprot/?'
    data = dict()
    errdata = dict()
    do_not_skip_on_debug = False
    try:
        if isoform is None:
            seqstr = ''
        else:
            seqstr = '+AND+sequence:' + acnum + '-' + str(isoform)
        #~ print(URL+'query=accession:'+str(acnum)+'+AND+active:yes'+seqstr+'&columns='+columns+'&format=tab')
        response = requests.get(URL + 'query=accession:' + str(acnum) +
                                '+AND+active:yes' + seqstr +
                                '&columns=' + columns + '&format=tab',
                                timeout=connect_timeout, stream=True)
        response.raise_for_status()
        encoding = response.encoding
        rowcounter = 0
        size = 0
        start = time.time()
        headersread = False
        chunkend = False
        remain = ''
        chunks = response.iter_content(chunk_size=buffer_size)
        while True:
            try:
                chunk = next(chunks)
                size += len(chunk)
                if size > size_limit:
                    raise StreamSizeLimitError('Response too large.')
                if time.time() - start > recieve_timeout:
                    raise StreamTimeoutError(
                        'Stream download time limit reached.')
                chunk = chunk.decode(encoding)
                # Prepend the incomplete line left over from the previous chunk.
                chunk = remain + chunk
            except StopIteration:
                if chunkend:
                    break
                else:
                    # Process the final partial line, then leave the loop on the
                    # next iteration.
                    lines = [remain]
                    chunkend = True
            except:
                raise
            else:
                lines = chunk.split('\n')
                remain = lines.pop()
            for line in lines:
                if line != '':
                    vallist = line.split('\t')
                    if headersread:
                        if rowcounter > 0:
                            raise ParsingError(
                                'Error parsing data: secondary accession number '
                                'pointing to multiple entries.')
                        if len(headers) == len(vallist):
                            # Map each tab-separated value to its column header.
                            for header, value in zip(headers, vallist):
                                data[str(header.strip())] = value.strip()
                        else:
                            raise ParsingError('Error parsing data.')
                        rowcounter += 1
                    else:
                        # do only for first line
                        headers = vallist
                        headersread = True
    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
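# Usage sketch (illustrative, placeholder accession): querying the tabular
# UniProt endpoint for a single entry. Keys of the returned dictionary are the
# column headers of the tab-separated response as sent by the legacy
# uniprot.org service (e.g. 'Entry', 'Entry name', 'Length').
#
#   info, errdata = retreive_data_uniprot('P04637', columns='id,entry name,length')
#   if not errdata.get('Error'):
#       print(info.get('Entry'), info.get('Entry name'), info.get('Length'))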