def open_molecule_file(uploadedfile, logfile=os.devnull, filetype=None):

    #charset = 'utf-8'
    #if "charset" in uploadedfile and uploadedfile.charset is not None:
    #charset = uploadedfile.charset
    if filetype is None:
        if "filetype" not in uploadedfile or uploadedfile.filetype is None:
            basename, ext = os.path.splitext(uploadedfile.name)
            ext = ext.lower()
            ext = ext.strip('.')
            if ext in MOLECULE_EXTENSION_TYPES.keys():
                filetype = MOLECULE_EXTENSION_TYPES[ext]
                uploadedfile.filetype = filetype
            else:
                raise InvalidMoleculeFileExtension(ext=ext)

        else:
            filetype = uploadedfile.filetype

    with stdout_redirected(to=logfile, stdout=sys.stderr):
        with stdout_redirected(to=logfile, stdout=sys.stdout):
            print('Loading molecule...')
            uploadedfile.seek(0)
            if filetype == 'sdf' or filetype == 'mol':

                suppl = ForwardSDMolSupplier(uploadedfile, removeHs=False)
                mol = next(suppl)
                try:
                    next(suppl)
                except StopIteration:
                    pass
                except:
                    raise
                else:
                    raise MultipleMoleculesinSDF()
                finally:
                    del suppl
                if mol is None:
                    if filetype == 'sdf':
                        raise ParsingError("Invalid SDFile file.")
                    else:
                        raise ParsingError("Invalid MDL Mol file.")
            print('Assigning chirality from structure...')
            AssignAtomChiralTagsFromStructure(mol, replaceExistingTags=False)
            print('Finished loading molecule.')

    return mol
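
A minimal usage sketch for open_molecule_file, assuming the function above and the helpers it references (stdout_redirected, the custom exceptions, RDKit's ForwardSDMolSupplier) are importable from the surrounding module; 'ligand.sdf' is a hypothetical single-molecule SD file, and passing filetype explicitly skips the extension lookup:

from rdkit.Chem import MolToSmiles

with open('ligand.sdf', 'rb') as sdf:  # hypothetical input, opened in binary mode
    mol = open_molecule_file(sdf, filetype='sdf')
print(MolToSmiles(mol))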
Example #2
def get_chembl_molecule_ids(datachembl, parents=False):
    ids = []
    try:
        for molecule in datachembl["molecules"]:
            if parents and "molecule_hierarchy" in molecule.keys(
            ) and molecule["molecule_hierarchy"]:
                if "parent_chembl_id" in molecule["molecule_hierarchy"].keys():
                    chembl_id = molecule["molecule_hierarchy"][
                        "parent_chembl_id"]
            else:
                chembl_id = molecule["molecule_chembl_id"]
            ids.append(int(chembl_id.replace('CHEMBL', '')))
        ids = list_unique(ids)
    except Exception:
        raise ParsingError("Cannot parse ChEMBL molecule information.")
    return ids
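
A hedged example of the payload shape get_chembl_molecule_ids expects: a dict mimicking the ChEMBL web-service "molecules" response. The ChEMBL ids below are made up for illustration, and list_unique is assumed to be the module's own de-duplication helper:

datachembl = {
    "molecules": [
        {"molecule_chembl_id": "CHEMBL111111",   # made-up child record
         "molecule_hierarchy": {"parent_chembl_id": "CHEMBL222222"}},
        {"molecule_chembl_id": "CHEMBL333333",   # made-up record without a hierarchy
         "molecule_hierarchy": None},
    ]
}
print(get_chembl_molecule_ids(datachembl))                # expected: [111111, 333333]
print(get_chembl_molecule_ids(datachembl, parents=True))  # expected: [222222, 333333]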
Example #3
def chembl_get_molregno_from_html(chemblid,
                                  getmol_url='/chembl/download_helper/getmol/'
                                  ):
    URL = "https://www.ebi.ac.uk/chembl/compound/inspect/"
    molregno = None
    do_not_skip_on_debug = False
    SIZE_LIMIT = 512000
    RECIEVE_TIMEOUT = 120
    errdata = dict()
    try:
        response = requests.get(URL + chemblid,
                                timeout=30,
                                stream=False,
                                verify=True)
        response.raise_for_status()
        encoding = response.encoding
        chunks = response.iter_content(chunk_size=524288)
        parser = ChemblUrlInspectCompoundDownloadMolHTMLParser(
            getmol_url=getmol_url)

        size = 0
        start = time.time()
        for chunk in chunks:
            size += len(chunk)
            if size > SIZE_LIMIT:
                raise StreamSizeLimitError('response too large')
            if time.time() - start > RECIEVE_TIMEOUT:
                raise StreamTimeoutError('timeout reached')
            chunk = chunk.decode(encoding)
            parser.feed(chunk)
            if not parser.keep_parsing:
                break
        parser.close()
        href_url = parser.url
        m = re.search(re.escape(getmol_url) + r'(\d+)', href_url)
        if m:
            molregno = int(m.group(1))
        if molregno is None:
            raise ParsingError("Molecule structure molregno not found.")
        return (molregno, errdata)

    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        errdata['reason'] = ''
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            parser.close()
        except:
            pass
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (molregno, errdata)
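
A usage sketch, assuming the module-level exception classes, the ChemblUrlInspectCompoundDownloadMolHTMLParser, and Django settings referenced above are available, and that the legacy ChEMBL "compound/inspect" page still serves the markup the parser expects; CHEMBL25 is only an example identifier:

molregno, errdata = chembl_get_molregno_from_html('CHEMBL25')
if errdata.get('Error'):
    print('Lookup failed:', errdata.get('ErrorType'), errdata.get('reason'))
else:
    print('molregno:', molregno)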
Example #4
def chembl_get_compound_id_query_result_url(
        postdata,
        chembl_submission_url='https://www.ebi.ac.uk/chembl/compound/ids'):
    results_url = None
    do_not_skip_on_debug = False
    SIZE_LIMIT = 512000
    RECIEVE_TIMEOUT = 120
    errdata = dict()
    try:
        response = requests.post(chembl_submission_url,
                                 data=postdata,
                                 timeout=30,
                                 stream=False,
                                 verify=True)
        response.raise_for_status()
        encoding = response.encoding
        chunks = response.iter_content(chunk_size=524288)
        target = ChemblResultsUrl()
        parser = xmlparser(target=target)

        size = 0
        start = time.time()
        for chunk in chunks:
            size += len(chunk)
            if size > SIZE_LIMIT:
                raise StreamSizeLimitError('response too large')
            if time.time() - start > RECIEVE_TIMEOUT:
                raise StreamTimeoutError('timeout reached')
            chunk = chunk.decode(encoding)
            parser.feed(chunk)
            if not target.keep_parsing:
                break

        results_url = parser.close()

        if results_url is None:
            raise ParsingError("No query result url found.")
        results_url = results_url.replace(':', '/')
        return (results_url, errdata)

    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        errdata['reason'] = ''
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            parser.close()
        except:
            pass
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (results_url, errdata)
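
A hedged call sketch. The exact form fields the legacy ChEMBL ids endpoint expects are not shown in this module, so the postdata key below is a placeholder used only to illustrate the return contract:

postdata = {'list': 'CHEMBL25\nCHEMBL112'}  # hypothetical form payload
results_url, errdata = chembl_get_compound_id_query_result_url(postdata)
if not errdata.get('Error'):
    print('Poll this URL for results:', results_url)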
Example #5
def retreive_isoform_data_uniprot(acnum,
                                  size_limit=512000,
                                  buffer_size=512000,
                                  recieve_timeout=120,
                                  connect_timeout=30):
    COLUMNS = 'comment(ALTERNATIVE PRODUCTS)'
    KEYS = set(('Event', 'Named isoforms', 'Comment', 'Name', 'Synonyms',
                'IsoId', 'Sequence', 'Note'))
    MANDATORY_KEYS = set(('Event', 'IsoId', 'Sequence'))
    do_not_skip_on_debug = False
    data, errdata = retreive_data_uniprot(acnum,
                                          columns=COLUMNS,
                                          size_limit=size_limit,
                                          buffer_size=buffer_size,
                                          recieve_timeout=recieve_timeout,
                                          connect_timeout=connect_timeout)
    try:

        if data == dict():
            print('Empty data returned from UniProt.')
            return (data, errdata)
        elif 'Alternative products (isoforms)' not in data.keys():
            raise ParsingError('Cannot parse isoform data.')

        rawdata = data.pop('Alternative products (isoforms)')
        if rawdata.find('ALTERNATIVE PRODUCTS:') == 0:
            rawdata = rawdata[22:].strip()
            rows = rawdata.split(';')
            if rawdata[-1] == ';':
                rows.pop()
            for row in rows:

                row = row.strip()
                # Split only on the first '=' so values may themselves contain '='.
                keyval = row.split('=', 1)
                key = keyval[0]
                val = keyval[1].strip()
                if key == 'Named isoforms':
                    try:
                        data[key] = int(val)
                    except ValueError:
                        raise ParsingError(
                            'Cannot parse isoform data, invalid format.')
                    except:
                        raise
                else:
                    if key not in data.keys():
                        data[key] = []
                    if key not in set(('Name', 'Note', 'Comment')):
                        val = [i.strip() for i in val.split(',')]
                    data[key].append(val)
            datakeys = set(data.keys())
            if datakeys.issubset(KEYS) and MANDATORY_KEYS.issubset(datakeys):
                for acnlist, seqlist in zip(data['IsoId'], data['Sequence']):
                    for seq in seqlist:
                        if seq == 'Displayed':
                            data['Displayed'] = acnlist[0]

            else:
                raise ParsingError(
                    'Cannot parse isoform data, invalid format.')
        elif rawdata != '':
            raise ParsingError('Cannot parse isoform data.')
    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
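
A usage sketch; P04637 (human p53) is only an example accession, and the call only returns data if the legacy UniProt endpoint used by retreive_data_uniprot (since superseded by rest.uniprot.org) still responds:

data, errdata = retreive_isoform_data_uniprot('P04637')
if not errdata.get('Error'):
    print(data.get('Named isoforms'), 'isoforms; displayed isoform:', data.get('Displayed'))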
Example #6
def retreive_fasta_seq_uniprot(acnum,
                               size_limit=102400,
                               buffer_size=512000,
                               recieve_timeout=120,
                               connect_timeout=30):
    URL = 'http://www.uniprot.org/uniprot/'
    data = dict()
    errdata = dict()
    do_not_skip_on_debug = False
    try:
        response = requests.get(URL + str(acnum) + '.fasta',
                                timeout=recieve_timeout,
                                stream=True)
        response.raise_for_status()
        encoding = response.encoding
        sequencere = re.compile('^[A-Z]+$')
        header = ''
        sequence = ''
        size = 0
        headerread = False
        chunkend = False
        remain = ''
        start = time.time()
        chunks = response.iter_content(chunk_size=buffer_size)
        while True:
            try:

                chunk = next(chunks)
                size += len(chunk)
                if size > size_limit:
                    raise StreamSizeLimitError('Response too large.')
                if time.time() - start > recieve_timeout:
                    raise StreamTimeoutError(
                        'Stream download time limit reached.')
                chunk = chunk.decode(encoding)
                chunk = remain + chunk

            except StopIteration:
                if chunkend:
                    break
                else:
                    lines = [remain]
                    chunkend = True
                    pass
            except:
                raise
            else:
                lines = chunk.split('\n')
                remain = lines.pop()
            for line in lines:
                if line != '':
                    isheader = line.find('>sp|') == 0 or line.find('>tr|') == 0
                    if headerread or not isheader:
                        if sequencere.search(line):
                            sequence += line.replace('-', 'X')
                        elif '*' in line:
                            raise ParsingError(
                                'Cannot parse fasta line:\n' +
                                ' translation stop character ("*") not accepted.'
                            )
                        elif isheader:
                            return
                        else:
                            raise ParsingError('Cannot parse fasta line:\n' +
                                               '"' + line + '"')
                    else:
                        #do only for header
                        header = line
                        headerread = True
            data['header'] = header
            data['sequence'] = sequence

    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)

    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
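
A usage sketch for the FASTA fetcher, again assuming the legacy uniprot.org URL still serves plain FASTA; the accession is only an example:

data, errdata = retreive_fasta_seq_uniprot('P04637')
if not errdata.get('Error') and data:
    print(data['header'])
    print(len(data['sequence']), 'residues')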
Example #7
def retreive_data_uniprot(acnum,
                          isoform=None,
                          columns='id,entry name,reviewed,protein names,organism,length',
                          size_limit=512000,
                          buffer_size=512000,
                          recieve_timeout=120,
                          connect_timeout=30):
    ### Returns a dictionary with the selected columns as keys. 'id' --> 'entry'
    URL = 'http://www.uniprot.org/uniprot/?'
    data = dict()
    errdata = dict()
    do_not_skip_on_debug = False
    try:
        if isoform is None:
            seqstr = ''
        else:
            seqstr = '+AND+sequence:' + acnum + '-' + str(isoform)
        #~ print(URL+'query=accession:'+str(acnum)+'+AND+active:yes'+seqstr+'&columns='+columns+'&format=tab')
        response = requests.get(URL + 'query=accession:' + str(acnum) +
                                '+AND+active:yes' + seqstr + '&columns=' +
                                columns + '&format=tab',
                                timeout=connect_timeout,
                                stream=True)
        response.raise_for_status()
        encoding = response.encoding
        rowcounter = 0
        size = 0
        start = time.time()
        headersread = False
        chunkend = False
        remain = ''
        chunks = response.iter_content(chunk_size=buffer_size)
        while True:

            try:

                chunk = next(chunks)
                size += len(chunk)
                if size > size_limit:
                    raise StreamSizeLimitError('Response too large.')
                if time.time() - start > recieve_timeout:
                    raise StreamTimeoutError(
                        'Stream download time limit reached.')
                chunk = chunk.decode(encoding)
                chunk = remain + chunk

            except StopIteration:
                if chunkend:
                    break
                else:
                    lines = [remain]
                    chunkend = True
                    pass
            except:
                raise
            else:
                lines = chunk.split('\n')
                remain = lines.pop()
            for line in lines:

                if line != '':
                    vallist = line.split('\t')
                    if headersread:
                        if rowcounter > 0:
                            raise ParsingError(
                                'Error parsing data: secondary accession number pointing to multiple entries.'
                            )
                        if len(headers) == len(vallist):
                            for header, value in zip(headers, vallist):
                                data[str(header.strip())] = value.strip()

                        else:
                            raise ParsingError('Error parsing data.')
                        rowcounter += 1
                    else:
                        #do only for first line

                        headers = vallist
                        headersread = True

    except HTTPError:
        errdata['Error'] = True
        errdata['ErrorType'] = 'HTTPError'
        errdata['status_code'] = response.status_code
        errdata['reason'] = response.reason
    except ConnectionError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ConnectionError'
        errdata['reason'] = 'Cannot connect.'
    except Timeout as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Timeout'
        errdata['reason'] = 'Timeout exceeded.'
    except TooManyRedirects as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'TooManyRedirects'
        errdata['reason'] = 'Too many redirects.'
    except StreamSizeLimitError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamSizeLimitError'
        errdata['reason'] = str(e)
    except StreamTimeoutError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'StreamTimeoutError'
        errdata['reason'] = str(e)

    except ParsingError as e:
        errdata['Error'] = True
        errdata['ErrorType'] = 'ParsingError'
        errdata['reason'] = str(e)
    except:
        errdata['Error'] = True
        errdata['ErrorType'] = 'Internal'
        do_not_skip_on_debug = True
        raise
    finally:
        try:
            response.close()
        except:
            pass
        if not (settings.DEBUG and do_not_skip_on_debug):
            return (data, errdata)
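
A usage sketch for the tabular fetcher; the column list is a subset of the function's default, the accession is only an example, and the legacy tab-separated query API must still be reachable for this to return data:

data, errdata = retreive_data_uniprot('P04637', columns='id,entry name,organism')
if not errdata.get('Error'):
    for key, value in data.items():
        print(key, '->', value)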