Exemplo n.º 1
0
def print_alignment_kegg(model):
    """Translate the id pairs stored in ``cor.txt`` into readable names.

    Every line of ``cor.txt`` that contains the ``:***:`` separator is split
    into a left part (a KEGG id) and a right part (a model reaction id).
    Unless a part is the literal marker ``MULTIR``:

    * the KEGG id is replaced by the text that follows the first ``NAME``
      tag of its KEGG entry, up to the end of that line;
    * the model id is replaced by ``model.reactions[<id>]``.

    Each resulting pair is printed and written to ``cor_readable.txt``.

    Args:
        model: object whose ``reactions`` attribute can be indexed with the
            reaction id read from the file.
    """
    kegg = KEGG()

    # Context managers guarantee that both files are closed even when a KEGG
    # or model lookup raises part-way through the loop.
    with open("cor.txt") as f, open("cor_readable.txt", "w") as f_o:
        for line in f:
            if ":***:" not in line:
                continue

            k, b = line.split(":***:")
            b = b.strip()

            if not k == "MULTIR":
                k = kegg.get(k)

                # Slice out the remainder of the line holding the NAME tag.
                i1 = k.find("NAME") + 4
                i2 = k[i1:].find("\n")
                k = k[i1:i1 + i2].strip()

            if not b == "MULTIR":
                b = model.reactions[b]

            print(k, ":***:", b)
            # NOTE(review): the concatenation assumes model.reactions[b]
            # yields a string -- confirm against the model type.
            f_o.write(k + ":***:" + b + "\n")
Exemplo n.º 2
0
def pathwayInfo(code):
    """Return ``[code, name, class]`` for a KEGG pathway.

    The entry for ``code`` is fetched from KEGG and parsed.  The second item
    is the first ``NAME`` value and the third item is the string form of the
    ``CLASS`` value; commas in both are replaced by semicolons so that they
    cannot break comma-separated output later on.  A missing field is
    reported as ``'NA'``.
    """
    searcher = KEGG()
    parsed = searcher.parse(searcher.get(code))

    # Pathway name: only the first NAME value is kept.
    if 'NAME' in parsed.keys():
        name_field = str(parsed['NAME'][0].replace(',', ';'))
    else:
        name_field = 'NA'

    # Pathway class: the whole CLASS value is stringified.
    if 'CLASS' in parsed.keys():
        class_field = str(parsed['CLASS']).replace(',', ';')
    else:
        class_field = 'NA'

    return [code, name_field, class_field]
Exemplo n.º 3
0
def extract_sequences(dict, flist):
    '''
    Get ortholog sequences from KEGG and write one fasta file per kegg id.

        arg dict: dictionary with a kegg id as key and, as value, a list of
            groups, each group being a sequence of KEGG gene ids
        arg flist: collection of file names that already exist; a key whose
            "<key>.fas" is in it is skipped

    Files are written to "orthologs_fastas/<key>.fas".
    '''
    kegg = KEGG()

    # Loop through the orthologs dictionary to get sequences from KEGG.
    for key, groups in dict.items():
        fasta_name = key + ".fas"
        if fasta_name in flist:
            print(key + " is already created !!!")
            continue

        # Fetch the nucleotide sequence of every gene of every group.
        sequences = [
            kegg.get(gene_id, option="ntseq", parse=True)
            for group in groups
            for gene_id in group
        ]
        # One sequence record per line block, joined in a single pass.
        fasta_text = "".join(seq + "\n" for seq in sequences)

        print("writing : " + fasta_name)
        with open(os.path.join('orthologs_fastas/', fasta_name), 'w') as f:
            f.write(fasta_text)
Exemplo n.º 4
0
def get_genes_from_kegg_pathway(pathway):
    """Return the gene symbols listed in a KEGG pathway entry.

    The pathway is fetched with the organism set to ``'hsa'`` and parsed.
    Every item of the ``GENE`` field is split on a double space; the second
    pieces are returned together as a tuple.
    """
    from bioservices.kegg import KEGG

    client = KEGG()
    client.organism = 'hsa'
    record = client.parse(client.get(pathway))

    split_entries = [entry.split('  ') for entry in record['GENE']]
    # Transpose the (first piece, second piece) pairs into two tuples.
    _entrez, symbol = zip(*split_entries)
    return symbol
Exemplo n.º 5
0
def get_kegg_info(stId):
    """
    Fetch the KEGG entry for ``stId`` and return the result of parsing it.
    """
    client = KEGG()
    return client.parse(client.get(stId))
Exemplo n.º 6
0
def retrieve_kegg_formula(reactome_compound_name):
    """Return the chemical formula of a KEGG compound, or ``None``.

    ``COMPOUND`` in the given name is replaced by ``cpd`` before querying
    KEGG.  The formula is the second whitespace-separated token of the first
    line that starts with ``FORMULA`` and actually carries a value.

    Returns:
        The formula string, or ``None`` when the lookup fails or the entry
        has no usable ``FORMULA`` line.
    """
    k = KEGG()
    compound_name = reactome_compound_name.replace('COMPOUND', 'cpd')
    res = k.get(compound_name)

    # A failed lookup comes back as an integer status code instead of text;
    # there is nothing to scan in that case.
    if isinstance(res, int):
        return None

    for line in res.split('\n'):
        if line.startswith('FORMULA'):
            tokens = line.split()
            if len(tokens) > 1:
                return tokens[1]  # the token right after the FORMULA tag
    return None
Exemplo n.º 7
0
def get_single_compound_metadata_online(compound_id):
    """Look up metadata for one compound id.

    Ids starting with ``C`` (case-insensitive) are fetched from KEGG and
    returned parsed.  Any other id is prefixed with ``CHEBI:`` and resolved
    through ``ChEBI.getCompleteEntity``, whose result is returned as is.
    """
    if not compound_id.upper().startswith('C'):
        chebi = ChEBI()
        return chebi.getCompleteEntity('CHEBI:' + compound_id)

    kegg = KEGG()
    return kegg.parse(kegg.get(compound_id))
Exemplo n.º 8
0
def extract_orthologs(filename):
    '''
    Create a dictionary with a kegg id as key and a list of orthologs as value.

        arg: tab-separated file with a "kegg_id" column
        return: dict mapping each kegg id to a list of lists of
            "<organism>:<gene>" strings, one inner list per accepted organism

    Organisms are accepted when their lower-cased code is listed in the
    "KEGG" column of the tab-separated file "gammaproteo.csv".
    '''

    orthos_dict = {}
    k = KEGG()

    # Read the ids to look up and the accepted organism codes.
    # The "tupleize_cols" argument was dropped: it no longer exists in
    # pandas.read_csv and had no effect on single-row headers.
    df = pd.read_csv(filename, sep="\t")
    df_gamma = pd.read_csv('gammaproteo.csv', sep="\t")
    # A set gives constant-time membership tests inside the loop below.
    gamma_codes = set(df_gamma['KEGG'].tolist())

    # Loop through the kegg ids to get orthologs.
    for keggid in df['kegg_id']:

        if keggid == "no":
            continue

        print(str(keggid))

        # Get the ortholog entry from KEGG.
        dict_data = k.parse(k.get(keggid))

        # An integer here means no usable entry was obtained; skip the id.
        if isinstance(dict_data, int):
            continue

        ortho_list = []
        # Keep only the organisms that are in the accepted list.
        for key, value in dict_data['GENES'].items():
            organism = key.lower()
            if organism not in gamma_codes:
                continue

            # NOTE(review): only the text before the first "(" is used, so
            # genes listed after a parenthesised name are ignored -- confirm
            # this is intended.
            gene_ids = value.split('(')[0].split()
            ortho_list.append([organism + ":" + gene_id
                               for gene_id in gene_ids])

        orthos_dict[keggid] = ortho_list

    return orthos_dict
Exemplo n.º 9
0
 def id2seq(self, hsa):
     """Write the amino-acid sequence of a KEGG entry to ``dummy.txt``.

     The entry ``hsa`` is fetched from KEGG and parsed; all whitespace is
     removed from its ``AASEQ`` field.  The file receives a FASTA-style
     header line (``>`` followed by the id) and then the sequence.  When no
     sequence can be extracted, only the header line is written.

     Returns:
         None
     """
     s = KEGG()
     dict_d = s.parse(s.get(hsa))
     try:
         seq = re.sub(r'\s+', '', dict_d['AASEQ'])
     except (KeyError, TypeError):
         # KeyError: the entry has no AASEQ field.
         # TypeError: the parsed result is not subscriptable by name or the
         # field is not text (for example after a failed lookup).
         seq = ''
     # The context manager closes the file even if the write fails.
     with open("dummy.txt", "w") as text_file:
         text_file.write('>' + str(hsa) + '\n' + seq)
     return None
Exemplo n.º 10
0
def get_metabs(KEGG, reac_id):
    """Return ``[substrates, products]`` for a KEGG reaction.

    ``KEGG`` is a client object offering ``get`` and ``parse``.  The parsed
    entry's ``EQUATION`` field is cut at ``<=>``; the part before it gives
    the substrates and the part after it gives the products.  Each side is
    split on ``+`` and every piece is stripped of surrounding whitespace.
    """
    parsed = KEGG.parse(KEGG.get(reac_id))
    sides = re.split('<=>', parsed['EQUATION'])

    def _metabolites(side):
        # "A + 2 B" -> ["A", "2 B"]
        return [name.strip() for name in side.split('+')]

    return [_metabolites(sides[0]), _metabolites(sides[1])]
Exemplo n.º 11
0
def get_compound_metadata_online(kegg_ids):
    """Fetch a display name for each KEGG id.

    For every id the KEGG entry is fetched and parsed; the first ``NAME``
    value, with all ``;`` characters removed, becomes the display name.
    Progress is printed every tenth record.  Ids whose lookup raises
    ``TypeError`` are reported and left out of the result.

    Args:
        kegg_ids: sequence of KEGG ids.

    Returns:
        dict mapping each resolved id to ``{'display_name': <name>}``.
    """
    s = KEGG()
    metadata_map = {}
    total = len(kegg_ids)
    for i, kegg_id in enumerate(kegg_ids):
        if i % 10 == 0:
            print("Retrieving %d/%d KEGG records" % (i, total))

        # Reset per record so the error message below never shows the
        # parsed data of a previous id (or fails on an unbound name).
        d = None
        try:
            d = s.parse(s.get(kegg_id))
            first_name = d['NAME'][0]
            # Removes every ';' in the name, including the trailing one.
            first_name = first_name.replace(';', '')
            metadata_map[kegg_id] = {'display_name': first_name}
        except TypeError:
            print('kegg_id=%s parsed_data=%s' % (kegg_id, d))
    return metadata_map
Exemplo n.º 12
0
def get_seq(filename):
    '''
    Create a dictionary with species as keys and sequences as values for an
    alignment.

    arg: name of a fasta file inside "alignments_nogaps/"
    return: dict mapping an organism name to a list of sequences, each
        sequence being a list of single characters

    Record ids are expected to look like "<prefix>_<rest>"; the first
    underscore is turned into ":" to build the id queried on KEGG.  Records
    whose lookup returns an integer are printed and skipped.
    '''

    k = KEGG()
    fasta_path = os.path.join('alignments_nogaps/', filename)

    orgdict = {}

    # Go through the sequences and look up the organism name on KEGG,
    # grouping the sequences by organism as they are read.
    for record in SeqIO.parse(fasta_path, "fasta"):

        parts = record.id.split('_', 1)
        entry_id = parts[0] + ':' + parts[1]

        handle = k.get(entry_id)
        if isinstance(handle, int):
            print(entry_id)
            continue

        # The organism name is built from the second and third words of the
        # ORGANISM field.
        words = k.parse(handle)['ORGANISM'].split()
        org = words[1] + " " + words[2]

        orgdict.setdefault(org, []).append(list(str(record.seq)))

    return orgdict
def queryKegg(theIDs):
    """Look up each id in the KEGG ``acb`` database and parse the first hit.

    The first three characters of every id are dropped before searching.
    The search result is cut at the first tab and that leading piece is
    fetched and parsed.

    Returns:
        A pair ``(keggData, IDlist)``: the parsed entries and the trimmed
        ids, in input order.
    """
    print("Currently querying KEGG...")
    kegg = KEGG()
    keggData = []
    IDlist = []

    for raw_id in theIDs:
        trimmed = raw_id[3:]
        first_hit = kegg.find("acb", trimmed).split('\t')[0]
        keggData.append(kegg.parse(kegg.get(first_hit)))
        IDlist.append(trimmed)

    return keggData, IDlist
Exemplo n.º 14
0
def main():
    """Fetch the KEGG entry of every gene in ``hsa_gene_list.json``.

    The keys of the JSON object are used as KEGG gene ids.  Each id is
    printed, fetched and parsed; the parsed entries are written, keyed by
    gene id, to ``ginfo.json``.
    """
    # Start the KEGG interface.
    k = KEGG()

    # Read in the KEGG gene id / gene symbol pairs.
    with open("hsa_gene_list.json", "r") as g:
        gene_data = json.load(g)

    # Parsed entry for every gene id.
    data = {}
    for gene in gene_data.keys():
        # Parenthesised form works on both Python 2 and Python 3.
        print(gene)
        data[gene] = k.parse(k.get(gene))

    with open('ginfo.json', 'w') as fw:
        json.dump(data, fw)
Exemplo n.º 15
0
    def get_reaction_ECs_from_kegg(self):
        """Populate ``self.reaction_ECs`` with ENZYME values from KEGG.

        Every item of ``self.model.reactions`` is split on spaces into KEGG
        reaction ids.  Each id not yet present in ``self.reaction_ECs`` is
        fetched and parsed, and the values of its ``ENZYME`` field are added
        to the set stored under that id.

        Any exception raised while handling one model reaction is printed
        and the loop moves on to the next model reaction.
        """
        self.reaction_ECs = defaultdict(set)

        kegg = KEGG()
        for r in self.model.reactions:
            try:
                for i in r.split(" "):
                    if i in self.reaction_ECs:
                        continue
                    print("KEGG reaction", i)
                    # Use a fresh list per reaction id: previously the list
                    # accumulated across the ids of one model reaction, so a
                    # later id also received the enzymes of earlier ids.
                    ECs = kegg.parse(kegg.get(i))['ENZYME']
                    for e in ECs:
                        self.reaction_ECs[i].add(e)

            except Exception as inst:
                print(inst)

        print("EC data loaded from KEGG")
Exemplo n.º 16
0
def enzymeInfo(code, ignored, stats, verbosity):
    """Collect name, definition and pathway details for one KEGG enzyme.

    Args:
        code: KEGG enzyme code to look up.
        ignored: collection of pathway ids that must not be expanded.
        stats: mapping of counters and lists that is updated in place
            (``NB_PATHWAY``, ``ENZYME_ONLY_IGNORED_PATHWAY``,
            ``LIST_ENZYME_ONLY_IGNORED_PATHWAY``, ``MISSING_PATHWAY_IN_KEGG``,
            ``LIST_MISSING_PATHWAY_IN_KEGG``).
        verbosity: forwarded to ``KEGG(verbose=...)``.

    Returns:
        ``False`` when KEGG answers with an integer instead of an entry.
        Otherwise a list of lists: the first item is
        ``[code, names, definition]`` and each following item is either the
        result of ``pathwayInfo`` for a non-ignored pathway or ``['NA']``
        as a placeholder.
    """
    searcher = KEGG(verbose=verbosity)

    print(f"[+] Get info about enzyme {code}")
    raw_entry = searcher.get(code)

    # An integer reply means the code matched nothing in the database.
    if type(raw_entry) is int:
        return False

    entry = searcher.parse(raw_entry)

    # Names: stringify the value, trim the quote/bracket characters from
    # both ends and swap commas for semicolons.
    if 'NAME' in entry.keys():
        names = str(entry['NAME']).strip("'[]'").replace(',', ';')
    else:
        names = 'NA'

    # Definition: commas would break the column layout of the final output.
    if 'DEFINITION' in entry.keys():
        definition = str(entry['DEFINITION']).replace(',', ';')
    else:
        definition = 'NA'

    # First row describes the enzyme itself; pathway rows follow.
    rows = [[code, names, definition]]

    if 'PATHWAY' not in entry.keys():
        print(f"[!] No pathway detected for enzyme {code}\n")
        stats['MISSING_PATHWAY_IN_KEGG'] += 1
        stats['LIST_MISSING_PATHWAY_IN_KEGG'].append(code)
        # Placeholder pathway row.
        rows.append(['NA'])
        return rows

    pathway_ids = list(entry['PATHWAY'].keys())
    single_pathway = len(pathway_ids) == 1

    for pathway in pathway_ids:
        if pathway not in ignored:
            print(f"  [-] Get info about {pathway} pathway")
            pathway_row = pathwayInfo(pathway)
            stats['NB_PATHWAY'] += 1
            rows.append(pathway_row)
        elif single_pathway:
            # The only pathway of this enzyme is an ignored one.
            print(
                f"  [!] Enzyme {code} have only 1 pathway : {pathway}")
            print(f"  [!] and this pathway is ignored")
            stats['ENZYME_ONLY_IGNORED_PATHWAY'] += 1
            stats['LIST_ENZYME_ONLY_IGNORED_PATHWAY'].append(code)
            # Placeholder pathway row.
            rows.append(['NA'])
        else:
            print(f"  [!] Ignored pathway : {pathway}")

    return rows
Exemplo n.º 17
0
        enzymes = ReadFile(args.include)
    else:
        log("Fetch enzymes from kegg")
        enzymes = p.enzymeIds
        log("%s enzymes fetched" % len(enzymes))

    ecs = {}

    if args.outfile:
        hout = open(args.outfile, 'w')
    else:
        hout = sys.stdout

    houtcsv = csv.writer(hout, delimiter='\t')
    log("Fetch enzymes from kegg")
    a = k.get(' '.join(enzymes))
    log("Fetch enzymes from kegg")
    for ec in enzymes:
        ec = ec.replace("ec:", "")
        if ec in exclude:
            continue
        l = []
        try:
            log("Fecthing %s from kegg" % ec)
            result = k.get(ec)
        except urllib2.HTTPError:
            continue
        parsed = p.parse(result)
        # # Check if the enzyme is obsolete
        if "Obsolete" in parsed["entry"]:
            continue
Exemplo n.º 18
0
# Label the three columns returned by t_fa.transform as 'Factor 1'..'Factor 3',
# indexed by the columns of trans_n.
t_feat = DataFrame(
    t_fa.transform(trans_n.T),
    index=trans_n.columns,
    columns=['Factor %d' % (i + 1) for i in range(3)]
)
print(t_feat['Factor 2'].sort_values())

sns.set(style='ticks', context='paper', rc={'axes.linewidth': .3, 'xtick.major.width': .3, 'ytick.major.width': .3})
g = sns.pairplot(t_hfac, hue='type', palette=pal)
plt.savefig('%s/reports/transcriptomics_pairplot.pdf' % wd, bbox_inches='tight')
plt.close('all')
print('[INFO] Corr plotted!')


# -- Bioservices KEGG information
bioser = KEGG(cache=True)
bioser.organism = 'hsa'

# Get pathways: raw KEGG entry text keyed by pathway id
keggp = {p: bioser.get(p) for p in bioser.pathwayIds}
print('[INFO] Pathways fetched')

# Pathway name: text after the NAME tag, cut at the first ' - '
keggp_name = {
    p: re.findall(r'NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0]
    for p in keggp
}

# Compound ids (C followed by digits) found in the COMPOUND section
keggp_comp = {
    p: {
        c
        for keggc in re.findall(r'(COMPOUND.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n')
        for c in re.findall(r'\s+(C[0-9]+)\s+', keggc)
    }
    for p in keggp if 'COMPOUND' in keggp[p]
}

# Gene labels (text up to the ';') found in the GENE section
keggp_gene = {
    p: {
        g
        for keggg in re.findall(r'(GENE.*?)\n[A-Z]', keggp[p], re.S)[0].split('\n')
        for g in re.findall(r'\s+([A-Z]+.+);', keggg)
    }
    for p in keggp if 'GENE' in keggp[p]
}

# Transcription factors with at least one target among the pathway genes
keggp_tf = {
    p: {tf for tf in tf_targets_dict if len(tf_targets_dict[tf].intersection(keggp_gene[p])) > 0}
    for p in keggp_gene
}
print('[INFO] Pathways genes fetched')

# Pathway x metabolite membership matrix (1 = member, 0 = not)
keggp_comp_m = DataFrame([(p, m, 1) for p in keggp_comp for m in keggp_comp[p]], columns=['pathway', 'metabolite', 'value'])
keggp_comp_m = pivot_table(keggp_comp_m, index='pathway', columns='metabolite', values='value', fill_value=0)
# head() must be called; printing the bare attribute shows the bound method.
print(keggp_comp_m.head())

# Pathway x gene membership matrix. Iterate keggp_gene itself: iterating
# keggp_comp raised KeyError for pathways with compounds but no genes and
# dropped pathways with genes but no compounds.
keggp_gene_m = DataFrame([(p, m, 1) for p in keggp_gene for m in keggp_gene[p]], columns=['pathway', 'gene', 'value'])
keggp_gene_m = pivot_table(keggp_gene_m, index='pathway', columns='gene', values='value', fill_value=0)
print(keggp_gene_m.head())
Exemplo n.º 19
0
from bioservices.kegg import KEGG

kegg = KEGG()

# Fetch and parse the KEGG pathway entry 'ath00900'.
pathway = kegg.get('ath00900')
dict_data = kegg.parse(pathway)

# Write the EC numbers found in the gene descriptions, one per line.
# The context manager makes sure the file is flushed and closed.
with open('eclist.txt', 'w') as output:
    for value in dict_data['GENE'].values():
        if '[EC:' not in value:
            # This gene description carries no EC annotation.
            continue
        # Keep the text between '[EC:' and the closing ']'.
        EC = value.split('[EC:')[1]
        EC = EC.split(']')[0]
        # Several EC numbers are space-separated; put each on its own line.
        EC = EC.replace(' ', '\n')
        output.write(EC + '\n')
Exemplo n.º 20
0
from bioservices.kegg import KEGG
kegg = KEGG()

# Fetch and parse the KEGG entry 'ko01230', then show it.
pathway = kegg.get("ko01230")
dict_data = kegg.parse(pathway)
print(dict_data)

# ortholog id -> id of the module it was listed under
modules_dict = {}

# One tab-separated line per module: id, NAME, DEFINITION and the
# "<ortholog>_<description>" pairs joined by "//".
# The context manager makes sure the file is flushed and closed.
with open("modules.txt", "w") as output:
    for key in dict_data['MODULE'].keys():
        module_data = kegg.parse(kegg.get(key))
        orthologs = []
        for ortholog, description in module_data['ORTHOLOGY'].items():
            orthologs.append("_".join([ortholog, description]))
            modules_dict[ortholog] = key
        output.write('{}\t{}\t{}\t{}\n'.format(key, module_data['NAME'],
                                               module_data['DEFINITION'],
                                               "//".join(orthologs)))
def mapSpecies(mousepeptrackfilename):
    """Add human homolog accessions to a mouse peptide table and cache
    UniProt and KEGG annotations for those human proteins.

    Steps, in order:

    1. Download the JAX ``HOM_MouseHumanSequence.rpt`` report to
       ``MouseToHuman.tsv`` in the current directory and build a mapping
       from the mouse ``SWISS_PROT IDs`` value to the human one, paired by
       ``HomoloGene ID``.
    2. Read the tab-separated file ``mousepeptrackfilename``, append a
       ``Human UniProtKB Accession`` column and overwrite the same file.
    3. For every distinct human accession, download its UniProt XML and
       collect protein name, gene, organism, taxonomy id, DrugBank links,
       GO terms and diseases; merge in the data loaded from the pickle
       named by ``disGenData()``.  The result is pickled to
       ``humanUniprotfuncinfodic.obj``.
    4. Map the accessions to KEGG ids through the UniProt ``uploadlists``
       service in batches of 2000, fetch each KEGG entry and pickle the
       ``PATHWAY`` data to ``humankeggdic.obj``.

    The function returns nothing; its results are the files it writes.

    NOTE(review): the body uses a ``print`` statement, ``urllib2``,
    ``cPickle`` and ``urllib.urlretrieve``, i.e. it is Python 2 code.

    Args:
        mousepeptrackfilename: path of the tab-separated peptide table; it
            is read and then rewritten in place.
    """
    # Seconds to wait before retrying a failed UniProt mapping request.
    RETRY_TIME = 20.0
    mouseTohumanfilepath = os.path.join(os.getcwd(), 'MouseToHuman.tsv')
    print("Extracting Mouse to Human Map data, job starts",
          str(datetime.datetime.now()))
    # Raise the csv field size limit to half of the largest C unsigned long.
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    # Best-effort download: a failure is only reported, so the code below
    # then works with whatever MouseToHuman.tsv already exists on disk.
    try:
        urllib.urlretrieve(
            'http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt',
            mouseTohumanfilepath)
        urllib.urlcleanup()
    except:
        print("Can't able to download MouseToHuman.tsv file!!")

    # Columns read from the homology report, in the order they are stored.
    colnameMousHu = [
        'HomoloGene ID', 'Common Organism Name', 'NCBI Taxon ID', 'Symbol',
        'EntrezGene ID', 'Mouse MGI ID', 'HGNC ID', 'OMIM Gene ID',
        'Genetic Location', 'Genomic Coordinates (mouse: , human: )',
        'Nucleotide RefSeq IDs', 'Protein RefSeq IDs', 'SWISS_PROT IDs'
    ]

    mouseHumandata = []
    homologID = []
    with open(mouseTohumanfilepath) as mhtsvfile:
        mhreader = csv.DictReader(mhtsvfile, delimiter='\t')
        for mhrow in mhreader:
            mhtemplist = []
            for i in colnameMousHu:
                mhtempdata = str(mhrow[i]).strip()
                mhtemplist.append(mhtempdata)
            # Keep only rows whose last column (SWISS_PROT IDs) is non-empty.
            if len(mhtemplist[-1].strip()) > 0:
                homologID.append(mhtemplist[0])
                mouseHumandata.append(mhtemplist)
    homologID = list(set(homologID))
    homologID.sort()

    # Mouse SWISS_PROT value -> human SWISS_PROT value, paired through the
    # shared HomoloGene ID. Rows whose organism name contains "mouse" give
    # the mouse side; every other row is taken as the human side.
    mousehumandic = {}
    for homologidItem in homologID:
        tempHumanHomoUniID = ''
        tempMouseHomoUniID = ''
        for item in mouseHumandata:
            if homologidItem == item[0]:
                if 'mouse' in item[1].strip().lower():
                    tempMouseHomoUniID = item[-1].strip()
                else:
                    tempHumanHomoUniID = item[-1].strip()
        if len(tempMouseHomoUniID.strip()) > 0 and len(
                tempHumanHomoUniID.strip()) > 0 and tempHumanHomoUniID.strip(
                ).upper() != 'NA':
            mousehumandic[tempMouseHomoUniID] = tempHumanHomoUniID

    # Output header: every input column plus the new last column.
    colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
    'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
    'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']

    finalresult = []
    finalresult.append(colname)
    humanUniprotID = []
    with open(mousepeptrackfilename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            templist = []
            for i in colname[:-1]:
                tempdata = str(row[i]).strip()
                templist.append(tempdata)
            # The accession is looked up without its "-<suffix>" part.
            # NOTE(review): a row with an empty accession gets no value for
            # the new column at all, so it ends up one field short.
            if len(str(templist[0]).strip()) > 0:
                if templist[0].split('-')[0] in mousehumandic:
                    humanUniprotID.append(
                        mousehumandic[templist[0].split('-')[0]])
                    templist.append(mousehumandic[templist[0].split('-')[0]])
                else:
                    templist.append('NA')

            finalresult.append(templist)

    # Overwrite the input file with the extended table.
    with open(mousepeptrackfilename, 'wb') as pf:
        pwriter = csv.writer(pf, delimiter='\t')
        pwriter.writerows(finalresult)

    # disGenData() is defined elsewhere; its return value is used as the
    # name of a pickle file. The loaded object is indexed by gene name and
    # each value is indexed with [0] and [1] below.
    # NOTE(review): unpickling is only safe if that file is trusted.
    disGenDataDicName = disGenData()
    #disGenDataDicName='disGen.obj'
    disGenDataDic = cPickle.load(open(disGenDataDicName, 'rb'))
    unqhumanUniprotID = list(set(humanUniprotID))
    humanUniprotfuncinfodic = {}
    countProt = 0
    for subcode in unqhumanUniprotID:
        # Pause two seconds before each UniProt request.
        time.sleep(2)
        # Per-protein accumulators; 'NA' marks a value that was not found.
        drugbanklist = []
        PN = 'NA'
        GN = 'NA'
        OG = 'NA'
        OGID = 'NA'
        dislist = []
        unidislist = []
        unidisURLlist = []
        disgendislist = []
        disgendisURLlist = []
        GoIDList = []
        GoNamList = []
        GoTermList = []
        GOinfo = []
        try:
            countProt += 1
            if countProt % 1000 == 0:
                print str(
                    countProt
                ), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts", str(
                    datetime.datetime.now())

            # Download the UniProt XML record of this accession.
            SGrequestURL = "https://www.uniprot.org/uniprot/" + str(
                subcode) + ".xml"
            SGunifile = urllib.urlopen(SGrequestURL)
            SGunidata = SGunifile.read()
            SGunifile.close()

            try:
                SGunidata = minidom.parseString(SGunidata)
                # DrugBank cross-references become HTML links; the NCBI
                # Taxonomy cross-reference gives the organism id.
                try:
                    drugdata = (SGunidata.getElementsByTagName('dbReference'))
                    for duItem in drugdata:
                        if (duItem.attributes['type'].value
                            ).upper() == 'DRUGBANK':
                            try:
                                drugname = (str(
                                    duItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip())
                                drugid = str(
                                    duItem.attributes['id'].value).strip()
                                durl = '<a target="_blank" href="https://www.drugbank.ca/drugs/' + drugid + '">' + drugname + '</a>'
                                drugbanklist.append(durl)
                            except:
                                pass
                        if (duItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    duItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                # GO cross-references: the first property value has the form
                # "<letter>:<name>"; the letter selects the GO aspect.
                try:
                    godata = (SGunidata.getElementsByTagName('dbReference'))
                    for gItem in godata:
                        if (gItem.attributes['type'].value).upper() == 'GO':
                            try:
                                gonamedetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[1]
                                gotermdetails = (str(
                                    gItem.getElementsByTagName('property')
                                    [0].attributes['value'].value).strip()
                                                 ).split(':')[0]
                                GoNamList.append(gonamedetails)
                                goid = str(
                                    gItem.attributes['id'].value).strip()
                                GoIDList.append(goid)
                                tempGoTerm = None

                                if gotermdetails.lower() == 'p':
                                    tempGoTerm = 'Biological Process'
                                if gotermdetails.lower() == 'f':
                                    tempGoTerm = 'Molecular Function'
                                if gotermdetails.lower() == 'c':
                                    tempGoTerm = 'Cellular Component'
                                GoTermList.append(tempGoTerm)
                                # NOTE(review): if the letter is none of
                                # p/f/c, tempGoTerm is None and this
                                # concatenation raises; the bare except
                                # below swallows it after the three lists
                                # above were already extended.
                                tempGOData = gonamedetails + ';' + goid + ';' + tempGoTerm
                                GOinfo.append(tempGOData)
                            except:
                                pass

                        if (gItem.attributes['type'].value
                            ).strip() == 'NCBI Taxonomy':
                            try:
                                OGID = str(
                                    gItem.attributes['id'].value).strip()
                            except:
                                pass
                except IndexError:
                    pass

                # Protein name: recommended name first, submitted name as
                # the fallback.
                try:
                    try:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('recommendedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                    except:
                        PN = (((SGunidata.getElementsByTagName('protein')[0]
                                ).getElementsByTagName('submittedName')[0]
                               ).getElementsByTagName('fullName')[0]
                              ).firstChild.nodeValue

                except IndexError:
                    pass

                # Gene name: first <name> of the first <gene> element.
                try:
                    try:
                        GN = ((
                            SGunidata.getElementsByTagName('gene')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        GN = 'NA'
                except IndexError:
                    pass

                # Organism name: first <name> of the first <organism> element.
                try:
                    try:
                        OG = ((
                            SGunidata.getElementsByTagName('organism')[0]
                        ).getElementsByTagName('name')[0]).firstChild.nodeValue
                    except:
                        OG = 'NA'
                except IndexError:
                    pass

                # Diseases: "<name>(<acronym>)" labels plus HTML links to
                # the UniProt disease pages.
                try:
                    disdata = SGunidata.getElementsByTagName('disease')
                    for dItem in disdata:
                        disname = ''
                        disshort = ''
                        disURL = ''
                        disID = ''
                        try:
                            disname = (dItem.getElementsByTagName('name')[0]
                                       ).firstChild.nodeValue
                            disID = (dItem.attributes['id'].value).upper()
                        except:
                            pass
                        try:
                            disshort = (dItem.getElementsByTagName('acronym')
                                        [0]).firstChild.nodeValue
                        except:
                            pass
                        if len(disname.strip()) > 0:
                            disURL = '<a target="_blank" href="https://www.uniprot.org/diseases/' + disID + '">' + str(
                                disname.strip()) + '(' + str(
                                    disshort) + ')' + '</a>'
                            dislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidislist.append(
                                str(disname.strip()) + '(' + str(disshort) +
                                ')')
                            unidisURLlist.append(disURL)
                except IndexError:
                    pass

            # Unparseable XML: keep the 'NA' defaults for this protein.
            except ExpatError:
                pass
        # Download failure: keep the 'NA' defaults for this protein.
        except IOError:
            pass
        # Output fields default to 'NA' and are replaced below when the
        # matching list is non-empty.
        drugbankdata = 'NA'
        disdata = 'NA'
        uniDisData = 'NA'
        uniDisURLData = 'NA'
        disgenDisData = 'NA'
        disgenDisURLData = 'NA'
        goiddata = 'NA'
        gonamedata = 'NA'
        gotermdata = 'NA'
        goData = 'NA'
        # Merge the gene-indexed data loaded from the pickle.
        if GN != 'NA' and GN in disGenDataDic:
            disgendislist = disGenDataDic[GN][0]
            disgendisURLlist = disGenDataDic[GN][1]
            if len(dislist) > 0:
                dislist = dislist + disGenDataDic[GN][0]
            else:
                dislist = disGenDataDic[GN][0]

        # De-duplicate each list and join it with '|'; going through set()
        # means the order of the joined values is not preserved.
        if len(GoIDList) > 0:
            goiddata = '|'.join(list(set(GoIDList)))
        if len(GoNamList) > 0:
            gonamedata = '|'.join(list(set(GoNamList)))
        if len(GoTermList) > 0:
            gotermdata = '|'.join(list(set(GoTermList)))
        if len(GOinfo) > 0:
            goData = '|'.join(list(set(GOinfo)))
        if len(drugbanklist) > 0:
            drugbankdata = '|'.join(list(set(drugbanklist)))
        if len(dislist) > 0:
            disdata = '|'.join(list(set(dislist)))
        if len(unidislist) > 0:
            uniDisData = '|'.join(list(set(unidislist)))
        if len(unidisURLlist) > 0:
            uniDisURLData = '|'.join(list(set(unidisURLlist)))
        if len(disgendislist) > 0:
            disgenDisData = '|'.join(list(set(disgendislist)))
        if len(disgendisURLlist) > 0:
            disgenDisURLData = '|'.join(list(set(disgendisURLlist)))
        humanUniprotfuncinfodic[subcode] = [
            PN, GN, OG, OGID, disdata, uniDisData, uniDisURLData,
            disgenDisData, disgenDisURLData, drugbankdata, goiddata,
            gonamedata, gotermdata, goData
        ]
    # Persist the per-protein annotation lists.
    hudicfile = 'humanUniprotfuncinfodic.obj'
    hudicf = open(hudicfile, 'wb')
    pickle.dump(humanUniprotfuncinfodic, hudicf, pickle.HIGHEST_PROTOCOL)
    hudicf.close()

    print("Extracting KEGG pathway name, job starts",
          str(datetime.datetime.now()))
    hkeggdictfile = {}
    huniproturl = 'https://www.uniprot.org/uploadlists/'
    hk = KEGG()
    # Process the accessions in batches of 2000.
    for hkx in range(0, len(unqhumanUniprotID), 2000):
        # NOTE(review): countProt still holds the count from the loop above
        # and grows by hkx + 2000 per batch, so the progress message below
        # is printed only when that sum happens to be a multiple of 2000.
        countProt += hkx + 2000
        if countProt % 2000 == 0:
            print(str(countProt), "th protein kegg job starts",
                  str(datetime.datetime.now()))

        huniprotcodes = ' '.join(unqhumanUniprotID[hkx:hkx + 2000])
        # Request a tab-separated accession -> KEGG id mapping.
        huniprotparams = {
            'from': 'ACC',
            'to': 'KEGG_ID',
            'format': 'tab',
            'query': huniprotcodes
        }

        # Retry the whole batch until the mapping request succeeds.
        while True:
            try:
                hkuniprotdata = urllib.urlencode(huniprotparams)
                hkuniprotrequest = urllib2.Request(huniproturl, hkuniprotdata)
                hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
                for hkuniprotline in hkuniprotresponse:
                    hkudata = hkuniprotline.strip()
                    # Skip the header line, which starts with "From".
                    if not hkudata.startswith("From"):
                        hkuinfo = hkudata.split("\t")
                        if len(hkuinfo[1].strip()):
                            hkegg = hk.get(hkuinfo[1].strip())
                            hkudict_data = hk.parse(hkegg)
                            # Entries without a PATHWAY field (KeyError) or
                            # with an unsubscriptable parse result
                            # (TypeError) are skipped silently.
                            try:
                                try:
                                    # Only accessions longer than five
                                    # characters are stored.
                                    if len(str(hkuinfo[0]).strip()) > 5:
                                        tempkeggData = '|'.join(
                                            '{};{}'.format(key, value)
                                            for key, value in
                                            hkudict_data['PATHWAY'].items())
                                        hkeggdictfile[hkuinfo[0].strip()] = [
                                            hkudict_data['PATHWAY'].values(),
                                            tempkeggData
                                        ]
                                except TypeError:
                                    pass
                            except KeyError:
                                pass
                break
            except urllib2.HTTPError:
                time.sleep(RETRY_TIME)
                print(
                    'Hey, I am trying again until succeeds to get data from KEGG!',
                    str(datetime.datetime.now()))
                pass

    # Persist the accession -> [pathway names, "id;name|..." string] mapping.
    hkdicfile = 'humankeggdic.obj'
    hkdicf = open(hkdicfile, 'wb')
    pickle.dump(hkeggdictfile, hkdicf, pickle.HIGHEST_PROTOCOL)
    hkdicf.close()
Exemplo n.º 22
0
from bioservices.kegg import KEGG

# Fetch one KEGG entry, parse it, display it, and dump its field names.
k = KEGG()
path = k.get("K00855")
kdict = k.parse(path)
print(kdict)
# Show the interactive help for the parsed object.
help(kdict)
with open("play.out", mode="wt") as result:
    # One field name per line, no trailing newline.
    field_names = kdict.keys()
    result.write("\n".join(field_names))
Exemplo n.º 23
0
def get_kegg_info(stId):
    """Look up ``stId`` in KEGG and return the parsed entry.

    A fresh ``KEGG`` client is created on every call; whatever ``get``
    returns for ``stId`` is passed straight to the client's ``parse``.
    """
    client = KEGG()
    return client.parse(client.get(stId))
Exemplo n.º 24
0
import re
from bioservices.kegg import KEGG

# -- KEGG bioservice
# One shared client, created with cache=True and restricted to organism 'hsa'.
bioser = KEGG(cache=True)
bioser.organism = 'hsa'

# The three tables below are filled at import time: one KEGG `get` per id.
# print(...) with a single string argument produces the same output on
# Python 2 and Python 3, unlike the print statement.

# Get pathways
keggp = {p: bioser.get(p) for p in bioser.pathwayIds}
print('[INFO] Pathways fetched')

# Get reactions
keggr = {r: bioser.get(r) for r in bioser.reactionIds}
print('[INFO] Reactions fetched')

# Get enzymes
kegge = {e: bioser.get(e) for e in bioser.enzymeIds}
print('[INFO] Enzymes fetched')

# keggc = {c: bioser.get(c) for c in bioser.compoundIds}
# print '[INFO] Compounds fetched'
#
# # Get modules
# keggm = {m: bioser.get(m) for m in bioser.moduleIds}
# print '[INFO] Modules fetched'


# -- KEGG methods
def get_pathway_names(pathways=None):
    """Return a ``{pathway id: pathway name}`` mapping.

    The name is the text following ``NAME`` on the first matching line of
    the entry stored in the module-level ``keggp`` table, cut at the first
    ``' - '`` separator.

    Args:
        pathways: Iterable of pathway ids to look up. When falsy (``None``
            or empty) every id present in ``keggp`` is used.

    Returns:
        dict mapping each requested pathway id to its name.

    Raises:
        KeyError: if a requested id is not in ``keggp``.
        IndexError: if an entry contains no ``NAME`` line.
    """
    pathways_ = pathways if pathways else set(keggp)
    # Raw string: '\s' is not a valid escape in a normal string literal.
    return {p: re.findall(r'NAME\s+(.*)?\n', keggp[p])[0].split(' - ')[0] for p in pathways_}
Exemplo n.º 25
0
def get_pathways_from_KEGG(model, update_existing=False):
    """
    Annotate the reactions of ``model`` with KEGG pathway and subsystem information.

    The KEGG annotation of each reaction (``annotation["kegg.reaction"]``) is
    looked up in KEGG and the ``PATHWAY`` values of the entry are stored in
    ``annotation["kegg.pathway"]``; the matching subsystems are stored in
    ``annotation["kegg.subsystem"]``.

    The pathways we use are the ones given here: https://www.genome.jp/kegg/pathway.html,
    under heading 1.: Metabolism. However we don't use the *1.0 Global and overview maps* or
    *1.12 Chemical structure and transformation maps*, because they don't
    represent metabolic subsystems. What we here refer to as *subsystems* are the subheadings under Metabolism, i.e.:
    - Carbohydrate metabolism
    - Energy metabolism
    - Lipid metabolism
    - Nucleotide metabolism
    - Amino acid metabolism
    - Metabolism of other amino acids
    - Glycan biosynthesis and metabolism
    - Metabolism of cofactors and vitamins
    - Metabolism of terpenoids and polyketides
    - Biosynthesis of other secondary metabolites
    - Xenobiotics biodegradation and metabolism

    Args:
        model: Object exposing ``reactions``; each reaction needs an ``id``
            and a dict-like ``annotation``.
        update_existing: When False (default), reactions that already carry a
            ``kegg.pathway`` annotation are left untouched.

    Returns:
        The same ``model`` object, modified in place.
    """
    from bioservices.kegg import KEGG
    kegg = KEGG()
    kegg_dict, kegg_overview_maps = _get_KEGG_pathways()
    inverse_pathway_dict = _get_inverse_pathway_dict(kegg_dict)

    for reaction in model.reactions:

        # Skip reactions which already have a kegg.pathway annotation
        # if update_existing = False
        if not update_existing and "kegg.pathway" in reaction.annotation:
            continue

        # Reactions without a KEGG reaction id cannot be looked up.
        try:
            kegg_id = reaction.annotation["kegg.reaction"]
        except KeyError:
            continue

        kegg_info = kegg.get(kegg_id, parse=True)

        # The lookup may yield something that is not a parsed entry, or an
        # entry without a PATHWAY section; both cases are skipped. Only the
        # expected failures are caught so that KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            full_kegg_pathways = kegg_info["PATHWAY"].values()
        except (KeyError, TypeError, AttributeError):
            continue

        # Drop the overview maps, they do not represent a subsystem.
        kegg_pathways = [
            x for x in full_kegg_pathways if x not in kegg_overview_maps
        ]

        try:
            subsystem = list({inverse_pathway_dict[x] for x in kegg_pathways})
        except (KeyError, TypeError):
            # At least one pathway has no known subsystem.
            print("Error!: ", reaction.id, kegg_pathways)
            continue

        print("KEGG Subsystem ", reaction.id, subsystem)

        reaction.annotation["kegg.pathway"] = kegg_pathways
        reaction.annotation["kegg.subsystem"] = subsystem
    return model
def mapSpecies(mousepeptrackfilename):
	"""Add human UniProt accessions to a mouse peptide table and cache human annotation.

	Steps, in execution order:

	1. Read the tab-delimited file ``mousepeptrackfilename`` and collect the
	   unique non-NaN values of its 'Gene' column.
	2. POST those gene names to ``uniproturl`` in batches of 1000
	   (GENENAME -> ACC) and remember, per gene name, the accession of the
	   matching reviewed entry with organism id 9606.
	3. Rewrite ``mousepeptrackfilename`` in place with an additional
	   'Human UniProtKB Accession' column.
	4. For every mapped accession download ``<accession>.xml`` from UniProt,
	   extract protein/gene/organism names, taxonomy id, diseases, DrugBank
	   links and GO data, and pickle the result to
	   'humanUniprotfuncinfodic.obj'.
	5. Map the accessions to KEGG ids in batches of 2000, fetch every KEGG
	   entry, and pickle the PATHWAY values per accession to
	   'humankeggdic.obj'.

	HTTP errors in the batch requests are retried indefinitely after sleeping
	``RETRY_TIME`` seconds. The function returns None.

	NOTE(review): written against the Python 2 ``urllib``/``urllib2`` API and
	contains a Python 2 ``print`` statement.
	"""
	#increase the field size of CSV
	csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
	uniproturl = 'https://www.uniprot.org/uploadlists/'

	# Seconds to sleep before retrying a failed request.
	RETRY_TIME = 20.0
	mdf= pd.read_csv(mousepeptrackfilename, delimiter='\t')
	mouseGenes=list(mdf['Gene'].unique())
	# Drop missing values (they stringify to 'nan').
	mouseGenes=[g for g in mouseGenes if str(g) !='nan']

	# gene name -> human UniProt accession
	mousehumandic={}
	# Query UniProt with at most 1000 gene names per request.
	for gx in range(0,len(mouseGenes),1000):
		genecodes=' '.join(mouseGenes[gx:gx+1000])
		geneuniprotparams = {
		'from':'GENENAME',
		'to':'ACC',
		'format':'tab',
		'query':genecodes,
		'columns':'id,genes(PREFERRED),organism-id,reviewed'

		}
		# Retry loop: left only through the `break` after a complete response.
		while True:
			try:
				geneuniprotdata = urllib.urlencode(geneuniprotparams)
				geneuniprotrequest = urllib2.Request(uniproturl, geneuniprotdata)
				geneuniprotresponse = urllib2.urlopen(geneuniprotrequest)
				for guniprotline in geneuniprotresponse:
					gudata=guniprotline.strip()
					# Skip the header line.
					if not gudata.startswith("Entry"):
						guinfo=gudata.split("\t")
						# Keep rows with organism id 9606, status 'reviewed', and whose last
						# column equals column 1 case-insensitively.
						# presumably the last column is the queried gene name and column 1 the
						# preferred gene name — confirm against the service's output format.
						if '9606' == guinfo[2].lower() and 'reviewed' == guinfo[3].lower() and guinfo[-1].lower() ==guinfo[1].lower() and len(guinfo[0].strip())>1:
							mousehumandic[guinfo[-1].strip()]=guinfo[0].strip()
				break
			except urllib2.HTTPError:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from uniprot data!',str(datetime.datetime.now()))
			except httplib.BadStatusLine:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from  uniprot data!',str(datetime.datetime.now()))


	# Output header: every column read from the input plus, last, the new human accession column.
	colname=['UniProtKB Accession','Protein','Gene','Organism','Peptide Sequence','Summary Concentration Range Data','All Concentration Range Data','All Concentration Range Data-Sample LLOQ Based','Peptide ID',\
	'Special Residues','Molecular Weight','GRAVY Score','Transitions','Retention Time','Analytical inofrmation',\
	'Gradients','AAA Concentration','CZE Purity','Panel','Knockout','LLOQ','ULOQ','Sample LLOQ','Protocol','Trypsin','QC. Conc. Data','Human UniProtKB Accession']

	finalresult=[]
	finalresult.append(colname)
	# Every human accession that was matched to a row (duplicates are removed later).
	humanUniprotID=[]
	with open(mousepeptrackfilename) as csvfile:
		reader = csv.DictReader(csvfile, delimiter='\t')
		for row in reader:
			templist=[]
			# Copy all input columns (everything except the new last column).
			for i in colname[:-1]:
				tempdata=str(row[i]).strip()
				templist.append(tempdata)
			# templist[2] is the 'Gene' column.
			# NOTE(review): when the gene is empty nothing is appended, so that row ends up
			# one column shorter than the header — confirm this is intended.
			if len(str(templist[2]).strip())>0:
				if templist[2] in mousehumandic:
					huUniId=mousehumandic[templist[2]]
					humanUniprotID.append(huUniId)
					templist.append(huUniId)
				else:
					templist.append('NA')

			finalresult.append(templist)

	# Overwrite the input file with the augmented table.
	with open(mousepeptrackfilename,'wb') as pf:
		pwriter =csv.writer(pf,delimiter='\t')
		pwriter.writerows(finalresult)

	unqhumanUniprotID=list(set(humanUniprotID))
	# accession -> [PN, GN, OG, OGID, diseases, drugbank, GO ids, GO names, GO terms]
	humanUniprotfuncinfodic={}
	countProt=0
	for subcode in unqhumanUniprotID:
		# Pause two seconds before each per-accession download.
		time.sleep(2)
		# Defaults that are stored when a field cannot be extracted.
		drugbanklist=[]
		PN='NA'
		GN='NA'
		OG='NA'
		OGID='NA'
		dislist=[]
		GoIDList=[]
		GoNamList=[]
		GoTermList=[]
		try:
			countProt+=1
			# Progress message every 1000 accessions.
			# NOTE(review): Python 2 print statement.
			if countProt%1000 ==0:
				print str(countProt), "th protein Protein Name, Gene, Organism Name,drug bank data,disease data job starts",str(datetime.datetime.now())

			SGrequestURL="https://www.uniprot.org/uniprot/"+str(subcode)+".xml"
			SGunifile=urllib.urlopen(SGrequestURL)
			SGunidata= SGunifile.read()
			SGunifile.close()

			try:
				SGunidata=minidom.parseString(SGunidata)
				# Pass 1 over <dbReference>: DrugBank links and the NCBI taxonomy id.
				try:
					drugdata=(SGunidata.getElementsByTagName('dbReference'))
					for duItem in drugdata:
						if (duItem.attributes['type'].value).upper() == 'DRUGBANK':
							try:
								drugname=(str(duItem.getElementsByTagName('property')[0].attributes['value'].value).strip())
								drugid=str(duItem.attributes['id'].value).strip()
								# Stored as a ready-made HTML anchor.
								durl='<a target="_blank" href="https://www.drugbank.ca/drugs/'+drugid+'">'+drugname+'</a>'
								drugbanklist.append(durl)
							except:
								pass
						if (duItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
							try:
								OGID=str(duItem.attributes['id'].value).strip()
							except:
								pass
				except IndexError:
					pass

				# Pass 2 over <dbReference>: GO annotations (the taxonomy id is set again here).
				try:
					godata=(SGunidata.getElementsByTagName('dbReference'))
					for gItem in godata:
						if (gItem.attributes['type'].value).upper() == 'GO':
							try:
								# The first property value is split on ':' into an aspect letter and a name.
								gonamedetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[1]
								gotermdetails=(str(gItem.getElementsByTagName('property')[0].attributes['value'].value).strip()).split(':')[0]
								GoNamList.append(gonamedetails)
								goid=str(gItem.attributes['id'].value).strip()
								GoIDList.append(goid)
								# Translate the aspect letter into its full label.
								if gotermdetails.lower()=='p':
									GoTermList.append('Biological Process')
								if gotermdetails.lower()=='f':
									GoTermList.append('Molecular Function')
								if gotermdetails.lower()=='c':
									GoTermList.append('Cellular Component')
							except:
								pass

						if (gItem.attributes['type'].value).strip() == 'NCBI Taxonomy':
							try:
								OGID=str(gItem.attributes['id'].value).strip()
							except:
								pass
				except IndexError:
					pass

				# Protein name: recommendedName first, submittedName as fallback.
				try:
					try:
						PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('recommendedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue

					except:
						PN=(((SGunidata.getElementsByTagName('protein')[0]).getElementsByTagName('submittedName')[0]).getElementsByTagName('fullName')[0]).firstChild.nodeValue

				except IndexError:
					pass

				# Gene name: first <name> of the first <gene>.
				try:
					try:
						GN=((SGunidata.getElementsByTagName('gene')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
					except:
						GN='NA'
				except IndexError:
					pass

				# Organism name: first <name> of the first <organism>.
				try:
					try:
						OG=((SGunidata.getElementsByTagName('organism')[0]).getElementsByTagName('name')[0]).firstChild.nodeValue
					except:
						OG='NA'
				except IndexError:
					pass

				# Diseases, formatted as "name(acronym)"; entries without a name are skipped.
				try:
					disdata=SGunidata.getElementsByTagName('disease')
					for dItem in disdata:
						disname=''
						disshort=''
						try:
							disname=(dItem.getElementsByTagName('name')[0]).firstChild.nodeValue
						except:
							pass
						try:
							disshort=(dItem.getElementsByTagName('acronym')[0]).firstChild.nodeValue
						except:
							pass
						if len(disname.strip())>0:
							dislist.append(str(disname.strip())+'('+str(disshort)+')')
				except IndexError:
					pass

			except ExpatError:
				# Unparseable XML: keep the defaults for this accession.
				pass
		except IOError:
			# Download failed: keep the defaults for this accession.
			pass
		# Collapse each list into a '|'-joined string of unique values ('NA' when empty).
		drugbankdata='NA'
		disdata='NA'
		goiddata='NA'
		gonamedata='NA'
		gotermdata='NA'
		if len(GoIDList)>0:
			goiddata='|'.join(list(set(GoIDList)))
		if len(GoNamList)>0:
			gonamedata='|'.join(list(set(GoNamList)))
		if len(GoTermList)>0:
			gotermdata='|'.join(list(set(GoTermList)))
		if len(drugbanklist)>0:
			drugbankdata='|'.join(list(set(drugbanklist)))
		if len(dislist)>0:
			disdata='|'.join(list(set(dislist)))

		# Stored even when the download or parsing failed (then all fields are 'NA').
		humanUniprotfuncinfodic[subcode]=[PN,GN,OG,OGID,disdata,drugbankdata,goiddata,gonamedata,gotermdata]

	# Persist the per-accession annotation.
	hudicfile='humanUniprotfuncinfodic.obj'
	hudicf = open(hudicfile, 'wb')
	pickle.dump(humanUniprotfuncinfodic, hudicf , pickle.HIGHEST_PROTOCOL)
	hudicf.close()	

	print ("Extracting KEGG pathway name, job starts",str(datetime.datetime.now()))
	# accession -> PATHWAY values of its KEGG entry
	hkeggdictfile={}
	hk = KEGG()
	# Map accessions to KEGG ids with at most 2000 accessions per request.
	for hkx in range(0,len(unqhumanUniprotID),2000):
		# NOTE(review): countProt still holds the count from the previous loop and is
		# increased by hkx+2000 here, so the progress message below may not fire at the
		# intended points — confirm.
		countProt+=hkx+2000
		if countProt%2000 ==0:
			print (str(countProt), "th protein kegg job starts",str(datetime.datetime.now()))

		huniprotcodes=' '.join(unqhumanUniprotID[hkx:hkx+2000])
		huniprotparams = {
		'from':'ACC',
		'to':'KEGG_ID',
		'format':'tab',
		'query':huniprotcodes
		}
		
		# Retry loop: left only through the `break` after a complete response.
		while True:
			try:
				hkuniprotdata = urllib.urlencode(huniprotparams)
				hkuniprotrequest = urllib2.Request(uniproturl, hkuniprotdata)
				hkuniprotresponse = urllib2.urlopen(hkuniprotrequest)
				for hkuniprotline in hkuniprotresponse:
					hkudata=hkuniprotline.strip()
					# Skip the header line.
					if not hkudata.startswith("From"):
						hkuinfo=hkudata.split("\t")
						# Column 0 is the accession, column 1 the KEGG id; rows without a KEGG id are ignored.
						if len(hkuinfo[1].strip()):
							hkegg=hk.get(hkuinfo[1].strip())
							hkudict_data = hk.parse(hkegg)
							# Entries without a PATHWAY section (KeyError) or results that cannot be
							# indexed by key (TypeError) are skipped.
							try:
								try:
									if len(str(hkuinfo[0]).strip()) >5:
										hkeggdictfile[hkuinfo[0].strip()]=hkudict_data['PATHWAY'].values()
								except TypeError: 
									pass
							except KeyError:
								pass
				break
			except urllib2.HTTPError:
				time.sleep(RETRY_TIME)
				print ('Hey, I am trying again until succeeds to get data from KEGG!',str(datetime.datetime.now()))
				pass

	# Persist the accession -> pathways table.
	hkdicfile='humankeggdic.obj'
	hkdicf = open(hkdicfile, 'wb')
	pickle.dump(hkeggdictfile, hkdicf , pickle.HIGHEST_PROTOCOL)
	hkdicf.close()
Exemplo n.º 27
0
                EC2KO_dic[item].append(
                    ko
                )  #here append may be used, bc only one item is added at a time
            else:
                EC2KO_dic[item] = [ko]
    else:
        KO_dic[ko] = [definition]

#here a list of KOs is made to be searched for - if pathway is defined
kostoget = {}

if args.pathway != 'none':
    output = open('eclist.txt', 'w')

    kegg = KEGG()
    pathway = kegg.get(args.pathway)
    dict_data = kegg.parse(pathway)
    if dict_data == 404:
        print("WARNING: BAD PATHWAY SUBMITTED TO KEGG!")
    elif dict_data == None:
        print("WARNING: ERROR CONNECTING TO KEGG SERVER!")
    #print(dict_data)
    print("ECQUERY: PROCESSING PATHWAY/MODULE")
    try:
        for key in dict_data['ORTHOLOGY'].keys():
            print("adding ortholog {} to eclist".format(key))
            value = dict_data['ORTHOLOGY'][key]
            print(value)
            if "," in key:
                for item in key.split(","):
                    kostoget[item] = value
Exemplo n.º 28
0
def search(query, source="wikipathways", result_format="xml", species=None, genes=None, user=None):
    """Search WikiPathways and/or KEGG for pathways matching a text query.

    Args:
        query: Free-text search term.
        source: "wikipathways", "kegg" or "all" (case-insensitive). Any other
            value selects no backend and yields an empty list.
        result_format: Unused; kept for backward compatibility.
        species: Optional species, sent to WikiPathways as the ``species``
            query parameter. Ignored by the KEGG lookup.
        genes: When not None the KEGG lookup is skipped (a gene-based KEGG
            search is not implemented).
        user: Unused; kept for backward compatibility.

    Returns:
        list of ``gnomics.objects.pathway.Pathway`` objects; WikiPathways
        hits come first and contain each identifier at most once.

    Raises:
        requests.HTTPError: if WikiPathways answers with an error status.
    """
    backend = source.lower()
    path_array = []

    if backend in ["wikipathways", "all"]:
        url = "http://webservice.wikipathways.org/"
        ext = "/findPathwaysByText"
        # Let requests build the query string so that characters such as
        # '&', '#' or spaces in the query are percent-encoded.
        params = {"query": str(query)}
        if species is not None:
            params["species"] = str(species)
        r = requests.get(url + ext, params=params, headers={"Content-Type": "application/json"})
        # Raises on any 4xx/5xx answer and is a no-op otherwise.
        r.raise_for_status()

        ns = "{http://www.wikipathways.org/webservice}"
        wanted_tags = {ns + "id": "identifier", ns + "name": "name", ns + "species": "species"}

        # Identifiers already emitted; membership must be tested on the
        # identifier strings, not on the Pathway objects in path_array.
        seen_ids = set()
        for child in ET.fromstring(r.text):
            fields = {}
            for subchild in child:
                field = wanted_tags.get(subchild.tag)
                if field is not None:
                    fields[field] = subchild.text

            identifier = fields.get("identifier")
            if identifier is None or identifier in seen_ids:
                continue
            seen_ids.add(identifier)
            path_array.append(gnomics.objects.pathway.Pathway(identifier=identifier, identifier_type="WikiPathways ID", name=fields.get("name"), taxon=fields.get("species"), source="WikiPathways"))

    if backend in ["kegg", "all"] and genes is None:
        k = KEGG()
        # Checked in this order; the first marker contained in the id wins.
        kegg_id_types = [("map", "KEGG MAP PATHWAY ID"), ("ko", "KEGG KO PATHWAY ID"), ("ec", "KEGG EC PATHWAY ID"), ("rn", "KEGG RN PATHWAY ID")]

        for thing in k.find("pathway", query).split("\n"):
            temp_split = thing.split("\t")
            if len(temp_split) == 1:
                # Blank line or line without a tab-separated name.
                continue

            # Ids look like "<db>:<id>"; tolerate a missing "<db>:" prefix.
            id_parts = temp_split[0].strip().split(":")
            path_id = id_parts[1] if len(id_parts) > 1 else id_parts[0]
            path_name = temp_split[1].strip()

            for marker, id_type in kegg_id_types:
                if marker in path_id:
                    path_array.append(gnomics.objects.pathway.Pathway(identifier=path_id, identifier_type=id_type, source="KEGG", name=path_name))
                    break
            else:
                # Unknown id flavour: print the raw entry instead.
                print(k.get(path_id))

    return path_array
Exemplo n.º 29
0
#from Bio import Entrez
from bioservices.kegg import KEGG
import sys

# Scratch script: creates a KEGG client and opens an id list. The loop that
# would process the ids is commented out, so `result` is never filled.
k = KEGG()
#Entrez.email = "*****@*****.**"

#file = open(sys.argv[1], "r")
# NOTE(review): the name `file` shadows the Python 2 builtin of the same name.
file = open("../data/ids5.txt", "r")
result = ""
k.organism = "lpl"
# NOTE(review): get() is called without an entry id and its result is
# discarded — presumably this raises a TypeError; confirm against bioservices.
k.get()
#for line in file.readlines():
#
file.close()
# Prints the (still empty) result string.
print(result)

# for line in file.readlines():
#     handle = Entrez.esearch(db="pubmed", term=line)
#     record = Entrez.read(handle)
#     ids = record["IdList"]
#     print(ids)
Exemplo n.º 30
0
def search(query,
           source="wikipathways",
           result_format="xml",
           species=None,
           genes=None,
           user=None):
    """Search WikiPathways and/or KEGG for pathways matching a text query.

    Args:
        query: Free-text search term.
        source: "wikipathways", "kegg" or "all" (case-insensitive). Any other
            value selects no backend and yields an empty list.
        result_format: Unused; kept for backward compatibility.
        species: Optional species, sent to WikiPathways as the ``species``
            query parameter. Ignored by the KEGG lookup.
        genes: When not None the KEGG lookup is skipped (a gene-based KEGG
            search is not implemented).
        user: Unused; kept for backward compatibility.

    Returns:
        list of ``gnomics.objects.pathway.Pathway`` objects; WikiPathways
        hits come first and contain each identifier at most once.

    Raises:
        requests.HTTPError: if WikiPathways answers with an error status.
    """
    backend = source.lower()
    path_array = []

    if backend in ["wikipathways", "all"]:
        url = "http://webservice.wikipathways.org/"
        ext = "/findPathwaysByText"
        # Let requests build the query string so that characters such as
        # '&', '#' or spaces in the query are percent-encoded.
        params = {"query": str(query)}
        if species is not None:
            params["species"] = str(species)
        r = requests.get(url + ext,
                         params=params,
                         headers={"Content-Type": "application/json"})
        # Raises on any 4xx/5xx answer and is a no-op otherwise.
        r.raise_for_status()

        ns = "{http://www.wikipathways.org/webservice}"
        wanted_tags = {
            ns + "id": "identifier",
            ns + "name": "name",
            ns + "species": "species",
        }

        # Identifiers already emitted; membership must be tested on the
        # identifier strings, not on the Pathway objects in path_array.
        seen_ids = set()
        for child in ET.fromstring(r.text):
            fields = {}
            for subchild in child:
                field = wanted_tags.get(subchild.tag)
                if field is not None:
                    fields[field] = subchild.text

            identifier = fields.get("identifier")
            if identifier is None or identifier in seen_ids:
                continue
            seen_ids.add(identifier)
            path_array.append(
                gnomics.objects.pathway.Pathway(
                    identifier=identifier,
                    identifier_type="WikiPathways ID",
                    name=fields.get("name"),
                    taxon=fields.get("species"),
                    source="WikiPathways"))

    if backend in ["kegg", "all"] and genes is None:
        k = KEGG()
        # Checked in this order; the first marker contained in the id wins.
        kegg_id_types = [
            ("map", "KEGG MAP PATHWAY ID"),
            ("ko", "KEGG KO PATHWAY ID"),
            ("ec", "KEGG EC PATHWAY ID"),
            ("rn", "KEGG RN PATHWAY ID"),
        ]

        for thing in k.find("pathway", query).split("\n"):
            temp_split = thing.split("\t")
            if len(temp_split) == 1:
                # Blank line or line without a tab-separated name.
                continue

            # Ids look like "<db>:<id>"; tolerate a missing "<db>:" prefix.
            id_parts = temp_split[0].strip().split(":")
            path_id = id_parts[1] if len(id_parts) > 1 else id_parts[0]
            path_name = temp_split[1].strip()

            for marker, id_type in kegg_id_types:
                if marker in path_id:
                    path_array.append(
                        gnomics.objects.pathway.Pathway(
                            identifier=path_id,
                            identifier_type=id_type,
                            source="KEGG",
                            name=path_name))
                    break
            else:
                # Unknown id flavour: print the raw entry instead.
                print(k.get(path_id))

    return path_array
Exemplo n.º 31
0
def pathwayVisualization(KEGG_id, path_to_csv, redirect=True, compound=False):
    """
    The pathwayVisualization function returns a graph visualization based on user input

    The pathway is fetched from KEGG as KGML, turned into a directed graph by
    the helper functions of this module, coloured with the log2 fold changes
    computed from ``path_to_csv`` and handed to ``visJS_module.visjs_network``.

    Args:
        KEGG_id     (str): string specifying KEGG pathway ID to visualize
        path_to_csv (str): string specifying data to overlay on graph
        redirect    (bool): True to split nodes into their components. Defaults to True
        compound    (bool): True to display compounds (such as Ca2+). Defaults to False

    Returns:
        A graph visualization using the visjs_network function from visjs_2_jupyter,
        or None (after printing a message) when the KEGG lookup for ``KEGG_id``
        yields 404 or 400.

    NOTE(review): the graph is accessed through ``G.node`` / ``G.edge``, which
    looks like the NetworkX 1.x API - confirm the pinned NetworkX version.
    """

    s = KEGG()

    res = s.get(KEGG_id, "kgml")

    # A 404/400 result is treated as "unknown pathway id".
    if res in (404, 400):
        # Parenthesised form prints identically on Python 2 and Python 3.
        print(KEGG_id + ' is not a valid KEGG ID')
        return

    result = s.parse_kgml_pathway(KEGG_id)

    ETroot = parsingXML(KEGG_id, s)

    G = nx.DiGraph()

    max_id, compound_array = addNodes(G, result)
    setCoord(G, ETroot)

    if redirect is False:
        getNodeSymbols(G, s, compound)
    else:
        parent_list, parent_dict = splitNodes(G, s, max_id)

    complex_array, component_array, node_dict, comp_dict = undefNodes(G, ETroot)

    if redirect is False:
        addEdges(G, result, component_array, node_dict)
    else:
        addAndRedirectEdges(G, result, complex_array, component_array, parent_list, parent_dict, node_dict, comp_dict)

    #add reactions to graph
    addReaction(G, ETroot)

    # Edge labels and colours are computed for every edge present at this
    # point, i.e. before any node is removed below.
    edge_to_name = dict()
    edge_to_color = dict()  # edges are transparent
    for edge in G.edges():
        attrs = G.edge[edge[0]][edge[1]]
        name = attrs['name']

        # Plain (de)phosphorylation edges are labelled with their 'value';
        # in composite names the word is abbreviated to '-p' / '+p'.
        if name in ('phosphorylation', 'dephosphorylation'):
            label = attrs['value']
        elif 'dephosphorylation' in name:
            label = name.replace('dephosphorylation', '-p')
        elif 'phosphorylation' in name:
            label = name.replace('phosphorylation', '+p')
        else:
            label = name

        # activation/inhibition is shown through the colour, not the label
        for word in ('activation, ', 'inhibition, ', 'activation', 'inhibition'):
            label = label.replace(word, '')
        edge_to_name[edge] = label

        if 'activation' in name:
            edge_to_color[edge] = 'rgba(26, 148, 49, 0.3)' #green
        elif 'inhibition' in name:
            edge_to_color[edge] = 'rgba(255, 0, 0, 0.3)' #red
        else:
            edge_to_color[edge] = 'rgba(0, 0, 255, 0.3)' #blue

    #for graph with split nodes
    if redirect is True:
        #remove undefined nodes from graph
        G.remove_nodes_from(complex_array)

        #remove nodes with more than one gene
        G.remove_nodes_from(parent_list)

    if compound is False:
        #remove compound nodes
        G.remove_nodes_from(compound_array)

    # Display symbol per node: 'map' nodes use their gene_names, other nodes
    # prefer 'symbol', then 'gene_names', then 'name'.
    node_to_symbol = dict()
    for node in G.node:
        data = G.node[node]
        if data['type'] == 'map':
            node_to_symbol[node] = data['gene_names']
        elif 'symbol' in data:
            node_to_symbol[node] = data['symbol']
        elif 'gene_names' in data:
            node_to_symbol[node] = data['gene_names']
        else:
            node_to_symbol[node] = data['name']

    # getting name of nodes
    node_to_gene = {node: G.node[node]['gene_names'] for node in G.node}

    # getting x and y coord of nodes
    node_to_x = {node: G.node[node]['x'] for node in G.node}
    node_to_y = {node: G.node[node]['y'] for node in G.node}

    id_to_log2fold = log2FoldChange(G, path_to_csv)

    # Create color scale with negative as green and positive as red
    my_scale = spectra.scale([ "green", "#CCC", "red" ]).domain([ -4, 0, 4 ])

    # color nodes based on log2fold data; nodes without data stay light grey
    node_to_color = dict()
    for node in G.nodes():
        if node in id_to_log2fold:
            node_to_color[node] = my_scale(id_to_log2fold[node][0]).hexcode
        else:
            node_to_color[node] = '#f1f1f1'

    # getting nodes in graph
    nodes = G.nodes()
    node_map = {n: index for index, n in enumerate(nodes)}  # map to indices for source/target in edges

    # getting edges in graph
    edges = G.edges()

    # dictionaries that hold per node and per edge attributes
    nodes_dict = [{"id":node_to_gene[n],"degree":G.degree(n),"color":node_to_color[n], "node_shape":"box",
                 "node_size":10,'border_width':1, "id_num":node_to_symbol[n], "x":node_to_x[n], "y":node_to_y[n]} for n in nodes]

    edges_dict = [{"source":node_map[e[0]], "target":node_map[e[1]],
                  "color":edge_to_color[e], "id":edge_to_name[e], "edge_label":'',
                 "hidden":'false', "physics":'true'} for e in edges]

    # html file label for first graph (must manually increment later)
    graph_label = 1700

    # create graph here
    #return G
    return visJS_module.visjs_network(nodes_dict, edges_dict, time_stamp = graph_label, node_label_field = "id_num",
                               edge_width = 3, border_color = "black", edge_arrow_to = True, edge_font_size = 15,
                               edge_font_align= "top", physics_enabled = False, graph_width = 1000, graph_height = 1000)