import csv
import gzip
import json
import re

import pandas as pd
from bs4 import BeautifulSoup

import helper


def execute_query(db, qfields=[], verbose=False):
    # Get query qfields list
    fields = db[0].findAll("field")
    # Prepare URL
    link = db[0].findAll("link")[0]["stern"]
    # Compile URL
    if link[:4] == 'http':
        if db[0]["method"] == "POST":
            # Accumulate every field into a single POST payload
            data = {}
            for i, field in enumerate(fields):
                data[field.text] = qfields[i]
            return helper.connectionError(link, data)
        elif db[0]["method"] == "GET":
            query_string = ""
            if db[0]["type"] != "text/csv":
                for i, field in enumerate(fields):
                    # Assumption: "lowercase" is an attribute of the
                    # descriptor's <field> element
                    if field.has_attr("lowercase"):
                        print(qfields[i].lower())
                    # Detect controller field (always first field)
                    if field.text == "":
                        query_string += qfields[i] + "?"
                    # All other fields are query fields
                    else:
                        query_string += field.text + field["op"] + qfields[i] + "&"
                query_string = query_string[:-1]
            link += query_string + db[0].findAll("link")[0]["aft"]
            if verbose:
                print(link)
            return helper.connectionError(link)
    else:
        return open(link)
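# For reference, execute_query() expects `db` to be the list returned by
# findAll("database", ...) on database-description.xml. A minimal sketch of
# one entry, inferred from the attributes read above ("stern", "aft",
# "method", "type", "op") and not taken from the real file:
#
#   <database dbname="example" method="GET" type="text/html">
#     <link stern="http://example.org/search?" aft="&amp;format=html"/>
#     <field op="=">keyword</field>
#   </database>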
def planttfdb(MSUID):
    # Find the file
    url = 'http://planttfdb.cbi.pku.edu.cn/download.php'
    html_page = helper.connectionError(url)
    soup = BeautifulSoup(html_page.content, "lxml")
    # Find the download link for the japonica rice TF list
    link = None
    for search in soup.findAll('table', {"id": "oid_tfid"}):
        for linkfound in search.findAll('a'):
            if linkfound.contents[0] == "Oryza sativa subsp. japonica":
                link = 'http://planttfdb.cbi.pku.edu.cn/' + linkfound.get('href')
                break
    if link is None:
        return False
    # Full name of the file, with the .gz extension
    filename = link.split("/")[-1]
    # Name of the decompressed file, without .gz
    uncompressName = filename[:-3] + ".txt"
    pathToFile = helper.formatPathToFile(uncompressName)
    # Only download the file if it does not already exist
    if not helper.existFile(pathToFile):
        # Fetch the file by its url and decompress it
        r = helper.connectionError(link)
        decompressedFile = gzip.decompress(r.content)
        # Create the .txt file
        with open(pathToFile, "wb") as f:
            f.write(decompressedFile)
    # Use the previously created .txt file
    with open(pathToFile, "r+b") as file:
        # Import the tab-delimited file
        try:
            array = pd.read_csv(file, sep="\t", header=None)
        except pd.errors.EmptyDataError:
            return False
        # Name the columns
        array.columns = ["TF_ID", "Gene_ID", "Family"]
        data = array.loc[array['TF_ID'] == MSUID]
        if not data.empty:
            return data
        data = array.loc[array['Gene_ID'] == MSUID]
        if data.empty:
            return False
        return {"Family": data["Family"].values[0]}
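# Hedged usage sketch for planttfdb(): the ID below only illustrates the MSU
# locus format used by the PlantTFDB list; it is not guaranteed to be present.
#
#   hit = planttfdb("LOC_Os01g01010")
#   print(hit)  # DataFrame, {"Family": ...} dict, or False when absent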
def ic4r(RAPID):
    link = 'http://expression.ic4r.org/expression-api?term=' + RAPID + '#showtable'
    html_page = helper.connectionError(link)
    soup = BeautifulSoup(html_page.content, "lxml")
    # Find headers
    headers = []
    for head in soup.findAll('thead'):
        for row in head.findAll('tr'):
            for cell in row.findAll('th'):
                headers.append(cell.contents)
    # Collect one dictionary per table row, keyed by the headers above
    content = []
    for body in soup.findAll('tbody'):
        for row in body.findAll('tr'):
            entry = {}
            for i, cell in enumerate(row.findAll('td')):
                try:
                    entry[str(headers[i][0])] = cell.contents[0]
                except IndexError:
                    entry[str(headers[i][0])] = 'None'
            content.append(entry)
    return content
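# Hedged usage sketch for ic4r(): "Os01g0100100" is only an example of the
# RAP ID format the expression API expects.
#
#   for entry in ic4r("Os01g0100100"):
#       print(entry)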
def get_header(db, test_id):
    result = {}
    # Database descriptor query
    database_descriptor = BeautifulSoup(
        open("database-description.xml").read(),
        "xml").findAll("database", {"type": "text/html", "dbname": db})
    if not database_descriptor:
        raise ValueError('Database Not Found')
    for database in database_descriptor:
        if database["method"] == "POST":
            link = database.findAll("link")[0]["stern"]
            data = {'rapId': test_id}
            res = helper.connectionErrorPost(link, data)
        elif database["method"] == "GET":
            link = (database.findAll("link")[0]["stern"] + test_id
                    + database.findAll("link")[0]["aft"])
            res = helper.connectionError(link)
        # Headers declaration
        headers = []
        for header in database.findAll("header"):
            headers.append(header.text)
        # Connection handling
        ret = BeautifulSoup(res.content, "lxml")
        data_struct = database.findAll("data_struct")[0]
        data = ret.findAll(data_struct["indicator"],
                           {data_struct["identifier"]:
                            data_struct["identification_string"]})[0]
        # Header detection
        for header in data.findAll('th'):
            header = header.text.replace('\r', '').replace('\n', '').replace('\t', '')
            headers.append(header)
        result[database["dbname"]] = headers
    return result
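# Hedged usage sketch for get_header(): assumes database-description.xml
# declares a text/html database whose dbname matches the first argument
# ("rapdb" here is a hypothetical entry, not a guarantee).
#
#   print(get_header("rapdb", "Os06g0654600"))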
def gramene(RAPID):
    """
    Query the Gramene genes API for a RAP gene identifier

    :param RAPID: The RAP gene ID to look up
    :type RAPID: String
    """
    link = ('http://data.gramene.org/v53/genes?q=' + RAPID
            + '&bedFeature=gene&bedCombiner=canonical')
    html_page = helper.connectionError(link)
    return html_page.content.decode('UTF-8')
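# Hedged usage sketch for gramene(): returns the raw response text from the
# Gramene genes endpoint; the ID is only an example of the RAP format.
#
#   print(gramene("Os01g0100100"))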
def loadFileURL(nameFile, url):
    """
    Download the file located at the given URL (e.g. from the RAP-DB download page)
    """
    # Fetch the file from the url
    html_page = helper.connectionError(url)
    # Write it to disk
    with open(nameFile, "wb") as f:
        f.write(html_page.content)
    print("File created")
def msu(id):
    link = "http://rice.plantbiology.msu.edu/cgi-bin/sequence_display.cgi?orf=" + id
    html_page = helper.connectionError(link)
    soup = BeautifulSoup(html_page.content, "lxml")
    # The page exposes three <pre> blocks, in this order
    headers = ["Genomic Sequence", "CDS", "Protein"]
    sequences = {}
    for i, search in enumerate(soup.findAll('pre')):
        # Strip the FASTA header line and the newlines
        dataFormat = search.text.replace('>' + id, '').replace('\n', '')
        sequences[headers[i]] = dataFormat
    return sequences
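# Hedged usage sketch for msu(): the ORF ID below only illustrates the MSU
# format; the page is expected to expose the three <pre> blocks listed above.
#
#   seqs = msu("LOC_Os01g01010.1")
#   print(seqs["Protein"])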
def snpSeek(contig, start, end):
    # SNP-Seek genomics web service; the "model" parameter selects the
    # annotation (rap or msu7)
    url = 'http://snp-seek.irri.org/ws/genomics/gene/osnippo/'
    urlFind = url + contig + '?' + 'start=' + start + '&end=' + end + '&model=msu7'
    data = []
    with open('log.txt', 'w') as log:
        log.write(urlFind + '\n')
        try:
            r = helper.connectionError(urlFind)
            # The content is bytes, not str, hence the decode
            data = json.loads(r.content.decode('UTF-8'))
        except Exception:
            log.write(urlFind + '\n')
    # Returns a list
    return data
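# Hedged usage sketch for snpSeek(): coordinates are passed as strings because
# they are concatenated into the URL; the contig name is only an example.
#
#   genes = snpSeek("chr01", "1000", "200000")
#   print(len(genes))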
def rapdb(RAPID):
    # Parameters
    # RAPID_valide = "Os06g0654600"
    # End parameters
    link = ("http://rapdb.dna.affrc.go.jp/tools/search/run?keyword=" + RAPID
            + "&submit=Search&id=on&size=10")
    html_page = helper.connectionError(link)
    soup = BeautifulSoup(html_page.content, "lxml")
    result = soup.find('tr', attrs={"class": "result"})

    def cell(css_class, label, default="", anchor=False):
        # Extract one cell of the result row, unwrapping an <a> tag when
        # needed; fall back to a default when the cell is missing or empty
        try:
            td = result.find('td', attrs={"class": css_class})
            return (td.a if anchor else td).contents[0]
        except (AttributeError, IndexError):
            print("Error : empty " + label)
            return default

    hashmap = {
        "ID": cell("c01", "ID", default=RAPID, anchor=True),
        "Description": cell("c02", "description"),
        "Position": cell("c03", "position", anchor=True),
        "RAP-DB Gene Symbol Synonym(s)": cell("c04", "RAP symbol"),
        "RAP-DB Gene Name Synonym(s)": cell("c05", "RAP_name"),
        "CGSNL Gene Symbol": cell("c06", "CGSNL_symbol"),
        "CGSNL Gene Name": cell("c07", "CGSNL_name"),
        "Oryzabase Gene Symbol Synonym(s)": cell("c08", "Oryzabase_symbol"),
        "Oryzabase Gene Name Synonym(s)": cell("c09", "Oryzabase_name"),
    }
    return json.dumps(hashmap)
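# Usage sketch for rapdb(), using the sample ID kept in the comment above:
#
#   print(rapdb("Os06g0654600"))  # JSON string with the nine annotation fields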
def query(db, qfields=[]):
    # Database descriptor query
    database_descriptor = BeautifulSoup(
        open("database-description.xml").read(),
        "xml").findAll("database", dbname=db.lower())
    if not database_descriptor:
        raise ValueError('Database Not Found')
    # Prepare URL
    link = database_descriptor[0].findAll("link")[0]["stern"]
    # Get Headers list
    headers = []
    for header in database_descriptor[0].findAll("header"):
        headers.append(header.text)
    # Get query qfields list
    fields = []
    for field in database_descriptor[0].findAll("field"):
        fields.append(field.text)
    if database_descriptor[0]["method"] == "POST":
        # Accumulate every field into a single POST payload
        data = {}
        for i, field in enumerate(fields):
            data[field] = qfields[i]
        res = helper.connectionError(link, data)
    elif database_descriptor[0]["method"] == "GET":
        query_string = ""
        if database_descriptor[0]["type"] != "text/csv":
            for i, field in enumerate(fields):
                # Detect controller field (always first field)
                if field == "":
                    query_string += qfields[i] + "?"
                # All other fields are query fields
                else:
                    query_string += field + "=" + qfields[i] + "&"
            query_string = query_string[:-1]
        link += query_string + database_descriptor[0].findAll("link")[0]["aft"]
        res = helper.connectionError(link)
    # Handle HTML based query
    if database_descriptor[0]["type"] == "text/html":
        # Handling Connection
        ret = BeautifulSoup(res.content, "lxml")
        data_struct = database_descriptor[0].findAll("data_struct")[0]
        data = ret.findAll(data_struct["indicator"],
                           {data_struct["identifier"]:
                            data_struct["identification_string"]})
        result = []
        if data != []:
            prettify = database_descriptor[0].findAll("prettify")[0]
            regex = re.compile(prettify.text, re.IGNORECASE)
            replaceBy = prettify['replaceBy']
            for dataLine in data[0].findAll(data_struct["line_separator"]):
                line = {}
                i = 0
                for dataCell in dataLine.findAll(data_struct["cell_separator"]):
                    # Clean up the raw cell text with the descriptor's regex
                    line[headers[i]] = regex.sub(replaceBy, dataCell.text)
                    i += 1
                if line == {}:
                    continue
                result.append(line)
        return result
    # Handle JSON based query
    elif database_descriptor[0]["type"] == "text/JSON":
        # Return as a List of Dictionary
        return json.loads(res.content.decode("UTF-8"))
    # Handle csv based DB
    if database_descriptor[0]["type"] == "text/csv":
        ret = csv.reader(
            res.content.decode(database_descriptor[0]["encoding"]).splitlines(),
            delimiter=database_descriptor[0]["deli"][0],
            quoting=csv.QUOTE_NONE)
        data = []
        for row in ret:
            line = {}
            for i, header in enumerate(headers):
                line[header] = row[i]
            # Keep the row when a non-empty query field matches it
            for f, field in enumerate(fields):
                if (line[field] == qfields[f]) and (qfields[f] != ""):
                    data.append(line)
        return data
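# Hedged smoke test: needs network access and database-description.xml next
# to this file; the db name "rapdb" and the qfields layout are assumptions
# about that descriptor, not guarantees.
if __name__ == "__main__":
    print(rapdb("Os06g0654600"))
    # print(query("rapdb", ["Os06g0654600"]))  # uncomment if the descriptor defines it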