from requests import Session
from robobrowser import RoboBrowser


def download_linke(coords, proxy, port, saveFile, saveMode):
    # get_monthly_linke_str and get_linke_values are project helpers defined elsewhere
    print(proxy, port)
    print(proxy != "")
    url = ("http://www.soda-is.com/eng/services/service_invoke/gui.php?"
           "xml_descript=soda_tl.xml&Submit2=Month")
    session = Session()
    session.verify = False
    if proxy != "":
        session.proxies = {proxy: port}
    br = RoboBrowser(session=session, parser="lxml")
    br.open(url)
    linke_form = br.get_forms()[1]
    num = len(coords)
    index = 0
    with open(saveFile, saveMode) as f:
        try:
            for coord in coords:
                inlon, inlat = coord
                linke_form["lat"].value = inlat
                linke_form["lon"].value = inlon
                sf = linke_form.submit_fields.getlist("execute")
                br.submit_form(linke_form, submit=sf[0])
                linke_table = br.find("table", {"cellspacing": "0", "cellpadding": "2"})
                linkes = get_monthly_linke_str(get_linke_values(linke_table))
                s = "%s,%s,%s\n" % (format(inlon, "0.5f"), format(inlat, "0.5f"), linkes)
                if len(s) > 48:
                    f.write(s)
                print("Done with point %i of %i: (%s, %s)" % (
                    index + 1, num, format(inlon, "0.5f"), format(inlat, "0.5f")))
                index += 1
                br.back()
            print("DONE!")
        except Exception as e:
            # record any points that were not downloaded before the failure
            not_dl = list(coords[index:])
            with open(saveFile + "_notdownloaded.txt", "w") as nd:
                for c in not_dl:
                    nd.write("%s,%s\n" % (str(c[0]), str(c[1])))
            print(e)
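# A minimal invocation sketch for download_linke, assuming coords holds
# (lon, lat) pairs as the unpacking inside the function suggests; the points
# and file name are hypothetical, and an empty proxy string disables the proxy.
coords = [(-3.70379, 40.41678), (2.15899, 41.38879)]
download_linke(coords, proxy="", port="", saveFile="linke_values.csv", saveMode="w")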
def main():
    # Browse to Rap Genius (a parser must be declared to avoid a bs4 warning)
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open('http://rapgenius.com/')

    # Search for Queen
    form = browser.get_form(action='/search')  # <RoboForm q=>
    form['q'].value = 'queen'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_name')
    try:
        browser.follow_link(songs[0])
    except IndexError:
        print("Songs Index doesn't exist!")
        return
    lyrics = browser.select('.lyrics')
    try:
        print(lyrics[0].text)  # \n[Intro]\nIs this the real life...
    except IndexError:
        print("Lyrics Index doesn't exist!")

    # Back to results page
    browser.back()

    # Look up my favorite song
    browser.follow_link('death on two legs')

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    print(lyrics.text)  # \n[Verse 1]\nYou suck my blood like a leech...
def fetch(url):
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open(url)
    votes = browser.select('.moderatorenSlider a.beitrag')
    followed_links = set()
    total_scores = {}
    for v in votes:
        # skip links we have already visited
        if v["href"] in followed_links:
            continue
        followed_links.add(v["href"])
        print(v["href"])
        browser.follow_link(v)
        try:
            scores = extractVotes(browser)
            print(scores)
            # accumulate (total score, number of votes) per title
            for title, score in scores.items():
                if title not in total_scores:
                    total_scores[title] = (score, 1)
                else:
                    score_, num = total_scores[title]
                    total_scores[title] = (score_ + score, num + 1)
        except Exception as e:
            print(e)
        browser.back()
    return total_scores
def download_internal(user_id, from_date, to_date):
    """Download the csv files for the transaction between the given dates"""
    # Create the browser and open the lloyds login page
    browser = RoboBrowser(parser='html5lib')
    browser.open(
        'https://online.lloydsbank.co.uk/personal/logon/login.jsp?WT.ac=hpIBlogon'
    )

    while 'Enter Memorable Information' not in browser.parsed.title.text:
        print(browser.parsed.title.text)
        form = browser.get_form(id='frmLogin')
        form['frmLogin:strCustomerLogin_userID'] = str(user_id)
        # (reconstructed: this span was masked in the source) prompt for the
        # password and submit the login form
        form['frmLogin:strCustomerLogin_pwd'] = prompt('Enter password: ')
        browser.submit_form(form)

    # we're logged in, now enter memorable information
    print(browser.parsed.title.text)
    form = browser.get_form(id='frmentermemorableinformation1')
    field = 'frmentermemorableinformation1:strEnterMemorableInformation_memInfo{0}'
    for i in range(1, 4):
        label = browser.find("label", {"for": field.format(i)})
        form[field.format(i)] = ' ' + prompt(label.text.strip())
    browser.submit_form(form)

    # hopefully now we're logged in...
    print(browser.parsed.title.text)
    links = []
    for link in browser.get_links("View statement"):
        if link.text == "View statement":
            links.append(link)

    # loop through all accounts
    for index, link in enumerate(links):
        acc_name = link['data-wt-ac'].split(" resource")[0]
        print(acc_name)
        print(browser.parsed.title)
        browser.follow_link(link)
        yield acc_name, download_account_internal(browser, from_date, to_date)
        browser.back()
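# A hypothetical usage sketch: download_internal is a generator yielding
# (account_name, data) pairs, so it must be iterated to do any work. The
# user id and date strings below are invented for illustration.
for acc_name, statement in download_internal(user_id=12345678,
                                             from_date='01/01/2020',
                                             to_date='31/01/2020'):
    print(acc_name, statement)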
def scrape_revigo_csv(input_GOstats_tsv, out_file, pvalue_cutoff=0.05, fdr_cutoff=1.0):
    """Submit GO terms from a GOstats result to Revigo and save the CSV export."""
    oh = open(out_file, "w")
    # get input goterms from GOstats result
    goterms = GOstats2Revigo(input_GOstats_tsv,
                             pvalue_cutoff=pvalue_cutoff,
                             fdr_cutoff=fdr_cutoff,
                             output_column=3)
    if goterms:
        br = RoboBrowser(parser="lxml")
        br.open("http://revigo.irb.hr/")
        form = br.get_form()
        #print(form)
        form["goList"].value = goterms
        br.submit_form(form)
        download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
        br.follow_link(download_rsc_link)
        #r_code = br.response.content.decode("utf-8")
        #print(r_code)
        br.back()
        download_csv_link = br.find("a", href=re.compile("export.jsp"))
        br.follow_link(download_csv_link)
        csv_content = br.response.content.decode("utf-8")
        oh.write(csv_content)
    else:
        # no significant terms: write the header line only
        oh.write(
            "term_ID,description,frequency,plot_X,plot_Y,plot_size,log10 p-value,userVal_2,uniqueness,dispensability,representative,eliminated"
        )
    oh.close()
def autoRevigo(name):
    os.chdir("/home/david/Documents/BenoitLab/RNA-seq/Gprofiler/")
    # read the GO terms list from the input file (readlines-plus-append added a
    # spurious extra newline per line; reading the file whole avoids that)
    with open(name, "r") as f:
        goterms = f.read()
    br = RoboBrowser(parser="lxml")
    br.open("http://revigo.irb.hr/")
    form = br.get_form()
    form["goList"].value = goterms
    br.submit_form(form)
    download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
    br.follow_link(download_rsc_link)
    r_code = br.response.content.decode("utf-8")
    print(r_code)
    br.back()
    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")
    with open("/home/david/Documents/BenoitLab/RNA-seq/Revigo/" + name, "w") as writefile:
        writefile.write(csv_content)
def attack(self):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
    accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    accept_language = 'en-US,en;q=0.5'
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = accept
    s.headers['Accept-Language'] = accept_language
    robo = RoboBrowser(session=s, history=True, parser='html.parser')
    robo.open(self.url)
    # find the login form by the configured attribute ('is' identity checks
    # replaced with '==', which is what string comparison requires)
    if self.atr_form == "id":
        form = robo.get_form(id=self.atr_value)
    elif self.atr_form == "class":
        form = robo.get_form(class_=self.atr_value)
    elif self.atr_form == "name":
        form = robo.get_form(name=self.atr_value)
    elif self.atr_form == "action":
        form = robo.get_form(action=self.atr_value)
    else:
        self.log.append("Login form not found")
        return None

    # submit deliberately wrong credentials to record the failed-login URL
    form[self.name_input[0]].value = "xxxxx"
    form[self.name_input[1]].value = "xxxxx"
    robo.submit_form(form)
    urlFailed = str(robo.url)

    # run the brute-force attack
    for username in self.user_list:
        for password in self.pass_list:
            robo.open(self.url)
            form[self.name_input[0]].value = username
            form[self.name_input[1]].value = password
            robo.submit_form(form)
            url = str(robo.url)
            if url != urlFailed:
                self.log_csv.append(["Brute Force", self.url])
                self.log.append("login succeeded, or the account has been locked out after too many attempts")
                # (reconstructed: this logging block was masked in the source)
                self.log.append("Url after login : " + url)
                self.log.append("Username : " + username)
                self.log.append("Password : " + password)
                self.log_csv.append(["Sensitive Data Exposed", self.url])
                self.log.append("url contains sensitive data, possible vulnerability")
                try:
                    # look for the PHPSESSID session cookie
                    sess = robo.session.cookies['PHPSESSID']
                    self.log_csv.append(["Session ID Exposed", self.url])
                    self.log.append("found PHPSESSID, possible session fixation vulnerability")
                    self.log.append("PHPSESSID : " + sess)
                    # try to log out and navigate back
                    urlLog = robo.url
                    linkLogout = robo.get_link(text="logout")
                    if linkLogout is not None:
                        robo.follow_link(linkLogout)
                        robo.back(n=1)
                        if robo.url == urlLog:
                            self.log.append("session not destroyed, possible vulnerability")
                except:
                    pass
                return
            time.sleep(5)
    self.log.append("Brute force failed - login not successful")
    return
# Fragment: body of a loop over Bitbucket repository links; the enclosing
# for loop is not part of this excerpt.
try:
    link['href'] = 'https://bitbucket.org' + link['href'].strip()
    # link['href'] = '/odigeoteam/frontend-html5'
    print(link['href'])
    # print(link)
    browser.follow_link(link)
    branches = browser.select('li.branches')
    if len(branches) > 0:
        print('branches ' + branches[0].select('span.value')[0].text)
    tags = browser.select('li.tags')
    if len(tags) > 0:
        print('tags ' + tags[0].select('span.value')[0].text)
    enlaces = browser.find_all('a')
    # print(enlaces)
    for enlace in enlaces:
        if enlace.get('href') == '#forks':
            print('forks ' + enlace.select('span.value')[0].text)
        if enlace.get('href') == '#tags':
            print('tags ' + enlace.select('span.value')[0].text)
        if enlace.get('href') == '#branches':
            print('branches ' + enlace.select('span.value')[0].text)
        if enlace.get('href') == '#followers':
            print('watchers ' + enlace.select('span.value')[0].text)
    # Back to results page
    browser.back()
except Exception:
    pass
def scrapeTerms(num_terms=1, requested_term="", saveHtml=False):
    # define a list for course information
    courses = []

    # create a new browser to navigate search forms
    browser = RoboBrowser(parser='html.parser')

    # open course search page and get term options
    browser.open('https://www.uvic.ca/BAN1P/bwckschd.p_disp_dyn_sched')
    term_options = browser.find_all("option")
    if debug:
        found_terms = []
        for term_option in term_options:
            found_terms.append(term_option['value'])
        print("found terms: " + str(found_terms))

    # iterate through data for all terms
    for term_option in term_options:
        # isolate term value
        term = term_option['value']

        # skip empty form values
        if len(term) == 0:
            continue

        # collect only the requested amount of data
        if requested_term != "" and term != requested_term:
            continue
        if num_terms == 0:
            break
        else:
            num_terms -= 1

        print('COLLECTING FROM TERM ' + term + ". \n")

        # get, fill, and submit the search form (there is only one form on this page)
        term_form = browser.get_form()
        term_form['p_term'].value = term
        browser.submit_form(term_form)

        # get a list of all available subjects
        subject_dropdown = browser.find('select', attrs={"name": "sel_subj"})
        for subject_option in subject_dropdown.find_all("option"):
            # isolate subject abbreviation
            subject = subject_option['value']

            # get, fill, and submit the class schedule form
            schedule_form = browser.get_form()
            schedule_form.fields.getlist('sel_subj')[1].value = subject
            browser.submit_form(schedule_form)

            # save the search result page (browser.parsed is a property, not a call)
            result = str(browser.parsed)
            if saveHtml:
                filename = 'ResultPages\\' + term + subject + '.html'
                with open(filename, 'w') as file:
                    try:
                        file.write(result)
                        print(filename + ' downloaded')
                    except Exception as e:
                        print("Could not save file! ", e)

            # parse the resulting html string
            print('Parsing ' + term + subject + '.html ... ', end="")
            subject_courses = parse(result, term)
            for subject_course in subject_courses:
                courses.append(subject_course)

            # go back to the class search form
            browser.back(1)

        # go back to the term selection form
        browser.back(1)

    # return from scrapeTerms
    return courses
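# A hypothetical call: collect courses for the first two terms found on the
# page, without saving the raw HTML (debug and parse() are module-level names
# from the omitted surrounding code).
courses = scrapeTerms(num_terms=2, requested_term="", saveHtml=False)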
def _download_traces(dates, period):
    url = "http://archive.routeviews.org/route-views.wide/bgpdata/"
    MRAI_BIN_DIR = "/srv/agarcia/TFM/"  # path to save files
    if period == 0:
        out_dir = MRAI_BIN_DIR + "ONE_DAY_BGP_TRACES_BZ2/"
    elif period == 1:
        out_dir = MRAI_BIN_DIR + "THREE_DAY_BGP_TRACES_BZ2/"
    else:
        out_dir = MRAI_BIN_DIR + "BGP_TRACES_BZ2/"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for _dt in dates:
        dt = datetime.datetime.strptime(_dt, "%Y-%m-%d")
        dt_web = dt.strftime("%Y") + "." + dt.strftime("%m") + "/"
        print("Opening browser...")
        br = RoboBrowser(parser="html.parser")
        br.open(url)
        # Find the link for the month we want and click it
        link_date = br.get_link(dt_web)
        br.follow_link(link_date)
        # Find the UPDATES link and click it
        link_update = br.get_link("UPDATES/")
        br.follow_link(link_update)
        # Collect the days before and after the requested date
        # (e.g. period 2 gives 5 days in total)
        days = []
        days.append(dt)
        if period != 0:
            for day_p in range(1, period + 1):
                d_before = dt - datetime.timedelta(days=day_p)
                d_after = dt + datetime.timedelta(days=day_p)
                days.insert(0, d_before)
                days.append(d_after)
        print(days)
        # Download every BGP update trace for each day
        for day in days:
            print("Downloading files of day " + day.strftime("%Y-%m-%d") + "\n")
            elem = "updates." + day.strftime("%Y%m%d")
            _dt_web = day.strftime("%Y") + "." + day.strftime("%m") + "/"
            br.back()
            br.back()
            br.follow_link(br.get_link(dt_web))
            br.follow_link(br.get_link("UPDATES/"))
            links = br.get_links(elem)
            for link in links:
                fname = (str(link).split('"'))[1]
                url_dw = url + _dt_web + "UPDATES/" + fname
                filename = out_dir + fname
                r = requests.get(url_dw)
                with open(filename, "wb") as code:
                    code.write(r.content)
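# A hypothetical call, assuming the hard-coded paths above exist: dates are
# "%Y-%m-%d" strings; period=0 fetches only the given day, period=1 adds one
# day on either side, and larger values add that many days on either side.
_download_traces(["2020-03-10"], period=1)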
def get_legistar_entries(past_entries, city, search_regex):
    agenda_url = city["agenda_site"]
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    s = requests.Session()
    s.headers = header
    browser = RoboBrowser(session=s, parser="lxml", history=True)

    # Try to open the Legistar site
    try:
        browser.open(agenda_url)
        browser.submit_form(browser.get_form())
        links = browser.find_all(href=re.compile(r"View\.ashx\?M=A"))
    except:
        print("There was a problem opening the URL: " + agenda_url)
        print("Aborting search for agendas from " + city["name"])
        return [], []

    positive_results = []
    new_agendas = []
    for link in links:
        meetingid = str(link)
        pdf_url = city["root_site"] + str(link['href'])
        meetingid = meetingid[meetingid.find(";ID=") + 4:]
        meetingid = meetingid[0:6]
        if not any(meetingid in entry for entry in past_entries):
            #print(l)
            new_agendas = new_agendas + [meetingid]
            browser.follow_link(link)
            content = browser.response.content
            term_match = search_pdf(meetingid, content, search_regex)
            browser.back()
            if len(term_match) > 0:
                page_body = str(browser.response.content)
                if city["uses_meetingagenda"]:
                    deets = re.findall(
                        ":" + meetingid + ",.*?" + meetingid + ".*?MeetingAgendaStatus",
                        page_body)
                    details = ''.join([line for line in deets[0].split('\\')])
                    details = ''.join([line for line in details.split('\"')])
                    details = ''.join([line for line in details.split(':')])
                    details = details.replace("u0026", '&')
                    meeting_date = details[details.find("start") + 5:details.find("end")].split()[0]
                else:
                    index1 = page_body.find("View.ashx?M=A&ID=" + meetingid)
                    page_body = page_body[0:index1]
                    index2 = page_body.rfind("<tr")
                    page_body = page_body[index2:]
                    date_matches = re.findall(r'[\d]+/[\d]+/\d\d\d\d', page_body)
                    meeting_date = date_matches[0]
                # turn matched terms into a string of hash tags
                matches = ""
                for term in set(term_match):
                    for bogus in ['-', ' ']:
                        if bogus in term:
                            term = term.replace(bogus, "")
                    matches = matches + "#" + term + ", "
                positive_results.append(
                    (meetingid,
                     "#" + city["short"] + " #" + city["hash_tag"] + " city meeting on "
                     + meeting_date + " about " + matches,
                     pdf_url))
    return new_agendas, positive_results
def get_non_legistar_entries(past_entries, city, search_regex):
    positive_results = []
    new_agendas = []
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    s = requests.Session()
    s.headers = header
    browser = RoboBrowser(session=s, parser="lxml", history=True)
    agenda_url = city["agenda_site"]

    # non-Legistar sites need to be very specific - these sites could throw anything at you.
    # if you need to add another city, follow this format:
    if city["short"] == "berkeley":
        try:
            browser.open(agenda_url)
            links = browser.find_all("a", title="Agenda")
        except:
            print("There was a problem opening the URL: " + agenda_url)
            print("Aborting search for agendas from " + city["name"])
            return [], []
        for link in links:
            url = city["root_site"] + str(link['href'])
            meetingid = url[url.rfind("/") + 1:url.rfind(".aspx")]
            #print(meetingid)
            if not any(meetingid in entry for entry in past_entries):
                new_agendas = new_agendas + [meetingid]
                browser.follow_link(link)
                content = str(browser.response.content)
                content = content.lower()
                content = content[content.find("innercontentcontainer"):]
                term_match = []
                m = re.findall(search_regex, content.lower())
                if m is not None and len(m) > 0:
                    term_match = term_match + list(set(m))
                browser.back()
                if len(term_match) > 0:
                    page_body = str(browser.response.content)
                    index1 = page_body.find(meetingid)
                    page_body = page_body[0:index1]
                    index2 = page_body.rfind("<tr>")
                    page_body = page_body[index2:]
                    deets = re.findall(r'[\d]+/[\d]+', page_body)
                    meeting_date = deets[0]
                    matches = ""
                    for term in set(term_match):
                        for bogus in ['-', ' ']:
                            if bogus in term:
                                term = term.replace(bogus, "")
                        matches = matches + "#" + term + ", "
                    positive_results.append(
                        (meetingid,
                         "#" + city["short"] + " #" + city["hash_tag"] + " city meeting on "
                         + meeting_date + " about " + matches,
                         url))
    elif city["short"] == "berkeleyprc" or city["short"] == "berkeleyp&j":
        try:
            browser.open(agenda_url)
            links = browser.find_all("a", title=re.compile(".genda"))
        except:
            print("There was a problem opening the URL: " + agenda_url)
            print("Aborting search for agendas from " + city["name"])
            return [], []
        for link in links:
            meetingid = str(link)
            url = city["root_site"] + str(link['href']).replace(" ", "%20")
            #print(url)
            pdf_index = url.rfind(".pdf")
            if pdf_index < 0:
                meetingid = url[url.rfind("/") + 1:]
                if not any(meetingid in entry for entry in past_entries):
                    new_agendas = new_agendas + [meetingid]
                continue
            meetingid = url[url.rfind("/") + 1:pdf_index]
            if not any(meetingid in entry for entry in past_entries):
                new_agendas = new_agendas + [meetingid]
                browser.follow_link(link)
                content = browser.response.content
                term_match = search_pdf(meetingid, content, search_regex)
                browser.back()
                if len(term_match) > 0:
                    searchdex = str(link['title'])
                    deets = searchdex.split()
                    meeting_date = deets[0].lower()
                    for bogus in string.ascii_letters:
                        if bogus in meeting_date:
                            meeting_date = meeting_date.replace(bogus, "")
                    matches = ""
                    for term in set(term_match):
                        for bogus in ['-', ' ']:
                            if bogus in term:
                                term = term.replace(bogus, "")
                        matches = matches + "#" + term + ", "
                    positive_results.append(
                        (meetingid,
                         "#" + city["short"] + " #" + city["hash_tag"] + " mtg on "
                         + meeting_date + " about " + matches,
                         url))
    else:
        return [], []
    return new_agendas, positive_results
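# A hypothetical usage sketch for the two agenda scrapers above. The dict keys
# are the ones the functions actually read; the values and the search pattern
# are invented for illustration.
city = {
    "name": "Berkeley",
    "short": "berkeley",
    "hash_tag": "citycouncil",
    "agenda_site": "https://example.org/agendas",
    "root_site": "https://example.org",
    "uses_meetingagenda": False,
}
new_agendas, positive_results = get_non_legistar_entries([], city, r"(housing|zoning)")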
# Interactive RoboBrowser session (exploratory shell history; 'term' in the
# final request is defined elsewhere in the session)
b.response.status_code
b.links
dir(b)
b.get_links()
b = RoboBrowser(parser="lxml")
b.open("http://www.chandrashekar.info")
b.get_links()
b.get_links()
b.forms
dir(b)
b.get_links()
b.get_links()[-3]
l = b.get_links()[-3]
b.follow_link(l)
b.url
b.back()
b.url
b.forward()
b.url
b.get_forms()
b.get_forms()[0]
f = b.get_forms()[0]
f
f["name"] = "smith"
f["subject"] = "dslfj lsdjf lsdjf lksdj flsdjf"
f
b.submit_form(f)
import requests
r = requests.get("http://pypi.python.org/pypi",
                 params={":action": "search", "term": term, "submit": "search"})
def scrape_revigo(data_dir):
    with open(os.path.join(data_dir, "goterms.txt"), "r") as f:
        goterms = f.read()

    br = RoboBrowser(parser="lxml")
    br.open("http://revigo.irb.hr/")
    form = br.get_form()
    form["goList"].value = goterms
    br.submit_form(form)

    # R script for the scatterplot
    download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
    br.follow_link(download_rsc_link)
    r_code = br.response.content.decode("utf-8")
    with open(os.path.join(data_dir, "rsc.R"), "a") as f:
        f.write(r_code)
    br.back()

    # CSV export of the scatterplot data
    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")
    with open(os.path.join(data_dir, "rsc.csv"), "a") as f:
        f.write(csv_content)
    br.back()

    # R script for the treemap
    download_tree_link = br.find("a", href=re.compile("toR_treemap.jsp"))
    br.follow_link(download_tree_link)
    r_code = br.response.content.decode("utf-8")
    with open(os.path.join(data_dir, "tree_map.R"), "a") as f:
        f.write(r_code)
    br.back()

    # CSV export of the treemap data
    download_csv_link = br.find("a", href=re.compile("export_treemap.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")
    with open(os.path.join(data_dir, "tree_map.csv"), "a") as f:
        f.write(csv_content)
    br.back()

    # get cytoscape graph
    cytoscape_link = br.find("a", href=re.compile("download.jsp"))
    br.follow_link(cytoscape_link)
    cytoscape_content = br.response.content.decode("utf-8")
    with open(os.path.join(data_dir, "cytoscape_map.xgmml"), "a") as f:
        f.write(cytoscape_content)
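# The Revigo scrapers above all repeat the same flow: submit the goList form,
# then follow each export link and save the response body. A consolidating
# helper could look like this sketch (revigo_export is a new name, not from
# the source; the href patterns are the ones the functions above already use).
import re
from robobrowser import RoboBrowser

def revigo_export(goterms, href_patterns, parser="lxml"):
    br = RoboBrowser(parser=parser)
    br.open("http://revigo.irb.hr/")
    form = br.get_form()
    form["goList"].value = goterms
    br.submit_form(form)
    results = {}
    for pattern in href_patterns:
        # follow each download link, capture the body, and return to the results page
        link = br.find("a", href=re.compile(pattern))
        br.follow_link(link)
        results[pattern] = br.response.content.decode("utf-8")
        br.back()
    return results

# e.g. revigo_export(goterms, ["toR.jsp", "export.jsp", "toR_treemap.jsp"])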
import re
from robobrowser import RoboBrowser

# Browse to Rap Genius
browser = RoboBrowser(history=True)
browser.open('http://rapgenius.com/')

# Search for Queen
form = browser.get_form(action='/search')
form  # <RoboForm q=>
form['q'].value = 'queen'
browser.submit_form(form)

# Look up the first song
songs = browser.select('.song_name')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text  # \n[Intro]\nIs this the real life...

# Back to results page
browser.back()

# Look up my favorite song
browser.follow_link('death on two legs')

# Can also search HTML using regex patterns
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
def scrape_bio_and_albums(keywords):
    """
    scrapes artists' data on https://www.lyrics.com/, and stores text (bio and albums)
    from an HTML page source of each artist's page in file (text&csv files).
    :param keywords: list of keywords that should represent artists' names (list)
    :return:
    """
    if keywords and isinstance(keywords, list):
        # builds new object of RoboBrowser with given params
        browser = RoboBrowser(
            parser='html.parser',
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            history=True,
            timeout=10)
        # Open a URL (using 'RoboBrowser' library).
        browser.open(BASE_URL)
        for keyword in keywords:
            if keyword and len(keyword) > 1:
                # get browser url (should be 'old' after searching a term - if browser goes to new url)
                old_url = browser.url
                # try to search the keyword on 'lyrics.com' (using RoboBrowser's form handling)
                form = browser.get_form(id='search-frm')  # Find form by ID 'search-frm'
                form['st'].value = keyword  # set query value 'st' to the given keyword
                browser.submit_form(form)  # Submit the form - to search the given keyword
                # check if the url changed (after searching a keyword)
                if old_url != browser.url:
                    # select required <a> tags, using CSS Selectors (see BeautifulSoup's
                    # documentation); the attribute value is quoted so the selector parses cleanly
                    a_tags = browser.select(
                        'body p[class~=serp-flat-list] a[href^="artist/"]')
                    if a_tags:
                        # browser.follow_link(a_tags[0])
                        # build an absolute url from the href - to open the required url with
                        # 'open()', and avoid including the "/lyrics/" part in the url
                        # that 'follow_link()' would add
                        first_artist_url = a_tags[0]['href'].replace(
                            "artist", BASE_URL + "artist")
                        # Open URL (should be the first suggested artist's page in the results)
                        browser.open(first_artist_url)
                        # parse response content (bs4 obj), using the browser's HTML parser
                        soup = browser.parsed
                        if soup:
                            artist_bio_tag = soup.find(class_='artist-bio')  # find tag by class
                            if artist_bio_tag:
                                # save parsed text (artist bio) from page source to a text file
                                save_source(keyword + " - bio",
                                            artist_bio_tag.get_text(),
                                            dir_path=os.path.join(ARTISTS_PATH, keyword))
                            # parse albums&songs from html tables, and save the data to a csv file
                            albums_to_csv(soup,
                                          keyword + " - albums",
                                          dir_path=os.path.join(ARTISTS_PATH, keyword))
                        browser.back()  # Go back in browser history.
                    browser.back()  # Go back in browser history.
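# A hypothetical call, assuming BASE_URL and ARTISTS_PATH are module-level
# constants defined in the omitted surrounding code.
scrape_bio_and_albums(["queen", "nirvana"])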
# Fragment: tail of a loop that scrapes movie pages and appends a CSV row.
# The enclosing loop, the variables names/date/firstday/firstWeekend/lifetime,
# and the csv file handle come from code omitted before this excerpt.
break  # ends an inner search loop from the omitted code

curImp = []
curText = br.parsed()  # calling the parsed soup delegates to find_all(), returning every tag
for j in curText:
    curImp.extend(j.find_all('div', class_='td-post-content'))
curS = list(set(curImp))

# take the first post-content div found
curRow = object()
for i in curS:
    curRow = i
    break

print("Movie Data")
star = curRow.p
starText = star.get_text()
cast = star.next_sibling
castText = cast.get_text()
direct = cast.next_sibling
directText = direct.get_text()

# each field is "Label: value"; keep the value part when present
starsSplit = starText.split(":")
starsAct = starsSplit[1] if len(starsSplit) >= 2 else ""
castSplit = castText.split(":")
castAct = castSplit[1] if len(castSplit) >= 2 else ""
directSplit = directText.split(":")
directAct = directSplit[1] if len(directSplit) >= 2 else ""

string = (names + "," + date + "," + firstday + "," + firstWeekend + ","
          + lifetime + "," + starsAct + "[" + castAct + "]" + ","
          + "[" + directAct + "]" + "\n")
print(string)
csv.write(string)
br.back()

if csv:
    csv.close()