def __init__(self, qr):
    # Parse each Google Scholar result div; bs4 and re are assumed to be imported at module level.
    self.bib = dict()
    url = []
    urlText = []
    abstract = []
    cite = []
    pub_details = []
    author = []
    year = []
    pub_journal = []
    for div in qr:
        temp = bs4(str(div.findAll("div", {"class": "gs_ri"})))
        self.bib['url'] = temp.find('a')['href']  # get the url
        self.bib['title'] = temp.find('a').text  # get the title of the url
        self.bib['Publisher'] = bs4(str(div.findAll("div", {"class": "gs_a"}))).text  # get publishing details
        self.bib['abstract'] = bs4(str(div.findAll("div", {"class": "gs_rs"}))).text  # get abstract of publication
        self.bib['citedby'] = bs4(str(div.findAll("div", {"class": "gs_fl"}))).find('a').text  # get number of citations
        # accumulate the per-result values so they survive the next iteration
        url.append(self.bib['url'])
        urlText.append(self.bib['title'])
        pub_details.append(self.bib['Publisher'])
        abstract.append(self.bib['abstract'])
        cite.append(self.bib['citedby'])
    # publisher strings look like "Author - Year - Journal"
    for pub in pub_details:
        temp = pub.split('-')
        author.append(temp[0])
        year.append(temp[1])
        try:
            pub_journal.append(temp[2])
        except IndexError:
            pub_journal.append('NA')
    self.url = url
    self.urlText = urlText
    self.abstract = abstract
    self.cite = re.sub("[A-Z,a-z]", "", str(cite))  # keep only the digits from the "Cited by N" strings
    self.author = author
    self.pub_journal = pub_journal
    self.year = year
class CheckMp3Path(object):

    def __init__(self, thread_num=30):
        self.data_file = "prder"
        self.session = requests.session()
        self.session.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
        }
        self.get = self.session.get
        self.thread_num = thread_num
        self.html_parser = htmlP()

    def start(self):
        records = self.load_file()
        index = 0
        que = Queue.Queue()
        threads = []
        for i in records:
            index += 1
            que.put([i, self.req])
        # spawn worker threads that drain the queue
        for j in xrange(self.thread_num):
            threads.append(Down(que, "thread-" + str(j), DB()))
        for j in threads:
            j.setDaemon(True)
            j.start()
        for j in threads:
            j.join()
        print "END ALL"

    def req(self, album):
        url = "http://www.xiami.com/search/album?key=%s" % album
        print "searching:", url
        con = self.get(url).content
        try:
            # first album link on the search result page
            href = re.findall('CDcover100.+?href="(.+?)"', con, re.S)[0]
        except IndexError, e:
            print e, url
            return "", ""
        print "album:", href
        con = self.get(href).content
        try:
            # "语种" is the "language" row of the album info table
            language = re.findall("语种.+?td>.+?<td.+?>(.+?)</td", con, re.S)[0].strip()
            language = bs4(language).text
            language = self.html_parser.unescape(language).encode("utf8")
            #rtime = '-'.join(re.findall("<span>发行时间:(\d+)年(\d+)月(\d+)日</span", con, re.S)[0])
            #rtime = re.findall("发行时间.+?top\">(.+?)</td", con, re.S)[0].replace("年", '-').replace("月", '-').replace("日", '')
            # "专辑风格" is the "album genre" row
            genre = re.findall("专辑风格.+?<a.+?>(.+?)</a", con, re.S)[0].strip()
            genre = bs4(genre).text
            genre = self.html_parser.unescape(genre).encode("utf8")
        except Exception, e:
            print e
            return "", ""
        return language, genre
def searchResults(topic, n=None):
    queryRes = []
    query = addQuotes(topic)
    if n is None:
        n = 10
    for i in range(0, n):
        # url to scrape; "%d0" yields start=00, 10, 20, ... (10 results per page)
        url_scrape = 'https://scholar.google.com/scholar?start=%d0' % i + '&q=%s' % query + '&hl=en&as_sdt=0,44'
        # add delay between requests (currently 0 seconds, i.e. no delay)
        time.sleep(0)
        # get the html data
        html = requests.get(url=url_scrape)
        # Convert html text to beautiful soup object
        soup = bs4(html.text)
        '''
        f = open('TopicQuery_HTML.txt','wb')
        f.write(str(soup))
        f.close()
        '''
        # Get the query results from html page
        q_table = soup.findAll("div", {"class": "gs_r"})
        q = queryResult(q_table)
        # collect the parsed page so it can be returned to the caller
        queryRes.append(q)
    return queryRes
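# Usage sketch (the topic string is made up): each element of the returned
# list is a queryResult built from one page of Scholar results.
pages = searchResults('convolutional neural networks', n=2)
for page in pages:
    print(page.urlText)  # list of result titles for that page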
def save_data_from_link(link):
    final_data = {}
    try:
        page = html.fromstring(requests.get(link).text)
        final_data['title'] = page.xpath("//h3[@class='post-title entry-title']//div//p//text()")[0]
        # <dt>/<dd> pairs of the horizontal definition list become key/value pairs
        all_dd = page.xpath("//dl[@class='dl-horizontal']//dd")
        all_dt = page.xpath("//dl[@class='dl-horizontal']//dt")
        for item in zip(all_dt, all_dd):
            final_data[bs4(etree.tostring(item[0])).text] = bs4(etree.tostring(item[1])).text
        # about, event, register, contact are converted to Markdown-structured text
        final_data['about'] = html2text(etree.tostring(page.xpath("//div[@id='info']")[0]))
        final_data['event'] = html2text(etree.tostring(page.xpath("//div[@id='events']")[0]))
        final_data['register'] = html2text(etree.tostring(page.xpath("//div[@id='register']")[0]))
        final_data['contact'] = html2text(etree.tostring(page.xpath("//div[@id='contact']")[0]))
        final_data['tags'] = page.xpath("//ul[@class='list-unstyled list-inline blog-tags']//a//text()")
    except:
        # any missing element leaves final_data partially filled
        pass
    return final_data
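# Usage sketch with a placeholder URL: returns a dict of the scraped fields,
# or an empty/partial dict if any lookup on the page fails.
post = save_data_from_link('http://example.com/some-post')
print(post.get('title'), post.get('tags'))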
def login(self, *arg):
    url = 'https://kyfw.12306.cn/otn/index/init'
    self.request = urllib2.Request(url)
    try:
        response = urllib2.urlopen(self.request).read()
    except Exception:
        # retry on any error (note: this recursion has no retry limit)
        return self.login()
    soup = bs4(response)
    data = soup.find('a', {'id': 'login_user'}).text
    print data
def get_college_details():
    # url and get_source are assumed to be defined at module level
    source = get_source(url)
    soup = bs4(source)
    x = soup.findAll('tr')
    i = 1
    details = []
    for item in x:
        country = item.find('td', {'class': 'country'}).img['alt']
        rank = i
        university = item.a.text
        #print (country, rank, university)
        i = i + 1
        details.append((country, rank, university))
    return details
def getNFLMatchups(self, path):
    results = []
    soup = bs4(open(path, 'r').read())
    ##grid = soup.find("td", {"class" : "viBodyBorderNorm"})  ### bigger table, but seemingly can get more specificity by color
    grid = soup.find("table", {"bgcolor": "C48F1B"})
    # tr color d6bd7b are rows with info that I don't need, so remove them
    extraneous_rows = grid.findAll("tr", {"bgcolor": "d6bd7b"})
    [row.extract() for row in extraneous_rows]
    game_rows = grid.findAll("tr")
    # note: the above game_rows have all betting information, helpful for future stories
    for game in game_rows:
        dirty_teams = game.findAll("a", {"target": None})
        row = {'home': dirty_teams[1].text, 'away': dirty_teams[0].text}
        results.append(row)
    return results
def search_forms(self, txt):
    # Soup Object
    self.soup = bs4(txt)
    self.forms = self.soup.findAll('form')
    self.form_len = len(self.forms)
    if self.form_len == 0:
        # no forms on the page
        return False
    self.site = {}
    self.i = 0
    for self.form in self.forms:
        self._f = self.control(self.form)
        self._key = str(self.i)
        self.site[self._key] = {}
        # copy the form's attributes
        for self.key in self._f:
            self.site[self._key][self.key] = self._f[self.key]
        # collect every <input> inside this form
        self.inputs = self.forms[self.i].findAll('input')
        self.site[self._key]['inputs'] = []
        for self._input in self.inputs:
            self._in = {}
            self._i = self.control(self._input)
            for self.key in self._i:
                self._in[self.key] = self._i[self.key]
            self.site[self._key]['inputs'].append(self._in)
        self.i += 1
    return self.site
# record here presumably comes from an earlier Entrez.read(Entrez.egquery(...)) call
for row in record['eGQueryResult']:
    if row["DbName"] == "pubmed":
        print row["Count"]

handle = Entrez.esearch(db="pubmed", term="Kai Zheng", retmax=200, usehistory="y")
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]
webenv = record["WebEnv"]
query = record["QueryKey"]
papers = Entrez.efetch(db="pubmed", query_key=query, rettype="abstract",
                       WebEnv=webenv, retmode="html", retmax=50)
for pap in papers:
    try:
        pap = bs4(pap)
        print pap.prettify()
    except:
        print 'Nope'
#    print "title:", record.get("TI")
#    print "authors:", record.get("AU")
#    print "source:", record.get("SO")
# hand2 = Entrez.efetch(db="pubmed", id=idlist, rettype='abstract', retmode="text", retmax=50)
# records = Medline.parse(handle)
# for record in records:
#     print records
#     print "title:", record.get("TI", "?")
#     print "authors:", record.get("AU", "?")
#     print "source:", record.get("SO", "?")
def get_rental_data(neighborhoods):
    """This function loops through all the items in neighborhoods, scrapes
    craigslist for data for that neighborhood, appends it to a list, and
    uploads a json to s3.

    Args:
        neighborhoods: a dictionary containing the names of the neighborhoods
            as keys and the craigslist URLs as values.
    """
    # Create list to hold all scraped data
    rental_data = []

    # Loop through neighborhoods dict
    for neighborhood, url in neighborhoods.items():

        # Retrieve page with the requests module
        response = requests.get(url)

        # Create BeautifulSoup object; parse with 'lxml'
        soup = bs4(response.text, 'lxml')

        # results are returned as an iterable list
        results = soup.find_all('li', class_="result-row")

        # Loop through returned results
        for result in results:

            # Error handling
            try:
                # Identify and return bedrooms and footage
                raw_br = result.find('span', class_="housing").text.split("-")[0].strip()
                if regex.search(raw_br):
                    bedrooms = float(regex.search(raw_br).group(1))
                else:
                    continue

                raw_sqft = result.find('span', class_="housing").text.split("-")[1].strip()
                if regex.search(raw_sqft):
                    sqft = float(regex.search(raw_sqft).group(1))
                else:
                    continue

                # Get datetime of post
                datetime = result.find("time")["datetime"]

                # Identify and return title of listing
                title = result.find('a', class_="result-title").text

                # Identify and return price of listing
                price = float(result.a.span.text.strip("$"))

                # Identify and return link to listing
                link = result.a['href']

                # Create dictionary for result
                data = {
                    "neighborhood": neighborhood,
                    "datetime": datetime,
                    "title": title,
                    "price": price,
                    "bedrooms": bedrooms,
                    "sqft": sqft,
                    "link": link
                }

                # Append data to list
                rental_data.append(data)

            except:
                continue

    # Load rental data to s3
    obj = s3.Object(output_bucket, ouput_obj_path)
    obj.put(Body=json.dumps(rental_data, separators=(',', ':')))
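# Usage sketch: the neighborhood names and search URLs below are placeholders,
# and s3, output_bucket, ouput_obj_path, and regex are assumed to be configured
# at module level before calling.
sample_neighborhoods = {
    "Mission": "https://sfbay.craigslist.org/search/apa?query=mission",
    "Sunset": "https://sfbay.craigslist.org/search/apa?query=sunset",
}
get_rental_data(sample_neighborhoods)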
def strip_tags(html):
    return ''.join(bs4(html).findAll(text=True))
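# Quick check of strip_tags on a small inline snippet: only the text nodes survive.
print(strip_tags('<p>Hello <b>world</b></p>'))  # Hello world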
def makeSoup(src):
    return bs4(src)
import requests
from config import account, password
from BeautifulSoup import BeautifulSoup as bs4

s = requests.session()

# initial request parameters
urlLogin = "******"
header = {
    'Host': 'signin.fcu.edu.tw',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://signin.fcu.edu.tw/clockin/login.aspx',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'
}

# get initial page session
r = s.get(urlLogin)
response = bs4(r.text)

# initial login parameters ('登入' is the submit button's value, "Log in")
postdataLogin = {
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    'LoginLdap$LoginButton': '登入'
}

# parse the page's hidden form fields into the POST data
for element in response.findAll('input', {'type': 'hidden', 'value': True}):
    postdataLogin[str(element['name'])] = str(element['value'])
postdataLogin['LoginLdap$UserName'] = account
postdataLogin['LoginLdap$Password'] = password

# log in
loginHtml = s.post(urlLogin, data=postdataLogin, headers=header)