def push(site):
    print('push site is opening now....')
    time.sleep(0.5)
    sitemap_url = site["sitemapLink"]
    try:
        print('get sitemap link....')
        data_ = bp(requests.get(sitemap_url).content, 'lxml')
    except Exception as e:
        print(e)
        return
    list_url = []
    print('---------------------------------')
    for x, y in enumerate(data_.find_all('loc')):
        print(x, y.string)
        list_url.append(y.string.replace('http://', 'https://www.'))
    print('---------------------------------')
    print('pushing....')
    for x in list_url:
        print('now we push the siteWeb counts are:', x)
        get_(site["baiduAPIUrl"], x)
    print("Push finished, closing automatically in 6s....")
    time.sleep(6)
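# push() above relies on a get_(api_url, url) helper that is not shown with it.
# A minimal sketch modeled on the one-argument get_ defined further down in this
# collection; the two-argument signature and parameter names are assumptions.
def get_(api_url, url):
    headers = {'User-Agent': 'curl/7.12.1', 'Content-Type': 'text/plain'}
    try:
        # Baidu's push endpoint accepts the URL list as a plain-text POST body.
        r = requests.post(url=api_url, data=url, headers=headers)
        print(r.status_code)
        print(r.content)
    except Exception as e:
        print(e)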
def push(site):
    print('push site is opening now....')
    time.sleep(0.5)
    sitemap_url = site["sitemapLink"]
    try:
        print('get sitemap link....')
        data_ = bp(requests.get(sitemap_url).content, 'lxml')
    except Exception as e:
        print(e)
        return
    list_url = []
    print('---------------------------------')
    for x, y in enumerate(data_.find_all('loc')):
        print(x, y.string)
        list_url.append(y.string.replace('http://', 'https://www.'))
    print('---------------------------------')
    print('pushing....')
    xmlData = {"siteUrl": site["url"], "urlList": list_url}
    postTheXML(site["baiduAPIUrl"], xmlData)
    print("Push finished, closing automatically in 6s....")
    time.sleep(6)
def getSitemap(sitemapLink, saveAddress):
    print('push site is opening now....')
    time.sleep(0.5)
    site_url = sitemapLink
    try:
        print('get sitemap link....')
        data_ = bp(requests.get(site_url).content, 'lxml')
    except Exception as e:
        print(e)
        return
    list_url = []
    print('---------------------------------')
    for x, y in enumerate(data_.find_all('loc')):
        print(x, y.string)
        list_url.append(y.string.replace('http://', 'https://www.'))
    print('---------------------------------')
    print('List', list_url)
    target = saveAddress
    try:
        with open(target, "w") as f:
            for url in list_url:
                f.write(url + "\n")
        print("Write finished, path {}\nclosing automatically in 6s".format(target))
        time.sleep(5)
    except Exception as e:
        print("ERROR! ", e)
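# Hypothetical usage of getSitemap(); both the sitemap URL and the save path
# below are placeholders, not values from the original script.
getSitemap('https://example.com/baidusitemap.xml', 'sitemap_urls.txt')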
def start_1(self, text):
    s = bp(text, 'html.parser')
    e = s.find_all('a', class_='position_link')
    a = []
    for x in e:
        a.append('http:' + x.get('href'))
    return a
def process_wrapper(lineID):
    outfile1 = open("bb.2", "a+")
    outfile2 = open("bb.3", "a+")
    with open("bb.1") as infile:
        for i, line in enumerate(infile):
            if i != lineID:
                continue
            # Strip the trailing newline and query the Astrogeo source database.
            my_url = ('http://astrogeo.org/cgi-bin/imdb_get_source.csh?source='
                      + line[0:(len(line) - 1)])
            # print(lineID)
            uClient = uReq(my_url)
            page_html = uClient.read()
            uClient.close()
            page_parse = bp(page_html, "html.parser")
            containers = page_parse.findAll('tr', {"valign": "TOP"})
            if len(containers) != 0:
                container = containers[0]
                sub_containers = container.findAll("td", {'align': 'RIGHT'})
                sub_container1 = sub_containers[1]
                sub_container2 = sub_containers[2]
                print(sub_container1.text, file=outfile1)
                print(sub_container2.text, file=outfile2)
            else:
                # Placeholder coordinates when the source is missing.
                print('11:11:11.1111', file=outfile1)
                print('+11:11:11.111', file=outfile2)
                logging.info(
                    'Source not found in the Astrogeo catalogue: %s' % line)
    outfile1.close()
    outfile2.close()
def qidian_fengtui_list(url):
    r = rq.get(url)
    soup = bp(r.text, 'lxml')
    lst = soup.find_all('a')
    with open('fengtui.txt', 'a', encoding='utf-8') as tar:
        for i in lst:
            # Anchors without an href would raise a KeyError, so use .get().
            if "book.qidian.com" in i.attrs.get('href', ''):
                if i.text:
                    tar.write(i.text + '\n')
                    print(i.text)
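# Hypothetical usage of qidian_fengtui_list(); the list-page URL below is a
# placeholder. The function keeps only anchors pointing at book.qidian.com.
qidian_fengtui_list('https://www.qidian.com/')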
def get_news(url, n_pages=1):
    """ Collect news from a given web page """
    news = []
    while n_pages:
        print("Collecting data from page: {}".format(url))
        response = requests.get(url)
        soup = bp(response.content, "html.parser")
        news_list = extract_news(soup)
        next_page = extract_next_page(soup)
        url = "https://news.ycombinator.com/" + next_page
        news.extend(news_list)
        n_pages -= 1
    return news
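# Hypothetical usage of get_news(), assuming the extract_news() and
# extract_next_page() helpers from the same project are in scope.
news = get_news("https://news.ycombinator.com/newest", n_pages=2)
print(len(news))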
def start_2(self, text):
    s = bp(text, 'html.parser')
    e = s.find('dd', class_='job_request')
    fromsite = '拉勾网'
    # positionType = s.find('li', class_='labels').string
    salary = s.find('span', class_='salary').string
    positionName = s.find('span', class_='ceil-job').string
    c = ''
    for x in e.find_all('span'):
        c = c + x.string
    d = c.split('/')
    city = d[1]
    copyright()
def process_wrapper(lineID):
    outfile = open("aa2.tmp", "a+")
    with open("aa1") as infile:
        for i, line in enumerate(infile):
            if i != lineID:
                continue
            my_url = ('http://astrogeo.org/cgi-bin/imdb_get_source.csh?source='
                      + line[0:(len(line) - 1)])
            uClient = uReq(my_url)
            page_html = uClient.read()
            uClient.close()
            page_parse = bp(page_html, "html.parser")
            containers = page_parse.findAll("td", {"align": "RIGHT", "nowrap": ""})
            out = open("kk.{}".format(i), "w")
            for x in range(6, len(containers), 3):
                container = containers[x]
                out.write(container.text)
            out.close()
            # Remove the blank lines from the text file:
            with open("kk.{}".format(i), 'r') as inp:
                with open("ll.{}".format(i), 'w') as out:
                    for line1 in inp:
                        if not line1.isspace():
                            out.write(line1)
            # Check for nearly simultaneous observations in different bands:
            df = pd.read_csv("ll.{}".format(i), sep=' ', header=None,
                             names=['Date', 'Band'])
            for j, k in zip(range(len(df)), range(1, len(df))):
                d1 = df['Date'][j]
                d2 = df['Date'][k]
                b1 = df['Band'][j]
                b2 = df['Band'][k]
                if days_between(d1, d2) < 365 and b1 != b2:
                    str_line = '%s %s' % (str(i), line)
                    outfile.write(str_line)
    outfile.close()
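# process_wrapper(lineID) handles one line of "aa1" per call, which suggests
# fanning it out over all line numbers. A minimal sketch with multiprocessing;
# the pool size is a guess, and it assumes process_wrapper and its helpers
# (uReq, bp, pd, days_between) are defined as above.
from multiprocessing import Pool

if __name__ == '__main__':
    with open("aa1") as f:
        n_lines = sum(1 for _ in f)
    with Pool(4) as pool:
        pool.map(process_wrapper, range(n_lines))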
def parse(self, response):
    item = ItDataItem()
    s = bp(response.body, 'html.parser')
    a = s.find_all('table', class_='newlist')
    a.pop(0)  # drop the header table
    print(a)
    for x in a:
        item['fromsite'] = '智联招聘'
        item['positionType'] = x.b.get_text()
        item['positionName'] = x.a.get_text()
        item['salary'] = x.find_all('td', class_='zwyx')[0].get_text()
        item['city'] = x.find_all('td', class_='gzdd')[0].get_text()
        spans = x.li.find_all('span')
        # The third span holds either the education or the experience field.
        if '学历' in spans[3].get_text():
            item['education'] = spans[3].get_text()
            item['workYear'] = ''
        elif '经验' in spans[3].get_text():
            item['workYear'] = spans[3].get_text()
            item['education'] = spans[4].get_text()
        item['jobDes'] = x.li.li.get_text()
        item['company'] = x.find_all('td', class_='gsmc')[0].get_text()
        item['companySize'] = spans[2].get_text()
        item['financeStage'] = spans[1].get_text()
        item['industryField'] = ''
        # -------
        item['rate'] = 0
        item['number'] = 0
        yield item
def extract(response):
    ip_port = {'ip': '', 'port': ''}
    try:
        s = bp(response, 'html.parser')
        a = s.find_all('tr')
        a.pop(0)  # drop the header row
        for x in a:
            ip = x.find_all('td')[0].get_text()
            port = x.find_all('td')[1].get_text()
            ip_port['ip'] = ip
            ip_port['port'] = port
            if test_for_list(ip_port):
                if input('http://' + ip_port['ip'] + ':' + ip_port['port']):
                    print('insert succeeded')
                else:
                    print('insert failed')
                    continue
            else:
                print('validation failed')
                continue
        return True
    except Exception:
        return False
def start(url):
    z = 1
    for i in range(97, 153):
        urlx = url + str(i)
        # time.sleep(random.randint(29, 50))
        print('Crawl : ' + urlx)
        UA = random.choice(headerss)
        headers = {'User-Agent': UA}
        proxies = {'http': '115.237.13.176:8118'}
        req = requests.get(url=urlx, headers=headers, proxies=proxies)
        # time.sleep(random.randint(29, 60))
        # print(req.headers)
        bs = bp(req.content, 'lxml')  # .content is already bytes; no re-encoding needed
        a = bs.find_all('a', class_='xlistju')
        for x in a:
            try:
                print(str(z) + ' : ' + x.string)
                with open('longzu.txt', 'a+') as f:
                    f.write(str(z) + ' ' + str(x.string) + '\n' + '\n')
                # time.sleep(random.randint(2, 8))
                z += 1
            except Exception:
                pass
def parse_detail(self, response):
    s = bp(response.body, 'html.parser')
    item = ItDataItem()
    a = s.find_all('div', class_='tHeader')[0]
    item['fromsite'] = '前程无忧'
    item['positionType'] = 0
    item['positionName'] = a.h1.get_text()
    item['salary'] = a.strong.get_text()
    item['city'] = a.span.get_text()
    try:
        item['education'] = s.find_all('span', class_='sp2')[0].get_text()
        item['workYear'] = s.find_all('span', class_='sp1')[0].get_text()
    except Exception:
        item['education'] = ''
        item['workYear'] = ''
    item['jobDes'] = a.find_all('div', class_='bmsg')[0].get_text()
    item['company'] = a.find_all('p', class_='cname')[0].get_text()
    msg = a.find_all('p', class_='msg')[0].get_text().split('|')
    item['companySize'] = msg[1]
    item['financeStage'] = msg[0]
    item['industryField'] = msg[2]
    # -----
    item['rate'] = ''
    item['number'] = ''
    yield item
import smtplib
import requests
from bs4 import BeautifulSoup as bp

vanpeople_URL = 'https://www.vanpeople.com/c/s_W1/1/0/0/0/0/0.html'
vansky_URL = 'https://www.vansky.com/info/ZFBG08.html?page=1&location=&year=&title=w1'

"""Vanpeople Part"""
vanpeople_page = requests.get(vanpeople_URL)
soup = bp(vanpeople_page.content, 'html.parser')
vanpeople_macbook_title_list = []
vanpeople_macbook_date_list = []
hyper_link_list = []
date_bp = soup.find_all('div', class_='f-fl time')
title_link_bp = soup.find_all('a', href=True, class_='ahtitle')
for i in range(10):
    temp = title_link_bp[i]
    temp2 = date_bp[i]
    vanpeople_macbook_title_list.append(temp.text.strip())
    hyper_link_list.append(temp['href'])
    vanpeople_macbook_date_list.append(temp2.text.strip())
vanpeople_result = zip(vanpeople_macbook_title_list,
                       vanpeople_macbook_date_list, hyper_link_list)
for i in vanpeople_result:
    print(i)
# coding:utf-8
import requests
import time
from bs4 import BeautifulSoup as bp

print('Langzi.Fun auto push starting....')
time.sleep(0.5)
site_url = 'https://code.boyumanong.top/baidusitemap.xml'
try:
    print('Langzi.Fun fetching sitemap link....')
    data_ = bp(requests.get(site_url).content, 'lxml')
except Exception as e:
    print(e)
list_url = []


def get_(data):
    headers = {'User-Agent': 'curl/7.12.1 ', 'Content-Type': 'text/plain '}
    try:
        r = requests.post(
            url='http://data.zz.baidu.com/urls?site=code.boyumanong.top&token=<replace-with-your-token>',
            data=data)
        print(r.status_code)
        print(r.content)
    except Exception as e:
        print(e)
import requests
from bs4 import BeautifulSoup as bp

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
soup = bp(page.content, "html.parser")
print(page.status_code)
print("\n")
print(soup.prettify())
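# A small follow-on, assuming the demo page contains at least one <p> tag;
# find() returns None when there is no match, hence the guard.
p = soup.find('p')
if p is not None:
    print(p.get_text())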
# BeautifulSoup's next_siblings generator makes it easy to collect data from
# tables, especially ones that have a header row.
from urllib.request import urlopen
from bs4 import BeautifulSoup as bp

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = bp(html, "lxml")
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    # next_siblings excludes the header row itself and yields only the rows after it
    print(sibling)
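# A roughly equivalent way to skip the header row, using find_all() and a
# slice; unlike next_siblings this yields only tag rows, not the whitespace
# text nodes between them.
for row in bsObj.find("table", {"id": "giftList"}).find_all("tr")[1:]:
    print(row)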
def parseHtml(html):
    # Parse an HTML page and return the div element holding the translation result.
    soup = bp(html, features="lxml")
    explain = soup.find('div', {'class': "trans-container"})
    return explain
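# Hypothetical usage of parseHtml(), assuming the usual
# "from bs4 import BeautifulSoup as bp" import; the inline HTML stands in for
# a real translation-result page containing a div with class "trans-container".
html = '<div class="trans-container"><p>example translation</p></div>'
print(parseHtml(html))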
import requests as rq
from bs4 import BeautifulSoup as bp
import itertools as it

url = 'http://www.100ppi.com/mprice/mlist-1.html'
r = rq.get(url)
soup = bp(r.text, 'lxml')
print(soup)
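# A hedged next step after printing the soup: price lists like this one
# usually render as table rows, so iterating over <tr> cells is a reasonable
# probe; the exact markup of 100ppi.com is an assumption here.
for row in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in row.find_all('td')]
    if cells:
        print(cells)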
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bp

# Make the website url a variable:
my_url = 'http://astrogeo.org/cgi-bin/imdb_get_source.csh'
# Open the url:
uClient = uReq(my_url)
# Read the url and save its content as a variable:
page_html = uClient.read()
# Close the url:
uClient.close()
# Parse (i.e. analyze) the html page into its parts:
page_parse = bp(page_html, "html.parser")
# See the parts of the html page within the variable 'page_parse' in
# a structured way (or organized format) so that the sub-parts of this
# variable having the required information can be identified:
# print(bp.prettify(page_parse))
# Find the sections of the parsed html page which have
# all the required information and save them as a variable:
containers = page_parse.findAll("table", {"cellpadding": "3%"})
# Check the length of 'containers' to know how many sections
# of the same type have the required information.
# print(len(containers))
# Select the first section within containers and save it as a variable:
container = containers[0]
'''
Process: Scrape the disaster types from https://www.emdat.be/classification
Input  : website link: https://www.emdat.be/classification
Output : CSV file containing the disaster types
'''
import requests
import csv
from bs4 import BeautifulSoup as bp

url = 'https://www.emdat.be/classification'
res = requests.get(url)
soup = bp(res.text, 'html.parser')
rows = soup.find_all('tr')
data = []
for row in range(len(rows)):
    team_row = []
    columns = rows[row].findAll('td')
    for column in columns:
        team_row.append(column.getText())
    if len(team_row) == 4:
        data.append(team_row)
    else:
        # Left-pad short rows with 'NONE' so every row has 4 columns.
        d1 = ['NONE'] * (4 - len(team_row)) + team_row
        data.append(d1)
with open('filename.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)
ch_it = 0.0
for i in range(nodes):
    old = score[i]
    # Damped score update: each node accumulates the similarity-weighted
    # scores of its neighbours.
    score[i] = 1 - d
    for j in range(nodes):
        score[i] += d * similarty[i][j] * score[j] / total[j]
    # Track the largest per-node change in this iteration.
    old = score[i] - old
    if ch_it < old:
        ch_it = old
iterations += 1

files = os.listdir(path)
for file in files:
    # Reading all the files and generating the sentences and their
    # corresponding vectors in fullsentences and sentences respectively.
    doc = io.open(path + file, 'r', encoding='utf-8')
    soup = bp(doc.read(), 'html.parser')
    fulls = [word.string for word in (soup.find_all('p'))[1:]]
    for full in fulls:
        temp = full.encode('ascii').split('.')
        initial = len(fullsentences)
        fullsentences += temp
        sent = [x.translate(None, string.punctuation).strip() for x in temp]
        i = 0
        for line in sent:
            if line == '':
                fullsentences.pop(i + initial)
                continue
            temp = defaultdict(lambda: 0)
            j = 0
            for word in nltk.word_tokenize(line):
                temp[word.lower()] += 1
def parse(self, response):
    s = bp(response.body, 'html.parser')
    a = s.find_all('div', class_='el')[10:]
    for x in a:
        link = x.a.get('href')
        yield scrapy.Request(link, callback=self.parse_detail)
from bs4 import BeautifulSoup as bp
import requests
import csv

csv_file = open('./dados/fornecedores/fornecedores.csv', 'w', newline='', encoding='utf-8')
csv_write = csv.writer(csv_file)
csv_write.writerow(['id', 'empresa', 'telefone', 'site', 'email'])
url = 'http://www.abimad.com.br/associados/pesquisar#menu-top'
r = requests.get(url)
sup = bp(r.content, 'html.parser')
todos_elementos = sup.find_all('figure')
id = 0
for elemento in todos_elementos:
    empresa = elemento.h5.text
    telefone = elemento.find('i', class_='fa-phone').next_element.next_element.text
    site = elemento.find('i', class_='fa-link').next_element.next_element.text
    email = elemento.find('i', class_='fa-at').next_element.next_element.text
    csv_write.writerow([id, empresa, telefone, site, email])
    id += 1
csv_file.close()
import requests
from pprint import pprint as pp
from bs4 import BeautifulSoup as bp

url = 'https://news.ycombinator.com'
r = requests.get('https://news.ycombinator.com/news?p=2')
page = bp(r.content, 'html.parser')


def extract_news(parser):
    """ Extract news from a given web page """
    news_list = []
    titles = []
    tables = parser.table.find_all('table')  # find all the tables
    needed_table = tables[1]  # pick the one we need
    tds = needed_table.find_all('td', attrs={'class': 'subtext'})  # rows holding each story's metadata
    ttls = needed_table.find_all('a', attrs={'class': 'storylink'})  # all the title links
    for title in ttls:
        titles.append(title.text)
    for i, td in enumerate(tds):
        tr = td.find_all('a')
        url = ttls[i]['href']
        title = titles[i]
        author = td.find('a', attrs={'class': 'hnuser'})
        if author is None:
            author = '--'
        else:
            author = author.text
def ned_query_url(lon, lat, radius):
    # Build a NED Near Position Search URL for the given J2000 coordinates
    # and search radius (arcmin), restricted to radio sources.
    return ('http://ned.ipac.caltech.edu/cgi-bin/nph-objsearch?in_csys=Equatorial'
            '&in_equinox=J2000.0&lon=' + lon + '&lat=' + lat + '&radius=' + radius +
            '&out_csys=Equatorial&out_equinox=J2000.0'
            '&obj_sort=Distance+to+search+center&of=pre_text&zv_breaker=30000.0'
            '&list_limit=32&img_stamp=YES&z_constraint=Unconstrained&z_value1='
            '&z_value2=&z_unit=z&ot_include=ANY&in_objtypes3=Radio&nmp_op=ANY'
            '&search_type=Near+Position+Search')


def process_third_step(lineID):
    z_out = open("cc.1", "a+")
    with open('bb.6', 'r') as f1, open('bb.7', 'r') as f2:
        for ID, line in enumerate(zip(f1, f2)):
            if ID != lineID:
                continue
            x = line[0]
            y = line[1]
            # Query NED around the coordinates, widening the search radius
            # until a radio source turns up.
            container = None
            for radius in ('0.1', '0.5', '2.0'):
                uClient = uReq(ned_query_url(x[:-1], y[:-1], radius))
                page_html = uClient.read()
                uClient.close()
                page_parse = bp(page_html, "html.parser")
                containers = page_parse.findAll('td', {"bgcolor": "lightgrey"})
                if len(containers) != 0:
                    container = containers[0]
                    break
            if container is None:
                # Fall back to a dummy position so the pipeline still emits a row.
                uClient = uReq(ned_query_url('11h11m11.1111', '+11d11m11.111', '2.0'))
                page_html = uClient.read()
                uClient.close()
                page_parse = bp(page_html, "html.parser")
                container = page_parse.findAll('td', {"bgcolor": "lightgrey"})[0]
                logging.info(
                    'Radio source not found within 2.0 arcmin radius from NED '
                    'around coordinates:\n%s %s' % (x, y))
            outfile = open("kk.{}".format(ID), "w")
            outfile.write(container.pre.text)
            outfile.close()
            # Removing the first 3 lines of the text file:
            lines = open("kk.{}".format(ID), "r").readlines()
            open("ll.{}".format(ID), 'w').writelines(lines[3:])
            # Selecting particular columns based on their positions in the table:
            inp = open("ll.{}".format(ID), 'r')
            lines = inp.readlines()
            out = open("mm.{}".format(ID), 'w')
            for row in lines:
                out.write(row[74:83])
                out.write(row[120:123])
                out.write("\n")
            out.close()
            inp.close()
            # Sorting the text file table with respect to a column of numbers:
            with open("mm.{}".format(ID), 'r') as inp:
                with open("nn.{}".format(ID), 'w') as out:
                    lines = inp.readlines()
                    lines.sort(key=lambda row: int(row.split()[1]), reverse=True)
                    out.writelines(lines)
            # Selecting a particular element of the text file table:
            inp = open("nn.{}".format(ID), 'r').readlines()
            line = inp[0].split()
            if line[0] == '...':
                z_value = 'NaN'  # We can make it '0.0' also.
            else:
                z_value = line[0]
            z_out.write(z_value)
            z_out.write('\n')
    z_out.close()
# coding:utf-8
import requests
import time
from bs4 import BeautifulSoup as bp

print('Langzi.Fun auto push starting....')
time.sleep(0.5)
site_url = 'https://yuwangi.github.io/baidusitemap.xml'
try:
    print('Langzi.Fun fetching sitemap link....')
    # print(requests.get(site_url).content)
    data_ = bp(requests.get(site_url).content, "html.parser")
    print("2222")
    # print(data_.url)  # BeautifulSoup objects carry no .url attribute
except Exception as e:
    print("111111")
    print(e)
list_url = []


def get_(data):
    headers = {'User-Agent': 'curl/7.12.1 ', 'Content-Type': 'text/plain '}
    try:
        r = requests.post(url='http://data.zz.baidu.com/urls?site=yuwangi.github.io&token=kViymNiCmZjrxxoa', data=data)
        print(r.status_code)
        print(r.content)
    except Exception as e:
        print(e)
import requests
from bs4 import BeautifulSoup as bp
import pandas as pd

page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = bp(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
today = forecast_items[0]
# print(today.prettify())
period = today.find(class_="period-name").get_text()
short_desc = today.find(class_="short-desc").get_text()
temp = today.find(class_="temp").get_text()
# print(period)
# print(short_desc)
# print(temp)
img = today.find("img")
desc = img['title']
# print(desc)
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
# print(periods)
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
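# The script imports pandas but stops before using it; a minimal sketch that
# gathers the scraped lists into a DataFrame (the column names are my choice).
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
})
print(weather.head())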
import requests
from bs4 import BeautifulSoup as bp

url = 'http://news.qq.com/'
newdata = requests.get(url).text
soup = bp(newdata, 'lxml')
titles = soup.select("div.text > em.f14 > a.linkto")
# Print each headline and its link exactly once.
for n in titles:
    my_title = n.get_text()
    my_link = n.get("href")
    dataset = {'title': my_title, 'link': my_link}
    print(dataset)