print('\n########## Downloading data for page {} ##########\n'.format(page))
base_url = 'https://www.zhipin.com/c100010000-p100109/?page={}&ka=page-{}'.format(
    page, page)
if page > 1:
    sleep(5)
cookie = 'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1596689319,1596706170; lastCity=100010000; __g=-; __zp_stoken__=a22daJB4DI1lsZkxcNEotc3AEMXhEbHRqFFdAYwB7JnIMWydyTWshf2pXaGIpFndlLUcoPGVnDFJ0PTAYFwhsHnJqKx0nInloej8bZVR9OyoNIBtUZ1xOB31HTgcZKwkub35tQxcGDVg2eT4%3D; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1596706456; __zp_sseed__=+ESIwp4DFQO7vkpLz5T9FtTBTD9zO5XO5H3GNSZMuTc=; __zp_sname__=cfe88225; __zp_sts__=1596706745034; __c=1596706172; __l=l=%2Fwww.zhipin.com%2Fc100010000-p100109%2F%3Fpage%3D10%26ka%3Dpage-10&r=&g=; __a=84984979.1596689321.1596689321.1596706172.20.2.12.20'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'cookie': cookie,
}
html_data = get(url=base_url, headers=headers).text
selector = Selector(html_data)
result_list = selector.css("#main > div > div.job-list > ul > li")
for sel in result_list:
    Job_benefits = sel.css(
        "div > div.info-append.clearfix > div.info-desc ::text"
    ).extract_first()  # job benefits
    job_name = sel.css(
        "div > div.info-primary > div.primary-wrapper > div > div.job-title > span.job-name > a ::text"
    ).extract_first()
    Working_data_1 = sel.css(
        "div > div.info-append.clearfix > div.tags > span:nth-child(1) ::text"
    ).extract_first()  # job data 1
    Working_data_2 = sel.css(
        "div > div.info-append.clearfix > div.tags > span:nth-child(2) ::text"
    ).extract_first()  # job data 2
html = '''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
from parsel import Selector

selector = Selector(text=html)
result = selector.css('.item-0').re('link.*')
print(result)
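# A hedged follow-up on the same `html` document above: .re_first() returns
# only the first regex match, and ::attr(href) pulls an attribute value out
# directly, with no regex needed.
first_href = selector.css('.item-1 a::attr(href)').get()    # 'link2.html'
first_match = selector.css('.item-0').re_first('link.*')    # first 'link...' hit
print(first_href, first_match)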
def view_bot(browser):
    # pandas reads the Excel file and builds a plain Python list of URLs.
    df = pd.read_excel(
        r'C:\Users\Owner\PycharmProjects\LinkedInVeiwBot\URLS.xlsx')
    url_list = df["URLS"].tolist()
    # Open the output CSV in write mode ('w'); newline='' avoids blank rows on Windows.
    writer = csv.writer(
        open('Linkedinfile.csv', 'w', encoding='utf-8', newline=''))
    # writerow() writes the header row to the file object.
    writer.writerow([
        'Name', 'Job Title', 'Company', 'Last jobs', 'College', 'Location',
        'URL'
    ])
    for url in url_list:
        # Sleep so everything loads; randomize the delay to look more human.
        browser.get(url)
        time.sleep(random.randint(5, 10))
        # Assign the source code for the web page to the selector `sel`.
        sel = Selector(text=browser.page_source)
        # Find the 'view more' button.
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        try:
            element = browser.find_element_by_xpath(
                '//section[@id="experience-section"]/div/button')
            button_text = element.text
            while button_text != "Show fewer experiences":
                browser.execute_script(
                    "return arguments[0].scrollIntoView();", element)
                browser.execute_script("window.scrollBy(0,-200);")
                time.sleep(2)
                element.click()
                time.sleep(2)
                element = browser.find_element_by_xpath(
                    '//section[@id="experience-section"]/div/button')
                button_text = element.text
        except Exception:
            print("No button found")
        # XPath to extract the text from the class containing the name.
        name = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__name")]/text()'
        ).extract_first()
        # If the name exists, .strip() removes the newline \n and whitespace.
        if name:
            name = name.strip()
        # XPath to extract the text from the class containing the job title.
        job_title = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__headline")]/text()'
        ).extract_first()
        if job_title:
            job_title = job_title.strip()
        # XPath to extract the text from the class containing the company.
        company = sel.xpath(
            '//*[starts-with(@class,"pv-entity__secondary-title")]/text()'
        ).extract_first()
        if company:
            company = company.strip()
        # Collect the entire list of previous positions and employers.
        last_positions_array = browser.find_elements_by_xpath(
            '//a[@data-control-name = "background_details_company"]/div/h3')
        last_jobs_array = browser.find_elements_by_xpath(
            '//span[@class = "pv-entity__secondary-title"]')
        last_jobs = ''
        for index, job in enumerate(last_jobs_array):
            last_jobs += last_positions_array[index].text + ' at '
            last_jobs += job.text
            last_jobs += ' || '
        last_jobs = last_jobs[:-4]  # strip the trailing ' || ' separator
        # XPath to extract the text from the class containing the college.
        college = sel.xpath(
            '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
        ).extract_first()
        if college:
            college = college.strip()
        # XPath to extract the text from the class containing the location.
        location = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__location")]/text()'
        ).extract_first()
        if location:
            location = location.strip()
        # Assignment of the current URL.
        linkedin_url = browser.current_url
        # Validate that the fields exist on the profile.
        name = validate_field(name)
        job_title = validate_field(job_title)
        company = validate_field(company)
        college = validate_field(college)
        location = validate_field(location)
        linkedin_url = validate_field(linkedin_url)
        # Print the output to the terminal.
        print('\n')
        print('Name: ' + name)
        print('Job Title: ' + job_title)
        print('Company: ' + company)
        print('Previous Jobs: ' + last_jobs)
        print('College: ' + college)
        print('Location: ' + location)
        print('URL: ' + linkedin_url)
        print('\n')
        # Write all the fields above into the CSV file.
        writer.writerow([
            name, job_title, company, last_jobs, college, location,
            linkedin_url
        ])
tree = etree.ElementTree(root)
if not os.path.exists(ARTICLE_STORAGE + categoryName):
    os.makedirs(ARTICLE_STORAGE + categoryName)
tree.write(ARTICLE_STORAGE + categoryName + "/" + str(uuid.uuid4()) + ".xml",
           encoding='utf-8',
           pretty_print=True)

driver = webdriver.Chrome()
fileList = os.listdir(path=REF_STORAGE)
countHref = 1028
for k in range(N, len(fileList)):
    f = open(REF_STORAGE + fileList[k])
    for line in f:
        driver.get(URL_VALUE + line)
        sel = Selector(
            text=driver.find_element_by_xpath("//*").get_attribute("outerHTML"))
        textName = sel.xpath("//div[@class='main']//h1/i/text()").extract_first()
        textList = sel.xpath("//div[@class='ocr']/p/text()").extract()
        textArticle = ''
        # if set(item.lower() for item in textList).isdisjoint(END_ARTICLE):
        #     continue
        checkGoodContent = True
        for i in range(0, len(textList)):
            # str.replace returns a new string, so assign it back.
            textList[i] = textList[i].replace('\ufeff', "")
            if textName.lower() in textList[i].lower() and i < len(textList) - 1:
                i = i + 1
                # Check the bound before indexing to avoid an IndexError.
                while i < len(textList) and textList[i].lower() not in END_ARTICLE:
                    textArticle = textArticle + textList[i] + " "
import requests
from parsel import Selector

url = 'http://www.porters.vip/confusion/recruit.html'
# Send a request to the target URL.
resp = requests.get(url)
# Initialize a Selector with the response body.
sel = Selector(resp.text)
# Extract the company name from the response body.
company = sel.css('h1.interval::text').get()
print(company)
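# The same extraction via XPath instead of CSS (a sketch; the predicate
# assumes the h1's class attribute is exactly "interval", which the CSS
# selector above does not require):
company_xpath = sel.xpath('//h1[@class="interval"]/text()').get()
print(company_xpath)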
def __get_download_file_name(self, response):
    selector = Selector(response.text)
    file_name = selector.xpath('//*[@id="footer"]/button/@onclick').get()
    file_name = file_name.replace('location.href=', '').replace("'", '')
    return file_name
async def get_cities() -> dict:
    response = await request('GET', DOMAIN)
    if response:
        tree = Selector(response.text)
        return Dict.name_link(tree, XPATH_TO_CITIES)
def __init__(self, page: str):
    self._sel = Selector(page)
    self._result = None
def page_ok(page: str):
    sel = Selector(text=page)
    if len(sel.css('.error_Block')):
        return False
    return True
def getAndParseURL(result):
    # result = requests.get(url)
    soup = BeautifulSoup(result, 'html.parser')
    child = soup.find_all('a')
    for i in range(0, len(child)):
        if 'Next' in child[i].get_text():
            nextlink = child[i]['href']
            print(nextlink)
            return nextlink
    return None


all_links = []
base_path = 'http://example.webscraping.com'
response = requests.get(base_path)
selector = Selector(response.text)
href_links = selector.xpath('//a/@href').getall()
all_links += href_links

# Fetch the first page, then follow 'Next' links until none is found.
# ('next' is a Python builtin, so the page suffix is named next_page instead.)
next_page = ''
while True:
    r = requests.get("http://example.webscraping.com/" + next_page)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    for link in soup.find_all('td'):
        print(link.find('a')['href'])
    # getAndParseURL returns the 'Next' href, or None when pagination ends.
    next_page = getAndParseURL(data)
    if next_page is None:
        break
def search_char(char):
    r = requests.get(f'https://guildstats.eu/character?nick={char.name}#tab2')
    sel = Selector(r.text)
    char.online_time = sel.xpath(
        '//table[@id="myTable"]//td[2]/text()').extract_first()
    return char
def start_urls(html):
    selector = Selector(html)
    base_url = selector.xpath('//div[@class="hezi"]//li/a/@href').getall()
    titles = selector.xpath('//ul[@class="img"]//p/a/text()').getall()
    return base_url, titles
def get_html(html, url):
    selector = Selector(html)
    next_url = selector.xpath(
        '//div[@id="pages"]//a[@class="a1"][2]/@href').get()
    img_urls = selector.xpath('//div[@class="content"]/img/@src').getall()
    title = selector.xpath('//div[@class="content"]/img/@alt').get()
    return next_url, img_urls
# Module-level setup this snippet relies on (assumed: `s` is a shared
# requests session and `logger` a module logger).
import logging

import js2xml
import requests
from parsel import Selector

logger = logging.getLogger(__name__)
s = requests.Session()


def getdata():
    url = "https://iqmining.com/pricing"
    logger.info(f"get page {url}")
    z1 = s.get(url, timeout=60)
    response = Selector(text=z1.text)
    jscode = response.xpath(
        '//script[contains(.,"pricesConfig")]/text()').extract_first()
    parse_js = js2xml.parse(jscode)
    pricesConfig = js2xml.jsonlike.getall(parse_js)
    ret = []
    for k, v in pricesConfig[0].items():
        gold, silver, bronze = {"t": "gold"}, {"t": "silver"}, {"t": "bronze"}
        gold.update(v)
        silver.update(v)
        bronze.update(v)
        # Drop the raw pricing keys from each plan copy; they are re-derived below.
        for plan in (gold, silver, bronze):
            for key in ("fee", "options", "new_price"):
                del plan[key]
        if k in ["sha256", "shapro"]:
            coin = "BTC"
        elif k == "shabch":
            coin = "BCH"
        elif k == "eth":
            coin = "ETH"
        else:
            continue
        if coin == "ETH":
            gold["contract_size"] = v["mingold"]
            silver["contract_size"] = v["minsilver"]
            bronze["contract_size"] = v["mincalc"]
        else:
            # For BTC/BCH the value is in GH/s; the base unit is 1000 GH = 1 TH.
            gold["contract_size"] = v["mingold"] / 1000
            silver["contract_size"] = v["minsilver"] / 1000
            bronze["contract_size"] = v["mincalc"] / 1000
        gold["coin"] = coin
        silver["coin"] = coin
        bronze["coin"] = coin
        if v["fee"]:
            # For BTC/BCH the fee is quoted per 10 GH/s; the base unit is 1000 GH.
            gold["electricity_fee"] = float(v["fee"]["gold"]) * 100
            silver["electricity_fee"] = float(v["fee"]["silver"]) * 100
            bronze["electricity_fee"] = float(v["fee"]["bronze"]) * 100
        else:
            gold["electricity_fee"] = 0
            silver["electricity_fee"] = 0
            bronze["electricity_fee"] = 0
        if v.get("new_price", ""):
            # Discounted price.
            price_info = v["new_price"]
        else:
            price_info = v["options"]
        for y, p in price_info.items():
            if y == "y0":
                continue
            elif y == "y1":
                gold["duration"] = 365
                silver["duration"] = 365
                bronze["duration"] = 365
            elif y == "y2":
                gold["duration"] = 365 * 2
                silver["duration"] = 365 * 2
                bronze["duration"] = 365 * 2
            elif y == "y5":
                gold["duration"] = 365 * 5
                silver["duration"] = 365 * 5
                bronze["duration"] = 365 * 5
            if coin == "ETH":
                # For ETH the price is quoted per 0.1 MH/s; the base unit is 1 MH.
                gold["upfront_fee"] = float(p["gold"]) * 10
                silver["upfront_fee"] = float(p["silver"]) * 10
                bronze["upfront_fee"] = float(p["bronze"]) * 10
            else:
                # For BTC/BCH the price is quoted per 10 GH/s; the base unit is 1000 GH.
                gold["upfront_fee"] = float(p["gold"]) * 100
                silver["upfront_fee"] = float(p["silver"]) * 100
                bronze["upfront_fee"] = float(p["bronze"]) * 100
            ret.append(gold.copy())
            ret.append(silver.copy())
            ret.append(bronze.copy())
    return ret
    except (urllib.error.URLError, urllib.error.HTTPError, timeout):
        # Record the failure and flag this entry so parsing is skipped below.
        fail.append(s[i])
        print("failed to retrieve info from ", s[i], i)
        flag = True
    if not flag:
        clap = response.read()
        clap = clap.decode("utf-8")
        h = Selector(text=clap)
        date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract()
        if not date:
            date = h.xpath(
                '//meta[@content][@name="parsely-pub-date"]/@content').extract()
        key = h.xpath('//meta[@content][@name="keywords"]/@content').extract()
        info = h.xpath('//div[@id = "article_body"]/p//text()').extract()
        if not info:
            info = h.xpath(
                '//div[@class = "article-body__content"]/p//text()').extract()
        if len(info) > 1:
            info = ' '.join(str(r) for r in info)
            info = info.replace(u"\xa0", u" ")
        if "T" in date[0]:
            date, t = date[0].split('T')
        else:
def home_ok(page: str):
    sel = Selector(text=page)
    if len(sel.css('#mySignin')):
        return False
    return True
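# page_ok and home_ok above differ only in the CSS query they probe; a hedged
# consolidation (the helper name is illustrative, and `Selector` is assumed
# imported from parsel as in the snippets above):
def css_absent(page: str, query: str) -> bool:
    """Return True when `query` matches nothing in `page`."""
    return not Selector(text=page).css(query)

# usage sketch:
# page_ok = lambda page: css_absent(page, '.error_Block')
# home_ok = lambda page: css_absent(page, '#mySignin')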
def test_make_links_absolute(self):
    text = u'<a href="file.html">link to file</a>'
    sel = Selector(text=text, base_url='http://example.com')
    sel.root.make_links_absolute()
    self.assertEqual(u'http://example.com/file.html',
                     sel.xpath('//a/@href').extract_first())
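# A hedged alternative to the approach tested above that avoids mutating the
# underlying lxml tree: resolve each href against the base URL with the
# stdlib's urljoin (assumes `Selector` is imported from parsel as elsewhere).
from urllib.parse import urljoin

def absolute_hrefs(text: str, base: str) -> list:
    sel = Selector(text=text)
    return [urljoin(base, href) for href in sel.xpath('//a/@href').getall()]

# absolute_hrefs('<a href="file.html">link to file</a>', 'http://example.com')
# -> ['http://example.com/file.html']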
def lang_url(page: str):
    sel = Selector(text=page)
    url = sel.xpath('//*[@id="englishLanguage"]/@href').get()
    return f'{MiCubacelParser.url_base}{url}'
import time

import pandas as pd
import requests
from parsel import Selector

start = time.time()
all_images = {}
result = []
response = requests.get('https://www.tk421.net/lotr/film/')
selector = Selector(response.text)
href_links = selector.xpath('//a/@href').getall()
del href_links[-1]


def moviename(tag):
    if 'fotr' in tag:
        return 'The Fellowship of the Ring'
    elif 'ttt' in tag:
        return 'The Two Towers'
    elif 'rotk' in tag:
        return 'The Return of the King'
    else:
        return None


txtflag = 0
for link in href_links:
    try:
import time

import requests
from parsel import Selector

from headers import COMMENTS_HEADERS

base_url = "http://www.dianping.com/shop/67408602/review_all/p{}"
for i in range(1, 10):
    if i > 1:
        # Chain the Referer to the previous page so the request looks like
        # normal pagination.
        COMMENTS_HEADERS["Referer"] = base_url.format(i - 1)
    # Fetch page i (not page 1 every time).
    res = requests.get(base_url.format(i), headers=COMMENTS_HEADERS)
    selector = Selector(text=res.text)
    if selector.css(".review-recommend").getall():
        print(selector.css(".review-recommend").getall())
    else:
        print(base_url.format(i))
        print(res.content.decode("u8"))
    time.sleep(5)
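# The loop above chains each request's Referer header to the previous page,
# which some sites check during pagination. A hedged generic helper for the
# same pattern (the name and signature are illustrative):
def paged_get(session, url_template, headers, pages):
    for i in range(1, pages + 1):
        if i > 1:
            headers["Referer"] = url_template.format(i - 1)
        yield session.get(url_template.format(i), headers=headers)

# usage sketch:
# for res in paged_get(requests.Session(), base_url, COMMENTS_HEADERS, 9):
#     ...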
def scrappyprofile(url, user):
    driver = StartSelenium(user)
    driver.get(url)
    time.sleep(2)
    scheight = .1
    while scheight < 9.9:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/%s);" % scheight)
        scheight += .01
    time.sleep(1)
    pagina = driver.page_source
    page = open("source.txt", "w")
    selec = Selector(text=pagina)
    page.write(pagina)
    page.close()
    url_imagen = selec.xpath(
        '//*[starts-with(@class, "pv-top-card-section__photo presence-entity__image EntityPhoto-circle-9 lazy-image loaded ember-view")]/@src'
    ).extract()
    if len(url_imagen) > 0:
        b = 1
    else:
        b = 0
    name = selec.xpath(
        '//*[starts-with(@class, "inline t-24 t-black t-normal break-words")]/text()'
    ).extract()
    title = selec.xpath(
        '//*[starts-with(@class, "mt1 t-18 t-black t-normal")]/text()'
    ).extract()
    address = selec.xpath(
        '//*[starts-with(@class, "t-16 t-black t-normal inline-block")]/text()'
    ).extract()
    contacts = selec.xpath(
        '//*[starts-with(@class, "ember-view")]/text()').extract()
    extracto = selec.xpath(
        '//*[starts-with(@class, "pv-about__summary-text mt4 t-14 ember-view")]/span/text()'
    ).extract()
    # Get the positions.
    cargos = []
    css1cargo = selec.xpath(
        '//*[starts-with(@class, "pv-entity__summary-info pv-entity__summary-info--background-section ")]/h3/text()'
    ).extract()
    cargos += css1cargo
    cargo = selec.xpath(
        '//*[starts-with(@class, "t-16 t-black t-bold")]/span/text()').extract()
    cargomas = selec.xpath(
        '//*[starts-with(@class, "t-14 t-black t-bold")]/span/text()').extract()
    cargomas += cargo
    i = 1
    while i < len(cargomas):
        cargos.append(cargomas[i])
        i += 2
    # Get the companies.
    empresas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__secondary-title t-14 t-black t-normal")]/text()'
    ).extract()
    # Get the date ranges per company.
    fechas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span/text()'
    ).extract()
    # print("name: " + name[0])
    # print("title: " + title[0])
    # print("address: " + address[0])
    # print("contacts: " + str(contacts))
    # print("extracto: " + str(extracto))
    if b == 1:
        imagen = url[28:-1] + ".jpg"
    else:
        imagen = "default.jpg"
    perfil = Usuario(name[0], title[0], url, imagen)
    if len(extracto) > 0:
        perfil.extracto = extracto[0]
    trabajos = []
    # Pad the shorter lists so every position has a company and a date pair.
    if len(cargos) != len(empresas):
        for i in range(len(cargos) - len(empresas)):
            empresas.append("")
    if len(cargos) != int(len(fechas) / 2):
        for i in range(len(cargos) - int(len(fechas) / 2)):
            fechas.append("")
            fechas.append("")
    for i in range(len(cargos)):
        fechas.pop(0)
        trabajo = Cargo(cargos[i], empresas[i], "", fechas[i])
        trabajos.append(trabajo)
        # print("insertado: " + trabajo.fecha)
    perfil.cargos = trabajos
    # print("Cargos: " + str(len(perfil.cargos)))
    # Get the education.
    escuelas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
    ).extract()
    # print("escuelas: " + str(escuelas))
    if len(escuelas) > 0:
        titulos = selec.xpath(
            '//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal")]/span/text()'
        ).extract()
        disciplinas = selec.xpath(
            '//*[starts-with(@class, "pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal")]/span/text()'
        ).extract()
        fechas = selec.xpath(
            '//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time/text()'
        ).extract()
        educacion = []
        for long in range(len(escuelas) - len(disciplinas)):
            disciplinas.append("")
            disciplinas.append("")
        # print("longitudes: " + str(len(escuelas)) + " : " + str(len(titulos)) + ":" + str(len(disciplinas)))
        for i in range(len(escuelas)):
            escuela = Escuela(escuelas[i])
            if len(titulos) > 0:
                titulos.pop(0)
                escuela.titulacion = titulos[i]
            if len(disciplinas) > 0:
                disciplinas.pop(0)
                escuela.disciplina = disciplinas[i]
            escuela.fecha = fechas[i] + " - " + fechas[i + 1]
            # print("Escuelas: " + escuela.name)
            # print("Titulos: " + escuela.titulacion)
            # print("Disciplinas: " + escuela.disciplina)
            # print("fechas: " + str(fechas))
            educacion.append(escuela)
        perfil.escuelas = educacion
    # Get the skills.
    aptitudes = selec.xpath(
        '//*[starts-with(@class, "pv-skill-category-entity__name-text t-16 t-black t-bold")]/text()'
    ).extract()
    # print("aptitudes: " + str(aptitudes))
    perfil.aptitudes = aptitudes
    # Get the interests.
    intereses = selec.xpath(
        '//*[starts-with(@class, "pv-entity__summary-info ember-view")]/h3/span/text()'
    ).extract()
    # print("Intereses: " + str(intereses))
    perfil.intereses = intereses
    contactos, datosModelo = extractContacts(driver, url)
    perfil.contactos = contactos
    # Get licenses and certifications.
    certificaciones = selec.xpath(
        '//*[starts-with(@class, "pv-certifications__summary-info pv-entity__summary-info pv-entity__summary-info--background-section pv-certifications__summary-info--has-extra-details")]/h3/text()'
    ).extract()
    # print("Certificaciones: " + str(certificaciones))
    perfil.certificaciones = certificaciones
    # Get the accomplishments.
    logrosTitles = selec.xpath(
        '//*[starts-with(@class, "pv-accomplishments-block__count t-32 t-black t-normal pr3")]/span/text()'
    ).extract()
    logros = selec.xpath(
        '//*[starts-with(@class, "pv-accomplishments-block__list-container")]/ul/li/text()'
    ).extract()
    # print("Logros T: " + str(logrosTitles))
    # print("logros: " + str(logros))
    conjuntoLogros = []
    i = 1
    while i < len(logrosTitles):
        aux = ""
        for j in range(int(logrosTitles[i])):
            if j == 0:
                aux = logros.pop(0)
            else:
                aux = aux + ", " + logros.pop(0)
        conjuntoLogros.append(aux)
        i += 2
    # print("conjuntoLogros: " + str(conjuntoLogros))
    perfil.logrosTitles = logrosTitles[0::2]
    perfil.logros = conjuntoLogros
    perfil.datosModelo = datosModelo
    connector.insertarUsuario(perfil)
    driver.close()
    return perfil
else:
    hucreIci = oku['A{}'.format(hucreNo)].value
    urunAdresi = hucreIci
    baslik = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    toplama = requests.get(urunAdresi, headers=baslik)
    if toplama.status_code == 200:
        secici = Selector(toplama.text)
        duzenle = re.compile(
            '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        urun = secici.css(
            '#product-detail-app > div > div.pr-cn > div.pr-cn-in > div.pr-in-w > div:nth-child(1) > div.pr-in-cn > h1'
        ).get()
        satici = secici.css(
            '#product-detail-app > div > div.pr-cn > div.pr-cn-in > div.pr-in-at > div.pr-in-sl-cnt > div > div.sl-nm > a'
        ).get()
        pfiyat = secici.xpath(
            '/html/body/div[3]/div/div/div[2]/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/span[1]'
        ).get()
from time import sleep

import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome('C:/Users/gunjan/Desktop/Web_Scraping/chromedriver')
driver.get('http://quotes.toscrape.com/')
sel = Selector(text=driver.page_source)
# quotes = sel.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "text", " " ))]/text()').extract()
# author = sel.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "author", " " ))]/text()').extract()
tags = sel.xpath(
    '//*[contains(concat( " ", @class, " " ), concat( " ", "tag", " " ))]/text()'
).extract()
# about_links = sel.xpath('//span//a').extract()
# next_btn = driver.find_element_by_xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "next", " " ))]//a')
# next_btn.click()

path = "C:/Users/gunjan/Desktop/quotes_web_scrap/tags_data.xlsx"
workbook = openpyxl.load_workbook(path)
sheet = workbook.active
# Row 1 holds the header, so data starts at row 2 and tags[0] maps to row 2.
for r in range(2, len(tags) + 2):
    # sheet.cell(row=r, column=1).value = quotes[r - 2]
    # sheet.cell(row=r, column=2).value = author[r - 2]
    sheet.cell(row=r, column=1).value = tags[r - 2]
# Persist the changes; load_workbook alone does not write back to disk.
workbook.save(path)
def selector(self):
    return Selector(self.text)
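# A hedged variant of the helper above (assumes Python 3.8+ and that `text`
# holds the response body): cache the parsed Selector so repeated access does
# not re-parse the same document.
from functools import cached_property

from parsel import Selector


class ParsedPage:
    def __init__(self, text: str):
        self.text = text

    @cached_property
    def selector(self) -> Selector:
        return Selector(text=self.text)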
def main():
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
    res = get_html_content(url)
    enc = chardet.detect(res)
    html = res.decode(enc['encoding'], errors='ignore')
    xpath_css = Selector(text=html)
    all_urls = xpath_css.xpath('//tr[@class="provincetr"]/td/a')
    base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    for url in all_urls[17:18]:
        province_url = base + url.xpath('./@href').extract_first()
        province = url.xpath('./text()').extract_first()
        res = get_html_content(province_url)
        enc = chardet.detect(res)
        html = res.decode(enc['encoding'], errors='ignore')
        xpath_css = Selector(text=html)
        city_code = xpath_css.xpath(
            '//tr[@class="citytr"]/td[1]/a/text()').extract()
        city_list = xpath_css.xpath(
            '//tr[@class="citytr"]/td[2]/a/text()').extract()
        city_urls = xpath_css.xpath(
            '//tr[@class="citytr"]/td[1]/a/@href').extract()
        # for i in range(len(city_urls[12])):
        url1 = base + city_urls[1]
        # print(url1)
        res = get_html_content(url1)
        enc = chardet.detect(res)
        html = res.decode(enc['encoding'], errors='ignore')
        xpath_css = Selector(text=html)
        county_code = xpath_css.xpath(
            '//tr[@class="countytr"]/td[1]/a/text()').extract()
        county_list = xpath_css.xpath(
            '//tr[@class="countytr"]/td[2]/a/text()').extract()
        county_urls = xpath_css.xpath(
            '//tr[@class="countytr"]/td[1]/a/@href').extract()
        for j in range(len(county_urls)):
            # print('Province: {} City: {} County: {}'.format(province, city_list[20], county_list[j]))
            url2 = url1[0:-9] + county_urls[j]
            res = get_html_content(url2)
            try:
                html = res.decode("gbk").encode("utf-8")
            except UnicodeDecodeError:
                html = res.decode("gb2312").encode("utf-8")
            real_html = html.decode('utf-8')
            xpath_css = Selector(text=real_html)
            town_code = xpath_css.xpath(
                '//tr[@class="towntr"]/td[1]/a/text()').extract()
            town_list = xpath_css.xpath(
                '//tr[@class="towntr"]/td[2]/a/text()').extract()
            town_urls = xpath_css.xpath(
                '//tr[@class="towntr"]/td[1]/a/@href').extract()
            for k in range(len(town_urls)):
                # print('Province: {} City: {} County: {} Town: {}'.format(province, city_list[20], county_list[j], town_list[k]))
                url3 = url2[0:-11] + town_urls[k]
                print(url3)
                res = get_html_content(url3)
                # enc = chardet.detect(res)
                # html = res.decode(enc['encoding'], errors='ignore')
                try:
                    html = res.decode("gbk").encode("utf-8")
                except UnicodeDecodeError:
                    html = res.decode("gb2312").encode("utf-8")
                real_html = html.decode('utf-8')
                xpath_css = Selector(text=real_html)
                villagetr_code = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[1]/text()').extract()
                villagetr_code1 = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[2]/text()').extract()
                villagetr_list = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[3]/text()').extract()
                for x in range(len(villagetr_list)):
                    # '市辖区' ("municipal district") placeholders are replaced
                    # with the province name.
                    print('Province: {} City: {} County: {} Town: {} Village: {}'.format(
                        province,
                        str(city_list[1]).replace('市辖区', province) + '--' +
                        city_code[1],
                        county_list[j] + '--' + county_code[j],
                        town_list[k] + '--' + town_code[k],
                        villagetr_list[x] + '--' + villagetr_code[x] + '--' +
                        villagetr_code1[x]))
                    save_to_mysql(
                        province,
                        str(city_list[1]).replace('市辖区', province) + '--' +
                        city_code[1],
                        county_list[j] + '--' + county_code[j],
                        town_list[k] + '--' + town_code[k],
                        villagetr_list[x] + '--' + villagetr_code[x] + '--' +
                        villagetr_code1[x])
    hxs = Selector(text=data)
    posts = hxs.xpath(
        '//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href'
    ).extract()
    posted.append(posts)
    return posted


if __name__ == '__main__':
    print("in main")
    totalWeeks = []
    totalPosts = []
    url = 'http://www.businessweek.com/archive/news.html#r=404'
    data = urllib.request.urlopen(url).read()
    data = data.decode("utf-8")
    sel = Selector(text=data)
    months = sel.xpath('//ul/li/a').re(
        'http://www.businessweek.com/archive/\\d+-\\d+/news.html')
    # admittMonths = 12 * (2015 - 1991) + 8
    m = []
    for i in months:
        m.append([i])
    totalWeeks = []
    pool = Pool(8)
    totalWeeks = pool.map(mon, m)
    # Flatten the per-month lists of weeks, then the per-week lists of posts.
    totalWeeks = [ent for sublist in totalWeeks for ent in sublist]
    print(len(totalWeeks))
    club = [ent for sublist in totalWeeks for ent in sublist]
    print(len(club))
from parsel import Selector
import requests

URL_BASE = "http://books.toscrape.com/catalogue/"
next_page_url = 'page-1.html'

while next_page_url:
    # Fetch the content of the next page.
    response = requests.get(URL_BASE + next_page_url)
    selector = Selector(text=response.text)
    # Print the products of a given page.
    for product in selector.css(".product_pod"):
        # Find and extract the title and the price.
        # title = product.css("h3 a::attr(title)").get()
        # price = product.css(".price_color::text").get()
        # print(title, price)
        # Find the detail page of a product.
        detail_href = product.css("h3 a::attr(href)").get()
        detail_page_url = URL_BASE + detail_href
        # Download the content of the detail page.
        detail_response = requests.get(detail_page_url)
        detail_selector = Selector(text=detail_response.text)
        # Extract the product description.
        description = detail_selector.css(
            "#product_description ~ p::text").get()
        print(description)
    # Find out which page comes next; None ends the loop (the selector below
    # assumes books.toscrape.com's "next" pagination link).
    next_page_url = selector.css(".next a::attr(href)").get()
import requests
from parsel import Selector

response = requests.get(
    "http://books.toscrape.com/catalogue/the-grand-design_405/index.html")
selector = Selector(text=response.text)
titles = selector.css(".product_page > .row > .product_main > h1::text").get()
price = selector.css(
    ".product_page > .row > .product_main > p::text").re_first(r"\d+\.\d{2}")
description = selector.css(".product_page > p::text").get()
url = selector.css("img::attr(src)").get()
quantity = selector.css(".instock").re_first(r"\d{1,}")

# Trim the trailing "...more" marker from the description, if present
# (.get() may return None, so guard before calling endswith).
suffix = "...more"
if description and description.endswith(suffix):
    description = description[:-len(suffix)]

print(titles, price, description, url, quantity, sep=",")
def linkedin(screen_name):
    for person in people:
        if person.screen_name == screen_name:
            return person.__dict__
    chromedriver = 'C:/Users/DELL/chromedriver_win32/chromedriver.exe'
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(executable_path=chromedriver,
                              chrome_options=options)
    # driver.get() navigates to the page at the given URL.
    driver.get('https://www.linkedin.com')
    # Locate the email form by class name.
    username = driver.find_element_by_class_name('login-email')
    # send_keys() simulates key strokes.
    username.send_keys('*****@*****.**')
    # Sleep for 0.5 seconds.
    sleep(0.5)
    # Locate the password form by class name.
    password = driver.find_element_by_class_name('login-password')
    password.send_keys('meriam12345')
    sleep(0.5)
    # Locate the submit button by XPath and .click() to mimic a button click.
    sign_in_button = driver.find_element_by_xpath('//*[@type="submit"]')
    sign_in_button.click()
    sleep(0.5)
    # Navigate to Google.
    driver.get('https://www.google.com')
    sleep(3)
    # Locate the search form by name.
    search_query = driver.find_element_by_name('q')
    # Simulate typing the search text built from the function parameter.
    nn = 'site:linkedin.com/in/ AND ' + screen_name
    search_query.send_keys(nn)
    sleep(0.5)
    # Submit the search with the return key (nn is a query, not a URL, so it
    # cannot be passed to driver.get()).
    search_query.send_keys(Keys.RETURN)
    sleep(3)
    # Locate the result URLs by class name.
    linkedin_urls = driver.find_elements_by_class_name('iUh30')
    linkedin_urls = [url.text for url in linkedin_urls]
    sleep(0.5)
    # Take the first URL returned by the Google search query.
    linkedin_url = linkedin_urls[0]
    # Open the profile URL.
    driver.get(linkedin_url)
    sleep(5)
    # Assign the source code for the web page to the selector `sel`.
    sel = Selector(text=driver.page_source)
    # XPath to extract the text from the class containing the name.
    name = sel.xpath(
        '//*[starts-with(@class, "pv-top-card-section__name")]/text()'
    ).extract_first()
    # If the name exists, .strip() removes the newline \n and whitespace.
    if name:
        name = name.strip()
    # XPath to extract the text from the class containing the job title.
    job_title = sel.xpath(
        '//*[starts-with(@class, "pv-top-card-section__headline")]/text()'
    ).extract_first()
    if job_title:
        job_title = job_title.strip()
    # Strip each extracted item; rebinding the loop variable would be a no-op.
    postes = sel.xpath(
        '//*[starts-with(@class, "t-16 t-black t-bold")]/text()').getall()
    postes = [poste.strip() for poste in postes if poste]
    societes = sel.xpath(
        '//*[starts-with(@class, "pv-entity__secondary-title")]/text()'
    ).getall()
    societes = [societe.strip() for societe in societes if societe]
    descriptions = sel.xpath(
        '//*[starts-with(@class, "lt-line-clamp__line")]/text()').getall()
    descriptions = [d.strip() for d in descriptions if d]
    universites = sel.xpath(
        '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
    ).getall()
    universites = [u.strip() for u in universites if u]
    linkedin_url = driver.current_url
    name = validate_field(name)
    job_title = validate_field(job_title)
    postes = validate_field(postes)
    societes = validate_field(societes)
    descriptions = validate_field(descriptions)
    universites = validate_field(universites)
    linkedin_url = validate_field(linkedin_url)
    driver.quit()
    person = Person(name, job_title, postes, societes, descriptions,
                    universites, linkedin_url)
    people.append(person)
    return person.__dict__
def parse(response):
    # Birth date: the cell layout is irregular, so a regex is used instead of XPath.
    shengri = re.search(
        '出生日期.*?</td>.*?<td class="data_tb_content".*?>(.*?)</td>',
        response,
        flags=re.S)
    if shengri:
        shengri = shengri.group(1)
    else:
        shengri = ''
    response = Selector(text=response)
    # Name
    xingming = response.xpath(
        '//td[contains(text(),"姓名")]/following-sibling::td[1]/text()').get(
            default='')
    # Gender
    xingbie = response.xpath(
        '//td[contains(text(),"性别")]/following-sibling::td[1]/text()').get(
            default='')
    # Position at the firm
    zhiwu = response.xpath(
        '//td[contains(text(),"所内职务")]/following-sibling::td[1]/text()').get(
            default='')
    # Party membership
    dangyuan = response.xpath(
        '//td[contains(text(),"是否党员")]/following-sibling::td[1]/text()').get(
            default='')
    # Education level
    xueli = response.xpath(
        '//td[contains(text(),"学历")]/following-sibling::td[1]/text()').get(
            default='')
    # Degree
    xuewei = response.xpath(
        '//td[contains(text(),"学位")]/following-sibling::td[1]/text()').get(
            default='')
    # Major
    zhuanye = response.xpath(
        '//td[contains(text(),"所学专业")]/following-sibling::td[1]/text()').get(
            default='')
    # School graduated from
    xuexiao = response.xpath(
        '//td[contains(text(),"毕业学校")]/following-sibling::td[1]/text()').get(
            default='')
    # How the qualification was obtained (exam/assessment)
    kaohe = response.xpath(
        '//td[contains(text(),"资格取得方式(考试/考核)")]/following-sibling::td[1]/text()'
    ).get(default='')
    # General-qualification certificate number
    shuhao = response.xpath(
        '//td[contains(text(),"全科合格证书号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Year the general qualification was obtained
    nianfen = response.xpath(
        '//td[contains(text(),"全科合格年份")]/following-sibling::td[1]/text()').get(
            default='')
    # CPA certificate number
    bianhao = response.xpath(
        '//td[contains(text(),"注册会计师证书编号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Partner (shareholder) or not
    gudong = response.xpath(
        '//td[contains(text(),"是否合伙人(股东)")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Registration approval document number
    jianhao = response.xpath(
        '//td[contains(text(),"批准注册文件号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Registration approval date
    shijian = response.xpath(
        '//td[contains(text(),"批准注册时间")]/following-sibling::td[1]/text()').get(
            default='')
    # Accounting firm
    wusuo = response.xpath(
        '//td[contains(text(),"所在事务所")]/following-sibling::td[1]/text()').get(
            default='')
    # Class hours required this year
    xueshi = response.xpath(
        '//td[contains(text(),"本年度应完成学时")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Class hours completed this year
    yixueshi = response.xpath(
        '//td[contains(text(),"本年度已完成学时")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Penalty / disciplinary information
    xinxi = response.xpath(
        '//td[contains(text(),"处罚/惩戒信息")]/following-sibling::td[1]/text()'
    ).get(default='')
    # Public-welfare activities
    huodong = response.xpath(
        '//td[contains(text(),"参加公益活动")]/following-sibling::td[1]/text()').get(
            default='')
    # Collect everything into one item.
    items = {
        'xingming': xingming.strip(),
        'xingbie': xingbie.strip(),
        'zhiwu': zhiwu.strip(),
        'dangyuan': dangyuan.strip(),
        'xueli': xueli.strip(),
        'xuewei': xuewei.strip(),
        'zhuanye': zhuanye.strip(),
        'xuexiao': xuexiao.strip(),
        'kaohe': kaohe.strip(),
        'shuhao': shuhao.strip(),
        'nianfen': nianfen.strip(),
        'bianhao': bianhao.strip(),
        'gudong': gudong.strip(),
        'jianhao': jianhao.strip(),
        'shijian': shijian.strip(),
        'wusuo': wusuo.strip(),
        'xueshi': xueshi.strip(),
        'yixueshi': yixueshi.strip(),
        'xinxi': xinxi.strip(),
        'huodong': huodong.strip(),
        'shengri': shengri.strip(),
    }
    print(items)
    pipeline(items)