async def test_stop(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/stop")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("div.alert-warning > span::text").get().strip() == "Agent is stopping now."

    with LogCapture() as log:
        try:
            await client.get("/spade/stop/now/", timeout=0.0005)
        except requests.exceptions.ReadTimeout:
            pass
        log.check_present(('spade.Web', 'WARNING', "Stopping agent from web interface."))

    counter = 5
    while agent.is_alive() and counter > 0:
        counter -= 0.5
        time.sleep(0.5)
    assert not agent.is_alive()
def test_has_class_tab(self):
    body = u"""
    <p CLASS="foo\tbar">First</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
        [u'First'])
def get_classes(html):
    doc = Selector(text=html)
    classes = set(doc.xpath('//*[@class]/@class').extract())
    result = set()
    for cls in classes:
        for _cls in cls.split():
            result.add(_cls)
    return result
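# --- Hedged usage sketch (not part of the original snippet) ---
# get_classes() above collects every individual class token found in the
# document; the HTML below is a made-up example.
sample_html = '<div class="card featured"><p class="card">hi</p></div>'
assert get_classes(sample_html) == {'card', 'featured'}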
def getimgsrc(pin_id):
    url = 'http://huaban.com/pins/%s/' % pin_id
    z = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
    sel = Selector(text=z.text)
    jscode = sel.xpath("//script[contains(., 'app.page = app.page')]/text()").extract_first()
    parsed_js = js2xml.parse(jscode)
    for i in parsed_js.xpath('//property[@name="pins"]//property[@name="key"]/string/text()'):
        print('http://img.hb.aicdn.com/' + i)
def get_categories(self, response=None):
    if not response:
        response = requests.get(self.url)
    sel = Selector(text=response.text)
    categories = sel.xpath("//select[@id='search_category']"
                           "/option/text()").extract()
    categories = [c.split(' by ')[0].replace(' & ', '-') for c in categories]
    return categories
def getheaders():
    'Parse the X-UDID and Xsrftoken out of the page source'
    z1 = s.get('https://www.zhihu.com/')
    sel = Selector(z1.text)
    jsdata = sel.css('div#data::attr(data-state)').extract_first()
    xudid = json.loads(jsdata)['token']['xUDID']
    xsrf = json.loads(jsdata)['token']['xsrf']
    headers = headers_raw_to_dict(post_headers_raw)
    headers['X-UDID'] = xudid
    headers['X-Xsrftoken'] = xsrf
    return headers
def load_chapters(url):
    """
    Loads all chapters from a manga comic and returns a list of dictionaries
    with related data.
    :return: chapter list in asc order
    """
    text = requests.get(url).text
    sel = Selector(text)
    hel_gen = sel.css(".chlist h3, .chlist h4")
    chapter_gen = map(hel_to_chapter, hel_gen)
    available_chapter_gen = filter(lambda v: v['title'], chapter_gen)
    return reversed(list(available_chapter_gen))
def update_match_streams(self, matches: List[Match]) -> List[Match]:
    """Populate Match objects with stream urls"""
    updated = []
    for item in matches:
        # Populate stream data if match is live
        if not item['time_secs']:
            resp = self.session.get(item['url'])
            sel_detailed = Selector(text=resp.text)
            item['stream'] = sel_detailed.xpath(
                "//div[@class='matches-streams']"
                "/span[.//a[re:test(text(),'english', 'i')]]"
                "//iframe/@src").extract_first()
            item['stream'] = clean_stream_url(item['stream'])
        updated.append(item)
    return updated
async def test_add_get(test_client):
    agent = Agent("jid@server", "password")
    agent.web.add_get("/test", lambda request: {"number": 42}, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/test")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get().strip() == "42"

    agent.stop()
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSS expression, or an XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="Given expression is an XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.
    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()
    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
def get_alexa_demographics(url, db_session=False):
    if db_session is not False:
        result = list(db_session.query(WebsitesCache).filter_by(link=url))
        if len(result) > 0 and result[0].male_ratio_alexa >= 0:
            return float(result[0].male_ratio_alexa), float(result[0].female_ratio_alexa)
        else:
            return 0.0, 0.0

    orig_url = url
    url = "http://www.alexa.com/siteinfo/" + url
    response = requests.get(url)
    # We need the decode part because Selector expects unicode.
    selector = Selector(response.content.decode('utf-8'))
    bars = selector.css("#demographics-content .demo-col1 .pybar-bg")

    values = []
    for bar in bars:
        value = bar.css("span::attr(style)").extract()[0]
        value = int(re.search(r'\d+', value).group())
        values.append(value)

    male_ratio = 0.0
    female_ratio = 0.0
    if sum(values) == 0:
        print("No alexa rating for " + url)
    else:
        male_ratio = float(values[0] + values[1]) / sum(values)
        female_ratio = float(values[2] + values[3]) / sum(values)

    print(url)
    print(values)
    print(male_ratio, female_ratio)

    # Do we want to cache the result?
    if db_session is not False:
        try:
            db_session.query(WebsitesCache).filter(WebsitesCache.link == orig_url) \
                .update({
                    'male_ratio_alexa': male_ratio,
                    'female_ratio_alexa': female_ratio
                })
            db_session.commit()
        except:
            print("Could not update " + url)

    return male_ratio, female_ratio
async def test_request_home(test_client):
    agent = make_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("title::text").get() == "jid agent"
    assert sel.css("img::attr(src)").get() == agent.avatar
    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
async def test_get_messages(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    # add messages to trace
    for i in range(5):
        msg = Message(body=str(i), sender="{}@server".format(i), to="receiver@server")
        agent.traces.append(msg)

    response = await client.get("/spade/messages/")
    response = await response.text()

    sel = Selector(text=response)
    assert len(sel.css("ul.timeline > li").getall()) == 6  # num messages + end clock

    agent.stop()
def mon(inputs):
    week = []
    errored_out = []
    for month in inputs:
        try:
            data = urllib.request.urlopen(month).read()
        except urllib.error.URLError as e:
            print(month)
            errored_out.append(month)
            print(e.reason)
        if type(data) is bytes:
            data = data.decode("utf-8")
            hxs = Selector(text=data)
            weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news/day\\d+\.html')
            week.append(weeks)
        else:
            hxs = Selector(text=data)
            weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news/day\\d+\.html')
            week.append(weeks)
    return week
def post(inputs):
    posted = []
    failed = []
    for week in inputs:
        try:
            data = urllib.request.urlopen(week).read()
        except urllib.error.URLError as e:
            failed.append(week)
            print(week)
            print(e.reason)
        if type(data) is bytes:
            data = data.decode("utf-8")
            hxs = Selector(text=data)
            posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
            posted.append(posts)
        else:
            hxs = Selector(text=data)
            posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
            posted.append(posts)
    return posted
def download_chapter(chapter, folder_name):
    """
    Grabs all images from a chapter and writes them down to filesystem.
    """
    folder_name = werkzeug.utils.secure_filename(folder_name)
    # if the folder does not exist ...
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    text = requests.get(chapter['href']).text
    sel = Selector(text)
    for value in sel.css("select[class='m'] > option::attr(value)").extract():
        value = int(value)
        url = re.sub(r'\d+\.html', '%d.html' % value, chapter['href'])
        download_page(url, folder_name)
async def test_get_behaviour(test_client):
    class EmptyOneShotBehaviour(OneShotBehaviour):
        async def run(self):
            self.kill()

    agent = Agent("jid@server", "password")
    behaviour = EmptyOneShotBehaviour()
    agent.add_behaviour(behaviour)
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/behaviour/OneShotBehaviour/EmptyOneShotBehaviour/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == "OneShotBehaviour/EmptyOneShotBehaviour"

    agent.stop()
async def test_add_post(test_client):
    agent = Agent("jid@server", "password")

    async def handle_post(request):
        form = await request.post()
        number = form["number"]
        return {"number": number}

    agent.web.add_post("/test", handle_post, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.post("/test", data={"number": 1024})
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get() == "1024"

    agent.stop()
def download_page(url, folder_name):
    text = requests.get(url).text
    sel = Selector(text)
    for src in sel.css("img[id='image']::attr(src)").extract():
        basename = os.path.basename(src)
        safe_basename = werkzeug.utils.secure_filename(basename)
        filename = os.path.join(folder_name, safe_basename)
        filename = os.path.abspath(filename)
        # file is not there or has an invalid size ...
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            data = requests.get(src).content
            with open(filename, 'wb') as file:
                file.write(data)
            print('{0} written.'.format(filename))
        else:
            print('{0} exists. Skipping.'.format(filename))
def test_has_class_simple(self):
    body = u"""
    <p class="foo bar-baz">First</p>
    <p class="foo">Second</p>
    <p class="bar">Third</p>
    <p>Fourth</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
        [u'First', u'Second'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
        [u'Third'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
        [])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
        [u'First'])
def download(self, **kwargs): """ Download and set image from wallhaven.cc :param position - position of image to choose from listed from 1 to 24, default is 0 = random. :param categories - categories to download from in 000 format, where every number represents binary for [general, anime, people] list. :param purity - purity of content in 000 format, where every number represents binary for [sfw, sketchy, _]. :param sorting - sorting type from available see WallhavenDownloader.sorting_types . """ # Make url from arguments order = 'desc' categories = kwargs.get('categories', '') purity = kwargs.get('purity', '') sorting = kwargs.get('sorting', '') page, position, rand = self._make_position(kwargs.get('position', 0)) url = self.base_url for arg in ['categories', 'purity', 'sorting', 'order', 'page']: value = locals()[arg] if value: url = add_or_replace_parameter(url, arg, locals()[arg]) # Download and parse items resp = requests.get(url) if resp.status_code != 200: self.logger.error('Failed to download image list {}'.format(resp.url)) return list_sel = Selector(text=resp.text) items = list_sel.xpath("//section[@class='thumb-listing-page']//figure/a/@href").extract() item = random.choice(items) if rand else items[position - 1] resp = requests.get(item) if resp.status_code != 200: self.logger.error('Failed to download image page {}'.format(resp.url)) return sel = Selector(text=resp.text) image_url = sel.xpath("//img[@id='wallpaper']/@src").extract_first() meta = { 'id': sel.xpath("//img[@id='wallpaper']/@data-wallpaper-id").extract_first(), 'tags': sel.xpath("//ul[@id='tags']//li/a/text()").extract(), 'views': sel.xpath("//dt[contains(text(),'Views')]/following-sibling::dd[1]/text()").extract_first(), 'favorites': sel.xpath("//dt[contains(text(),'Favorites')]" "/following-sibling::dd[1]//text()").extract_first(), 'res': sel.xpath("//h3/text()").extract_first(), } image = Image(image_url, meta) return self.process_url(image, kwargs)
def _find_match(self, sel: Selector) -> Match:
    xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
    item = Match()
    item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
    item['id'] = (re.findall('matches/(\d+)', item['url']) or [None])[0]
    item['game'] = next((g for g in self.games if g in item['url'].lower()))
    item['time'] = xpath("td[@class='status']/span/text()")
    item['time_secs'] = time_to_seconds(item['time'])
    item['timestamp'] = int((datetime.now() + timedelta(item['time_secs'])).timestamp())
    item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
    item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
    item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
    item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
    item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    scores = sel.css('.score::text').extract()
    item['t1_score'] = scores[0] if scores else None
    item['t2_score'] = scores[1] if len(scores) > 1 else None
    return item
async def test_get_agent(test_client):
    agent = make_presence_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    jid = "friend@server"
    item = Item(jid=JID.fromstr(jid))
    agent.presence.roster._update_entry(item)

    response = await client.get(f"/spade/agent/{jid}/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == jid

    agent.stop()
def test_set_xpathfunc(self):
    def myfunc(ctx):
        myfunc.call_count += 1

    myfunc.call_count = 0

    body = u"""
    <p CLASS="foo">First</p>
    """
    sel = Selector(text=body)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')

    set_xpathfunc('myfunc', myfunc)
    sel.xpath('myfunc()')
    self.assertEqual(myfunc.call_count, 1)

    set_xpathfunc('myfunc', None)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')
def ruleDetailEnShiShi(jsonData, detailUrl):
    for dat in jsonData.get('list'):
        # publication time
        bulletinIssueTime = dat.get('bulletinIssueTime')
        bulletinName = dat.get('bulletinName')
        # get all of the table data, which comes back as HTML
        bulletinContent = dat.get('bulletincontent')
        selectors = Selector(text=bulletinContent)
        # the standard format looks like this: https://www.hbggzyfwpt.cn/jyxx/jsgcZbjggsDetail?guid=804c6b5b-e69f-4118-8d38-9c9d9413eb65&isOther=false
        # tenderer (party issuing the tender)
        zhaoBiaoRen = selectors.xpath(
            "//div[text()='招标人或招标代理机构:']/../following-sibling::td[1]/div/text()"
        ).extract_first()
        # address, located relative to the tenderer cell; it can also be located by the address label itself
        diZHi = selectors.xpath(
            "//div[text()='地址:']/../following-sibling::td[1]/div/text()").get(
                default='')
        # diZHi = selectors.xpath("//td[contains(text(),'地址:')]/text()").getall()[0]
        # winning bidder
        zhongBiaoRen = selectors.xpath(
            "//div[text()='中标人']/../../following-sibling::tr[1]/td[2]//text()"
        ).get(default='')
        # winning bid price
        zhongBiaoJia = selectors.xpath(
            "//div[text()='中标人']/../../following-sibling::tr[1]/td[3]//text()"
        ).get(default='')
        items = {
            'bulletinIssueTime': bulletinIssueTime,
            'zhaoBiaoRen': zhaoBiaoRen,
            'bulletinName': bulletinName,
            'diZHi': diZHi,
            'zhongBiaoRen': zhongBiaoRen,
            'zhongBiaoJia': zhongBiaoJia,
            'detailUrl': detailUrl,
            'city': '恩施市'
        }
        return items
def carregarG1():
    noticias = []
    try:
        html = urlopen("http://g1.globo.com/economia/ultimas-noticias.html", timeout=600000).read()
    except URLError as e:
        print(e)
        print("Connection to G1 failed")
        return []

    retorno = str(html.decode("utf-8"))

    # Extract the information from the page HTML
    sel = Selector(text=retorno)
    titulos = sel.css('.feed-post-body').css('.feed-post-body-title').css(
        '.feed-post-link').xpath('.//text()').getall()
    links = sel.css('.feed-post-body').css('.feed-post-body-title').css(
        '.feed-post-link').xpath('.//@href').getall()
    resumos = sel.css('.feed-post-body').css('.feed-post-body-resumo').xpath(
        './/text()').getall()
    categorias = sel.css('.feed-post-body').css('.feed-post-metadata').css(
        '.feed-post-metadata-section').xpath('.//text()').getall()
    datas = sel.css('.feed-post-body').css('.feed-post-metadata').css(
        '.feed-post-datetime').xpath('.//text()').getall()

    # Build the list of news items
    conteudo = []
    for indice in range(len(titulos)):
        conteudo.append({
            "fonte": "G1",
            "titulo": "" + titulos[indice] + "",
            "link": "" + links[indice] + "",
            "resumo": "" + resumos[indice] + "",
            "categoria": "" + categorias[indice] + "",
            "data": "" + datas[indice] + ""
        })
    return conteudo
def extract_job(driver, url) -> Job:
    driver.get(url)
    sel = Selector(text=driver.page_source)

    job_id = int(url.split('/')[5])
    job_url = url
    company_name = str(
        sel.xpath('//a[@data-control-name="company_link"]/text()')[2].get()).strip()
    company_url = str(
        sel.xpath('//a[@data-control-name="company_link"]/@href')[0].get())

    element = sel.xpath('//span[@class="jobs-top-card__bullet"]/text()')
    if not element:
        element = sel.xpath(
            '//a[@data-control-name="commute_module_anchor"]/text()')
    address = str(element[0].get()).strip()

    description = ",".join(
        sel.xpath('//div[@id="job-details"]//descendant::*/text()').extract()
    ).strip()

    return Job(job_id, job_url, company_name, company_url, address, description)
def parse_detail(url): headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' } response = requests.get(url, headers=headers).text sel = Selector(text=response) title = sel.xpath( "normalize-space(//div[@class='title-box']/h1[@class='title J_title']/text())" ).extract()[0] content_a = sel.xpath( "//article[@class='wrap-main']/div[@id='feed-main']/div[@class='item-name']/article[@class='txt-detail']/*[name(.)!='span']" ) dd = '' for content in content_a: content_temp = content.xpath( 'normalize-space(.)').extract_first().replace("'", '') if "极速发" in content_temp or "查看点评" in content_temp or "中立的消费门户" in content_temp or "小值机器人" in content_temp: continue dd += content_temp + '\n' links = sel.xpath( "//article[@class='txt-detail']/p/a[@itemprop='description']/@href" ).getall() text_links = sel.xpath( "//article[@class='txt-detail']/p/a[@itemprop='description']/text()" ).getall() temp_links = [] temp_text_links = [] shop_num = re.search(r'-?\d+', str(url), re.M | re.I).group() print(shop_num) if links and text_links: for link in links: if len(text_links) > 0: text_link = text_links.pop() if text_link in dd: if 'www.smzdm.com/p' in link and link: temp_links.append(parse_link(link)) if 'go.smzdm.com' in link: temp_links.append(link) temp_text_links.append(text_link) return (title, dd, temp_links, temp_text_links, shop_num)
def get_page_data(url):
    data = get_data(url)  # fetch the page
    se = Selector(text=data)
    poster = se.css(".grid_view .pic img").xpath("./@src").extract()  # poster image URLs
    movie_name = se.css(".grid_view .pic img").xpath("./@alt").re("\w+")  # movie names
    tmp_actor_o_info = se.css(".grid_view .info .bd").xpath("./p/text()")  # release info and genre text
    actor, other_info = tmp_actor_o_info[::4], tmp_actor_o_info[1::4]
    actor, other_info = [i.get() for i in actor], [i.get() for i in other_info]
    score = se.css(".star>.rating_num::text").extract()  # ratings

    movie_data = {}
    for p, m, a, o, s in zip(poster, movie_name, actor, other_info, score):
        a = a.strip()
        try:
            other_info = o.strip().replace("\xa0", " ")
            if "导演:" in a:
                director, a = a[4:].split(" ")
            else:
                director, a = a, a
            director = director.strip().split("/")  # list
            year, country, movie_type = other_info.strip().split("/")
            movie_type = movie_type.strip().split(" ")  # list
            movie_data.update({
                m: {
                    "poster": p,
                    "movie_name": m,
                    "director": "/".join(director),
                    "movie_type": "/".join(movie_type),
                    "score": s,
                    "country": country.strip(),
                    "year": int(year)
                }
            })
        except ValueError:
            continue
    return movie_data
def get_url(self, list_fname):
    r"""
    Parse the list page and store the extracted data into MySQL.
    """
    with open(list_fname, encoding='utf8') as f:
        text = f.read()
    html = Selector(text, type='html')
    result = html.xpath("//h5[@class='issue-item__title']/a/@href").getall()
    # time.sleep(120)
    titles = html.xpath("//h5[@class='issue-item__title']/a/text()").getall()
    eISBN = html.xpath(
        "//div[@class='teaser__row'][2]/div[@class='teaser__item']/text()"
    ).getall()
    if eISBN != []:
        if "eISBN" in eISBN[0]:
            eisbn = eISBN[0].replace("eISBN:", "").replace("-", "")
        else:
            eisbn = ""
    if result is None:
        return
    else:
        for i, item in enumerate(result):
            lists = []
            self.i += 1
            url = 'https://arc.aiaa.org' + item
            name = re.findall('10.2514/(.*)', item)[0]
            title = titles[i]
            try:
                session = html.xpath(
                    "//div[@class='issue-item__session_details']/text()"
                ).getall()[i]
            except:
                session = ""
            message = (url, name, title, session, eisbn)
            sql = "insert ignore into detail (url,doi,title,session,eisbn) values(%s,%s,%s,%s,%s)"
            cur = self.conn.cursor()
            cur.execute(sql, message)
            self.conn.commit()
def down_engin(self, message): url = 'http://engineering.org.cn/EN/2095-8099/current.shtml' feature = 'txt_biaoti' fdir = self.list_path + '/' + 'engi' if not os.path.exists(fdir): os.makedirs(fdir) fname = '%s/engi_current.html' % fdir while True: resp = self.gethtml(url, feature) if resp: break selcover = Selector(text=resp.content.decode('utf8')) tdTag = selcover.xpath('//td[@class="img_display"]')[0] engicoverurl = tdTag.xpath('./img/@src').extract_first() engicoverurl = engicoverurl.replace('../..', '') with open(fname, mode='w', encoding='utf8') as f: f.write(resp.content.decode('utf8')) url = 'http://engineering.org.cn/EN/article/showOldVolumnList.do' feature = 'Current Issue' while True: resp = self.gethtml(url, feature) if resp: break sel = Selector(text=resp.content.decode('utf8')) conn = utils.init_db('mysql', 'hepjournal', 4) cur = conn.cursor() cur.execute( "update journal set cover_url='%s' where journal_id='engi'" % engicoverurl) for aTag in sel.xpath('//a[contains(@href, "../volumn/volumn")]'): url = 'http://engineering.org.cn/EN' + aTag.xpath( './@href').extract_first().replace('..', '') sql = "insert ignore into issue(url,journal_id) Values('%s','%s')" % ( url, 'engi') cur.execute(sql) conn.commit() cur.close() conn.close() self.senddistributefinish('startdown_list')
def get_hous_detail(start_url_content): try: price = re.findall('"comm_midprice":"(.*?)","area_midprice"',start_url_content,re.S)[0] except: price = re.findall('"comm_midprice":(.*?),"area_midprice"',start_url_content,re.S)[0] print(price) l2 = re.findall('lat : "(.*?)",.*?lng : "(.*?)"',start_url_content,re.S) lat_lng= [float(l2[0][0]), float(l2[0][1])] print(lat_lng) html = Selector(text=start_url_content) detali_dt = html.xpath('//*[@id="basic-infos-box"]/dl/dt') address = html.xpath('//span[@class="sub-hd"]/text()').extract_first() all_add = html.xpath('//div[@class="p_1180 p_crumbs"]/a/text()').extract() city = all_add[1].replace('小区','') county = all_add[2] community = all_add[3] community_name = all_add[4] pin = Pinyin() province = gen_address(city) sheet_name = pin.get_pinyin(province, "").replace('sheng', '').replace('shi', '') print(province,city,county,community,community_name) print(address) dt = [] for i in detali_dt: key1 = i.xpath('./text()').extract_first().replace('\xa0','').replace(':','') key = ho.get(key1) dt.append(key) # print('{}{}'.format(i.xpath('./dt/text()').extract_first(),i.xpath('./dd/text()').extract_first())) # print('{}'.format(i.xpath('./text()').extract_first())) detali_dd = html.xpath('//*[@id="basic-infos-box"]/dl/dd') dd = [] for i in detali_dd: dd.append(i.xpath('./text()').extract_first()) # print('{}{}'.format(i.xpath('./dt/text()').extract_first(),i.xpath('./dd/text()').extract_first())) # print('{}'.format(i.xpath('./text()').extract_first())) a = dict(zip(dt,dd)) print(a)
def parse_book(self, response):
    link = response._get_url()
    r = requests.get(link)
    sel = Selector(r.text)
    book_data = {}
    title = sel.css('div.col-sm-6.product_main>h1')[0].extract().replace(
        '<h1>', '').replace('</h1>', '')
    df = pd.read_html(link)[0]
    book_data['category'] = sel.css('ul li')[2].extract().split(
        '</a>')[0].split('>')[2]
    book_data['title'] = title
    book_data['price'] = df[1][3]
    book_data['units left'] = int(df[1][5].replace('In stock (', '').replace(
        ' available)', ''))
    book_data['UPC'] = df[1][0]
    book_data['url'] = link
    book_data['description'] = sel.css('p')[3].extract().replace(
        '<p>', '').replace('</p>', '')
    book_data['rating'] = sel.css('p')[2].extract().split('\n')[0].replace(
        '<p class="star-rating ', '').replace('">', '')
    yield book_data
def ruleDetailWuHanShi(jsonData, detailUrl):
    for dat in jsonData.get('list'):
        # publication time
        bulletinIssueTime = dat.get('bulletinIssueTime')
        bulletinName = dat.get('bulletinName')
        # get all of the table data
        bulletinContent = dat.get('bulletincontent')
        selectors = Selector(text=bulletinContent)
        # the standard format looks like this: https://www.hbggzyfwpt.cn/jyxx/jsgcZbjggsDetail?guid=804c6b5b-e69f-4118-8d38-9c9d9413eb65&isOther=false
        zhaoBiaoRen = selectors.xpath(
            "//td[text()='建设单位(招标人)']/following-sibling::td[1]/text()").get(
                default='')
        # xiangMuMingCheng = selectors.xpath("//td[text()='报建项目名称']/following-sibling::td[1]/text()").get(default='')
        diZHi = selectors.xpath(
            "//td[text()='建设地址']/following-sibling::td[1]/text()").get(
                default='')
        zhongBiaoRen = selectors.xpath(
            "//td[text()='中标人']/following-sibling::td[1]/text()").get(
                default='')
        zhongBiaoJia = selectors.xpath(
            "//td[text()='中标价(万元)']/following-sibling::td[1]/text()").get(
                default='')
        items = {
            'bulletinIssueTime': bulletinIssueTime,
            'zhaoBiaoRen': zhaoBiaoRen,
            'bulletinName': bulletinName,
            'diZHi': diZHi,
            'zhongBiaoRen': zhongBiaoRen,
            'zhongBiaoJia': zhongBiaoJia,
            'detailUrl': detailUrl,
            'city': '武汉市'
        }
        yield items
def parsel_for_parse_page(response):
    """
    Parse the page data with the parsel module.
    :param response: response data
    :return:
    """
    selector = Selector(response.text)
    # mapping from obfuscated class names to digits
    mappings = {
        'vhk08k': 0, 'vhk6zl': 1, 'vhk9or': 2, 'vhkfln': 3, 'vhkbvu': 4,
        'vhk84t': 5, 'vhkvxd': 6, 'vhkqsc': 7, 'vhkjj4': 8, 'vhk0f1': 9
    }
    phone_element = selector.css('div.col.more > d').getall()
    tel_vhk_list = []
    for d_em in phone_element:
        d = Selector(d_em)
        x = d.css('d::attr("class")').get()
        tel_vhk_list.append(x)
    # print(tel_vhk_list)
    # look up the digit mapped to each class name
    tel_num_list = [mappings.get(i) for i in tel_vhk_list]
    tel_num_list = list(
        map(lambda x_: (str(x_) if x_ is not None else "-"), tel_num_list))
    phone = "".join(tel_num_list)
    print(phone)
def xpath_local_content(text, local_point="电话", contrast_point="地址") -> list: content = text.replace(" ", "").replace("\u3000", "") # content = response.replace(" ", "").replace("\u3000", "") selete = Selector(content) targets = [ '//*[contains(text(),"{}")]'.format(local_point), '//*[contains(text(),"手机")]', '//*[contains(text(),"座机")]' ] second_target = '//*[contains(text(),"{}")]'.format(contrast_point) lenght = len(targets) first_local = [] index = 0 for index, first_value in enumerate(targets): first_local = selete.xpath(first_value) if first_local: break if not first_local: return [] while index + 1 <= lenght: for count in range(1, 4): find_parent = '/..' * count first_local = selete.xpath(targets[index] + find_parent) second_local = selete.xpath(second_target + find_parent).extract() for second_value in second_local: for offset, value in enumerate(first_local): result = value.extract() # print(result) if result == second_value: result = selete.xpath(targets[index] + find_parent + '/*').extract() return result index += 1 return []
def down_list(): with open('1.html', mode='r', encoding='utf8') as f: text = f.read() html = Selector(text, 'html') dl = html.xpath("//li[@id='thA']//dl") for item in dl: item: Selector dt = item.xpath("./dt/a/text()").extract_first('') dd = item.xpath("./dd/a/text()").extract() dd_url = item.xpath("./dd/a/@href").extract() for i, small in enumerate(dd): name = dt + ';' + small filename = list_path + '/' + name if not os.path.exists(filename): os.makedirs(filename) list_url = "http://www.fzwjt.com" + dd_url[i] print(list_url) feature = 'PageBar41' res = utils.get_html(list_url, proxies=proxy, timeout=50) if res.content.decode('utf8').find(feature) == -1: file = '%s/%s.html' % (filename, 1) with open(file, mode='w', encoding='utf8') as f: f.write(res.content.decode()) utils.printf("下载", name, "成功...") else: html = Selector(res.text, 'html') page_num = html.xpath( "//span[@class='PageBar41']//em[3]/text()").extract_first( '') for page in range(1, int(page_num) + 1): url = list_url + "&page={page}".format(page=page) res = utils.get_html(url, proxies=proxy, timeout=50) file = '%s/%s.html' % (filename, page) with open(file, mode='w', encoding='utf8') as f: f.write(res.content.decode()) utils.printf("下载", name, page, "成功...")
# -*- coding: utf-8 -*-
import re
import requests
from parsel import Selector

# You can use RegEx so that parsel returns a list containing only a specific part of the text.
# Example:
#   text: 08:00 ~ 12:00
#   RegEx: .* (.+) ~.*
#   returns: 08:00
response = requests.get("http://semcomp.com.br/programacao")
sel = Selector(response.text).xpath(u".//*[@title='Horário']/text()").re(".* (.+) ~.*")
print("RegEx for SemComp talk times: %s" % sel)

# RegEx to validate a username based on these rules:
# it may have 3 to 16 characters, made up of letters, digits, _ (underscore) and - (hyphen)
# Example:
#   username = "******"  # Valid
#   username = "******"  # Invalid
username = "******"
pattern = re.compile("^[a-zA-Z0-9_-]{3,16}$")
if pattern.match(username):
    print("Valid")
else:
    print("Invalid")

# RegEx to find email addresses on a website
response = requests.get(
def select_sidebar(sel: Selector) -> Selector:
    """Select the info sidebar."""
    return sel.xpath("//div[@class='js-scrollfix-bottom']")
import requests
from parsel import Selector
import pprint

url = 'http://www.porters.vip/verify/uas/index.html'
headers = {"User-Agent": "Postman"}
resp = requests.get(url, headers=headers)
print(resp.status_code)
if resp.status_code == 200:
    sel = Selector(resp.text)
    res = sel.css('.list-group-item::text').extract()
    pprint.pprint(res)
else:
    print("request failed")
# In this initial step the webdriver is opened from the directory below
# (remember that the webdriver has to be downloaded beforehand)
nome_driver = Firefox()

# Target URL
link_url = 'https://ultimateqa.com/fake-landing-page'

# In this step the page is opened via the webdriver
nome_driver.get(link_url)

# Maximum wait time
tempo_maximo_espera = 30

# Look up a tag by XPath
caminho_xpath = '//h4[@class="et_pb_module_header"]'
# exemplo_xpath = '//tag_html[@atributo="valor"]/tag/tag/tipo()'

# Wait until the HTML tag defined above has loaded, up to the maximum
# wait time: tempo_maximo_espera
WebDriverWait(nome_driver, tempo_maximo_espera).until(
    EC.presence_of_element_located((By.XPATH, caminho_xpath))
)

# Continue the automation
fonte_pagina = Selector(text=nome_driver.page_source)
nome_objeto = fonte_pagina.xpath('//h4[@class="et_pb_module_header"]/a/@href').getall()
print(nome_objeto)
def twitter (name_to_search,page_number,knownimage,verbose): placeToSearch='twitter.com' chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) people_list=[] for i in range(int(page_number)): driver.get("https://www.google.com/search?q=site:"+placeToSearch+"+AND+"+name_to_search + "&start=" + str(10 * i)) search=driver.find_elements_by_tag_name('a') time.sleep(10) for s in search: href=s.get_attribute('href') if href != None: if "https://twitter.com/" in href: if "/status/" not in href and "/media" not in href and "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: people_list.append(href) elif "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: if "/status/" in href: people_list.append(href.split("/status/")[0]) elif "/media" not in s.text: people_list.append(href.split("/media")[0]) people_list=set(people_list) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/twitter"): os.mkdir("data/twitter"); path=os.path.join('data/twitter',str(now)+'_twitter_data.json') jsonData=[] userLink = set() for p in people_list: if verbose: print("*******************************************************************************************************") print(p) driver.get(p) driver.implicitly_wait(50) time.sleep(2) sel = Selector(text=driver.page_source) name = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div[1]/div/span[1]/span/text()').extract_first() link = p description = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div/div/span[1]/text()').extract_first() location = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[1]/span/span/text()').extract_first() member_since = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() born=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() webpage=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/a/text()').extract_first() image_url=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[1]/a/div[1]/div[2]/div/img/@src').extract_first() if name==None: name="" if str(link) not in userLink: userLink.add(link) nameParts = name_to_search.split(' ') isMatcher = False for n in nameParts: if SequenceMatcher(None,n, name).ratio()>0.4 or SequenceMatcher(None,n,str(link)).ratio()>0.4 or n in str(description).lower(): isMatcher=True if SequenceMatcher(None,name_to_search, name).ratio()>0.4 or SequenceMatcher(None,name_to_search,str(link)).ratio()>0.4 or name_to_search in str(description).lower(): isMatcher=True if isMatcher: userData = {} if verbose: print("Name: "+str(name)) print("Link: "+str(link)) print("Description: "+str(description)) print("Location: "+ str(location)) print("Member since: "+str(member_since)) print("Born: "+str(born)) print("Web: "+str(webpage)) print ("Profile image url: "+str(image_url)) print('\n') print('\n') if knownimage: 
if not os.path.isdir("data/twitter/"+str(now)+"_images"): os.mkdir("data/twitter/"+str(now)+"_images"); image=os.path.join("data/twitter/"+str(now)+"_images/"+str(link.split('.com/')[1])+".jpg") try: urllib.request.urlretrieve(image_url, image) userData={'storedImage':image,'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) except: pass else: userData={'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) with open(path, 'w+') as outfile: json.dump(jsonData, outfile) print("Results Twitter in: " + str(path)) response={'results':str(path)} if len(people_list)>0: if knownimage: print("Compare similarity images.") face_identification(knownimage,'./data/twitter/'+str(now)+'_images/') response['images']='./data/twitter/'+str(now)+'_images/' response['recognized']='./data/twitter/'+str(now)+'_images/recognized/' driver.quit() return response
def load_fields_to_csv(driver): recommended_links = [] for tries in range(20): page = random.randint(0, 20) * 25 driver.get( f'https://www.linkedin.com/jobs/search/?f_TPR=r604800&keywords=junior%20data%20scientist&start={page}' ) # driver.get(f'https://www.linkedin.com/jobs/search/?f_TPR=r604800&keywords=junior%20data%20scientist&location=New%20York%20City%20Metropolitan%20Area&start={page}') sel = Selector(text=driver.page_source) job_listings = sel.xpath( '//a[contains(@href, "/jobs/view/")]/@href').getall() # alternate method: # sel.xpath('//a[has-class("disabled ember-view job-card-container__link job-card-list__title")]/@href').getall() job_listings = list(dict.fromkeys(job_listings)) # remove duplicates sleep(round(random.uniform(3, 20), 2)) for listing in reversed(job_listings): if len(recommended_links) != 0: listing = recommended_links[0] recommended_links.pop(0) driver.get('https://www.linkedin.com' + listing) url = driver.current_url if 'linkedin.comhttps' in url: driver.get(listing) sleep(2) sel = Selector(text=driver.page_source) ####################################################### job_title = sel.xpath( '//h1[has-class("jobs-top-card__job-title t-24")]/text()').get( ) if job_title: job_title = ''.join(job_title) if type(job_title) == str: job_title = job_title.strip() elif type(job_title) == list: job_title = ''.join(job_title) try: job_title = job_title.strip() except: continue job_title = validate_field(job_title) company_name = sel.xpath( '//a[has-class("jobs-top-card__company-url ember-view")]/text()' ).get() if company_name: company_name = ''.join(company_name) if type(company_name) == str: company_name = company_name.strip() elif type(company_name) == list: company_name = ''.join(company_name) try: company_name = company_name.strip() except: continue company_name = validate_field(company_name) location = sel.xpath( '//a[has-class("jobs-top-card__exact-location t-black--light link-without-visited-state")]/text()' ).getall() if location: location = ''.join(location) if type(location) == str: location = location.strip() elif type(location) == list: location = ''.join(location) try: location = location.strip() except: continue location = validate_field(location) posted_days_ago = sel.xpath( '//p[has-class("mt1 full-width flex-grow-1 t-14 t-black--light")]' ).get() if posted_days_ago: posted_days_ago = ''.join(posted_days_ago) if type(posted_days_ago) == str: posted_days_ago = posted_days_ago.strip() elif type(posted_days_ago) == list: posted_days_ago = ''.join(posted_days_ago) try: posted_days_ago = posted_days_ago.strip() except: continue posted_days_ago = validate_field(posted_days_ago) seniority_level = sel.xpath( '//p[has-class("jobs-box__body js-formatted-exp-body")]/text()' ).getall() if seniority_level: seniority_level = ''.join(seniority_level) if type(seniority_level) == str: seniority_level = seniority_level.strip() elif type(seniority_level) == list: seniority_level = ''.join(seniority_level) try: seniority_level = seniority_level.strip() except: continue seniority_level = validate_field(seniority_level) industry_job_functions = sel.xpath( '//li[has-class("jobs-box__list-item jobs-description-details__list-item")]/text()' ).getall() if industry_job_functions: industry_job_functions = ''.join(industry_job_functions) if type(industry_job_functions) == str: industry_job_functions = industry_job_functions.strip() elif type(industry_job_functions) == list: industry_job_functions = ''.join(industry_job_functions) try: industry_job_functions = 
industry_job_functions.strip() except: continue industry_job_functions = validate_field(industry_job_functions) employment_type = sel.xpath( '//p[has-class("jobs-box__body js-formatted-employment-status-body")]/text()' ).get() if employment_type: employment_type = ''.join(employment_type) if type(employment_type) == str: employment_type = employment_type.strip() elif type(employment_type) == list: employment_type = ''.join(employment_type) try: employment_type = employment_type.strip() except: continue employment_type = validate_field(employment_type) job_description = sel.xpath( '//div[has-class("jobs-box__html-content jobs-description-content__text t-14 t-normal")]' ).getall() if job_description: job_description = ''.join(job_description) if type(job_description) == str: job_description = job_description.strip() elif type(job_description) == list: job_description = ''.join(job_description) try: job_description = job_description.strip() except: continue job_description = validate_field(job_description) base_salary = sel.xpath( '//p[has-class("salary-main-rail__data-amount t-24 t-black t-normal")]/text()' ).get() if base_salary: base_salary = ''.join(base_salary) if type(base_salary) == str: base_salary = base_salary.strip() elif type(base_salary) == list: base_salary = ''.join(base_salary) try: base_salary = base_salary.strip() except: continue base_salary = validate_field(base_salary) applicants = sel.xpath('//span[has-class("ml1")]/text()').get() if applicants: applicants = ''.join(applicants) if type(applicants) == str: applicants = applicants.strip() elif type(applicants) == list: applicants = ''.join(applicants) applicants = validate_field(applicants) all_similar_job_links = sel.xpath( '//a[has-class("job-card__link-wrapper js-focusable-card ember-view")]' ).getall() all_similar_job_links = validate_field(all_similar_job_links) if all_similar_job_links != 'NaN': for similar_job in all_similar_job_links: start = similar_job.find('<') end = similar_job.find('href="') if start != -1 and end != -1: result = similar_job[start:end + 6] similar_job = similar_job.replace(result, '') end_quote = similar_job.find('"') rec_url = similar_job[0:end_quote] recommended_links.append(rec_url) url = driver.current_url url = validate_field(url) try: with open(parameters.file_name, 'a', newline='') as csvfile: spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) spamwriter.writerow([ job_title, company_name, location, posted_days_ago, seniority_level, industry_job_functions, employment_type, job_description, base_salary, applicants, all_similar_job_links, url ]) except: continue sleep(round(random.uniform(3, 10), 2)) return [ job_title, company_name, location, posted_days_ago, seniority_level, industry_job_functions, employment_type, job_description, base_salary, applicants, all_similar_job_links, url ] # driver.quit()
hxs = Selector(text=data) posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract() posted.append(posts) else: hxs = Selector(text=data) posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract() posted.append(posts) return posted if __name__ == '__main__': print("in main") totalWeeks = [] totalPosts = [] url = 'http://www.businessweek.com/archive/news.html#r=404' data = urllib.request.urlopen(url).read() data = data.decode("utf-8") sel = Selector(text=data) months = sel.xpath('//ul/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news.html') #admittMonths = 12*(2015-1991) + 8 m=[] for i in months: m.append([i]) totalWeeks = [] pool = Pool(8) totalWeeks= pool.map(mon,m) totalWeeks = [ent for sublist in totalWeeks for ent in sublist] print (len(totalWeeks)) #club = [ent for sublist in totalWeeks for ent in sublist] #print (len(club)) club = [ent for sublist in totalWeeks for ent in sublist] print (len(club)) d=[]
from selenium import webdriver

browser = webdriver.Chrome()
# drive Chrome to open the slider-captcha demo page
browser.get('http://www.porters.vip/captcha/jigsaw.html')
# locate the slider
jigsawCircle = browser.find_element_by_css_selector('#jigsawCircle')
action = webdriver.ActionChains(browser)
# click and hold without releasing
action.click_and_hold(jigsawCircle).perform()
# get the HTML of the current page
html = browser.page_source

import re
from parsel import Selector

sel = Selector(html)
# get the CSS styles of the puzzle piece and the gap
mbk_style = sel.css('#missblock::attr("style")').get()
tbk_style = sel.css('#targetblock::attr("style")').get()
# lambda that extracts the left property value from a CSS style string
extract = lambda x: ''.join(re.findall('left: (\d+|\d+.\d+)px', x))
# pull the left value out of both styles
mbk_left = extract(mbk_style)
tbk_left = extract(tbk_style)
# compute the distance the jigsaw slider has to move
distance = float(tbk_left) - float(mbk_left)
action.move_by_offset(distance, 0)  # set the slide distance
action.release().perform()  # release the mouse
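# --- Hedged usage sketch (not part of the original script) ---
# Illustrates what the `extract` helper above pulls out of a CSS "left"
# declaration; the inline style strings are hypothetical, not taken from the real page.
import re
_extract = lambda x: ''.join(re.findall('left: (\d+|\d+.\d+)px', x))
assert _extract('opacity: 1; left: 103.5px;') == '103.5'
assert _extract('left: 47px;') == '47'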
except urllib.error.URLError: pass except urllib.error.HTTPError: pass except timeout: pass else: fail.append(s[i]) print ("failed to retive info from ",s[i],i) flag = True if flag ==True: pass else: clap = response.read() clap = clap.decode("utf-8") h = Selector(text=clap) date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract() if date: pass else: date = h.xpath('//meta[@content][@name="parsely-pub-date"]/@content').extract() key = h.xpath('//meta[@content][@name="keywords"]/@content').extract() info = h.xpath('//div[@id = "article_body"]/p//text()').extract() if not info: info = h.xpath('//div[@class = "article-body__content"]/p//text()').extract() if len(info)>1: info = ' '.join(str(r) for r in info) info = info.replace(u"\xa0", u" ") if "T" in date[0]: date,t = date[0].split('T') else:
def test_make_links_absolute(self):
    text = u'<a href="file.html">link to file</a>'
    sel = Selector(text=text, base_url='http://example.com')
    sel.root.make_links_absolute()
    self.assertEqual(u'http://example.com/file.html',
                     sel.xpath('//a/@href').extract_first())
def l_par_html(url):
    # this function scrapes Lianjia's second-hand housing listings for Nanjing
    wr = requests.get(url, headers=headers, stream=True)
    sel = Selector(wr.text)
    h_test = sel.xpath('//h2[@class="total fl"]').extract()
    title = sel.xpath('//li//div//div[@class="title"]/a/text()').extract()
    pos1 = sel.xpath(
        '//li//div//div[@class="flood"]//div/a[@data-el="region"]/text()'
    ).extract()
    pos2 = sel.xpath(
        '//li//div//div[@class="flood"]//div/a[2]/text()').extract()
    houseInfo = sel.xpath(
        '//li//div//div[@class="address"]//div/text()').extract()
    followInfo = sel.xpath(
        '//li//div//div[@class="followInfo"]/text()').extract()
    tags = sel.xpath('//li//div//div[@class="tag"]').xpath(
        'string(.)').extract()
    # print(tags)
    # tag_taxfree = sel.xpath('//li//div//div[@class="tag"]/span[@class="taxfree"]/text()').extract()
    # tag_haskey = sel.xpath('//li//div//div[@class="tag"]/span[@class="haskey"]/text()').extract()
    total_price = sel.xpath(
        '//li//div//div[@class="priceInfo"]//div[@class="totalPrice"]//span/text()'
    ).extract()
    unit_price = sel.xpath(
        '//li//div//div[@class="priceInfo"]//div[@class="unitPrice"]//span/text()'
    ).extract()
    if len(total_price) != len(title) or \
            len(tags) != len(title) or len(followInfo) != len(title) or len(houseInfo) != len(title) or \
            len(pos1) != len(title) or len(pos2) != len(title):
        print("Warnings! Length of some item does not match.")
    # print(len(tags))
    pages_info = pd.DataFrame(
        list(zip(title, pos1, pos2, houseInfo, followInfo, tags, total_price, unit_price)),
        columns=[
            'Title', 'Position1', 'Position2', 'HouseInfo', "FollowInfo",
            "Tags", "Total_Price", "Unit_Price"
        ])
    return pages_info
json={'data': booth}, auth=('admin', 'SUPER-SECURE-PASSWORD')) r.raise_for_status() with open('lamberti.json') as jsonfile: jsondata = json.load(jsonfile) with open('lamberti.geojson') as geojsonfile: geojsondata = json.load(geojsonfile) for booth in jsondata: booth_no = int(booth['nr'][-2:]) r = requests.get(booth['url']) if r.status_code == 200: text = r.text selector = Selector(text=text) booth_name = selector.css('.booth-title::text').get() booth_descr = selector.css('.booth-body > p::text').getall() if isinstance(booth_descr, list): booth_descr = " ".join(booth_descr) booth_owner_company = selector.css( '.contactParticle--company::text').get() booth_owner_name = selector.css( '.contactParticle--name\:firstname\,lastname::text').get() booth_owner_street = selector.css( '.contactParticle--street::text').get() booth_owner_city = selector.css( '.contactParticle--city\:postal_code\,locality::text').get()
url = 'http://www.sinomed.ac.cn/lw/basicSearch.do?dbtype=lw&pageNo=1&pageSize=100&change=true&cmode=&flag=&time=&linkSearch=&more=&moreExp=&searchmore=&searchword=+%22{}%22%5B%E5%87%BA%E7%89%88%E5%B9%B4%5D&submitButton=&beginDate=&endDate='.format( str(year)) message = (str(year), url) url_que.put(message) while True: if not url_que.empty(): try: message = url_que.get() year = message[0] url = message[1] res = requests.get(url, proxies=proxy) res.encoding = res.apparent_encoding fname = r'E:\work\SinoMed_tasks\博硕论文\html' + '/' + "%s.html" % str( year) html = Selector(res.text, 'html') totalnum = html.xpath( "//input[@id='itemTotal']/@value").extract_first('') if totalnum == '0': print("%s年无文章暂不下载" % year) else: allnum = int(totalnum) count += allnum with open(fname, 'w', encoding='utf8') as f: f.write(res.text) print("%s年有%s文章" % (year, totalnum)) print("一共有%s文章" % (str(count))) except Exception as e: print("停止时年份为%s" % year) print('停止时count的个数%s' % str(count)) url_que.put(message)
import requests
from parsel import Selector
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from collections import defaultdict
from nltk.probability import FreqDist

url = 'http://www.saopaulo.sp.gov.br/spnoticias/ultimas-noticias/centro-paula-souza-e-ibm-apresentam-modelo-educacional-p-tech/'
text = requests.get(url).text
selector = Selector(text=text)

title = selector.xpath(
    '//header[contains(@class, "article-header")]//h1/text()').get()
legend = selector.xpath(
    '//header[contains(@class, "article-header")]//p/text()').get()
paragraphs = selector.xpath(
    '//article[contains(@class, "article-main")]//p/text()').getall()

text_to_analyse = ''
for paragraph in paragraphs:
    text_to_analyse += paragraph

sentences = sent_tokenize(text_to_analyse)
words = word_tokenize(text_to_analyse.lower())

stopwords = set(stopwords.words('portuguese') + list(punctuation))
words_without_stopwords = [word for word in words if word not in stopwords]
def _parse_me(cls, base_fname): json_fname = "{}.json".format(base_fname) html_fname = "{}.html".format(base_fname) resp = { "intro": {}, "declaration": {} } try: with open(json_fname, "r") as fp: data = json.load(fp) with open(html_fname, "r") as fp: raw_html = fp.read() html = Selector(raw_html) except ValueError: print( "File {} or it's HTML counterpart cannot be parsed".format(json_fname)) return None except FileNotFoundError: print( "File {} or it's HTML counterpart cannot be found".format(json_fname)) return None id_ = data.get("id") created_date = data.get("created_date") raw_html_lowered = raw_html.lower() for chunk in cls.dangerous_chunks: if chunk in raw_html_lowered: raise BadHTMLData("Dangerous fragment found: {}, {}".format( id_, base_fname)) try: data = data["data"] except KeyError: raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname)) if "step_0" not in data: raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname)) resp["_id"] = "nacp_{}".format(id_) resp["ft_src"] = "\n".join(cls.extract_textual_data(html)) resp["nacp_orig"] = data resp["declaration"]["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_) resp["declaration"]["source"] = "NACP" resp["declaration"]["basename"] = os.path.basename(base_fname) resp["intro"]["corrected"] = id_ in cls.corrected resp["intro"]["date"] = cls.parse_date(created_date) if "declarationType" not in data["step_0"] or "changesYear" in data["step_0"]: resp["intro"]["doc_type"] = "Форма змін" if "changesYear" in data["step_0"]: resp["intro"]["declaration_year"] = int(data["step_0"]["changesYear"]) else: resp["intro"]["doc_type"] = cls.declaration_types[data["step_0"]["declarationType"]] if "declarationYearTo" in data["step_0"]: resp["intro"]["declaration_year_to"] = cls.parse_date(data["step_0"]["declarationYearTo"]) if "declarationYearFrom" in data["step_0"]: resp["intro"]["declaration_year_from"] = cls.parse_date(data["step_0"]["declarationYearFrom"]) resp["intro"]["declaration_year"] = resp["intro"]["declaration_year_from"].year if "declarationYear1" in data["step_0"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear1"]) if "declarationYear3" in data["step_0"] and data["step_0"]["declarationYear3"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear3"]) if "declarationYear4" in data["step_0"] and data["step_0"]["declarationYear4"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear4"]) resp["general"] = { "last_name": replace_apostrophes(title(data["step_1"]["lastname"])), "name": replace_apostrophes(title(data["step_1"]["firstname"])), "patronymic": replace_apostrophes(title(data["step_1"]["middlename"])), "full_name": replace_apostrophes("{} {} {}".format( title(data["step_1"]["lastname"]), title(data["step_1"]["firstname"]), title(data["step_1"]["middlename"]), )), "post": { "post": replace_apostrophes(data["step_1"].get("workPost", "")), "post_type": replace_apostrophes(data["step_1"].get("postType", "")), "office": replace_apostrophes(data["step_1"].get("workPlace", "")), "actual_region": replace_apostrophes(cls.region_types.get(data["step_1"].get("actual_region", ""), "")), "region": replace_apostrophes(cls.region_types.get(data["step_1"].get("region", ""), "")), } } if "step_2" in data: family = data["step_2"] if isinstance(family, dict): resp["general"]["family"] = [] for member in family.values(): if not isinstance(member, dict): continue resp["general"]["family"].append({ "family_name": replace_apostrophes("{} {} {}".format( 
title(member.get("lastname", "")), title(member.get("firstname", "")), title(member.get("middlename", "")), )), "relations": member.get("subjectRelation", "") }) # get regions from estate list if "step_3" in data and isinstance(data["step_3"], dict) and data["step_3"]: if "estate" not in resp: resp["estate"] = [] for estate in data["step_3"].values(): if isinstance(estate, dict) and "region" in estate: region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), "")) if region: resp["estate"].append({"region": region}) if "step_4" in data and isinstance(data["step_4"], dict) and data["step_4"]: if "estate" not in resp: resp["estate"] = [] for estate in data["step_4"].values(): if isinstance(estate, dict) and "region" in estate: region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), "")) if region: resp["estate"].append({"region": region}) if "estate" in resp: estate_list = html.css( "table:contains('Місцезнаходження') td:contains('Населений пункт') span::text" ).extract() for estate in estate_list: region = cls.decode_region(estate) if region: resp["estate"].append({"region": region}) resp['general']['full_name_suggest'] = [ { 'input': resp['general']['full_name'], 'weight': 5 }, { 'input': ' '.join( [ resp['general']['name'], resp['general']['patronymic'], resp['general']['last_name'] ] ), 'weight': 3 }, { 'input': ' '.join( [ resp['general']['name'], resp['general']['last_name'] ] ), 'weight': 3 } ] resp['general']['full_name_for_sorting'] = keyword_for_sorting(resp['general']['full_name']) if not resp["general"]["post"]["region"]: region_html = html.css( "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text" ).extract() if len(region_html) > 1: resp["general"]["post"]["region"] = cls.decode_region(region_html[1]) if not resp["general"]["post"]["actual_region"]: region_html = html.css( "fieldset:contains('Місце фактичного проживання') .person-info:contains('Місто')::text" ).extract() if len(region_html) > 1: resp["general"]["post"]["actual_region"] = cls.decode_region(region_html[1]) # if set only one region use it value for second one if not resp["general"]["post"]["actual_region"] and resp["general"]["post"]["region"]: resp["general"]["post"]["actual_region"] = resp["general"]["post"]["region"] elif not resp["general"]["post"]["region"] and resp["general"]["post"]["actual_region"]: resp["general"]["post"]["region"] = resp["general"]["post"]["actual_region"] resp["index_card"] = concat_fields(resp, NACPDeclaration.INDEX_CARD_FIELDS) return NACPDeclaration(**resp).to_dict(True)
def parse(html, callback, *args, **kwargs):
    html = html.decode('utf8')
    html = html.encode('latin1')
    html = html.decode('gb2312', 'ignore')
    sel = Selector(text=html)
    return callback(sel, *args, **kwargs)
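# --- Hedged usage sketch (not part of the original module) ---
# `parse` above decodes the incoming bytes as UTF-8, re-encodes them as
# latin1 and decodes the result as GB2312 before handing a Selector to the
# callback. The callback below is a hypothetical example, not from the source.
def extract_title(sel, default=None):
    return sel.css('title::text').get(default=default)

# raw_bytes would come from e.g. requests.get(url).content
# title = parse(raw_bytes, extract_title, default='')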
return country client = pymongo.MongoClient("mongodb+srv://ybh:[email protected]/resumely?retryWrites=true&w=majority") database = client["resumelydb"] profiles_collection = database['profiles'] done = set() extracted_data = {} extracted_data['candidates'] = [] driver = webdriver.Chrome(executable_path='C:\chromedriver_win32\chromedriver_80') pickle.dump( driver.get_cookies() , open("cookies.pkl","wb")) driver.maximize_window() for x in profiles_collection.find(no_cursor_timeout=True).skip(3600): driver.get(x['profile']) sleep(5) sel = Selector(text=driver.page_source) age = sel.xpath('//*[starts-with(@class,"widgetUserInfo__item widgetUserInfo__item_age")]/text()').extract_first() if age: age = age.strip() age = validate_field(age) try: if age != 'No Results': age = ' '.join(age.split()) for a in age.split(): if a.isdigit(): profiles_collection.update_one({"_id" : bson.ObjectId(x['_id'])},{ "$set": { "age": int(a) } }) except: pass # experiences experiencesTab = [] skills = []
def find_matches(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find live and upcoming matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//table[@id='gb-matches']//tr"))
def parse_playInfo(self, response): play = copy.deepcopy(response.meta["playInfoObj"]) # 赛事id---轮次 由上层传入 # url play["play_urls"] = response.url # 赛事时间 time_str = response.css("div.qbox_1 div.qbx_2 p::text").extract_first() if time_str == None: play["play_time"] = None else: if time_str.encode("utf-8") == "延期": play["play_result_detail"] = "延期" else: time_year = time_str.split("-")[0] if int(time_year) >= 0 and int(time_year) < 30: time_str = "20" + time_str else: time_str = "19" + time_str time_str = time_str[0:10] + " " + time_str[11:] play["play_time"] = time_str # 主队 play["team_home"] = response.css( "#matchTeam div.qpai_zi::text").extract_first() # 客队 play["team_vis"] = response.css( "#matchTeam div.qpai_zi_1::text").extract_first() # score_half = response.css("div.jifen_dashi p").extract_first() if score_half != None: score_half_p = Selector( text=score_half).xpath("//p/text()").extract_first() if score_half_p != None: score_half = score_half_p if score_half != None: # 半:1-1 score_half_arr = score_half.strip()[2:].split(" ")[0].split( "-") # 比分半场主 play["half_home"] = score_half_arr[0] # 比分半场客 play["half_vis"] = score_half_arr[1] else: # 比分半场主 play["half_home"] = None # 比分半场客 play["half_vis"] = None # $("#matchTeam div.vs span")[0] score_full = response.css("#matchTeam div.vs span").extract() if score_full == None or len(score_full) == 0: # 比分全场主 play["full_home"] = None # 比分全场客 play["full_vis"] = None # 赛事结果 play["play_result"] = None else: # 比分全场主 play["full_home"] = int( Selector( text=score_full[0]).css("::text").extract_first().strip()) # 比分全场客 play["full_vis"] = int( Selector( text=score_full[1]).css("::text").extract_first().strip()) # 赛事结果 if play["full_home"] == play["full_vis"]: play["play_result"] = 1 else: if play["full_home"] > play["full_vis"]: play["play_result"] = 3 else: play["play_result"] = 0 # # 指数详情-额外的请求 # /soccer/match/954629/odds/ajax/?page=0&trnum=5&companytype=BigBooks&type=1 # /soccer/match/954629/odds/ajax/?page=0&trnum=5&companytype=AuthoriteBooks&type=1 odds_url = "/soccer/match/{0}/odds/ajax/?page=0&trnum=5&companytype=AuthoriteBooks&type=1".format( play["id"]) return [ scrapy.Request(url=self.base_url + odds_url, headers=self.headers, meta={ 'cookiejar': response.meta['cookiejar'], "playInfoObj": play }, callback=self.parse_oddsInfo) ]
def find_history(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find recent matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//h2[contains(text(),'Recent')]/..//tr"))