def testRemoteInfoCore(self):
    header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
    self.assertFalse('Traceback' in body, body)
    # only tested for MultiLucene situation for now!
    bodyLxml = HTML(body)
    lists = bodyLxml.xpath('//ul')
    fieldList = lists[0]
    fields = fieldList.xpath('li/a/text()')
    self.assertEquals(19, len(fields))
    self.assertEqual([
            '$facets',
            '__id__',
            '__key__.field',
            'copy',
            'field1',
            'field2',
            'field3',
            'field4',
            'field5',
            'field_missing',
            'intfield1',
            'intfield2',
            'intfield3',
            'intfield_missing',
            'sorted.field2',
            'sorted.field4',
            'sorted.intfield1',
            'sorted.intfield_missing',
            'untokenized.field3',
        ], fields)
    drilldownFieldList = lists[1]
    drilldownFields = drilldownFieldList.xpath('li/a/text()')
    self.assertEquals(set(['untokenized.field2', 'untokenized.fieldHier', 'untokenized.field2.copy']), set(drilldownFields))
def edit_message(base_url, username, password, message_id, new_body):
    url_opener = _utils.login_and_go_to_faq(base_url, username, password)

    # calculate some more URLs
    faq_url = urljoin(base_url, "faq.php")
    edit_url = urljoin(base_url, "misc.php")

    # go to the FAQ page (page with low backend complexity) to get the security token
    print("fetching security token")
    faq_response = url_opener.open(faq_url)
    faq = HTML(faq_response.read())
    token_field = faq.find(".//input[@name='securitytoken']")
    security_token = token_field.attrib["value"]

    # encode the message
    request_string = \
        "do=vsacb_editmessage&s=&securitytoken={0}&id={1}&vsacb_editmessage={2}".format(
            security_token, message_id, encode_outgoing_message(new_body)
        )
    request_bytes = request_string.encode(server_encoding)

    print("updating message")
    edit_response = url_opener.open(edit_url, data=request_bytes)
    edit_response.read()
    print("done")
def parse_xpath_content(self, url):
    result = dict()
    content = self.get_content(url)
    if not content:
        return result
    result["url"] = url
    result["md5"] = self.md5(url)
    result["creat_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    tree = HTML(content)
    for key in self.config.Xpath.keys():
        if not self.config.Xpath.get(key):
            continue
        elif isinstance(self.config.Xpath.get(key), dict):
            # substring cut between the configured start/end markers
            if self.config.Xpath[key]['op'] == 'cut':
                pos1 = content.find(self.config.Xpath[key]['start'])
                if pos1 != -1:
                    pos2 = content[pos1:].find(self.config.Xpath[key]['end'])
                    result[key] = content[pos1 + len(self.config.Xpath[key]['start']):pos1 + pos2]
                else:
                    result[key] = ""
        else:
            list_content = tree.xpath(self.config.Xpath[key].replace('tbody/', ''))
            if list_content:
                result[key] = "".join(list_content)
            else:
                result[key] = ""
    result['publish_time'] = self.parse_time(result['publish_time'])
    return result
def decode_first(d):
    h = HTML(d)
    inner_js = ''.join(h.xpath('//div/text()')).replace('_', '')
    inner_js = inner_js.replace('&', '').replace('%', '')
    inner_js = inner_js.replace('=', '').replace('undefined', '')
    inner_js = inner_js.decode('hex')
    return inner_js
def hijack(content):
    html = HTML(content)
    body = html.xpath('//body')[0]
    script = Element('script')
    script.text = 'alert(/hijacked/);'
    body.append(script)
    content = tostring(html)
    return content
def save_download(self, url, data, index):
    page = HTML(data)
    body = page.xpath('//body')[0]
    bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
    bundle = bundles[index]
    links = bundle.xpath('descendant-or-self::a[@href]')
    for link in links:
        href = urlparse.urljoin(url, link.attrib['href'])
        print 'got one page:', href
        self.store.save_page_set(href)
def decode_first_js(data):
    h = HTML(data)
    off = get_off(h)
    off.append(0)
    for el in h.xpath("//*[@id]"):
        if el.text:
            txt = decode_payload(off, el.text)
            if not txt:
                continue
            yield txt
def testRemoteInfoCore(self):
    header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
    bodyLxml = HTML(body)
    lists = bodyLxml.xpath('//ul')
    fieldList = lists[0]
    fields = fieldList.xpath('li/a/text()')
    self.assertEquals(12, len(fields))
    drilldownFieldList = lists[1]
    drilldownFields = drilldownFieldList.xpath('li/a/text()')
    self.assertEquals(['untokenized.field2', 'untokenized.fieldHier'], drilldownFields)
def get(self, url, depth=1):
    counter_processed.update((depth, ))
    logging.info('[{}] Processing {} ({}).'.format(threading.current_thread().name, url, depth))
    rsp = self.session.get(url)
    rsp.encoding = 'GB2312'
    html = HTML(rsp.text)
    urls = html.xpath('//a/@href')
    urls = list(set(filter(lambda url: re.search(self.url_loc, url), urls)))
    for url in urls:
        self.data.put((url, depth + 1))
    counter.update([depth + 1] * len(urls))
def main(wf):
    kw = wf.args[0]
    r = web.get(kw)
    r.raise_for_status()
    reg = re.compile('<ul id="dl-btn">.*</ul>', flags=re.DOTALL + re.MULTILINE)
    match = reg.search(r.text)
    if match:
        html = match.group(0)
        node = HTML(html).find('.//a')
        log.info(node.text)
        call(["open", node.get('href')])
def parse_urls(self):
    content = self.get_content(self.config.Root)
    if content:
        tree = HTML(content)
        url_list = tree.xpath(u"//a/@href")
        pattern = re.compile(self.config.Regex)
        url_joined_list = [urlparse.urljoin(self.config.Root, url) for url in url_list]
        url_joined_list = list(set(url_joined_list))  # deduplicate
        return filter(pattern.match, url_joined_list)
    else:
        return []
def doit(d):
    if '<div' in d:
        d = decode_first(d)
    for p in decode_payloads(d):
        urls = []
        if 'application/x-shockwave-flash' in p:
            t = 'flash'
            x = p.strip().splitlines()[-2].replace("'", '"').split('"')
            url_b = x[1].split('/')[1]
            sh = x[-2].decode('hex').strip("\x00")
            urls = re.findall('"(/' + url_b + '.*?)"', p)
            payload_url = re.findall('(http.*)', sh)[0]
        elif 'data:application/x-silverlight' in p:
            t = 'silverlight'
            x = HTML(re.findall('"(.*?)"', p)[0])
            for i in x.xpath('//param'):
                if i.attrib['name'] == 'source':
                    urls = [i.attrib['value']]
                elif i.attrib['name'] == 'initParams':
                    vals = dict(map(lambda x: tuple(x.split('=')), i.attrib['value'].split('&')))
                    sh = vals['shell32'].decode('hex').strip("\x00")
                    payload_url = re.findall('(http.*)', sh)[0]
        elif 'CollectGarbage' in p:
            t = 'ie'
            x = p.strip().splitlines()[-1].replace("'", '"').split('"')
            payload_url = x[1] + ' rc4 key: %s' % x[-2]
            sh = re.findall('"([0-9a-f]+)"\+', p, re.I)[0].decode('hex')
        else:
            t = 'unknown'
        sh_hash = hashlib.sha256(sh).hexdigest()
        print '[+] found %s exploit' % t
        if urls:
            print '[+] additional exploits:', ', '.join(urls)
        print '[+] payload url:', payload_url
        print '[+] shellcode hash:', sh_hash
        if args.save:
            n = args.dir + '/exp.%s.%s.txt' % (t, hashlib.sha256(p).hexdigest())
            with open(n, 'w') as f:
                f.write(p)
            print '[+] js saved to', n
            if sh:
                n = args.dir + '/exp.%s.%s.sh.bin' % (t, sh_hash)
                with open(n, 'w') as f:
                    f.write(sh)
                print '[+] shellcode saved to', n
def link_tag_url(html):
    '''
    extracts a relative url from an HTML document's link tag, like
    <link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />
    '''
    from lxml.etree import HTML
    doc = HTML(html)
    link_tag = doc.find('.//link[@rel="shortcut icon"]')
    if link_tag is not None:
        favicon_url = link_tag.get('href', '')
        if favicon_url:
            return favicon_url
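# Hedged usage sketch (not part of the original source): calling link_tag_url() on a
# snippet like the one in its docstring should yield the relative favicon path.
# The sample markup below is illustrative only.
_sample = ('<html><head><link rel="shortcut icon" '
           'href="images-template/favicon.ico" type="image/x-icon"/></head></html>')
assert link_tag_url(_sample) == 'images-template/favicon.ico'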
def parase_response(start_url):
    flag = True
    try:
        # with open('2.txt','r') as f:
        #     start_url = f.read().strip()
        while flag:
            product_code_list = []
            print('start_url:', start_url)
            r = session.get(url=start_url, proxies=random.choice(proxies_list))
            print(r.status_code)
            if r.status_code != 200:
                # once blocked, the address could be switched; to be extended
                raise Exception('地址被屏蔽')
            print('解析页面获取商品')
            html = HTML(r.text)
            products_html = html.xpath('//*[@id="Products"]/ul/li/div')
            conn = sqlite3.connect('tuhu_db.sqlite3')
            with conn:
                cur = conn.cursor()
                if products_html:
                    for product in products_html:
                        product_name = is_null(product.xpath('a/text()'))
                        product_url = is_null(product.xpath('a/@href'))
                        product_price = is_null(product.xpath('div/strong/text()'))
                        product_code = is_null(product.xpath('form/input[1]/@value'))
                        insert_product_sql = "INSERT INTO product_des (product_name,product_url,product_price,product_code) VALUES (?,?,?,?)"
                        cur.execute(insert_product_sql,
                                    (product_name.strip(), product_url, product_price, product_code))
                        product_code_list.append(product_code)
                    conn.commit()
            for code in product_code_list:
                parse_comment(code)
            # stop the loop when there is no next page
            start_url = is_null(html.xpath('//*[@class="last-child"]/@href'))
            if not start_url:
                flag = False
    except Exception as e:
        print(e)
        with open('2.txt', 'a') as f:
            f.write(start_url + str(e) + '\n')
        conn = sqlite3.connect('tuhu_db.sqlite3')
        conn.commit()
        conn.close()
def doc(self):
    """Fetch the available mailbox domain suffixes."""
    url = 'https://linshiyouxiang.net/'
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                  'q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'referer': 'https://linshiyouxiang.net/',
        'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/87.0.4280.88 Safari/537.36',
    }
    self.session.headers = headers
    response = self.session.get(url, timeout=timeout)
    if response.status_code == 200:
        context = HTML(response.text)
        self.address = context.xpath(
            '//*/ul[@class="dropdown-menu dropdown-menu-right"]/li/a/@data-mailhost'
        )
        if self.address:
            return True
        else:
            return False
    else:
        return None
def city(self):
    """
    City
    :return:
    """
    count = red_cli.scard('jr_category')
    while count:
        # switch to a new IP for every category crawled
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('jr_category')
        cate = eval(data_category)["category_name"]
        cate_url = eval(data_category)["category_url"]
        tag = 0
        item = EasyDict()
        item.category = cate
        resp = self.feach.get_req(url=cate_url, proxies=proxy)
        if resp != False:
            etre = HTML(resp)
            city_urls = etre.xpath('//div[@class="filter-item"]/div[last()]/a/@href')
            city_names = etre.xpath('//div[@class="filter-item"]/div[last()]/a/text()')
            for _ in range(len(city_names)):
                if city_names[_] == "全部":
                    continue
                else:
                    item.city_url = "https://www.jvrong.com" + str(city_urls[_])
                    item.city_name = city_names[_]
                    red_cli.sadd('jr_city', str(item))
        else:
            tag = 1
        if tag == 1:
            print('请求失败')
            pass
        else:
            pprint('数据插入redis全部成功')
            red_cli.srem('jr_category', data_category)
        count -= 1
def cpxx(self, resp):
    """
    Financing information
    :return:
    """
    cpxx = self.sd_xpath.cpxx(HTML(resp))
    return cpxx
async def get(self, url, proxy='', retry=5):
    response = None
    # number of retries
    for i in range(retry):
        try:
            response = await self.session.get(
                url, headers=self.headers,
                proxy='' if proxy == None else proxy, timeout=5)
            if 'content-type' in response.headers and 'html' in response.content_type:
                response.xpath = HTML(await response.text()).xpath
            if response.content_type == 'application/json':
                response.json_data = await response.json()
            if response.status != 200 or self.except_content_type != None and response.content_type != self.except_content_type:
                if proxy != None:
                    await self.__update_proxy()
                    proxy = self.proxy
                continue
            break
        except (Exception, BaseException, TimeoutError) as e:
            if proxy != None:
                await self.__update_proxy()
                proxy = self.proxy
                continue
            break
    if response != None and response.status == 200:
        self.succeed_proxies.add(proxy)
    else:
        self.succeed_proxies.discard(self.proxy)
        if proxy != None:
            await self.__update_proxy()
    return response
def _get_description(self, card: etree.HTML) -> str:
    description: str = ""
    for node in card.xpath(".//div[contains(@class, \"desc\")]"):
        description = node.text.strip()
        break
    return description
def _get_price(self, card: etree.HTML) -> str:
    price: str = ""
    for node in card.xpath(".//*[contains(@class, \"price\")]"):
        price = node.text.strip()
        break
    return price
def jpxx(self, resp):
    """
    Financing information
    :return:
    """
    jpxx = self.sd_xpath.jpxx(HTML(resp))
    return jpxx
def get_detail(url):
    response = requests.get(url, headers=detail_headers)
    html = HTML(response.text)
    # print(response.text)
    contentList = html.xpath('//div[@class="article-content"]//text()')
    content = ''.join(contentList)
    savedate = html.xpath(
        'string(//div[@class="article-source article-source-bjh"]/span[@class="date"])'
    )
    savetime = html.xpath(
        'string(//div[@class="article-source article-source-bjh"]/span[@class="time"])'
    )
    publishDateStr = '2019-' + savedate + ' ' + savetime
    return content, publishDateStr
def qyyw(self, resp):
    """
    Financing information
    :return:
    """
    qyyw = self.sd_xpath.qyyw(HTML(resp))
    return qyyw
async def __get_proxy_from_xila(self, session):
    '''
    Crawl data from xiladaili.
    '''
    try:
        for page in range(1, 5):
            url = f'http://www.xiladaili.com/gaoni/{page}/'
            res = await session.get(url, timeout=10)
            text = await res.text()
            html = HTML(text)
            for data in html.xpath('//table/tbody/tr'):
                ip = data.xpath('.//td[1]/text()')[0]
                await self.put_proxy(f'http://{ip}', '西拉代理')
    except Exception as e:
        logging.exception(e)
        pass
def raw(self, raw_html):
    """Parses the given string into an HTML Element."""
    # print 20151008, raw_html
    # the lxml parser wraps `<html><body>...</body></html>` around
    # the snippet, but we don't want it.
    return HTML(raw_html)[0][0]
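# Hedged illustration (not part of the original source) of the wrapping behaviour the
# comment above describes: lxml.etree.HTML() parses a bare snippet into a full
# <html><body>...</body></html> tree, so indexing [0][0] digs back down to the snippet's
# first element. Assumes the snippet produces no <head> element, as in this sample.
from lxml.etree import HTML as _HTML, tostring as _tostring
_el = _HTML('<p>hello</p>')
print(_tostring(_el))        # b'<html><body><p>hello</p></body></html>'
print(_tostring(_el[0][0]))  # b'<p>hello</p>'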
def _get_img_url(self, card: etree.HTML) -> str:
    img_url: str = ""
    for node in card.xpath(".//img[@data-original]"):
        img_url = node.attrib["data-original"]
        break
    return img_url
def main():
    r = requests.get(BASE_URL)
    for filename in HTML(r.content).xpath('//a[starts-with(@href, "tl_2012_")]/@href'):
        print("Downloading", filename)
        download_file(posixpath.join(BASE_URL, filename))
def _get_location(self, card: etree.HTML) -> str:
    location: str = ""
    for node in card.xpath(".//*[contains(@class, \"location\")]"):
        location = node.text.strip()
        break
    return location
def zpzz(self, resp):
    """
    Patent information
    :return:
    """
    zpzz = self.sd_xpath.zpzz(HTML(resp))
    return zpzz
def wxgz(self, resp):
    """
    Financing information
    :return:
    """
    wxgz = self.sd_xpath.wxgz(HTML(resp))
    return wxgz
def rjzz(self, resp):
    """
    Patent information
    :return:
    """
    rjzz = self.sd_xpath.rjzz(HTML(resp))
    return rjzz
def sbxx(self, resp):
    """
    Trademark information
    :return:
    """
    sbxx = self.sd_xpath.sbxx(HTML(resp))
    return sbxx
def zlxx(self, resp):
    """
    Patent information
    :return:
    """
    zlxx = self.sd_xpath.zlxx(HTML(resp))
    return zlxx
def main(wf):
    if len(wf.args):
        kws = ' '.join(wf.args)
        url = 'http://www.mac-torrent-download.net/?s={}&x=0&y=0&open=1'.format(kws)
        text = get_recent_list(url)
        try:
            dd_arr = HTML(text).findall('.//dd')
            for dd in dd_arr:
                a = dd.find('.//a')
                href, title = a.get('href') + '?open=1', a.text.strip()
                info = dd.find('.//div[@class="blog_info"]')
                tags = ' / '.join([a.text for a in info.findall('.//a')])
                time = info.find('.//i').tail.strip()
                wf.add_item(title=title,
                            subtitle='{} {}'.format(time, tags),
                            valid=True,
                            arg=href,
                            icon=ICON)
        except Exception as e:
            wf.add_item(title='槽糕!没找到 “{}”'.format(kws),
                        subtitle='去“mac-torrent-download.net”手动搜索看看?',
                        valid=True,
                        arg=url,
                        icon='icon.png')
        finally:
            wf.send_feedback()
def zbxx(self, resp):
    """
    Financing information
    :return:
    """
    zbxx = self.sd_xpath.zbxx(HTML(resp))
    return zbxx
def zzzs(self, resp):
    """
    Financing information
    :return:
    """
    zzzs = self.sd_xpath.zzzs(HTML(resp))
    return zzzs
def searchUrlMagnetCilifeng(self, content: str, page: int = 1) -> None:
    """http://www.cilifeng.me/"""
    try:
        domain_name = "磁力风"
        search_url = "http://www.cilifeng.me/search?word={}&page={}".format(content, page)
        search_response = self._session.get(url=search_url)
        html = HTML(search_response.text)
        lis = html.xpath("//ul[@class='alt']/li")
        if lis and not self._stop:
            for li in lis:
                url = "http://www.cilifeng.me" + li.xpath(".//a/@href")[0].replace("../../..", "")
                result = (domain_name, url)
                self._detail_urls.put(result)
            self.searchUrlMagnetCilifeng(content=content, page=page + 1)
    except:
        pass
    finally:
        return
def gdxx(self, resp):
    """
    Parse shareholder information
    :return:
    """
    gsxx = self.sd_xpath.gdxx(HTML(resp))
    return gsxx
def xzxk(self, resp):
    """
    Financing information
    :return:
    """
    xzxk = self.sd_xpath.xzxk(HTML(resp))
    return xzxk
def gsbg(self, resp):
    """
    Parse business registration change information
    :return:
    """
    gsbg = self.sd_xpath.gsbg(HTML(resp))
    return gsbg
def df_sites_info(self):
    self.urls = self.df_sites.url
    dfs = []
    for url in tqdm(self.urls):
        r = self._request(url)
        dom_tree = HTML(r.text)
        site = dom_tree.xpath('normalize-space(//div[@class="Xc-ec-L b-L"]/text())')
        print('🕷:%s %s' % (site, url))
        df = pd.read_html(self._request(url).text)[0]
        df.columns = ['rank', 'title', 'hot', 'site']
        df['site'] = site
        df['url'] = url
        dfs.append(df)
    return pd.concat(dfs)
    # pd.merge(self.df_sites, pd.concat(dfs))
def rzxx(self, resp):
    """
    Financing information
    :return:
    """
    rzxx = self.sd_xpath.rzxx(HTML(resp))
    return rzxx
def wzba(self, resp):
    """
    Website ICP filing
    :return:
    """
    wzba = self.sd_xpath.wzba(HTML(resp))
    return wzba
def html_to_table(input_filename, encoding='utf-8'):
    with open(input_filename) as fobj:
        html = fobj.read().decode(encoding).replace('\xa0', ' ')
    tree = HTML(html)
    data = tree.xpath('//body/b')
    for index, element in enumerate(data):
        text = element.text
        if text.startswith('Valores') and text.endswith('R$'):
            break
    new = []
    for element in data[index + 1:]:
        text = element.text
        if text.startswith('FATURA DE '):
            continue
        elif REGEXP_PAGE.findall(text):
            continue
        else:
            new.append(element.text)
    data = new
    chunks = [[value.strip() for value in row]
              for row in partition(data, 4) if len(row) == 4]
    table = rows.Table(fields=FIELDS)
    current_year = datetime.datetime.now().year
    months = set(extract_month(row) for row in chunks)
    subtract_year = 'DEZ' in months and 'JAN' in months
    for row in chunks:
        try:
            category = convert_text(row[0])
            description = convert_text(row[1])
            value = convert_value(row[2])
        except:
            print('WARNING: Ignoring row: {}'.format(row))
            continue
        year = current_year
        month = extract_month(row)
        if subtract_year and month in ('NOV', 'DEZ'):
            year = current_year - 1
        date = convert_date(row[3], year)
        table.append({'category': category,
                      'description': description,
                      'value': value,
                      'date': date,
                      })
    return table
def fake(base_url, username, password, game_id, time, score, game_name=None):
    url_opener = _utils.login_and_enter_arcade(base_url, username, password)

    # calculate some more URLs
    play_game_url = urljoin(base_url, "arcade.php?do=play&gameid={0}".format(game_id))
    score_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # pretend to play the game
    print("playing the game")
    play_game_response = url_opener.open(play_game_url)
    play_game = HTML(play_game_response.read())

    if game_name is None:
        # (meanwhile, find the game's name)
        game_flash = play_game.find(".//embed[@type='application/x-shockwave-flash']")
        if game_flash is None:
            print("didn't find the flash plugin on the game page :'-(")
            return
        flash_vars = game_flash.attrib['flashvars'].split("&")
        for var in flash_vars:
            if var.startswith("gamename="):
                game_name = var[len("gamename="):]
        if game_name is None:
            print("game name not found :'-(")
            return

    # wait the given time
    print("waiting")
    sleep(time)

    post_values = {
        "gscore": score,
        "gname": game_name
    }
    post_data = _utils.encode_post_data(post_values)

    print("submitting fake score")
    score_response = url_opener.open(score_url, data=post_data)
    score_response.read()
    print("done")
def parse_fulltext(fulltext_raw):
    """Extract article text from HTML page.

    Method extracts main text element from the supplied HTML assuming
    the HTML is from www.reuters.com.

    Parameters
    ----------
    fulltext_raw : str
        HTML page to extract the article from.

    Returns
    ----------
    str
        Article text.
    """
    texts = HTML(fulltext_raw)
    texts = texts.xpath('//span[@id="articleText"]')[0].xpath('.//text()')
    text = " ".join(texts).strip()
    return text
def scrape_logos():
    local("mkdir -p data/logos")
    base_uri = "http://www.sportslogos.net/league.php?id={0}"
    for url in [base_uri.format(page_id) for page_id in xrange(30, 36)]:
        resp = requests.get(url)
        if not resp.ok:
            print "Error retrieving {0}".format(url)
            continue
        tree = HTML(resp.content)
        for thumb in tree.findall(".//div[@class='thumbHolder']"):
            link = thumb.find("a")
            logo = link.find("img")
            title = link.attrib["title"].lower().replace("Logos", "")
            title = title.replace(" ", "_").strip()
            filename = "data/logos/{0}.gif".format(title)
            urllib.urlretrieve(logo.attrib["src"], filename)
def main(wf):
    parse = argparse.ArgumentParser()
    parse.add_argument('--app', dest='app')
    parse.add_argument('query', nargs='*', default=None)
    args = parse.parse_args()
    query = args.query[0]
    log.warn(query)
    if query:
        id = query.rsplit('/', 1)[-1].split('.')[0]
        url = 'http://soft.macx.cn/downloado.do?softid={}&cpus=2&urls=3'.format(id)
        r = web.get(url)
        r.raise_for_status()
        a = r.text
        node = HTML(a).find('.//a[@rel="facebox"][last()]')
        log.info(node.text)
        open = ['open']
        if args.app:
            open.extend(['-a', args.app])
        if node is not None and node.text == '浏览器直接下载':
            open.append(node.get('href'))
        else:
            open.append(url)
        call(open)
def parseData(urlList):
    urlW = open("/usr/product/zhenzhufen/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        dTxt = h.xpath('//h3')
        name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # name
        brand = dTxt[0].text.strip().split()[0]  # brand
        # print brand
        # print name
        pCpgg = h.xpath('//p[@class="pCpgg"]')
        td = h.xpath('//td[@class="td2"]')
        if td:
            price = list(td[0].itertext())[1].strip()
        else:
            price = list(pCpgg[0].itertext())[1].strip()  # price
        # print price
        norms = list(pCpgg[-1].itertext())[1].strip()  # specification
        # print norms
        spePs = h.xpath('//p[@class="speP"]/a')
        effect = ''
        for speP in spePs:
            effect += speP.text.strip() + " "  # effect
        # print effect
        awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
        imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
        # print imgUrl
        troCon = h.xpath('//div[@class="troCon"]')
        des = list(troCon[0].itertext())
        description = ''
        for d in des:
            if len(d.strip()) > 20:
                description += d.strip() + ""  # product description
        # print description
        dTxt = h.xpath('//div[@class="dTxt"]/p/a')
        series = dTxt[1].text.strip()  # series
        # print series
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
def fake(base_url, username, password, game_id, time, score, tourney_id,
         game_name=None, rung=None, face_off=None):
    url_opener = _utils.login_and_enter_arcade(base_url, username, password)

    # calculate some more URLs
    tourneys_url = urljoin(base_url, "arcade.php?&do=viewtournaments")
    view_tourney_url = urljoin(base_url, "arcade.php?&act=Arcade&do=viewtourney&tid={0}".format(
        tourney_id
    ))
    play_tourney_game_url = urljoin(
        base_url,
        "arcade.php?&do=playtourney&gameid={0}&tid={1}{2}{3}".format(
            game_id, tourney_id,
            "&rung={0}".format(rung) if rung is not None else "",
            "&faceoff={0}".format(face_off) if face_off is not None else ""
        )
    )
    score_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # go to tourneys
    print("entering tourneys page")
    tourneys_response = url_opener.open(tourneys_url)
    tourneys_response.read()

    # view the tourney
    print("looking at the tourney")
    view_tourney_response = url_opener.open(view_tourney_url)
    view_tourney_response.read()

    # pretend to play the game
    print("playing the game")
    play_tourney_game_response = url_opener.open(play_tourney_game_url)
    play_tourney_game = HTML(play_tourney_game_response.read())

    if game_name is None:
        # (meanwhile, find the game's name)
        game_flash = play_tourney_game.find(".//embed[@type='application/x-shockwave-flash']")
        if game_flash is None:
            print("didn't find the flash plugin on the game page :'-(")
            return
        flash_vars = game_flash.attrib['flashvars'].split("&")
        for var in flash_vars:
            if var.startswith("gamename="):
                game_name = var[len("gamename="):]
        if game_name is None:
            print("game name not found :'-(")
            return

    # wait the given time
    print("waiting")
    sleep(time)

    post_values = {
        "gscore": score,
        "gname": game_name
    }
    post_data = _utils.encode_post_data(post_values)

    print("submitting fake score")
    score_response = url_opener.open(score_url, data=post_data)
    score_response.read()
    print("done")
def testRemoteInfoDrilldownValues(self):
    header, body = getRequest(port=self.httpPort, path='/remote/info/drilldownvalues', arguments=dict(path='untokenized.field2', name='main'), parse=False)
    self.assertFalse('Traceback' in body, body)
    bodyLxml = HTML(body)
    self.assertEquals(set(['value1', 'value0', 'value9', 'value8', 'value7', 'value6', 'value5', 'value4', 'value3', 'othervalue2', 'value2']), set(bodyLxml.xpath('//ul/li/a/text()')))
        ## some basic heuristic....
        # if tlen - len(x) < epsi:
        #     print 'fop'
        #     return x
    except:
        pass
    return None

def get_num(x):
    return int(re.search('[0-9]+$', x).group(0))

if __name__ == '__main__':
    args = apr.parse_args()
    h = HTML(open(args.file).read().replace('<br>', ''))
    key_var = None
    for key in get_keys(h):
        print '[*] testing key:', key
        stream = ''; txt = None
        for el in h.xpath('//*[@id or @ui or @di]'):
            if el.text:
                txt = decode_page(el.text, key)
                if not txt:
                    continue
                if 'cryptKey' in txt:
                    key_var = re.findall('var cryptKey = ([_a-z0-9]+(\[\s*[0-9]+\s*\])?),', txt, re.I)[0][0]
                    key_var = re.sub('\s+', '', key_var)
def filter(self, environ, headers, data):
    url = construct_url(environ)
    static_url = environ['olpcproxy.static_url']
    found = environ['olpcproxy.keys']
    action = False
    if self.save_key in found:
        self.store.save_page_set(url, headers, data)
        action = True
    if self.remove_key in found:
        self.store.remove_page(url)
        action = True
    if environ.get('olpcproxy.downloads'):
        for index in environ['olpcproxy.downloads']:
            self.save_download(url, data, index)
        action = True
    if action:
        exc = httpexceptions.HTTPTemporaryRedirect(
            headers=[('Location', url)])
        raise exc
    if '?' not in url:
        url_query = url + '?'
    else:
        url_query = url + '&'
    has_page = self.store.has_url(url)
    page = HTML(data)
    try:
        head = page.xpath('//head')[0]
        body = page.xpath('//body')[0]
    except IndexError:
        # Not a full HTML page
        return data
    self.sub_links(url, page, static_url)
    if has_page:
        time_diff = time.time() - self.store.url_cache_time(url)
        time_diff = format_time_diff(time_diff)
        message = ['This page was cached %s ago. You may ' % time_diff,
                   tag.a('remove it from the cache', href=url_query + self.remove_key)]
        div_class = 'olpc-cached'
    else:
        message = ['This page is NOT cached. You may ',
                   tag.a('add it to the cache', href=url_query + self.save_key)]
        div_class = None
    if head_style:
        insert_beginning(
            head,
            tag.style(head_style % {'static_url': static_url}, type="text/css"))
    image_location = static_url + '/x-small.gif'
    msg = tag.div(
        message,
        tag.a(tag.img(src=image_location, border=0, id="olpc-close-image"),
              href="#",
              onclick="document.getElementById('olpc-top-message').style.display='none'",
              valign="top"),
        id="olpc-top-message",
        class_=div_class)
    bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
    if bundles:
        image_location = static_url + '/caution.gif'
        append(
            msg, tag.br(), tag.img(src=image_location),
            "Bundles were found in this page")
        for index, bundle in enumerate(bundles):
            b_msg = tag.div(
                tag.a(tag.img(src=static_url + '/arrow-down-red.gif', border=0),
                      "You may download this bundle",
                      href=url_query + self.download_key + '=' + str(index)))
            insert_beginning(bundle, b_msg)
    insert_beginning(body, msg, tag.br(clear="all"))
    data = tostring(page, True)
    # Now fix up the content-type:
    content_type = header_value(headers, 'content-type') or ''
    content_type = self._charset_re.sub('', content_type).strip().lstrip(';')
    content_type += '; charset=utf'
    replace_header(headers, 'content-type', content_type)
    return data
print sql
sqlW.write(sql)
sqlW.write("\n")
try:
    db.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    cursor.execute(sql)
    db.commit()
except MySQLdb.Error, e:
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])
cursor.close()
db.close()

urlHtml = getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm119_ef0_pb0_pe0_or0.html")
html = HTML(urlHtml.decode('gbk'))
urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
parseData(urlList)
for i in range(3, 4):
    i = str(i)
    print i
    htmls = "http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm119_ef0_pb0_pe0_or0_p" + i + ".html#productList"
    urlHtml = getHtml(htmls)
    try:
        html = HTML(urlHtml.decode('gbk'))
        urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList)
    except Exception:
        errorTxt.write("\n")
        errorTxt.write(i)
        errorTxt.write("\n")
def get_racers(race_id):
    page = HTML(requests.get(TL_URL_TEMPL % race_id, headers={'User-Agent': USER_AGENT}).text)
    racers = set(el.text for el in page.xpath('.//a[@onmouseout]'))
    return list(racers)
def get_race_name(race_id):
    page = HTML(requests.get(TL_URL_TEMPL % race_id, headers={'User-Agent': USER_AGENT}).text)
    return page.findtext('.//title').split('live')[0].strip()
sel = selenium("localhost", 4444, "*chrome", "http://uk.yahoo.com/")
time.sleep(10)
for _ in range(10):
    # Wait for selenium to come up
    try:
        sel.start()
    except Exception:
        import traceback
        print traceback.format_exc()
        time.sleep(2)
    else:
        break
else:
    raise Exception("Selenium failed to start")

## Do some searching
try:
    sel.open("/?p=us")
    sel.type("id=p_13838465-p", search)
    sel.click("id=search-submit")
    sel.wait_for_page_to_load("30000")
    tree = HTML(sel.get_html_source())
    results = tree.xpath('//*[@class="res"]/descendant::a/@href')
    print "\n".join(results)
finally:
    sel.stop()
#!/usr/bin/env python3

import re

import requests
from lxml.etree import HTML

response = requests.get('http://www.debian.org/releases/stable/')
root = HTML(response.content)
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]
print('Codename: {}\nVersion: {}'.format(release, version))
def parseData(urlList):
    urlW = open("/usr/caizhuang/zhuangqian/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effect
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        # print series
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
def parseData(urlList):
    urlW = open("/usr/product/mianmo/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effect
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        # print series
        insertData(name, brand, price, norms, effect, imgUrl, description, series)