def crawl_step2(self):
    for p in range(2, self.total_page + 1):
        data = {
            "__VIEWSTATE": self.__VIEWSTATE,
            "__EVENTVALIDATION": self.__EVENTVALIDATION,
            "__EVENTTARGET": self.__EVENTTARGET,
        }
        print("crawling page {}".format(p))
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": random.choice(self.USER_AGENTS),
            "Cookie": self.cookie
        }
        browser = requests.post(self.URL, headers=headers, data=urlencode(data))
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            view_state_div = html.xpath('//input[@id="__VIEWSTATE"]')
            self.__VIEWSTATE = view_state_div[0].attrib["value"]
            event_valid_div = html.xpath('//input[@id="__EVENTVALIDATION"]')
            self.__EVENTVALIDATION = event_valid_div[0].attrib["value"]
            self.__EVENTTARGET = "Navigate$btnNavNext"
            links = html.xpath('//table[@id="DgList"]/tr/td[2]/a')
            for link in links:
                self.urls.append(self.BASE_URL + str(link.attrib["href"]))
            self.final_crawl()
        else:
            print("Error while crawling page {}".format(p))
            self.final_crawl()
def getImcexHTML(myObject):
    try:
        website = getHtml(myObject.baseUrl + myObject.currency)
        website_html = website.read()
        html = lxml.html.fromstring(website_html)
        # fill myObject.bids
        # get_bids = '//article[@class = "' + orderbookBids + '"]//table//tr//node()[text()]'
        vol_html = '/html/body/div/div[2]/div/section/div/div/table/tr/td/table/tr[*]/td[2]/strong//text()'
        price_html = ''
        vol_list = html.xpath(vol_html)
        # vol_list returns volumes of bids and asks together, so split it in half
        # (use integer division so the slice indices stay ints on Python 3 as well)
        bids_vol = vol_list[: len(vol_list) // 2]
        asks_vol = vol_list[len(vol_list) // 2:]
        startpos = 2
        for index in xrange(startpos, 22):
            price_html = ('/html/body/div/div[2]/div/section/div/div/table/tr/td/table/tr['
                          + str(index) + ']/td[4]//text()')
            price_list = html.xpath(price_html)
            # e.g. ['\n4.1400 LREUR\n', '\n8.9900 ', 'LREUR', '\n']
            price_bid = float(price_list[0][1:7])
            price_ask = float(price_list[1][1:7])
            myObject.bids.append([price_bid, float(bids_vol[index - startpos])])
            myObject.asks.append([price_ask, float(asks_vol[index - startpos])])
        # check if orientation is right, else list.reverse
        checkOrientation(myObject)
    except:
        e = sys.exc_info()[1]
        errorFunction(e, "<EXITING><getImcexHTML>: ")
        sys.exit()
def major_crawler(self, major, url):
    print major.code, major.name, url
    content = self.tor_util.request(url)
    html = lxml.html.document_fromstring(content)
    degree = None
    h3 = html.xpath('//div[@class="majorBase"]/h3')
    if len(h3) == 3:
        degree = h3[1].text_content().strip()
        degree = degree[degree.find(u':'):].strip()
        how_long = h3[2].text_content().strip()
        how_long = how_long[how_long.find(u':') + 1:].strip()
    elif len(h3) == 2:
        how_long = h3[1].text_content().strip()
        how_long = how_long[how_long.find(u':') + 1:].strip()
    try:
        course = html.xpath('//div[@class="majorBase"]/div[@class="course"]/p')[0].text_content().strip()
    except:
        course = None
    description = html.xpath('//div[@class="majorCon"]')[0].text_content().strip()
    if degree and major.degree is None:
        major.degree = degree
    major.how_long = how_long
    major.course = course
    major.description = description
    major.save()
    related = html.xpath('//div[@class="majorBase"]/div[@class="course"]/a')
    for r in related:
        m, created = MyMajor.objects.get_or_create(name=r.text_content().strip())
        major.related_majors.add(m)
def get_information_from_page(url_list,asynchronous=False): if asynchronous: for urls in url_list: rs = (grequests.get(u,stream=False) for u in urls) responses = grequests.map(rs) results = [] for r in responses: result = {} html = lxml.html.fromstring(r.text) posting_body = html.xpath('//div[@class="postingBody"]') result["textbody"] = [i.text_content() for i in posting_body] result['pictures'] = html.xpath('//ul[@id="viewAdPhotoLayout"]/li/a/@href') result['url'] = r.url results.append(result) r.close() return results else: r = requests.get(url_list) html = lxml.html.fromstring(r.text) posting_body = html.xpath('//div[@class="postingBody"]') textbody = [i.text_content() for i in posting_body] pictures = html.xpath('//ul[@id="viewAdPhotoLayout"]/li/a/@href') return textbody,pictures
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"],
                                                 rv.LHB_KINDS[2], ct.PAGES["fd"],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode("GBK")
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode("utf-8") for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = "".join(sarr)
            sarr = "<table>%s</table>" % sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2, 3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r"\d+", nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
def _get_report_data(year, quarter, pageNo, dataArr): url = ct.REPORT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) ct._write_console() try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]//span/a/text()')[0] name = trs.xpath('td[2]/span/a/text()')[0] eps = trs.xpath('td[3]/text()')[0] #每股收益(元) eps_yoy = trs.xpath('td[4]/text()')[0] #每股收益同比(%) bvps = trs.xpath('td[5]/text()')[0] #每股净资产(元) bvps = '0' if bvps == '--' else bvps roe = trs.xpath('td[6]/text()')[0] #净资产收益率(%) roe = '0' if roe == '--' else roe epcf = trs.xpath('td[7]/text()')[0] #每股现金流量(元) epcf = '0' if epcf == '--' else epcf net_profits = trs.xpath('td[8]/text()')[0] #净利润(万元) profits_yoy = trs.xpath('td[9]/text()')[0] #净利润同比(%) distrib = trs.xpath('td[10]/text()')[0] #分配方案 report_date = trs.xpath('td[11]/text()')[0] #发布日期 dataArr.append([code, name, eps, eps_yoy, bvps, roe, epcf, net_profits, profits_yoy, distrib, report_date]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_report_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
def scrape(self, links=[], ads=True, translator=False):
    responses = []
    data = []
    if ads:
        for link in links:
            r = requests.get(link)
            responses.append(r)
    else:
        for link in links:
            r = requests.get(link)
            text = unidecode(r.text)
            html = lxml.html.fromstring(text)
            links = html.xpath("//div[@class='cat']/a/@href")
            for link in links:
                if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                    time.sleep(random.randint(5, 27))
                try:
                    responses.append(requests.get(link))
                    print link
                except requests.exceptions.ConnectionError:
                    print "hitting connection error"
                    continue
    for r in responses:
        # build a fresh dict per response; reusing one dict would make every entry in data identical
        values = {}
        text = r.text
        html = lxml.html.fromstring(text)
        values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content()
        values["link"] = unidecode(r.url)
        values["new_keywords"] = []
        try:
            values["images"] = html.xpath("//img/@src")
        except IndexError:
            values["images"] = "weird index error"
        pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n", "").replace("\r", "")
        values["text_body"] = pre_decode_text
        try:
            # the original xpath //div[class='adInfo'] matches nothing; @class is intended
            values["posted_at"] = html.xpath("//div[@class='adInfo']")[0].text_content().replace("\n", " ").replace("\r", "")
        except IndexError:
            values["posted_at"] = "not given"
        values["scraped_at"] = str(datetime.datetime.now())
        body_blob = TextBlob(values["text_body"])
        title_blob = TextBlob(values["title"])
        values["language"] = body_blob.detect_language()  # requires the internet - makes use of google translate api
        values["polarity"] = body_blob.polarity
        values["subjectivity"] = body_blob.sentiment[1]
        if values["language"] != "en" and not translator:
            values["translated_body"] = body_blob.translate(from_lang="es")
            values["translated_title"] = title_blob.translate(from_lang="es")
        else:
            values["translated_body"] = "none"
            values["translated_title"] = "none"
        text_body = values["text_body"]
        title = values["title"]
        values["phone_numbers"] = self.phone_number_parse(values)
        data.append(values)
    return data
def use_lxml_twitter_list(body): c = conn.cursor() html = body_clean(body) #print html paths = '//div[contains(text(), "#")]/..' results = html.xpath(paths) for r in results: #print tostring(r) rank = r[0].text.strip() tw_name = r[2][0].text.strip() #print rank, tw_name #tw_id = twitter_screenname_to_id(tw_name) klout_id, k_score = get_kloutid_short(tw_name) trst_score = get_trstrank_short(tw_name) f = open("./tweets/twibes.txt", 'a') f.write('%s\t%s\t%s\t%s\t%s\n'%(rank, tw_name, klout_id, k_score, trst_score)) f.close() print rank, tw_name, klout_id, k_score, trst_score c.execute('INSERT OR IGNORE INTO tw (rank, tw_id, klout_id, klout_score, trstrank) VALUES (?,?,?,?,?)', (rank, tw_name, klout_id, k_score, trst_score)) conn.commit() paths = '//a[contains(text(), "More...")]' results = html.xpath(paths) for r in results: more_link = r.get('href') print more_link twitter_rank_read(more_link) c.close()
def _get_report_data(year, quarter, pageNo, dataArr, orderby): ct._write_console() try: request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1], orderby)) # 默认排序抓取的信息有重复和遗漏,增加排序功能参数orderby text = urlopen(request, timeout=10).read() text = text.decode('GBK') text = text.replace('--', '') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>' % sarr df = pd.read_html(sarr)[0] df = df.drop(11, axis=1) df.columns = ct.REPORT_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage) > 0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_report_data(year, quarter, pageNo, dataArr,orderby) else: return dataArr except Exception as e: print(e)
def _get_growth_data(year, quarter, pageNo, dataArr): url = ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) ct._write_console() try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]/a/text()')[0] name = trs.xpath('td[2]/a/text()')[0] mbrg = trs.xpath('td[3]/text()')[0] mbrg = '0' if mbrg == '--' else mbrg nprg = trs.xpath('td[4]/text()')[0] nprg = '0' if nprg == '--' else nprg nav = trs.xpath('td[5]/text()')[0] nav = '0' if nav == '--' else nav targ = trs.xpath('td[6]/text()')[0] targ = '0' if targ == '--' else targ epsg = trs.xpath('td[7]/text()')[0] epsg = '0' if epsg == '--' else epsg seg = trs.xpath('td[8]/text()')[0] seg = '0' if seg == '--' else seg dataArr.append([code, name, mbrg, nprg, nav, targ, epsg, seg]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_growth_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
def backupblog(self, url): req = urllib2.Request(url) try: self.operate = self.opener.open(req) except: print "Blog: %s downloading error! please check the url!" % url infocontent = self.operate.read() #print infocontent html = lxml.html.fromstring(infocontent.decode('utf8')) next = html.xpath('//span[@class="float-right"]/a') if len(next) != 0: nexturl = next[0].get('href') else: nexturl = None times = html.xpath('//span[@class="timestamp"]') if len(times) != 0: pubtime = times[0].text.strip().replace(':','-') else: pubtime = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time())) titles = html.xpath('//h3[@class="title-article"]/strong') if len(titles) != 0: if platform.system() == "Windows": title = titles[0].text.strip().encode("gbk",'ignore') title = title.replace('\\','').replace(':','').replace('/','').replace('?','').replace('*','').replace('<','').replace('|','').replace('>','').replace('"','') else: title = titles[0].text.strip() else: title = "unkowntitle" fd = open(self.dir + pubtime + '----' + title + '.html', 'a+') fd.write(infocontent) fd.close() return nexturl
def _dist_cotent(year, pageNo, retry_count, pause): for _ in range(retry_count): time.sleep(pause) try: if pageNo > 0: ct._write_console() html = lxml.html.parse( rv.DP_163_URL % (ct.P_TYPE["http"], ct.DOMAINS["163"], ct.PAGES["163dp"], year, pageNo) ) res = html.xpath('//div[@class="fn_rp_list"]/table') if ct.PY3: sarr = [etree.tostring(node).decode("utf-8") for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = "".join(sarr) df = pd.read_html(sarr, skiprows=[0])[0] df = df.drop(df.columns[0], axis=1) df.columns = rv.DP_163_COLS df["divi"] = df["plan"].map(_fun_divi) df["shares"] = df["plan"].map(_fun_into) df = df.drop("plan", axis=1) df["code"] = df["code"].astype(object) df["code"] = df["code"].map(lambda x: str(x).zfill(6)) if pageNo == 0: page = html.xpath('//div[@class="mod_pages"]/a') asr = page[len(page) - 2] pages = asr.xpath("text()") except _network_error_classes: pass else: if pageNo == 0: return df, pages[0] else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _get_forecast_data(year, quarter, pageNo, dataArr): url = ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) print 'getting data %s ...'%pageNo try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]//span/a/text()')[0] name = trs.xpath('td[2]/span/a/text()')[0] type = trs.xpath('td[3]/a/text()') type = type[0] if len(type)>0 else trs.xpath('td[3]/text()')[0] report_date = trs.xpath('td[4]/text()')[0] pre_eps = trs.xpath('td[7]/text()')[0] pre_eps = '0' if pre_eps == '--' else pre_eps range = trs.xpath('td[8]/text()')[0] dataArr.append([code,name,type,report_date,pre_eps,range]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+',nextPage[0])[0] return _get_forecast_data(year,quarter,pageNo,dataArr) else: return dataArr except: pass
def _get_forecast_data(year, quarter, pageNo, dataArr): ct._write_console() try: gparser = etree.HTMLParser(encoding='GBK') html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]), parser=gparser) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = sarr.replace('--', '0') sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df = df.drop([4, 5, 8], axis=1) df.columns = ct.FORECAST_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+',nextPage[0])[0] return _get_forecast_data(year, quarter, pageNo, dataArr) else: return dataArr except Exception as e: print(e)
def _get_cashflow_data(year, quarter, pageNo, dataArr, retry_count=3, pause=0.001): ct._write_console() for _ in range(retry_count): time.sleep(pause) try: request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1])) text = urlopen(request, timeout=10).read() text = text.decode('GBK') text = text.replace('--', '') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df.columns = ct.CASHFLOW_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_cashflow_data(year, quarter, pageNo, dataArr) else: return dataArr except Exception as e: pass raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _newstocks(data, pageNo, retry_count, pause): for _ in range(retry_count): time.sleep(pause) ct._write_console() try: html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'], ct.PAGES['newstock'], pageNo)) res = html.xpath('//table[@id=\"NewStockTable\"]/tr') if len(res) == 0: return data if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = sarr.replace('<font color="red">*</font>', '') sarr = '<table>%s</table>'%sarr df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0] df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1) df.columns = rv.NEW_STOCKS_COLS df['code'] = df['code'].map(lambda x : str(x).zfill(6)) df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6)) res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()') tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8') hasNext = True if tag in res else False data = data.append(df, ignore_index=True) pageNo += 1 if hasNext: data = _newstocks(data, pageNo, retry_count, pause) except Exception as ex: print(ex) else: return data
def _get_forecast_data(year, quarter, pageNo, dataArr): ct._write_console() try: html = lxml.html.parse( ct.FORECAST_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"], ct.PAGES["fd"], year, quarter, pageNo, ct.PAGE_NUM[1]) ) res = html.xpath('//table[@class="list_table"]/tr') if ct.PY3: sarr = [etree.tostring(node).decode("utf-8") for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = "".join(sarr) sarr = sarr.replace("--", "0") sarr = "<table>%s</table>" % sarr df = pd.read_html(sarr)[0] df = df.drop([4, 5, 8], axis=1) df.columns = ct.FORECAST_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick') if len(nextPage) > 0: pageNo = re.findall(r"\d+", nextPage[0])[0] return _get_forecast_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
def parsePage(html): # Dictionary to store info athInfo = {} #Now start populating our data object athInfo['AthleteName'] = html.cssselect("h2")[0].text athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip() athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip() infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession'] detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime'] rows = html.cssselect("table#general-info tr") for i, stat in enumerate(infoFields): athInfo[stat] = rows[i][1].text rows = html.cssselect("table#athelete-details tr") for i, stat in enumerate(detailsFields): athInfo[stat] = rows[i][1].text #have to use xpath to get T1 and T2 data athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content() athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content() athInfo['HasResults'] = 1 athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") scraperwiki.sqlite.save(unique_keys=['Bib'], data=athInfo, table_name="RESULTS", verbose=0)
def _get_debtpaying_data(year, quarter, pageNo, dataArr): ct._write_console() try: request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1])) text = urlopen(request, timeout=ct.HTTP_TIMEOUT).read() text = text.decode('GBK') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df.columns = ct.DEBTPAYING_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_debtpaying_data(year, quarter, pageNo, dataArr) else: return dataArr except Exception as e: print(e)
def _dist_cotent(year, pageNo, retry_count, pause): url = rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'], ct.PAGES['163dp'], year, pageNo) for _ in range(retry_count): time.sleep(pause) try: if pageNo>0: print rv.DP_MSG%pageNo html = lxml.html.parse(url) res = html.xpath('//div[@class=\"fn_rp_list\"]/table') sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) df = pd.read_html(sarr, skiprows=[0])[0] df = df.drop(df.columns[0], axis=1) df.columns = rv.DP_163_COLS df['divi'] = df['plan'].map(_fun_divi) df['shares'] = df['plan'].map(_fun_into) df = df.drop('plan', axis=1) df['code'] = df['code'].astype(object) df['code'] = df['code'].map(lambda x : str(x).zfill(6)) if pageNo == 0: page = html.xpath('//div[@class=\"mod_pages\"]/a') asr = page[len(page)-2] pages = asr.xpath('text()') except _network_error_classes: pass else: if pageNo == 0: return df, pages[0] else: return df raise IOError("获取失败,请检查网络和URL:%s" % url)
def _get_debtpaying_data(year, quarter, pageNo, dataArr): url = ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) ct._write_console() try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]/a/text()')[0] name = trs.xpath('td[2]/a/text()')[0] currentratio = trs.xpath('td[3]/text()')[0] currentratio = '0' if currentratio == '--' else currentratio quickratio = trs.xpath('td[4]/text()')[0] quickratio = '0' if quickratio == '--' else quickratio cashratio = trs.xpath('td[5]/text()')[0] cashratio = '0' if cashratio == '--' else cashratio icratio = trs.xpath('td[6]/text()')[0] icratio = '0' if icratio == '--' else icratio sheqratio = trs.xpath('td[7]/text()')[0] sheqratio = '0' if sheqratio == '--' else sheqratio adratio = trs.xpath('td[8]/text()')[0] adratio = '0' if adratio == '--' else adratio dataArr.append([code, name, currentratio, quickratio, cashratio, icratio, sheqratio, adratio]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_debtpaying_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
def _get_cashflow_data(year, quarter, pageNo, dataArr): url = ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) ct._write_console() try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]/a/text()')[0] name = trs.xpath('td[2]/a/text()')[0] cf_sales = trs.xpath('td[3]/text()')[0] cf_sales = '0' if cf_sales == '--' else cf_sales rateofreturn = trs.xpath('td[4]/text()')[0] rateofreturn = '0' if rateofreturn == '--' else rateofreturn cf_nm = trs.xpath('td[5]/text()')[0] cf_nm = '0' if cf_nm == '--' else cf_nm cf_liabilities = trs.xpath('td[6]/text()')[0] cf_liabilities = '0' if cf_liabilities == '--' else cf_liabilities cashflowratio = trs.xpath('td[7]/text()')[0] cashflowratio = '0' if cashflowratio == '--' else cashflowratio dataArr.append([code, name, cf_sales, rateofreturn, cf_nm, cf_liabilities, cashflowratio]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_cashflow_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()): ct._write_console() for _ in range(retry_count): time.sleep(pause) try: request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3], ct.PAGES['fd'], '', pageNo)) text = urlopen(request, timeout=10).read() text = text.decode('GBK') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@id=\"dataTable\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df.columns = rv.LHB_JGMX_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _inst_detail(pageNo, retry_count, pause, dataArr) else: return dataArr except Exception as e: print(e)
def is_date_range(html):
    postingbodies = html.xpath('id("postingbody")')
    if len(postingbodies) > 0:
        postingbody = postingbodies[0].text_content()
    else:
        warnings.warn('No #postingbody found on the page')
        return False
    titles = html.xpath('//title')
    if len(titles) > 0:
        title = titles[0].text_content()
    else:
        warnings.warn('No <title /> found on the page')
        return False
    for text in [title, postingbody]:
        if 'for the month' in text:
            return True
        body = iter(tokenize(text))
        for window in _ngrams(body):
            # print(window)
            if len(list(dates_in_tokens(window))) == 2:
                return True
            else:
                for i in range(1, len(window)):
                    if _is_end_date(window[0], window[1:i]):
                        # print(window[0], window[1:i])
                        return True
    return False
def dates(html):
    'Return my weird list date format'
    postingbodies = html.xpath('id("postingbody")')
    if len(postingbodies) > 0:
        postingbody = postingbodies[0].text_content()
    else:
        warnings.warn('No #postingbody found on the page')
        return None, None
    titles = html.xpath('//title')
    if len(titles) > 0:
        title = titles[0].text_content()
    else:
        warnings.warn('No <title /> found on the page')
        return None, None
    for text in [title, postingbody]:
        body = iter(tokenize(text))
        for window in _ngrams(body):
            d = list(dates_in_tokens(window))
            if len(d) == 2:
                return tuple(d)
            else:
                for i in range(1, len(window)):
                    if _is_end_date(window[0], window[1:i]):
                        return None, window[1:i]
    return None, None
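# A minimal usage sketch for is_date_range()/dates() above, assuming the target is a
# classified-ad style posting fetched with requests; the helper names they rely on
# (tokenize, _ngrams, dates_in_tokens, _is_end_date) are taken as already defined.
import lxml.html
import requests

def example_extract_dates(url):
    # parse the posting page into an lxml tree and hand it to the parsers above
    tree = lxml.html.fromstring(requests.get(url).text)
    if is_date_range(tree):
        return dates(tree)
    return None, None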
def parsePage(html): # Dictionary to store info athInfo = {} # Now start populating our data object athInfo["ATHLETE_NAME"] = html.cssselect("h2")[0].text athInfo["DIVISION_RANK"] = html.cssselect("#rank *")[0].tail.strip() athInfo["OVERALL_RANK"] = html.cssselect("#div-rank *")[0].tail.strip() # infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION'] infoFields = ["BIB", "DIVISION", "STATE", "COUNTRY", "PROFESSION"] detailsFields = ["TOTAL_SWIM", "TOTAL_BIKE", "TOTAL_RUN", "TOTAL_TIME"] rows = html.cssselect("table#general-info tr") for i, stat in enumerate(infoFields): athInfo[stat] = rows[i][1].text rows = html.cssselect("table#athelete-details tr") for i, stat in enumerate(detailsFields): athInfo[stat] = rows[i][1].text # have to use xpath to get T1 and T2 data athInfo["T1"] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content() athInfo["T2"] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content() athInfo["HAS_RESULTS"] = 1 athInfo["SCRAPED"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") scraperwiki.sqlite.save(unique_keys=["BIB"], data=athInfo, table_name="RESULTS", verbose=0)
def release(html):
    return {
        'version': html.xpath('./span[@class="release-number"]/a/text()')[0],
        'download': 'https://www.python.org' + html.xpath('./span[@class="release-number"]/a/@href')[0],
        'date': html.xpath('./span[@class="release-date"]/text()')[0],
        'notes': html.xpath('./span[@class="release-enhancements"]/a/@href')[0]
    }
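# A hedged usage sketch for release() above: its xpaths are relative ("./..."), so it
# expects a single release row node and the caller iterates the rows. The listing
# xpath below is an assumption about python.org's downloads page markup and may
# need adjusting if the page layout has changed.
import lxml.html
import requests

def list_releases():
    page = lxml.html.fromstring(requests.get('https://www.python.org/downloads/').content)
    rows = page.xpath('//ol[@class="list-row-container menu"]/li')  # assumed row container
    return [release(row) for row in rows]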
def save_all_pdfs(links):
    final = []
    pdfs = []
    for link in links:
        r = requests.get(link)
        html = lxml.html.fromstring(r.text)
        possible_links = html.xpath("//a/@href")
        for i in possible_links:
            if "download" in i:
                final.append(i)
    for link in final:
        r = requests.get(link)
        html = lxml.html.fromstring(r.text)
        possible_pdfs = html.xpath("//a/@href")
        for i in possible_pdfs:
            if "pdf" in i:
                pdfs.append(i)
    for pdf in pdfs:
        name = pdf.split("/")[-1]
        with open(name, "wb") as f:
            r = requests.get(pdf)
            f.write(r.content)
def _get_operation_data(year, quarter, pageNo, dataArr):
    url = ct.OPERATION_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'],
                              year, quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            arturnover = trs.xpath('td[3]/text()')[0]
            arturnover = '0' if arturnover == '--' else arturnover
            arturndays = trs.xpath('td[4]/text()')[0]
            arturndays = '0' if arturndays == '--' else arturndays
            inventory_turnover = trs.xpath('td[5]/text()')[0]
            inventory_turnover = '0' if inventory_turnover == '--' else inventory_turnover
            inventory_days = trs.xpath('td[6]/text()')[0]
            inventory_days = '0' if inventory_days == '--' else inventory_days
            currentasset_turnover = trs.xpath('td[7]/text()')[0]
            currentasset_turnover = '0' if currentasset_turnover == '--' else currentasset_turnover
            currentasset_days = trs.xpath('td[8]/text()')[0]
            currentasset_days = '0' if currentasset_days == '--' else currentasset_days
            dataArr.append([code, name, arturnover, arturndays, inventory_turnover,
                            inventory_days, currentasset_turnover, currentasset_days])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')  # fetch the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            # recurse into this function; the original mistakenly called _get_growth_data here
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
def _get_profit_data(year, quarter, pageNo, dataArr): url = ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]) ct._write_console() try: html = lxml.html.parse(url) xtrs = html.xpath("//table[@class=\"list_table\"]/tr") for trs in xtrs: code = trs.xpath('td[1]/a/text()')[0] name = trs.xpath('td[2]/a/text()')[0] roe = trs.xpath('td[3]/text()')[0] roe = '0' if roe == '--' else roe net_profit_ratio = trs.xpath('td[4]/text()')[0] net_profit_ratio = '0' if net_profit_ratio == '--' else net_profit_ratio gross_profit_rate = trs.xpath('td[5]/text()')[0] gross_profit_rate = '0' if gross_profit_rate == '--' else gross_profit_rate net_profits = trs.xpath('td[6]/text()')[0] net_profits = '0' if net_profits == '--' else net_profits eps = trs.xpath('td[7]/text()')[0] eps = '0' if eps == '--' else eps business_income = trs.xpath('td[8]/text()')[0] business_income = '0' if business_income == '--' else business_income bips = trs.xpath('td[9]/text()')[0] bips = '0' if bips == '--' else bips dataArr.append([code, name, roe, net_profit_ratio, gross_profit_rate, net_profits, eps, business_income, bips]) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') #获取下一页 if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_profit_data(year, quarter, pageNo, dataArr) else: return dataArr except: pass
async def parse_gdq_schedule():
    global schedule, oauth_token
    h_t = requests.get("https://gamesdonequick.com/schedule").text
    html = lxml.html.fromstring(h_t)
    table = html.xpath(
        "//table[@id='runTable']//tbody//tr[not(contains(@class, 'second-row'))]"
    )
    gs = []
    for element in table:
        try:
            gtime = arrow.get(element.getchildren()[0].text)
        except arrow.parser.ParserError:
            continue
        game = element.getchildren()[1].text
        runners = element.getchildren()[2].text
        gs.append([gtime, game, runners])
    schedule = gs
def getlist(url, GCase=True): rr = re.compile('(^http://)|(^https://)') r = requests.get(url, headers=hds()) txt = r.content.decode('utf8') html = lxml.html.parse(StringIO(txt)) lis = html.xpath('//div[@class="commonList_con"]/ul[@class="li_line"]/li') datasets = {} for li in lis: name = re.sub(r'\s*', '', li.xpath('a/text()')[0].strip()) name = name.replace('“', '').replace('”', '') tm = li.xpath('span/text()')[0].strip() href = li.xpath('a/@href')[0].strip() name = tm + name if not rr.match(href): href = 'http://www.spp.gov.cn' + li.xpath('a/@href')[0] #print(name,href) if (name.endswith('指导性案例')) and GCase: datasets[name] = href else: datasets[name] = href return datasets
def web_driver_wait_ruishu(self, time: int, rule: str, num: str):
    """
    Brute-force approach: repeatedly dump the page and match elements against it.
    :param time: seconds to keep waiting
    :param rule: attribute to match on [id, class]
    :param num: attribute value of the target element
    :return:
    """
    while time:
        response = self.execute_js("document.documentElement.outerHTML")
        try:
            html = etree.HTML(text=response["value"])
            inp = html.xpath("//*[contains(@%s, '%s')]" % (rule, num))
            if inp:
                break
        except Exception as e:
            continue
        time_.sleep(1)
        time -= 1
    if not time:
        raise Exception("%s not found" % num)
def get_nips(i): data = [] url = 'https://papers.nips.cc/book/advances-in-neural-information-processing-systems-{}-{}'.format(i,1987+i) url_base = 'https://papers.nips.cc' res = requests.get(url) html = lxml.html.fromstring(res.text) for li in tqdm(html.xpath('//div[@class="main wrapper clearfix"]/ul/li')): a = li.xpath('a') pdf_url = url_base + a[0].attrib['href'] + '.pdf' #print('title: ' + a[0].text) #print('authors: ' + ', '.join([author.text for author in a[1:]])) #print('url: ' + pdf_url) ''' #pdf_urlが生きてるか雑に確認しようとした stc = requests.get(pdf_url).status_code if stc >= 300: sys.stderro.write('!!! responce: {}, title: {}'.format(stc, a[0].text)) continue ''' data.append({'title':a[0].text, 'authors':[author.text for author in a[1:]], 'pdf': pdf_url}) return data
def find_comic_by_pubdate(self, pubdate: date) -> DilbertComic: print("Fetching comic for {}".format(pubdate)) _debug("Fetching comic for {}".format(pubdate)) url = "http://dilbert.com/strip/{}".format(pubdate.isoformat()) _debug("Fetching url `{}`".format(url)) res = self.fetch_url(url) _debug("Status: {}".format(res.status)) body = res.read() html = lxml.html.document_fromstring(body) el = html.xpath( "//img[@class and contains(concat(' ', normalize-space(@class), ' '), ' img-comic ')]" )[0] comic = DilbertComic( pubdate=pubdate, url=el.get('src'), title=re.sub(' - .*$', '', el.get('alt')), filename=Path("{}.gif".format(pubdate.isoformat())), width=el.get('width'), height=el.get('height'), ) return comic
def main():
    print('Start crawl...')
    URL = 'https://pr-cy.ru/browser-details/'
    r = requests.get(URL)
    data = r.text
    html = lxml.html.fromstring(data)
    ip_address = html.xpath('//table[@class="table"]//div[@class="ip"]/text()')
    text_mail = settings.text_of_mail.replace('%myip%', ip_address[0])
    print('End crawl. Done!')
    with open('ip_log.json', 'w') as f:
        json.dump(ip_address[0], f, indent=4)
    print('Creating mail...')
    message = Mail(subject=settings.subject_mail,
                   from_addr=settings.send_from,
                   to_addr=settings.send_to,
                   user=settings.user,
                   password=settings.password)
    message.send(text_mail)
    print('Message sent!')
def scrape_joint_committees(self): url = 'http://legislature.idaho.gov/about/jointcommittees.htm' page = self.urlopen(url) html = lxml.html.fromstring(page) html.make_links_absolute(url) joint_li = html.xpath('//td[contains(h1, "Joint")]/ul/li') for li in joint_li: name, url = li[0].text, li[0].get('href') if 'Joint Finance-Appropriations Committee' in name: self.get_jfac(name, url) elif 'Joint Legislative Oversight Committee' in name: self.get_jlfc(name, url) elif name == 'Joint Millennium Fund Committee': self.get_jmfc(name, url) elif name == 'Economic Outlook and Revenue Assessment Committee': committee = Committee('joint', name) committee.add_source(url) # no membership available #self.save_committee(committee) else: self.log('Unknown committee: %s %s' % (name, url))
def test_scenario(self, scenario_id): """A very shallow test, just to see if the scenario loads all its blocks. We don't know enough about each scenario to know what each should do. So we load the scenario to see that the workbench could successfully serve it. """ url = reverse('workbench_show_scenario', kwargs={'scenario_id': scenario_id}) client = Client() response = client.get(url, follow=True) assert response.status_code == 200, scenario_id # Be sure we got the whole scenario. Again, we can't know what to expect # here, but at the very least, if there are verticals, they should not be # empty. That would be a sign that some data wasn't loaded properly while # rendering the scenario. html = lxml.html.fromstring(response.content) for vertical_tag in html.xpath('//div[@class="vertical"]'): # No vertical tag should be empty. assert list(vertical_tag), "Empty <vertical> shouldn't happen!"
def _download_file(url, name): r = requests.get(url) r = r.text html = lxml.html.parse(StringIO(r)) url = html.xpath( "//div[@class=\"simulation-main-image-panel\"]/a[2]/@href")[0] ftype = os.path.splitext(url)[1] url = 'https://phet.colorado.edu' + url path = '/home/chen/文档/Phet/' + name + ftype path1 = os.path.split(path)[0] if not os.path.exists(path1): os.mkdir(path1) if not os.path.exists(path): r = requests.get(url, stream=True) with open(path, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush() print('Download %s finished' % name) return
def parse_title(text):
    s = ""
    title_re = re.compile(r"\s")
    html = lxml.html.fromstring(text)
    title = html.xpath('//a[@class="strongbox"]//text()')[0]
    title = re.sub(title_re, '', title)
    for i in title:
        encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'b(', '').strip()
        try:
            num, code = make_dict()
            if len(encode_str) != 4:
                i = i
            elif int(encode_str, 16) not in code:
                i = i
            else:
                i = str(num[code[int(encode_str, 16)]] - 1)
            s += i
        except:
            s = "None"
    return s
def find_by_xpath(self, xpath, original_find=None, original_selector=None): html = self.htmltree elements = [] for xpath_element in html.xpath(xpath): if self._element_is_link(xpath_element): return self._find_links_by_xpath(xpath) elif self._element_is_control(xpath_element): elements.append((DjangoClientControlElement, xpath_element)) else: elements.append((DjangoClientElement, xpath_element)) find_by = original_find or "xpath" query = original_selector or xpath return ElementList([ element_class(element, self) for element_class, element in elements ], find_by=find_by, query=query)
def latest_content(url):
    '''
    Fetch the text content of a real-time financial news article.

    Parameter
    --------
        url: link to the news article

    Return
    --------
        string: the text content of the article
    '''
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@id=\"artibody\"]/p')
        sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr).replace('　', '')  # .replace('\n\n', '\n').
        html_content = lxml.html.fromstring(sarr)
        content = html_content.text_content()
        return content
    except Exception as er:
        print str(er)
def final_crawl(self):
    print("Length of url_list: {}".format(len(self.url_list)))
    for url in self.url_list:
        url = self.domain.format(url)
        req = urllib.request.Request(url, headers=self.header, method="GET")
        resp = urllib.request.urlopen(req)
        html = lxml.html.fromstring(resp.read().decode("gbk"))
        ps = html.xpath('//span[@id="yf_content"]/p')
        data = {
            "name": ps[0].text_content().split('：')[1],
            "addr": ps[2].text_content().split('：')[1],
            "area": self.current_area,
            "tel": ps[1].text_content().split('：')[1],
            "time": ps[3].text_content().split('：')[1],
        }
        # print(data)
        self.save2db(data)
def get_items_in_PNAS(page): html = lxml.html.fromstring(page.content) article_section = html.xpath( "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-pnas-list-complete clearfix']" ) try: #for a_sec in reversed(article_section): for a_sec in article_section: a = article_module.Aritcle() # get item sections title_sec = a_sec.xpath(".//span[@class='highwire-cite-title']") # get items a.title_e = ''.join(title_sec[0].itertext()) pass except IndexError: raise IndexError
def creatorCheck(puzzlePage):
    # check to see if it was created by me - returns True or False
    log.debug("Starting creatorCheck on " + puzzlePage.url)
    html = lxml.html.fromstring(puzzlePage.text)
    creatorSet = html.xpath(r'//a[@itemprop="creator"]/child::text()')
    if len(creatorSet) == 1:
        if creatorSet[0] == username:
            # this puzzle is mine
            log.debug("created by me")
            return True
        else:
            log.debug("created by someone else " + creatorSet[0])
            return False
    elif len(creatorSet) > 1:
        # multiple creator links, so I don't know what to do
        log.warning("multiple creators")
        log.warning(creatorSet)
        return False
    else:
        log.warning("No Creator found on page " + puzzlePage.url)
        return False
def scrapeDepartment(agent, url): print "scrapeDepartment" agent.open(url) rawResponse = agent.response().read() html = lxml.html.fromstring(rawResponse) rows = html.xpath( '//table[@id="curriculum_total_program_browser_table"]/tbody/tr') total_program_ids = [] for row in rows: departement = row.xpath('td[1]/text()') row = row.xpath('td[2]/div/ul/li/a/@href') if len(row) > 0: ## index = row[0].rfind('=') print departement, row[ 0] + "&curriculum_total_program_browser_table_per_page=all" total_program_ids.append( row[0] + "&curriculum_total_program_browser_table_per_page=all") return total_program_ids
def get_params(html):
    """
    get params from html
    :param html:
    :return:
    """
    data_init = json.loads(
        html.xpath("//div[@class='zh-general-list clearfix']/@data-init")[0])
    params = data_init['params']
    order_by = params['order_by']
    hash_id = params['hash_id']
    node_name = data_init['nodename']
    ret_params = {
        "params": {
            "offset": 0,
            "order_by": order_by,
            "hash_id": hash_id
        },
        "nodename": node_name
    }
    return ret_params
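# A small sketch of how get_params() above might be driven, assuming the target is a
# Zhihu-style listing page whose data-init attribute carries the paging state; the
# URL and headers passed in are placeholders supplied by the caller.
import lxml.html
import requests

def fetch_listing_params(url, headers=None):
    resp = requests.get(url, headers=headers or {})
    tree = lxml.html.fromstring(resp.text)
    return get_params(tree)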
def find_by_xpath(self, xpath, original_find=None, original_query=None): html = self.htmltree elements = [] for xpath_element in html.xpath(xpath): if self._element_is_link(xpath_element): return self._find_links_by_xpath(xpath) elif self._element_is_control(xpath_element) and xpath_element.name: return self.find_by_name(xpath_element.name) else: elements.append(self.get_control(xpath_element)) find_by = original_find or "xpath" query = original_query or xpath return ElementList( [ZopeTestBrowserElement(element, self) for element in elements], find_by=find_by, query=query, )
def get_webpage(url, clean_text=False):
    # download page
    try:
        connection = urllib.request.urlopen(url, timeout=2)
    except timeout:
        return [], None, "timeout"
    except Exception as e:
        return [], None, "fail"
    page = connection.read()
    connection.close()
    # parse page
    html = lxml.html.fromstring(page)
    # extract links
    links = html.xpath("//a/@href")
    # complete relative links
    complete_links = [urllib.parse.urljoin(url, link) for link in links]
    # get text
    text = page
    if clean_text:
        # initialize cleaner
        cleaner = lxml.html.clean.Cleaner()
        cleaner.javascript = True  # True to activate the javascript filter
        cleaner.style = True  # True to activate the styles & stylesheet filter
        # extract text
        text = cleaner.clean_html(html).text_content()
        # remove special characters
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
    return complete_links, text, None
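# A short usage sketch for get_webpage() above: fetch one page, report the outcome,
# and fall back gracefully on errors. The example URL is only a placeholder.
def crawl_one(url="https://example.com"):
    links, text, error = get_webpage(url, clean_text=True)
    if error is not None:
        print("fetch failed:", error)
        return []
    print("found {} outgoing links".format(len(links)))
    return links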
def get_text(self): """ 拿取房源信息 :return: 信息列表 """ name_list = [] house_type_list = [] position_list = [] direction_list = [] money_list = [] for i in range(1, 200): url = "https://hz.zu.anjuke.com/fangyuan/xiaoshan/p{}-px7/".format( i) print(url) rsp = requests.get(url=url, headers=self.headers, proxies=self.ip) html = etree.HTML(rsp.text) for i in range(3, 61): div = html.xpath( "//div[@id='list-content']/div[{}]".format(i))[0] name = div.xpath("./a/@title")[0] name_list.append(name) house_type = div.xpath("./div[@class='zu-info']/p[@class='details-item tag']/text()")[1] + \ div.xpath("./div[@class='zu-info']/p[@class='details-item tag']/text()")[2].strip() house_type_list.append(house_type) c = div.xpath( "./div[@class='zu-info']/address[@class='details-item']//text()" ) position = [c[1] + c[2].strip() if len(c) > 2 else c][0] position_list.append(position) a = div.xpath("./div[@class='zu-info']/p[2]//text()") direction = [a[1] + a[3] + a[5] if len(a) > 5 else a[1] + a[3]][0] direction_list.append(direction) money = div.xpath("./div[@class='zu-side']//text()")[ 1] + div.xpath("./div[@class='zu-side']//text()")[2] money_list.append(money) item_demo = list( zip(name_list, house_type_list, position_list, direction_list, money_list)) return item_demo
def parseIndexInfo(html): global indexInfo aa = [] for script in html.xpath("//script"): # print(script.text) if type(script.text) is not str: continue cs = re.match( ".*var cs='(.*?)'.*?var.*?var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*src=(.*?).jpg", script.text, re.S | re.M) if cs: css = cs.group(1) urlbase = cs.group(6) aa.append([1, cs.group(2), urlbase.find(cs.group(2))]) aa.append([2, cs.group(3), urlbase.find(cs.group(3))]) aa.append([3, cs.group(4), urlbase.find(cs.group(4))]) aa.append([4, cs.group(5), urlbase.find(cs.group(5))]) maxIndex = max(aa, key=lambda b: b[2]) currSum = 0 for a in aa: a.append(currSum) if a[0] == maxIndex[0]: currSum += 40 else: currSum += 2 a.append(currSum) aa.sort(key=lambda bb: bb[2]) break # print(aa) for a in aa: value = (a[3], a[4]) # print(str(aa.index(a))) if aa.index(a) == 0: indexInfo["pageCount"] = value elif aa.index(a) == 1: indexInfo["key"] = value elif aa.index(a) == 2: indexInfo["chapter"] = value elif aa.index(a) == 3: indexInfo["subfix"] = value
def main(): """ The function that parses the FiveM page into JSON that LambentLight/ServerManager can read. """ # If the number of argument is not three if len(sys.argv) != 3: print(f"Wrong number of arguments. Expected 3, got {len(sys.argv)}") sys.exit(2) # First we need to check that the file with the HTML exists if not os.path.isfile(sys.argv[1]): # Print a message and exit with a code 2 print("The file with the builds does not exists!") sys.exit(3) # Load the contents into the lxml parser html = lxml.html.parse(sys.argv[1]) # Get the a nodes a_nodes = html.xpath("//a[@class='panel-block ']") # Create a list for storing our builds builds = [] # For each a node that we have for node in a_nodes: # Try to search the respective regex on the href regex = re.search(REGEX, node.attrib.get("href", "")) # If the regex was able to find the group if regex is not None and regex.group(1): # Add the item into our list builds.append(regex.group(1)) # Open a file for writing the builds with open(sys.argv[2], "w") as output: # Dump the list of builds json.dump(builds, output, indent=4) # And finally and a line at the end output.write("\n")
def parseItems(self, html, response): houselist = html.xpath( ".//ul[@class='house-list-wrap']//div[@class='list-info']") items = [] for houseinfo in houselist: detailurl = houseinfo.xpath(".//h2[1]/a/@href")[0] imageurl = houseinfo.xpath("./preceding-sibling::div[1]//a/@href") title = "".join(houseinfo.xpath(".//h2[1]/a/text()")) roomNum = "".join( houseinfo.xpath(".//p[1]/span[1]/text()")[0].split()) size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()")) orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()")) floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()")) address = "".join(("".join( houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split()) sumprice = "".join( houseinfo.xpath( "./following-sibling::div[1]//p[@class='sum']/b/text()")) unitprice = "".join( houseinfo.xpath( "./following-sibling::div[@class='price']//p[@class='unit']/text()" )) fromUrl = response.url key = fromUrl.split("//")[1] key = key.split(".")[0] city = self.urlMap[key] items.append( HouseItem(_id=detailurl, title=title, roomNum=roomNum, size=NumberUtil.fromString(size), orient=orient, floor=floor, address=address, sumPrice=NumberUtil.fromString(sumprice), unitPrice=NumberUtil.fromString(unitprice), imageurl=imageurl, city=city, fromUrl=fromUrl)) return items
def save(self, *args, **kwargs): # Update the Project's URIs docutils_settings = getattr(settings, "RESTRUCTUREDTEXT_FILTER_SETTINGS", {}) docutils_settings.update({"warning_stream": os.devnull}) try: html_string = publish_string(source=smart_str(self.description), writer_name="html4css1", settings_overrides=docutils_settings) if html_string.strip(): html = lxml.html.fromstring(html_string) for link in html.xpath("//a/@href"): if len(link) > 400: # @@@ ugly as sin, but fixes shit for now continue try: if any(urlparse.urlparse(link)[:5]): PackageURI.objects.get_or_create( package=self.package, uri=link) except ValueError: pass except Exception: # @@@ We Swallow Exceptions here, but it's the best way that I can think of atm. pass super(Release, self).save(*args, **kwargs) _current_show_install_command = self.show_install_command if self.classifiers.filter(trove="Framework :: Plone").exists(): self.show_install_command = False else: self.show_install_command = True if _current_show_install_command != self.show_install_command: super(Release, self).save(*args, **kwargs)
def _parse_fq_data(url, index, retry_count, pause, proxies=[]): import random for _ in range(retry_count): proxy = random.choice(proxies) if proxies else None time.sleep(pause) try: request = Request(url) text = None if not proxy: text = urlopen(request, timeout=10).read() else: text = build_opener(ProxyHandler(proxy)).open(request, timeout=10).read() text = text.decode('GBK') html = lxml.html.parse(StringIO(text)) res = html.xpath('//table[@id=\"FundHoldSharesTable\"]') if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) if sarr == '': return None df = pd.read_html(sarr, skiprows = [0, 1])[0] if len(df) == 0: return pd.DataFrame() if index: df.columns = ct.HIST_FQ_COLS[0:7] else: df.columns = ct.HIST_FQ_COLS if df['date'].dtypes == np.object: df['date'] = pd.to_datetime(df['date']) df = df.drop_duplicates('date') except ValueError as e: # 时间较早,已经读不到数据 return None except Exception as e: print(e) else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)
def set_knockout_template(formset, request, opts: dict=None): if opts is None: opts = {} _opts = { 'formset_form_class': 'form-empty', 'inline_title': getattr(formset, 'inline_title', formset.model._meta.verbose_name), 'layout_classes': get_layout_classes(), } _opts.update(opts) renderer = render_form(request, 'inline', formset.empty_form, { 'caller': {}, 'opts': _opts, }) empty_form_str = renderer.__str__() # return str(empty_form_str) html = lxml.html.parse(StringIO(empty_form_str)) for element in html.xpath("//*[@id or @name or @for]"): # sdv.dbg('element', element) data_bind_args = [] for attr in ['for', 'id', 'name']: if attr in element.attrib: attr_parts = element.attrib[attr].split('__prefix__') if len(attr_parts) == 2: attr_parts = to_json(attr_parts[0]) + ' + ($index() + $parent.serversideFormsCount) + ' + to_json(attr_parts[1]) data_bind_args.append(to_json(attr) + ': ' + attr_parts) del element.attrib[attr] # sdv.dbg('data_bind_args', data_bind_args) if len(data_bind_args) > 0: data_bind = 'attr: {' + ', '.join(data_bind_args) + '}' # sdv.dbg('data_bind', data_bind) element.attrib['data-bind'] = data_bind knockout_template = tostring(html, method='html', encoding='utf-8', standalone=True).decode('utf-8') # sdv.dbg('knockout_template before', knockout_template) body_begin = knockout_template.find('<body>') body_end = knockout_template.rfind('</body>') if body_begin == -1 or body_end == -1: sdv.dbg('failed ko template', knockout_template) raise ValueError('Knockout template is not wrapped in body tag') # sdv.dbg('knockout_template after', formset.knockout_template) formset.knockout_template = knockout_template[body_begin + len('<body>'):body_end]
def get_jfac(self, name, url):
    """gets membership info for the Joint Finance and Appropriations Committee."""
    jfac_page = self.urlopen(url)
    html = lxml.html.fromstring(jfac_page)
    table = html.xpath('body/table/tr/td[2]/table')[0]
    committee = Committee('joint', name)
    for row in table.xpath('tr')[1:]:
        senate, house = row.xpath('td/strong')
        senate = senate.text.replace(u'\xa0', ' ')
        house = house.text.replace(u'\xa0', ' ')
        if ',' in senate:
            committee.add_member(*senate.split(','), chamber='upper')
        else:
            committee.add_member(senate, chamber='upper')
        if ',' in house:
            committee.add_member(*house.split(','), chamber='lower')
        else:
            committee.add_member(house, chamber='lower')
    committee.add_source(url)
    self.save_committee(committee)
def get_date_ohlc(exchange, symbol, date): print exchange, symbol, date for _ in range(3): try: time.sleep(0.005) page_url = 'https://www.google.com.hk/finance/historical?q=%s:%s' % ( exchange, symbol) r = s.get(page_url, proxies=proxies) html = lxml.html.parse(StringIO(r.text)) res = html.xpath('//table[@class=\"gf-table historical_price\"]') sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) if sarr == '': return None df = pd.read_html(sarr, skiprows=[0])[0] df.columns = ['date', 'open', 'high', 'low', 'close', 'amount'] df = df.drop('amount', axis=1) def date_to_int(s): y, m, d = s.split("-") return int(y) * 10000 + int(m) * 100 + int(d) df['date'] = df['date'].apply(date_to_int) # df['date'] = pd.to_datetime(df['date'], format=u"%Y-%m-%d") df = df.drop_duplicates('date') cmp_d = int(date.strftime("%Y%m%d")) df = df[df.date == cmp_d] if len(df) > 0: df['date'] = int(date.strftime("%Y%m%d")) code = get_code(symbol) assert code > 0, 'symbol code is %s' % code df.insert(0, 'code', code) df = df.set_index('code') return df return None except Exception as e: print traceback.format_exc() yyhtools.error(traceback.format_exc()) return None
def latest_content(net, url):
    """
    Fetch the text content of a real-time financial news article.
    :param net: name of the target site
    :param url: link to the news article
    :return: string, the text content of the article
    """
    content = ''
    try:
        html = lxml.html.parse(url, parser=etree.HTMLParser(encoding='utf-8'))
        res = html.xpath(xpaths[net])
        p_str_list = [
            etree.tostring(node).strip().decode('utf-8') for node in res
        ]
        content = '\n'.join(p_str_list)
        html_content = lxml.html.fromstring(content)
        content = html_content.text_content()
        content = re.sub(r'(\r*\n)+', '\n', content)
    except Exception as e:
        print(e)
    return content
def parseDescription(self, data):
    x = ['//*[@id="section-overview"]/mat-card/div[2]/fields-card[{}]/div/span']
    html = lxml.html.fromstring(data)
    res = []
    for i in range(1, 5):
        elems = html.xpath(x[0].format(i))
        if len(elems):
            itr = iter(elems)
            while True:
                try:
                    res.append([
                        next(itr).text_content().replace(u'\xa0', u''),
                        next(itr).text_content().replace(u'\xa0', u'')
                    ])
                except StopIteration:
                    break
    return {'header': 'Description', 'lst': res}