Example #1
    def crawl_step2(self):
        for p in range(2, self.total_page + 1):

            data = {
                "__VIEWSTATE": self.__VIEWSTATE,
                "__EVENTVALIDATION": self.__EVENTVALIDATION,
                "__EVENTTARGET": self.__EVENTTARGET,
            }

            print("crawling page {}".format(p))
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": random.choice(self.USER_AGENTS),
                "Cookie": self.cookie
            }
            browser = requests.post(self.URL, headers=headers, data=urlencode(data))
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)

                view_state_div = html.xpath('//input[@id="__VIEWSTATE"]')
                self.__VIEWSTATE = view_state_div[0].attrib["value"]
                event_valid_div = html.xpath('//input[@id="__EVENTVALIDATION"]')
                self.__EVENTVALIDATION = event_valid_div[0].attrib["value"]
                self.__EVENTTARGET = "Navigate$btnNavNext"

                links = html.xpath('//table[@id="DgList"]/tr/td[2]/a')
                for link in links:
                    self.urls.append(self.BASE_URL + str(link.attrib["href"]))

                self.final_crawl()

            else:
                print("Error while crawling page {}".format(p))
                self.final_crawl()
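Example #1 pages through an ASP.NET listing by re-posting the hidden __VIEWSTATE / __EVENTVALIDATION fields scraped from each response, together with an __EVENTTARGET naming the next-page button. A minimal, self-contained sketch of just that extraction step, assuming nothing beyond requests and lxml (the function name and the timeout are illustrative, not part of the original class):

import requests
import lxml.html

def get_postback_fields(url):
    # Fetch the page and pull out the standard ASP.NET postback fields.
    doc = lxml.html.fromstring(requests.get(url, timeout=10).text)
    fields = {}
    for name in ("__VIEWSTATE", "__EVENTVALIDATION"):
        nodes = doc.xpath('//input[@id="%s"]' % name)
        if nodes:
            fields[name] = nodes[0].attrib.get("value", "")
    return fields

Re-posting these fields (plus the __EVENTTARGET of the pager button) is what advances the listing to the next page, which is exactly what crawl_step2 does above.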
Example #2
def getImcexHTML(myObject):
    try:
        website = getHtml(myObject.baseUrl + myObject.currency)
        website_html = website.read()
        html = lxml.html.fromstring(website_html)

        #fill myObject.bids
        #get_bids = '//article[@class = "' + orderbookBids + '"]//table//tr//node()[text()]'
        vol_html = '/html/body/div/div[2]/div/section/div/div/table/tr/td/table/tr[*]/td[2]/strong//text()'
        price_html = ''
        vol_list = html.xpath(vol_html)
        #since vol_list returns vol of bids and asks
        bids_vol = vol_list[: len(vol_list) // 2]
        asks_vol = vol_list[len(vol_list) // 2:]

        startpos = 2
        for index in xrange(startpos, 22):
            price_html = '/html/body/div/div[2]/div/section/div/div/table/tr/td/table/tr[' + str(index) + ']/td[4]//text()'
            price_list = html.xpath(price_html)         #['\n4.1400 LREUR\n', '\n8.9900 ', 'LREUR', '\n']
            price_bid = float(price_list[0][1:7])
            price_ask = float(price_list[1][1:7])

            myObject.bids.append([price_bid, float(bids_vol[index - startpos])])
            myObject.asks.append([price_ask, float(asks_vol[index - startpos])])

        #check if orientation right, else list.reverse
        checkOrientation(myObject)
    except:
        e = sys.exc_info()[1]
        errorFunction(e, "<EXITING><getImcexHTML>: ")
        sys.exit()
Example #3
File: crawler.py  Project: ariel1234/gkp
	def major_crawler (self, major, url):
		print major.code, major.name, url
		content = self.tor_util.request(url)
		html = lxml.html.document_fromstring(content)

		degree = None
		h3 = html.xpath('//div[@class="majorBase"]/h3')
		if len(h3) == 3:
			degree = h3[1].text_content().strip()
			degree = degree[degree.find(u':')+1:].strip()
			how_long = h3[2].text_content().strip()
			how_long = how_long[how_long.find(u':')+1:].strip()
		elif len(h3) == 2:
			how_long = h3[1].text_content().strip()
			how_long = how_long[how_long.find(u':')+1:].strip()

		try: course = html.xpath('//div[@class="majorBase"]/div[@class="course"]/p')[0].text_content().strip()
		except: course = None
		description = html.xpath('//div[@class="majorCon"]')[0].text_content().strip()
		if degree and major.degree is None: major.degree = degree
		major.how_long = how_long
		major.course = course
		major.description = description
		major.save()

		related = html.xpath('//div[@class="majorBase"]/div[@class="course"]/a')
		for r in related:
			m,created = MyMajor.objects.get_or_create(name=r.text_content().strip())
			major.related_majors.add(m)
Example #4
def get_information_from_page(url_list, asynchronous=False):
    
    if asynchronous:
        results = []  # collect results across all batches
        for urls in url_list:
            rs = (grequests.get(u, stream=False) for u in urls)
            responses = grequests.map(rs)
            for r in responses:
                result = {}
                html = lxml.html.fromstring(r.text)
                posting_body = html.xpath('//div[@class="postingBody"]')
                result["textbody"] = [i.text_content() for i in posting_body]
                result['pictures'] = html.xpath('//ul[@id="viewAdPhotoLayout"]/li/a/@href')
                result['url'] = r.url
                results.append(result)
                r.close()
        return results
            
    else:
        r = requests.get(url_list)
        html = lxml.html.fromstring(r.text)
        posting_body = html.xpath('//div[@class="postingBody"]')
        textbody = [i.text_content() for i in posting_body]
        pictures = html.xpath('//ul[@id="viewAdPhotoLayout"]/li/a/@href')
        return textbody,pictures
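Note that the asynchronous branch expects url_list to be a list of URL batches (a list of lists), while the synchronous branch takes a single URL string. A hedged usage sketch with placeholder URLs:

# asynchronous: one batch of two placeholder URLs
results = get_information_from_page(
    [["https://example.com/post/1", "https://example.com/post/2"]],
    asynchronous=True)

# synchronous: a single URL string
textbody, pictures = get_information_from_page("https://example.com/post/1")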
Example #5
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(
                rv.LHB_SINA_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"], rv.LHB_KINDS[2], ct.PAGES["fd"], last, pageNo)
            )
            text = urlopen(request, timeout=10).read()
            text = text.decode("GBK")
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode("utf-8") for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = "".join(sarr)
            sarr = "<table>%s</table>" % sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2, 3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r"\d+", nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
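Example #5 and many of the later finance snippets share one pattern: match the <tr> nodes with XPath, serialize them back to markup with etree.tostring, wrap them in a single <table>, hand the string to pandas.read_html, and then read the next page number out of the pager's onclick attribute. A condensed, hedged sketch of that pattern, where doc is the parsed document returned by lxml.html.parse or lxml.html.fromstring (the helper names and the default row XPath are illustrative):

import re
import pandas as pd
from lxml import etree

def rows_to_dataframe(doc, row_xpath='//table[@id="dataTable"]/tr'):
    # Serialize the matched rows and let pandas rebuild the table.
    rows = doc.xpath(row_xpath)
    sarr = ''.join(etree.tostring(node, encoding='unicode') for node in rows)
    return pd.read_html('<table>%s</table>' % sarr)[0]

def next_page_number(doc):
    # The pager keeps the next page number inside the last link's onclick.
    onclick = doc.xpath('//div[@class="pages"]/a[last()]/@onclick')
    return int(re.findall(r'\d+', onclick[0])[0]) if onclick else None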
Example #6
def _get_report_data(year, quarter, pageNo, dataArr):
    url = ct.REPORT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'],
                         year, quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]//span/a/text()')[0]
            name = trs.xpath('td[2]/span/a/text()')[0]
            eps = trs.xpath('td[3]/text()')[0] # earnings per share (yuan)
            eps_yoy = trs.xpath('td[4]/text()')[0] # EPS year-on-year (%)
            bvps = trs.xpath('td[5]/text()')[0] # net assets per share (yuan)
            bvps = '0' if bvps == '--' else bvps
            roe = trs.xpath('td[6]/text()')[0] # return on equity (%)
            roe = '0' if roe == '--' else roe
            epcf = trs.xpath('td[7]/text()')[0] # cash flow per share (yuan)
            epcf = '0' if epcf == '--' else epcf
            net_profits = trs.xpath('td[8]/text()')[0] # net profit (10k yuan)
            profits_yoy = trs.xpath('td[9]/text()')[0] # net profit year-on-year (%)
            distrib = trs.xpath('td[10]/text()')[0] # distribution plan
            report_date = trs.xpath('td[11]/text()')[0] # report release date
            dataArr.append([code, name, eps, eps_yoy, bvps, roe,
                            epcf, net_profits, profits_yoy, distrib,
                            report_date])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #7
    def scrape(self, links=[], ads=True, translator=False):
        responses = []
        data = []
        
        if ads:
            for link in links:
                r = requests.get(link)
                responses.append(r)
        else:
            for link in links:
                r = requests.get(link)
                text = unidecode(r.text)
                html = lxml.html.fromstring(text)

                links = html.xpath("//div[@class='cat']/a/@href")
                for link in links:
                    if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                        time.sleep(random.randint(5,27))
                    try:
                        responses.append(requests.get(link))
                        print link
                    except requests.exceptions.ConnectionError:
                        print "hitting connection error"
                        continue

        for r in responses:
            values = {}  # fresh dict per response so earlier results are not overwritten
            text = r.text
            html = lxml.html.fromstring(text)
            values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content()
            values["link"] = unidecode(r.url)
            values["new_keywords"] = []
            try:
                values["images"] = html.xpath("//img/@src")
            except IndexError:
                values["images"] = "weird index error"
            pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n","").replace("\r","")  
            values["text_body"] = pre_decode_text 
            try:
                values["posted_at"] = html.xpath("//div[class='adInfo']")[0].text_content().replace("\n"," ").replace("\r","")
            except IndexError:
                values["posted_at"] = "not given"
            values["scraped_at"] = str(datetime.datetime.now())
            body_blob = TextBlob(values["text_body"])
            title_blob = TextBlob(values["title"])
            values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api
            values["polarity"] = body_blob.polarity
            values["subjectivity"] = body_blob.sentiment[1]
            if values["language"] != "en" and not translator:
                values["translated_body"] = body_blob.translate(from_lang="es")
                values["translated_title"] = title_blob.translate(from_lang="es")
            else:
                values["translated_body"] = "none"
                values["translated_title"] = "none"
            text_body = values["text_body"]
            title = values["title"]
            values["phone_numbers"] = self.phone_number_parse(values)
            data.append(values)
        
        return data
Example #8
def use_lxml_twitter_list(body):
    c = conn.cursor()
    html = body_clean(body)
    #print html
    paths = '//div[contains(text(), "#")]/..'
    results = html.xpath(paths)
    for r in results:
        #print tostring(r)
        rank = r[0].text.strip()
        tw_name = r[2][0].text.strip()
        #print rank, tw_name
        #tw_id = twitter_screenname_to_id(tw_name)
        klout_id, k_score = get_kloutid_short(tw_name)
        trst_score = get_trstrank_short(tw_name)
        f = open("./tweets/twibes.txt", 'a')
        f.write('%s\t%s\t%s\t%s\t%s\n'%(rank, tw_name, klout_id, k_score, trst_score))
        f.close()
        print rank, tw_name, klout_id, k_score, trst_score
        c.execute('INSERT OR IGNORE INTO tw (rank, tw_id, klout_id, klout_score, trstrank) VALUES (?,?,?,?,?)', (rank, tw_name, klout_id, k_score, trst_score))
        conn.commit()
    paths = '//a[contains(text(), "More...")]'
    results = html.xpath(paths)
    for r in results:
        more_link = r.get('href')
        print more_link
        twitter_rank_read(more_link)
    c.close()
Example #9
def _get_report_data(year, quarter, pageNo, dataArr, orderby):
    ct._write_console()
    try:
        request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'],
                                           year, quarter, pageNo, ct.PAGE_NUM[1], orderby))
        # the default sort order yields duplicate and missing rows, so an orderby parameter was added
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop(11, axis=1)
        df.columns = ct.REPORT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr,orderby)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #10
def _get_growth_data(year, quarter, pageNo, dataArr):
    url = ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                         quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            mbrg = trs.xpath('td[3]/text()')[0]
            mbrg = '0' if mbrg == '--' else mbrg
            nprg = trs.xpath('td[4]/text()')[0] 
            nprg = '0' if nprg == '--' else nprg
            nav = trs.xpath('td[5]/text()')[0] 
            nav = '0' if nav == '--' else nav
            targ = trs.xpath('td[6]/text()')[0] 
            targ = '0' if targ == '--' else targ
            epsg = trs.xpath('td[7]/text()')[0] 
            epsg = '0' if epsg == '--' else epsg
            seg = trs.xpath('td[8]/text()')[0] 
            seg = '0' if seg == '--' else seg
            dataArr.append([code, name, mbrg, nprg, nav, targ, epsg, seg])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #11
    def backupblog(self, url):

        req = urllib2.Request(url)
        try:
            self.operate = self.opener.open(req)
        except:
            print "Blog: %s downloading error! please check the url!" % url
        infocontent = self.operate.read()
        #print infocontent
        html = lxml.html.fromstring(infocontent.decode('utf8'))
        next = html.xpath('//span[@class="float-right"]/a')
        if len(next) != 0:
            nexturl = next[0].get('href')
        else:
            nexturl = None
        times = html.xpath('//span[@class="timestamp"]')
        if len(times) != 0:
            pubtime = times[0].text.strip().replace(':','-')
        else:
            pubtime = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
        titles = html.xpath('//h3[@class="title-article"]/strong')
        if len(titles) != 0:
            if platform.system() == "Windows":
                title = titles[0].text.strip().encode("gbk",'ignore')
                title = title.replace('\\','').replace(':','').replace('/','').replace('?','').replace('*','').replace('<','').replace('|','').replace('>','').replace('"','')
            else:
                title = titles[0].text.strip()
        else:
            title = "unknowntitle"

        fd = open(self.dir + pubtime + '----' + title + '.html', 'a+')
        fd.write(infocontent)
        fd.close()

        return nexturl
Example #12
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(
                rv.DP_163_URL % (ct.P_TYPE["http"], ct.DOMAINS["163"], ct.PAGES["163dp"], year, pageNo)
            )
            res = html.xpath('//div[@class="fn_rp_list"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode("utf-8") for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = "".join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df["divi"] = df["plan"].map(_fun_divi)
            df["shares"] = df["plan"].map(_fun_into)
            df = df.drop("plan", axis=1)
            df["code"] = df["code"].astype(object)
            df["code"] = df["code"].map(lambda x: str(x).zfill(6))
            if pageNo == 0:
                page = html.xpath('//div[@class="mod_pages"]/a')
                asr = page[len(page) - 2]
                pages = asr.xpath("text()")
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, pages[0]
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #13
def _get_forecast_data(year, quarter, pageNo, dataArr):
    url = ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                           quarter, pageNo, ct.PAGE_NUM[1])
    print 'getting data %s ...'%pageNo
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]//span/a/text()')[0]
            name = trs.xpath('td[2]/span/a/text()')[0]
            type = trs.xpath('td[3]/a/text()')
            type = type[0] if len(type)>0 else trs.xpath('td[3]/text()')[0]
            report_date = trs.xpath('td[4]/text()')[0] 
            pre_eps = trs.xpath('td[7]/text()')[0] 
            pre_eps = '0' if pre_eps == '--' else pre_eps
            range = trs.xpath('td[8]/text()')[0] 
            dataArr.append([code,name,type,report_date,pre_eps,range])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year,quarter,pageNo,dataArr)
        else:
            return dataArr
    except:
        pass
Example #14
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        gparser = etree.HTMLParser(encoding='GBK')
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], 
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]),
                               parser=gparser)
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+',nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #15
def _get_cashflow_data(year, quarter, pageNo, dataArr,
                       retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                    ct.PAGES['fd'], year,
                                                    quarter, pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@class=\"list_table\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = ct.CASHFLOW_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_cashflow_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #16
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'],
                         ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if len(res) == 0:
                return data
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False 
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data 
Example #17
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(
            ct.FORECAST_URL
            % (ct.P_TYPE["http"], ct.DOMAINS["vsf"], ct.PAGES["fd"], year, quarter, pageNo, ct.PAGE_NUM[1])
        )
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode("utf-8") for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = "".join(sarr)
        sarr = sarr.replace("--", "0")
        sarr = "<table>%s</table>" % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r"\d+", nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #18
def parsePage(html):
    
    # Dictionary to store info
    athInfo = {}
    
    #Now start populating our data object
    athInfo['AthleteName'] = html.cssselect("h2")[0].text
    athInfo['DivisionRank'] = html.cssselect("#rank *")[0].tail.strip()
    athInfo['OverallRank'] = html.cssselect("#div-rank *")[0].tail.strip()    

    infoFields = ['Bib', 'Division', 'Age', 'State', 'Country', 'Profession']
    detailsFields = ['TotalSwim', 'TotalBike', 'TotalRun', 'TotalTime']
    
    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text
    
    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    #have to use xpath to get T1 and T2 data
    athInfo['T1'] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo['T2'] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo['HasResults'] = 1
    athInfo['Scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=['Bib'], data=athInfo, table_name="RESULTS", verbose=0)
Example #19
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year,
                                                  quarter, pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=ct.HTTP_TIMEOUT).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #20
def _dist_cotent(year, pageNo, retry_count, pause):
    url = rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                     ct.PAGES['163dp'], year, pageNo)
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo>0:
                print rv.DP_MSG%pageNo
            html = lxml.html.parse(url)  
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                asr = page[len(page)-2]
                pages = asr.xpath('text()')
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, pages[0]
            else:
                return df
    raise IOError("Fetch failed; please check the network and the URL: %s" % url)
Example #21
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    url = ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                             quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            currentratio = trs.xpath('td[3]/text()')[0]
            currentratio = '0' if currentratio == '--' else currentratio
            quickratio = trs.xpath('td[4]/text()')[0] 
            quickratio = '0' if quickratio == '--' else quickratio
            cashratio = trs.xpath('td[5]/text()')[0] 
            cashratio = '0' if cashratio == '--' else cashratio
            icratio = trs.xpath('td[6]/text()')[0] 
            icratio = '0' if icratio == '--' else icratio
            sheqratio = trs.xpath('td[7]/text()')[0] 
            sheqratio = '0' if sheqratio == '--' else sheqratio
            adratio = trs.xpath('td[8]/text()')[0] 
            adratio = '0' if adratio == '--' else adratio
            dataArr.append([code, name, currentratio, quickratio, cashratio,
                            icratio, sheqratio, adratio])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #22
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    url = ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                           quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            cf_sales = trs.xpath('td[3]/text()')[0]
            cf_sales = '0' if cf_sales == '--' else cf_sales
            rateofreturn = trs.xpath('td[4]/text()')[0] 
            rateofreturn = '0' if rateofreturn == '--' else rateofreturn
            cf_nm = trs.xpath('td[5]/text()')[0] 
            cf_nm = '0' if cf_nm == '--' else cf_nm
            cf_liabilities = trs.xpath('td[6]/text()')[0] 
            cf_liabilities = '0' if cf_liabilities == '--' else cf_liabilities
            cashflowratio = trs.xpath('td[7]/text()')[0] 
            cashflowratio = '0' if cashflowratio == '--' else cashflowratio
            dataArr.append([code, name, cf_sales, rateofreturn, cf_nm,
                            cf_liabilities, cashflowratio])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #23
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):   
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], rv.LHB_KINDS[3],
                                               ct.PAGES['fd'], '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage)>0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #24
def is_date_range(html):
    postingbodies = html.xpath('id("postingbody")')
    if len(postingbodies) > 0:
        postingbody = postingbodies[0].text_content()
    else:
        warnings.warn('No #postingbody found on the page')
        return False

    titles = html.xpath('//title')
    if len(titles) > 0:
        title = titles[0].text_content()
    else:
        warnings.warn('No <title /> found on the page')
        return False


    for text in [title, postingbody]:
        if 'for the month' in text:
            return True

        body = iter(tokenize(text))
        for window in _ngrams(body):
#           print(window)
            if len(list(dates_in_tokens(window))) == 2:
                return True
            else:
                for i in range(1, len(window)):
                    if _is_end_date(window[0], window[1:i]):
#                       print(window[0], window[1:i])
                        return True
    return False
Example #25
def dates(html):
    'Return my weird list date format'
    postingbodies = html.xpath('id("postingbody")')
    if len(postingbodies) > 0:
        postingbody = postingbodies[0].text_content()
    else:
        warnings.warn('No #postingbody found on the page')
        return None, None

    titles = html.xpath('//title')
    if len(titles) > 0:
        title = titles[0].text_content()
    else:
        warnings.warn('No <title /> found on the page')
        return None, None

    for text in [title, postingbody]:
        body = iter(tokenize(text))
        for window in _ngrams(body):
            d = list(dates_in_tokens(window))
            if len(d) == 2:
                return tuple(d)
            else:
                for i in range(1, len(window)):
                    if _is_end_date(window[0], window[1:i]):
                        return None, window[1:i]
    return None, None
Example #26
def parsePage(html):

    # Dictionary to store info
    athInfo = {}

    # Now start populating our data object
    athInfo["ATHLETE_NAME"] = html.cssselect("h2")[0].text
    athInfo["DIVISION_RANK"] = html.cssselect("#rank *")[0].tail.strip()
    athInfo["OVERALL_RANK"] = html.cssselect("#div-rank *")[0].tail.strip()

    # infoFields = ['BIB', 'DIVISION', 'AGE', 'STATE', 'COUNTRY', 'PROFESSION']
    infoFields = ["BIB", "DIVISION", "STATE", "COUNTRY", "PROFESSION"]
    detailsFields = ["TOTAL_SWIM", "TOTAL_BIKE", "TOTAL_RUN", "TOTAL_TIME"]

    rows = html.cssselect("table#general-info tr")
    for i, stat in enumerate(infoFields):
        athInfo[stat] = rows[i][1].text

    rows = html.cssselect("table#athelete-details tr")
    for i, stat in enumerate(detailsFields):
        athInfo[stat] = rows[i][1].text

    # have to use xpath to get T1 and T2 data
    athInfo["T1"] = html.xpath("//tr[contains(td/text(), 'T1:')]/td[2]")[0].text_content()
    athInfo["T2"] = html.xpath("//tr[contains(td/text(), 'T2:')]/td[2]")[0].text_content()

    athInfo["HAS_RESULTS"] = 1
    athInfo["SCRAPED"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraperwiki.sqlite.save(unique_keys=["BIB"], data=athInfo, table_name="RESULTS", verbose=0)
Example #27
def release(html):
  return {
    'version': html.xpath('./span[@class="release-number"]/a/text()')[0],
    'download': 'https://www.python.org' + html.xpath('./span[@class="release-number"]/a/@href')[0],
    'date': html.xpath('./span[@class="release-date"]/text()')[0],
    'notes': html.xpath('./span[@class="release-enhancements"]/a/@href')[0]
  }
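release() above expects a single list-item element wrapping the release-number, release-date and release-enhancements spans. A hedged usage sketch; the container XPath is an assumption derived from those child selectors, not taken from the original project:

import requests
import lxml.html

def list_releases():
    doc = lxml.html.fromstring(requests.get("https://www.python.org/downloads/").text)
    # Assumption: any <li> carrying a release-number span is a release entry.
    items = doc.xpath('//li[span[@class="release-number"]]')
    return [release(item) for item in items]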
Example #28
def save_all_pdfs(links):
    final = []
    pdfs = []
    for link in links:
        r = requests.get(link)
        html = lxml.html.fromstring(r.text)
        possible_links = html.xpath("//a/@href")
        for i in possible_links:
            if "download" in i:
                final.append(i)

    for link in final:
        r = requests.get(link)
        html = lxml.html.fromstring(r.text)
        possible_pdfs = html.xpath("//a/@href")
        
        for i in possible_pdfs:
            if "pdf" in i:
                pdfs.append(i)
    
    for pdf in pdfs:
        name = pdf.split("/")[-1]
        with open(name,"wb") as f:
            r = requests.get(pdf)
            f.write(r.content)
Example #29
def _get_operation_data(year, quarter, pageNo, dataArr):
    url = ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                            quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            arturnover = trs.xpath('td[3]/text()')[0]
            arturnover = '0' if arturnover == '--' else arturnover
            arturndays = trs.xpath('td[4]/text()')[0] 
            arturndays = '0' if arturndays == '--' else arturndays
            inventory_turnover = trs.xpath('td[5]/text()')[0] 
            inventory_turnover = '0' if inventory_turnover == '--' else inventory_turnover
            inventory_days = trs.xpath('td[6]/text()')[0] 
            inventory_days = '0' if inventory_days == '--' else inventory_days
            currentasset_turnover = trs.xpath('td[7]/text()')[0] 
            currentasset_turnover = '0' if currentasset_turnover == '--' else currentasset_turnover
            currentasset_days = trs.xpath('td[8]/text()')[0] 
            currentasset_days = '0' if currentasset_days == '--' else currentasset_days
            dataArr.append([code, name, arturnover, arturndays, inventory_turnover,
                            inventory_days, currentasset_turnover, currentasset_days])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #30
def _get_profit_data(year, quarter, pageNo, dataArr):
    url = ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year,
                         quarter, pageNo, ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath("//table[@class=\"list_table\"]/tr")
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            roe = trs.xpath('td[3]/text()')[0]
            roe = '0' if roe == '--' else roe
            net_profit_ratio = trs.xpath('td[4]/text()')[0] 
            net_profit_ratio = '0' if net_profit_ratio == '--' else net_profit_ratio
            gross_profit_rate = trs.xpath('td[5]/text()')[0] 
            gross_profit_rate = '0' if gross_profit_rate == '--' else gross_profit_rate
            net_profits = trs.xpath('td[6]/text()')[0] 
            net_profits = '0' if net_profits == '--' else net_profits
            eps = trs.xpath('td[7]/text()')[0] 
            eps = '0' if eps == '--' else eps
            business_income = trs.xpath('td[8]/text()')[0] 
            business_income = '0' if business_income == '--' else business_income
            bips = trs.xpath('td[9]/text()')[0] 
            bips = '0' if bips == '--' else bips
            dataArr.append([code, name, roe, net_profit_ratio, gross_profit_rate,
                            net_profits, eps, business_income, bips])
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') # get the next page
        if len(nextPage)>0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #31
async def parse_gdq_schedule():
    global schedule, oauth_token

    h_t = requests.get("https://gamesdonequick.com/schedule").text
    html = lxml.html.fromstring(h_t)
    table = html.xpath(
        "//table[@id='runTable']//tbody//tr[not(contains(@class, 'second-row'))]"
    )

    gs = []
    for element in table:
        try:
            gtime = arrow.get(element.getchildren()[0].text)
        except arrow.parser.ParserError:
            continue
        game = element.getchildren()[1].text
        runners = element.getchildren()[2].text
        gs.append([gtime, game, runners])

    schedule = gs
Example #32
def getlist(url, GCase=True):
    rr = re.compile('(^http://)|(^https://)')
    r = requests.get(url, headers=hds())
    txt = r.content.decode('utf8')
    html = lxml.html.parse(StringIO(txt))
    lis = html.xpath('//div[@class="commonList_con"]/ul[@class="li_line"]/li')
    datasets = {}
    for li in lis:
        name = re.sub(r'\s*', '', li.xpath('a/text()')[0].strip())
        name = name.replace('“', '').replace('”', '')
        tm = li.xpath('span/text()')[0].strip()
        href = li.xpath('a/@href')[0].strip()
        name = tm + name
        if not rr.match(href):
            href = 'http://www.spp.gov.cn' + li.xpath('a/@href')[0]
        #print(name,href)
        if (name.endswith('指导性案例')) and GCase:
            datasets[name] = href
        else:
            datasets[name] = href
    return datasets
Example #33
 def web_driver_wait_ruishu(self, time: int, rule: str, num: str):
     """
     Brute-force approach: repeatedly scan the page for a match
     :param time: seconds to wait
     :param rule: attribute to match on [id, class]
     :param num: the element id (or class value) to look for
     :return:
    """
     while time:
         response = self.execute_js("document.documentElement.outerHTML")
         try:
             html = etree.HTML(text=response["value"])
             inp = html.xpath("//*[contains(@%s, '%s')]" % (rule, num))
             if inp:
                 break
         except Exception as e:
             continue
         time_.sleep(1)
         time -= 1
     if not time:
         raise Exception("未找到 %s" % num)
Example #34
def get_nips(i):
    data = []
    url = 'https://papers.nips.cc/book/advances-in-neural-information-processing-systems-{}-{}'.format(i,1987+i)
    url_base = 'https://papers.nips.cc'
    res = requests.get(url)
    html = lxml.html.fromstring(res.text)
    for li in tqdm(html.xpath('//div[@class="main wrapper clearfix"]/ul/li')):
        a = li.xpath('a')
        pdf_url = url_base + a[0].attrib['href'] + '.pdf'
        #print('title: ' + a[0].text)
        #print('authors: ' + ', '.join([author.text for author in a[1:]]))
        #print('url: ' + pdf_url)
        '''
        # rough attempt to check whether pdf_url is still alive
        stc = requests.get(pdf_url).status_code
        if stc >= 300:
            sys.stderr.write('!!! response: {}, title: {}'.format(stc, a[0].text))
            continue
        '''
        data.append({'title':a[0].text, 'authors':[author.text for author in a[1:]], 'pdf': pdf_url})
    return data
Example #35
 def find_comic_by_pubdate(self, pubdate: date) -> DilbertComic:
     print("Fetching comic for {}".format(pubdate))
     _debug("Fetching comic for {}".format(pubdate))
     url = "http://dilbert.com/strip/{}".format(pubdate.isoformat())
     _debug("Fetching url `{}`".format(url))
     res = self.fetch_url(url)
     _debug("Status: {}".format(res.status))
     body = res.read()
     html = lxml.html.document_fromstring(body)
     el = html.xpath(
         "//img[@class and contains(concat(' ', normalize-space(@class), ' '), ' img-comic ')]"
     )[0]
     comic = DilbertComic(
         pubdate=pubdate,
         url=el.get('src'),
         title=re.sub(' - .*$', '', el.get('alt')),
         filename=Path("{}.gif".format(pubdate.isoformat())),
         width=el.get('width'),
         height=el.get('height'),
     )
     return comic
Example #36
def main():
    print('Start crawl...')
    URL = 'https://pr-cy.ru/browser-details/'
    r = requests.get(URL)
    data = r.text
    html = lxml.html.fromstring(data)
    ip_address = html.xpath('//table[@class="table"]//div[@class="ip"]/text()')
    text_mail = settings.text_of_mail.replace('%myip%', ip_address[0])
    print('End crawl. Done!')

    with open('ip_log.json', 'w') as f:
        json.dump(ip_address[0], f, indent=4)

    print('Creating mail...')
    message = Mail(subject=settings.subject_mail,
                   from_addr=settings.send_from,
                   to_addr=settings.send_to,
                   user=settings.user,
                   password=settings.password)
    message.send(text_mail)
    print('Message sent!')
Example #37
 def scrape_joint_committees(self):
     url = 'http://legislature.idaho.gov/about/jointcommittees.htm'
     page = self.urlopen(url)
     html = lxml.html.fromstring(page)
     html.make_links_absolute(url)
     joint_li = html.xpath('//td[contains(h1, "Joint")]/ul/li')
     for li in joint_li:
         name, url = li[0].text, li[0].get('href')
         if 'Joint Finance-Appropriations Committee' in name:
             self.get_jfac(name, url)
         elif 'Joint Legislative Oversight Committee' in name:
             self.get_jlfc(name, url)
         elif name == 'Joint Millennium Fund Committee':
             self.get_jmfc(name, url)
         elif name == 'Economic Outlook and Revenue Assessment Committee':
             committee = Committee('joint', name)
             committee.add_source(url)
             # no membership available
             #self.save_committee(committee)
         else:
             self.log('Unknown committee: %s %s' % (name, url))
Example #38
    def test_scenario(self, scenario_id):
        """A very shallow test, just to see if the scenario loads all its blocks.

        We don't know enough about each scenario to know what each should do.
        So we load the scenario to see that the workbench could successfully
        serve it.

        """
        url = reverse('workbench_show_scenario', kwargs={'scenario_id': scenario_id})
        client = Client()
        response = client.get(url, follow=True)
        assert response.status_code == 200, scenario_id

        # Be sure we got the whole scenario.  Again, we can't know what to expect
        # here, but at the very least, if there are verticals, they should not be
        # empty.  That would be a sign that some data wasn't loaded properly while
        # rendering the scenario.
        html = lxml.html.fromstring(response.content)
        for vertical_tag in html.xpath('//div[@class="vertical"]'):
            # No vertical tag should be empty.
            assert list(vertical_tag), "Empty <vertical> shouldn't happen!"
Example #39
def _download_file(url, name):
    r = requests.get(url)
    r = r.text
    html = lxml.html.parse(StringIO(r))
    url = html.xpath(
        "//div[@class=\"simulation-main-image-panel\"]/a[2]/@href")[0]
    ftype = os.path.splitext(url)[1]
    url = 'https://phet.colorado.edu' + url
    path = '/home/chen/文档/Phet/' + name + ftype
    path1 = os.path.split(path)[0]
    if not os.path.exists(path1):
        os.mkdir(path1)
    if not os.path.exists(path):
        r = requests.get(url, stream=True)
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
        print('Download %s finished' % name)
    return
Example #40
def parse_title(text):
    s = ""
    title_re = re.compile("\s")
    html = lxml.html.fromstring(text)
    title = html.xpath('//a[@class="strongbox"]//text()')[0]
    title = re.sub(title_re, '', title)
    for i in title:
        encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace(
            '\'', '').replace(r'b(', '').strip()
        try:
            num, code = make_dict()
            if len(encode_str) != 4:
                i = i
            elif int(encode_str, 16) not in code:
                i = i
            else:
                i = str(num[code[int(encode_str, 16)]] - 1)
            s += i
        except:
            s = "None"
    return s
Example #41
    def find_by_xpath(self, xpath, original_find=None, original_selector=None):
        html = self.htmltree

        elements = []

        for xpath_element in html.xpath(xpath):
            if self._element_is_link(xpath_element):
                return self._find_links_by_xpath(xpath)
            elif self._element_is_control(xpath_element):
                elements.append((DjangoClientControlElement, xpath_element))
            else:
                elements.append((DjangoClientElement, xpath_element))

        find_by = original_find or "xpath"
        query = original_selector or xpath

        return ElementList([
            element_class(element, self) for element_class, element in elements
        ],
                           find_by=find_by,
                           query=query)
Example #42
def latest_content(url):
    '''
        Fetch the text of a real-time financial news article
    Parameter
    --------
        url: link to the news article

    Return
    --------
        string: the article's text content
    '''
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@id=\"artibody\"]/p')
        sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr).replace('&#12288;', '')  #.replace('\n\n', '\n').
        html_content = lxml.html.fromstring(sarr)
        content = html_content.text_content()
        return content
    except Exception as er:
        print str(er)
Example #43
    def final_crawl(self):
        print("Length of url_list: {}".format(len(self.url_list)))
        for url in self.url_list:
            url = self.domain.format(url)

            req = urllib.request.Request(url,
                                         headers=self.header,
                                         method="GET")
            resp = urllib.request.urlopen(req)
            html = lxml.html.fromstring(resp.read().decode("gbk"))
            ps = html.xpath('//span[@id="yf_content"]/p')

            data = {
                "name": ps[0].text_content().split(':')[1],
                "addr": ps[2].text_content().split(':')[1],
                "area": self.current_area,
                "tel": ps[1].text_content().split(':')[1],
                "time": ps[3].text_content().split(':')[1],
            }
            # print(data)
            self.save2db(data)
Example #44
def get_items_in_PNAS(page):
    html = lxml.html.fromstring(page.content)

    article_section = html.xpath(
        "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-pnas-list-complete clearfix']"
    )

    try:
        #for a_sec in reversed(article_section):
        for a_sec in article_section:
            a = article_module.Aritcle()

            # get item sections
            title_sec = a_sec.xpath(".//span[@class='highwire-cite-title']")

            # get items
            a.title_e = ''.join(title_sec[0].itertext())
            pass

    except IndexError:
        raise IndexError
Example #45
def creatorCheck(puzzlePage):
    # check to see if it was created by me - returns true or false
    log.debug("Starting creatorCheck on " + puzzlePage.url)
    html = lxml.html.fromstring(puzzlePage.text)
    creatorSet = html.xpath(r'//a[@itemprop="creator"]/child::text()')
    if len(creatorSet) == 1:
        if creatorSet[0] == username:
            # this puzzle is mine
            log.debug("created by me")
            return True
        else:
            log.debug("created by someone else " + creatorSet[0])
            return False
    elif len(creatorSet) > 1:
        # multiple creator links, so I don't know what to do
        log.warning("multiple creators")
        log.warning(creatorSet)
        return False
    else:
        log.warning("No Creator found on page " + puzzlePage.url)
        return False
Example #46
def scrapeDepartment(agent, url):
    print "scrapeDepartment"
    agent.open(url)
    rawResponse = agent.response().read()
    html = lxml.html.fromstring(rawResponse)
    rows = html.xpath(
        '//table[@id="curriculum_total_program_browser_table"]/tbody/tr')

    total_program_ids = []
    for row in rows:
        departement = row.xpath('td[1]/text()')
        row = row.xpath('td[2]/div/ul/li/a/@href')
        if len(row) > 0:
            ##                        index = row[0].rfind('=')
            print departement, row[
                0] + "&curriculum_total_program_browser_table_per_page=all"
            total_program_ids.append(
                row[0] +
                "&curriculum_total_program_browser_table_per_page=all")

    return total_program_ids
Example #47
def get_params(html):
    """
    get params from html
    :param html:
    :return:
    """
    data_init = json.loads(
        html.xpath("//div[@class='zh-general-list clearfix']/@data-init")[0])
    params = data_init['params']
    order_by = params['order_by']
    hash_id = params['hash_id']
    node_name = data_init['nodename']
    ret_params = {
        "params": {
            "offset": 0,
            "order_by": order_by,
            "hash_id": hash_id
        },
        "nodename": node_name
    }
    return ret_params
Example #48
    def find_by_xpath(self, xpath, original_find=None, original_query=None):
        html = self.htmltree

        elements = []

        for xpath_element in html.xpath(xpath):
            if self._element_is_link(xpath_element):
                return self._find_links_by_xpath(xpath)
            elif self._element_is_control(xpath_element) and xpath_element.name:
                return self.find_by_name(xpath_element.name)
            else:
                elements.append(self.get_control(xpath_element))

        find_by = original_find or "xpath"
        query = original_query or xpath

        return ElementList(
            [ZopeTestBrowserElement(element, self) for element in elements],
            find_by=find_by,
            query=query,
        )
Example #49
def get_webpage(url, clean_text=False):

  # download page

  try:
    connection = urllib.request.urlopen(url, timeout=2)
  except timeout:
    return [], None, "timeout"
  except Exception as e:
    return [], None, "fail" 

  page = connection.read()
  connection.close()

  # parse page
  html = lxml.html.fromstring(page)

  # extract links
  links = html.xpath("//a/@href")

  # complete relative links
  complete_links = [ urllib.parse.urljoin(url, link) for link in links ]

  # get text
  text = page

  if clean_text:

    # initialize cleaner
    cleaner = lxml.html.clean.Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter

    # extract text
    text = cleaner.clean_html(html).text_content()

    # remove special characters
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

  return complete_links, text, None
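The excerpt above relies on imports made elsewhere in its module (e.g. urllib.request, socket.timeout for the except clause, re, and lxml.html.clean). A brief usage sketch; the URL is a placeholder:

links, text, err = get_webpage("https://example.com", clean_text=True)
if err is None:
    print(len(links), "links found;", text[:80])
else:
    print("fetch failed:", err)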
Example #50
    def get_text(self):
        """
        Fetch rental listing information
        :return: list of listing records
        """
        name_list = []
        house_type_list = []
        position_list = []
        direction_list = []
        money_list = []
        for i in range(1, 200):
            url = "https://hz.zu.anjuke.com/fangyuan/xiaoshan/p{}-px7/".format(
                i)
            print(url)

            rsp = requests.get(url=url, headers=self.headers, proxies=self.ip)
            html = etree.HTML(rsp.text)
            for i in range(3, 61):
                div = html.xpath(
                    "//div[@id='list-content']/div[{}]".format(i))[0]
                name = div.xpath("./a/@title")[0]
                name_list.append(name)
                house_type = div.xpath("./div[@class='zu-info']/p[@class='details-item tag']/text()")[1] + \
                             div.xpath("./div[@class='zu-info']/p[@class='details-item tag']/text()")[2].strip()
                house_type_list.append(house_type)
                c = div.xpath(
                    "./div[@class='zu-info']/address[@class='details-item']//text()"
                )
                position = [c[1] + c[2].strip() if len(c) > 2 else c][0]
                position_list.append(position)
                a = div.xpath("./div[@class='zu-info']/p[2]//text()")
                direction = [a[1] + a[3] + a[5] if len(a) > 5 else a[1] + a[3]][0]
                direction_list.append(direction)
                money = div.xpath("./div[@class='zu-side']//text()")[
                    1] + div.xpath("./div[@class='zu-side']//text()")[2]
                money_list.append(money)
        item_demo = list(
            zip(name_list, house_type_list, position_list, direction_list,
                money_list))
        return item_demo
Example #51
File: _script.py  Project: xing2387/CCVE
def parseIndexInfo(html):
    global indexInfo
    aa = []
    for script in html.xpath("//script"):
        # print(script.text)
        if type(script.text) is not str:
            continue
        cs = re.match(
            r".*var cs='(.*?)'.*?var.*?var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*var\s*(.*?)\s*=\s*lc.*src=(.*?).jpg",
            script.text, re.S | re.M)
        if cs:
            css = cs.group(1)
            urlbase = cs.group(6)
            aa.append([1, cs.group(2), urlbase.find(cs.group(2))])
            aa.append([2, cs.group(3), urlbase.find(cs.group(3))])
            aa.append([3, cs.group(4), urlbase.find(cs.group(4))])
            aa.append([4, cs.group(5), urlbase.find(cs.group(5))])
            maxIndex = max(aa, key=lambda b: b[2])
            currSum = 0
            for a in aa:
                a.append(currSum)
                if a[0] == maxIndex[0]:
                    currSum += 40
                else:
                    currSum += 2
                a.append(currSum)
            aa.sort(key=lambda bb: bb[2])
            break
    # print(aa)
    for a in aa:
        value = (a[3], a[4])
        # print(str(aa.index(a)))
        if aa.index(a) == 0:
            indexInfo["pageCount"] = value
        elif aa.index(a) == 1:
            indexInfo["key"] = value
        elif aa.index(a) == 2:
            indexInfo["chapter"] = value
        elif aa.index(a) == 3:
            indexInfo["subfix"] = value
示例#52
0
def main():
    """
    The function that parses the FiveM page into
    JSON that LambentLight/ServerManager can read.
    """
    # If the number of arguments is not three
    if len(sys.argv) != 3:
        print(f"Wrong number of arguments. Expected 3, got {len(sys.argv)}")
        sys.exit(2)

    # First we need to check that the file with the HTML exists
    if not os.path.isfile(sys.argv[1]):
        # Print a message and exit with code 3
        print("The file with the builds does not exist!")
        sys.exit(3)

    # Load the contents into the lxml parser
    html = lxml.html.parse(sys.argv[1])
    # Get the a nodes
    a_nodes = html.xpath("//a[@class='panel-block ']")

    # Create a list for storing our builds
    builds = []

    # For each a node that we have
    for node in a_nodes:
        # Try to search the respective regex on the href
        regex = re.search(REGEX, node.attrib.get("href", ""))

        # If the regex was able to find the group
        if regex is not None and regex.group(1):
            # Add the item into our list
            builds.append(regex.group(1))

    # Open a file for writing the builds
    with open(sys.argv[2], "w") as output:
        # Dump the list of builds
        json.dump(builds, output, indent=4)
        # And finally add a trailing newline
        output.write("\n")
示例#53
0
    def parseItems(self, html, response):
        houselist = html.xpath(
            ".//ul[@class='house-list-wrap']//div[@class='list-info']")
        items = []
        for houseinfo in houselist:
            detailurl = houseinfo.xpath(".//h2[1]/a/@href")[0]
            imageurl = houseinfo.xpath("./preceding-sibling::div[1]//a/@href")
            title = "".join(houseinfo.xpath(".//h2[1]/a/text()"))
            roomNum = "".join(
                houseinfo.xpath(".//p[1]/span[1]/text()")[0].split())
            size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()"))
            orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()"))
            floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()"))
            address = "".join(("".join(
                houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split())
            sumprice = "".join(
                houseinfo.xpath(
                    "./following-sibling::div[1]//p[@class='sum']/b/text()"))
            unitprice = "".join(
                houseinfo.xpath(
                    "./following-sibling::div[@class='price']//p[@class='unit']/text()"
                ))
            fromUrl = response.url
            key = fromUrl.split("//")[1]
            key = key.split(".")[0]
            city = self.urlMap[key]
            items.append(
                HouseItem(_id=detailurl,
                          title=title,
                          roomNum=roomNum,
                          size=NumberUtil.fromString(size),
                          orient=orient,
                          floor=floor,
                          address=address,
                          sumPrice=NumberUtil.fromString(sumprice),
                          unitPrice=NumberUtil.fromString(unitprice),
                          imageurl=imageurl,
                          city=city,
                          fromUrl=fromUrl))
        return items
示例#54
0
    def save(self, *args, **kwargs):
        # Update the Project's URIs
        docutils_settings = getattr(settings,
                                    "RESTRUCTUREDTEXT_FILTER_SETTINGS", {})

        docutils_settings.update({"warning_stream": os.devnull})

        try:
            html_string = publish_string(source=smart_str(self.description),
                                         writer_name="html4css1",
                                         settings_overrides=docutils_settings)
            if html_string.strip():
                html = lxml.html.fromstring(html_string)

                for link in html.xpath("//a/@href"):
                    if len(link) > 400:
                        # @@@ ugly as sin, but fixes shit for now
                        continue

                    try:
                        if any(urlparse.urlparse(link)[:5]):
                            PackageURI.objects.get_or_create(
                                package=self.package, uri=link)
                    except ValueError:
                        pass
        except Exception:
            # @@@ We Swallow Exceptions here, but it's the best way that I can think of atm.
            pass

        super(Release, self).save(*args, **kwargs)

        _current_show_install_command = self.show_install_command

        if self.classifiers.filter(trove="Framework :: Plone").exists():
            self.show_install_command = False
        else:
            self.show_install_command = True

        if _current_show_install_command != self.show_install_command:
            super(Release, self).save(*args, **kwargs)
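The pattern used in save() above (render reStructuredText to HTML with docutils, then mine the result with lxml) can be exercised on its own. A minimal standalone sketch, assuming docutils and lxml are installed and using a made-up description string:

# Minimal sketch of the reST -> HTML -> link-extraction pattern used above.
import lxml.html
from docutils.core import publish_string

description = "See the `project page <https://example.com/project>`_ for details."

html_string = publish_string(source=description, writer_name="html4css1")
html = lxml.html.fromstring(html_string)

for link in html.xpath("//a/@href"):
    print(link)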
示例#55
0
def _parse_fq_data(url, index, retry_count, pause, proxies=[]):
    import random
    for _ in range(retry_count):
        proxy = random.choice(proxies) if proxies else None
        time.sleep(pause)
        try:
            request = Request(url)
            text = None
            if not proxy:
                text = urlopen(request, timeout=10).read()
            else:
                text = build_opener(ProxyHandler(proxy)).open(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows = [0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == object:  # np.object is deprecated; plain object works everywhere
                df['date'] = pd.to_datetime(df['date'])
            df = df.drop_duplicates('date')
        except ValueError as e:
            # date is too early; the source no longer has data for it
            return None
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
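The core trick in this helper is to serialize the matched table node back to HTML and hand it to pandas. A minimal sketch of that pattern, with a hypothetical URL and none of the retry/proxy handling:

# Minimal sketch of the lxml -> pandas.read_html pattern used above; the URL is hypothetical.
from io import StringIO

import lxml.html
import pandas as pd
from lxml import etree

doc = lxml.html.parse('https://example.com/quotes.html')
nodes = doc.xpath('//table[@id="FundHoldSharesTable"]')
if nodes:
    table_html = ''.join(etree.tostring(node).decode('utf-8') for node in nodes)
    df = pd.read_html(StringIO(table_html), skiprows=[0, 1])[0]
    print(df.head())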
示例#56
0
def set_knockout_template(formset, request, opts: dict=None):
    if opts is None:
        opts = {}
    _opts = {
        'formset_form_class': 'form-empty',
        'inline_title': getattr(formset, 'inline_title', formset.model._meta.verbose_name),
        'layout_classes': get_layout_classes(),
    }
    _opts.update(opts)
    renderer = render_form(request, 'inline', formset.empty_form, {
        'caller': {},
        'opts': _opts,
    })
    empty_form_str = renderer.__str__()
    # return str(empty_form_str)
    html = lxml.html.parse(StringIO(empty_form_str))
    for element in html.xpath("//*[@id or @name or @for]"):
        # sdv.dbg('element', element)
        data_bind_args = []
        for attr in ['for', 'id', 'name']:
            if attr in element.attrib:
                attr_parts = element.attrib[attr].split('__prefix__')
                if len(attr_parts) == 2:
                    attr_parts = to_json(attr_parts[0]) + ' + ($index() + $parent.serversideFormsCount) + ' + to_json(attr_parts[1])
                    data_bind_args.append(to_json(attr) + ': ' + attr_parts)
                    del element.attrib[attr]
        # sdv.dbg('data_bind_args', data_bind_args)
        if len(data_bind_args) > 0:
            data_bind = 'attr: {' + ', '.join(data_bind_args) + '}'
            # sdv.dbg('data_bind', data_bind)
            element.attrib['data-bind'] = data_bind
    knockout_template = tostring(html, method='html', encoding='utf-8', standalone=True).decode('utf-8')
    # sdv.dbg('knockout_template before', knockout_template)
    body_begin = knockout_template.find('<body>')
    body_end = knockout_template.rfind('</body>')
    if body_begin == -1 or body_end == -1:
        sdv.dbg('failed ko template', knockout_template)
        raise ValueError('Knockout template is not wrapped in body tag')
    # sdv.dbg('knockout_template after', formset.knockout_template)
    formset.knockout_template = knockout_template[body_begin + len('<body>'):body_end]
示例#57
0
    def get_jfac(self, name, url):
        """gets membership info for the Joint Finance and Appropriations
        Committee."""
        jfac_page = self.urlopen(url)
        html = lxml.html.fromstring(jfac_page)
        table = html.xpath('body/table/tr/td[2]/table')[0]
        committee = Committee('joint', name)
        for row in table.xpath('tr')[1:]:
            senate, house = row.xpath('td/strong')
            senate = senate.text.replace(u'\xa0', ' ')
            house = house.text.replace(u'\xa0', ' ')
            if ',' in senate:
                committee.add_member(*senate.split(','), chamber='upper')
            else:
                committee.add_member(senate, chamber='upper')
            if ',' in house:
                committee.add_member(*house.split(','), chamber='lower')
            else:
                committee.add_member(house, chamber='lower')

        committee.add_source(url)
        self.save_committee(committee)
示例#58
0
def get_date_ohlc(exchange, symbol, date):
    print exchange, symbol, date
    for _ in range(3):
        try:
            time.sleep(0.005)
            page_url = 'https://www.google.com.hk/finance/historical?q=%s:%s' % (
                exchange, symbol)
            r = s.get(page_url, proxies=proxies)
            html = lxml.html.parse(StringIO(r.text))
            res = html.xpath('//table[@class=\"gf-table historical_price\"]')
            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows=[0])[0]
            df.columns = ['date', 'open', 'high', 'low', 'close', 'amount']
            df = df.drop('amount', axis=1)

            def date_to_int(s):
                y, m, d = s.split("-")
                return int(y) * 10000 + int(m) * 100 + int(d)

            df['date'] = df['date'].apply(date_to_int)
            # df['date'] = pd.to_datetime(df['date'], format=u"%Y-%m-%d")
            df = df.drop_duplicates('date')
            cmp_d = int(date.strftime("%Y%m%d"))
            df = df[df.date == cmp_d]
            if len(df) > 0:
                df['date'] = int(date.strftime("%Y%m%d"))
                code = get_code(symbol)
                assert code > 0, 'symbol code is %s' % code
                df.insert(0, 'code', code)
                df = df.set_index('code')
                return df
            return None
        except Exception as e:
            print traceback.format_exc()
            yyhtools.error(traceback.format_exc())
            return None
示例#59
0
def latest_content(net, url):
    """
    获取即时财经新闻内容
    :param net: 指定网站名
    :param url: 新闻链接
    :return: string
        返回新闻的文字内容
    """
    content = ''
    try:
        html = lxml.html.parse(url, parser=etree.HTMLParser(encoding='utf-8'))
        res = html.xpath(xpaths[net])
        p_str_list = [
            etree.tostring(node).strip().decode('utf-8') for node in res
        ]
        content = '\n'.join(p_str_list)
        html_content = lxml.html.fromstring(content)
        content = html_content.text_content()
        content = re.sub(r'(\r*\n)+', '\n', content)
    except Exception as e:
        print(e)
    return content
示例#60
0
    def parseDescription(self, data):
        x = [
            '//*[@id="section-overview"]/mat-card/div[2]/fields-card[{}]/div/span'
        ]

        html = lxml.html.fromstring(data)

        res = []
        for i in range(1, 5):
            elems = html.xpath(x[0].format(i))
            if len(elems):
                itr = iter(elems)
                while True:
                    try:
                        res.append([
                            next(itr).text_content().replace(u'\xa0', u''),
                            next(itr).text_content().replace(u'\xa0', u'')
                        ])
                    except StopIteration:
                        break

        return {'header': 'Description', 'lst': res}
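The while/StopIteration loop above pairs consecutive label/value spans. The same pairing can be written more compactly with zip over a single shared iterator; a sketch reusing the elems and res names from the loop body above:

# Equivalent pairing of consecutive spans via zip over one shared iterator.
texts = [e.text_content().replace(u'\xa0', u'') for e in elems]
it = iter(texts)
res.extend([label, value] for label, value in zip(it, it))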