def start(url, depth, regex):
    """Open *url* in a browser, parse the fetched page, and return the DOM.

    Parameters:
        url   -- page address to fetch.
        depth -- accepted for interface compatibility; not used here
                 (presumably intended for recursive crawl depth -- TODO confirm).
        regex -- accepted for interface compatibility; not used here.

    Returns:
        lxml.html.HtmlElement -- the parsed document root.  The original
        implementation discarded it; returning it is backward-compatible
        (existing callers that ignore the return value are unaffected).
    """
    b = Browser()
    try:
        b.open(url)
        print("parse html ...: %d" % len(b.content))
        # Parse while the browser still holds the page content.
        dom = html.fromstring(b.content)
    finally:
        # Release the browser even if open()/parsing raises; the original
        # leaked it on any exception.
        b.close()
    print("END")
    return dom
def scrap(url):
    """Fetch *url* inside a virtual display and return the raw page content
    as a string, or None if scraping fails.

    Parameters:
        url -- page address to fetch.

    Returns:
        str on success, None on any failure (the error is printed, matching
        the original best-effort behavior).
    """
    b = None
    display = None
    try:
        # Legacy Python 2 hack so str(content) below tolerates non-ASCII
        # bytes; NOTE(review): this has process-wide side effects and is
        # re-applied on every call.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        display = Display(visible=0, size=(800, 600))
        display.start()
        b = Browser()
        b.open(url)
        content = b.main_frame['content'].read()
        return str(content)
    except Exception as e:
        print("===Scrapping Exception=====")
        print(str(e))
        return None
    finally:
        # Single cleanup path for both success and failure.  The original
        # duplicated close()/stop() in the except branch and raised
        # NameError there whenever Display()/Browser() had not yet been
        # created, masking the real exception.
        if b is not None:
            b.close()
        if display is not None:
            display.stop()
import sys reload(sys) sys.setdefaultencoding("utf-8") from webkit_browser import Browser from lxml import html import csv reader = csv.reader(file('stock_list.csv', 'rb')) stockids = [] for line in reader: stockids.append(line[1]) for integer in range(13): stockids.pop(0) b = Browser() for stockid in stockids: print 'http://stockhtm.finance.qq.com/sstock/ggcx/' + stockid +'.shtml' b.open('http://stockhtm.finance.qq.com/sstock/ggcx/' + stockid +'.shtml') content = b.main_frame['content'].read() dom = html.fromstring(content) results = dom.xpath('//*[@class="col-2 fr"]/ul/li') for result in results: print result.text_content() results = dom.xpath('//h1[@class="col-1-1"]') name = "" for result in results: name = result print result.text_content() results = dom.xpath('//table[@class="l20"]//tr/td[contains(@id,"main-")]') for result in results:
# Scrape the full stock listing from eastmoney.com and persist
# (name, id) pairs to stock_list.csv.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import csv
import re

from lxml import html

from webkit_browser import Browser

b = Browser()
try:
    b.open('http://quote.eastmoney.com/stocklist.html')
    content = b.main_frame['content'].read()
finally:
    # Release the browser even if the fetch fails; the original leaked it.
    b.close()

dom = html.fromstring(content)
results = dom.xpath('//*[@id="quotesearch"]/ul/li')

# Each entry looks like "name(id)"; splitting on the parens yields
# [name, id, ''].  Compiled once here -- the original recompiled the
# pattern on every loop iteration.
pattern = re.compile('[()]')

cf = open("stock_list.csv", "w")
try:
    writer = csv.writer(cf)
    writer.writerow(['name', 'id'])
    for result in results:
        pricePair = pattern.split(result.text_content())
        pricePair.pop()  # drop the empty trailing field after the ")"
        writer.writerow(pricePair)
finally:
    # Close on both success and failure; the original left the file open
    # (and potentially unflushed) if any row write raised.
    cf.close()
print('The result has been saved into stock_list.csv')