def getDetailListPages(pageNo, conn):
    '''Scrape one page of the "CourtNotCarryOut" list and store its rows.

    Fetches page ``pageNo`` of the court-enforcement list (each list item
    opens to a set of URLs whose target pages are the actual scrape
    targets), extracts the title / table / rowID of every entry and
    inserts one record per entry into the ``base_page_list`` table.

    Parameters:
        pageNo -- 1-based page number to request (sent in the POST body).
        conn   -- open DB-API connection; a cursor is created on it and
                  the connection is committed AND CLOSED on success.
                  NOTE(review): closing the caller's connection here is a
                  surprising side effect — confirm callers expect it.

    Returns: None.
    '''
    requestUrl = 'http://218.108.28.28:8000/ListPrompts.aspx'
    # Headers captured from a real browser session; the Cookie carries the
    # ASP.NET session id the server requires for paging.
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding:gzip,deflate,sdch',
            'Accept-Language:zh-CN,zh;q=0.8',
            'Cache-Control:max-age=0',
            'Connection:keep-alive',
            'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
            'Host:www.zjcredit.gov.cn:8000',
            'Pragma:no-cache',
            'Origin:http://www.zjcredit.gov.cn:8000',
            'Referer:http://218.108.28.28:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
            'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36']
    # Single source of truth for the page size: sent to the server and used
    # as the loop bound below (previously hard-coded in both places).
    pageLength = 20
    post_data_dic = {'recordTotal': 46358,
                     'tableID': 'CourtNotCarryOut',
                     'associateID': '00000000000000000',
                     'field_CorporationName': '',
                     'sectionID': '01',
                     'field_OrganizationCode': '',
                     'isIntermediary': 'False',
                     'pageNo': pageNo,
                     'pageLength': pageLength}
    # Was assigned twice in a row; only the last value was ever used.
    proxy = '218.108.170.170:80'
    content = jTool.fetchUrlProxy2(proxy, requestUrl, post_data_dic, 'post', head)
    if content:
        # Anchors of the result table; aList[1] is the node the attribute
        # queries below are run against.
        exp = '//table[1]/tr[2]/td[1]/table[1]/tr/td/a'
        aList = jTool.getNodes(content, exp)
        titles = aList[1].xpath('//@title')
        aStr = aList[1].xpath('//@onclick')
        corpRecords = {}
        cursor = conn.cursor()
        for i in range(pageLength):
            # decode/encode roundtrip is a no-op for valid UTF-8 but raises
            # UnicodeDecodeError on corrupt bytes — kept as a sanity check.
            corpRecords['`corpName`'] = titles[i].decode('utf-8').encode('utf-8')
            # onclick list is offset by one relative to titles — presumably
            # the first anchor is a non-data link; TODO confirm on the page.
            tmp = aStr[i + 1].split(',')
            corpRecords['`table`'] = tmp[0].split("'")[1]
            corpRecords['`rowID`'] = tmp[5].split("'")[1]
            corpRecords['`pageNo`'] = str(pageNo)
            jTool.insertData(cursor, 'base_page_list', corpRecords)
        # NOTE(review): commit/close placement reconstructed from collapsed
        # source — committing once per page, then closing, looks intended.
        conn.commit()
        cursor.close()
        conn.close()
def getDetailPageContent(proxy, head, rowID, corpName):
    '''Fetch the detail page for a single list entry.

    POSTs the entry's identifiers to the BrowseDocumentPrompt endpoint
    through the given proxy and returns whatever the fetch helper yields
    (the raw page content, or a falsy value on failure).

    Parameters:
        proxy    -- "host:port" proxy string passed through to the fetcher.
        head     -- list of raw HTTP header strings for the request.
        rowID    -- row identifier extracted from the list page's onclick.
        corpName -- corporation name sent back in the POST body.
    '''
    url = 'http://www.zjcredit.gov.cn:8000/BrowseDocumentPrompt.aspx'
    payload = {
        'sectionID': '01',
        'associateID': '00000000000000000',
        'tableID': 'CourtNotCarryOut',
        'creditID': 0,
        'rowID': rowID,
        'timeSpan': '',
        'seqNo': '',
        'corpName': corpName,
        'titleID': '',
    }
    return jTool.fetchUrlProxy2(proxy, url, payload, 'post', head)