def parse(self, response):
    '''
    Collect the URL of each board and hand it to parse_pages.
    :param response:
    :return:
    '''
    self.log('Hi, this is an item page! %s' % response.url)
    # root = bs(response.body)
    #
    # # use the navigation bar
    # curls = []
    # divs = root.findAll("div", attrs={"class": "title"})
    # for div in divs:
    #     try:
    #         url = div.find("span").find("a", attrs={"href": re.compile("^NewsList")})
    #         curls.append(url.get("href"))
    #     except:
    #         pass
    #
    # for url in curls:
    #     yield scrapy.Request(root_domain + "/News/" + url, self.parse_pages)

    # Temporary test: build an item with two nested articles and dump it as a JSON line.
    item = FenghuoItem()
    item['articles'] = []
    c_item = FenghuoItem()
    c_item['url'] = '1233'
    b_item = FenghuoItem()
    b_item['url'] = '3333'
    item['articles'].append(dict(c_item))
    item['articles'].append(dict(b_item))
    line = json.dumps(dict(item)) + '\n'
    print line
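
# A minimal sketch (not part of the original project) of how nested FenghuoItem
# rows like the test item built above could be written out as JSON lines by a
# Scrapy item pipeline; the class name and the 'items.jl' file name are
# illustrative assumptions only.
import json
import codecs

class JsonLinesSketchPipeline(object):
    def open_spider(self, spider):
        self.out = codecs.open('items.jl', 'ab', encoding='utf-8')

    def process_item(self, item, spider):
        # dict(item) also serialises the nested 'articles' list of dicts.
        self.out.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.out.close()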
def parse_item(self, response):
    '''
    Visit each news page and extract the field values.
    :param response:
    :return:
    '''
    self.log('Hi, this is an item page! %s' % response.url)
    item = FenghuoItem()
    root = bs(response.body)
    item['topPost'] = 1
    item["site_id"] = 13
    item['website_id'] = ''
    item["site_name"] = '通山县机构编制网'
    item["area"] = 958
    item["site_weight"] = 2
    item['countryid'] = 1156
    item['province'] = 1673
    item['city'] = 136
    item["ip"] = socket.gethostbyname("www.tsxbb.gov.cn")
    item["site_url"] = "www.tsxbb.gov.cn"
    item["forumurl"] = response.meta['forumurl']
    item["site_type"] = '新闻'
    item["url"] = response.url
    item["subname"] = root.find("span", attrs={"class": "text14h"}).find("a", attrs={"href": "../"}).text
    item["title"] = root.find("td", attrs={"class": "textbiaoti"}).text
    # The info line starts with a "20xx-..." date, so slice from the first '20'.
    text = root.find("td", attrs={"class": "text12hui"}).text
    text = text[text.index('20'):]
    item["pubdate"] = text[:text.index(' ') - 1]
    try:
        text = text[text.index('su = ') + 6:]
        item["website_id"] = text[:text.index(';') - 1]
    except:
        item["website_id"] = ""
    # Drop inline <style> blocks from the article body.
    styles = root.find("div", attrs={"class": "TRS_Editor"}).findAll("style")
    for style in styles:
        style.clear()
    # Replace every image tag with its serialized HTML so it survives .text extraction.
    imgs = root.find("div", attrs={"class": "TRS_Editor"}).findAll("img")
    for img in imgs:
        img.replaceWith(img.prettify())
    item["txt"] = root.find("div", attrs={"class": "TRS_Editor"}).text \
        .replace("\r\n", "$*huanhang*$").replace("\n", "$*huanhang*$") \
        .replace("\"", "'").replace("<br />", "$*huanhang*$")
    item["txt_len"] = len(item["txt"])
    item["domain_1"] = "tsxbb.gov.cn"
    item["domain_2"] = "www"
    item["snatch_time"] = datetime.datetime.now()
    return item
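
# A hedged alternative (not in the original spider) for the pubdate extraction
# above: pull the date out of the info line with a regex instead of index()
# slicing, which raises ValueError whenever the page layout shifts. The helper
# name extract_pubdate is invented for illustration.
import re

def extract_pubdate(info_text):
    """Return the first YYYY-MM-DD date found in info_text, or '' if none."""
    m = re.search(r'20\d{2}-\d{2}-\d{2}', info_text)
    return m.group(0) if m else ''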
def parse(self, response):
    '''
    Collect the URL of each board and hand it to parse_pages.
    :param response:
    :return:
    '''
    # test
    # url = 'http://m.sohu.com/cl/58/?page=1'
    # yield scrapy.Request(url, self.parse_item)
    # end test
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.log('Hi, this is a page! %s' % response.url)
    self.new = 0
    self.pages += 1
    root = bs(response.body.decode('utf-8'))
    div = root.find("div", attrs={"class": "bd3 pb1"})
    lis = div.findAll("p")
    for li in lis:
        item = FenghuoItem()
        iurl = 'm.sohu.com' + li.find("a").get("href")
        title = li.find("a").text
        pubdate = root.find('p', attrs={'class': 'w c2'}).text
        month = pubdate[16:18]
        day = pubdate[19:21]
        hour = pubdate[22:24]
        year = pubdate[11:15]
        item_date = datetime.date(int(year), int(month), int(day))
        item['url'] = iurl
        item['title'] = title
        item['pubdate'] = str(item_date)
        item['snatch_time'] = datetime.datetime.now()
        item['topPost'] = 1
        item['site_name'] = '手机搜狐网'
        item['site_url'] = "m.sohu.com/"
        print item
        # Only store items that are new and within the crawl window.
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
            fp.process_item(item, "123")
    url = 'http://m.sohu.com/cl/58/?page=' + str(self.pages)
    if self.new > 3 and self.hasNext:
        yield scrapy.Request(url, self.parse)
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) \
            + " items; " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
def parse_items(self, response):
    # title:       done
    # txt:         main work, focus on parsing the HTML and removing impurities
    # pubdate:     done
    # snatch_time: static
    # site_url:    static
    # site_name:   static
    # url:         done
    # topPost:     static
    # init:
    html_parser = HTMLParser.HTMLParser()
    item = FenghuoItem()
    url = response.url
    self.log("Hi, this is in parse_items, url is %s" % url)
    root = bs(response.body)
    div = root.find("div", "ny_con news_con_ny")
    pubdate0 = div.find("p", "news_time").text
    year = pubdate0[3:7]
    month = pubdate0[8:10]
    day = pubdate0[11:13]
    hour = '00'
    pubdate = str(year) + "-" + month + "-" + day + " " + hour + ":00"
    ps = div.findAll("p", "MsoNormal")
    title = div.find("h3").text
    html = ""
    # Check whether the last paragraph is a pagination block and drop it if so.
    print ps[-1]
    if ps[-1].find("div", "page"):
        del ps[-1]
        print 'removed trailing pagination block'
    # Build the text paragraph by paragraph.
    for p in ps:
        # Strip HTML comments embedded in the text.
        comments = p.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        html = html + '\n' + p.text.encode('utf-8')
    text = html_parser.unescape(html)
    item['url'] = url
    item['title'] = html_parser.unescape(title)
    item['txt'] = text
    item['pubdate'] = str(pubdate)
    item['snatch_time'] = datetime.datetime.now()
    item['topPost'] = 1
    item['site_name'] = '武汉大学研究生院'
    item['site_url'] = "www.gs.whu.edu.cn/"
    f = open('scrapy_log.txt', 'a')
    f.write(html_parser.unescape(title) + '\n' + str(pubdate) + '\n')
    f.close()
    fp.process_item(item, "123")
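
# A minimal sketch of factoring the repeated "strip HTML comments, join
# paragraph text" logic used here and in the other parse_items methods into one
# helper. The function name clean_paragraphs is invented, and the import assumes
# the bs4 flavour of BeautifulSoup; the original may import Comment from the
# older BeautifulSoup module instead.
from bs4 import Comment

def clean_paragraphs(ps):
    """Join the visible text of a list of <p> tags, dropping HTML comments."""
    parts = []
    for p in ps:
        for c in p.findAll(text=lambda t: isinstance(t, Comment)):
            c.extract()
        parts.append(p.text)
    return '\n'.join(parts)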
def parse_item(self, response):
    '''
    Visit each news page and extract the field values.
    :param response:
    :return:
    '''
    self.log('Hi, this is an item page! %s' % response.url)
    item = FenghuoItem()
    root = bs(response.body)
    try:
        item['topPost'] = 1
        item["site_id"] = 17
        item['website_id'] = ''
        item["site_name"] = '襄城新闻网'
        item["area"] = 958
        item["site_weight"] = 2
        item['countryid'] = 1156
        item['province'] = 1673
        item['city'] = 136
        item["ip"] = socket.gethostbyname("www.xcxww.com")
        item["site_url"] = "www.xcxww.com"
        types = root.find("div", attrs={"class": "pagenav"}).findAll("a")
        item["forumurl"] = types[len(types) - 1].get("href")
        item["site_type"] = '新闻'
        item["domain_1"] = "xcxww.com"
        item["domain_2"] = "www"
        item["url"] = response.url
        item["subname"] = types[len(types) - 1].text
        item["pubdate"] = root.find("div", attrs={"class": "info"}).find("span").text
        # Replace every image tag with its serialized HTML so it survives .text extraction.
        imgs = root.find("div", attrs={"class": "content"}).findAll("img")
        for img in imgs:
            img.replaceWith(img.prettify())
        item["txt"] = root.find("div", attrs={"class": "content"}).text \
            .replace("\r\n", "$*huanhang*$").replace("\n", "$*huanhang*$") \
            .replace("\"", "'").replace("<br />", "$*huanhang*$")
        item["txt_len"] = len(item["txt"])
        item["title"] = root.find("h1").text
        item["snatch_time"] = datetime.datetime.now()
        return item
    except:
        # Failed URLs are logged to error1.json.
        line = response.url + "\n"
        self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
        self.file.write(line.decode("unicode_escape"))
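
# Sketch of consolidating the repeated "append the failing URL or row to
# errorN.json" blocks that appear throughout these spiders into one helper;
# the name log_error is an invention for illustration, not existing project code.
import codecs

def log_error(path, text):
    """Append one line of text to the given error log file (UTF-8)."""
    f = codecs.open(path, 'ab', encoding='utf-8')
    f.write(text + '\n')
    f.close()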
def parse_items(self, response):
    # title:       done
    # txt:         main work, focus on parsing the HTML and removing impurities
    # pubdate:     done
    # snatch_time: static
    # site_url:    static
    # site_name:   static
    # url:         done
    # topPost:     static
    # init:
    html_parser = HTMLParser.HTMLParser()
    item = FenghuoItem()
    url = response.url
    self.log("Hi, this is in parse_items, url is %s" % url)
    root = bs(response.body)
    div = root.find("div", "content-box clear")
    pubdate0 = div.find("div", "time").text
    year = pubdate0[0:4]
    month = pubdate0[5:7]
    day = pubdate0[8:10]
    hour = pubdate0[11:13]
    minute = pubdate0[14:16]
    pubdate = str(year) + "-" + month + "-" + day + " " + hour + ":" + minute
    ps = div.findAll("p")
    title = div.find("h1").text
    html = ""
    # Check whether the last paragraph is a pagination block and drop it if so.
    print ps[-1]
    if ps[-1].find("div", "page"):
        del ps[-1]
        print 'removed trailing pagination block'
    # Build the text paragraph by paragraph.
    for p in ps:
        # Strip HTML comments embedded in the text.
        comments = p.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        html = html + '\n' + p.text.encode('utf-8')
    text = html_parser.unescape(html)
    item['url'] = url
    item['title'] = html_parser.unescape(title)
    item['txt'] = text
    item['pubdate'] = str(pubdate)
    item['snatch_time'] = datetime.datetime.now()
    item['topPost'] = 1
    item['site_name'] = '搜狐国内新闻'
    item['site_url'] = "news.sohus.com/"
    # TODO: check whether the article text continues on a next page.
    fp.process_item(item, "123")
def parse(self, response):
    '''
    Collect the URL of each board and hand it to parse_pages.
    :param response:
    :return:
    '''
    # test
    # url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-1.html'
    # yield scrapy.Request(url, self.parse_item)
    # end test
    file_name = "log-" + str(datetime.date.today()) + ".txt"
    t_file = codecs.open(file_name, 'ab', encoding='utf-8')
    if self.isFirst:
        self.isFirst = False
        line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
        t_file.write(line.decode("unicode_escape"))
    updatetool = UpdateTool()
    self.log('Hi, this is a page! %s' % response.url)
    self.new = 0
    self.pages += 1
    root = bs(response.body.decode('utf-8'))
    div = root.find("div", attrs={"class": "ulnotice"})
    lis = div.findAll("li")
    for li in lis:
        item = FenghuoItem()
        iurl = li.find("a").get("href")
        if iurl[0:4] != 'http':
            iurl = 'http://gs.whu.edu.cn' + iurl
        title = li.find("a").text
        pubdate = li.find("span").text
        month = pubdate[6:8]
        day = pubdate[9:11]
        hour = '00'
        year = pubdate[1:5]
        item_date = datetime.date(int(year), int(month), int(day))
        # Only follow links that are new and within the crawl window.
        if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
            self.new += 1
            self.total_new += 1
            # Request the detail page for this entry.
            yield scrapy.Request(iurl, self.parse_items)
    url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-' + str(self.pages) + '.html'
    if self.new > 10 and self.hasNext:
        yield scrapy.Request(url, self.parse)
    else:
        line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) \
            + " items; " + self.name + " spider has finished!\n\n"
        t_file.write(line.decode("unicode_escape"))
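
# UpdateTool.hasUrl is used above but its implementation is not part of this
# excerpt. A hypothetical stand-in, sketched under the assumption that it is a
# simple seen-URL check; the class name SeenUrls and the file 'seen_urls.txt'
# are inventions for illustration only.
class SeenUrls(object):
    def __init__(self, path='seen_urls.txt'):
        self.path = path
        try:
            self.urls = set(line.strip() for line in open(path))
        except IOError:
            self.urls = set()

    def hasUrl(self, url):
        """Return True when the URL has already been crawled."""
        return url in self.urls

    def add(self, url):
        """Record a newly crawled URL in memory and on disk."""
        if url not in self.urls:
            self.urls.add(url)
            f = open(self.path, 'a')
            f.write(url + '\n')
            f.close()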
def parse_page(self, response):
    self.log('Hi, this is a list page! %s' % response.url)
    root = bs(response.body.decode('gbk'))
    table = root.find("div", id="threadlist")
    trs = table.findAll("tr")
    for tr in trs:
        item = FenghuoItem()
        item["domain_1"] = "437600.net"
        item["domain_2"] = "www"
        item["site_id"] = 12
        item['website_id'] = ''
        item["site_name"] = '通山信息港'
        item["area"] = 958
        item["site_weight"] = 2
        item['countryid'] = 1156
        item['province'] = 1673
        item['city'] = 136
        item["ip"] = socket.gethostbyname("www.437600.net")
        item["site_url"] = "www.437600.net"
        item["forumurl"] = response.meta['forumurl']
        item["site_type"] = '论坛'
        item["snatch_time"] = datetime.datetime.now()
        try:
            item["url"] = root_domain + tr.find("a", attrs={"href": re.compile("^thread")}).get("href")
            item["title"] = tr.find("th").find("a", attrs={"href": re.compile("^thread")}).text
            item["author"] = tr.find("td", attrs={"class": "by"}).find("cite").find("a").text
            item["userpage"] = root_domain + tr.find("td", attrs={"class": "by"}).find("cite").find("a").get("href")
            url_id = tr.findAll("td", attrs={"class": "by"})[0].find("cite").find("a").get("href")
            url_id = url_id[url_id.index("uid-") + 4:]
            item["userid"] = url_id[:url_id.index(".html")]
            try:
                item["reply"] = tr.find("td", attrs={"class": "num"}).find("a").text
            except:
                item["reply"] = ""
            try:
                item["view"] = tr.find("td", attrs={"class": "num"}).find("em").text
            except:
                item["view"] = ""
            try:
                item["postid"] = tr.findAll("td", attrs={"class": "by"})[1].find("cite").find("a").text
            except:
                item["postid"] = ""
            try:
                item["subname"] = root_domain + tr.find("th").find("em").find("a").text
            except:
                item["subname"] = ""
            try:
                time1 = tr.findAll("td", attrs={"class": "by"})[1].find("em").find("a")
                try:
                    item["updatetime"] = time1.find("span").get("title")
                except:
                    item["updatetime"] = time1.text
            except:
                item["updatetime"] = ""
            # Follow the thread to collect its content.
            yield scrapy.Request(item["url"], self.parse_item, meta={'item': item})
        except Exception as e:
            # Rows that fail to parse are logged to error1.json.
            line = str(tr) + "\n"
            self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
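
# Sketch of shrinking the repeated try/except-to-empty-string lookups in the
# listing parsers above into a single helper; safe_text is a name invented for
# illustration and is not part of the original spiders.
def safe_text(node, default=""):
    """Return node.text, or default when node is None."""
    try:
        return node.text
    except AttributeError:
        return default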
def parse_item(self, response):
    item = response.meta['item']
    item['articles'] = []
    new_root = bs(response.body.decode('gbk'))
    subnames = new_root.find("div", id="pt").findAll("a")
    item["subname"] = subnames[len(subnames) - 2].text
    # Work out how many pages the thread has; default to a single page.
    try:
        pageText = new_root.find("div", attrs={"class": "pg"}).find("span").text
        pageText = pageText[pageText.index("/") + 2:]
        pageText = pageText[:pageText.index(" ")]
    except:
        pageText = 1
    url = response.url
    try:
        for page in range(1, int(pageText) + 1):
            r1 = requests.post(url)
            new_root = bs(r1.text)
            divs = new_root.findChildren('div', attrs={"id": re.compile("^post_[0-9]")})
            for div in divs:
                c_item = FenghuoItem()
                c_item["topPost"] = 0
                c_item['author'] = div.find("table").find("td", attrs={"class": "pls"}) \
                    .find("div", attrs={"class": "authi"}).find("a").text
                c_item['userpage'] = div.find("table").find("td", attrs={"class": "pls"}) \
                    .find("div", attrs={"class": "authi"}).find("a").get("href")
                # The publication date appears in one of four layouts; try them in order.
                try:
                    c_item["pubdate"] = div.find("table").find("td", attrs={"class": "plc"}) \
                        .find("div", attrs={"class": "authi"}).find("em").find("span").get("title")
                except:
                    try:
                        t_time = div.find("table").find("td", attrs={"class": "plc"}) \
                            .find("div", attrs={"class": "authi"}).find("em").text
                        c_item["pubdate"] = t_time[t_time.index(" ") + 1:]
                    except:
                        try:
                            c_item["pubdate"] = div.find("table").find("td", attrs={"class": "plc comiis_vtx"}) \
                                .find("div", attrs={"class": "authi"}).find("em").find("span").get("title")
                        except:
                            try:
                                t_time = div.find("table").find("td", attrs={"class": "plc comiis_vtx"}) \
                                    .find("div", attrs={"class": "authi"}).find("em").text
                                c_item["pubdate"] = t_time[t_time.index(" ") + 1:]
                            except:
                                raise
                # The floor number also has two layouts; floor 1 marks the top post.
                try:
                    c_item["postfloor"] = div.find("table").find("td", attrs={"class": "plc"}) \
                        .find("div", attrs={"class": "pi"}).find("em").text
                    if int(c_item["postfloor"]) == 1:
                        c_item["topPost"] = 1
                except:
                    c_item["postfloor"] = div.find("table").find("td", attrs={"class": "pls"}) \
                        .findNextSibling().find("div", attrs={"class": "pi"}).find("em").text
                    if int(c_item["postfloor"]) == 1:
                        c_item["topPost"] = 1
                # Clear CSS, JS, advertisements and anti-scraping filler.
                styles = div.findAll("style")
                scripts = div.findAll("script")
                for style in styles:
                    style.clear()
                for script in scripts:
                    script.clear()
                advs = div.findAll("div", attrs={"class": "attach_nopermission attach_tips"})
                for adv in advs:
                    adv.clear()
                m_codes = new_root.findAll("span", attrs={"style": "display:none"})
                for m_code in m_codes:
                    m_code.clear()
                m_codes = new_root.findAll("font", attrs={"class": "jammer"})
                for m_code in m_codes:
                    m_code.clear()
                # Replace every image tag with its serialized HTML so it survives .text extraction.
                imgs = div.find("table").find("td", attrs={"class": "pls"}) \
                    .findNextSibling().find("table").findAll("img")
                for img in imgs:
                    img.replaceWith(img.prettify())
                c_item["txt"] = div.find("table").find("td", attrs={"class": "pls"}) \
                    .findNextSibling().find("table").text.replace("<br />", " ") \
                    .replace("\r\n", "$*huanhang*$").replace("\"", "‘")
                c_item["txt_len"] = len(c_item["txt"])
                # The house-rental board has a quirk, so fetch its content a second time.
                if c_item["topPost"] == 1 and item["subname"] == '房屋租售':
                    try:
                        # Replace every image tag with its serialized HTML.
                        imgs = new_root.find("div", attrs={"class": "t_fsz"}).findAll("img")
                        for img in imgs:
                            img.replaceWith(img.prettify())
                        c_item['txt'] += new_root.find("div", attrs={"class": "t_fsz"}).text \
                            .replace("\r\n", "$*huanhang*$").replace("\n", "$*huanhang*$") \
                            .replace("\"", "'").replace("<br />", "$*huanhang*$")
                        c_item["txt_len"] = len(c_item["txt"])
                    except:
                        pass
                item['articles'].append(dict(c_item))
            # Move on to the next page of the thread, if there is one.
            try:
                url = root_domain + new_root.find("div", attrs={"class": "pgbtn"}).find("a").get("href")
                print url
            except:
                pass
        return item
    except:
        line = response.url + "\n"
        self.file = codecs.open('error2.json', 'ab', encoding='utf-8')
        self.file.write(line.decode("unicode_escape"))
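
# A sketch of collapsing the four nested pubdate fallbacks above into one loop
# over the two known cell classes; first_pubdate is an invented helper name and
# not part of the spider.
def first_pubdate(div):
    """Return the post date from whichever of the known Discuz layouts is present."""
    for cls in ("plc", "plc comiis_vtx"):
        try:
            em = div.find("table").find("td", attrs={"class": cls}) \
                .find("div", attrs={"class": "authi"}).find("em")
            span = em.find("span")
            if span is not None and span.get("title"):
                return span.get("title")
            text = em.text
            return text[text.index(" ") + 1:]
        except (AttributeError, ValueError):
            continue
    raise ValueError("no pubdate layout matched")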
def parse_pages(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    root_url = response.url
    last_t = "0"
    new = 21
    url = root_url
    # Keep walking listing pages while a page still yields more than 20 new threads.
    while new > 20:
        new = 0
        r = requests.get(url)
        r.encoding = "gbk"
        root = bs(r.text)
        table = root.find("table", id="threadlisttableid")
        try:
            trs = table.findAll("tr")
        except:
            line = response.url + "\n"
            self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
            continue
        for tr in trs:
            item = FenghuoItem()
            item["domain_1"] = "hbha.com.cn"
            item["domain_2"] = "bbs"
            item["site_id"] = 36
            item['website_id'] = ''
            item["site_name"] = '红安论坛'
            item["area"] = 3507
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 2508
            item["ip"] = socket.gethostbyname("bbs.hbha.com.cn")
            item["site_url"] = "bbs.hbha.com.cn"
            item["forumurl"] = root_url
            item["site_type"] = '论坛'
            item["snatch_time"] = datetime.datetime.now()
            try:
                item["url"] = root_domain + tr.find("a", attrs={"href": re.compile("^forum")}).get("href")
                item["title"] = tr.find("th").find("a", attrs={"href": re.compile("^forum")}).text
                item["author"] = tr.find("td", attrs={"class": "by"}).find("cite").find("a").text
                item["userpage"] = root_domain + tr.find("td", attrs={"class": "by"}).find("cite").find("a").get("href")
                url_id = tr.findAll("td", attrs={"class": "by"})[0].find("cite").find("a").get("href")
                url_id = url_id[url_id.index("uid=") + 4:]
                item["userid"] = url_id
                try:
                    item["reply"] = tr.find("td", attrs={"class": "num"}).find("a").text
                except:
                    item["reply"] = ""
                try:
                    item["view"] = tr.find("td", attrs={"class": "num"}).find("em").text
                except:
                    item["view"] = ""
                try:
                    item["postid"] = tr.findAll("td", attrs={"class": "by"})[1].find("cite").find("a").text
                except:
                    item["postid"] = ""
                try:
                    item["subname"] = root_domain + tr.find("th").find("em").find("a").text
                except:
                    item["subname"] = ""
                try:
                    time1 = tr.findAll("td", attrs={"class": "by"})[1].find("em").find("a")
                    try:
                        item["updatetime"] = time1.find("span").get("title")
                    except:
                        item["updatetime"] = time1.text
                except:
                    item["updatetime"] = ""
                # Incremental check: only threads updated since last_t count as new.
                if item["updatetime"] and item["updatetime"] > last_t:
                    new += 1
                    print item["url"]
                    # Follow the thread to collect its content.
                    yield scrapy.Request(item["url"], self.parse_item, meta={'item': item})
                if item["updatetime"] and item["updatetime"] == last_t:
                    new += 1
                    # updated before
            except Exception as e:
                # Rows that fail to parse are logged to error1.json.
                line = str(tr) + "\n"
                self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
                self.file.write(line.decode("unicode_escape"))
        # Move to the next listing page; stop when there is none.
        try:
            url = root_domain + root.find("div", attrs={"class": "pg"}).find("a", attrs={"class": "nxt"}).get("href")
            print url
        except:
            break
def parse_item(self, response):
    '''
    Visit each news page and extract the field values.
    :param response:
    :return:
    '''
    self.log('Hi, this is an item page! %s' % response.url)
    item = FenghuoItem()
    root = bs(response.body)
    try:
        item['topPost'] = 1
        item["site_id"] = 17
        item['website_id'] = ''
        item["site_name"] = '中国通山网'
        item["area"] = 958
        item["site_weight"] = 2
        item['countryid'] = 1156
        item['province'] = 1673
        item['city'] = 136
        item["ip"] = socket.gethostbyname("www.cntongshan.com")
        item["site_url"] = "www.cntongshan.com"
        item["forumurl"] = response.meta['forumurl']
        item["site_type"] = '新闻'
        item["url"] = response.url
        # The news id is embedded in the URL between '-' and '.'.
        url = response.url
        news_id = url[url.index("-") + 1:]
        news_id = news_id[:news_id.index(".")]
        item["parent_type"] = root.find("a", attrs={"class": "Current"}).text
        item["subname"] = root.find("a", attrs={"class": "A", "target": "_blank"}).text
        info = root.find("div", attrs={"class": "Title_h1"}).find("div").text
        info1 = info[info.index('20'):]
        item["pubdate"] = info1[:info1.index('\n') - 2]
        try:
            info2 = info[info.index('作者') + 2:]
            item["author"] = info2[:info2.index("浏览")].replace("\r\n", "$*huanhang*$")
        except:
            item["author"] = ""
        # The view count is loaded by an AJAX GET, so fetch it with requests;
        # news_id is the id taken from the URL above.
        r = requests.get("http://www.cntongshan.com/public/ajax.aspx?action=addnum&id=" + news_id + "&t=4&_=1437061503826")
        item["view"] = int(r.text[:r.text.index(",")])
        item["txt"] = root.find("div", attrs={"class": "content_main"}).text \
            .replace("\r\n", "$*huanhang*$").replace("\"", "‘")
        item["txt_len"] = len(item["txt"])
        item["title"] = root.find("h1").text
        item["domain_1"] = "cntongshan.com"
        item["domain_2"] = "www"
        item["snatch_time"] = datetime.datetime.now()
        return item
    except:
        # Failed URLs are logged to error1.json.
        line = response.url + "\n"
        self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
        self.file.write(line.decode("unicode_escape"))
def parse_pages(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    root = bs(response.body.decode('gbk'))
    # Work out how many listing pages this board has; default to one page.
    try:
        pageText = root.find("div", attrs={"class": "pg"}).find("span").text
        pageText = pageText[pageText.index("/") + 2:]
        pageText = pageText[:pageText.index(" ")]
    except:
        pageText = 1
    root_url = response.url
    root_url = root_url[:root_url.index(".html") - 1]
    # for i in range(1, int(pageText) + 1):
    #     url = root_url + str(i) + ".html"
    #     yield scrapy.Request(url, self.parse_page, meta={'forumurl': root_url + "1.html"})
    new = 21  # start above the threshold so the first listing page is always fetched
    url = root_url + str(1) + ".html"
    # Keep walking listing pages while a page still yields more than 20 new threads.
    while new > 20:
        new = 0
        r = requests.get(url)
        root = bs(r.text)
        url = root.find("div", attrs={"class": "pg"}).find("a", attrs={"class": "nxt"}).get("href")
        table = root.find("div", id="threadlist")
        trs = table.findAll("tr")
        for tr in trs:
            item = FenghuoItem()
            item["domain_1"] = "437600.net"
            item["domain_2"] = "www"
            item["site_id"] = 12
            item['website_id'] = ''
            item["site_name"] = '通山信息港'
            item["area"] = 958
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 136
            item["ip"] = socket.gethostbyname("www.437600.net")
            item["site_url"] = "www.437600.net"
            item["forumurl"] = response.meta['forumurl']
            item["site_type"] = '论坛'
            item["snatch_time"] = datetime.datetime.now()
            try:
                item["url"] = root_domain + tr.find("a", attrs={"href": re.compile("^thread")}).get("href")
                item["title"] = tr.find("th").find("a", attrs={"href": re.compile("^thread")}).text
                item["author"] = tr.find("td", attrs={"class": "by"}).find("cite").find("a").text
                item["userpage"] = root_domain + tr.find("td", attrs={"class": "by"}).find("cite").find("a").get("href")
                url_id = tr.findAll("td", attrs={"class": "by"})[0].find("cite").find("a").get("href")
                url_id = url_id[url_id.index("uid-") + 4:]
                item["userid"] = url_id[:url_id.index(".html")]
                try:
                    item["reply"] = tr.find("td", attrs={"class": "num"}).find("a").text
                except:
                    item["reply"] = ""
                try:
                    item["view"] = tr.find("td", attrs={"class": "num"}).find("em").text
                except:
                    item["view"] = ""
                try:
                    item["postid"] = tr.findAll("td", attrs={"class": "by"})[1].find("cite").find("a").text
                except:
                    item["postid"] = ""
                try:
                    item["subname"] = root_domain + tr.find("th").find("em").find("a").text
                except:
                    item["subname"] = ""
                try:
                    time1 = tr.findAll("td", attrs={"class": "by"})[1].find("em").find("a")
                    try:
                        item["updatetime"] = time1.find("span").get("title")
                    except:
                        item["updatetime"] = time1.text
                except:
                    item["updatetime"] = ""
                # Incremental check: only threads updated since last_t count as new.
                last_t = ""
                if item["updatetime"] and item["updatetime"] > last_t:
                    new += 1
                    # Follow the thread to collect its content.
                    yield scrapy.Request(item["url"], self.parse_item, meta={'item': item})
                if item["updatetime"] and item["updatetime"] == last_t:
                    new += 1
                    # updated before
            except Exception as e:
                # Rows that fail to parse are logged to error1.json.
                line = str(tr) + "\n"
                self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
                self.file.write(line.decode("unicode_escape"))
def parse(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    root_url = response.url
    last_t = "0"
    new = 21
    url = root_url
    has_next = True
    # Keep walking listing pages while a page still yields more than 5 new threads.
    while new > 5 and has_next:
        new = 0
        has_next = False
        r = requests.get(url)
        r.encoding = "gbk"
        root = bs(r.text)
        table = root.find("div", attrs={"id": "wrapper"})
        try:
            trs = table.findAll("tbody")
        except:
            line = response.url + "\n"
            self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
            continue
        for tr in trs:
            item = FenghuoItem()
            item["domain_1"] = "461700.org"
            item["domain_2"] = "www"
            item["site_id"] = 46
            item['website_id'] = ''
            item["site_name"] = '襄城论坛'
            item["area"] = 3507
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 996
            item["ip"] = socket.gethostbyname("www.461700.org")
            item["site_url"] = "www.461700.org"
            item["forumurl"] = root_url
            item["site_type"] = '论坛'
            item["snatch_time"] = datetime.datetime.now()
            item["url"] = root_domain + tr.find("a", attrs={"href": re.compile("^show")}).get("href")
            item["title"] = tr.find("a", attrs={"href": re.compile("^show")}).text
            item["author"] = tr.find("td", attrs={"class": "author"}).find("cite").text
            try:
                item["reply"] = tr.find("td", attrs={"class": "nums"}).find("strong").text
            except:
                item["reply"] = ""
            try:
                item["view"] = tr.find("td", attrs={"class": "nums"}).find("em").text
            except:
                item["view"] = ""
            try:
                item["postid"] = tr.findAll("td", attrs={"class": "lastpost"})[1].find("cite").text
            except:
                item["postid"] = ""
            try:
                item["subname"] = tr.find("div", attrs={"class": "tietitle"}).find("em").find("a").text
            except:
                item["subname"] = ""
            new += 1
            print item["url"]
            # Follow the thread to collect its content.
            yield scrapy.Request(item["url"], self.parse_item, meta={'item': item})
        # Look for a "下一页" (next page) link; stop when there is none.
        try:
            urls = root.find("div", attrs={"class": "meneame"}).findAll("a")
            for u in urls:
                if u.text == "下一页":
                    url = root_domain + u.get("href")
                    has_next = True
                    print url
        except:
            break
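
# Sketch of pulling the repeated "find the 下一页 link" loop used above (and in
# parse_item below) into a single helper; find_next_page is an invented name,
# not part of the original code.
def find_next_page(root, root_domain):
    """Return the absolute URL of the next-page link, or None if there is none."""
    nav = root.find("div", attrs={"class": "meneame"})
    if not nav:
        return None
    for a in nav.findAll("a"):
        if a.text == "下一页":
            return root_domain + a.get("href")
    return None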
def parse_item(self, response):
    item = response.meta['item']
    item['articles'] = []
    item["txt"] = ""
    new_root = bs(response.body.decode('gbk'))
    subnames = new_root.find("div", id="nav1").findAll("a")
    item["subname"] = subnames[len(subnames) - 1].text
    url = response.url
    hasNext = True
    try:
        while hasNext:
            hasNext = False
            r1 = requests.post(url)
            r1.encoding = 'gbk'
            new_root = bs(r1.text)
            divs = new_root.find("div", id="wrapper").findChildren('table', attrs={"class": "showTie"})
            for div in divs:
                c_item = FenghuoItem()
                c_item["topPost"] = 0
                try:
                    c_item['author'] = div.find("td", attrs={"class": "aa"}).find("a").text
                    c_item['userpage'] = div.find("td", attrs={"class": "aa"}).find("a").get("href")
                    times = div.find("td", attrs={"class": "aa"}).findAll("li")
                    c_item["pubdate"] = times[len(times) - 1].text
                    item["updatetime"] = c_item["pubdate"]
                    # Work out the floor number; "楼主" marks the top post.
                    try:
                        c = div.find("td", attrs={"class": "bb"}).find("span").text
                        c_item["postfloor"] = c[:c.index("阅") - 1]
                        if c_item["postfloor"] == "楼主":
                            c_item["postfloor"] = 1
                            c_item["topPost"] = 1
                            item["pubdate"] = c_item["pubdate"]
                            item['userpage'] = c_item['userpage']
                    except:
                        c = div.find("td", attrs={"class": "bb"}) \
                            .find("div", attrs={"class": "tiefoot s_clear"}).find("span").text
                        c = c[c.index("回复") + 2:]
                        c_item["postfloor"] = c[:c.index("楼")]
                        c_item["postfloor"] = int(c_item["postfloor"]) + 1
                    # Clear CSS and JS.
                    styles = div.findAll("style")
                    scripts = div.findAll("script")
                    for style in styles:
                        style.clear()
                    for script in scripts:
                        script.clear()
                    # Replace every image tag with its serialized HTML so it survives .text extraction.
                    imgs = div.find("td", attrs={"class": "bb"}).find("div").findAll("img")
                    for img in imgs:
                        img.replaceWith(img.prettify())
                    c_item["txt"] = div.find("td", attrs={"class": "bb"}).find("div").text \
                        .replace("<br />", " ").replace("\r\n", "$*huanhang*$").replace("\"", "‘")
                    strs = str(c_item["postfloor"]) + "." + str(c_item["txt"]) + "\n"
                    item["txt"] += strs
                    item["txt_len"] = len(item["txt"])
                    c_item["txt_len"] = len(c_item["txt"])
                    item['articles'].append(dict(c_item))
                except:
                    # Posts that fail to parse are logged to error2.json.
                    line = str(div) + "\n"
                    self.file = codecs.open('error2.json', 'ab', encoding='utf-8')
                    self.file.write(line.decode("unicode_escape"))
            # Look for a "下一页" (next page) link and follow it only when it is a new URL.
            try:
                curls = new_root.find("div", attrs={"class": "meneame"}).findAll("a")
                for curl in curls:
                    if curl.text == "下一页" and url != root_domain + curl.get("href"):
                        url = root_domain + curl.get("href")
                        hasNext = True
                        print url
            except:
                pass
        return item
    except:
        line = response.url + "\n"
        self.file = codecs.open('error2.json', 'ab', encoding='utf-8')
        self.file.write(line.decode("unicode_escape"))
def parse_items(self, response):
    # title:       done
    # txt:         main work, focus on parsing the HTML and removing impurities
    # pubdate:     done
    # snatch_time: static
    # site_url:    static
    # site_name:   static
    # url:         done
    # topPost:     static
    # init:
    html_parser = HTMLParser.HTMLParser()
    item = FenghuoItem()
    url = response.url
    self.log("Hi, this is in parse_items, url is %s" % url)
    root = bs(response.body)
    div = root.find("div", "conText")
    strong1 = div.find("strong", "fromSummary")
    pubdate = div.find("strong", "timeSummary").text
    ps = div.find("div", attrs={"id": "text"}).findAll("p")
    item_page = div.find("div", attrs={"id": "pages"})
    title = div.find("h1").text
    html = ""
    # Check whether the last paragraph is a pagination block and drop it if so.
    print ps[-1]
    if ps[-1].find("div", "page"):
        del ps[-1]
    # Build the text paragraph by paragraph.
    for p in ps:
        # Strip HTML comments embedded in the text.
        comments = p.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        html = html + '\n' + p.text.encode('utf-8')
    text = html_parser.unescape(html)
    item['url'] = url
    item['title'] = html_parser.unescape(title)
    item['txt'] = text
    item['pubdate'] = str(pubdate)
    item['snatch_time'] = datetime.datetime.now()
    item['topPost'] = 1
    item['site_name'] = '环球国内新闻'
    item['site_url'] = "china.huanqiu.com/"
    # Check whether the article text continues on a next page.
    if item_page:
        next_page = item_page.findAll("a")
        next_url = url  # default so the "last page" branch is taken when no next link exists
        for a in next_page:
            if a.text == "下一页":
                next_url = a.get('href')  # link to the next page of the article
        if next_url == url:
            # We are already parsing the last page of the whole text.
            print next_url
        else:
            # The remaining part of the text lives on a following page.
            yield scrapy.Request(url=next_url, meta={'item': item}, callback=self.parse_items_page)
    else:
        # The whole text fits on a single page, so hand the item to the pipeline.
        fp.process_item(item, "123")
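
# parse_items_page is referenced above but not included in this excerpt. Below is
# a hypothetical sketch of such a continuation callback, under the assumption
# that it appends each follow-up page's paragraphs to item['txt'] and only hands
# the item to the pipeline once the last page is reached; it reuses the module's
# bs, fp and scrapy names.
def parse_items_page(self, response):
    item = response.meta['item']
    root = bs(response.body)
    div = root.find("div", "conText")
    ps = div.find("div", attrs={"id": "text"}).findAll("p")
    item['txt'] = item['txt'] + '\n' + '\n'.join(p.text.encode('utf-8') for p in ps)
    pages = div.find("div", attrs={"id": "pages"})
    next_url = response.url
    if pages:
        for a in pages.findAll("a"):
            if a.text == "下一页":
                next_url = a.get('href')
    if next_url == response.url:
        # Last page reached: store the assembled item.
        fp.process_item(item, "123")
    else:
        # Keep following the article across its remaining pages.
        yield scrapy.Request(url=next_url, meta={'item': item}, callback=self.parse_items_page)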