def process_item(self, item, spider):
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # Insert the second-hand listing; if the row already exists (unique
    # house_code), flip rent_status to 2 to mark it as re-seen.
    sql = "INSERT INTO ershoufang_detail_lianjia(title,link,house_code," \
          "community_code,community_link,community,town_link," \
          "town_disc,info,chan_quan,follow_info,tags,price,price_content," \
          "uni_price_content,rent_status) " \
          "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,0)" \
          " ON DUPLICATE KEY UPDATE rent_status = 2"
    # sql1 = "insert rent_scrapy_tmp(house_code) values(%s)"
    try:
        cursor.execute(
            sql,
            (item['title'], item['link'], item['house_code'],
             item['community_code'], item['community_link'], item['community'],
             item['town_link'], item['town_disc'], item['house_info'],
             item['chan_quan'], item['follow_info'], item['tags'],
             item['price'], item['price_content'], item['unit_price']))
        cursor.connection.commit()
    except BaseException as e:
        print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
        dbObject.rollback()
    # cursor.execute(sql1, item['house_code'])
    # cursor.connection.commit()
    return item
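Every pipeline and spider here opens its connection through mysqlHadler().dbHandle(), imported from msysqlDb (see the helper script at the end). That module is not shown; below is a minimal sketch of what it presumably contains, assuming PyMySQL. Only the names mysqlHadler and dbHandle come from the imports; the connection parameters are placeholders.

# msysqlDb.py -- hypothetical reconstruction; substitute real credentials.
import pymysql

class mysqlHadler(object):
    def dbHandle(self):
        # Placeholder host/user/password; the real values are not in the source.
        return pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            charset='utf8mb4')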
def process_item(self, item, spider):
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # Fill in the detail-page fields for an existing rental row; a row that
    # was marked stale (-1) before the crawl is reset to 0 once re-seen.
    sql = "UPDATE rent_detail_lianjia SET price_content = %s, tags = %s, house_type = %s," \
          " sub_way = %s, house_comment = %s, upload_date = %s, square = %s, direction = %s, base_info = %s," \
          " community_code = %s, community = %s, community_link = %s," \
          " rent_status = CASE WHEN rent_status = -1 THEN 0 ELSE rent_status END" \
          " WHERE house_code = %s"
    # sql = "update rent_detail_lianjia set community_code = %s, community = %s, community_link = %s where house_code = %s"
    try:
        cursor.execute(
            sql,
            (item['price_content'], item['tags'], item['house_type'],
             item['sub_way'], item['house_comment'], item['upload_date'],
             item['square'], item['direct'], item['base_info_str'],
             item['community_code'], item['community'], item['community_link'],
             item['house_code']))
        # cursor.execute(sql, (item['community_code'], item['community'], item['community_link'], item['house_code']))
        cursor.connection.commit()
    except BaseException as e:
        print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
        dbObject.rollback()
    return item
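Taken together, these status values appear to implement a mark-and-sweep refresh: the list spider below first marks every known row stale (rent_status = -1), the insert pipeline writes new listings with 0 and flips re-seen ones to 2 via ON DUPLICATE KEY UPDATE, and the CASE WHEN above restores a row from -1 to 0 once its detail page parses successfully. Any row still at -1 after a full run is a listing that has disappeared from the site.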
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjia'

    # Mark every known rental row as stale (-1) before the crawl starts;
    # the pipelines flip the status back for listings that are still online.
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    sql = "update rent_detail_lianjia set rent_status = -1 where rent_status <> -1 ;"
    # sql = "delete from rent_scrapy_tmp;"
    cursor.execute(sql)
    cursor.connection.commit()

    # Enumerate the paginated Pudong listing pages.
    box = []
    for num in range(102):
        pages = 'https://sh.lianjia.com/zufang/pudong/pg{0}/#contentList'.format(num)
        box.append(pages)
    start_urls = box
    # start_urls = [
    #     'https://sh.lianjia.com/zufang/pudong',
    # ]

    def parse(self, response):
        print(response)
        for quote in response.xpath('//div[@class="content__list--item"]'):
            price = int(quote.xpath('.//span[@class="content__list--item-price"]/em/text()')[0].extract())
            yield {
                'title': getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/text()')),
                'link': baseUrl + getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/@href')),
                # "/zufang/SHxxxx.html" -> "SHxxxx"
                'house_code': getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/@href'))[8:-5],
                'area_link': baseUrl + getList(quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=1]/@href')),
                'area_disc': getList(quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=1]/text()')),
                'town_link': baseUrl + getList(quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=2]/@href')),
                'town_disc': getList(quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=2]/text()')),
                'price': price
            }
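The spiders call getList() and prepend baseUrl, but neither helper is defined in the excerpt. A plausible sketch, assuming getList simply returns the first extracted string of a SelectorList, or an empty string when the XPath matched nothing:

# Hypothetical helpers -- the names come from the spiders, the bodies are assumed.
baseUrl = 'https://sh.lianjia.com'

def getList(selector_list):
    # Return the first extracted value, or '' if the XPath matched nothing.
    extracted = selector_list.extract()
    return extracted[0] if extracted else ''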
def process_item(self, item, spider):
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # New listings are inserted with rent_status = 0; a listing that already
    # exists (unique house_code) is flipped to 2 instead.
    sql = "INSERT INTO rent_detail_lianjia(title,link,house_code," \
          "area_link,area_disc,town_link," \
          "town_disc,price,rent_status) " \
          "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,0) ON DUPLICATE KEY UPDATE rent_status = 2"
    sql1 = "insert rent_scrapy_tmp(house_code) values(%s)"
    try:
        cursor.execute(
            sql,
            (item['title'], item['link'], item['house_code'],
             item['area_link'], item['area_disc'], item['town_link'],
             item['town_disc'], item['price']))
        cursor.connection.commit()
    except BaseException as e:
        print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
        dbObject.rollback()
    # cursor.execute(sql1, item['house_code'])
    # cursor.connection.commit()
    return item
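The ON DUPLICATE KEY UPDATE clause only fires if house_code carries a UNIQUE index; without it every run would insert duplicate rows. The DDL is not part of the source, so the sketch below is an assumption: the column names are taken from the INSERT and UPDATE statements above, while the types and index are guesses.

# Hypothetical table definition for rent_detail_lianjia.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS rent_detail_lianjia (
        house_code  VARCHAR(64) NOT NULL,
        title       VARCHAR(255),
        link        VARCHAR(255),
        area_link   VARCHAR(255),
        area_disc   VARCHAR(64),
        town_link   VARCHAR(255),
        town_disc   VARCHAR(64),
        price       INT,
        rent_status INT DEFAULT 0,
        -- columns filled in later by the lianjiaDetail pipeline
        price_content VARCHAR(255), tags VARCHAR(255), house_type VARCHAR(64),
        sub_way TEXT, house_comment TEXT, upload_date VARCHAR(16),
        square VARCHAR(32), direction VARCHAR(64), base_info TEXT,
        community_code VARCHAR(64), community VARCHAR(128), community_link VARCHAR(255),
        UNIQUE KEY uk_house_code (house_code)
    )
""")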
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjiaDetail'

    # Crawl the detail page of every rental row that has no community yet.
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    sql = "select link from rent_detail_lianjia where community_code is null ;"
    cursor.execute(sql)
    results = cursor.fetchall()
    box = []
    for row in results:
        box.append(row[0])
    # start_urls = [
    #     'https://sh.lianjia.com/zufang/SH2135972204813492224.html'
    # ]
    start_urls = box

    def parse(self, response):
        subtitle = response.xpath('//div[@class="content__subtitle"]')[0].xpath('string(.)').extract()[0].strip()
        upload_date = subtitle[12:22]
        code = response.xpath('//div[@class="content__subtitle"]')[0].xpath('.//i[position()=2]/text()')[0].extract()
        house_code = code[5:]

        content__aside = response.xpath('.//div[@class="content__aside fr"]')[0]
        price_content = content__aside.xpath('.//div[@class="content__aside--title"]')[0].xpath('string(.)').extract()[0]
        price = int(content__aside.xpath('.//div[@class="content__aside--title"]/span/text()')[0].extract())

        tags = ""
        for quote in content__aside.xpath('.//p[@class="content__aside--tags"]/i'):
            tags = tags + "/" + quote.xpath('string(.)').extract()[0]

        content__article__table = content__aside.xpath('//ul[@class="content__aside__list"]')[0]
        house_type_content_span = content__article__table.xpath('.//li[position()=2]/span')[0].xpath('string(.)').extract_first()
        house_type_content = content__article__table.xpath('.//li[position()=2]')[0].xpath('string(.)').extract_first()
        # Strip the label span; the remainder holds the house type and square.
        house_type_content1 = house_type_content.replace(house_type_content_span, "")
        house_type = house_type_content1.split()[0]
        square = house_type_content1.split()[1]
        direct = content__article__table.xpath('.//li[position()=3]')[0].xpath('string(.)').extract()[0]

        base_info = response.xpath('.//div[@class="content__article__info"]')[0]
        base_info_str = ""
        for quote in base_info.xpath('.//ul/li[@class="fl oneline"]'):
            base_info_str = base_info_str + quote.xpath('string(.)').extract()[0] + "\n"

        houseComment = response.xpath('.//p[@data-el="houseComment"]/attribute::data-desc')
        house_comment = ""
        if houseComment:
            house_comment = houseComment[0].extract()

        sub_way = ""
        for quote in response.xpath('.//div[@class="content__article__info4"]/ul/li'):
            sub_way = sub_way + quote.xpath('.//span[position()=1]/text()')[0].extract()
            sub_way = sub_way + "- " + quote.xpath('.//span[position()=2]/text()')[0].extract() + "\n"

        community_link = baseUrl + response.xpath('//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/@href')[0].extract()
        community_code = response.xpath('//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/@href')[0].extract()[8:-1]
        community = response.xpath('//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/text()')[0].extract()[:-2]

        yield {
            'house_code': house_code,
            'price_content': price_content,
            'tags': tags,
            'house_type': house_type,
            'sub_way': sub_way,
            'house_comment': house_comment,
            'upload_date': upload_date,
            'square': square,
            'direct': direct,
            'base_info_str': base_info_str,
            'community_code': community_code,
            'community': community,
            'community_link': community_link
        }
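These spiders lean heavily on fixed slice offsets, which encode assumptions about Lianjia's URL layout and will break silently if the markup changes. A quick illustration with hypothetical values of the shapes the code expects (the community href format is inferred from how the codes are reused, not confirmed by the source):

# Hypothetical inputs illustrating the slicing above.
href = '/zufang/SH2135972204813492224.html'
house_code = href[8:-5]                 # strip 8-char '/zufang/' prefix and '.html'
print(house_code)                       # SH2135972204813492224

community_href = '/xiaoqu/5011000012345/'   # assumed 8-char prefix, trailing '/'
community_code = community_href[8:-1]       # -> '5011000012345'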
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjiaCommunity'

    # Build one ershoufang search URL per known community code.
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    sql = "select distinct community_code link from ershoufang_detail_lianjia where community_code is not null ;"
    cursor.execute(sql)
    results = cursor.fetchall()
    box = []
    for row in results:
        box.append("https://sh.lianjia.com/ershoufang/c" + row[0])
    start_urls = box

    def parse(self, response):
        for quote in response.xpath('//li[@class="clear LOGCLICKDATA"]'):
            title = getList(quote.xpath('.//div[@class="info clear"]/div[@class="title"]/a/text()'))
            link = getList(quote.xpath('.//a[@class="noresultRecommend img "]/@href'))
            house_code = getList(quote.xpath('.//a[@class="noresultRecommend img "]/@data-housecode'))
            community_code = getList(quote.xpath('.//div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/@data-rid'))
            community_link = getList(quote.xpath('.//div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/@href'))
            community = getList(quote.xpath('.//div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/text()'))
            town_link = getList(quote.xpath('.//div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/@href'))
            town_disc = getList(quote.xpath('.//div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/text()'))
            nn = quote.xpath('.//div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]')
            house_info = nn[0].xpath('string(.)')[0].extract().replace("\n", "").replace(" ", "")
            # No trailing comma here: the original accidentally made chan_quan
            # a one-element tuple.
            chan_quan = quote.xpath('.//div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]')[0] \
                .xpath('string(.)')[0].extract().replace("\n", "").replace(" ", "")
            follow_info = quote.xpath('.//div[@class="info clear"]/div[@class="followInfo"]')[0] \
                .xpath('string(.)')[0].extract().replace("\n", "").replace(" ", "")
            tags = quote.xpath('.//div[@class="info clear"]/div[@class="tag"]')[0] \
                .xpath('string(.)')[0].extract().replace("\n", "").replace(" ", "")
            price = int(quote.xpath('.//div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0].extract())
            price_content = quote.xpath('.//div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]')[0] \
                .xpath('string(.)')[0].extract().replace("\n", "").replace(" ", "")
            unit_price = getList(quote.xpath('.//div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()'))
            yield {
                'title': title,
                'link': link,
                'house_code': house_code,
                'community_code': community_code,
                'community_link': community_link,
                'community': community,
                'town_link': town_link,
                'town_disc': town_disc,
                'house_info': house_info,
                'chan_quan': chan_quan,
                'follow_info': follow_info,
                'tags': tags,
                'price': price,
                'price_content': price_content,
                'unit_price': unit_price
            }
# coding=utf-8
from msysqlDb import mysqlHadler

dbObject = mysqlHadler().dbHandle()
cursor = dbObject.cursor()
cursor.execute("USE test")
sql = "select link from rent_detail_lianjia ;"
try:
    cursor.execute(sql)
    results = cursor.fetchall()
    # Print every stored listing link.
    for row in results:
        link = row[0]
        print("fname=%s" % link)
except BaseException as e:
    print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
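With all the pieces in place, a full refresh is three crawls run in order: `scrapy crawl lianjia` to re-mark and re-insert the rental index, `scrapy crawl lianjiaDetail` to fill in detail fields for rows missing a community, and `scrapy crawl lianjiaCommunity` to pull the matching ershoufang listings. The script above is then a quick sanity check that the links actually landed in the database.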