class UrlsPro: def __init__(self, typeName, configName): self.type = typeName self.config = ConfigProvider(configName) if self.type == 'sale': self.link = '.lianjia.com/ershoufang/' elif self.type == 'deal': self.link = '.lianjia.com/chengjiao/' else: self.link = '' logging.warning('输入类型错误,需要为sale或者deal') # 生成第一个请求url,目的在于获取页数 def getFirstUrls(self): city = self.config.get(self.type, 'city') areas = self.config.get(self.type, 'area').split('/') if self.link == '' or city == '' or areas == '': return [] urls = [] for area in areas: urls.append('https://' + city + self.link + area + '/') return urls # 若选择多城市爬取,生成各城市请求url def getMultiCityUrls(self): urls = [] if self.isMulti(): cities = self.config.get(self.type, 'multi_city').split('/') if cities == '': return urls for city in cities: urls.append('https://' + city + self.link) return urls def getMinPage(self): page = self.config.get(self.type, 'min_page') if page is None: return 0 return int(page) def isMulti(self): multi = self.config.get(self.type, 'enable_multi') if multi == 'True': return True else: return False def getAllUrls(self): urls = self.getFirstUrls() urls.extend(self.getMultiCityUrls()) return urls
class DataTo: def __init__(self, type): self.config = ConfigProvider('LianJiaConfig.cfg') self.is_csv = self.config.get('data', 'csv') self.is_db = self.config.get('data', 'db') self.type = type if self.is_db == 'ON': self.con, self.cursor = self.prepare() self.dbinit() # 建立数据库连接,进行初始化工作,返回数据库连接 def prepare(self): con = pymysql.connect(host=self.config.get('DB', 'host'), user=self.config.get('DB', 'user'), passwd=self.config.get('DB', 'password'), port=int(self.config.get('DB', 'port')), charset='utf8') cursor = con.cursor() return con, cursor def dbinit(self): try: # 建立数据库与表 self.cursor.execute( 'CREATE DATABASE IF NOT EXISTS HOUSECRAWL DEFAULT CHARSET utf8 COLLATE utf8_general_ci' ) self.cursor.execute('USE HOUSECRAWL') createSale = """CREATE TABLE IF NOT EXISTS `sale`( `house_id` VARCHAR(20), `city` VARCHAR(20), `district` VARCHAR(30), `title` VARCHAR(200) NOT NULL, `area` VARCHAR(100) NOT NULL, `description` VARCHAR(200) NOT NULL, `attention` VARCHAR(50) NOT NULL, `putdate` VARCHAR(50) NOT NULL, `totalprice` FLOAT, `unitprice` FLOAT, PRIMARY KEY (`house_id`))ENGINE=InnoDB DEFAULT CHARSET=utf8""" createDeal = """CREATE TABLE IF NOT EXISTS `deal`( `house_id` VARCHAR(20), `city` VARCHAR(20), `district` VARCHAR(30), `title` VARCHAR(100) NOT NULL, `description` VARCHAR(150) NOT NULL, `saletime` VARCHAR(20) NOT NULL, `dealdate` VARCHAR(20) NOT NULL, `saleprice` FLOAT, `dealprice` FLOAT, `unitprice` FLOAT, PRIMARY KEY (`house_id`))ENGINE=InnoDB DEFAULT CHARSET=utf8""" self.cursor.execute(createSale) self.cursor.execute(createDeal) self.con.commit() except: self.con.rollback() def insert(self, items): # 在售二手房插入sql,利用on duplicate key update 去重,注意类型必须均为%s insertSale = """insert into sale(house_id,city,district,title,area,description,attention,putdate,totalprice,unitprice) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update city=VALUES(city),district=VALUES(district),title=VALUES(title),area=VALUES(area),description=VALUES(area), attention=VALUES(attention),putdate=VALUES(putdate),totalprice=VALUES(totalprice),unitprice=VALUES(unitprice)""" # 成交二手房插入sql insertDeal = """insert into deal(house_id,city,district,title,description,saletime,dealdate,saleprice,dealprice,unitprice) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update city=VALUES(city),district=VALUES(district),title=VALUES(title),description=VALUES(description), saletime=VALUES(saletime),dealdate=VALUES(dealdate),saleprice=VALUES(saleprice),dealprice=VALUES(dealprice), unitprice=VALUES(unitprice)""" try: if self.type == 'sale': self.cursor.executemany(insertSale, items) if self.type == 'deal': self.cursor.executemany(insertDeal, items) self.con.commit() except: self.con.rollback() # 返回csv列名 def getName(self): if self.type == 'sale': return ["标题", "区域", "描述", "关注信息", "总价(万)", "单价(元/平方米)"] if self.type == 'deal': return [ "标题", "描述", "成交周期", "成交日期", "挂牌价(万)", "成交价(万)", "单价(元/平方米)" ]