Пример #1
0
class UrlsPro:
    def __init__(self, typeName, configName):
        self.type = typeName
        self.config = ConfigProvider(configName)
        if self.type == 'sale':
            self.link = '.lianjia.com/ershoufang/'
        elif self.type == 'deal':
            self.link = '.lianjia.com/chengjiao/'
        else:
            self.link = ''
            logging.warning('输入类型错误,需要为sale或者deal')

    # 生成第一个请求url,目的在于获取页数
    def getFirstUrls(self):
        city = self.config.get(self.type, 'city')
        areas = self.config.get(self.type, 'area').split('/')
        if self.link == '' or city == '' or areas == '':
            return []
        urls = []
        for area in areas:
            urls.append('https://' + city + self.link + area + '/')
        return urls

    # 若选择多城市爬取,生成各城市请求url
    def getMultiCityUrls(self):
        urls = []
        if self.isMulti():
            cities = self.config.get(self.type, 'multi_city').split('/')
            if cities == '':
                return urls
            for city in cities:
                urls.append('https://' + city + self.link)
        return urls

    def getMinPage(self):
        page = self.config.get(self.type, 'min_page')
        if page is None:
            return 0
        return int(page)

    def isMulti(self):
        multi = self.config.get(self.type, 'enable_multi')
        if multi == 'True':
            return True
        else:
            return False

    def getAllUrls(self):
        urls = self.getFirstUrls()
        urls.extend(self.getMultiCityUrls())
        return urls
Пример #2
0
class DataTo:
    def __init__(self, type):
        self.config = ConfigProvider('LianJiaConfig.cfg')
        self.is_csv = self.config.get('data', 'csv')
        self.is_db = self.config.get('data', 'db')
        self.type = type
        if self.is_db == 'ON':
            self.con, self.cursor = self.prepare()
            self.dbinit()

    # 建立数据库连接,进行初始化工作,返回数据库连接
    def prepare(self):
        con = pymysql.connect(host=self.config.get('DB', 'host'),
                              user=self.config.get('DB', 'user'),
                              passwd=self.config.get('DB', 'password'),
                              port=int(self.config.get('DB', 'port')),
                              charset='utf8')
        cursor = con.cursor()
        return con, cursor

    def dbinit(self):
        try:
            # 建立数据库与表
            self.cursor.execute(
                'CREATE DATABASE IF NOT EXISTS HOUSECRAWL DEFAULT CHARSET utf8 COLLATE utf8_general_ci'
            )
            self.cursor.execute('USE HOUSECRAWL')
            createSale = """CREATE TABLE IF NOT EXISTS `sale`(
            `house_id` VARCHAR(20),
            `city` VARCHAR(20),
            `district` VARCHAR(30),
            `title` VARCHAR(200) NOT NULL,
            `area` VARCHAR(100) NOT NULL,
            `description` VARCHAR(200) NOT NULL,
            `attention` VARCHAR(50) NOT NULL,
            `putdate` VARCHAR(50) NOT NULL,
            `totalprice` FLOAT,
            `unitprice` FLOAT,
            PRIMARY KEY (`house_id`))ENGINE=InnoDB DEFAULT CHARSET=utf8"""
            createDeal = """CREATE TABLE IF NOT EXISTS `deal`(
            `house_id` VARCHAR(20),
            `city` VARCHAR(20),
            `district` VARCHAR(30),
            `title` VARCHAR(100) NOT NULL,
            `description` VARCHAR(150) NOT NULL,
            `saletime` VARCHAR(20) NOT NULL,
            `dealdate` VARCHAR(20) NOT NULL,
            `saleprice` FLOAT,
            `dealprice` FLOAT,
            `unitprice` FLOAT,
            PRIMARY KEY (`house_id`))ENGINE=InnoDB DEFAULT CHARSET=utf8"""
            self.cursor.execute(createSale)
            self.cursor.execute(createDeal)
            self.con.commit()
        except:
            self.con.rollback()

    def insert(self, items):
        # 在售二手房插入sql,利用on duplicate key update 去重,注意类型必须均为%s
        insertSale = """insert into sale(house_id,city,district,title,area,description,attention,putdate,totalprice,unitprice) 
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) 
        on duplicate key update 
        city=VALUES(city),district=VALUES(district),title=VALUES(title),area=VALUES(area),description=VALUES(area),
        attention=VALUES(attention),putdate=VALUES(putdate),totalprice=VALUES(totalprice),unitprice=VALUES(unitprice)"""
        # 成交二手房插入sql
        insertDeal = """insert into deal(house_id,city,district,title,description,saletime,dealdate,saleprice,dealprice,unitprice) 
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) 
        on duplicate key update 
        city=VALUES(city),district=VALUES(district),title=VALUES(title),description=VALUES(description),
        saletime=VALUES(saletime),dealdate=VALUES(dealdate),saleprice=VALUES(saleprice),dealprice=VALUES(dealprice),
        unitprice=VALUES(unitprice)"""
        try:
            if self.type == 'sale':
                self.cursor.executemany(insertSale, items)
            if self.type == 'deal':
                self.cursor.executemany(insertDeal, items)
            self.con.commit()
        except:
            self.con.rollback()

    # 返回csv列名
    def getName(self):
        if self.type == 'sale':
            return ["标题", "区域", "描述", "关注信息", "总价(万)", "单价(元/平方米)"]
        if self.type == 'deal':
            return [
                "标题", "描述", "成交周期", "成交日期", "挂牌价(万)", "成交价(万)", "单价(元/平方米)"
            ]