def __init__(self, city=None):
    """Create the crawler session and cache its helpers on the instance.

    Args:
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    # When crawling the latest listings there is no upper bound on the count.
    self.max_count = -1

    session = LianJiaSession(city)
    self.lian_jia_session = session
    self.__yaml_data = session.get_prop()
    self.base_url = session.get_city_url()
    self.__logger = session.get_logger()
    self.sql_session = session.get_sql_session()
class LianJia:
    """Run producer/consumer workers over every pending community (xiao qu)."""

    def __init__(self, city=None):
        """Create the crawler session and cache its helpers on the instance.

        Args:
            city: Value forwarded unchanged to ``LianJiaSession``.
        """
        self.lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.lian_jia_session.get_prop()
        self.__house_list = []
        self.__logger = self.lian_jia_session.get_logger()
        self.sql_session = self.lian_jia_session.get_sql_session()

    def parse(self, is_breaking=False):
        """Start the worker threads and block until all of them finish.

        ``producer_num`` ``XiaoQuHouses`` workers read from the community
        queue and share an intermediate queue with ``consumer_num``
        ``ParseXiaoQuPage`` workers.

        Args:
            is_breaking: Forwarded to ``utils.reset_xiao_qu_status``.
        """
        utils.reset_xiao_qu_status(self.sql_session, is_breaking)
        xiao_qu_queue = self.__get_xiao_qu_map()
        xiao_qu_id_soup_queue = queue.Queue()

        producer_arr = []
        for i in range(self.__yaml_data['producer_num']):
            producer = XiaoQuHouses(self.lian_jia_session, xiao_qu_queue,
                                    xiao_qu_id_soup_queue)
            producer.setName('producer - ' + str(i))
            producer_arr.append(producer)
            producer.start()

        consumer_arr = []
        for i in range(self.__yaml_data['consumer_num']):
            consumer = ParseXiaoQuPage(self.lian_jia_session,
                                       xiao_qu_id_soup_queue)
            consumer.setName('consumer - ' + str(i))
            consumer.start()
            consumer_arr.append(consumer)

        # join() returns immediately for a worker that has already finished,
        # so no is_alive() pre-check is needed.
        for producer in producer_arr:
            producer.join()
        self.__logger.info('producer completed!.....................')

        # Every producer is done: enqueue (None, None) markers for the
        # consumers.
        # NOTE(review): two markers are queued per consumer, as in the
        # original; presumably consumers stop on the first one - confirm.
        for _ in consumer_arr:
            xiao_qu_id_soup_queue.put((None, None))
            xiao_qu_id_soup_queue.put((None, None))

        for consumer in consumer_arr:
            consumer.join()
        self.__logger.info('consumer completed!.....................')

    def __get_xiao_qu_map(self):
        """Return a queue holding every pending community worth crawling.

        A community qualifies when its ``status`` is false and its
        ``zai_shou`` count is at least the configured ``min_house``.

        Returns:
            queue.Queue: The matching ``XiaoQu`` rows.
        """
        min_house = self.__yaml_data['min_house']
        # `== False` is deliberate: it is a SQLAlchemy column comparison.
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= min_house).all()
        xiao_qu_quenue = queue.Queue()
        for item in xiao_qus:
            xiao_qu_quenue.put(item)
        self.__logger.info('发现小区数量 : [{0}](小区房源数量至少{1})'.format(
            xiao_qu_quenue.qsize(), min_house))
        return xiao_qu_quenue
def __init__(self, city=None):
    """Create the crawler session and remember the community URLs already stored.

    Args:
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    session = LianJiaSession(city)
    self.__lian_jia_session = session
    self.__yaml_data = session.get_prop()
    self.__logger = session.get_logger()
    self.__sql_session = session.get_sql_session()
    self.__base_url = session.get_city_url()

    # Seed the set with the url of every XiaoQu row currently in the database.
    self.__xiao_qu_urls = set()
    stored_rows = self.__sql_session.query(XiaoQu).all()
    self.__xiao_qu_urls.update(row.url for row in stored_rows)
def __init__(self, city=None):
    """Create the crawler session and load the district names already stored.

    Args:
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    session = LianJiaSession(city)
    self.__lian_jia_session = session
    self.__yaml_data = session.get_prop()
    self.__house_list = []
    self.__logger = session.get_logger()
    self.__sql_session = session.get_sql_session()
    self.__base_url = session.get_city_url()

    # Names seen so far; prevents inserting a district with the same name twice.
    self.__district_names = set()
    # Pre-populate the set with the districts that already exist.
    self.get_exixt_district_url()
class AllDistrict:
    """Collect districts and their sub-areas from the ``/xiaoqu/`` filter links."""

    def __init__(self, city=None):
        """Create the crawler session and load the district names already stored.

        Args:
            city: Value forwarded unchanged to ``LianJiaSession``.
        """
        self.__lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.__lian_jia_session.get_prop()
        self.__house_list = []
        self.__logger = self.__lian_jia_session.get_logger()
        self.__sql_session = self.__lian_jia_session.get_sql_session()
        self.__base_url = self.__lian_jia_session.get_city_url()
        # Names seen so far; prevents inserting the same name twice.
        self.__district_names = set()
        # Pre-populate the set with the districts that already exist.
        self.get_exixt_district_url()

    def parse(self):
        """Store the top-level districts, then the sub-areas of each one."""
        a_arr = self.__parse_html('{0}/xiaoqu/'.format(self.__base_url))
        districts = self.__parse_a(a_arr)
        self.__sql_session.add_all(districts)
        # Committed before the loop below, which reads ``item.id``.
        self.__sql_session.commit()

        for item in districts:
            a_arr = self.__parse_html(self.__base_url + item.url)
            # A separate name keeps the list being iterated from being rebound.
            sub_districts = self.__parse_a(a_arr, item.id)
            self.__sql_session.add_all(sub_districts)
            self.__sql_session.commit()

    def __parse_html(self, url):
        """Fetch ``url`` and return the elements matching the position filter.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The result of ``soup.select('.m-filter .position a')``.
        """
        rep = self.__lian_jia_session.get(url)
        soup = BeautifulSoup(rep.text, 'lxml')
        return soup.select('.m-filter .position a')

    def __parse_a(self, items, parent_id=None):
        """Build ``District`` objects for the links that are not known yet.

        A link is skipped when it has no ``href`` attribute, when its name was
        already recorded, or when its URL is listed in ``exclude_url``.

        Args:
            items: Link elements as returned by ``__parse_html``.
            parent_id: Id stored in ``District.parent``; left unset when falsy.

        Returns:
            list: The newly created ``District`` objects (not yet persisted).
        """
        districts = []
        for item in items:
            # ``hasattr(tag, 'href')`` is always true on a BeautifulSoup Tag
            # (unknown attributes resolve to a child-tag lookup), so the HTML
            # attribute has to be tested with ``has_attr``.
            if not item.has_attr('href'):
                continue
            name = str(item.string)
            url = item['href']
            if name in self.__district_names or url in exclude_url:
                continue

            district = District()
            district.name = name
            district.url = str(url)
            if parent_id:
                district.parent = parent_id
            self.__logger.info(
                'add district name[{0}], url[{1}], parentId[{2}]'.
                format(name, url, parent_id))
            districts.append(district)
            self.__district_names.add(name)
        return districts

    def get_exixt_district_url(self):
        """Add the name of every stored ``District`` row to the seen-name set."""
        districts = self.__sql_session.query(District).all()
        for d in districts:
            self.__district_names.add(d.name)
def __init__(self, report_date, city=None):
    """Collect session settings and open a raw database cursor.

    Args:
        report_date: Stored as ``self.query_time``.
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    session = LianJiaSession(city)

    # Values read straight from the session.
    self.city = session.city
    self.city_zh = session.get_city_zh()
    sql_engine = session.get_sql_engine()
    self.__yaml_data = session.get_prop()
    self.root_path = session.get_log_path()
    self.logging = session.get_logger()
    self.log_file_path = session.get_log_file_name()

    # Cursor on a raw connection taken from the engine.
    raw_conn = sql_engine.raw_connection()
    self.cursor = raw_conn.cursor()

    self.query_time = report_date
    self.today = datetime.today()
    self.date_str = self.get_date_str()
def __init__(self, city=None):
    """Create the crawler session and cache its helpers on the instance.

    Args:
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    session = LianJiaSession(city)
    self.lian_jia_session = session
    self.__yaml_data = session.get_prop()
    self.base_url = session.get_city_url()
    self.__logger = session.get_logger()
    self.sql_session = session.get_sql_session()
class ChengJiaoHouse:
    """Crawl closed-deal (chengjiao) pages and store them as ``ChengJiao`` rows."""

    def __init__(self, city=None):
        """Create the crawler session and cache its helpers on the instance.

        Args:
            city: Value forwarded unchanged to ``LianJiaSession``.
        """
        self.lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.lian_jia_session.get_prop()
        self.base_url = self.lian_jia_session.get_city_url()
        self.__logger = self.lian_jia_session.get_logger()
        self.sql_session = self.lian_jia_session.get_sql_session()

    def test(self, is_breaking=False):
        """Mark pending communities that already have deal rows as done.

        Args:
            is_breaking: Unused; kept for signature compatibility.
        """
        # `== False` is deliberate: it is a SQLAlchemy column comparison.
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= self.__yaml_data['min_house']).all()
        for xiao_qu in xiao_qus:
            cjs = self.sql_session.query(ChengJiao).filter(
                ChengJiao.xiao_qu == xiao_qu.id).all()
            if cjs:
                xiao_qu.status = True
                self.sql_session.commit()

    def parse(self, is_breaking=False):
        """Crawl the deal pages of every pending community.

        Each community is flagged ``status = True`` and committed once its
        pages have been parsed.

        Args:
            is_breaking: Forwarded to ``utils.reset_xiao_qu_status``.
        """
        utils.reset_xiao_qu_status(self.sql_session, is_breaking)
        xiao_qu_queue = self.__get_xiao_qu_map()
        t = xiao_qu_queue.qsize()
        i = 0
        while not xiao_qu_queue.empty():
            i += 1
            self.__logger.info('总进度 ===> {:0.2f}'.format(i / t))
            xiao_qu = xiao_qu_queue.get()
            if xiao_qu is None:
                break
            # Build the URL from the configured city base URL (as
            # parse_latest does) instead of a hard-coded city host.
            url = '{0}/chengjiao/c{1}/'.format(self.base_url, xiao_qu.url)
            self.parse_page(url, xiao_qu.id)
            xiao_qu.status = True
            self.sql_session.commit()

    def parse_latest(self):
        """Crawl the city-wide deal listing (no community filter)."""
        url = '{0}/chengjiao/'.format(self.base_url)
        self.parse_page(url)

    def parse_page(self, url, xiao_qu_id=None):
        """Parse the listing at ``url`` and every further page it links to.

        Args:
            url: Absolute URL of the first listing page.
            xiao_qu_id: Community id to store on each deal; when ``None`` the
                id is looked up per deal from the community name in the title.
        """
        rep = self.lian_jia_session.get(url)
        soup = BeautifulSoup(rep.text, 'lxml')
        total = soup.find('div', class_='total').find('span').get_text(
            strip=True)
        total = int(total)
        # NOTE(review): the original indentation was lost; the parsing below
        # is assumed to run whatever the total is (parse_latest needs that) -
        # confirm against the original file.
        if total > 1000:
            self.__logger.info('error 小区[{0}] 发现房源数异常[{1}]'.format(url, total))
        else:
            self.__logger.info('发现总房源{0}套'.format(total))

        page_url_list = utils.get_all_page(soup)
        self.__parse_soup(soup, xiao_qu_id)
        for page_url in page_url_list:
            rep = self.lian_jia_session.get(self.base_url + page_url)
            soup = BeautifulSoup(rep.text, 'lxml')
            self.__parse_soup(soup, xiao_qu_id)

    def __get_xiao_qu_id_by_name(self, name):
        """Return the id of the single community called ``name``.

        Args:
            name: Community name to look up.

        Returns:
            The community id, or ``None`` when zero or several rows match
            (both cases are logged).
        """
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.name == name).all()
        if len(xiao_qus) == 1:
            return xiao_qus[0].id
        if len(xiao_qus) > 1:
            self.__logger.info('error 找多个同名小区 名称[{0}]'.format(name))
        else:
            self.__logger.info('error 找不到小区 名称[{0}]'.format(name))
        return None

    def __parse_soup(self, soup, xiao_qu_id):
        """Store every deal on one listing page that is not in the database.

        Deals whose URL already exists, or whose title does not split into
        exactly three space-separated parts, are skipped.

        Args:
            soup: Parsed listing page.
            xiao_qu_id: Community id for all deals on the page, or ``None``
                to resolve it per deal from the first part of the title.
        """
        cheng_jiao_list = []
        for li in soup.select('ul.listContent li'):
            info_div = li.find('div', class_='info')
            url, title = utils.get_url_title(info_div)
            exist = self.sql_session.query(ChengJiao).filter(
                ChengJiao.url == url).one_or_none()
            # Skip deals that are already stored.
            if exist is not None:
                continue
            args = title.split(' ')
            if len(args) != 3:
                continue

            # Resolve the id per row. Overwriting the parameter would make
            # every later row on the page reuse the first row's community.
            row_xiao_qu_id = xiao_qu_id
            if row_xiao_qu_id is None:
                row_xiao_qu_id = self.__get_xiao_qu_id_by_name(args[0])

            price = info_div.find('div', class_='totalPrice').find(
                'span', class_='number').get_text(strip=True)
            unit_price = info_div.find('div', class_='unitPrice').find(
                'span', class_='number').get_text(strip=True)

            deal_date = info_div.find(
                'div', class_='dealDate').get_text(strip=True)
            y_m_d = deal_date.split('.')
            deal_date = date(int(y_m_d[0]), int(y_m_d[1]), int(y_m_d[2]))

            deal_cycle_txt = info_div.find(
                'span', class_='dealCycleTxt').get_text(strip=True)
            flood = info_div.find(
                'div', class_='positionInfo').get_text(strip=True)
            # Listing price: the text between '牌' and '万'.
            gua_pai_jia = float(deal_cycle_txt[
                deal_cycle_txt.find('牌') + 1:deal_cycle_txt.find('万')])
            # Deal cycle: the text between '期' and '天'.
            zhou_qi = int(deal_cycle_txt[
                deal_cycle_txt.find('期') + 1:deal_cycle_txt.find('天')])

            cheng_jiao = ChengJiao()
            cheng_jiao.xiao_qu = row_xiao_qu_id
            cheng_jiao.price = float(price)
            cheng_jiao.unit_price = float(unit_price)
            cheng_jiao.url = url
            cheng_jiao.title = title
            cheng_jiao.deal_date = deal_date
            cheng_jiao.gua_pai_jia = gua_pai_jia
            cheng_jiao.zhou_qi = zhou_qi
            cheng_jiao.flood = flood
            cheng_jiao.hu_xing = args[1]
            # First number found in the third title part.
            cheng_jiao.area1 = float(re.findall(r"\d+\.?\d*", args[2])[0])
            cheng_jiao_list.append(cheng_jiao)
            # NOTE(review): the '价格' slot logs gua_pai_jia (listing price),
            # not the deal price - kept as in the original, confirm intent.
            self.__logger.info(
                'url[{0}] 标题[{1}] 价格[{2}] 单价[{3}] 成交日期[{4}]'.format(
                    url, title, gua_pai_jia, unit_price, deal_date))
        self.sql_session.add_all(cheng_jiao_list)
        self.sql_session.commit()

    def __get_xiao_qu_map(self):
        """Return a queue holding every pending community worth crawling.

        A community qualifies when its ``status`` is false and its
        ``zai_shou`` count is at least the configured ``min_house``.

        Returns:
            queue.Queue: The matching ``XiaoQu`` rows.
        """
        min_house = self.__yaml_data['min_house']
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= min_house).all()
        xiao_qu_quenue = queue.Queue()
        for item in xiao_qus:
            xiao_qu_quenue.put(item)
        self.__logger.info('发现小区数量 : [{0}](小区房源数量至少{1})'.format(
            xiao_qu_quenue.qsize(), min_house))
        return xiao_qu_quenue
# NOTE(review): these three columns are the tail of a model class whose header
# lies outside this chunk; keep them at that class's indentation when merging.
price = Column(FLOAT(8))
change_time = Column(DATETIME)
create_time = Column(DATETIME, server_default=func.now())


def create_view(engine):
    """Create the ``price_change_com`` and ``district_area`` views.

    Args:
        engine: Object whose ``raw_connection()`` returns a DB-API connection.
    """
    statements = (
        'CREATE VIEW price_change_com AS SELECT p.house_id AS house_id, '
        'p.pre_price AS pre_price,p.price AS price,'
        '( p.price - p.pre_price ) AS priceChange, '
        'Round( ( p.price - p.pre_price ) / p.pre_price * 100, 2 ) AS fudu,'
        'p.change_time AS change_time FROM price_change p',
        'CREATE VIEW district_area AS SELECT d1.id id, d2.NAME district, '
        'd1.NAME area, d1.url url FROM district d1, district d2 '
        'WHERE d2.id = d1.parent',
    )
    conn = engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            for sql in statements:
                cursor.execute(sql)
        finally:
            cursor.close()
        # Commit explicitly so the DDL is kept on transactional backends.
        conn.commit()
    finally:
        # Always release the raw connection, even when a statement fails.
        conn.close()


if __name__ == '__main__':
    # Usage: script [city [view]]. Any third argument enables view creation
    # (it stays a non-empty string, which is truthy).
    view = False
    if len(sys.argv) == 3:
        filename, city, view = sys.argv
        lian_jia_session = LianJiaSession(city)
    # `elif` keeps the three-argument session from being replaced by the
    # default one in the final `else`.
    elif len(sys.argv) == 2:
        filename, city = sys.argv
        lian_jia_session = LianJiaSession(city)
    else:
        lian_jia_session = LianJiaSession()
    engine = lian_jia_session.get_sql_engine()
    base.metadata.create_all(engine)  # create the table structure
    if view:
        create_view(engine)
class AllXiaoQu:
    """Crawl the community (xiao qu) listing of every sub-district."""

    def __init__(self, city=None):
        """Create the crawler session and remember the community URLs stored.

        Args:
            city: Value forwarded unchanged to ``LianJiaSession``.
        """
        self.__lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.__lian_jia_session.get_prop()
        self.__logger = self.__lian_jia_session.get_logger()
        self.__sql_session = self.__lian_jia_session.get_sql_session()
        self.__base_url = self.__lian_jia_session.get_city_url()
        # url of every XiaoQu row already in the database; used to skip
        # duplicates while parsing.
        self.__xiao_qu_urls = set()
        xiao_qus = self.__sql_session.query(XiaoQu).all()
        for item in xiao_qus:
            self.__xiao_qu_urls.add(item.url)

    def get_xiao_qu_list(self):
        """Parse and store the communities of every district with a parent."""
        # `!= None` is deliberate: it is a SQLAlchemy column comparison.
        district_list = self.__sql_session.query(District).filter(
            District.parent != None).all()  # noqa: E711
        # Dividing by 100 up front makes ``i / t`` a percentage.
        t = len(district_list) / 100
        for i, district in enumerate(district_list, start=1):
            self.__logger.info('进度[{0:.2f}] 开始解析片区[{1}] id[{2}]===> url[{3}]'.format(
                i / t, district.name, district.id, district.url))
            xiao_qu_list = self.__parse_a_district(district)
            self.__sql_session.add_all(xiao_qu_list)
            self.__sql_session.commit()

    def __parse_a_district(self, district):
        """Return the new communities found on all pages of one district.

        Args:
            district: ``District`` row; its ``url`` and ``id`` are read.

        Returns:
            list: ``XiaoQu`` objects not seen before (not yet persisted).
        """
        rep = self.__lian_jia_session.get(self.__base_url + district.url)
        soup = BeautifulSoup(rep.text, 'lxml')
        xiao_qu_list = self.__parse_page(soup, district.id)
        for url in utils.get_all_page(soup):
            rep = self.__lian_jia_session.get(self.__base_url + url)
            soup = BeautifulSoup(rep.text, 'lxml')
            xiao_qu_list.extend(self.__parse_page(soup, district.id))
        self.__logger.info('添加小区数量 ==> {0}'.format(len(xiao_qu_list)))
        return xiao_qu_list

    def __parse_page(self, soup, district):
        """Build ``XiaoQu`` objects for the unseen communities on one page.

        Args:
            soup: Parsed listing page.
            district: Value stored in ``XiaoQu.district``.

        Returns:
            list: The newly created ``XiaoQu`` objects.
        """
        xiao_qu_list = []
        for li in soup.select('ul.listContent li'):
            div = li.find('div', class_='info')
            a = div.find('div', class_='title').a
            href = a['href']
            # Drop the last character, then keep what follows the final '/'.
            xiao_qu_url = href[:-1].rsplit('/', 1)[-1]
            name = a.get_text(strip=True)
            if xiao_qu_url in self.__xiao_qu_urls:
                continue
            self.__xiao_qu_urls.add(xiao_qu_url)

            xiao_qu = XiaoQu()
            xiao_qu.district = district
            xiao_qu.url = xiao_qu_url
            xiao_qu.name = name

            house_info = div.find(
                'div', class_='houseInfo').get_text(strip=True)
            # Number between the first '交' and the first '套'.
            xiao_qu.cheng_jiao_90 = int(
                house_info[house_info.find('交') + 1:house_info.find('套')])
            # Number between the last '|' and the last '套'.
            xiao_qu.chu_zu = int(
                house_info[house_info.rfind('|') + 1:house_info.rfind('套')])

            nian_dai = div.select('div.positionInfo')[0].get_text(strip=True)
            nian_dai = nian_dai[nian_dai.find('/') + 1:nian_dai.find('年')]
            # Falls back to 0 when the extracted text is not all digits.
            xiao_qu.nian_dai = int(nian_dai) if nian_dai.isdigit() else 0

            average_price = li.find(
                'div', class_='totalPrice').get_text(strip=True)
            average_price = average_price[:average_price.find('元')]
            # Left unset when the extracted text is not all digits.
            if average_price.isdigit():
                xiao_qu.average_price = float(average_price)

            zai_shou = li.find(
                'div', class_='xiaoquListItemSellCount').find('a').get_text(
                    strip=True)
            xiao_qu.zai_shou = int(zai_shou[:zai_shou.find('套')])
            xiao_qu_list.append(xiao_qu)
        return xiao_qu_list
def __init__(self, city=None):
    """Create the crawler session and cache its helpers on the instance.

    Args:
        city: Value forwarded unchanged to ``LianJiaSession``.
    """
    session = LianJiaSession(city)
    self.lian_jia_session = session
    self.__yaml_data = session.get_prop()
    self.__house_list = []
    self.__logger = session.get_logger()
    self.sql_session = session.get_sql_session()