def __extact_by_type(self, node, area_name):
    '''
    Extract all deal rows grouped by house type.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of dicts, one per data row (empty when no table found)
    '''
    utils.print('提取按照户型分类的数据...')
    table = node.find('table')
    if table is None:
        utils.print('没有找到按照户型分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row (the original tracked
    # this with a manual counter and kept an unused `area` alias).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 6:
            continue
        house = {
            'region': area_name,
            'house_type': columns[0].text,
            'deal_count': columns[1].text,
            'area': utils.get_num(columns[2].text),
            'price': utils.get_num(columns[3].text),
            'availableforsalecount': utils.get_num(columns[4].text),
            'availableforsalearea': utils.get_num(columns[5].text),
        }
        house_list.append(house)
    return house_list
def decode(cls, page_node):
    '''
    Decode a house detail page into a flat dict of house attributes.

    Target DB columns (for reference):
        id serial not null,
        building_name character varying(255),  -- building number
        branch character varying(10),          -- block number
        house_type character varying(255),
        contact_code character varying(255),
        price double precision,
        floor integer,
        room_num character varying(50),
        usage character varying(50),
        build_area double precision,
        inside_area double precision,
        share_area double precision,

    :param page_node: BeautifulSoup node of the house page
    :return: merged dict of all decoded rows, or None on error
    '''
    tr_nodes = page_node.find_all('tr', class_='a1')
    house_info = {}
    try:
        for tr_node in tr_nodes:
            temp_house = cls.__decode_one_row(tr_node)
            # fix: a row that fails to decode may come back as None;
            # dict.update(None) would raise TypeError and abort the page.
            if temp_house:
                house_info.update(temp_house)
        return house_info
    except Exception as e:
        utils.print('解析房屋页面信息时发生错误, error: {}'.format(str(e)))
        return None
def __extact_by_type(self, node, area_name):
    '''
    Extract all deal rows grouped by house type into ORM objects.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of orm.NewHouseByType (empty when no table found)
    '''
    utils.print('提取按照户型分类的数据...')
    table = node.find('table')
    if table is None:
        utils.print('没有找到按照户型分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row (replaces the manual
    # counter; also drops the unused `area` alias assignment).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 6:
            continue
        house = orm.NewHouseByType()
        house.thedate = dt.now()
        house.region = area_name
        house.house_type = columns[0].text
        house.deal_count = columns[1].text
        house.area = utils.get_num(columns[2].text)
        house.price = utils.get_num(columns[3].text)
        house.availableforsalecount = utils.get_num(columns[4].text)
        house.availableforsalearea = utils.get_num(columns[5].text)
        house_list.append(house)
    return house_list
def __extract_by_use(self, node, area_name):
    '''
    Extract all deal rows grouped by usage type.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of dicts, one per data row (empty when no table found)
    '''
    utils.print('提取按照用途分类的数据...')
    table = node.find('table')
    if table is None:
        # fix: message said 用户 (user) instead of 用途 (usage)
        utils.print('没有找到按照用途分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row.
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 3:
            continue
        house = {
            'region': area_name,
            'use_type': columns[0].text,
            'deal_count': utils.get_num(columns[1].text),
            'area': utils.get_num(columns[2].text),
        }
        house_list.append(house)
    return house_list
def crawl(self):
    '''Fetch one pending project and crawl its detail pages.

    :return: True when a project was crawled, False when none is pending.
    '''
    pending = NewHouseSourceDao.get_one_project()
    if pending is not None:
        self.__crawl_project_detail(pending)
        return True
    utils.print('暂无项目可抓取...')
    return False
def __extract_by_use(self, node, area_name):
    '''
    Extract all deal rows grouped by usage type into ORM objects.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of orm.OldHouseByUse (empty when no table found)
    '''
    utils.print('提取按照用途分类的数据...')
    table = node.find('table')
    if table is None:
        # fix: message said 用户 (user) instead of 用途 (usage)
        utils.print('没有找到按照用途分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row (replaces the manual
    # counter; also drops the unused `area` alias assignment).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 3:
            continue
        house = orm.OldHouseByUse()
        house.thedate = dt.now()
        house.region = area_name
        house.use_type = columns[0].text
        house.deal_count = utils.get_num(columns[1].text)
        house.area = utils.get_num(columns[2].text)
        house_list.append(house)
    return house_list
def __extract_by_area(self, node, area_name):
    '''
    Extract all deal rows grouped by floor-area bracket.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of dicts, one per data row (empty when no table found)
    '''
    utils.print('提取按照面积分类的数据...')
    table = node.find('table')
    if table is None:
        utils.print('没有找到按照面积分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row (replaces the manual
    # counter; also drops the unused `area` alias assignment).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 5:
            continue
        house = {
            'region': area_name,
            'area_level': columns[0].text,
            'deal_count': columns[1].text,
            'area': utils.get_num(columns[2].text),
            'price': utils.get_num(columns[3].text),
            'total_price': utils.get_num(columns[4].text),
        }
        house_list.append(house)
    return house_list
def __extract_by_area(self, node, area_name):
    '''
    Extract all deal rows grouped by floor-area bracket into ORM objects.
    :param node: BeautifulSoup node containing the data table
    :param area_name: district name the rows belong to
    :return: list of orm.NewHouseByArea (empty when no table found)
    '''
    utils.print('提取按照面积分类的数据...')
    table = node.find('table')
    if table is None:
        utils.print('没有找到按照面积分类的数据')
        return []
    house_list = []
    # Skip the first <tr>: it is the header row (replaces the manual
    # counter; also drops the unused `area` alias assignment).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 5:
            continue
        house = orm.NewHouseByArea()
        house.thedate = dt.now()
        house.region = area_name
        house.area_level = columns[0].text
        house.deal_count = columns[1].text
        house.area = utils.get_num(columns[2].text)
        house.price = utils.get_num(columns[3].text)
        house.total_price = utils.get_num(columns[4].text)
        house_list.append(house)
    return house_list
def job1():
    '''Log a heartbeat, then block for two minutes.

    Sleeps in one-second slices so the process stays responsive to
    signals.  The original while/counter loop slept 121 times (the
    counter was checked only after sleeping); this fixes the off-by-one
    so the job blocks for exactly 120 seconds.
    '''
    utils.print('job1')
    for _ in range(120):
        time.sleep(1)
def crawl(self):
    '''
    Crawl Shenzhen new-house deal data.

    Results go to three tables (by house type, by area bracket and by
    usage type).  For each category the data of the whole city plus the
    Nanshan, Futian, Luohu, Yantian, Baoan and Longgang districts is
    fetched in turn.
    :return: None
    '''
    utils.print('---开始抓取深圳新房成交数据---')
    for area_name in self.areas:
        self.__query_one_area(area_name)
def decode_and_write(cls, page_node, project_info):
    '''Decode a project page, persist it, and attach its database id.

    :param page_node: BeautifulSoup node of the project page
    :param project_info: summary info from the listing page; receives
        the 'id' key on success
    :return: True on success, False when the project id cannot be read back
    '''
    project_info = cls.__decode(page_node, project_info)
    # The write result is deliberately ignored; success is judged
    # solely by whether an id can be fetched afterwards.
    NewHouseSourceDao.write_project(project_info)
    project_id = NewHouseSourceDao.get_project_id(project_info)
    if project_id == 0:
        utils.print('获取项目Id失败, {}'.format(project_info['project_name']))
        return False
    project_info['id'] = project_id
    return True
def crawl(self):
    '''Crawl all pages of second-hand house listings.

    Pages are fetched sequentially until the reported total count is
    reached.  fix: the original `continue`d on exception without ever
    advancing or limiting retries, so a persistently failing page
    looped forever; retries are now bounded.
    '''
    utils.print('开始抓取二手房源数据...')
    pageindex = self.__page_index
    failures = 0
    while True:
        try:
            if not self.__crawl_one_page(pageindex):
                break
        except Exception as e:
            utils.print('抓取第{}页失败, {}'.format(pageindex, str(e)))
            failures += 1
            if failures >= 3:  # give up on a page that keeps failing
                break
            continue
        failures = 0
        if self.__total_count < self.__page_size * (pageindex - 1):
            break
        pageindex += 1
def __decode_house(cls, house_node, branch_name):
    '''Decode one house cell, then fetch and decode its detail page.

    :param house_node: node holding the room number and the detail link
    :param branch_name: name of the block the house belongs to
    :return: decoded house info from the detail page, or None on failure
    '''
    div_nodes = house_node.find_all('div')
    if len(div_nodes) != 2:
        utils.print('获取房间信息失败: {}, {}'.format(branch_name, house_node.text))
        return None
    room_num = utils.remove_blank_char(div_nodes[0].text)
    house = {'branch': branch_name, 'room_num': room_num}
    href_node = div_nodes[1].find('a')
    if href_node is None:
        utils.print('获取房间的连接信息失败, {}, {}'.format(branch_name, house_node.text))
        return None
    url = '{}{}'.format(cls.__url, href_node['href'])
    utils.print('读取房间 {} {} {} {}的信息...'.format(cls.__project_name, cls.__building_name, branch_name, house['room_num']))
    r = utils.request_with_retry(url)
    if r is None:
        utils.print('读取房屋{}的页面信息失败'.format(house['room_num']))
        return None
    # NOTE(review): the locally built `house` dict is discarded here;
    # only the detail-page decode result is returned — confirm intended.
    html_node = BeautifulSoup(r.text, 'lxml')
    return NewHSrcHousePageDecoder.decode(html_node)
def __crawl_one_page_project(self, page_index):
    '''Fetch and persist one page of the project listing.

    :param page_index: 1-based page number; page 1 is a plain GET,
        later pages are POSTed with the saved ASP.NET form data.
    '''
    utils.print('正在读取第%d页项目列表...' % page_index)
    if page_index == 1:
        r = utils.request_with_retry(self.__url)
    else:
        r = utils.request_with_retry('{}index.aspx'.format(self.__url), self.__create_form_data(page_index))
    if r is None:
        utils.print('读取项目页面失败, page_index = {}'.format(page_index))
        return
    html_node = BeautifulSoup(r.text, 'lxml')
    # Refresh the hidden form fields for the next POST.
    self.extract_formdata_from_newpage(html_node)
    if page_index == 1:
        self.__get_total_count(html_node)
    project_list = []
    for project_node in self.__get_project_nodes(html_node):
        project = self.__convert_project_node_to_project(project_node)
        if project is not None:
            project['is_crawled'] = False
            project_list.append(project)
    utils.print('解析出%d条项目信息' % len(project_list))
    writedcount = NewHouseSourceDao.write_project_summary_list(project_list)
    utils.print('写入数据库 %d 条记录' % writedcount)
def __get_total_count(self, node):
    '''
    Extract the total record count from the first result page.
    :param node: BeautifulSoup node of the page
    :return: True on success, False otherwise.  fix: the regex-failure
        path used a bare `return` (None) while every other path
        returned a bool — now consistently False.
    '''
    spans = node.find_all('span', class_='a1')
    if len(spans) != 2:
        utils.print('查找记录总数失败')
        return False
    nums = re.findall(r'\d+', spans[1].text)
    if len(nums) != 1:
        utils.print('从字符串 {} 提取记录总数失败'.format(spans[1].text))
        return False
    self.__total_count = int(nums[0])
    return True
def run(self):
    '''Scan the presale project list once and mail an alarm for each
    project not seen before.'''
    utils.print('正在读取项目列表...')
    r = utils.request_with_retry(self.__url)
    if r is None:
        utils.print('读取项目页面失败...')
        return
    html_node = BeautifulSoup(r.text, 'lxml')
    for project_node in self.__get_project_nodes(html_node):
        project = self.__convert_project_node_to_project(project_node)
        if project is None:
            continue
        project['is_crawled'] = False
        # A positive write count means the row was newly inserted,
        # i.e. the project was not seen before.
        if NewHouseSourceDao.write_project_summary(project) > 0:
            MailSender.send_alarm_message('深圳有新地产项目通过预售', str(project))
def __get_project_nodes(self, node):
    '''Locate every presale-project row in the page.

    :param node: BeautifulSoup node of the whole page
    :return: list of <tr> nodes, one per project (possibly empty)
    '''
    table_node = node.find('table', id='DataList1')
    if table_node is None:
        utils.print('获取项目列表表格失败...')
        return []
    sub_table_node = table_node.find('table')
    if sub_table_node is None:
        utils.print('获取项目列表子表格失败...')
        return []
    rows = sub_table_node.find_all('tr')
    if len(rows) < 3:
        return []
    # The first two rows are the header and a spacer row.
    return rows[2:]
def __crawl_one_page(self, pageindex):
    '''
    Crawl one page of house listings.
    :param pageindex: 1-based page number
    :return: whether the next page should be fetched; stop when this
        page failed or nothing was written to the database.
    '''
    utils.print('抓取第{}页...'.format(pageindex))
    url = self.__url.format(pageindex)
    r = utils.request_with_retry(url)
    if r is None:
        # fix: the original dereferenced r.text without this check,
        # unlike every sibling method in the file
        utils.print('读取第{}页失败'.format(pageindex))
        return False
    s = BeautifulSoup(r.text, 'lxml')
    if pageindex == 1:
        if not self.__get_total_count(s):
            return False
    tablenode = s.find('table', id='DataGrid1')
    if tablenode is None:
        utils.print('查找表格失败')
        return False
    house_list = []
    for house_node in tablenode.find_all('tr'):
        house_properties = house_node.find_all('td')
        if len(house_properties) < 9:
            continue
        if house_properties[0].text == '项目名称':  # header row
            continue
        house = orm.OldHouseSource()
        # columns = ['thedate', 'region', 'serial_num', 'project_name',
        #            'area', 'use_type', 'code', 'agency_info']
        house.project_name = utils.remove_blank_char(house_properties[0].text)
        house.serial_num = house_properties[1].text
        house.region = utils.remove_blank_char(house_properties[2].text)
        house.area = house_properties[3].text
        house.use_type = house_properties[4].text
        house.code = house_properties[6].text  # column 5 is skipped, as in the original
        house.agency_info = utils.remove_blank_char(house_properties[7].text)
        house.thedate = house_properties[8].text
        house_list.append(house)
    return orm_ope.insert_item_list(house_list)
def __query_one_area(self, area_name):
    '''
    Query the deal data of one district.

    The endpoint takes different form data per district, and the
    response is not fully valid HTML (its first and last lines are not
    markup), so downstream parsing must be tolerant.
    :param area_name: district name; '全市' means the whole city
    :return: None
    '''
    utils.print('query {} info...'.format(area_name))
    if area_name == '全市':
        r = utils.request_with_retry(self.__url)
    else:
        fromdata = self.areas[area_name]
        self.form_data['ctl00$ContentPlaceHolder1$scriptManager1'] = fromdata['ctl00$ContentPlaceHolder1$scriptManager1']
        self.form_data['__EVENTTARGET'] = fromdata['__EVENTTARGET']
        r = utils.request_with_retry(self.__url, self.form_data)
    if r is None:
        # fix: the original dereferenced r.text without this check
        utils.print('读取{}页面失败'.format(area_name))
        return
    s = BeautifulSoup(r.text, 'lxml')
    self.extract_formdata_from_newpage(s)
    self.__extract_info_from_page_into_db(s, area_name)
def __read_mailer_sender_info(cls):
    '''Read SMTP settings from config.ini next to the program.

    :return: dict with host/user/pass/sender/receivers, or None when
        the file cannot be read or any field is empty.
    '''
    import os  # local import: module-level imports are not visible here
    mail_info = {}
    config = configparser.ConfigParser()
    try:
        # fix: build the path portably instead of hard-coding a
        # Windows backslash ('{}\\config.ini' breaks on POSIX)
        config_path = os.path.join(sys.path[0], 'config.ini')
        read_ok = config.read(config_path, encoding='utf-8-sig')
        print(str(read_ok))
        section = 'mail'
        for key in ('host', 'user', 'pass', 'sender', 'receivers'):
            mail_info[key] = config.get(section, key)
        if any(value == '' for value in mail_info.values()):
            utils.print('读取邮件配置信息失败: 配置内容为: {}'.format(str(mail_info)))
            return None
        return mail_info
    except Exception as e:
        print(' 读取配置文件出错.')
        print(e)
        return None
def query_every_day_data(self):
    '''Run the three daily crawlers: new-house deals, old-house deals
    and old-house listings.

    Any exception is logged with a traceback so the enclosing
    scheduler keeps running.
    '''
    try:
        utils.print('---------------开始轮询-------------------')
        for crawler_cls in (NewHouseDealInfoCrawler,
                            OldHouseDealInfoCrawler,
                            OldHouseSourceCrawler):
            crawler_cls().crawl()
        utils.print('---------------结束轮询-------------------')
        print('')
    except Exception as e:
        utils.print('在轮询期间发生未知错误, {}'.format(str(e)))
        traceback.print_exc()
def job2():
    '''Placeholder scheduler job: just logs its own name.'''
    utils.print('job2')
def __crawl_project_detail(self, project_info):
    '''
    Fetch a project's detail page, then every building page under it,
    and write the houses to the database.
    :param project_info: summary info taken from the project list; must
        contain 'project_name', 'url', 'building_list' and
        'presale_license_num'.
    :return: False when the project page itself cannot be processed;
        otherwise falls through after marking the project crawled.
    '''
    utils.print('读取项目{}页面'.format(project_info['project_name']))
    r = utils.request_with_retry(project_info['url'])
    if r is None:
        utils.print('读取项目: {} , 页面失败...'.format(
            project_info['project_name']))
        return False
    s = BeautifulSoup(r.text, 'lxml')
    if not NewHSrcPrjPageDecoder.decode_and_write(s, project_info):
        return False
    for building in project_info['building_list']:
        try:
            utils.print('读取 {} 的 {} 页面...'.format(
                project_info['project_name'], building['building_name']))
            building['project_id'] = project_info['id']
            building['is_crawled'] = False
            # Skip buildings already crawled on a previous run.
            if NewHouseSourceDao.is_building_crawled(building) > 0:
                continue
            r = utils.request_with_retry(building['url'])
            if r is None:
                utils.print('读取项目 {} 的楼栋 {} 页面失败.'.format(
                    project_info['project_name'], building['building_name']))
                continue
            html_node = BeautifulSoup(r.text, 'lxml')
            house_list = NewHSrcBldPageDecoder.decode(
                html_node, building['building_name'],
                project_info['project_name'])
            if NewHouseSourceDao.write_newhouse_building(building) == 0:
                continue
            building_id = NewHouseSourceDao.get_building_id(building)
            if building_id == 0:
                # fix: was a bare print(), inconsistent with the rest
                # of the file which logs via utils.print
                utils.print('获取楼栋id失败,{}, {}'.format(
                    project_info['project_name'], building['building_name']))
                continue
            for house in house_list:
                house['building_id'] = building_id
            NewHouseSourceDao.write_houselist(house_list)
            NewHouseSourceDao.update_building_state_to_crawled(building_id)
        except Exception as e:
            # Best-effort per building: log and move on to the next one.
            utils.print('抓取建筑 {} 失败...'.format(building['building_name']))
            utils.print(str(e))
    NewHouseSourceDao.update_project_state_to_crawled(
        project_info['presale_license_num'])
def crawl_new_house_source_projects(self):
    '''Kick off the new-house-source project crawl unless one has
    already been recorded.'''
    already_crawled = NewHouseSourceDao.get_one_project(None) is not None
    if already_crawled:
        utils.print('已经抓取过房源信息...')
        return
    NewHSrcProjectCrawler().crawl()
@classmethod def crawl_new_house_source_projects(self): if NewHouseSourceDao.get_one_project(None) is not None: utils.print('已经抓取过房源信息...') return new_house_source_crawler = NewHSrcProjectCrawler() new_house_source_crawler.crawl() @classmethod def query_and_mail_new_house_info(self): monitor = NewHouseMonitor() monitor.run() utils.print('-----------程序启动--------------') new_house_detail_querier = NewHouseDetailQuerier() def schedule_task(): # 每秒钟检查一次是否有新房源要抓取 schedule.every().second.do(new_house_detail_querier.query_one_project) # 每小时检查是否有新房通过预售 schedule.every().hour.do( ShenzhenHouseCrawler.query_and_mail_new_house_info) # 每天12点抓取当天的 新房和二手房成交信息 schedule.every().day.at('12:00').do( ShenzhenHouseCrawler.query_every_day_data)