def parse1(self, response):
    """Parse an 'onsell' parcel detail page.

    Extracts the announcement paragraphs that match each pattern in the
    module-level ``re_text`` mapping (stored as a DataFrame in
    ``monitor_extra``) and the embedded detail table (``content_detail``),
    then yields the enriched item.

    :param response: scrapy Response; ``response.meta['item']`` carries the
        partially-filled item from the listing page.
    :yields: the item, enriched on success; the original item on failure.
    """
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    item['parcel_status'] = 'onsell'
    try:
        e_page = bs_obj.find('div', attrs={
            'id': 'infoContent',
            'class': 'SconC'
        })
        # Collect the visible text of every paragraph in the announcement body.
        row_ps = [e_p.get_text(strip=True) for e_p in e_page.find_all('p')]
        # For each pattern, keep the paragraphs that match it. The original
        # code used Py2-style filter() with a lambda whose .group() result was
        # discarded (filter keeps the element itself) plus a `unicode` check
        # that raises NameError on Python 3 and yields a lazy iterator that
        # pandas cannot consume — this comprehension is the equivalent,
        # version-safe form.
        d = {
            re_text[r]: [x for x in row_ps if re.search(r, x)]
            for r in re_text
        }
        # NOTE(review): columns of unequal length make pd.DataFrame raise,
        # falling through to the error branch below — same as the original.
        item['monitor_extra'] = pd.DataFrame(d)
        # The structured detail table inside the announcement body.
        e_table = e_page.table
        item['content_detail'] = html_table_reader.table_tr_td(e_table)
        yield item
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # (and scrapy shutdown signals) are not swallowed.
        log_obj.error(
            item['monitor_url'],
            "%s(%s)中无法解析\n%s" % (self.name, response.url,
                                  traceback.format_exc()))
        yield response.meta['item']
def parse1(self, response):
    """Parse a parcel detail page with a Word-exported layout.

    Pulls the main detail table into ``content_detail`` and, for parcels
    still on sale, the free-text announcement into ``monitor_extra``.

    :param response: scrapy Response; ``response.meta['item']`` carries the
        partially-filled item from the listing page.
    :yields: the item, enriched on success; the original item on failure.
    """
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    try:
        # Prefer the Word-exported table; some pages instead nest the table
        # inside a centered <div>.
        e_table = bs_obj.find('table', class_='MsoNormalTable')
        if not e_table:
            e_table = bs_obj.find('div', align='center').table
        item['content_detail'] = html_table_reader.table_tr_td(e_table)
        if item['parcel_status'] == 'onsell':
            # Page templates vary: try the TRS editor container first, then
            # fall back to the legacy td.text container.
            try:
                item['monitor_extra'] = spider_func.extra_parse(
                    bs_obj, {
                        "tag": "div",
                        "attrs": {"class": "TRS_PreAppend"},
                        "row_tag": "p"
                    })
            except Exception:
                item['monitor_extra'] = spider_func.extra_parse(
                    bs_obj, {
                        "tag": "td",
                        "attrs": {"class": "text"},
                        "row_tag": "p"
                    })
        yield item
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; the failure is logged with a full traceback.
        log_obj.error(
            item['monitor_url'],
            "%s(%s)中无法解析\n%s" % (self.name, response.url,
                                  traceback.format_exc()))
        yield response.meta['item']
def parse1(self, response):
    """Parse a parcel detail page whose content may hold nested tables.

    Collects one parsed DataFrame per table into ``content_detail`` (always
    a list) and, for parcels still on sale, the announcement text into
    ``monitor_extra``.

    :param response: scrapy Response; ``response.meta['item']`` carries the
        partially-filled item from the listing page.
    :yields: the item, enriched on success; the original item on failure.
    """
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    try:
        tables = []  # renamed from ambiguous single-letter `l` (E741)
        e_table = bs_obj.table
        if e_table.table:
            # Nested layout: parse each inner table separately.
            for e_t in e_table.find_all('table'):
                tables.append(html_table_reader.table_tr_td(e_t))
        else:
            tables.append(html_table_reader.table_tr_td(e_table))
        item['content_detail'] = tables
        if item['parcel_status'] == 'onsell':
            item['monitor_extra'] = spider_func.extra_parse(
                bs_obj, {
                    "tag": "div",
                    "attrs": {"class": "show_centen"},
                    "row_tag": "p"
                })
        yield item
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; the failure is logged with a full traceback.
        log_obj.error(
            item['monitor_url'],
            "%s(%s)中无法解析\n%s" % (self.name, response.url,
                                  traceback.format_exc()))
        yield response.meta['item']
def parse22(self, response):
    """Parse a detail page that contains only a Word-exported table.

    Stores the parsed table in ``content_detail`` and yields the item.

    :param response: scrapy Response; ``response.meta['item']`` carries the
        partially-filled item from the listing page.
    :yields: the item, enriched on success; the original item on failure.
    """
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    try:
        e_table = bs_obj.find('table', class_='MsoNormalTable')
        item['content_detail'] = html_table_reader.table_tr_td(e_table)
        yield item
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; the failure is logged with a full traceback.
        log_obj.error(
            item['monitor_url'],
            "%s(%s)中无法解析\n%s" % (self.name, response.url,
                                  traceback.format_exc()))
        yield response.meta['item']
def list_parser(self, bs_obj, title, targetfile):
    """Extract the styled listing table from *bs_obj* and write it to CSV.

    ``title`` is accepted for interface compatibility with sibling parsers
    but is not used here.

    :param bs_obj: a BeautifulSoup document containing a table.styledTable.
    :param targetfile: destination path for the CSV dump.
    """
    table_el = bs_obj.find('table', class_='styledTable')
    frame = html_table_reader.table_tr_td(table_el)
    # utf_8_sig writes a BOM so Excel opens Chinese text correctly.
    frame.to_csv(targetfile, encoding='utf_8_sig')