Exemplo n.º 1
0
    def parse1(self, response):
        """Parse an on-sale parcel detail page.

        Extracts regex-keyed text fields from the page's paragraphs into
        ``item['monitor_extra']`` and the embedded HTML table into
        ``item['content_detail']``, then yields the enriched item.  On any
        parsing failure the error is logged and the original item is
        yielded unchanged (best-effort behavior kept from the original).
        """
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        item = response.meta['item']
        item['parcel_status'] = 'onsell'
        try:
            e_page = bs_obj.find('div',
                                 attrs={
                                     'id': 'infoContent',
                                     'class': 'SconC'
                                 })
            # Collect the page's free-text paragraphs.
            e_ps = e_page.find_all('p')
            row_ps = [e_p.get_text(strip=True) for e_p in e_ps]
            # For each pattern in re_text, gather the matched substrings.
            # Fixes two defects in the original:
            #  * `unicode` does not exist on Python 3 -- use `str`.
            #  * `filter(...)` kept whole paragraphs (lazily) instead of the
            #    matched groups the lambda computed; materialize the groups
            #    in a list so pd.DataFrame receives concrete values.
            d = {}
            for r in re_text:
                matches = []
                for x in row_ps:
                    if isinstance(x, str):
                        m = re.search(r, x)
                        if m:
                            matches.append(m.group())
                d[re_text[r]] = matches
            df0 = pd.DataFrame(d)
            item['monitor_extra'] = df0

            # Parse the embedded HTML table into a DataFrame.
            e_table = e_page.table
            df = html_table_reader.table_tr_td(e_table)
            item['content_detail'] = df
            yield item
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; the best-effort fallback is preserved.
            log_obj.error(
                item['monitor_url'], "%s(%s)中无法解析\n%s" %
                (self.name, response.url, traceback.format_exc()))
            yield response.meta['item']
 def parse1(self, response):
     """Parse a parcel detail page into the item carried in response.meta.

     Reads the detail table (MsoNormalTable, falling back to the table
     inside the centered div) into ``item['content_detail']``; for items
     still on sale, also extracts ``item['monitor_extra']`` via
     spider_func.extra_parse, trying two known page layouts.  On failure
     the error is logged and the original item is yielded unchanged.
     """
     bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
     item = response.meta['item']
     try:
         # Primary layout: an MsoNormalTable; otherwise fall back to the
         # table nested inside the centered div.
         e_table = bs_obj.find('table', class_='MsoNormalTable')
         if not e_table:
             e_table = bs_obj.find('div', align='center').table
         df = html_table_reader.table_tr_td(e_table)
         item['content_detail'] = df
         if item['parcel_status'] == 'onsell':
             try:
                 item['monitor_extra'] = spider_func.extra_parse(
                     bs_obj, {
                         "tag": "div",
                         "attrs": {
                             "class": "TRS_PreAppend"
                         },
                         "row_tag": "p"
                     })
             except Exception:
                 # Fallback page layout: the extra fields live in a td.
                 # Narrowed from bare `except:`.
                 item['monitor_extra'] = spider_func.extra_parse(
                     bs_obj, {
                         "tag": "td",
                         "attrs": {
                             "class": "text"
                         },
                         "row_tag": "p"
                     })
         yield item
     except Exception:
         # Narrowed from bare `except:` so SystemExit/KeyboardInterrupt
         # propagate; parsing failures are still logged and tolerated.
         log_obj.error(
             item['monitor_url'], "%s(%s)中无法解析\n%s" %
             (self.name, response.url, traceback.format_exc()))
         yield response.meta['item']
 def parse1(self, response):
     """Parse a detail page whose content may contain nested tables.

     If the outermost table contains inner tables, each inner table is
     converted separately; otherwise the single table is converted.  The
     resulting list of DataFrames goes into ``item['content_detail']``.
     For on-sale items, extra fields are parsed from the show_centen div.
     Failures are logged and the original item is yielded unchanged.
     """
     bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
     item = response.meta['item']
     try:
         # Renamed the original local `l` (ambiguous single letter, E741).
         tables = []
         e_table = bs_obj.table
         if e_table.table:
             # Nested layout: convert every inner table individually.
             for e_t in e_table.find_all('table'):
                 tables.append(html_table_reader.table_tr_td(e_t))
         else:
             tables.append(html_table_reader.table_tr_td(e_table))
         item['content_detail'] = tables
         if item['parcel_status'] == 'onsell':
             item['monitor_extra'] = spider_func.extra_parse(
                 bs_obj, {
                     "tag": "div",
                     "attrs": {
                         "class": "show_centen"
                     },
                     "row_tag": "p"
                 })
         yield item
     except Exception:
         # Narrowed from bare `except:`; best-effort fallback preserved.
         log_obj.error(
             item['monitor_url'], "%s(%s)中无法解析\n%s" %
             (self.name, response.url, traceback.format_exc()))
         yield response.meta['item']
 def parse22(self, response):
     """Parse a detail page whose content is a single MsoNormalTable.

     Converts the table to a DataFrame stored in
     ``item['content_detail']`` and yields the item.  On failure the
     error is logged and the original item is yielded unchanged.
     """
     bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
     item = response.meta['item']
     try:
         e_table = bs_obj.find('table', class_='MsoNormalTable')
         df = html_table_reader.table_tr_td(e_table)
         item['content_detail'] = df
         yield item
     except Exception:
         # Narrowed from bare `except:` so SystemExit/KeyboardInterrupt
         # propagate; parsing failures are still logged and tolerated.
         log_obj.error(
             item['monitor_url'], "%s(%s)中无法解析\n%s" %
             (self.name, response.url, traceback.format_exc()))
         yield response.meta['item']
 def list_parser(self, bs_obj, title, targetfile):
     """Write the page's styled listing table to *targetfile* as CSV.

     Finds the ``table.styledTable`` element in *bs_obj*, converts it to
     a DataFrame and saves it with a UTF-8 BOM (utf_8_sig) so it opens
     cleanly in Excel.

     NOTE(review): *title* is accepted but unused here -- kept for
     interface compatibility with callers.
     """
     styled = bs_obj.find('table', class_='styledTable')
     frame = html_table_reader.table_tr_td(styled)
     frame.to_csv(targetfile, encoding='utf_8_sig')