示例#1
0
 def save_items(self, items):
     """Write every item to ``sro.<item class name>`` and commit once.

     Each item is converted to a dict with ``ItemAdapter`` and passed
     through the private ``__check_size`` and ``__clean_dict`` helpers
     (defined elsewhere in the class). An INSERT is attempted first; if it
     fails, the row whose ``url`` matches is updated instead.

     Args:
         items: Iterable of scraped items. The class name of each item is
             used as the table name and its field names as column names.

     Raises:
         Exception: The INSERT error is re-raised when the item has no
             ``url`` (or nothing besides ``url``) to build an UPDATE from.
             Errors from the fallback UPDATE propagate as well.
     """
     for item in items:
         table_name = item.__class__.__name__
         item_dict = ItemAdapter(item).asdict()
         item_dict = self.__check_size(item_dict)
         item_dict = self.__clean_dict(item_dict)

         # Identifiers (table/column names) cannot be bound as parameters,
         # so they are still interpolated. Values are bound with the same
         # "%s" placeholder style the UPDATE statement already relied on,
         # which avoids quoting bugs and SQL injection through field values.
         columns = ', '.join(item_dict)
         placeholders = ', '.join(['%s'] * len(item_dict))
         sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
             table_name, columns, placeholders)
         try:
             self._cursor.execute(sql, list(item_dict.values()))
             print(sql)
         except Exception:
             # NOTE(review): on drivers that abort the whole transaction
             # after a failed statement, this fallback may fail too unless
             # a rollback/savepoint is added -- confirm for the driver used.
             update_fields = {k: v for k, v in item_dict.items()
                              if k != 'url'}
             if 'url' not in item_dict or not update_fields:
                 # No key to match on, or nothing to set: surface the
                 # original INSERT error instead of masking it.
                 raise
             set_str = ", ".join("{}=%s".format(k) for k in update_fields)
             sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                 table_name, set_str)
             self._cursor.execute(
                 sql, list(update_fields.values()) + [item_dict['url']])
             print(sql)
     self._connection.commit()
    def process_item(self, item, spider):
        """Remove every field whose value equals the string ``"None"``.

        Args:
            item: Scraped item, accessed through ``ItemAdapter``.
            spider: Spider that produced the item (unused).

        Returns:
            The same ``item`` with the matching fields removed.
        """
        adapter = ItemAdapter(item)

        # Collect first, then remove, so the adapter is never modified
        # while its keys are being walked.
        placeholder_fields = [
            name for name in adapter.keys() if adapter[name] == "None"
        ]
        for name in placeholder_fields:
            adapter.pop(name)

        return item
 def process_item(self, item, spider):
     """Drop items without a company name and strip malformed ZIP codes.

     Args:
         item: Scraped item, accessed through ``ItemAdapter``.
         spider: Spider that produced the item (unused).

     Returns:
         The same ``item``. Its ``zip_code`` field is removed when the
         value is present but not exactly five characters long.

     Raises:
         DropItem: If ``company_name`` is missing or ``None``.
     """
     adapter = ItemAdapter(item)

     if adapter.get("company_name") is None:
         raise DropItem(
             f"No name detected. Dropping {item['bbb_url'].split('/')[-1]}"
         )

     zip_code = adapter.get("zip_code")
     if zip_code is not None and len(zip_code) != 5:
         adapter.pop("zip_code")

     # Always hand the item on: previously the bad-ZIP branch fell off the
     # end of the method and returned None, losing the item for later
     # pipeline stages even though only its ZIP code was meant to go.
     return item
示例#4
0
 def process_item(self, item, spider):
     """Split the raw description, price and update-time fields in place.

     When ``描述1`` is set, its second element is split on spaces into
     ``区域`` and ``小区``, and its first element is split on commas into
     ``户型``, ``面积``, ``类型``, ``楼层`` and ``总层``; ``描述1`` is then
     removed. ``单价`` is reduced to its leading digits and ``更新时间`` to
     the first ten characters following the first colon.

     Args:
         item: Scraped item, accessed through ``ItemAdapter``.
         spider: Spider that produced the item (unused).

     Returns:
         The same ``item`` with the derived fields filled in.
     """
     adapter = ItemAdapter(item)

     description = adapter.get('描述1')
     if description:
         # Second element: "<district> <community> ..."
         location_parts = description[1].split(' ')
         adapter['区域'] = location_parts[0]
         adapter['小区'] = location_parts[1]

         # First element: comma-separated layout / area / type / floor info
         detail_parts = description[0].split(',')
         adapter['户型'] = detail_parts[0].partition(':')[-1]
         adapter['面积'] = detail_parts[1].partition(' ')[0]
         adapter['类型'] = detail_parts[2].partition(':')[-1]
         floor_text, _, total_text = detail_parts[3].partition('(')
         adapter['楼层'] = floor_text[3:]
         adapter['总层'] = total_text[1:-2]

         adapter.pop('描述1')

     unit_price = adapter.get('单价')
     if unit_price:
         leading_digits = re.findall(r'^\d+', unit_price)
         adapter['单价'] = leading_digits[0]

     updated_at = adapter.get('更新时间')
     if updated_at:
         after_colon = updated_at.partition(':')[-1]
         adapter['更新时间'] = after_colon.strip()[:10]

     return item
 def process_item(self, item, spider):
     """Store the item in ``self.scholar_rank`` keyed by field, then rank.

     The item is copied to a dict; its ``field`` and ``rank`` entries are
     removed from that copy and used as the two lookup keys, and the
     remaining dict is stored as the value.

     Args:
         item: Scraped item, accessed through ``ItemAdapter``.
         spider: Spider that produced the item (unused).

     Returns:
         The unmodified ``item``.
     """
     record = ItemAdapter(item).asdict()

     ranking = self.scholar_rank
     field_name = record.pop('field')
     by_rank = ranking[field_name]
     rank = record.pop('rank')
     by_rank[rank] = record

     return item