def save_items(self, items):
    """Persist each item to the ``sro.<ItemClassName>`` table.

    For every item an ``INSERT`` is attempted first. If executing it raises,
    the row whose ``url`` column equals the item's ``url`` is updated with
    the remaining fields instead. The connection is committed once per item.

    All values are passed to the driver as bound parameters using ``%s``
    placeholders rather than being spliced into the SQL text, so values
    containing quotes or ``%`` characters (common in URLs) no longer corrupt
    the statement. Values are handed to the driver as-is instead of being
    rendered through ``str.format``.

    Args:
        items: Iterable of items accepted by ``ItemAdapter``. The table name
            is taken from each item's class name and the column names from
            its field names; neither is escaped, so they must be trusted.

    Raises:
        KeyError: If the insert fails and the item has no ``url`` field.
        Exception: Whatever the cursor raises if the fallback UPDATE fails.
    """
    for item in items:
        table_name = item.__class__.__name__
        item_dict = ItemAdapter(item).asdict()
        item_dict = self.__check_size(item_dict)
        item_dict = self.__clean_dict(item_dict)

        columns = ', '.join(item_dict)
        placeholders = ', '.join(['%s'] * len(item_dict))
        insert_sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
            table_name, columns, placeholders)
        try:
            self._cursor.execute(insert_sql, list(item_dict.values()))
        except Exception:
            # NOTE(review): presumably the insert fails because a row with
            # this url already exists; any other failure takes this path too.
            # Some drivers need a rollback before the connection can be
            # reused after an error - confirm for the driver in use.
            url = item_dict.pop('url')
            set_str = ", ".join("{}=%s".format(k) for k in item_dict)
            update_sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                table_name, set_str)
            self._cursor.execute(
                update_sql, list(item_dict.values()) + [url])
            print(update_sql)
        else:
            print(insert_sql)
        self._connection.commit()
def process_item(self, item, spider):
    """Remove every field whose value equals the string ``"None"``.

    Args:
        item: The scraped item; it is modified in place through its adapter.
        spider: Unused.

    Returns:
        The same ``item`` object, minus the removed fields.
    """
    fields = ItemAdapter(item)
    # Iterate over a snapshot of the keys because fields are removed
    # while looping.
    for field_name in tuple(fields.keys()):
        if fields[field_name] == "None":
            fields.pop(field_name)
    return item
def process_item(self, item, spider):
    """Drop items without a company name and strip malformed zip codes.

    An item whose ``company_name`` is ``None`` is rejected with ``DropItem``.
    Otherwise, a ``zip_code`` whose length is not exactly 5 is removed from
    the item. The item is returned in both remaining cases; previously an
    item with a bad zip code had the field removed but the method then
    returned ``None`` instead of the item.

    Args:
        item: The scraped item; it may be modified in place.
        spider: Unused.

    Returns:
        The same ``item`` object.

    Raises:
        DropItem: If ``company_name`` is ``None``.
    """
    adapter = ItemAdapter(item)
    if adapter["company_name"] is None:
        raise DropItem(
            f"No name detected. Dropping {item['bbb_url'].split('/')[-1]}"
        )
    if len(adapter["zip_code"]) != 5:
        # Keep the item but discard the unusable zip code.
        adapter.pop("zip_code")
    return item
def process_item(self, item, spider):
    """Split the raw listing fields of ``item`` into separate fields.

    When ``描述1`` is present and truthy, its second element is split on
    spaces into ``区域`` and ``小区``, and its first element is split on
    commas into ``户型``, ``面积``, ``类型``, ``楼层`` and ``总层``; the
    ``描述1`` field is then removed. ``单价`` is reduced to its leading run
    of digits, and ``更新时间`` to the first 10 characters of the stripped
    text after its first colon.

    Args:
        item: The scraped item; it is modified in place through its adapter.
        spider: Unused.

    Returns:
        The same ``item`` object.
    """
    fields = ItemAdapter(item)

    if fields.get('描述1'):
        description = fields['描述1']

        location_parts = description[1].split(' ')
        fields['区域'] = location_parts[0]
        fields['小区'] = location_parts[1]

        detail_parts = description[0].split(',')
        fields['户型'] = detail_parts[0].partition(':')[-1]
        fields['面积'] = detail_parts[1].partition(' ')[0]
        fields['类型'] = detail_parts[2].partition(':')[-1]
        # The fourth part carries both values: text before '(' and text after it.
        before_paren, _, after_paren = detail_parts[3].partition('(')
        fields['楼层'] = before_paren[3:]
        fields['总层'] = after_paren[1:-2]

        fields.pop('描述1')

    if fields.get('单价'):
        leading_digits = re.findall(r'^\d+', fields['单价'])
        fields['单价'] = leading_digits[0]

    if fields.get('更新时间'):
        after_colon = fields['更新时间'].partition(':')[-1]
        fields['更新时间'] = after_colon.strip()[:10]

    return item
def process_item(self, item, spider):
    """Record the item in ``self.scholar_rank`` under its field and rank.

    The item is converted to a dict, its ``field`` and ``rank`` entries are
    removed, and the remaining dict is stored at
    ``self.scholar_rank[field][rank]``.

    Args:
        item: The scraped item; the item itself is not modified.
        spider: Unused.

    Returns:
        The same ``item`` object.
    """
    record = ItemAdapter(item).asdict()
    # Look up the per-field mapping before popping 'rank', matching the
    # evaluation order of a single nested-subscript assignment.
    ranks_for_field = self.scholar_rank[record.pop('field')]
    ranks_for_field[record.pop('rank')] = record
    return item