def process_item_helper(self, item, collection_name):
    """Upsert a post into the given Mongo collection.

    The volatile 'last_reply_date' field is stripped before matching so a
    previously stored post whose reply date has since changed is still
    recognized as a duplicate.  New posts are inserted whole; known posts
    only get their 'last_reply_date' refreshed.
    """
    post = ItemAdapter(item).asdict()
    # Remove the volatile key (default '' when absent) before using the
    # remaining fields as the match filter.
    last_reply_date = post.pop('last_reply_date', '')
    collection = self.db[collection_name]
    if collection.count_documents(post) == 0:
        # Never seen before: store the full original item.
        logging.debug("Insert a new item")
        collection.insert_one(ItemAdapter(item).asdict())
    elif last_reply_date != '':
        # Already stored: only refresh the reply date.
        logging.debug("Updating an item with last_reply_date %s", last_reply_date)
        collection.update_one(
            post, {"$set": {"last_reply_date": last_reply_date}})
def process_item(self, item, spider):
    """Route each scraped item to the output file or the error file.

    Valid entries also have their word inserted (kept sorted) into
    ``self.entries``.  Every item is serialized as pretty-printed JSON
    followed by a trailing comma.

    Fix: the serialization expression was duplicated verbatim in both
    branches; it is now computed once.
    """
    item = ItemAdapter(item)
    # Serialize once — only the destination differs per branch.
    line = json.dumps(item.asdict(), indent=4, sort_keys=True) + ','
    if self.is_valid_entry(item):
        # Keep the in-memory word list sorted as entries arrive.
        bisect.insort(self.entries, item['word'])
        self.file.write(line)
    else:
        self.error_file.write(line)
    return item
def process_item(self, item, spider):
    """Write items with a missing word or definition to the error file.

    Fix: the 'definitions is None' branch omitted the trailing comma the
    first branch writes, leaving the error file inconsistently formatted.

    NOTE(review): items that pass both checks are currently not persisted —
    the database write below is commented out; confirm this is intentional.
    """
    item = ItemAdapter(item)
    if item['word'] is None:
        # Word could not be extracted: record the raw item for inspection.
        line = json.dumps(item.asdict(), indent=4, sort_keys=True) + ','
        self.error_file.write(line)
    elif item['definitions'] is None:
        # Definition could not be extracted: same treatment.
        line = json.dumps(item.asdict(), indent=4, sort_keys=True) + ','
        self.error_file.write(line)
    # else:  # write to database
    #     write_to_db(self, item.asdict())
    return item
def close_spider(self, spider):
    """Dump all collected articles to ``self.FILENAME`` as CSV.

    Fix: the original indexed ``res[0]`` unconditionally, raising
    IndexError (after truncating the file) when nothing was scraped.
    """
    # NOTE(review): calling asdict unbound suggests self.articles already
    # holds ItemAdapter-compatible objects — confirm against the spider.
    rows = [ItemAdapter.asdict(item) for item in self.articles]
    if not rows:
        return  # nothing scraped; don't create an empty, headerless CSV
    with open(self.FILENAME, 'w', encoding='utf-8', newline='') as f:
        # Field names come from the first row; all rows are assumed to
        # share the same keys — TODO confirm.
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
def process_item(self, item, spider):
    """Pass through items that carry a product price; drop the rest."""
    adapter = ItemAdapter(item)
    print('In Process Item', adapter.asdict())
    # Guard clause: anything without a (truthy) price is rejected.
    if not adapter.get('product_price'):
        raise DropItem("Missing price in %s" % item)
    return item
def process_item(self, item, spider):
    """Append each item to ``self.file``.

    Page-marker items (carrying 'page_num') become a short notice line;
    everything else is written as one JSON line with non-ASCII text kept
    readable (not escaped).
    """
    # ItemAdapter gives a uniform interface regardless of the item's type.
    adapter = ItemAdapter(item)
    if adapter.get('page_num'):
        # Page marker: note which page the crawl is on.
        line = "当前页面是第%d页" % item['page_num'] + "\n"
    else:
        # Regular item: JSON-encode without escaping non-ASCII characters.
        line = json.dumps(adapter.asdict(), ensure_ascii=False) + "\n"
    self.file.write(line)
    return item
def process_item(self, item, spider):
    """Drop items that are mostly empty or whose country is not Kenya."""
    values = ItemAdapter(item).asdict()
    # Count falsy field values (None, '', empty lists, 0, ...).
    blank_count = sum(1 for v in values.values() if not v)
    if blank_count > 12 or values.get("country") != "Kenya":
        print("\n\n***************")
        print("Dropped")
        print("******************\n\n")
        raise DropItem("Too many empty values found.")
    return item
def process_item(self, item, spider):
    """Normalize an item's fields and append it to the CSV.

    Cleans brand/name text, parses the price (the field looks like
    "<currency> <amount>" with comma thousands separators — the numeric
    second token is kept), and strips query parameters from the image URL.
    Items missing any essential field are dropped.

    Fix: the original built ``DropItem(...)`` but never raised it, so
    broken items fell through and were silently written to the CSV.
    """
    adapter = ItemAdapter(item)
    try:
        adapter["brand"] = self.clean_text(adapter.get("brand"))
        adapter["name"] = self.clean_text(adapter.get("name"))
        # Drop the currency prefix, remove thousands commas, truncate to int.
        adapter["price"] = int(
            float(adapter.get("price").replace(",", "").split()[1]))
        adapter["image"] = adapter.get("image").split("?")[0]
    except Exception:
        # Any missing/unparsable field means the item is unusable.
        raise DropItem(f"Missing essential properties in {item}")
    self.csv.writerow(adapter.asdict())
    return item
def close_spider(self, spider):
    """Flush all buffered items to the CSV file, then close it.

    Fix: ``self.items is not []`` was an identity comparison against a
    freshly-created list and therefore always true; the intended emptiness
    check is plain truthiness.
    """
    if self.items:
        for item in self.items:
            adapter = ItemAdapter(item)
            # A writer is built per item because field names are taken
            # from each item — NOTE(review): if all items share a schema
            # this could be hoisted out of the loop; confirm.
            writer = csv.DictWriter(self.file,
                                    fieldnames=adapter.field_names(),
                                    restval='',
                                    extrasaction='ignore',
                                    delimiter=',',
                                    quoting=csv.QUOTE_NONNUMERIC,
                                    quotechar="\"")
            # Emit the header only once, at the very top of the file.
            if self.file.tell() == 0:
                writer.writeheader()
            writer.writerow(adapter.asdict())
    self.file.close()
def process_item(self, item, spider):
    """Serialize TopicItems to individual JSON files; pass others through."""
    # Not a topic item — leave it for other pipelines untouched.
    if not isinstance(item, BaseTopicItem):
        return item
    adapter = ItemAdapter(item)
    topic_id = adapter.get('topic_id')
    self.logger.debug(f'exporting TopicItem (id: {topic_id})')
    # Build the destination path from the configured templates.
    target = prepare_path(base_dir=self.base_dir_path,
                          dirname_template=self.dirname_tmplt,
                          filename_template=self.filename_tmplt,
                          item=item)
    with target.open('w') as fh:
        json.dump(adapter.asdict(), fh)
    self.logger.debug(f'exported TopicItem (id: {topic_id})')
    return item
def process_item(self, item, spider):
    """Normalize name and price, then store the item in MongoDB.

    Prices arrive in "1.234,56" style (dot thousands, comma decimal) and
    are converted to a float; an optional 'cents' field is added on top.
    Items lacking a name or price are dropped.
    """
    adapter = ItemAdapter(item)
    # Guard clause: both fields are required.
    if 'name' not in adapter.keys() or 'price' not in adapter.keys():
        raise DropItem(f"{item} without name or price")
    # Strip accents; fields arrive as single-element lists.
    adapter['name'] = [unidecode(adapter['name'][0])]
    adapter['price'] = [unidecode(adapter['price'][0])]
    # "1.234,56" -> "1234.56": drop thousands dots, comma becomes decimal.
    normalized = adapter['price'][0].replace('.', '').replace(',', '.')
    amount = float(normalized)
    if 'cents' in adapter.keys():
        amount += float(adapter['cents'][0]) / 100
    adapter['price'] = amount
    self.col.insert_one(adapter.asdict())
    print("Exporter:", adapter["name"], "| ", adapter["price"], "|", adapter['store'])
    return item
def process_item(self, item, spider):
    """Append the item to ``self.file`` as one JSON line (non-ASCII kept)."""
    record = ItemAdapter(item).asdict()
    self.file.write(json.dumps(record, ensure_ascii=False) + "\n")
    return item
def process_item(self, item, spider):
    """Write the item as an ASCII-escaped JSON line and log to stdout."""
    serialized = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=True)
    self.file.write(serialized + '\n')
    print('Item Scraped!')
    return item
def process_item(self, item, spider):
    """Stamp the item with a creation time and persist it to MongoDB.

    Fix: the original returned a status *string* instead of the item.
    Scrapy feeds a pipeline's return value to the next pipeline, so every
    downstream component received a ``str`` rather than the item.  The
    status message is now logged and the item is returned.
    """
    adapter = ItemAdapter(item)
    # Naive local-time timestamp — NOTE(review): consider UTC; confirm
    # what downstream consumers expect.
    adapter['created_at'] = datetime.now().timestamp()
    self.items.append(adapter.asdict())
    # insert_one mutates its argument (adds '_id'), so pass a fresh dict
    # rather than the one kept in self.items.
    self.db[f'kw-{spider.keyword}'].insert_one(adapter.asdict())
    spider.logger.info(f"成功抓取关键词 [ {spider.keyword} ] 下的产品 {item['pid']} ")
    return item
def process_item(self, item, spider):
    """Upsert the item into the Mongo collection, keyed by its URL."""
    adapter = ItemAdapter(item)
    query = {'url': adapter.get('url')}
    update = {'$set': adapter.asdict()}
    self.db[self.collection_name].update_one(query, update, upsert=True)
    return item