def process_item(self, item, spider):
    """Write scraped data onto template ``Investment`` rows.

    Two item layouts are handled:

    * The item carries ``inv_id``: the matching template investment is
      fetched and ``item['data']`` is stored on the attribute named by
      ``item['type']``.
    * Otherwise the item is treated as a mapping of investment name to
      value for every template investment of ``item['platformname']``.
      Names are matched case-insensitively.

    Args:
        item: The scraped item; must be wrappable by ``ItemAdapter``.
        spider: The spider that produced the item (unused).

    Returns:
        The unmodified ``item`` so later pipelines can keep processing it.

    Raises:
        KeyError: If a required key (``type``, ``data``, ``platformname``)
            is missing, or if a platform investment has no entry in the
            item.
    """
    adapter = ItemAdapter(item)

    if 'inv_id' in adapter:
        # Single-investment item: save the data directly on that instance.
        inv = get_object_or_404(Investment, id=adapter['inv_id'], template=True)
        setattr(inv, adapter['type'], adapter['data'])
        inv.save()
        return item

    # Normalise the item keys with casefold() -- the same normalisation that
    # is applied to the investment name below.  Mixing lower() on one side
    # with casefold() on the other makes names such as "Straße" fail to
    # match themselves ("straße" vs "strasse").
    data_by_name = {key.casefold(): value for key, value in adapter.items()}

    # All investments are in the same item: walk every template investment
    # of the platform and pick its value out of the item by name.
    platform = adapter['platformname']
    investments = Investment.objects.filter(name__platform=platform, template=True)
    for inv in investments:
        setattr(inv, adapter['type'], data_by_name[str(inv.name.name).casefold()])
        inv.save()
    return item
def save_items(self, items):
    """Insert each item into its ``sro.<ItemClassName>`` table, or update it.

    For every item an ``INSERT`` is attempted first.  If that raises, the
    row whose ``url`` equals the item's ``url`` is updated instead
    (presumably the insert failed on a duplicate key -- the driver-specific
    error type is not visible here, so any ``Exception`` triggers the
    fallback).

    All values are passed to the driver as query parameters (``%s``
    placeholders) instead of being quoted into the SQL text, so values
    containing quotes are stored correctly and cannot inject SQL.  Table
    and column names cannot be parameterized; they come from the item
    class name and the item's field names.

    Args:
        items: Iterable of items wrappable by ``ItemAdapter``.

    Raises:
        KeyError: If the insert fails and the item has no ``url`` field.
    """
    for item in items:
        table_name = item.__class__.__name__
        item_dict = ItemAdapter(item).asdict()
        item_dict = self.__check_size(item_dict)
        item_dict = self.__clean_dict(item_dict)

        columns = ', '.join(item_dict)
        placeholders = ', '.join(['%s'] * len(item_dict))
        sql = "INSERT INTO sro.{} ({}) VALUES ({})".format(
            table_name, columns, placeholders)
        try:
            self._cursor.execute(sql, list(item_dict.values()))
        except Exception:
            # Insert failed: fall back to updating the existing row, which
            # is identified by its url.
            url = item_dict.pop('url')
            set_str = ", ".join("{}=%s".format(k) for k in item_dict)
            sql = "UPDATE sro.{} SET {} WHERE url = %s".format(
                table_name, set_str)
            self._cursor.execute(sql, list(item_dict.values()) + [url])
        # Print whichever statement was executed successfully.
        print(sql)

    # Commit once for the whole batch.
    # NOTE(review): confirm a per-batch (rather than per-item) commit is
    # what is intended here.
    self._connection.commit()
def process_item(self, item, spider):
    """Insert the item into MongoDB, or update the document it matches.

    The target collection and the field used to find an existing document
    depend on the spider name: ``novels`` items are matched on ``title``,
    ``comments`` items on ``comment_id``.  When a matching document exists,
    only the item's non-``None`` fields are written to it.

    Items from any other spider are passed through untouched (a warning is
    logged) instead of failing on an unbound local variable.

    Args:
        item: The scraped item; must be wrappable by ``ItemAdapter``.
        spider: The spider that produced the item; only ``name`` is read.

    Returns:
        The unmodified ``item``.

    Raises:
        KeyError: If the item lacks the field used for matching.
    """
    import logging

    # spider name -> (collection name, field identifying a document)
    targets = {
        'novels': ('novels', 'title'),
        'comments': ('comments', 'comment_id'),
    }
    target = targets.get(spider.name)
    if target is None:
        logging.getLogger(__name__).warning(
            "No MongoDB collection configured for spider %r; item not stored",
            spider.name,
        )
        return item

    collection_name, search_by = target
    collection = self.db[collection_name]
    my_item = ItemAdapter(item).asdict()

    # Look for an existing document with the same identifying field.
    my_doc = collection.find_one({search_by: my_item[search_by]})
    if not my_doc:
        # No matching document: insert a new one.
        collection.insert_one(my_item)
    else:
        # Update the existing document with the non-None values only.
        # update_one replaces Collection.update, which no longer exists in
        # PyMongo 4.  The stored _id is used as-is so that documents whose
        # _id is not an ObjectId are still matched.
        changes = {k: v for k, v in my_item.items() if v is not None}
        if changes:
            collection.update_one({'_id': my_doc['_id']}, {'$set': changes})
    return item
def process_item(self, item, spider):
    """Clean an item: tidy ``invname`` and turn the other values into fractions.

    Steps, in order:

    1. In ``invname``, replace every literal backslash-n sequence with a
       space and collapse runs of whitespace into single spaces.
    2. Delete every field that is not in ``excluded_keys`` and whose value
       is falsy.
    3. For every remaining field not in ``excluded_keys``: drop a ``%``
       sign and anything after it, convert the value to ``Decimal``
       (anything that cannot be converted, e.g. "Nil", becomes 0) and
       divide it by 100.

    Args:
        item: The scraped item; must be wrappable by ``ItemAdapter``.
        spider: The spider that produced the item (unused).

    Returns:
        The same ``item`` object, modified in place.
    """
    adapter = ItemAdapter(item)

    # `or ""` also covers an invname that is present but None.
    invname = adapter.get('invname') or ""
    if invname:
        # "\\n" is the two-character sequence backslash + n, not a newline;
        # real newlines are handled by the whitespace collapse.
        adapter['invname'] = " ".join(invname.replace("\\n", " ").split())

    # Collect first, delete afterwards: the mapping must not change size
    # while it is being iterated.
    empty_keys = [
        key for key, val in adapter.items()
        if key not in excluded_keys and not val
    ]
    for key in empty_keys:
        del adapter[key]

    for key, val in adapter.items():
        if key in excluded_keys:
            continue
        # Only strings can carry a percent sign; numeric values are
        # converted as they are.
        if isinstance(val, str) and "%" in val:
            val = val.split("%")[0]
        try:
            number = Decimal(val)
        except (ArithmeticError, TypeError, ValueError):
            # Values that are not numbers ("Zero", "Nil", etc.) count as 0.
            number = Decimal(0)
        # Convert the percentage figure to its fractional value.
        adapter[key] = number / 100
    return item
def process_item(self, item, spider):
    """Synchronise an investment's ``AssetAllocation`` rows with the item.

    Item keys are compared case-insensitively with the names of the
    allocations already stored for ``item['inv_id']``:

    * an existing allocation whose name is a key of the item gets its
      ``percentage`` updated;
    * an existing allocation whose name is not in the item is deleted;
    * every remaining item key that is not in ``excluded_keys`` becomes a
      new allocation.

    Each existing allocation is either updated or deleted, never both.
    Saving a model instance after it has been deleted would insert it
    again, so deleted rows are not touched afterwards.

    Args:
        item: The scraped item; must be wrappable by ``ItemAdapter`` and
            contain ``inv_id``.
        spider: The spider that produced the item (unused).

    Returns:
        The unmodified ``item``.
    """
    adapter = ItemAdapter(item)

    # Lowercase the keys so they can be matched to the stored names
    # regardless of case.
    lowercase_adapter = {k.lower(): v for k, v in adapter.items()}

    # Fetch the allocation names afresh (new ones may have been added).
    existing_aa_names = get_list_or_404(AssetAllocationName)

    try:
        existing_aa = get_list_or_404(AssetAllocation, investment=adapter['inv_id'])
    except Exception:
        # get_list_or_404 raises when nothing matches; treat that as "no
        # allocations stored yet".
        existing_aa = []

    matched_keys = set()
    for aa in existing_aa:
        key = str(aa.name).lower()
        if key in lowercase_adapter:
            # Still present in the scraped data: update the percentage.
            aa.percentage = lowercase_adapter[key]
            aa.save()
            matched_keys.add(key)
        else:
            # No longer present in the scraped data: remove the row.
            print(f"LOG: Asset class {key!r} no longer exists in scraped data")
            aa.delete()

    # Whatever was not matched to an existing allocation is new.
    for k, v in lowercase_adapter.items():
        if k in matched_keys or k in excluded_keys:
            continue
        # Find the AssetAllocationName with this name (None if there is none).
        aa_name = next(
            (i for i in existing_aa_names if i.name.lower() == k), None)
        allocation = AssetAllocation(name=aa_name,
                                     percentage=v,
                                     investment_id=adapter['inv_id'])
        allocation.save()
    return item
def strip_encoding(adapter: ItemAdapter) -> dict:
    """Trim string fields and remove line breaks from them, in place.

    Every ``str`` value has its leading and trailing whitespace stripped
    and all remaining ``\\n`` and ``\\r`` characters removed.  Values of
    any other type are left untouched.

    Args:
        adapter: Mutable mapping of field names to values.

    Returns:
        The same ``adapter`` object that was passed in.
    """
    for field in adapter:
        text = adapter[field]
        if not isinstance(text, str):
            continue
        trimmed = text.strip()
        without_newlines = trimmed.replace("\n", "")
        adapter[field] = without_newlines.replace("\r", "")
    return adapter
def process_item(self, item, spider):
    """Run ``strip`` over every field of the item, in place.

    Args:
        item: The scraped item; must be wrappable by ``ItemAdapter``.
        spider: The spider that produced the item (unused).

    Returns:
        The same ``item`` object, with each field value replaced by the
        result of ``strip(value)``.
    """
    fields = ItemAdapter(item)
    for name in fields:
        fields[name] = strip(fields[name])
    return item