Exemplo n.º 1
0
 def process_item(self, item, spider):
     """Validate that the ``name`` and ``gclass`` fields are strings.

     Args:
         item: The scraped item; it must contain ``name`` and ``gclass``.
         spider: The spider that produced the item (unused).

     Returns:
         The item, unchanged, when both fields are ``str`` instances.

     Raises:
         DropItem: If either field is ``None`` or not a ``str``.
     """
     adapter = ItemAdapter(item)
     # isinstance() is already False for None, so no separate None test is
     # needed.
     if not isinstance(adapter['name'], str):
         # DropItem has to be *raised*: a returned exception instance is
         # treated as the processed item and nothing is dropped.
         raise DropItem("name field invalid...")
     if not isinstance(adapter['gclass'], str):
         raise DropItem("gclass field invalid")
     return item
Exemplo n.º 2
0
    def test_exporter_custom_serializer(self):
        """An exporter subclass can override ``serialize_field`` per field name.

        The custom exporter increments ``age`` and defers every other field
        to the base implementation; it is checked both against an adapted
        item and against a plain dict with empty field metadata.
        """
        class CustomItemExporter(BaseItemExporter):
            def serialize_field(self, field, name, value):
                if name != 'age':
                    return super().serialize_field(field, name, value)
                return str(int(value) + 1)

        expectations = (('name', 'John'), ('age', '23'))

        adapter = ItemAdapter(self.item_class(name='John', age='22'))
        exporter = CustomItemExporter()

        for field_name, expected in expectations:
            serialized = exporter.serialize_field(
                adapter.get_field_meta(field_name), field_name,
                adapter[field_name])
            self.assertEqual(serialized, expected)

        plain = {'name': 'John', 'age': '22'}
        for field_name, expected in expectations:
            serialized = exporter.serialize_field(
                {}, field_name, plain[field_name])
            self.assertEqual(serialized, expected)
Exemplo n.º 3
0
 def test_get_output_value_list(self):
     """Reading an output value must leave the value on the item intact."""
     loader = ItemLoader(item=self.item_class(name=['foo', 'bar']))
     output_value = loader.get_output_value('name')
     self.assertEqual(output_value, ['foo', 'bar'])

     result = loader.load_item()
     self.assertIsInstance(result, self.item_class)
     as_dict = ItemAdapter(result).asdict()
     self.assertEqual(as_dict, {'name': ['foo', 'bar']})
Exemplo n.º 4
0
    def process_item(self, item, spider):
        """Insert the item into the collection named by ``domain_collection``.

        Every pipeline class that persists items must define a method with
        exactly this name; it holds the actual storage logic.

        Returns the item unchanged.
        """
        target = self.db[item['domain_collection']]
        document = ItemAdapter(item).asdict()
        target.insert_one(document)
        return item
Exemplo n.º 5
0
 def test_add_value_singlevalue_singlevalue(self):
     """A value added after construction is appended to the initial one."""
     loader = ItemLoader(item=self.item_class(name='foo'))
     loader.add_value('name', 'bar')

     result = loader.load_item()
     self.assertIsInstance(result, self.item_class)
     expected = {'name': ['foo', 'bar']}
     self.assertEqual(ItemAdapter(result).asdict(), expected)
Exemplo n.º 6
0
    def close_spider(self, spider):
        """Write every buffered item to ``self.file`` as CSV, then close it.

        A ``csv.DictWriter`` is built per item from that item's own field
        names. The header row is emitted only while the file position is 0,
        i.e. before anything has been written.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            # Plain truthiness test: the original ``is not []`` compared
            # identity against a brand-new list and was therefore always true.
            if self.items:
                for item in self.items:
                    adapter = ItemAdapter(item)
                    writer = csv.DictWriter(self.file,
                                            fieldnames=adapter.field_names(),
                                            restval='',
                                            extrasaction='ignore',
                                            delimiter=',',
                                            quoting=csv.QUOTE_NONNUMERIC,
                                            quotechar="\"")

                    if self.file.tell() == 0:
                        writer.writeheader()
                    writer.writerow(adapter.asdict())
        finally:
            # Close the file even when writing one of the rows fails.
            self.file.close()
Exemplo n.º 7
0
 def process_item(self, item, spider):
     """Append the item's name, m_type and m_time as one tab-separated line.

     The line is appended to ``./scrapy_result.csv`` (UTF-8) and the item is
     returned unchanged.
     """
     adapter = ItemAdapter(item)
     columns = (adapter['name'], adapter['m_type'], adapter['m_time'])
     row = '\t'.join(f'{value}' for value in columns) + '\n'
     with open('./scrapy_result.csv', 'a+', encoding="utf-8") as sink:
         sink.write(row)
     return item
Exemplo n.º 8
0
    def process_item(self, item, spider):
        """Dump each ``<table>`` found in the item's ``table_html`` to a TSV file.

        Files are named ``table-<n>.tsv`` (numbered from 1) and written under
        ``FILES_STORE/<out_dir>``. Each ``<tr>`` becomes one line made of the
        text of its ``td span`` nodes joined by tabs. Items without
        ``table_html`` pass through untouched.
        """
        adapter = ItemAdapter(item)
        selector = adapter.get('table_html')
        if selector is None:
            return item

        for index, table in enumerate(selector.css("table"), start=1):
            path = os.path.join(spider.settings['FILES_STORE'],
                                adapter['out_dir'], 'table-%i.tsv' % index)
            with open(path, "w") as out:
                for row in table.css('tr'):
                    cells = row.css('td span::text').getall()
                    out.write("\t".join(cells) + "\n")
            spider.logger.debug('Table Path: %s' % path)
        return item
    def process_item(self, item, spider):
        """Drop items whose ``bbb_url`` is already in ``self.seen_urls``.

        Args:
            item: The scraped item; it must contain ``bbb_url``.
            spider: The running spider; its logger reports duplicates.

        Returns:
            The item, unchanged, when its URL has not been seen.

        Raises:
            DropItem: If the item's ``bbb_url`` is in ``self.seen_urls``.
        """
        adapter = ItemAdapter(item)
        url = adapter["bbb_url"]

        if url in self.seen_urls:
            # Log only for real duplicates, and raise the exception: merely
            # instantiating DropItem (as before) drops nothing.
            spider.logger.info(f"duplicate found {url}")
            raise DropItem(f"duplicate found {url}")

        # NOTE(review): this method never records `url` in self.seen_urls;
        # confirm that the collection is populated elsewhere.
        return item
Exemplo n.º 10
0
    def process_item(self, item, spider):
        """Append the item's values as one row to ``Requests.csv``.

        ``start_date`` and ``end_date`` are defaulted to ``''`` on the item
        itself (the item is mutated) before the row is written.

        Args:
            item: The scraped item; it must support ``setdefault``.
            spider: The spider that produced the item (unused).

        Returns:
            The item, with the two date fields guaranteed to be present.
        """
        item.setdefault('start_date', '')
        item.setdefault('end_date', '')
        # The context manager closes the file even if writing the row raises.
        with open('Requests.csv', 'a', encoding='utf-8',
                  newline='') as csv_file:
            # The handle is still exposed as self.file_1, as it was before.
            self.file_1 = csv_file
            writer = csv.writer(csv_file, delimiter=',')
            writer.writerow(ItemAdapter(item).values())

        return item
Exemplo n.º 11
0
    def process_item(self, item, spider):
        """Upsert the item into the collection named after the spider.

        The item's full dict is used both as the filter and as the ``$set``
        payload, so a document is inserted only when no identical match
        exists. Returns the item unchanged.
        """
        document = ItemAdapter(item).asdict()
        collection = self.db[spider.name]
        collection.update_one(filter=document,
                              update={"$set": document},
                              upsert=True)
        return item
Exemplo n.º 12
0
    def print_items(self, lvl=None, colour=True):
        """Pretty-print the collected items as dicts.

        With ``lvl`` left as ``None`` the items of every level in
        ``self.items`` are printed; otherwise only those stored under
        ``lvl`` (nothing if that level is absent). ``colour`` is forwarded
        to ``display.pprint`` as ``colorize``.
        """
        if lvl is not None:
            selected = self.items.get(lvl, [])
        else:
            selected = []
            for bucket in self.items.values():
                selected.extend(bucket)

        print("# Scraped Items ", "-" * 60)
        as_dicts = [ItemAdapter(entry).asdict() for entry in selected]
        display.pprint(as_dicts, colorize=colour)
Exemplo n.º 13
0
 def _exporter_for_item(self, item):
     """Return the JSON exporter for the item's ``company``, creating it lazily.

     On first sight of a company, ``<company>.json`` is opened for binary
     writing under ``self.path``, an exporter is started on it and cached in
     ``self.company_to_exporter``.
     """
     company = ItemAdapter(item)['company']
     if company in self.company_to_exporter:
         return self.company_to_exporter[company]

     target = os.path.join(self.path, f'{company}.json')
     exporter = JsonItemExporter(open(target, 'wb'), indent=4)
     exporter.start_exporting()
     self.company_to_exporter[company] = exporter
     return self.company_to_exporter[company]
Exemplo n.º 14
0
 def post_process(self, output):
     """Fail the contract if any item in ``output`` lacks a field in ``self.args``.

     Entries for which ``is_item`` is false are ignored.

     Raises:
         ContractFail: Listing the missing field names, comma-separated.
     """
     for entry in output:
         if not is_item(entry):
             continue
         adapter = ItemAdapter(entry)
         absent = [name for name in self.args if name not in adapter]
         if absent:
             raise ContractFail(f"Missing fields: {', '.join(absent)}")
Exemplo n.º 15
0
    def _get_serialized_fields(self,
                               item,
                               default_value=None,
                               include_empty=None,
                               pre=None,
                               field_filter=None):
        """Yield ``(field_name, serialized_value)`` pairs for ``item``.

        Adapted from BaseItemExporter, with two extras: when both
        ``field_filter`` and ``pre`` are given, a field whose
        ``pre_join(pre, field_name)`` key is in ``field_filter`` is skipped,
        and that key plus the filter are forwarded to ``serialize_field``.
        Fields missing from the item yield ``default_value``.
        """
        adapter = ItemAdapter(item)

        export_empty = (self.export_empty_fields
                        if include_empty is None else include_empty)

        if self.fields_to_export is None:
            names = adapter.field_names() if export_empty else adapter.keys()
        elif export_empty:
            names = self.fields_to_export
        else:
            names = (n for n in self.fields_to_export if n in adapter)

        for field_name in names:
            key = None
            if field_filter and pre is not None:
                key = pre_join(pre, field_name)
                if key in field_filter:
                    continue

            if field_name not in adapter:
                yield field_name, default_value
                continue

            meta = adapter.get_field_meta(field_name)
            serialized = self.serialize_field(
                meta,
                field_name,
                adapter[field_name],
                pre=key,
                field_filter=field_filter,
            )
            yield field_name, serialized
Exemplo n.º 16
0
 def test_nested_item(self):
     """Items nested inside item fields are exported as nested dicts."""
     innermost = self.item_class(name='Joseph\xa3', age='22')
     middle = self.item_class(name='Maria', age=innermost)
     outer = self.item_class(name='Jesus', age=middle)

     exporter = self.ie
     exporter.start_exporting()
     exporter.export_item(outer)
     exporter.finish_exporting()

     raw = self.output.getvalue()
     exported = json.loads(to_unicode(raw))
     expected = [{
         'name': 'Jesus',
         'age': {'name': 'Maria', 'age': ItemAdapter(innermost).asdict()},
     }]
     self.assertEqual(exported, expected)
Exemplo n.º 17
0
    def process_item(self, item, spider):
        """Drop the item if a document with the same ``id`` is already stored.

        Args:
            item: The scraped item; it must contain ``id``.
            spider: The spider that produced the item (unused).

        Returns:
            The item, unchanged, when no stored document has its ``id``.

        Raises:
            DropItem: If ``self.db[self.collection_name]`` already holds a
                document with the same ``id``.
        """
        adapter = ItemAdapter(item)
        # The previous version also built an ``item_dict`` of id/src that was
        # never used; it has been removed. This method only checks for
        # duplicates and does not insert anything itself.
        if self.db[self.collection_name].find_one({"id": adapter["id"]}):
            raise DropItem(f"Duplicate item found: {item!r}")
        return item
Exemplo n.º 18
0
 def process_item(self, item, spider):
     """Drop items whose URL, minus its last path segment, was already seen.

     Raises:
         DropItem: If the truncated URL is already in ``self.store_urls``.
     """
     adapter = ItemAdapter(item)
     # Remove the trailing '/' if present (str.strip also trims leading ones).
     trimmed = adapter['url'].strip('/')
     # Keep everything before the last '/'.
     base_url = trimmed[:trimmed.rfind('/')]
     if base_url not in self.store_urls:
         self.store_urls.add(base_url)
         return item
     raise DropItem("Duplicate url found: %r" % item)
Exemplo n.º 19
0
 def parse_item(self, response):
     """Build an item from ``response`` using the name and price regexes.

     ``url`` is always set; ``name`` and ``price`` are set only when the
     corresponding pattern matches the response text (first capture group).
     """
     adapter = ItemAdapter(self.item_cls())

     name_match = self.name_re.search(response.text)
     if name_match:
         adapter['name'] = name_match.group(1)

     adapter['url'] = response.url

     price_match = self.price_re.search(response.text)
     if price_match:
         adapter['price'] = price_match.group(1)

     return adapter.item
Exemplo n.º 20
0
 def test_header_export_two_items(self):
     """Exporting the same item twice writes the header row only once.

     Checked for both the item object and its plain-dict form.
     """
     expected = b'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n'
     variants = [self.i, ItemAdapter(self.i).asdict()]
     for variant in variants:
         buffer = BytesIO()
         exporter = CsvItemExporter(buffer)
         exporter.start_exporting()
         for _ in range(2):
             exporter.export_item(variant)
         exporter.finish_exporting()
         self.assertCsvEqual(buffer.getvalue(), expected)
 def process_item(self, item, spider):
     """Insert the item, or update the stored document with the same ``_id``.

     Items whose ``product_title`` equals ``[]`` are passed through without
     touching the database.

     Args:
         item: The scraped item; it must contain ``product_title`` and ``_id``.
         spider: The spider that produced the item (unused).

     Returns:
         The item, unchanged.
     """
     if item['product_title'] == []:
         return item

     collection = self.db[self.collection_name]
     selector = {"_id": item['_id']}
     document = ItemAdapter(item).asdict()
     # Cursor.count() was removed in PyMongo 4; count_documents() is the
     # supported replacement, and limit=1 stops at the first match.
     if collection.count_documents(selector, limit=1) == 0:
         collection.insert_one(document)
     else:
         collection.update_one(selector, {"$set": document})
     return item
Exemplo n.º 22
0
    def process_item(self, item, spider):
        """Write ``BaseTopicItem`` instances to a JSON file; pass others through.

        The destination path comes from ``prepare_path`` using the
        pipeline's base directory and its dirname/filename templates.
        Returns the item unchanged in every case.
        """
        # Only BaseTopicItem instances are exported.
        if isinstance(item, BaseTopicItem):
            adapter = ItemAdapter(item)
            topic_id = adapter.get('topic_id')
            self.logger.debug(f'exporting TopicItem (id: {topic_id})')

            destination = prepare_path(base_dir=self.base_dir_path,
                                       dirname_template=self.dirname_tmplt,
                                       filename_template=self.filename_tmplt,
                                       item=item)
            with destination.open('w') as handle:
                json.dump(adapter.asdict(), handle)

            self.logger.debug(f'exported TopicItem (id: {topic_id})')

        return item
def insert_into_comment(conn, item):
    """Insert or replace one row of the ``comment`` table from ``item``.

    The seven column values are read from the item in column order and
    passed, together with the parameterized statement, to
    ``execute_sql_param``.
    """
    adapter = ItemAdapter(item)
    statement = """INSERT OR REPLACE INTO comment
							(comment_id, comment_text, comment_date, comment_author_id, comment_author_username,article_url,article_title) 
							VALUES (?, ?, ?, ?, ?, ?, ?);"""
    columns = ("comment_id", "comment_text", "comment_date",
               "comment_author_id", "comment_author_username",
               "article_url", "article_title")
    values = tuple(adapter[column] for column in columns)
    execute_sql_param(conn, statement, values)
Exemplo n.º 24
0
 def update_body(self, item):
     """Append a summary record for ``item`` to ``self.body``.

     The record holds the image URL, the domain and path returned by
     ``self.purify_url`` for the item's URL, the description (as ``des``)
     and the title.
     """
     fields = ItemAdapter(item).asdict()
     domain, path = self.purify_url(fields['url'])
     record = dict(
         image=fields['image_url'],
         domain=domain,
         path=path,
         des=fields['description'],
         title=fields['title'],
     )
     self.body += [record]
Exemplo n.º 25
0
 def test_false(self):
     """``is_item`` and ``is_item_class`` reject non-items.

     Covers builtins, plain values and containers, and item *classes*
     (as opposed to instances) for ``is_item``; builtin container and
     scalar classes for ``ItemAdapter.is_item_class``.
     """
     non_items = (
         int,
         sum,
         1234,
         object(),
         "a string",
         b"some bytes",
         ["a", "list"],
         ("a", "tuple"),
         {"a", "set"},
         dict,
         ScrapyItem,
         DataClassItem,
         ScrapySubclassedItem,
         AttrsItem,
         PydanticModel,
     )
     for candidate in non_items:
         self.assertFalse(is_item(candidate))

     for non_item_class in (list, int, tuple):
         self.assertFalse(ItemAdapter.is_item_class(non_item_class))
Exemplo n.º 26
0
 def test_add_value_list_singlevalue(self):
     """A value added after construction is appended to the initial list."""
     loader = ItemLoader(item=self.item_class(name=["foo", "bar"]))
     loader.add_value("name", "qwerty")

     result = loader.load_item()
     self.assertIsInstance(result, self.item_class)
     expected = {"name": ["foo", "bar", "qwerty"]}
     self.assertEqual(ItemAdapter(result).asdict(), expected)
Exemplo n.º 27
0
    def process_item(self, item, spider):
        """Persist the item as a ``WeiXinCnpcNews`` row when it has content.

        Items with empty or missing ``content`` are not stored. Storage
        errors are logged and the session is rolled back; they are not
        propagated, so the item always continues down the pipeline.

        Args:
            item: The scraped item.
            spider: The running spider; supplies ``session`` and ``logger``.

        Returns:
            The item, unchanged.
        """
        adapter = ItemAdapter(item)

        if not adapter.get('content'):
            # The previous code tried to ``raise`` a plain string here, which
            # itself raised TypeError and was swallowed by a bare ``except``.
            # The outcome (item passed through, nothing stored) is kept, but
            # is now explicit and logged.
            spider.logger.warning("Missing content in %r", item)
            return item

        new_item = WeiXinCnpcNews(
            title=adapter.get('title'),
            author=adapter.get('author'),
            pre_title=adapter.get('pre_title'),
            preview_img_link=adapter.get('preview_img_link'),
            pub_time=adapter.get('pub_time'),
            content=adapter.get('content'),
            crawl_time=adapter.get('crawl_time'),
            url=adapter.get('url'),
            categories=adapter.get('categories'),
            images_url=str(adapter.get('image_urls')),
            images=str(adapter.get('images')),
        )

        try:
            spider.session.add(new_item)
            spider.session.commit()
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            spider.session.rollback()
            spider.logger.exception("Failed to store item %r", item)
        return item
Exemplo n.º 28
0
 def process_item(self, item, spider):
     """Normalise the item's ``type``, predicting it when it is unknown.

     The raw ``type`` is passed through ``self.get_refine_type``. If that
     yields ``"UNKNOW"``, ``self.classifier`` predicts the type from the
     concatenated title and description.

     Args:
         item: The scraped item.
         spider: The spider that produced the item (unused).

     Returns:
         The underlying item with its ``type`` field updated.

     Raises:
         DropItem: If ``title`` or ``description`` is missing or empty.
     """
     adapter = ItemAdapter(item=item)
     if not (adapter.get('title') and adapter.get('description')):
         # DropItem has to be *raised*: a returned exception instance is
         # treated as the processed item and nothing is dropped.
         raise DropItem(
             f"Missing field in paper at {adapter.get('source')}")

     # Named paper_type so the builtin ``type`` is not shadowed.
     paper_type = self.get_refine_type(str(adapter.get("type")))
     if paper_type == "UNKNOW":
         text = [
             str(adapter.get('title')) + str(adapter.get('description'))
         ]
         paper_type = self.classifier.predict(text)[0]
     adapter.update({'type': paper_type})
     return adapter.item
Exemplo n.º 29
0
    def _get_serialized_fields(self,
                               item,
                               default_value=None,
                               include_empty=None):
        """Yield ``(output_name, serialized_value)`` pairs for ``item``.

        ``self.fields_to_export`` may be ``None`` (all fields), a mapping of
        item field name to output name, or a plain iterable of field names.
        Unless empty fields are included, fields absent from the item are
        skipped; otherwise they yield ``default_value``.
        """
        adapter = ItemAdapter(item)

        export_empty = (self.export_empty_fields
                        if include_empty is None else include_empty)

        if self.fields_to_export is None:
            entries = adapter.field_names() if export_empty else adapter.keys()
        elif isinstance(self.fields_to_export, Mapping):
            if export_empty:
                entries = self.fields_to_export.items()
            else:
                entries = ((src, dst)
                           for src, dst in self.fields_to_export.items()
                           if src in adapter)
        elif export_empty:
            entries = self.fields_to_export
        else:
            entries = (name for name in self.fields_to_export
                       if name in adapter)

        for entry in entries:
            # A bare string names both the item field and the output field;
            # anything else is unpacked as an (item_field, output_field) pair.
            if isinstance(entry, str):
                source_name = target_name = entry
            else:
                source_name, target_name = entry

            if source_name in adapter:
                meta = adapter.get_field_meta(source_name)
                serialized = self.serialize_field(meta, target_name,
                                                  adapter[source_name])
            else:
                serialized = default_value

            yield target_name, serialized
Exemplo n.º 30
0
    async def process_item(self, item, spider):
        """Store the item in the database according to its ``doctype``.

        ``course`` items are upserted into ``self.db.Courses`` keyed on
        ``doctype``/``id`` and returned immediately. For any other doctype
        the owning user is looked up in ``self.db.Users`` by
        ``item["user_id"]``; when that user has a truthy ``notifications``
        value the item is upserted into a collection whose name is derived
        from the doctype, with ``self.process_grades(user)`` awaited before
        and after the write.

        Returns the item unchanged.
        """
        print(item)
        if item["doctype"] == "course":
            self.db.Courses.find_one_and_update(
                {
                    "doctype": "course",
                    "id": item["id"]
                },
                {"$set": ItemAdapter(item).asdict()},
                upsert=True,
            )
            return item

        # NOTE(review): if self.db is a PyMongo database, find_one returns
        # None when nothing matches and user.get(...) below would raise
        # AttributeError — confirm a user always exists.
        user = self.db.Users.find_one({
            "doctype": "user",
            "_id": item["user_id"]
        })

        if not user.get("notifications"):
            # NOTE(review): only a filter is passed here; PyMongo's
            # find_one_and_update also requires an update document, so this
            # call looks incomplete — confirm the intended update.
            self.db.Assignments.find_one_and_update({"doctype": "assignment"})
        else:

            # Grades before the write. The original note describes the result
            # as an int; `old` is not used again in this method.
            old = await self.process_grades(
                user)  # getting the current grades as an int

            # Upsert the item. The collection name is the doctype with its
            # first letter upper-cased plus "s", e.g. "assignment" ->
            # "Assignments".
            self.db[item["doctype"][0].upper() + item["doctype"][1:] +
                    "s"].find_one_and_update(
                        {
                            "doctype": item["doctype"],
                            "id": item["id"]
                        },
                        {"$set": ItemAdapter(item).asdict()},
                        upsert=True,
                    )

            # Grades after the write; `current` is not used again in this
            # method either.
            current = await self.process_grades(user)  # updated grade count

            # Placeholder: iterates the user's notification entries without
            # doing anything yet.
            for system in user["notifications"]:
                pass

        return item