Пример #1
0
    def process_item(self, item, spider):
        """Export ``item`` to its own XML file and hand it back.

        The target path is
        ``<output_dir>/<category>/<language>/<topic>.xml``: ``output_dir``
        is read from ``conf`` and the three remaining parts come from the
        item, each passed through ``self._get_valid_dirname``. Missing
        directories are created, and an existing file at that path is
        overwritten.

        Args:
            item: Mapping-like item providing ``category``, ``language``
                and ``topic``.
            spider: Not used by this method.

        Returns:
            The same ``item``, unchanged.

        Raises:
            DropItem: If ``category``, ``language`` or ``topic`` is missing
                or empty.
        """
        category = item.get('category')
        language = item.get('language')
        topic = item.get('topic')

        # All three fields are required to build the path. The parentheses
        # matter: ``not a and b and c`` would only negate the first operand.
        if not (category and language and topic):
            raise DropItem("No VocabularyListItem")

        base_path = conf['output_dir']

        file_path = '%s/%s/%s/%s.xml' % (base_path,
                                         self._get_valid_dirname(category),
                                         self._get_valid_dirname(language),
                                         self._get_valid_dirname(topic))

        target_dir = os.path.dirname(file_path)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # The ``with`` block closes the file; no explicit close() is needed.
        with open(file_path, 'w+b') as f:
            # The exporter is still stored on the instance, as before, in
            # case other code reads ``self.exporter``.
            self.exporter = XmlVocabularyListItemExporter(
                f,
                item_element=conf['xml_item_element'],
                root_element=conf['xml_root_element'])
            self.exporter.start_exporting()
            self.exporter.export_item(item)
            self.exporter.finish_exporting()

        return item
Пример #2
0
class XmlExportPipeline(object):
    """
    Exports a Scrapy item to its own XML file using the
    XmlVocabularyListItemExporter.
    """

    def process_item(self, item, spider):
        """Export ``item`` to its own XML file and hand it back.

        The target path is
        ``<output_dir>/<category>/<language>/<topic>.xml``: ``output_dir``
        is read from ``conf`` and the three remaining parts come from the
        item, each passed through ``_get_valid_dirname``. Missing
        directories are created, and an existing file at that path is
        overwritten.

        Args:
            item: Mapping-like item providing ``category``, ``language``
                and ``topic``.
            spider: Not used by this method.

        Returns:
            The same ``item``, unchanged.

        Raises:
            DropItem: If ``category``, ``language`` or ``topic`` is missing
                or empty.
        """
        category = item.get('category')
        language = item.get('language')
        topic = item.get('topic')

        # All three fields are required to build the path. The parentheses
        # matter: ``not a and b and c`` would only negate the first operand.
        if not (category and language and topic):
            raise DropItem("No VocabularyListItem")

        base_path = conf['output_dir']

        file_path = '%s/%s/%s/%s.xml' % (base_path,
                                         self._get_valid_dirname(category),
                                         self._get_valid_dirname(language),
                                         self._get_valid_dirname(topic))

        target_dir = os.path.dirname(file_path)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # The ``with`` block closes the file; no explicit close() is needed.
        with open(file_path, 'w+b') as f:
            # The exporter is still stored on the instance, as before, in
            # case other code reads ``self.exporter``.
            self.exporter = XmlVocabularyListItemExporter(
                f,
                item_element=conf['xml_item_element'],
                root_element=conf['xml_root_element'])
            self.exporter.start_exporting()
            self.exporter.export_item(item)
            self.exporter.finish_exporting()

        return item

    def _get_valid_dirname(self, name):
        """Return ``name`` with every ``/`` replaced by ``-``.

        This keeps a single value from spanning several path components.
        No other characters are altered.
        """
        return name.replace("/", "-")