示例#1
0
    def to_hep(self, source):
        """Convert this item's record into a hep formatted record.

        The acquisition source is stamped on ``self.record`` first (this
        happens even when the format later turns out to be unknown), then
        the record is converted according to ``self.record_format``.

        Args:
            source(str): string identifying the source for this item
                (ex. 'arXiv').

        Returns:
            The hep formatted record produced by ``hep_to_hep`` (for 'hep'
            records) or ``hepcrawl_to_hep`` (for 'hepcrawl' records).

        Raises:
            UnknownItemFormat: if ``self.record_format`` is neither 'hep'
                nor 'hepcrawl'.
        """
        # The builder is only used to produce a well-formed
        # 'acquisition_source' entry, which is then copied onto the record.
        acquisition_builder = LiteratureBuilder(source=source)
        acquisition_builder.add_acquisition_source(
            source=source,
            method='hepcrawl',
            date=datetime.datetime.now().isoformat(),
            submission_number=os.environ.get('SCRAPY_JOB', ''),
        )
        acquisition_source = acquisition_builder.record['acquisition_source']
        self.record['acquisition_source'] = acquisition_source

        if self.record_format == 'hepcrawl':
            normalized = _normalize_hepcrawl_record(
                item=self.record,
                source=source,
            )
            return hepcrawl_to_hep(dict(normalized))

        if self.record_format != 'hep':
            raise UnknownItemFormat('Unknown ParsedItem::{}'.format(
                self.record_format))

        hep_record = hep_to_hep(
            hep_record=self.record,
            record_files=self.record_files,
        )
        for document in hep_record.get('documents', []):
            # Rename 'old_url' to 'original_url', but never overwrite an
            # 'original_url' that is already present.
            if 'old_url' not in document or 'original_url' in document:
                continue
            document['original_url'] = document.pop('old_url')
        return hep_record
示例#2
0
def test_no_document_type(
        input_no_document_type_record,
        expected_no_document_type_record
):
    """Check ``hepcrawl_to_hep`` on the no-document-type input.

    The converted ``input_no_document_type_record`` must compare equal to
    ``expected_no_document_type_record``.
    """
    converted = hepcrawl_to_hep(input_no_document_type_record)
    assert converted == expected_no_document_type_record
示例#3
0
def test_no_document_type(input_no_document_type_record,
                          expected_no_document_type_record):
    """Check ``hepcrawl_to_hep`` on the no-document-type input.

    The result of converting ``input_no_document_type_record`` is compared
    for equality against ``expected_no_document_type_record``.
    """
    assert (
        hepcrawl_to_hep(input_no_document_type_record)
        == expected_no_document_type_record
    )
示例#4
0
def test_generic_crawler_record(
        input_generic_crawler_record,
        expected_generic_crawler_record
):
    """Check ``hepcrawl_to_hep`` on the generic crawler input.

    The converted ``input_generic_crawler_record`` must compare equal to
    ``expected_generic_crawler_record``.
    """
    converted = hepcrawl_to_hep(input_generic_crawler_record)
    assert converted == expected_generic_crawler_record
示例#5
0
def test_generic_crawler_record(input_generic_crawler_record,
                                expected_generic_crawler_record):
    """Check ``hepcrawl_to_hep`` on the generic crawler input.

    The result of converting ``input_generic_crawler_record`` is compared
    for equality against ``expected_generic_crawler_record``.
    """
    assert (
        hepcrawl_to_hep(input_generic_crawler_record)
        == expected_generic_crawler_record
    )