def parse_node(self, response, node):
    """Parse an arXiv XML exported file into a HEP record.

    :param response: the scrapy response this node was extracted from.
    :param node: selector over a single arXiv record; namespaces are
        removed in place so the XPaths below stay short.
    :return: the parsed record as a plain ``dict``, validated against
        the ``hep`` JSON schema before being returned.
    """
    node.remove_namespaces()
    record = HEPLoader(item=HEPRecord(), selector=node)

    # Direct XPath-to-field mappings.
    record.add_xpath('title', './/title/text()')
    record.add_xpath('abstract', './/abstract/text()')
    record.add_xpath('preprint_date', './/created/text()')
    record.add_xpath('dois', './/doi//text()')
    record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
    record.add_value('source', 'arXiv')

    authors, collabs = self._get_authors_or_collaboration(node)
    record.add_value('authors', authors)
    record.add_value('collaborations', collabs)

    collections = ['HEP', 'Citeable', 'arXiv']
    comments = '; '.join(node.xpath('.//comments//text()').extract())
    if comments:
        # The free-text comments may carry a page count and a more
        # specific document type (recorded as an extra collection).
        pages, notes, doctype = self._parse_comments_info(comments)
        record.add_value('public_notes', notes)
        if pages:
            record.add_value('page_nr', pages)
        if doctype:
            collections.append(doctype)
    record.add_value('collections', collections)

    record.add_value(
        'report_numbers',
        self._get_arxiv_report_numbers(node)
    )

    plain_categories = ' '.join(
        node.xpath('.//categories//text()').extract()
    ).split()
    categories = self._get_categories_object(plain_categories)
    record.add_value('field_categories', categories)
    record.add_value(
        'arxiv_eprints',
        self._get_arxiv_eprint(node, plain_categories)
    )
    record.add_value(
        'external_system_numbers',
        self._get_ext_systems_number(node)
    )

    # Renamed from `license` so the builtin is not shadowed.
    license_info = get_license(
        license_url=node.xpath('.//license//text()').extract_first()
    )
    record.add_value('license', license_info)

    parsed_record = dict(record.load_item())
    validate_schema(data=parsed_record, schema_name='hep')
    return parsed_record
def parse_node(self, response, node):
    """Parse an arXiv XML exported file into a HEP record."""
    node.remove_namespaces()
    loader = HEPLoader(item=HEPRecord(), selector=node)

    # Plain XPath-to-field mappings, applied in one pass.
    for field, xpath in (
        ('title', './/title/text()'),
        ('abstract', './/abstract/text()'),
        ('preprint_date', './/created/text()'),
        ('dois', './/doi//text()'),
        ('pubinfo_freetext', './/journal-ref//text()'),
    ):
        loader.add_xpath(field, xpath)
    loader.add_value('source', 'arXiv')

    authors, collaborations = self._get_authors_or_collaboration(node)
    loader.add_value('authors', authors)
    loader.add_value('collaborations', collaborations)

    collections = ['HEP', 'Citeable', 'arXiv']
    comments = '; '.join(node.xpath('.//comments//text()').extract())
    if comments:
        # Comments can reveal a page count and a more specific
        # document type, which becomes an extra collection entry.
        page_count, notes, doc_type = self._parse_comments_info(comments)
        loader.add_value('public_notes', notes)
        if page_count:
            loader.add_value('page_nr', page_count)
        if doc_type:
            collections.append(doc_type)
    loader.add_value('collections', collections)

    loader.add_value('report_numbers', self._get_arxiv_report_numbers(node))

    category_terms = ' '.join(
        node.xpath('.//categories//text()').extract()
    ).split()
    loader.add_value(
        'field_categories', self._get_categories_object(category_terms)
    )
    loader.add_value(
        'arxiv_eprints', self._get_arxiv_eprint(node, category_terms)
    )
    loader.add_value(
        'external_system_numbers', self._get_ext_systems_number(node)
    )

    license_obj = get_license(
        license_url=node.xpath('.//license//text()').extract_first()
    )
    loader.add_value('license', license_obj)

    parsed_record = dict(loader.load_item())
    validate_schema(data=parsed_record, schema_name='hep')
    return parsed_record
def process_item(self, item, spider):
    """Convert internal format to INSPIRE data model."""
    self.count += 1

    if 'related_article_doi' in item:
        item['dois'] += item.pop('related_article_doi', [])

    source = item.pop('source', spider.name)
    item['acquisition_source'] = {
        'source': source,
        # NOTE: Keeps method same as source to conform with INSPIRE
        # submissions which add `submissions` to this field.
        'method': source,
        'date': datetime.datetime.now().isoformat(),
        'submission_number': os.environ.get('SCRAPY_JOB', ''),
    }

    item['titles'] = [
        {
            'title': item.pop('title', ''),
            'subtitle': item.pop('subtitle', ''),
            'source': source,
        }
    ]
    item['abstracts'] = [
        {
            'value': item.pop('abstract', ''),
            'source': source,
        }
    ]
    item['imprints'] = [{'date': item.pop('date_published', '')}]
    item['copyright'] = [
        {
            'holder': item.pop('copyright_holder', ''),
            'year': item.pop('copyright_year', ''),
            'statement': item.pop('copyright_statement', ''),
            'material': item.pop('copyright_material', ''),
        }
    ]

    # Build publication_info from the flat journal_* fields, but only
    # when the spider did not already provide a structured one.
    if not item.get('publication_info') and has_publication_info(item):
        item['publication_info'] = [
            {
                'journal_title': item.pop('journal_title', ''),
                'journal_volume': item.pop('journal_volume', ''),
                'year': int(item.pop('journal_year', 0)) or '',
                'journal_issue': item.pop('journal_issue', ''),
                'artid': item.pop('journal_artid', ''),
                'page_start': item.pop('journal_fpage', ''),
                'page_end': item.pop('journal_lpage', ''),
                'note': item.pop('journal_doctype', ''),
                'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
            }
        ]

    # Drop any flat journal fields that are still lying around.
    filter_fields(
        item,
        [
            'journal_title',
            'journal_volume',
            'journal_year',
            'journal_issue',
            'journal_fpage',
            'journal_lpage',
            'journal_doctype',
            'journal_artid',
            'pubinfo_freetext',
        ],
    )

    validate_schema(dict(item), 'hep')
    return item
def parse_node(self, response, node):
    """Parse a WSP XML file into a HEP record.

    :param response: the scrapy response this node was extracted from.
    :param node: selector over one JATS-style article; namespaces are
        removed in place so the XPaths below stay short.
    :return: the parsed record dict, validated against the ``hep``
        schema, or ``None`` for article types we do not ingest.
    """
    node.remove_namespaces()
    article_types = node.xpath('@article-type').extract()
    self.log("Got article_type {0}".format(article_types))
    # FIX: extract() returns a list, never None, so the original
    # `is None` guard was dead while an empty result raised IndexError.
    if not article_types or article_types[0] not in self.allowed_article_types:
        # Filter out non-interesting article types
        return None
    article_type = article_types[0]

    record = HEPLoader(item=HEPRecord(), selector=node, response=response)
    # FIX: the original compared the extracted *list* against the
    # strings, so this branch could never trigger; compare the value.
    if article_type in ('correction', 'addendum'):
        record.add_xpath(
            'related_article_doi',
            "//related-article[@ext-link-type='doi']/@href",
        )
        record.add_value('journal_doctype', article_type)

    record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
    record.add_xpath('page_nr', "//counts/page-count/@count")
    record.add_xpath('abstract', '//abstract[1]')
    record.add_xpath('title', '//article-title/text()')
    record.add_xpath('subtitle', '//subtitle/text()')
    record.add_value('authors', self._get_authors(node))
    record.add_xpath('collaborations', "//contrib/collab/text()")

    free_keywords, classification_numbers = self._get_keywords(node)
    record.add_value('free_keywords', free_keywords)
    record.add_value('classification_numbers', classification_numbers)

    # TODO: Special journal title handling
    # journal, volume = fix_journal_name(journal, self.journal_mappings)
    # volume += get_value_in_tag(self.document, 'volume')
    journal_title = '//abbrev-journal-title/text()|//journal-title/text()'
    record.add_xpath('journal_title', journal_title)
    record.add_xpath('journal_issue', '//issue/text()')
    record.add_xpath('journal_volume', '//volume/text()')
    record.add_xpath('journal_artid', '//elocation-id/text()')
    record.add_xpath('journal_fpage', '//fpage/text()')
    record.add_xpath('journal_lpage', '//lpage/text()')

    # FIX: compute the published date once and add it once; the
    # original called the helper twice and loaded the value twice.
    published_date = self._get_published_date(node)
    record.add_value('journal_year', int(published_date[:4]))
    record.add_value('date_published', published_date)

    record.add_xpath('copyright_holder', '//copyright-holder/text()')
    record.add_xpath('copyright_year', '//copyright-year/text()')
    record.add_xpath('copyright_statement', '//copyright-statement/text()')
    record.add_value('copyright_material', 'Article')

    license_info = get_license(
        license_url=node.xpath(
            '//license/license-p/ext-link/@href').extract_first(),
        license_text=node.xpath(
            '//license/license-p/ext-link/text()').extract_first(),
    )
    record.add_value('license', license_info)

    # NOTE(review): `_get_collections` now receives the article-type
    # string rather than the raw list — confirm its membership tests
    # expect a scalar.
    record.add_value(
        'collections',
        self._get_collections(node, article_type, journal_title),
    )

    parsed_record = dict(record.load_item())
    validate_schema(data=parsed_record, schema_name='hep')
    return parsed_record
def parse_node(self, response, node):
    """Parse a WSP XML file into a HEP record.

    Returns ``None`` when the article type is not one we harvest;
    otherwise returns the record as a dict validated against the
    ``hep`` JSON schema.
    """
    node.remove_namespaces()
    extracted_types = node.xpath('@article-type').extract()
    self.log("Got article_type {0}".format(extracted_types))
    # FIX: `.extract()` yields a list and never None; guard on
    # emptiness instead of None so an absent attribute cannot raise
    # IndexError on the subscript below.
    if not extracted_types:
        return None
    article_type = extracted_types[0]
    if article_type not in self.allowed_article_types:
        # Filter out non-interesting article types
        return None

    record = HEPLoader(item=HEPRecord(), selector=node, response=response)
    # FIX: the original tested the whole list against string values,
    # which is always False; test the scalar article type instead.
    if article_type in ('correction', 'addendum'):
        record.add_xpath(
            'related_article_doi',
            "//related-article[@ext-link-type='doi']/@href",
        )
        record.add_value('journal_doctype', article_type)

    record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
    record.add_xpath('page_nr', "//counts/page-count/@count")
    record.add_xpath('abstract', '//abstract[1]')
    record.add_xpath('title', '//article-title/text()')
    record.add_xpath('subtitle', '//subtitle/text()')
    record.add_value('authors', self._get_authors(node))
    record.add_xpath('collaborations', "//contrib/collab/text()")

    free_keywords, classification_numbers = self._get_keywords(node)
    record.add_value('free_keywords', free_keywords)
    record.add_value('classification_numbers', classification_numbers)

    # TODO: Special journal title handling
    # journal, volume = fix_journal_name(journal, self.journal_mappings)
    # volume += get_value_in_tag(self.document, 'volume')
    journal_title = '//abbrev-journal-title/text()|//journal-title/text()'
    record.add_xpath('journal_title', journal_title)
    record.add_xpath('journal_issue', '//issue/text()')
    record.add_xpath('journal_volume', '//volume/text()')
    record.add_xpath('journal_artid', '//elocation-id/text()')
    record.add_xpath('journal_fpage', '//fpage/text()')
    record.add_xpath('journal_lpage', '//lpage/text()')

    # FIX: resolve the published date a single time; the original both
    # called the helper twice and added `date_published` twice.
    published_date = self._get_published_date(node)
    record.add_value('journal_year', int(published_date[:4]))
    record.add_value('date_published', published_date)

    record.add_xpath('copyright_holder', '//copyright-holder/text()')
    record.add_xpath('copyright_year', '//copyright-year/text()')
    record.add_xpath('copyright_statement', '//copyright-statement/text()')
    record.add_value('copyright_material', 'Article')

    license_info = get_license(
        license_url=node.xpath(
            '//license/license-p/ext-link/@href').extract_first(),
        license_text=node.xpath(
            '//license/license-p/ext-link/text()').extract_first(),
    )
    record.add_value('license', license_info)

    # NOTE(review): passing the article-type string (not the raw list)
    # to `_get_collections` — verify that helper expects a scalar.
    record.add_value(
        'collections',
        self._get_collections(node, article_type, journal_title),
    )

    parsed_record = dict(record.load_item())
    validate_schema(data=parsed_record, schema_name='hep')
    return parsed_record