def build_conference_proceedings_item(
    self,
    proceedings_page_html,
    pos_id,
):
    """Build a 'proceedings' HEP record from the proceedings HTML page.

    Args:
        proceedings_page_html: raw HTML of the conference proceedings page.
        pos_id: PoS external identifier used to derive the journal volume.

    Returns:
        ParsedItem: the loaded record in the 'hepcrawl' record format.
    """
    html_selector = Selector(
        text=proceedings_page_html,
        type='html',
    )
    html_selector.remove_namespaces()

    loader = HEPLoader(item=HEPRecord(), selector=html_selector)

    loader.add_value('collections', ['proceedings'])
    loader.add_value(
        'title',
        self._get_proceedings_title(selector=html_selector),
    )
    loader.add_value(
        'subtitle',
        self._get_proceedings_date_place(selector=html_selector),
    )
    loader.add_value('journal_title', 'PoS')
    loader.add_value(
        'journal_volume',
        self._get_journal_volume(pos_ext_identifier=pos_id),
    )

    return ParsedItem(
        record=loader.load_item(),
        record_format='hepcrawl',
    )
def get_conference_proceedings_page_request(self, meta):
    """Build a Request for the conference proceedings page.

    Uses the internal conference id taken from the record pages
    retrieved earlier (carried in ``meta``).

    Raises:
        PoSExtractionException: if the conference paper HTML page in
            ``meta`` is missing or empty.
    """
    html_record = meta.get('html_record')
    if not html_record:
        raise PoSExtractionException(
            'PoS conference paper page was empty, current meta:\n%s' % meta)

    proceedings_page_url = self._get_proceedings_page_url(
        page_html=html_record,
    )

    xml_selector = Selector(
        text=meta.get('xml_record'),
        type='xml',
    )
    xml_selector.remove_namespaces()
    # Stash the PoS identifier so the proceedings callback can use it.
    meta['pos_id'] = xml_selector.xpath(
        ".//metadata/pex-dc/identifier/text()").extract_first()

    return Request(
        url=proceedings_page_url,
        meta=meta,
        callback=self.parse_conference_proceedings,
    )
def build_item(self, response):
    """Parse an PoS XML exported file into a HEP record."""
    node = Selector(text=response.meta["record"], type="xml")
    node.remove_namespaces()

    record = HEPLoader(item=HEPRecord(), selector=node)
    record.add_xpath('title', '//metadata/pex-dc/title/text()')
    record.add_xpath('field_categories', '//metadata/pex-dc/subject/text()')
    record.add_xpath('source', '//metadata/pex-dc/publisher/text()')
    record.add_value(
        'external_system_numbers', self._get_ext_systems_number(node))

    # License data is optional: only add whichever fields were found.
    pub_license, pub_license_url, openaccess = self._get_license(node)
    if pub_license:
        record.add_value('license', pub_license)
        record.add_value('license_url', pub_license_url)
        if openaccess:
            record.add_value('license_type', "open-access")

    date, year = self._get_date(node)
    if date:
        record.add_value('date_published', date)
    if year:
        record.add_value('journal_year', year)

    record.add_value('urls', response.meta['pos_url'])
    if response.meta['pos_pdf_url']:
        record.add_value('additional_files', {
            'type': "Fulltext",
            "url": response.meta['pos_pdf_url'],
        })

    identifier = node.xpath(
        ".//metadata/pex-dc/identifier/text()").extract_first()
    if identifier:
        # Identifiers look like "PoS(CONF)123": journal(volume)article id.
        parts = re.split('[()]', identifier)
        if len(parts) == 3:
            record.add_value('journal_title', parts[0])
            record.add_value('journal_volume', parts[1])
            record.add_value('journal_artid', parts[2])
        else:
            record.add_value('pubinfo_freetext', identifier)

    language = node.xpath(
        ".//metadata/pex-dc/language/text()").extract_first()
    if language:
        record.add_value('language', language)

    authors = self._get_authors(node)
    if authors:
        record.add_value('authors', authors)

    extra_data = self._get_extra_data(node)
    if extra_data:
        record.add_value('extra_data', extra_data)

    record.add_value('collections', ['HEP', 'ConferencePaper'])
    return record.load_item()
def parse(self, response):
    """Follow every store-details URL found in the sitemap response."""
    sitemap = Selector(response)
    sitemap.remove_namespaces()
    for loc in sitemap.xpath('//loc/text()').extract():
        url = loc.strip()
        # Only store pages are of interest; skip all other sitemap entries.
        if 'store-details' in url:
            yield scrapy.Request(url, callback=self.parse_store)
def build_item(self, response):
    """Parse an PoS XML exported file into a HEP record."""
    node = Selector(text=response.meta["record"], type="xml")
    node.remove_namespaces()

    record = HEPLoader(item=HEPRecord(), selector=node)
    record.add_xpath('title', '//metadata/pex-dc/title/text()')
    record.add_xpath('field_categories', '//metadata/pex-dc/subject/text()')
    record.add_xpath('source', '//metadata/pex-dc/publisher/text()')
    record.add_value(
        'external_system_numbers', self._get_ext_systems_number(node))

    # 'record_license' avoids shadowing the builtin 'license'.
    record_license = get_license(
        license_text=node.xpath(
            ".//metadata/pex-dc/rights/text()"
        ).extract_first(),
    )
    record.add_value('license', record_license)

    date, year = self._get_date(node)
    if date:
        record.add_value('date_published', date)
    if year:
        record.add_value('journal_year', int(year))

    record.add_value('urls', response.meta['pos_url'])
    if response.meta['pos_pdf_url']:
        record.add_value('additional_files', {
            'type': "Fulltext",
            "url": response.meta['pos_pdf_url'],
        })

    identifier = node.xpath(
        ".//metadata/pex-dc/identifier/text()").extract_first()
    if identifier:
        # Identifiers look like "PoS(CONF)123": journal(volume)article id.
        parts = re.split('[()]', identifier)
        if len(parts) == 3:
            record.add_value('journal_title', parts[0])
            record.add_value('journal_volume', parts[1])
            record.add_value('journal_artid', parts[2])
        else:
            record.add_value('pubinfo_freetext', identifier)

    language = node.xpath(
        ".//metadata/pex-dc/language/text()").extract_first()
    if language:
        record.add_value('language', language)

    authors = self._get_authors(node)
    if authors:
        record.add_value('authors', authors)

    extra_data = self._get_extra_data(node)
    if extra_data:
        record.add_value('extra_data', extra_data)

    record.add_value('collections', ['HEP', 'ConferencePaper'])
    return record.load_item()
def build_conference_paper_item(
    self,
    xml_record,
    conference_paper_url,
    conference_paper_pdf_url,
):
    """Build a 'conferencepaper' HEP record from an exported XML record.

    Args:
        xml_record: the exported PoS XML record text.
        conference_paper_url: URL of the conference paper page.
        conference_paper_pdf_url: URL/path of the paper's fulltext PDF.

    Returns:
        ParsedItem: the loaded record in the 'hepcrawl' record format.
    """
    sel = Selector(text=xml_record, type="xml")
    sel.remove_namespaces()

    loader = HEPLoader(item=HEPRecord(), selector=sel)

    license_text = sel.xpath(
        './/metadata/pex-dc/rights/text()').extract_first()
    loader.add_value('license', get_licenses(license_text=license_text))

    date, year = self._get_date(selector=sel)
    loader.add_value('date_published', date)
    loader.add_value('journal_year', year)

    # The PoS identifier encodes journal title, volume and article id.
    identifier = sel.xpath(
        ".//metadata/pex-dc/identifier/text()").extract_first()
    loader.add_value(
        'journal_title',
        self._get_journal_title(pos_ext_identifier=identifier),
    )
    loader.add_value(
        'journal_volume',
        self._get_journal_volume(pos_ext_identifier=identifier),
    )
    loader.add_value(
        'journal_artid',
        self._get_journal_artid(pos_ext_identifier=identifier),
    )

    loader.add_xpath('title', '//metadata/pex-dc/title/text()')
    loader.add_xpath('source', '//metadata/pex-dc/publisher/text()')
    loader.add_value(
        'external_system_numbers',
        self._get_ext_systems_number(selector=sel),
    )
    loader.add_value('language', self._get_language(selector=sel))
    loader.add_value('authors', self._get_authors(selector=sel))
    loader.add_value('collections', ['conferencepaper'])
    loader.add_value('urls', [conference_paper_url])
    loader.add_value(
        'documents',
        self.get_documents(path=conference_paper_pdf_url),
    )

    return ParsedItem(
        record=loader.load_item(),
        record_format='hepcrawl',
    )
def build_item(self, response):
    """Parse an PoS XML exported file into a HEP record."""
    node = Selector(text=response.meta["record"], type="xml")
    node.remove_namespaces()

    record = HEPLoader(item=HEPRecord(), selector=node)
    record.add_xpath("title", "//metadata/pex-dc/title/text()")
    record.add_xpath("source", "//metadata/pex-dc/publisher/text()")
    record.add_value(
        "external_system_numbers", self._get_ext_systems_number(node))

    rights_text = node.xpath(
        ".//metadata/pex-dc/rights/text()").extract_first()
    record.add_value("license", get_license(license_text=rights_text))

    date, year = self._get_date(node)
    if date:
        record.add_value("date_published", date)
    if year:
        record.add_value("journal_year", int(year))

    record.add_value("urls", response.meta["pos_url"])
    if response.meta["pos_pdf_url"]:
        record.add_value("additional_files", {
            "type": "Fulltext",
            "url": response.meta["pos_pdf_url"],
        })

    identifier = node.xpath(
        ".//metadata/pex-dc/identifier/text()").extract_first()
    if identifier:
        # Identifiers look like "PoS(CONF)123": journal(volume)article id.
        parts = re.split("[()]", identifier)
        if len(parts) == 3:
            record.add_value("journal_title", parts[0])
            record.add_value("journal_volume", parts[1])
            record.add_value("journal_artid", parts[2])
        else:
            record.add_value("pubinfo_freetext", identifier)

    language = node.xpath(
        ".//metadata/pex-dc/language/text()").extract_first()
    if language:
        record.add_value("language", language)

    authors = self._get_authors(node)
    if authors:
        record.add_value("authors", authors)

    extra_data = self._get_extra_data(node)
    if extra_data:
        record.add_value("extra_data", extra_data)

    record.add_value("collections", ["HEP", "ConferencePaper"])
    return record.load_item()
def _parse_schedule(self, response):
    """Yield Meeting items parsed from a schedule ``.docx`` response.

    The document body is pulled out of the docx (zip) archive, parsed as
    markup, and each table row is scanned for month/day date strings.
    """
    # Extract word/document.xml from the docx container; None means the
    # part was not found at all (an empty document part is still parsed,
    # matching the original truthiness behavior).
    doc_xml = None
    with ZipFile(BytesIO(response.body)) as zf:
        for zip_info in zf.infolist():
            if zip_info.filename == "word/document.xml":
                with zf.open(zip_info) as docx_file:
                    doc_xml = docx_file.read().decode()
    if doc_xml is None:
        return

    # The schedule year appears in the URL; take the last 4-digit run.
    year_str = re.findall(r"\d{4}", response.url)[-1]

    # Remove MS Word namespaces on tags to use selectors
    sel = Selector(text=doc_xml)
    sel.remove_namespaces()

    for row in sel.css("tr"):
        row_text = " ".join(row.css("*::text").extract())
        row_str = re.sub(r"\s+", " ", row_text).strip()
        # e.g. "January 5" / "Sep 12" — month name followed by day number.
        for idx, date_str in enumerate(
                re.findall(r"[a-zA-Z]{3,9} \d{1,2}", row_str)):
            title = self._parse_title(idx)
            start = self._parse_start(date_str, year_str)
            meeting = Meeting(
                title=title,
                description="",
                classification=self._parse_classification(title),
                start=start,
                end=None,
                all_day=False,
                time_notes="See source to confirm details",
                location=self.location,
                links=self.link_date_map[(title, start.date())],
                source=self.start_urls[0],
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting
def _parse_docx(self, attachment):
    """Return items parsed from the tables of a ``.docx`` attachment.

    Args:
        attachment: raw bytes of the docx file.

    Returns:
        list of parsed items, or None if the document part is missing.
    """
    # Extract word/document.xml from the docx container; None means the
    # part was not found at all (an empty document part is still parsed,
    # matching the original truthiness behavior).
    doc_xml = None
    with ZipFile(BytesIO(attachment)) as zf:
        for zip_info in zf.infolist():
            if zip_info.filename == "word/document.xml":
                with zf.open(zip_info) as docx_file:
                    doc_xml = docx_file.read().decode()
    if doc_xml is None:
        return

    # Remove MS Word namespaces on tags to use selectors
    sel = Selector(text=doc_xml)
    sel.remove_namespaces()

    def _first_cell_text(fragments):
        # Join the non-blank text fragments of a first-cell selection.
        return "".join(p.strip() for p in fragments if p.strip())

    # Year lives in the first cell of the first row of the first table.
    year_str = _first_cell_text(
        sel.css("tbl > tr")[:1].css("tc:first-of-type")[:1]
        .css("*::text").extract()
    )

    items = []
    for table in sel.css("tbl"):
        # Month lives in the first cell of each table's second row.
        month_str = _first_cell_text(
            table.css("tr")[1:2].css("tc:first-of-type")[:1]
            .css("*::text").extract()
        ).title()
        for cell in table.css("tc > p"):
            collapsed = re.sub(
                r"\s+", " ", " ".join(cell.css("*::text").extract())
            ).strip()
            # Strip spaces around dashes and any "@" markers.
            cell_str = re.sub(
                r"((?<=[\-–]) | (?=[\-–])|@)", "", collapsed).strip()
            # Skip blanks/fragments, bare year headers, and cells that do
            # not start with a day number.
            if len(cell_str) <= 2:
                continue
            if cell_str.startswith("201") or not cell_str[0].isdigit():
                continue
            items.append(self._parse_item(cell_str, month_str, year_str))
    return items