예제 #1
0
    def build_conference_proceedings_item(
        self,
        proceedings_page_html,
        pos_id,
    ):
        """Build a ``ParsedItem`` for a PoS conference proceedings page.

        Args:
            proceedings_page_html: raw HTML of the proceedings page.
            pos_id: PoS external identifier, used to derive the journal
                volume.

        Returns:
            ParsedItem: the loaded record in ``hepcrawl`` format.
        """
        html_selector = Selector(
            text=proceedings_page_html,
            type='html',
        )
        html_selector.remove_namespaces()

        loader = HEPLoader(item=HEPRecord(), selector=html_selector)
        loader.add_value('collections', ['proceedings'])
        loader.add_value(
            'title',
            self._get_proceedings_title(selector=html_selector),
        )
        loader.add_value(
            'subtitle',
            self._get_proceedings_date_place(selector=html_selector),
        )
        loader.add_value('journal_title', 'PoS')
        loader.add_value(
            'journal_volume',
            self._get_journal_volume(pos_ext_identifier=pos_id),
        )

        return ParsedItem(
            record=loader.load_item(),
            record_format='hepcrawl',
        )
예제 #2
0
    def get_conference_proceedings_page_request(self, meta):
        """Build a request for the conference proceedings page.

        The proceedings URL comes from the internal conference id found
        in the record html page retrieved before; the PoS identifier is
        extracted from the xml record and stashed in ``meta['pos_id']``.

        Raises:
            PoSExtractionException: if the paper page html is missing.
        """
        html_record = meta.get('html_record')
        if not html_record:
            raise PoSExtractionException(
                'PoS conference paper page was empty, current meta:\n%s' %
                meta)

        proceedings_page_url = self._get_proceedings_page_url(
            page_html=html_record, )

        xml_selector = Selector(
            text=meta.get('xml_record'),
            type='xml',
        )
        xml_selector.remove_namespaces()
        meta['pos_id'] = xml_selector.xpath(
            ".//metadata/pex-dc/identifier/text()").extract_first()

        return Request(
            url=proceedings_page_url,
            meta=meta,
            callback=self.parse_conference_proceedings,
        )
예제 #3
0
    def build_item(self, response):
        """Parse an PoS XML exported file into a HEP record."""
        selector = Selector(text=response.meta["record"], type="xml")
        selector.remove_namespaces()
        loader = HEPLoader(item=HEPRecord(), selector=selector)

        # Fields read straight from the pex-dc metadata.
        loader.add_xpath('title', '//metadata/pex-dc/title/text()')
        loader.add_xpath('field_categories',
                         '//metadata/pex-dc/subject/text()')
        loader.add_xpath('source', '//metadata/pex-dc/publisher/text()')

        loader.add_value('external_system_numbers',
                         self._get_ext_systems_number(selector))

        pub_license, pub_license_url, openaccess = self._get_license(selector)
        if pub_license:
            loader.add_value('license', pub_license)
            loader.add_value('license_url', pub_license_url)
            if openaccess:
                loader.add_value('license_type', "open-access")

        date, year = self._get_date(selector)
        if date:
            loader.add_value('date_published', date)
        if year:
            loader.add_value('journal_year', year)

        identifier = selector.xpath(
            ".//metadata/pex-dc/identifier/text()").extract_first()
        loader.add_value('urls', response.meta['pos_url'])
        if response.meta['pos_pdf_url']:
            loader.add_value('additional_files', {
                'type': "Fulltext",
                "url": response.meta['pos_pdf_url']
            })
        if identifier:
            # A 3-part split means ``<journal>(<volume>)<artid>``;
            # anything else is kept verbatim as free-text pubinfo.
            parts = re.split('[()]', identifier)
            if len(parts) == 3:
                journal_title, conf_acronym, article_id = parts
                loader.add_value('journal_title', journal_title)
                loader.add_value('journal_volume', conf_acronym)
                loader.add_value('journal_artid', article_id)
            else:
                loader.add_value('pubinfo_freetext', identifier)

        language = selector.xpath(
            ".//metadata/pex-dc/language/text()").extract_first()
        if language:
            loader.add_value('language', language)

        authors = self._get_authors(selector)
        if authors:
            loader.add_value('authors', authors)

        extra_data = self._get_extra_data(selector)
        if extra_data:
            loader.add_value('extra_data', extra_data)

        loader.add_value('collections', ['HEP', 'ConferencePaper'])
        return loader.load_item()
예제 #4
0
    def parse(self, response):
        """Follow every store-details URL listed in the sitemap."""
        sitemap = Selector(response)
        sitemap.remove_namespaces()

        for loc in sitemap.xpath('//loc/text()').extract():
            url = loc.strip()
            if 'store-details' in url:
                yield scrapy.Request(url, callback=self.parse_store)
예제 #5
0
    def build_item(self, response):
        """Parse an PoS XML exported file into a HEP record."""
        selector = Selector(text=response.meta["record"], type="xml")
        selector.remove_namespaces()
        loader = HEPLoader(item=HEPRecord(), selector=selector)

        loader.add_xpath('title', '//metadata/pex-dc/title/text()')
        loader.add_xpath('field_categories', '//metadata/pex-dc/subject/text()')
        loader.add_xpath('source', '//metadata/pex-dc/publisher/text()')

        loader.add_value(
            'external_system_numbers',
            self._get_ext_systems_number(selector),
        )

        rights_text = selector.xpath(
            ".//metadata/pex-dc/rights/text()").extract_first()
        loader.add_value('license', get_license(license_text=rights_text))

        date, year = self._get_date(selector)
        if date:
            loader.add_value('date_published', date)
        if year:
            loader.add_value('journal_year', int(year))

        identifier = selector.xpath(
            ".//metadata/pex-dc/identifier/text()").extract_first()
        loader.add_value('urls', response.meta['pos_url'])
        if response.meta['pos_pdf_url']:
            loader.add_value(
                'additional_files',
                {'type': "Fulltext", "url": response.meta['pos_pdf_url']},
            )
        if identifier:
            # A 3-part split means ``<journal>(<volume>)<artid>``;
            # anything else is kept verbatim as free-text pubinfo.
            parts = re.split('[()]', identifier)
            if len(parts) == 3:
                journal_title, conf_acronym, article_id = parts
                loader.add_value('journal_title', journal_title)
                loader.add_value('journal_volume', conf_acronym)
                loader.add_value('journal_artid', article_id)
            else:
                loader.add_value('pubinfo_freetext', identifier)

        language = selector.xpath(
            ".//metadata/pex-dc/language/text()").extract_first()
        if language:
            loader.add_value('language', language)

        authors = self._get_authors(selector)
        if authors:
            loader.add_value('authors', authors)

        extra_data = self._get_extra_data(selector)
        if extra_data:
            loader.add_value('extra_data', extra_data)

        loader.add_value('collections', ['HEP', 'ConferencePaper'])
        return loader.load_item()
예제 #6
0
    def build_conference_paper_item(
        self,
        xml_record,
        conference_paper_url,
        conference_paper_pdf_url,
    ):
        """Build a ``ParsedItem`` for a single PoS conference paper.

        Args:
            xml_record: serialized XML of the paper's metadata record.
            conference_paper_url: URL of the paper's landing page.
            conference_paper_pdf_url: URL of the paper's fulltext PDF.

        Returns:
            ParsedItem: the loaded record in ``hepcrawl`` format.
        """
        xml_selector = Selector(text=xml_record, type="xml")
        xml_selector.remove_namespaces()
        loader = HEPLoader(item=HEPRecord(), selector=xml_selector)

        rights_text = xml_selector.xpath(
            './/metadata/pex-dc/rights/text()').extract_first()
        loader.add_value('license', get_licenses(license_text=rights_text))

        date, year = self._get_date(selector=xml_selector)
        loader.add_value('date_published', date)
        loader.add_value('journal_year', year)

        # The PoS external identifier drives all journal pubinfo fields.
        pos_id = xml_selector.xpath(
            ".//metadata/pex-dc/identifier/text()").extract_first()
        loader.add_value(
            'journal_title',
            self._get_journal_title(pos_ext_identifier=pos_id),
        )
        loader.add_value(
            'journal_volume',
            self._get_journal_volume(pos_ext_identifier=pos_id),
        )
        loader.add_value(
            'journal_artid',
            self._get_journal_artid(pos_ext_identifier=pos_id),
        )

        loader.add_xpath('title', '//metadata/pex-dc/title/text()')
        loader.add_xpath('source', '//metadata/pex-dc/publisher/text()')
        loader.add_value(
            'external_system_numbers',
            self._get_ext_systems_number(selector=xml_selector),
        )
        loader.add_value('language', self._get_language(selector=xml_selector))
        loader.add_value('authors', self._get_authors(selector=xml_selector))
        loader.add_value('collections', ['conferencepaper'])
        loader.add_value('urls', [conference_paper_url])
        loader.add_value(
            'documents',
            self.get_documents(path=conference_paper_pdf_url, ),
        )

        return ParsedItem(
            record=loader.load_item(),
            record_format='hepcrawl',
        )
예제 #7
0
    def build_item(self, response):
        """Parse an PoS XML exported file into a HEP record.

        The serialized XML record, paper URL and fulltext PDF URL are
        expected in ``response.meta`` under the keys ``record``,
        ``pos_url`` and ``pos_pdf_url``.
        """
        text = response.meta["record"]
        node = Selector(text=text, type="xml")
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node)
        record.add_xpath("title", "//metadata/pex-dc/title/text()")
        record.add_xpath("source", "//metadata/pex-dc/publisher/text()")

        record.add_value("external_system_numbers", self._get_ext_systems_number(node))

        # NOTE(review): ``license`` shadows the builtin of the same name;
        # harmless here, but a rename would be cleaner.
        license = get_license(license_text=node.xpath(".//metadata/pex-dc/rights/text()").extract_first())
        record.add_value("license", license)

        date, year = self._get_date(node)
        if date:
            record.add_value("date_published", date)
        if year:
            record.add_value("journal_year", int(year))

        identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first()
        record.add_value("urls", response.meta["pos_url"])
        if response.meta["pos_pdf_url"]:
            record.add_value("additional_files", {"type": "Fulltext", "url": response.meta["pos_pdf_url"]})
        if identifier:
            # A 3-part split on parentheses means the identifier has the
            # shape ``<journal>(<volume>)<artid>``; otherwise keep it
            # verbatim as free-text pubinfo.
            pbn = re.split("[()]", identifier)
            if len(pbn) == 3:
                conf_acronym = pbn[1]
                article_id = pbn[2]
                record.add_value("journal_title", pbn[0])
                record.add_value("journal_volume", conf_acronym)
                record.add_value("journal_artid", article_id)
            else:
                record.add_value("pubinfo_freetext", identifier)

        language = node.xpath(".//metadata/pex-dc/language/text()").extract_first()
        if language:
            record.add_value("language", language)

        authors = self._get_authors(node)
        if authors:
            record.add_value("authors", authors)

        extra_data = self._get_extra_data(node)
        if extra_data:
            record.add_value("extra_data", extra_data)

        record.add_value("collections", ["HEP", "ConferencePaper"])
        return record.load_item()
예제 #8
0
    def _parse_schedule(self, response):
        """Yield a ``Meeting`` for every date found in the schedule .docx.

        The response body is a Word document; only its
        ``word/document.xml`` part is read and parsed with a Selector.
        """
        docx_bytes = BytesIO(response.body)
        docx_str = ""
        with ZipFile(docx_bytes) as zf:
            for zip_info in zf.infolist():
                if zip_info.filename == "word/document.xml":
                    with zf.open(zip_info) as docx_file:
                        docx_str = StringIO(docx_file.read().decode())
        # No document.xml part found — nothing to parse.
        if not docx_str:
            return
        # The last 4-digit run in the URL is taken as the schedule year.
        year_str = re.findall(r"\d{4}", response.url)[-1]
        # Remove MS Word namespaces on tags to use selectors
        sel = Selector(text=docx_str.getvalue())
        sel.remove_namespaces()
        for row in sel.css("tr"):
            # Collapse the row's text into one whitespace-normalized string.
            row_str = re.sub(
                r"\s+", " ", " ".join(row.css("*::text").extract())
            ).strip()
            # Dates appear as "<MonthName> <day>"; a row may hold several.
            date_strs = re.findall(r"[a-zA-Z]{3,9} \d{1,2}", row_str)
            for idx, date_str in enumerate(date_strs):
                # Title depends on the date's position within the row.
                title = self._parse_title(idx)
                start = self._parse_start(date_str, year_str)
                meeting = Meeting(
                    title=title,
                    description="",
                    classification=self._parse_classification(title),
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="See source to confirm details",
                    location=self.location,
                    links=self.link_date_map[(title, start.date())],
                    source=self.start_urls[0],
                )

                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)

                yield meeting
 def _parse_docx(self, attachment):
     """Parse meeting items from the calendar tables of a .docx attachment.

     Returns a list of items built by ``self._parse_item``, or ``None``
     when the attachment has no ``word/document.xml`` part.
     """
     items = []
     docx_bytes = BytesIO(attachment)
     docx_str = ""
     with ZipFile(docx_bytes) as zf:
         for zip_info in zf.infolist():
             if zip_info.filename == "word/document.xml":
                 with zf.open(zip_info) as docx_file:
                     docx_str = StringIO(docx_file.read().decode())
     # No document.xml part found — nothing to parse.
     if not docx_str:
         return
     # Remove MS Word namespaces on tags to use selectors
     sel = Selector(text=docx_str.getvalue())
     sel.remove_namespaces()
     # Year text is read from the first cell of the first table row.
     year_str = "".join([
         p.strip() for p in sel.css("tbl > tr")[:1].css("tc:first-of-type")
         [:1].css("*::text").extract() if p.strip()
     ])
     for table in sel.css("tbl"):
         # Month name comes from the first cell of the table's second row.
         month_str = "".join([
             p.strip() for p in table.css("tr")[1:2].css("tc:first-of-type")
             [:1].css("*::text").extract() if p.strip()
         ]).title()
         for cell in table.css("tc > p"):
             # Normalize whitespace, then strip spaces adjacent to
             # hyphens/en-dashes and remove "@" characters.
             cell_str = re.sub(
                 r"((?<=[\-–]) | (?=[\-–])|@)",
                 "",
                 re.sub(r"\s+", " ",
                        " ".join(cell.css("*::text").extract())).strip(),
             ).strip()
             # Skip near-empty cells, year headers (strings starting with
             # "201"), and cells that don't start with a day number.
             if (len(cell_str) <= 2
                     or (len(cell_str) > 2 and cell_str.startswith("201"))
                     or not cell_str[0].isdigit()):
                 continue
             items.append(self._parse_item(cell_str, month_str, year_str))
     return items