def _to_unicode(self, text):
    """Convert *text* to unicode and expand brace-delimited escapes.

    The value is first passed through
    ``unicode_helpers.to_unicode_or_bust``.  If the result contains a
    ``{``, every backslash is removed and each key of
    ``self.bibtex_to_unicode`` found in the text is replaced by its value.

    NOTE(review): ``self.bibtex_to_unicode`` presumably maps BibTeX/LaTeX
    escape sequences (with backslashes already removed) to unicode
    characters -- confirm against where the mapping is defined.

    Args:
        text: The string to convert.

    Returns:
        The converted text.
    """
    text = unicode_helpers.to_unicode_or_bust(text)
    if "{" in text:
        # Backslashes are dropped before the lookup, so the mapping keys
        # are matched against the backslash-free form of the text.
        text = text.replace("\\", "")
        # ``items()`` instead of ``iteritems()``: works on Python 2 and 3.
        for escaped, replacement in self.bibtex_to_unicode.items():
            text = text.replace(escaped, replacement)
    return text
def _extract_biblio(self, page, id=None):
    """Extract the ``<title>`` and first ``<h1>`` text from an HTML page.

    The page is parsed with lxml.  If lxml rejects the document, a warning
    is logged and the title is recovered with a regular expression instead.

    Args:
        page: HTML content of the page; a falsy value yields an empty dict.
        id: Identifier of the item being processed; used only in the
            warning logged when the page cannot be parsed.

    Returns:
        A dict with optional ``"title"`` and ``"h1"`` keys holding the
        whitespace-stripped text.  Missing or blank values are omitted.
    """
    biblio_dict = {}
    if not page:
        return biblio_dict

    unicode_page = unicode_helpers.to_unicode_or_bust(page)
    try:
        parsed_html = lxml.html.document_fromstring(unicode_page)
        for tag in ("title", "h1"):
            node = parsed_html.find(".//" + tag)
            # Compare against None explicitly: an lxml element without
            # children evaluates as false.
            text = node.text if node is not None else None
            # Skip whitespace-only text rather than storing "".
            if text and text.strip():
                biblio_dict[tag] = text.strip()
    # throws ParserError when document is empty
    except (ValueError, lxml.etree.ParserError):
        logger.warning(
            u"%20s couldn't parse %s so giving up on webpage biblio",
            self.provider_name, id)
        match = re.search("<title>(.+?)</title>", unicode_page)
        if match is not None:
            # str.replace returns a new string; the result must be kept
            # for the line breaks to actually be removed.
            title = match.group(1).replace("\n", "").replace("\r", "")
            title = title.strip()
            if title:
                biblio_dict["title"] = title
    return biblio_dict
def _extract_biblio(self, page, id=None):
    """Build a small biblio dict (title and h1) from a web page's HTML.

    lxml is tried first.  When it raises ``ValueError`` or
    ``lxml.etree.ParserError``, a warning is logged and only the title is
    looked up, using a regular expression on the raw text.

    Args:
        page: HTML content of the page; a falsy value yields an empty dict.
        id: Identifier of the item being processed; it only appears in
            the warning logged on a parse failure.

    Returns:
        A dict that may contain ``"title"`` and ``"h1"``, each stripped of
        surrounding whitespace.  Blank values are never stored.
    """
    biblio_dict = {}
    if not page:
        return biblio_dict

    unicode_page = unicode_helpers.to_unicode_or_bust(page)
    try:
        parsed_html = lxml.html.document_fromstring(unicode_page)

        try:
            # find() returns None when the tag is absent, so ``.text``
            # raises AttributeError, which is treated as "no title".
            response = parsed_html.find(".//title").text
            if response and response.strip():
                biblio_dict["title"] = response.strip()
        except AttributeError:
            pass

        try:
            response = parsed_html.find(".//h1").text
            if response and response.strip():
                biblio_dict["h1"] = response.strip()
        except AttributeError:
            pass

    # throws ParserError when document is empty
    except (ValueError, lxml.etree.ParserError):
        logger.warning(
            u"%20s couldn't parse %s so giving up on webpage biblio",
            self.provider_name, id)
        try:
            # re.search() returns None when nothing matches, so
            # ``.group`` raises AttributeError, handled below.
            response = re.search(
                "<title>(.+?)</title>", unicode_page).group(1)
            # Strings are immutable: keep the value returned by replace(),
            # otherwise the line breaks are never removed.
            response = response.replace("\n", "")
            response = response.replace("\r", "")
            # Same blank-text guard as the lxml branch above.
            if response and response.strip():
                biblio_dict["title"] = response.strip()
        except AttributeError:
            pass
    return biblio_dict
def _to_unicode(self, text):
    """Return *text* converted by ``unicode_helpers.to_unicode_or_bust``.

    This is a thin wrapper: no processing is done beyond that call.
    """
    return unicode_helpers.to_unicode_or_bust(text)