from __future__ import absolute_import, division, print_function, unicode_literals

import cgi
import webencodings
import cchardet

from cosrlib import re
from .parsers import GUMBOCY_PARSER_HEAD

# Captures the encoding name from an XML 1.0 declaration
# (<?xml version="1.0" encoding="..."?>) at the start of the input,
# optionally preceded by whitespace.
_RE_XML_ENCODING = re.compile(
    r'^\s*\<\?xml\s+version\="1\.0"\s+encoding\="([^"]+)"\?\>')


# Extracts the "charset" parameter from a Content-Type header value and
# resolves it with webencodings.lookup().
#
# Returns the codec_info of the matching encoding. Falls through and returns
# None implicitly when the header has no (or an empty) charset parameter, or
# when the lookup result is falsy.
#
# The value is decoded as ASCII with undecodable bytes ignored before it is
# parsed, so content_type is presumably a byte string -- TODO confirm callers.
def get_encoding_from_content_type(content_type):
    _, params = cgi.parse_header(content_type.decode("ascii", "ignore"))
    if params.get("charset"):
        detected = webencodings.lookup(params["charset"])
        if detected:
            return detected.codec_info


class HTMLEncoding(object):
    """ This class deals with the many different encoding and quirks found in pages on the web, and tries to normalize everything in UTF-8 """

    def __init__(self, document):
        self.doc = document
        self.parser = None
        self.detected = None

    def ensure_utf8(self):
from __future__ import absolute_import, division, print_function, unicode_literals

import importlib
import os

from cosrlib import re, is_basestring
from cosrlib.url import URL

# Runs of characters other than lowercase ASCII letters and digits
_RE_SPLIT_URLWORDS = re.compile(r"[^a-z0-9]+")
# Runs of whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")
# Runs of whitespace
_RE_WHITESPLACE = re.compile(r"[\s]+")
# Everything up to and including the last "//" in the string (".*" is greedy)
_RE_STRIP_PROTOCOL = re.compile(r"^.*\/\/")


def load_document_type(doctype, *args, **kwargs):
    """ Loads and instantiates a [HTML, ...]Document class from a doctype.

    The class named "<DOCTYPE>Document" (doctype upper-cased) is looked up in
    the module "cosrlib.document.<doctype>", then called with *args and
    **kwargs. The new instance is returned.
    """
    cls_name = "%sDocument" % doctype.upper()
    # The module name uses the doctype as given; only the class name is upper-cased
    cls = getattr(importlib.import_module("cosrlib.document.%s" % doctype), cls_name)
    return cls(*args, **kwargs)


class Document(object):
    """ An indexable document. Base class for all document types (HTML, PDF, ...) """

    def __init__(self, source_data, url=None, headers=None, index_level=2):
        self.source_data = source_data
        self.source_headers = headers or {}
        self.index_level = index_level
        if not url:
            self.source_url = URL("")
from __future__ import absolute_import, division, print_function, unicode_literals import os import shutil from pyspark.sql import types as SparkTypes from cosrlib.url import URL from cosrlib.spark import createDataFrame, sql, SparkPlugin from cosrlib import re, py2_long from urlserver.id_generator import _fast_make_domain_id _RE_STRIP_FRAGMENT = re.compile(r"#.*") class WebGraphPlugin(SparkPlugin): """ Base class for WebGraph plugins """ include_external = True include_internal = True def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer): if self.include_external: schema.append( SparkTypes.StructField("external_links", SparkTypes.ArrayType(SparkTypes.StructType([ SparkTypes.StructField("href", SparkTypes.StringType(), nullable=False), SparkTypes.StructField("text", SparkTypes.StringType(), nullable=True) ])), nullable=True) )
import importlib
import os

from cosrlib import re
from cosrlib.url import URL

# Runs of characters other than lowercase ASCII letters and digits
_RE_SPLIT_URLWORDS = re.compile(r"[^a-z0-9]+")
# Runs of whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")
# Runs of whitespace
_RE_WHITESPLACE = re.compile(r"[\s]+")
# Everything up to and including the last "//" in the string (".*" is greedy)
_RE_STRIP_PROTOCOL = re.compile(r"^.*\/\/")


def load_document_type(doctype, *args, **kwargs):
    """ Loads and instantiates a [HTML, ...]Document class from a doctype.

    The class named "<DOCTYPE>Document" (doctype upper-cased) is looked up in
    the module "cosrlib.document.<doctype>", then called with *args and
    **kwargs. The new instance is returned.
    """
    cls_name = "%sDocument" % doctype.upper()
    # The module name uses the doctype as given; only the class name is upper-cased
    cls = getattr(importlib.import_module("cosrlib.document.%s" % doctype), cls_name)
    return cls(*args, **kwargs)


class Document(object):
    """ An indexable document. Base class for all document types (HTML, PDF, ...) """

    def __init__(self, source_data, url=None, headers=None, index_level=2):
        self.source_data = source_data
        self.source_headers = headers or {}
        self.index_level = index_level
        if not url:
            self.source_url = URL("")
        elif isinstance(url, basestring):
from __future__ import absolute_import, division, print_function, unicode_literals import unicodedata from cosrlib import re _RE_WHITESPLACE = re.compile(r"\s+") _RE_REMOVE_LAST_WORD = re.compile(r"\s([^\s]*)$") _RE_SPLIT_WORDS = re.compile(r"[\s\W]+") # Some titles are useless and should be replaced by something more relevant BLACKLISTED_TITLES = frozenset(["home", "default"]) # Some summaries are useless and should be replaced by something more relevant BLACKLISTED_SUMMARIES = frozenset(["default"]) # Maximum length for titles TITLE_MAX_LENGTH = 70 # Maximum length for summaries SUMMARY_MAX_LENGTH = 160 def unicode_truncate(s, length, keep_words=False, ellipsis="..."): """ Truncates an UTF-8 string and return it as unicode """ encoded = s.decode("utf-8", "ignore") # If the unicode form is already under the length, return directly if len(encoded) <= length: return encoded
from __future__ import absolute_import, division, print_function, unicode_literals import urlparse from cosrlib import re from cosrlib.url import URL from cosrlib.document import Document from .htmlencoding import HTMLEncoding from .parsers import GUMBOCY_PARSER _RE_STRIP_TAGS = re.compile(r"<.*?>") _RE_VALID_NETLOC = re.compile(r"^[a-zA-Z0-9-]+\.[a-zA-Z0-9.:-]+$") class HTMLDocument(Document): """ Class representing an HTML document, with methods for parsing and extracting content """ # pylint: disable=attribute-defined-outside-init def __init__(self, *args, **kwargs): Document.__init__(self, *args, **kwargs) self.encoding = HTMLEncoding(self) self.analysis = None def discard_source_data(self): """ Remove source_data from memory """ del self.source_data def parse(self): """
import urlparse import gumbocy from cosrlib import re from cosrlib.url import URL from cosrlib.document import Document from . import defs from .htmlencoding import HTMLEncoding _RE_SEARCH_STYLE_HIDDEN = re.compile( r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") _RE_STRIP_TAGS = re.compile(r"<.*?>") class HTMLDocument(Document): """ Class representing an HTML document, with methods for parsing and extracting content """ # pylint: disable=attribute-defined-outside-init def __init__(self, *args, **kwargs): Document.__init__(self, *args, **kwargs) self.encoding = HTMLEncoding(self) self.parser = None def reset(self): """ Reset the tree traversal properties """ # Store infos found in the <head> for further reuse
from __future__ import absolute_import, division, print_function, unicode_literals import os import shutil from pyspark.sql import types as SparkTypes from cosrlib.url import URL from cosrlib.spark import createDataFrame, sql, SparkPlugin from cosrlib import re, py2_long from urlserver.id_generator import _fast_make_domain_id _RE_STRIP_FRAGMENT = re.compile(r"#.*") class WebGraphPlugin(SparkPlugin): """ Base class for WebGraph plugins """ include_external = True include_internal = True def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer): if self.include_external: schema.append( SparkTypes.StructField( "external_links", SparkTypes.ArrayType( SparkTypes.StructType([ SparkTypes.StructField("href", SparkTypes.StringType(),
def init(self):
    """ Compiles the domain-suffix regex from the "domains" argument.

    Reads the whitespace-separated domain list in self.args["domains"] and
    stores:
      - self.regex_source: one escaped, "$"-anchored alternative per domain,
        joined with "|"
      - self.regex: the compiled form of self.regex_source

    Raises KeyError if self.args has no "domains" entry.
    """

    # split() without an argument drops the empty tokens that split(" ")
    # produces for doubled, leading or trailing spaces. An empty token would
    # turn into a bare "$" alternative, which matches every string and
    # silently disables the filter.
    suffixes = self.args["domains"].split()

    # Match based on domain suffixes
    self.regex_source = "|".join(re.escape(d) + "$" for d in suffixes)
    self.regex = re.compile(self.regex_source)
import urlparse import gumbocy from cosrlib import re from cosrlib.url import URL from cosrlib.document import Document from . import defs from .htmlencoding import HTMLEncoding _RE_SEARCH_STYLE_HIDDEN = re.compile(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)") _RE_STRIP_TAGS = re.compile(r"<.*?>") class HTMLDocument(Document): """ Class representing an HTML document, with methods for parsing and extracting content """ # pylint: disable=attribute-defined-outside-init def __init__(self, *args, **kwargs): Document.__init__(self, *args, **kwargs) self.encoding = HTMLEncoding(self) self.parser = None def reset(self): """ Reset the tree traversal properties """ # Store infos found in the <head> for further reuse
from __future__ import absolute_import, division, print_function, unicode_literals import urlparse from cosrlib import re from cosrlib.url import URL from cosrlib.document import Document from .htmlencoding import HTMLEncoding from .parsers import GUMBOCY_PARSER _RE_STRIP_TAGS = re.compile(r"<.*?>") _RE_VALID_NETLOC = re.compile(r"^[a-zA-Z0-9-]+\.[a-zA-Z0-9.:-]+$") class HTMLDocument(Document): """ Class representing an HTML document, with methods for parsing and extracting content """ # pylint: disable=attribute-defined-outside-init def __init__(self, *args, **kwargs): Document.__init__(self, *args, **kwargs) self.encoding = HTMLEncoding(self) self.analysis = None def discard_source_data(self): """ Remove source_data from memory """ del self.source_data def parse(self):
import cgi
import webencodings
import cchardet

from cosrlib import re
from .parsers import GUMBOCY_PARSER_HEAD

# Captures the encoding name from an XML 1.0 declaration
# (<?xml version="1.0" encoding="..."?>) at the start of the input,
# optionally preceded by whitespace.
_RE_XML_ENCODING = re.compile(r'^\s*\<\?xml\s+version\="1\.0"\s+encoding\="([^"]+)"\?\>')


# Extracts the "charset" parameter from a Content-Type header value and
# resolves it with webencodings.lookup().
#
# Returns the codec_info of the matching encoding. Falls through and returns
# None implicitly when the header has no (or an empty) charset parameter, or
# when the lookup result is falsy.
#
# The value is decoded as ASCII with undecodable bytes ignored before it is
# parsed, so content_type is presumably a byte string -- TODO confirm callers.
def get_encoding_from_content_type(content_type):
    _, params = cgi.parse_header(content_type.decode("ascii", "ignore"))
    if params.get("charset"):
        detected = webencodings.lookup(params["charset"])
        if detected:
            return detected.codec_info


class HTMLEncoding(object):
    """ This class deals with the many different encoding and quirks found in pages on the web, and tries to normalize everything in UTF-8 """

    def __init__(self, document):
        self.doc = document
        self.parser = None
        self.detected = None

    def ensure_utf8(self):
        """ Makes sure we have only UTF-8 in source_data """
from __future__ import absolute_import, division, print_function, unicode_literals import unicodedata from cosrlib import re _RE_WHITESPLACE = re.compile(r"\s+") _RE_REMOVE_LAST_WORD = re.compile(r"\s([^\s]*)$") _RE_SPLIT_WORDS = re.compile(r"[\s\W]+") # Some titles are useless and should be replaced by something more relevant BLACKLISTED_TITLES = frozenset(["home", "default"]) # Some summaries are useless and should be replaced by something more relevant BLACKLISTED_SUMMARIES = frozenset(["default"]) # Maximum length for titles TITLE_MAX_LENGTH = 70 # Maximum length for summaries SUMMARY_MAX_LENGTH = 160 def unicode_truncate(s, length, keep_words=False, ellipsis=u"..."): """ Truncates an UTF-8 string and return it as unicode """ encoded = s.decode("utf-8", "ignore") # If the unicode form is already under the length, return directly if len(encoded) <= length: return encoded