示例#1
0
from __future__ import absolute_import, division, print_function, unicode_literals

import cgi

import webencodings
import cchardet

from cosrlib import re
from .parsers import GUMBOCY_PARSER_HEAD

# Matches an XML declaration at the very start of the input (leading whitespace allowed),
# in the exact form <?xml version="1.0" encoding="..."?> with double quotes.
# Group 1 captures the declared encoding label.
_RE_XML_ENCODING = re.compile(
    r'^\s*\<\?xml\s+version\="1\.0"\s+encoding\="([^"]+)"\?\>')


def get_encoding_from_content_type(content_type):
    """ Returns the codec info for the charset declared in a Content-Type header value.

        The value is decoded as ASCII (undecodable bytes are dropped) and parsed with
        cgi.parse_header. Returns None when there is no non-empty charset parameter or
        when webencodings.lookup() gives back nothing for that label. """
    header_params = cgi.parse_header(content_type.decode("ascii", "ignore"))[1]

    charset = header_params.get("charset")
    if not charset:
        return None

    encoding = webencodings.lookup(charset)
    if not encoding:
        return None

    return encoding.codec_info


class HTMLEncoding(object):
    """ This class deals with the many different encoding and quirks found in pages on the web,
        and tries to normalize everything in UTF-8 """
    def __init__(self, document):
        self.doc = document
        self.parser = None
        self.detected = None

    def ensure_utf8(self):
示例#2
0
from __future__ import absolute_import, division, print_function, unicode_literals

import importlib
import os

from cosrlib import re, is_basestring
from cosrlib.url import URL

# One or more characters that are not lowercase ASCII letters or digits
_RE_SPLIT_URLWORDS = re.compile(r"[^a-z0-9]+")
# One or more whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")
# One or more whitespace characters
# NOTE(review): the name is misspelled ("WHITESPLACE"); left as-is because code outside
# this view may reference it.
_RE_WHITESPLACE = re.compile(r"[\s]+")
# Everything from the start of the string up to and including a "//" (greedy, so the
# last "//" reachable without crossing a newline)
_RE_STRIP_PROTOCOL = re.compile(r"^.*\/\/")


def load_document_type(doctype, *args, **kwargs):
    """ Builds a document object for the given doctype.

        Imports the module cosrlib.document.<doctype>, fetches the class named
        <DOCTYPE>Document from it (e.g. "html" -> HTMLDocument) and returns an
        instance created with the remaining positional and keyword arguments. """
    wanted_class = "%sDocument" % doctype.upper()
    module = importlib.import_module("cosrlib.document.%s" % doctype)
    return getattr(module, wanted_class)(*args, **kwargs)


class Document(object):
    """ An indexable document. Base class for all document types (HTML, PDF, ...) """
    def __init__(self, source_data, url=None, headers=None, index_level=2):
        self.source_data = source_data
        self.source_headers = headers or {}
        self.index_level = index_level

        if not url:
            self.source_url = URL("")
示例#3
0
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import shutil

from pyspark.sql import types as SparkTypes

from cosrlib.url import URL
from cosrlib.spark import createDataFrame, sql, SparkPlugin
from cosrlib import re, py2_long
from urlserver.id_generator import _fast_make_domain_id


# A "#" and everything that follows it on the same line
_RE_STRIP_FRAGMENT = re.compile(r"#.*")


class WebGraphPlugin(SparkPlugin):
    """ Base class for WebGraph plugins """

    include_external = True
    include_internal = True

    def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer):

        if self.include_external:
            schema.append(
                SparkTypes.StructField("external_links", SparkTypes.ArrayType(SparkTypes.StructType([
                    SparkTypes.StructField("href", SparkTypes.StringType(), nullable=False),
                    SparkTypes.StructField("text", SparkTypes.StringType(), nullable=True)
                ])), nullable=True)
            )
示例#4
0
import importlib
import os

from cosrlib import re
from cosrlib.url import URL


# One or more characters that are not lowercase ASCII letters or digits
_RE_SPLIT_URLWORDS = re.compile(r"[^a-z0-9]+")
# One or more whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")
# One or more whitespace characters
# NOTE(review): the name is misspelled ("WHITESPLACE"); left as-is because code outside
# this view may reference it.
_RE_WHITESPLACE = re.compile(r"[\s]+")
# Everything from the start of the string up to and including a "//" (greedy, so the
# last "//" reachable without crossing a newline)
_RE_STRIP_PROTOCOL = re.compile(r"^.*\/\/")


def load_document_type(doctype, *args, **kwargs):
    """ Instantiates the Document subclass that handles a doctype.

        The class is looked up as <DOCTYPE>Document (doctype upper-cased) inside the
        module cosrlib.document.<doctype>; all extra arguments are forwarded to its
        constructor and the new instance is returned. """
    document_class_name = "%sDocument" % doctype.upper()
    document_module = importlib.import_module("cosrlib.document.%s" % doctype)
    document_class = getattr(document_module, document_class_name)
    return document_class(*args, **kwargs)


class Document(object):
    """ An indexable document. Base class for all document types (HTML, PDF, ...) """

    def __init__(self, source_data, url=None, headers=None, index_level=2):
        self.source_data = source_data
        self.source_headers = headers or {}
        self.index_level = index_level

        if not url:
            self.source_url = URL("")
        elif isinstance(url, basestring):
示例#5
0
from __future__ import absolute_import, division, print_function, unicode_literals

import unicodedata
from cosrlib import re


# One or more whitespace characters
# NOTE(review): the name is misspelled ("WHITESPLACE"); left as-is because code outside
# this view may reference it.
_RE_WHITESPLACE = re.compile(r"\s+")
# A whitespace character followed by a run of non-whitespace at the end of the string;
# group 1 captures that trailing run
_RE_REMOVE_LAST_WORD = re.compile(r"\s([^\s]*)$")
# One or more whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")

# Some titles are useless and should be replaced by something more relevant
BLACKLISTED_TITLES = frozenset(["home", "default"])

# Some summaries are useless and should be replaced by something more relevant
BLACKLISTED_SUMMARIES = frozenset(["default"])

# Maximum length for titles
TITLE_MAX_LENGTH = 70

# Maximum length for summaries
SUMMARY_MAX_LENGTH = 160


def unicode_truncate(s, length, keep_words=False, ellipsis="..."):
    """ Truncates an UTF-8 string and return it as unicode """

    encoded = s.decode("utf-8", "ignore")

    # If the unicode form is already under the length, return directly
    if len(encoded) <= length:
        return encoded
示例#6
0
from __future__ import absolute_import, division, print_function, unicode_literals

import urlparse

from cosrlib import re
from cosrlib.url import URL
from cosrlib.document import Document

from .htmlencoding import HTMLEncoding
from .parsers import GUMBOCY_PARSER

# Shortest span from a "<" to the next ">" on the same line (non-greedy)
_RE_STRIP_TAGS = re.compile(r"<.*?>")
# A run of ASCII letters, digits or hyphens, then a dot, then one or more ASCII letters,
# digits, dots, colons or hyphens, anchored at both ends of the string
_RE_VALID_NETLOC = re.compile(r"^[a-zA-Z0-9-]+\.[a-zA-Z0-9.:-]+$")


class HTMLDocument(Document):
    """ Class representing an HTML document, with methods for parsing and extracting content """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, *args, **kwargs):
        Document.__init__(self, *args, **kwargs)
        self.encoding = HTMLEncoding(self)
        self.analysis = None

    def discard_source_data(self):
        """ Remove source_data from memory """
        del self.source_data

    def parse(self):
        """
示例#7
0
import urlparse

import gumbocy

from cosrlib import re
from cosrlib.url import URL
from cosrlib.document import Document

from . import defs
from .htmlencoding import HTMLEncoding

# Matches "display:none" or "visibility:hidden", with optional whitespace on either side
# of the colon. No flags are passed, so the pattern is written as case-sensitive.
_RE_SEARCH_STYLE_HIDDEN = re.compile(
    r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)")
# Shortest span from a "<" to the next ">" on the same line (non-greedy)
_RE_STRIP_TAGS = re.compile(r"<.*?>")


class HTMLDocument(Document):
    """ Class representing an HTML document, with methods for parsing and extracting content """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, *args, **kwargs):
        Document.__init__(self, *args, **kwargs)

        self.encoding = HTMLEncoding(self)
        self.parser = None

    def reset(self):
        """ Reset the tree traversal properties """

        # Store infos found in the <head> for further reuse
示例#8
0
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import shutil

from pyspark.sql import types as SparkTypes

from cosrlib.url import URL
from cosrlib.spark import createDataFrame, sql, SparkPlugin
from cosrlib import re, py2_long
from urlserver.id_generator import _fast_make_domain_id

# A "#" and everything that follows it on the same line
_RE_STRIP_FRAGMENT = re.compile(r"#.*")


class WebGraphPlugin(SparkPlugin):
    """ Base class for WebGraph plugins """

    include_external = True
    include_internal = True

    def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer):

        if self.include_external:
            schema.append(
                SparkTypes.StructField(
                    "external_links",
                    SparkTypes.ArrayType(
                        SparkTypes.StructType([
                            SparkTypes.StructField("href",
                                                   SparkTypes.StringType(),
示例#9
0
 def init(self):
     """ Compiles the domain filter from the whitespace-separated list in self.args["domains"].

         Sets self.regex_source to an alternation of the escaped domains, each anchored at
         the end of the string, and self.regex to the compiled pattern. """

     # Split on any run of whitespace. Splitting on a single " " turns doubled, leading or
     # trailing spaces into empty entries, and an empty entry becomes a bare "$"
     # alternative that matches every input.
     # NOTE: an empty domain list still produces an empty pattern, as it effectively did
     # before; callers are presumably expected to pass at least one domain.
     domains = self.args["domains"].split()

     # Match based on domain suffixes
     self.regex_source = "|".join(re.escape(domain) + "$" for domain in domains)
     self.regex = re.compile(self.regex_source)
示例#10
0
import urlparse

import gumbocy

from cosrlib import re
from cosrlib.url import URL
from cosrlib.document import Document

from . import defs
from .htmlencoding import HTMLEncoding


# Matches "display:none" or "visibility:hidden", with optional whitespace on either side
# of the colon. No flags are passed, so the pattern is written as case-sensitive.
_RE_SEARCH_STYLE_HIDDEN = re.compile(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)")
# Shortest span from a "<" to the next ">" on the same line (non-greedy)
_RE_STRIP_TAGS = re.compile(r"<.*?>")


class HTMLDocument(Document):
    """ Class representing an HTML document, with methods for parsing and extracting content """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, *args, **kwargs):
        Document.__init__(self, *args, **kwargs)

        self.encoding = HTMLEncoding(self)
        self.parser = None

    def reset(self):
        """ Reset the tree traversal properties """

        # Store infos found in the <head> for further reuse
示例#11
0
from __future__ import absolute_import, division, print_function, unicode_literals

import urlparse

from cosrlib import re
from cosrlib.url import URL
from cosrlib.document import Document

from .htmlencoding import HTMLEncoding
from .parsers import GUMBOCY_PARSER


# Shortest span from a "<" to the next ">" on the same line (non-greedy)
_RE_STRIP_TAGS = re.compile(r"<.*?>")
# A run of ASCII letters, digits or hyphens, then a dot, then one or more ASCII letters,
# digits, dots, colons or hyphens, anchored at both ends of the string
_RE_VALID_NETLOC = re.compile(r"^[a-zA-Z0-9-]+\.[a-zA-Z0-9.:-]+$")


class HTMLDocument(Document):
    """ Class representing an HTML document, with methods for parsing and extracting content """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, *args, **kwargs):
        Document.__init__(self, *args, **kwargs)
        self.encoding = HTMLEncoding(self)
        self.analysis = None

    def discard_source_data(self):
        """ Remove source_data from memory """
        del self.source_data

    def parse(self):
示例#12
0
import cgi

import webencodings
import cchardet

from cosrlib import re
from .parsers import GUMBOCY_PARSER_HEAD


# Matches an XML declaration at the very start of the input (leading whitespace allowed),
# in the exact form <?xml version="1.0" encoding="..."?> with double quotes.
# Group 1 captures the declared encoding label.
_RE_XML_ENCODING = re.compile(r'^\s*\<\?xml\s+version\="1\.0"\s+encoding\="([^"]+)"\?\>')


def get_encoding_from_content_type(content_type):
    """ Extracts the charset parameter from a Content-Type header value and returns the
        matching codec info.

        The header is decoded as ASCII, ignoring undecodable bytes, before being parsed by
        cgi.parse_header. The result is None if the charset parameter is absent or empty,
        or if webencodings.lookup() returns nothing for it. """
    parsed = cgi.parse_header(content_type.decode("ascii", "ignore"))
    charset_label = parsed[1].get("charset")

    match = webencodings.lookup(charset_label) if charset_label else None
    return match.codec_info if match else None


class HTMLEncoding(object):
    """ This class deals with the many different encoding and quirks found in pages on the web,
        and tries to normalize everything in UTF-8 """

    def __init__(self, document):
        self.doc = document
        self.parser = None
        self.detected = None

    def ensure_utf8(self):
        """ Makes sure we have only UTF-8 in source_data """
示例#13
0
from __future__ import absolute_import, division, print_function, unicode_literals

import unicodedata
from cosrlib import re

# One or more whitespace characters
# NOTE(review): the name is misspelled ("WHITESPLACE"); left as-is because code outside
# this view may reference it.
_RE_WHITESPLACE = re.compile(r"\s+")
# A whitespace character followed by a run of non-whitespace at the end of the string;
# group 1 captures that trailing run
_RE_REMOVE_LAST_WORD = re.compile(r"\s([^\s]*)$")
# One or more whitespace and/or non-word characters
_RE_SPLIT_WORDS = re.compile(r"[\s\W]+")

# Some titles are useless and should be replaced by something more relevant
BLACKLISTED_TITLES = frozenset(["home", "default"])

# Some summaries are useless and should be replaced by something more relevant
BLACKLISTED_SUMMARIES = frozenset(["default"])

# Maximum length for titles
TITLE_MAX_LENGTH = 70

# Maximum length for summaries
SUMMARY_MAX_LENGTH = 160


def unicode_truncate(s, length, keep_words=False, ellipsis=u"..."):
    """ Truncates an UTF-8 string and return it as unicode """

    encoded = s.decode("utf-8", "ignore")

    # If the unicode form is already under the length, return directly
    if len(encoded) <= length:
        return encoded