def main_text_extraction(root, min_words=10):
    """
    Extracts the main text of a detailed page.

    The node with the highest word count (as reported by
    ``find_most_text_node``) is taken as the core of the content. Its
    siblings are then inspected and added when they carry enough text.

    :param HTMLNode root: the root node of the page
    :param int min_words: minimum number of words a sibling must contain
           to be counted as part of the main content (defaults to 10)
    :return: list of HTMLNode: the main content area, in document order
    """
    # Get the node with the most content
    ret_dict = find_most_text_node(root)
    main_node = root.find_id(ret_dict['id'])
    main_content = [main_node]

    # A node without a parent has no siblings to inspect
    parent = main_node.parent
    if parent is None:
        return main_content

    # Checking the neighbors of the main node
    neighbors = parent.get_children()
    ind = neighbors.index(main_node)
    # Left siblings are visited from the nearest to the farthest one
    left_neighbors = list(reversed(neighbors[:ind]))
    # Every sibling after the main node, including the very last one
    right_neighbors = neighbors[ind + 1:]

    for left in left_neighbors:
        if tp.number_of_words(left.get_full_text()) >= min_words:
            main_content.insert(0, left)
        elif left.type in HEADER:
            # A header closes the content area on the left side
            main_content.insert(0, left)
            break

    for right in right_neighbors:
        if tp.number_of_words(right.get_full_text()) >= min_words:
            main_content.append(right)

    return main_content
Пример #2
0
    def score_check(self, node):
        """
        Scores the node against four equally weighted conditions.

        :param HTMLNode node: the node to score
        :return: float: reached score divided by the maximum score,
                 i.e. a value between 0.0 and 1.0
        """
        # Preparations
        score = 0
        weights = [2, 2, 2, 2]
        max_value = sum(weights)
        text = DataValidator.flatten_text(node.text)

        # Condition 1: contains no header tags
        header_tags = ['header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        if not any(node.has_tag(tag) for tag in header_tags):
            score += weights[0]

        # Condition 2: Contains address and house number
        # Regexp for searching for house numbers, allowed are f.e.:
        # Somestreet 12, Somestreet 12a
        # but not:
        # Somestreet 12sometextwithoutanywhitespace
        # (raw strings keep the backslash escapes valid for the re module)
        if re.search(r'([A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?\s+|[A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?$)', text) is not None:
            score += weights[1]

        # Condition 3: Contains postal code(Germany)
        # Regexp for postal code: five digits surrounded by whitespace
        # or placed at the end of the text
        if re.search(r'(\s+[0-9]{5}\s+|\s+[0-9]{5}$)', text) is not None:
            score += weights[2]

        # Condition 4: Contains less than 11 words
        if tp.number_of_words(text) <= 10:
            score += weights[3]

        return float(score) / float(max_value)
Пример #3
0
    def score_check(self, node):
        """
        Scores the node against six weighted conditions.

        Condition 4 is only evaluated when condition 3 holds, because
        it is nested inside it.

        :param HTMLNode node: the node to score
        :return: float: reached score divided by the sum of all weights
        """
        # Preparations
        score = 0
        weights = [2, 3, 2, 2, 2, 1]
        max_value = sum(weights)
        # Flattened once and shared by the text based conditions
        text = DataValidator.flatten_text(node.text)

        # Condition 1: Contains more uppercase words
        if not DataValidator.contains_more_lower_than(node, 0.5):
            score += weights[0]

        # Condition 2: Contains header
        header_tags = ['h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        if any(node.has_tag(tag) for tag in header_tags):
            score += weights[1]

        # Condition 3: Contains less than 11 words
        if tp.number_of_words(text) <= 10:
            score += weights[2]
            # Condition 4: Contains symbols - and :
            if re.search(r'[:\-]+', text):
                score += weights[3]

        # Condition 5: Contains a link node
        if node.has_tag('a'):
            score += weights[4]

        # Condition 6: Contains no numbers or slashes
        if re.search(r'[0-9\/]+', text) is None:
            score += weights[5]

        return float(score) / float(max_value)
Пример #4
0
 def base_check(self, node):
     """
     Accepts only short nodes with at most one child.

     :param HTMLNode node: the node to check
     :return: bool: False if the flattened text has more than 15 words
              or the node has more than one child, True otherwise
     """
     flat_text = DataValidator.flatten_text(node.text)
     short_enough = tp.number_of_words(flat_text) <= 15
     # The children are only looked at when the text is short enough
     return short_enough and len(node.get_children()) <= 1
Пример #5
0
 def hit_check(self, node):
     """
     Reports a hit for nodes with a known id/class or with long text.

     :param HTMLNode node: the node to check
     :return: bool: True if the node matches one of
              ``ShortDescValidator.ids_and_classes`` or its flattened
              text has at least 20 words, False otherwise
     """
     known_marker = DataValidator.in_class_ids(
         node, ShortDescValidator.ids_and_classes)
     if known_marker:
         return True
     word_total = tp.number_of_words(DataValidator.flatten_text(node.text))
     return word_total >= 20
Пример #6
0
    def base_check(self, node):
        """
        Decides whether the node qualifies for further checks.

        :param HTMLNode node: the node to check
        :return: bool: False if the flattened text has fewer than 3
                 words or any child reports a div, span or td tag;
                 otherwise True when the node itself is a span, div or
                 td, or when the text has at least 15 words
        """
        container_tags = ('div', 'span', 'td')
        word_total = tp.number_of_words(DataValidator.flatten_text(node.text))

        # Too little text to be of any use
        if word_total < 3:
            return False

        # Nested containers disqualify the node
        for child in node.get_children():
            if any(child.has_tag(tag) for tag in container_tags):
                return False

        is_container = node.type in container_tags
        return is_container or word_total >= 15
Пример #7
0
    def score_check(self, node):
        """
        Scores the node by text length, keyword presence and classes.

        :param HTMLNode node: the node to score
        :return: float: reached score divided by the highest score
                 that can be reached
        """
        weights = [2, 3, 1, 3]
        # The short-text and long-text branches exclude each other, so
        # the weight of the long-text branch (1) is not part of the maximum
        max_value = sum(weights) - 1
        words = ['am', 'vom', 'bis', 'zum', 'datum']

        lowered = DataValidator.flatten_text(node.text.lower())
        has_keyword = any(word in lowered for word in words)
        word_total = tp.number_of_words(DataValidator.flatten_text(node.text))

        if word_total <= 15:
            # Condition 1: at most 15 words
            # Condition 2: additionally contains suitable words
            score = weights[0] + (weights[1] if has_keyword else 0)
        else:
            # Condition 3: suitable words in a text with more than 15 words
            score = weights[2] if has_keyword else 0

        # Condition 4: Contains suitable classes
        if DataValidator.in_class_ids(node, DateValidator.hit_labels):
            score += weights[3]

        return float(score) / float(max_value)
Пример #8
0
 def hit_check(self, node):
     """
     Reports a hit for nodes with a known id/class or a keyword prefix.

     A keyword only counts when the flattened text has at most 10 words
     and the lowercased text starts (after optional whitespace) with the
     keyword, followed by whitespace or a colon and further text.

     :param HTMLNode node: the node to check
     :return: bool: True on a hit, False otherwise
     """
     if DataValidator.in_class_ids(node, LocationValidator.ids_classes):
         return True
     text = DataValidator.flatten_text(node.text)
     if tp.number_of_words(text) <= 10:
         # Lowercased once instead of once per keyword
         lowered = text.lower()
         for word in LocationValidator.key_words:
             # Raw string keeps the regex escapes valid for the re module
             reg = r'(\s+|^)%s(\s+|:)\s*\S+' % word
             if re.match(reg, lowered):
                 return True
     return False
Пример #9
0
def run_node_detection(node, n, label_dict):
    """
    This function runs all the validation process of
    each validator for a given node and saves the information
    in the data container of the node and in the given
    label_dict. The function is recursive, regarding the
    n number, which stands for the level in the tree of valid nodes.
    Here a valid node is a node, that is not a <p> or <strong> tag,
    but can contain those as children.

    The result of each validator is handled as follows: a tuple is
    stored as a score, None marks the label as not applicable, and any
    other value except DataLabel.UNKNOWN is stored as a hit. A NOISE
    hit stops the remaining validators for this node.

    :param HTMLNode node: the node to validate
    :param int n: the number of the valid node
    :param dict label_dict: the dict, where information about hits and
           scores will be stored for each label.
    :return: None
    """
    # If the height is too big
    if n > MOD_MAX_HEIGHT:
        return

    class_list = [
        NoiseValidator, DateValidator, TimeValidator, TitleValidator,
        ShortDescValidator, LocationValidator, LinkValidator
    ]
    node.data_container['label'] = {'hits': [], 'scores': [], 'not': []}
    for cl in class_list:
        obj = cl()
        if obj.get_label() not in label_dict:
            label_dict[obj.get_label()] = {'hits': [], 'scores': []}

        ret = obj.run_checks(node)
        if isinstance(ret, tuple):
            # ret[0] and ret[1] are used as (score, label)
            node.data_container['label']['scores'].append(ret)
            if (ret[0], node) not in label_dict[ret[1]]['scores']:
                label_dict[ret[1]]['scores'].append((ret[0], node))
        elif ret is None:
            node.data_container['label']['not'].append(obj.get_label())
        elif ret is not DataLabel.UNKNOWN:
            if node not in label_dict[ret]['hits']:
                label_dict[ret]['hits'].append(node)
            node.data_container['label']['hits'].append(ret)
            if ret is DataLabel.NOISE:
                # No further validators are run for a noise node
                break

    parent = node.parent
    if parent is None:
        return

    # Because those types are formatting areas of webpages
    if node.type in ['div', 'td', 'tr', 'tbody', 'table']:
        next_level = n + 1
    elif node.type in FORMAT_TAGS or tp.number_of_words(
            node.get_pure_text()) > 0:
        # Format tags and nodes with own text stay on the same level
        next_level = n
    else:
        next_level = n + 1
    run_node_detection(parent, next_level, label_dict)
Пример #10
0
 def base_check(self, node):
     """
     Rejects nodes that are too long, contain certain symbols or
     fail the letter case check.

     :param HTMLNode node: the node to check
     :return: bool: False if the flattened text has more than 15 words,
              contains a slash, question mark or exclamation mark, or
              ``DataValidator.contains_more_lower_than(node, 1/3)``
              is true; True otherwise
     """
     text = DataValidator.flatten_text(node.text)
     if tp.number_of_words(text) > 15:
         return False
     # Raw string keeps the regex escape valid for the re module
     if re.search(r'[\/?!]', text):
         return False
     # Checking the cases
     if DataValidator.contains_more_lower_than(node, (1.0/3.0)):
         return False
     return True
Пример #11
0
    def hit_check(self, node):
        """
        Reports a hit for empty nodes and for copyright notices.

        :param HTMLNode node: the node to check
        :return: bool: True if the flattened text has no words or
                 contains one of the keywords (case-insensitive),
                 False otherwise
        """
        keywords = ['©', 'copyright']
        text = DataValidator.flatten_text(node.text)

        if tp.number_of_words(text) == 0:
            return True

        # The keywords are lowercase, so the text is lowercased as well;
        # otherwise spellings like "Copyright" would be missed
        lowered = text.lower()
        return any(kw in lowered for kw in keywords)
Пример #12
0
def find_most_text_node(node):
    """
    Searches the subtree of the given node for the highest word count.

    On equal counts the node that was visited first is kept, so a
    parent wins over its descendants and an earlier child over a
    later one.

    :param HTMLNode node: the root node, where to start
    :return: dict : 'id' is the id of a node, 'words' is the word count
    """
    own_count = tp.number_of_words(get_unformatted_text(node))
    best = {'id': node.identification, 'words': own_count}

    for child in node.get_children():
        candidate = find_most_text_node(child)
        # Only a strictly larger count replaces the current best
        if candidate['words'] > best['words']:
            best = candidate

    return best
Пример #13
0
    def score_check(self, node):
        """
        Scores the node against five weighted conditions.

        :param HTMLNode node: the node to score
        :return: float: reached score divided by the sum of all
                 weights, i.e. a value between 0.0 and 1.0
        """
        # preparations
        score = 0
        weights = [2, 1, 3, 2, 2]
        # The score is a sum of weights, so the maximum is their sum
        # (not their product)
        max_value = sum(weights)
        text = DataValidator.flatten_text(node.text)

        # Condition 1: at least 15 words
        if tp.number_of_words(text) >= 15:
            score += weights[0]

        # Condition 2: Contains quotation marks
        # (search instead of match: the quote may start anywhere)
        if re.search(r'".+"', text) is not None:
            score += weights[1]

        # Condition 3: Does not contain any headers
        header_tags = [
            'stronger', 'header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        ]
        if not any(node.has_tag(tag) for tag in header_tags):
            score += weights[2]

        # Condition 4: Contains more lowercase words than uppercase
        if DataValidator.contains_more_lower_than(node):
            score += weights[3]

        # Condition 5: Contains full stops, exclamation signs,
        #              questions signs, semicolons, colons
        # (search instead of match: punctuation is rarely the first
        #  character of the text)
        if re.search(r'[.:;?!]\s+|[.:;?!]$', text):
            score += weights[4]

        # Returning score
        return float(score) / float(max_value)
Пример #14
0
    def score_check(self, node):
        """
        Scores the node by text length and keyword presence.

        A text with at most 15 words scores weights[0], plus weights[1]
        when it also contains one of the keywords. A longer text only
        scores weights[2] when it contains a keyword.

        :param HTMLNode node: the node to score
        :return: float: reached score divided by the highest score
                 that can be reached, i.e. a value between 0.0 and 1.0
        """
        # Preparations
        score = 0
        weights = [2, 3, 1]
        # The two branches exclude each other, so the weight of the
        # else branch (weights[2] == 1) is not part of the maximum
        max_value = sum(weights) - weights[2]
        words = ['von', 'bis', 'ab', 'uhrzeit', 'um']

        lowered = DataValidator.flatten_text(node.text.lower())
        has_keyword = any(word in lowered for word in words)

        # Condition 1: contains less than 16 words
        if tp.number_of_words(DataValidator.flatten_text(node.text)) <= 15:
            score += weights[0]
            # Condition 2: short text that also contains suitable words
            if has_keyword:
                score += weights[1]
        elif has_keyword:
            # Condition 3: contains suitable words but more than 15 words
            score += weights[2]

        return float(score) / float(max_value)