Example #1
# Requires `import re` at module scope; `normalize` and `get_text_and_tail`
# are project helpers defined elsewhere.
def get_fingerprints(self, node):
    # Build (tag, attribute, value, text) fingerprints for a node. These are
    # used to recognise template (boilerplate) nodes that repeat across pages.
    res = []
    text = normalize(get_text_and_tail(node)).strip()
    if node.tag == 'a' and 'href' in node.attrib:
        res = [(node.tag, node.attrib['href'], '', '')]
    if text:
        res += [(node.tag, a, node.attrib[a], text) for a in node.attrib]
        if node.tag == 'a':
            res += [(node.tag, '', '', text)]
        if not res:
            res = [(node.tag, '', '', text)]
    else:
        if 'style' in node.attrib:
            # Extract an absolute URL from an inline background-image style.
            tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                             node.attrib['style'])
            if tmp:
                # Note: this branch yields a 2-tuple, unlike the 4-tuples above.
                res += [('style', tmp[0])]
    return res
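As a rough illustration of how such fingerprints could be used (the counting
scheme below is an assumption for illustration, not code from the project):
tally fingerprints across several pages of one domain, and treat nodes whose
fingerprints recur on most pages as template noise.

    from collections import Counter

    def count_fingerprints(extractor, trees):
        # Hypothetical helper: tally fingerprints over parsed lxml trees.
        counts = Counter()
        for tree in trees:
            for node in tree.iter():
                counts.update(extractor.get_fingerprints(node))
        return counts

    # Fingerprints seen on (nearly) every page likely belong to the template:
    # template = {fp for fp, n in counts.items() if n >= 0.8 * len(trees)}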
Example #2
    def process(self, url, tree, remove_visuals, exclude_data):
        self.remove_bad_xpaths_from_tree(tree)
        if self.detected_language is None:
            self.detected_language = get_language(
                tree, self.url_to_headers_mapping[url], self.domain)
        # print('language: {}'.format(self.detected_language))
        # pre_text_content = normalize('\n'.join([get_text_and_tail(x) for x in tree.iter()]))

        # The author has to be attempted before duplicate removal, since an
        # author name is likely to occur more often than once.
        self.domain_nodes_dict.remove_template(tree)
        hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors, meta_authors = get_author(
            tree, self.detected_language)
        self.domain_nodes_dict.remove_author(tree)
        title = getRuleTitle(tree)
        # filter duplicate images by src
        ok_imgs = get_images(tree)
        titleind = ()
        imginds = []
        contentinds = []

        # Collect document-order indices for notable nodes such as the title,
        # images, internal links and text content.

        # Internal links: <a> elements pointing at this domain with visible
        # text (iterlinks yields (element, attribute, link, pos) tuples).
        link_eles = [link[0] for link in tree.iterlinks()
                     if link[0].tag == 'a' and link[2] and
                     link[2].startswith(self.domain) and
                     get_text_and_tail(link[0]).strip()]

        linkinds = []
        for num, node in enumerate(tree.iter()):
            if node in ok_imgs:
                imginds.append((node, num))
            elif normalize(get_text_and_tail(node)) == title:
                titleind = (node, num)
            elif get_text_and_tail(node).strip():
                if node in link_eles:
                    linkinds.append((node, num))
                contentinds.append((node, num))
            # Clean up visual clutter when rendering the cleaned page
            if remove_visuals:
                if node.tag == 'input':
                    node.set('type', 'hidden')
                elif node.tag == 'a' and not get_text_and_tail(node).strip():
                    for att in node.attrib:
                        node.set(att, '')
                if node.tag == 'img':
                    node.set('alt', '')
                if node.attrib and 'background-image' in node.attrib:
                    node.set('background-image', '')
        if not titleind:
            # fuzzy token text / title matching
            title_set = set(title.split())
            for num, node in enumerate(tree.iter()):
                text_content = get_text_and_tail(node)
                if text_content and len(text_content) < 500:
                    text_set = set(text_content.split())
                    if fscore(title_set, text_set) > 0.5:
                        titleind = (node, num)
                        break
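        # Note: fscore is a project helper not shown in this example. Given its
        # use on two token sets with a 0.5 threshold, it is presumably an
        # F1-style overlap such as 2 * |A & B| / (|A| + |B|); under that
        # reading, {'breaking', 'news'} vs {'breaking', 'news', 'today'}
        # scores 2 * 2 / (2 + 3) = 0.8 and would count as a title match.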

        if titleind:
            # Prefer images that appear close to the title node.
            sortedimgs = sorted(imginds, key=lambda x: abs(x[1] - titleind[1]))
        else:
            sortedimgs = []

        images = []
        for x in sortedimgs:
            val = None
            if 'src' in x[0].attrib:
                val = x[0].attrib['src']
            elif 'content' in x[0].attrib:
                val = x[0].attrib['content']
            elif 'style' in x[0].attrib:
                tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)', x[0].attrib['style'])
                if tmp:
                    val = tmp[0]
            if val is not None and val not in images:
                images.append(val)

        author = ''
        author_node_index = None
        date = '1970-01-01'  # epoch fallback when no title node was found
        if titleind:
            date = get_dates(tree, titleind, self.detected_language)
            # excluding soft dates (meta; they won't work anyway)

            # Pick the candidate closest to the title, trying the hardest
            # (most reliable) author signals first.
            for at in [hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors]:
                if at:
                    author, author_node_index = sorted(
                        at, key=lambda x: abs(x[1] - titleind[1]))[0]
                    break

        if not author and meta_authors:
            # Fall back to the first author found in the page's meta tags.
            author = next(iter(meta_authors))

        if author_node_index is not None:
            for num, node in enumerate(tree.iter()):
                if num == author_node_index:
                    break
            # Caveat: when a year appears in the title, this can blank the
            # title node instead of the author node.
            # print('removing author content', node.text)
            node.text = ''
            node.tail = ''

        cleaned_html = lxml.html.tostring(tree).decode('utf8')

        body_content = self.get_content(cleaned_html)

        if not body_content:
            body_content = []
            title_len = len(title)
            title_tokens = set(title.split())
            # `or 1` avoids a zero division below when the title is empty.
            len_title_tokens = len(title_tokens) or 1
            last_text_node_num = get_last_text_non_a_node(tree)
            for num, x in enumerate(tree.iter()):
                txt = normalize(get_text_and_tail(x))
                if txt:
                    # Guard: titleind may still be an empty tuple here.
                    if titleind and num < titleind[1]:
                        # print('removed pre-title', txt)
                        x.text = ''
                        x.tail = ''
                        continue
                    if last_text_node_num > 0 and num > last_text_node_num:
                        # print('removed post-content', txt)
                        x.text = ''
                        continue
                    n = len(txt)
                    # Drop short text blocks that heavily overlap the title's
                    # tokens (repeated titles, breadcrumbs and the like).
                    txt_tokens = set(txt.split())
                    n_matching = len(txt_tokens & title_tokens)
                    if (n < title_len * 3 and n_matching / len(txt_tokens) > 0.3 and
                            n_matching / len_title_tokens > 0.3):
                        # print('removed!', txt)
                        continue
                    body_content.append(txt)

        links = [x.attrib['href'] for x in tree.xpath('//a')
                 if 'href' in x.attrib and
                 x.attrib['href'].startswith(self.domain) and
                 self.should_save(x.attrib['href'])]

        money_amounts = money.find('\n'.join(body_content), 1000) + money.find(title, 1000)
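        # Note: money.find is a project helper (not shown); judging from the
        # call, it scans the given text for money amounts, with 1000 acting
        # as a limit.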

        data = {'title': title,
                'body': body_content,
                'images': images,
                'publish_date': str(date),
                'author': author,
                'cleaned': cleaned_html,
                'language': self.detected_language,
                'url': url,
                'domain': self.domain,
                'money': money_amounts,
                'summary': '',
                'related': get_sorted_links(links, url)[:5]}

        if 'overwrite_values_by_xpath' in self.config:
            for k, v in self.config['overwrite_values_by_xpath'].items():
                new = tree.xpath(v)
                data[k] = new[0] if isinstance(new, list) else new

        filtered_data = {k: v for k, v in data.items() if k not in exclude_data}

        return filtered_data
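To give a sense of how this method might be driven (the call site below is an
assumption for illustration; only process's signature is taken from the
example, and raw_html, url and extractor are placeholders):

    import lxml.html

    # Hypothetical call site: parse a fetched page and extract structured
    # data, excluding the full cleaned HTML from the result.
    tree = lxml.html.fromstring(raw_html)
    result = extractor.process(url, tree, remove_visuals=True,
                               exclude_data={'cleaned'})
    print(result['title'], result['publish_date'], result['author'])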