Пример #1
0
 def possible_author(self, node):
     """Heuristically decide whether *node* may contain author information.

     Returns True when any attribute value contains an author-related
     keyword, or when the node's own text matches an ``author:``/``by:``
     style marker; otherwise False.
     """
     goods = ('author', 'by', 'publi', 'write', 'written', 'info')
     # Generator expression (not a list) lets any() short-circuit on the
     # first matching attribute value.
     if any(g in a for a in node.attrib.values() for g in goods):
         return True
     txt = get_text_and_tail(node)
     # bool(...) keeps the return type a plain True/False, not a Match object.
     return bool(re.search(r'\b(author|by)[: ]', txt))
Пример #2
0
 def possible_author(self, node):
     """Return True if *node* looks like it may hold author information."""
     # Keywords that suggest authorship when seen in an attribute value.
     goods = ['author', 'by', 'publi', 'write', 'written', 'info']
     for value in node.attrib.values():
         for keyword in goods:
             if keyword in value:
                 return True
     # No attribute matched: fall back to scanning the node's text for an
     # explicit "author:" / "by " marker.
     txt = get_text_and_tail(node)
     if re.search(r'\b(author|by)[: ]', txt):
         return True
     return False
Пример #3
0
 def get_fingerprints(self, node):
     """Build identifying fingerprint tuples for *node*.

     Fingerprints combine tag, attribute and text information; anchors
     additionally fingerprint on their link target, and text-less nodes
     fall back to a CSS background-image URL when one is present.
     """
     tag = node.tag
     attrib = node.attrib
     text = normalize(get_text_and_tail(node)).strip()
     fingerprints = []
     # Anchors are primarily identified by their href target.
     if tag == 'a' and 'href' in attrib:
         fingerprints.append((tag, attrib['href'], '', ''))
     if text:
         for name in attrib:
             fingerprints.append((tag, name, attrib[name], text))
         if tag == 'a':
             fingerprints.append((tag, '', '', text))
         if not fingerprints:
             # Attribute-less, non-anchor node: fingerprint on text alone.
             fingerprints.append((tag, '', '', text))
     elif 'style' in attrib:
         # No text at all: an inline background-image URL can still
         # identify the node.
         found = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                            attrib['style'])
         if found:
             fingerprints.append(('style', found[0]))
     return fingerprints
Пример #4
0
 def get_fingerprints(self, node):
     """Return a list of fingerprint tuples used to recognise this node."""
     text = normalize(get_text_and_tail(node)).strip()
     out = []
     is_anchor = node.tag == 'a'
     if is_anchor and 'href' in node.attrib:
         # Anchor nodes fingerprint on their target URL first.
         out = [(node.tag, node.attrib['href'], '', '')]
     if not text:
         # Without text, only a CSS background-image URL can add a fingerprint.
         style = node.attrib.get('style')
         if style is not None:
             matches = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                                  style)
             if matches:
                 out = out + [('style', matches[0])]
         return out
     # One fingerprint per attribute, each carrying the node text.
     out = out + [(node.tag, key, node.attrib[key], text)
                  for key in node.attrib]
     if is_anchor:
         out.append((node.tag, '', '', text))
     if not out:
         # Attribute-less non-anchor: fall back to a text-only fingerprint.
         out = [(node.tag, '', '', text)]
     return out
Пример #5
0
def get_author(tree, lang='en'):
    """Extract author candidates from an lxml *tree*.

    Returns a 5-tuple:
        hardest_authors     -- (author, node_index): attribute-based hits also
                               found in the page's meta tags
        not_hardest_authors -- (author, node_index): attribute-based hits only
        text_hard_authors   -- regex hits corroborated by meta (see note below)
        text_soft_authors   -- regex hits without meta corroboration
        meta_authors        -- set of authors parsed from <head><meta> tags

    *lang* selects translation of author markers for non-English pages.
    """
    # Couple ways of matching:
    # - Both in meta and in text
    # - Node has one of the goods in it, text has it in it, and the case is authoric
    # - Node has one of the goods in it
    # - Text has one of the goods in it
    goods = ['author', 'by', 'publi', 'write', 'written', 'info']
    hard_authors = []
    meta_authors = []
    text_hard_authors = []
    text_soft_authors = []
    meta_nodes = tree.xpath('//head/meta')

    for option in goods:

        for meta in meta_nodes:
            # Skip meta tags whose attribute values never mention this keyword.
            if not any([option in a for a in meta.values()]):
                continue

            # NOTE(review): translated from Dutch -- "this still goes quite
            # wrong! (I still need to check the attrs against goods)".  Every
            # attribute of a matching meta tag is parsed for an author, not
            # just the attribute that matched the keyword.
            for attr in meta.attrib:
                author = get_text_author(
                    author_translation(meta.attrib[attr], lang))
                if author:
                    meta_authors.append(author)

    for num, node in enumerate(tree.iter()):
        # hard author: the node itself (or one of its ancestors) must carry
        # an author-ish attribute value, otherwise skip it.
        if not any([g in a for a in node.attrib.values() for g in goods]):
            for parent in node.iterancestors():
                attr_values = parent.attrib.values()
                if any([g in a for a in attr_values for g in goods]):
                    break
            else:
                # No ancestor matched either: not an author candidate.
                continue
        tailtext = get_text_and_tail(node).strip()
        # Bylines are short; the 200-char cap filters out body paragraphs.
        if tailtext and len(tailtext) < 200:
            if lang != 'en':
                tailtext = author_translation(tailtext, lang)
            hard_author = get_text_author(tailtext)
            if hard_author:
                hard_authors.append((num, hard_author))

    # Second pass: look for explicit "Author: Name" / "by Name" patterns in
    # any short text node, regardless of attributes.
    for num, node in enumerate(tree.iter()):
        tailtext = get_text_and_tail(node).strip()
        if tailtext and len(tailtext) < 200:
            res = re.findall(r"(author|Author|AUTHOR)[:;]* ([A-Z][a-zA-Z' ]+)",
                             tailtext)
            if res:
                res = res[0]
                # NOTE(review): res is a (marker, name) tuple while
                # meta_authors holds plain strings, so this membership test
                # looks like it can never be True -- confirm whether res[1]
                # (the captured name) was intended here and below.
                if res in meta_authors:
                    text_hard_authors.append((res, num))
                else:
                    text_soft_authors.append((res, num))
            else:
                res = re.findall(r"\b(by|By|BY)[:;]* ([A-Z][a-zA-Z' ]+)",
                                 tailtext)
                if res:
                    res = res[0]
                    if res in meta_authors:
                        text_hard_authors.append((res, num))
                    else:
                        text_soft_authors.append((res, num))

    # Split attribute-based hits by whether meta data corroborates them.
    # Note: (num, author) pairs are flipped to (author, num) here.
    hardest_authors = []
    not_hardest_authors = []
    for num, ha in hard_authors:
        if ha in meta_authors:
            hardest_authors.append((ha, num))
        else:
            not_hardest_authors.append((ha, num))

    meta_authors = set(meta_authors)

    return hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors, meta_authors
Пример #6
0
def get_dates(tree, titleind=(None, 1), lang='en'):
    """Find the most plausible publication date in an lxml *tree*.

    *titleind* is a (node, index) pair for the detected title node; dates
    closest (by node index) to the title win ties.  *lang* selects
    translation of date text for non-English pages.  Returns the chosen
    date (as produced by get_text_date) or '' when nothing was found.
    """
    # make this faster, its friggin slow (stupid fuzzy matching)
    hard_dates = []
    soft_dates = []
    fuzzy_hard_dates = []
    fuzzy_soft_dates = []
    meta_nodes = tree.xpath('//head/meta')

    # Substrings that suggest a date-carrying attribute ("publish",
    # "publication", "date", "time").
    goods = ['ublish', 'ublicat', 'date', 'time']

    for option in goods:

        for meta in meta_nodes:
            # Skip meta tags whose attribute values never mention the keyword.
            if not any([option in a for a in meta.values()]):
                continue

            # Every attribute of a matching meta tag is parsed for a date.
            for attr in meta.attrib:
                soft_dates.append(
                    get_text_date(date_translation(meta.attrib[attr], lang)))

    for num, node in enumerate(tree.iter()):
        # Attribute values on date-ish attribute names are "soft" candidates.
        candi_dates = [
            v for k, v in node.items() if v and any([x in k for x in goods])
        ]
        for v in candi_dates:
            if within_years(v):
                if lang != 'en':
                    v = date_translation(v, lang)
                d = get_text_date(v)
                if d:
                    soft_dates.append(d)
                else:
                    fuzzy_soft_dates.append(get_text_date(v, fuzzy=True))

        # hard date: a plausible year mentioned in the node's own text.
        tailtext = get_text_and_tail(node).strip()
        if tailtext and within_years(tailtext):
            if lang != 'en':
                tailtext = date_translation(tailtext, lang)
            hard_date = get_text_date(tailtext)
            if hard_date:
                hard_dates.append((num, hard_date))
            else:
                fuzzy_hard_dates.append(
                    (num, get_text_date(tailtext, fuzzy=True)))

    soft_dates = set(soft_dates)
    fuzzy_soft_dates = set(x for x in fuzzy_soft_dates if x)
    # NOTE(review): x here is a (num, date) tuple, which is always truthy,
    # so this filter never drops entries with a None date -- confirm whether
    # `if x[1]` was intended.
    fuzzy_hard_dates = [x for x in fuzzy_hard_dates if x]

    # Note that num and hd get switched here
    hardest_dates = []
    not_hardest_dates = []
    for num, hd in hard_dates:
        if hd in soft_dates:
            hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))

    fuzzy_hardest_dates = []
    for num, hd in fuzzy_hard_dates:
        if hd in fuzzy_soft_dates:
            fuzzy_hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))

    # if nothing, then try simply fuzzy on each node, and otherwise non fuzzy
    non_fuzzy_any = []
    fuzzy_any = []
    if not any(
        [hardest_dates, fuzzy_hardest_dates, not_hardest_dates, soft_dates]):
        # no leads, try to parse everything non fuzzy
        # NOTE(review): get_text_date is called with the node itself here,
        # whereas every other call site passes text -- confirm get_text_date
        # accepts nodes, or whether get_text_and_tail(node) was intended.
        for num, node in enumerate(tree.iter()):
            non_fuzzy_text = get_text_date(node, fuzzy=False)
            if non_fuzzy_text:
                non_fuzzy_any.append((non_fuzzy_text, num))
            else:
                fuzzy_text = get_text_date(node, fuzzy=True)
                if fuzzy_text:
                    fuzzy_any.append((fuzzy_text, num))

    date = ''
    date_node_index = None

    # Pick from the strongest non-empty bucket, preferring the date whose
    # node index is closest to the title's.
    for dt in [
            hardest_dates, fuzzy_hardest_dates, not_hardest_dates,
            non_fuzzy_any, fuzzy_any
    ]:
        if dt:
            date, date_node_index = sorted(
                dt, key=lambda x: abs(x[1] - titleind[1]))[0]
            break

    # Last resort: any meta-derived date (arbitrary set element).
    if not date and soft_dates:
        for sd in soft_dates:
            date = sd
            break

    all_dates = [
        hardest_dates, fuzzy_hardest_dates, not_hardest_dates, non_fuzzy_any,
        fuzzy_any
    ]

    if date_node_index is not None:
        # It goes wrong when some year is mentioned in the title, then it removes title
        # NOTE(review): translated from Dutch -- "is this still recent?"
        # (refers to a removed debug print of the node text being cleared).
        date_node_indices = [[y[1] for y in x if y[0] == date]
                             for x in all_dates]
        date_node_indices = [item for sub in date_node_indices for item in sub]
        # Blank out the short nodes that carried the chosen date so they do
        # not end up in the extracted body text.
        for num, node in enumerate(tree.iter()):
            if num in date_node_indices:
                # maybe i now remove too littlet
                tt = get_text_and_tail(node)
                if tt and len(tt) < 25 or '|' in tt and len(tt) < 50:
                    node.text = ''
                    node.tail = ''
    # Site-specific fallback for fd.nl: Dutch relative dates "Vandaag"
    # (today) / "Gisteren" (yesterday) inside <time> elements.
    if not date:
        now = datetime.datetime.now()
        if tree.xpath('//time[contains(text(), "Vandaag")]'):
            date = now.strftime('%Y-%m-%d')
        elif tree.xpath('//time[contains(text(), "Gisteren")]'):
            yesterday = now - datetime.timedelta(1)
            date = yesterday.strftime('%Y-%m-%d')

    return date
Пример #7
0
def get_author(tree, lang='en'):
    """Collect author candidates from an lxml *tree*, strongest first.

    Returns (hardest_authors, not_hardest_authors, text_hard_authors,
    text_soft_authors, meta_authors): the first four are lists of
    (author, node_index) pairs of decreasing confidence; the last is the
    set of authors parsed from <head><meta> tags.  *lang* selects
    translation of author markers for non-English pages.
    """
    # Couple ways of matching:
    # - Both in meta and in text
    # - Node has one of the goods in it, text has it in it, and the case is authoric
    # - Node has one of the goods in it
    # - Text has one of the goods in it
    goods = ['author', 'by', 'publi', 'write', 'written', 'info']
    hard_authors = []
    meta_authors = []
    text_hard_authors = []
    text_soft_authors = []
    meta_nodes = tree.xpath('//head/meta')

    for option in goods:

        for meta in meta_nodes:
            # Only consider meta tags whose attribute values mention the keyword.
            if not any([option in a for a in meta.values()]):
                continue

            # NOTE(review): translated from Dutch -- "this still goes quite
            # wrong! (I still need to check the attrs against goods)": all
            # attributes of a matching tag are parsed, not only the matching one.
            for attr in meta.attrib:
                author = get_text_author(author_translation(meta.attrib[attr], lang))
                if author:
                    meta_authors.append(author)

    for num, node in enumerate(tree.iter()):
        # hard author: require an author-ish attribute on the node or an ancestor.
        if not any([g in a for a in node.attrib.values() for g in goods]):
            for parent in node.iterancestors():
                attr_values = parent.attrib.values()
                if any([g in a for a in attr_values for g in goods]):
                    break
            else:
                # Neither the node nor any ancestor qualifies.
                continue
        tailtext = get_text_and_tail(node).strip()
        # Bylines are short; skip long body-text nodes.
        if tailtext and len(tailtext) < 200:
            if lang != 'en':
                tailtext = author_translation(tailtext, lang)
            hard_author = get_text_author(tailtext)
            if hard_author:
                hard_authors.append((num, hard_author))

    # Second pass: explicit "Author: Name" / "by Name" patterns in any short text.
    for num, node in enumerate(tree.iter()):
        tailtext = get_text_and_tail(node).strip()
        if tailtext and len(tailtext) < 200:
            res = re.findall(r"(author|Author|AUTHOR)[:;]* ([A-Z][a-zA-Z' ]+)", tailtext)
            if res:
                res = res[0]
                # NOTE(review): res is a (marker, name) tuple but meta_authors
                # holds strings, so this test appears unreachable -- confirm
                # whether res[1] was meant (here and in the branch below).
                if res in meta_authors:
                    text_hard_authors.append((res, num))
                else:
                    text_soft_authors.append((res, num))
            else:
                res = re.findall(r"\b(by|By|BY)[:;]* ([A-Z][a-zA-Z' ]+)", tailtext)
                if res:
                    res = res[0]
                    if res in meta_authors:
                        text_hard_authors.append((res, num))
                    else:
                        text_soft_authors.append((res, num))

    # Partition attribute hits by meta corroboration; pairs flip to (author, num).
    hardest_authors = []
    not_hardest_authors = []
    for num, ha in hard_authors:
        if ha in meta_authors:
            hardest_authors.append((ha, num))
        else:
            not_hardest_authors.append((ha, num))

    meta_authors = set(meta_authors)

    return hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors, meta_authors
Пример #8
0
def get_dates(tree, titleind=(None, 1), lang="en"):
    """Pick the most plausible publication date from an lxml *tree*.

    *titleind* is the (node, index) pair of the title node; candidates
    nearest the title by node index are preferred.  Returns the chosen
    date value, or "" when no candidate is found.
    """
    # make this faster, its friggin slow (stupid fuzzy matching)
    hard_dates = []
    soft_dates = []
    fuzzy_hard_dates = []
    fuzzy_soft_dates = []
    meta_nodes = tree.xpath("//head/meta")

    # Substrings marking date-ish attribute names ("publish", "publication",
    # "date", "time").
    goods = ["ublish", "ublicat", "date", "time"]

    for option in goods:

        for meta in meta_nodes:
            # Only parse meta tags whose attribute values mention the keyword.
            if not any([option in a for a in meta.values()]):
                continue

            for attr in meta.attrib:
                soft_dates.append(get_text_date(date_translation(meta.attrib[attr], lang)))

    for num, node in enumerate(tree.iter()):
        # Attribute values on date-ish attribute names are "soft" candidates.
        candi_dates = [v for k, v in node.items() if v and any([x in k for x in goods])]
        for v in candi_dates:
            if within_years(v):
                if lang != "en":
                    v = date_translation(v, lang)
                d = get_text_date(v)
                if d:
                    soft_dates.append(d)
                else:
                    fuzzy_soft_dates.append(get_text_date(v, fuzzy=True))

        # hard date: a plausible year in the node's own text.
        tailtext = get_text_and_tail(node).strip()
        if tailtext and within_years(tailtext):
            if lang != "en":
                tailtext = date_translation(tailtext, lang)
            hard_date = get_text_date(tailtext)
            if hard_date:
                hard_dates.append((num, hard_date))
            else:
                fuzzy_hard_dates.append((num, get_text_date(tailtext, fuzzy=True)))

    soft_dates = set(soft_dates)
    fuzzy_soft_dates = set(x for x in fuzzy_soft_dates if x)
    # NOTE(review): x is a (num, date) tuple and therefore always truthy;
    # entries whose date is None survive this filter -- confirm whether
    # `if x[1]` was intended.
    fuzzy_hard_dates = [x for x in fuzzy_hard_dates if x]

    # Note that num and hd get switched here
    hardest_dates = []
    not_hardest_dates = []
    for num, hd in hard_dates:
        if hd in soft_dates:
            hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))

    fuzzy_hardest_dates = []
    for num, hd in fuzzy_hard_dates:
        if hd in fuzzy_soft_dates:
            fuzzy_hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))

    # if nothing, then try simply fuzzy on each node, and otherwise non fuzzy
    non_fuzzy_any = []
    fuzzy_any = []
    if not any([hardest_dates, fuzzy_hardest_dates, not_hardest_dates, soft_dates]):
        # no leads, try to parse everything non fuzzy
        # NOTE(review): get_text_date receives the node itself here while
        # all other call sites pass text -- confirm this is supported.
        for num, node in enumerate(tree.iter()):
            non_fuzzy_text = get_text_date(node, fuzzy=False)
            if non_fuzzy_text:
                non_fuzzy_any.append((non_fuzzy_text, num))
            else:
                fuzzy_text = get_text_date(node, fuzzy=True)
                if fuzzy_text:
                    fuzzy_any.append((fuzzy_text, num))

    date = ""
    date_node_index = None

    # Choose from the strongest non-empty bucket, nearest the title first.
    for dt in [hardest_dates, fuzzy_hardest_dates, not_hardest_dates, non_fuzzy_any, fuzzy_any]:
        if dt:
            date, date_node_index = sorted(dt, key=lambda x: abs(x[1] - titleind[1]))[0]
            break

    # Last resort: any meta-derived date (arbitrary set element).
    if not date and soft_dates:
        for sd in soft_dates:
            date = sd
            break

    all_dates = [hardest_dates, fuzzy_hardest_dates, not_hardest_dates, non_fuzzy_any, fuzzy_any]

    if date_node_index is not None:
        # It goes wrong when some year is mentioned in the title, then it removes title
        # NOTE(review): translated from Dutch -- "is this still recent?"
        # (referred to a removed debug print of the cleared node text).
        date_node_indices = [[y[1] for y in x if y[0] == date] for x in all_dates]
        date_node_indices = [item for sub in date_node_indices for item in sub]
        # Blank out short nodes carrying the chosen date so they don't
        # leak into the extracted body text.
        for num, node in enumerate(tree.iter()):
            if num in date_node_indices:
                # maybe i now remove too little
                if node.text and len(node.text) < 25:
                    node.text = ""
                    node.tail = ""
    return date
Пример #9
0
    def process(self, url, tree, remove_visuals, exclude_data):
        """Extract structured article data from a parsed page.

        Runs the full extraction pipeline on *tree* (an lxml tree for
        *url*): template removal, author/title/date detection, image and
        link collection, and body-text extraction.  *remove_visuals*
        strips visual clutter attributes in place; *exclude_data* lists
        keys to drop from the returned dict.

        Returns a dict with keys such as 'title', 'body', 'images',
        'publish_date', 'author', 'cleaned', 'language', 'url', 'domain',
        'money', 'summary' and 'related', minus any in *exclude_data*.
        """
        self.remove_bad_xpaths_from_tree(tree)
        if self.detected_language is None:
            self.detected_language = get_language(
                tree, self.url_to_headers_mapping[url], self.domain)
        # print('language: {}'.format(self.detected_language))
        # pre_text_content = normalize('\n'.join([get_text_and_tail(x) for x in tree.iter()]))

        # author has to be attempted before duplicate removal, since an author is
        # likely to occur more often
        self.domain_nodes_dict.remove_template(tree)
        hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors, meta_authors = get_author(
            tree, self.detected_language)
        self.domain_nodes_dict.remove_author(tree)
        title = getRuleTitle(tree)
        # filter duplicate images by src
        ok_imgs = get_images(tree)
        titleind = ()
        imginds = []
        contentinds = []

        # such as title, date and later author

        # Same-domain anchors that actually have visible text.
        link_eles = [link[0] for link in tree.iterlinks()
                     if link[0].tag == 'a' and link[2] and
                     link[2].startswith(self.domain) and
                     get_text_and_tail(link[0]).strip()]

        # Single pass: record node indices for images, title, links and
        # content, and optionally strip visual clutter in place.
        linkinds = []
        for num, node in enumerate(tree.iter()):
            if node in ok_imgs:
                imginds.append((node, num))
            elif normalize(get_text_and_tail(node)) == title:
                titleind = (node, num)
            elif get_text_and_tail(node).strip():
                if node in link_eles:
                    linkinds.append((node, num))
                contentinds.append((node, num))
            # Clean up visual clutter when rendering is not needed.
            if remove_visuals:
                if node.tag == 'input':
                    node.set('type', 'hidden')
                elif node.tag == 'a' and not get_text_and_tail(node).strip():
                    for att in node.attrib:
                        node.set(att, '')
                if node.tag == 'img':
                    node.set('alt', '')
                if node.attrib and 'background-image' in node.attrib:
                    node.set('background-image', '')
        if not titleind:
            # fuzzy token text / title matching
            title_set = set(title.split())
            for num, node in enumerate(tree.iter()):
                text_content = get_text_and_tail(node)
                if text_content and len(text_content) < 500:
                    text_set = set(text_content.split())
                    if fscore(title_set, text_set) > 0.5:
                        titleind = (node, num)
                        break

        # Rank images by proximity (node index) to the title.
        if titleind:
            sortedimgs = sorted(imginds, key=lambda x: abs(x[1] - titleind[1]))
        else:
            sortedimgs = []

        # Resolve each image node to a URL (src, content, or inline CSS),
        # de-duplicating while preserving proximity order.
        images = []
        for x in sortedimgs:
            val = None
            if 'src' in x[0].attrib:
                val = x[0].attrib['src']
            elif 'content' in x[0].attrib:
                val = x[0].attrib['content']
            elif 'style' in x[0].attrib:
                tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)', x[0].attrib['style'])
                if tmp:
                    val = tmp[0]
            if val is not None and val not in images:
                images.append(val)

        author = ''
        author_node_index = None
        date = "1970-01-01"
        if titleind:
            date = get_dates(tree, titleind, self.detected_language)
            # excluding soft dates (meta, they wont work anyway)

            # Take the strongest non-empty author bucket; within it, pick
            # the candidate closest to the title node.
            for at in [hardest_authors, not_hardest_authors, text_hard_authors, text_soft_authors]:
                if at:
                    author, author_node_index = sorted(
                        at, key=lambda x: abs(x[1] - titleind[1]))[0]
                    break

        # Fall back to any meta-derived author (arbitrary element).
        if not author and meta_authors:
            for ma in meta_authors:
                author = ma
                break

        if author_node_index is not None:
            # Walk to the author node and blank it so the byline does not
            # end up in the body text.
            for num, node in enumerate(tree.iter()):
                if num == author_node_index:
                    break
            # It goes wrong when some year is mentioned in the title, then it removes title
            # print('removing author content', node.text)
            node.text = ''
            node.tail = ''

        cleaned_html = lxml.html.tostring(tree).decode('utf8')

        body_content = self.get_content(cleaned_html)

        if not body_content:
            # Fallback body extraction: keep text between the title node and
            # the last non-anchor text node, dropping title-like fragments.
            body_content = []
            title_len = len(title)
            title_tokens = set(title.split())
            len_title_tokens = len(title_tokens)
            last_text_node_num = get_last_text_non_a_node(tree)
            for num, x in enumerate(tree.iter()):
                txt = normalize(get_text_and_tail(x))
                if txt:
                    # NOTE(review): if no title was matched, titleind is still
                    # the empty tuple and titleind[1] raises IndexError here --
                    # confirm this path is only reached with a found title.
                    if num < titleind[1]:
                        # print('removed pre-title', txt)
                        x.text = ''
                        x.tail = ''
                        continue
                    if last_text_node_num > 0 and num > last_text_node_num:
                        # print('removed post-content', txt)
                        x.text = ''
                        continue
                    n = len(txt)
                    # remove title-like fragments (heavy token overlap).
                    txt_tokens = set(txt.split())
                    n_matching = len(txt_tokens & title_tokens)
                    if (n < title_len * 3 and n_matching / len(txt_tokens) > 0.3 and
                            n_matching / len_title_tokens > 0.3):
                        # print('removed!', txt)
                        continue
                    body_content.append(txt)

        # Same-domain links worth keeping, for the "related" section.
        links = [x.attrib['href'] for x in tree.xpath('//a')
                 if 'href' in x.attrib and
                 x.attrib['href'].startswith(self.domain) and
                 self.should_save(x.attrib['href'])]

        money_amounts = money.find('\n'.join(body_content), 1000) + money.find(title, 1000)

        data = {'title': title,
                'body': body_content,
                'images': images,
                'publish_date': str(date),
                'author': author,
                'cleaned': cleaned_html,
                'language': self.detected_language,
                'url': url,
                'domain': self.domain,
                'money': money_amounts,
                'summary': '',
                'related': get_sorted_links(links, url)[:5]}

        # Per-domain config may override specific fields via XPath.
        if 'overwrite_values_by_xpath' in self.config:
            for k, v in self.config['overwrite_values_by_xpath'].items():
                new = tree.xpath(v)
                data[k] = new[0] if isinstance(new, list) else new

        filtered_data = {k: v for k, v in data.items() if k not in exclude_data}

        return filtered_data