Python make_tree_and_preprocess 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: ebdata.textmining.treeutils

메소드/함수: make_tree_and_preprocess

hotexamples.com에서의 예제들: 8

Python make_tree_and_preprocess - 8개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 ebdata.textmining.treeutils.make_tree_and_preprocess의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: clean.py 프로젝트: DotNetWebs/openblock

def clean_page(html, other_page):
    """
    String-in / string-out front end for the template-stripping step.

    Both ``html`` and ``other_page`` are turned into trees with
    make_tree_and_preprocess(), the pair is handed to strip_template(), and
    each tree is then serialized with ``etree.tostring(..., method='html')``.

    Returns a 2-tuple ``(serialized html tree, serialized other_page tree)``.
    """
    page_tree = make_tree_and_preprocess(html)
    sibling_tree = make_tree_and_preprocess(other_page)

    strip_template(page_tree, sibling_tree)

    # Additional cleaning passes that are currently switched off:
    # drop_useless_tags(page_tree)
    # remove_empty_tags(page_tree, ('div', 'span', 'td', 'tr', 'table'))

    return tuple(
        etree.tostring(tree, method='html')
        for tree in (page_tree, sibling_tree)
    )

예제 #2

파일 보기

def clean_page(html, other_page):
    """
    String-in / string-out front end for the template-stripping step.

    Both ``html`` and ``other_page`` are turned into trees with
    make_tree_and_preprocess(), the pair is handed to strip_template(), and
    each tree is then serialized with ``etree.tostring(..., method='html')``.

    Returns a 2-tuple ``(serialized html tree, serialized other_page tree)``.
    """
    page_tree = make_tree_and_preprocess(html)
    sibling_tree = make_tree_and_preprocess(other_page)

    strip_template(page_tree, sibling_tree)

    # Additional cleaning passes that are currently switched off:
    # drop_useless_tags(page_tree)
    # remove_empty_tags(page_tree, ('div', 'span', 'td', 'tr', 'table'))

    return tuple(
        etree.tostring(tree, method='html')
        for tree in (page_tree, sibling_tree)
    )

예제 #3

파일 보기

파일: webmining.py 프로젝트: DotNetWebs/openblock

def mine_page(html, other_pages):
    """
    Collect the displayable "holes" that extract() reports for a page.

    Every hole yielded by ``extract(html, other_pages)`` is indexed by its
    'type' and 'value' keys.  Holes of type 'attrib', and holes whose value
    is empty or whitespace-only, are ignored.  A 'multitag' value is parsed
    with make_tree_and_preprocess(), has a fixed set of tags, subtrees and
    attributes dropped via preprocess(), and is serialized back to HTML
    without its <body> wrapper; any other value is used as-is.  Values that
    contain no ASCII letter or digit are skipped.  Finally, newlines, tabs,
    '&nbsp;' and '&#160;' are replaced by plain spaces.

    Returns a list of the resulting strings, in the order extract() yielded
    the holes.
    """
    # Markup dropped from multitag values because it can muck up the display.
    unwanted_tags = ('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'map', 'small', 'sub', 'sup', 'topic')
    unwanted_trees = ('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea')
    unwanted_attrs = ('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target')

    snippets = []
    for hole in extract(html, other_pages):
        kind = hole['type']

        # Differences in attribute values aren't relevant.
        if kind == 'attrib':
            continue

        raw = hole['value']
        if not raw or not raw.strip():
            continue

        # # Differences in links are likely navigation, and can be ignored.
        # if kind == 'text' and hole['tag'] == 'a':
        #     continue

        if kind != 'multitag':
            # Skip bits that don't have at least one letter or number.
            # Note: If this code is ever internationalized, this check will
            # have to be removed.
            if not re.search('[A-Za-z0-9]', raw):
                continue
            text = raw
        else:
            # It's a multitag value, so clean its HTML a bit.
            tree = make_tree_and_preprocess(raw)
            tree = preprocess(
                tree,
                drop_tags=unwanted_tags,
                drop_trees=unwanted_trees,
                drop_attrs=unwanted_attrs,
            )
            remove_empty_tags(tree, ('br',))
            tree = brs_to_paragraphs(tree)

            try:
                body = tree.body
            except IndexError:
                # lxml raises an IndexError if there's no <body>.
                continue

            # Same letter-or-number requirement as for plain values, applied
            # to the text content of the body (see the i18n note above).
            if not re.search('[A-Za-z0-9]', body.text_content()):
                continue

            # The [6:-7] cuts off the '<body>' and '</body>'.
            text = etree.tostring(body, method='html')[6:-7]

        # Clean up newlines, tabs and &nbsp;.
        text = re.sub('[\n\t]', ' ', text.strip())
        text = text.replace('&nbsp;', ' ').replace('&#160;', ' ')

        snippets.append(text)
    return snippets

예제 #4

파일 보기

파일: webmining.py 프로젝트: vijayaraju/everyblock-1

def mine_page(html, other_pages):
    """
    Collect the displayable "holes" that extract() reports for a page.

    Every hole yielded by ``extract(html, other_pages)`` is indexed by its
    'type' and 'value' keys.  Holes of type 'attrib', and holes whose value
    is empty or whitespace-only, are ignored.  A 'multitag' value is parsed
    with make_tree_and_preprocess(), has a fixed set of tags, subtrees and
    attributes dropped via preprocess(), and is serialized back to HTML
    without its <body> wrapper; any other value is used as-is.  Values that
    contain no ASCII letter or digit are skipped.  Finally, newlines, tabs,
    '&nbsp;' and '&#160;' are replaced by plain spaces.

    Returns a list of the resulting strings, in the order extract() yielded
    the holes.
    """
    # Markup dropped from multitag values because it can muck up the display.
    unwanted_tags = ('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'map', 'small', 'sub', 'sup', 'topic')
    unwanted_trees = ('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea')
    unwanted_attrs = ('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target')

    snippets = []
    for hole in extract(html, other_pages):
        kind = hole['type']

        # Differences in attribute values aren't relevant.
        if kind == 'attrib':
            continue

        raw = hole['value']
        if not raw or not raw.strip():
            continue

        # # Differences in links are likely navigation, and can be ignored.
        # if kind == 'text' and hole['tag'] == 'a':
        #     continue

        if kind != 'multitag':
            # Skip bits that don't have at least one letter or number.
            # Note: If this code is ever internationalized, this check will
            # have to be removed.
            if not re.search('[A-Za-z0-9]', raw):
                continue
            text = raw
        else:
            # It's a multitag value, so clean its HTML a bit.
            tree = make_tree_and_preprocess(raw)
            tree = preprocess(
                tree,
                drop_tags=unwanted_tags,
                drop_trees=unwanted_trees,
                drop_attrs=unwanted_attrs,
            )
            remove_empty_tags(tree, ('br',))
            tree = brs_to_paragraphs(tree)

            try:
                body = tree.body
            except IndexError:
                # lxml raises an IndexError if there's no <body>.
                continue

            # Same letter-or-number requirement as for plain values, applied
            # to the text content of the body (see the i18n note above).
            if not re.search('[A-Za-z0-9]', body.text_content()):
                continue

            # The [6:-7] cuts off the '<body>' and '</body>'.
            text = etree.tostring(body, method='html')[6:-7]

        # Clean up newlines, tabs and &nbsp;.
        text = re.sub('[\n\t]', ' ', text.strip())
        text = text.replace('&nbsp;', ' ').replace('&#160;', ' ')

        snippets.append(text)
    return snippets

예제 #5

파일 보기

파일: sst.py 프로젝트: peudadayusuf/openblock

 def extract(self, html):
     """
     Extract data from ``html`` using the template learned so far.

     ``html`` is parsed with make_tree_and_preprocess() and compared against
     ``self.htmltree`` by tree_extract(), using ``self.algorithm``.

     Returns whatever tree_extract() returns.

     Raises ValueError if nothing has been learned yet, i.e.
     ``self.htmltree`` is None.
     """
     # Check the precondition before parsing: an unlearned template should
     # fail immediately, without paying for a parse and without a parse
     # error hiding the real problem.
     if self.htmltree is None:
         raise ValueError('This template has not learned anything yet.')
     tree = make_tree_and_preprocess(html)
     return tree_extract(self.htmltree, tree, self.algorithm)

예제 #6

파일 보기

파일: sst.py 프로젝트: peudadayusuf/openblock

 def learn(self, html):
     """
     Fold another sample page into the learned template.

     ``html`` is parsed with make_tree_and_preprocess().  If no template is
     stored yet (``self.htmltree`` is None) the parsed tree becomes the
     template; otherwise the template is replaced by the result of
     ``tree_diff(self.htmltree, tree, self.algorithm)``.
     """
     candidate = make_tree_and_preprocess(html)
     current = self.htmltree
     if current is not None:
         candidate = tree_diff(current, candidate, self.algorithm)
     self.htmltree = candidate

예제 #7

파일 보기

파일: sst.py 프로젝트: UniversityDailyKansan/openblock

 def extract(self, html):
     """
     Extract data from ``html`` using the template learned so far.

     ``html`` is parsed with make_tree_and_preprocess() and compared against
     ``self.htmltree`` by tree_extract(), using ``self.algorithm``.

     Returns whatever tree_extract() returns.

     Raises ValueError if nothing has been learned yet, i.e.
     ``self.htmltree`` is None.
     """
     # Check the precondition before parsing: an unlearned template should
     # fail immediately, without paying for a parse and without a parse
     # error hiding the real problem.
     if self.htmltree is None:
         raise ValueError('This template has not learned anything yet.')
     tree = make_tree_and_preprocess(html)
     return tree_extract(self.htmltree, tree, self.algorithm)

예제 #8

파일 보기

파일: sst.py 프로젝트: UniversityDailyKansan/openblock

 def learn(self, html):
     """
     Fold another sample page into the learned template.

     ``html`` is parsed with make_tree_and_preprocess().  If no template is
     stored yet (``self.htmltree`` is None) the parsed tree becomes the
     template; otherwise the template is replaced by the result of
     ``tree_diff(self.htmltree, tree, self.algorithm)``.
     """
     candidate = make_tree_and_preprocess(html)
     current = self.htmltree
     if current is not None:
         candidate = tree_diff(current, candidate, self.algorithm)
     self.htmltree = candidate