Example #1
0
def visible_token_viewer():
    """Flask endpoint: return visible-token triples whose preceding
    invisible-token buffer ends with the POSTed ``test_string``.

    Expects a JSON body ``{"test_string": ...}``; the string is
    whitespace-normalized before matching. Pages are loaded from the
    ``visible_tokens_test`` directory under the app's static folder.
    """
    if request.method == 'POST':
        payload = request.get_json(force=True)
        # Collapse every run of whitespace down to a single space.
        query = ' '.join(payload['test_string'].split())

        manager = PageManager()
        pages_dir = os.path.join(app.static_folder, 'visible_tokens_test')
        for entry in os.listdir(pages_dir):
            full_path = os.path.join(pages_dir, entry)
            # Skip directories and hidden files (e.g. .DS_Store).
            if entry.startswith('.') or not os.path.isfile(full_path):
                continue
            with codecs.open(full_path, "r", "utf-8") as handle:
                manager.addPage(entry, handle.read().encode('utf-8'))

        matches = [t for t in manager.getVisibleTokenStructure()
                   if t['invisible_token_buffer_before'].endswith(query)]
        return jsonify(triples=matches)
    def lists_on_single_page(self, content):
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        (ptree, paths_to_vis_text, path_to_invis_toks) = self.prefix_tree(triples, only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {
                'rows': []
            }

            # if as_path in paths_to_vis_text:
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (vis_texts[idx], str(invis_toks[idx].token_location))
                        html_between_row = ''
                        if (idx+1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx+1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text': vis_texts[idx],
                            'starting_token_location': invis_toks[idx].token_location,
                            'html_between_row': html_between_row
                        })
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if hte path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            json.dumps(markup)
        return markup
Example #3
0
    def lists_on_single_page(self, content):
        """Detect repeated <div> path structures ("lists") in one HTML page.

        The page *content* is registered under the fixed name "zzz", a
        prefix tree is built over its visible-token structure, and every
        candidate repeated path is expanded into rows of (visible text,
        starting token location, HTML between consecutive rows).

        :param content: raw HTML string of the page to analyze
        :return: markup built by self.creat_row_markup() for the found lists
        """
        pg = PageManager()
        pg.addPage("zzz", content)

        triples = pg.getVisibleTokenStructure()
        # Only <div> tags are considered when building list candidates.
        (ptree, paths_to_vis_text,
         path_to_invis_toks) = self.prefix_tree(triples,
                                                only_consider_tag='div')

        potential_lists = self.prefix_tree_to_paths(ptree)

        if self.__DEBUG:
            print '.... POTENTIAL LISTS ARE ....'
            print '\n'.join([''.join(p) for p in potential_lists])
            print '.... OK!....'

        all_tokens_list = pg.getPage("zzz").tokens

        # Now, let's get our lists: map each candidate path to its rows.
        lists = {}

        for i in range(len(potential_lists)):
            pot_list = potential_lists[i]

            as_path = ''.join(pot_list)
            if self.__DEBUG:
                print "PATH: %s" % as_path
            lists[as_path] = {'rows': []}

            # if as_path in paths_to_vis_text:
            # Substring match: the candidate path may be embedded in a
            # deeper path that actually carries the visible text.
            for path_to_vis in paths_to_vis_text:
                if path_to_vis.find(as_path) > -1:
                    vis_texts = [a for a in paths_to_vis_text[path_to_vis]]
                    invis_toks = [t for t in path_to_invis_toks[path_to_vis]]

                    for idx in range(len(vis_texts)):
                        if self.__DEBUG:
                            print "%s ==> %s" % (
                                vis_texts[idx],
                                str(invis_toks[idx].token_location))
                        # HTML separating this row from the next one
                        # (empty for the final row).
                        html_between_row = ''
                        if (idx + 1) < len(vis_texts):
                            begin = invis_toks[idx].token_location
                            end = invis_toks[idx + 1].token_location - 1
                            html_between_row = all_tokens_list.getTokensAsString(
                                begin, end, whitespace=True)
                        lists[as_path]['rows'].append({
                            'visible_text':
                            vis_texts[idx],
                            'starting_token_location':
                            invis_toks[idx].token_location,
                            'html_between_row':
                            html_between_row
                        })
            # NOTE(review): this serialization runs on every loop iteration
            # even when __DEBUG is off, though the result is only printed
            # below -- looks like debug-only work; consider guarding it.
            as_json_str = json.dumps(lists)

            if self.__DEBUG:
                print "--------"
                print as_json_str
                print "--------"

            # # do it as an extraction instead?
            # item_rule_begin = Landmark.escape_regex_string('<html')
            # item_rule_end = Landmark.escape_regex_string('/html>')
            #
            # begin_iter_rule = '.+?'.join([Landmark.escape_regex_string(a) for a in pot_list])
            #
            # # figure out: for each tag in the rule, add it's end tag (keep track of tag type)
            # #  NOTE: for now, this assumes that the HTML is well formed
            # end_it = '.+?'.join(['</div>' for i in range(len(pot_list))])
            #
            # end_iter_rule = end_it
            #
            # #  include end-regex: included in the stuff that's extracted.
            # #  Solve for the case where you only see part of the stuff
            # rule = IterationRule(str(i) + "_pathListRule", item_rule_begin, item_rule_end,
            #                      begin_iter_rule, end_iter_rule, removehtml=True)
            # extraction = rule.apply(content)
            #
            # print "**PATH: "+''.join(pot_list)
            # as_json_str = json.dumps(extraction)
            #
            # for seq in extraction['sequence']:
            #     print "\t"+seq['extract']

        # TODO: do this here????
        # TODO: big drop down the path should be considered... not just if the path occurs twice
        # TODO: fix bugs
        markup = self.creat_row_markup(lists, all_tokens_list, pg)
        if self.__DEBUG:
            print "list markup"
            # NOTE(review): the dumped JSON is discarded here -- a debug
            # print of json.dumps(markup) was presumably intended.
            json.dumps(markup)
        return markup