def _parse(self, enclosing_session=None, verbose=False):
    """ Parse the article content to yield parse trees and an annotated token list """
    with SessionContext(enclosing_session) as session:
        # Convert the content soup to a token iterable (generator)
        toklist = Fetcher.tokenize_html(self._url, self._html, session)
        bp = self.get_parser()
        ip = IncrementalParser(bp, toklist, verbose=verbose)
        # List of paragraphs, each containing a list of sentence token
        # lists in string dump format
        # (1-based paragraph and sentence indices)
        pgs = []
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        # Word stem dictionary, indexed by (stem, cat)
        words = defaultdict(int)
        num_sent = 0
        for p in ip.paragraphs():
            pgs.append([])
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                # We don't attempt to parse very long sentences
                # (>100 tokens) since they are memory intensive
                # (>16 GB) and may take minutes to process
                if num_tokens <= MAX_SENTENCE_TOKENS and sent.parse():
                    # Obtain a dict representation of the tokens,
                    # annotated with the parse
                    token_dicts = TreeUtility.dump_tokens(
                        sent.tokens, sent.tree, words)
                    # Create a verbose text representation of
                    # the highest scoring parse tree
                    tree = ParseForestDumper.dump_forest(
                        sent.tree, token_dicts=token_dicts)
                    # Add information about the sentence tree's score
                    # and the number of tokens
                    trees[num_sent] = "\n".join(
                        ["C{0}".format(sent.score),
                         "L{0}".format(num_tokens), tree])
                else:
                    # Error, sentence too long or no parse:
                    # add an error index entry for this sentence
                    if num_tokens > MAX_SENTENCE_TOKENS:
                        # Set the error index at the first token
                        # outside the maximum limit
                        eix = MAX_SENTENCE_TOKENS
                    else:
                        eix = sent.err_index
                    token_dicts = TreeUtility.dump_tokens(
                        sent.tokens, None, None, eix)
                    trees[num_sent] = "E{0}".format(eix)
                pgs[-1].append(token_dicts)
        self._parsed = datetime.utcnow()
        self._parser_version = bp.version
        self._num_tokens = ip.num_tokens
        self._num_sentences = ip.num_sentences
        self._num_parsed = ip.num_parsed
        self._ambiguity = ip.ambiguity
        # Make one big JSON string for the paragraphs, sentences and tokens
        self._raw_tokens = pgs
        self._tokens = json.dumps(
            pgs, separators=(",", ":"), ensure_ascii=False)
        # Keep the bag of words (stem, category, count for each word)
        self._words = words
        # Create a tree representation string out of all
        # the accumulated parse trees
        self._tree = "".join(
            "S{0}\n{1}\n".format(key, val)
            for key, val in trees.items())
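
# --- Illustration (not part of the original module) ----------------------
# A minimal sketch of how the tree dump string assembled into self._tree
# above can be split back into per-sentence records. It relies only on the
# format produced by _parse(): an "S{n}" header line per sentence, followed
# either by "C{score}" / "L{num_tokens}" lines and the dumped parse forest,
# or by a single "E{index}" error line. The function name and the returned
# dict are hypothetical, not part of the module's API, and the sketch
# assumes that forest dump lines never match the bare S<number> header
# pattern themselves.

def read_tree_dump(tree_dump):
    """ Split a _parse() tree dump into {sentence_index: dump_lines} """
    sentences = {}
    index, lines = None, []
    for line in tree_dump.splitlines():
        if line.startswith("S") and line[1:].isdigit():
            # New sentence header: flush the previous record
            if index is not None:
                sentences[index] = "\n".join(lines)
            index, lines = int(line[1:]), []
        elif index is not None:
            lines.append(line)
    if index is not None:
        sentences[index] = "\n".join(lines)
    return sentences

# Example: one parsed sentence (score 42, 7 tokens) and one failed
# sentence whose error index was 3:
# read_tree_dump("S1\nC42\nL7\n<forest>\nS2\nE3\n")
#   -> {1: "C42\nL7\n<forest>", 2: "E3"}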