Example #1
def keywords(text,
             ratio=0.2,
             words=None,
             language="english",
             split=False,
             scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
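
A minimal usage sketch, assuming keywords() is called from the module where these private helpers live (e.g. a gensim-style summarization module) and that _format_results, which is not shown above, returns (keyword, score) pairs when scores=True; the sample text is illustrative:

# Usage sketch (assumption: run where keywords() and its private
# helpers are defined, e.g. a gensim-style summarization module).
sample = ("Graph-based ranking models such as PageRank can be applied "
          "to natural language text to extract its most central words.")

# Default: a newline-separated string of keywords.
print(keywords(sample))

# Assumption: with scores=True, _format_results yields (keyword, score) pairs.
for word, score in keywords(sample, scores=True):
    print("%s\t%.3f" % (word, score))
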
Example #2
def get_graph(text, language="english", deaccent=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language, deacc=deaccent)
    split_text = list(_tokenize_by_word(text, deacc=deaccent))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
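
Only graph.nodes() is exercised by the code in these examples, so a usage sketch is limited to that method; the deaccent flag is what distinguishes this version, and the accented sample text below is illustrative:

# Usage sketch for the deaccent variant; sample text is illustrative.
sample = "Visitors to the café praised the café's crêpes."

# With deaccent=True, accented tokens are normalised during cleaning,
# so "café" and "cafe" should collapse onto a single node.
graph = get_graph(sample, deaccent=True)
print(len(graph.nodes()), "nodes:", graph.nodes())
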
Example #3
def get_graph(text, language="english"):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
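
Example #3 is the same builder without the deaccent option. Since keywords() in Example #1 performs exactly these steps before ranking, get_graph() is effectively the first half of that pipeline; a hedged sketch of finishing it with the private helpers named above, assuming they are all importable from one module:

# Hedged sketch: complete the pipeline from Example #1 by hand.
# Every call below appears verbatim in the snippets above.
graph = get_graph("Systems process text. Text systems rank words.")

_remove_unreachable_nodes(graph)
if len(graph.nodes()) > 0:
    # dict of lemma -> score, per the comment in Example #1
    pagerank_scores = _pagerank(graph)
    print(sorted(pagerank_scores.items(), key=lambda kv: -kv[1]))
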
Example #4
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
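
The words= parameter is forwarded to _extract_tokens alongside ratio; assuming, as in gensim's implementation, that it caps the number of extracted lemmas when given (overriding ratio), a short sketch:

# Hedged sketch: assumption is that words=, when not None, overrides
# ratio and caps how many lemmas _extract_tokens keeps.
sample = "Deep learning models learn layered representations of text."
print(keywords(sample, words=2, split=True))  # e.g. a 2-item list
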
Example #5
def _get_labels(text, language, by_sentence):
    # Maps each processed unit's token to its original surface text.
    syntactic_units = _clean_text_by_sentences(text, language) if by_sentence \
        else _clean_text_by_word(text, language).values()
    return {unit.token: unit.text for unit in syntactic_units}
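
_get_labels maps each processed token back to its surface text, which pairs naturally with get_graph() for labelling nodes. A hedged sketch, assuming graph nodes are the same tokens the cleaning step produces:

# Hedged sketch: label co-occurrence graph nodes with surface text.
sample = "Ranking algorithms rank words by importance."
labels = _get_labels(sample, "english", by_sentence=False)

graph = get_graph(sample)
for node in graph.nodes():
    # Assumption: nodes are tokens; fall back to the token itself.
    print(node, "->", labels.get(node, node))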