Exemplos de get_stoplists em Python, exemplos de justext.get_stoplists em Python

Exemplo n.º 1

0

Exibir arquivo

def get_all_stop_words():
  """Return a single set with the stop words of every jusText language.

  Intended for the language-independent use of jusText: the stoplist of
  each language reported by ``justext.get_stoplists()`` is merged into
  one set.
  """
  return {
      word
      for lang in justext.get_stoplists()
      for word in justext.get_stoplist(lang)
  }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: true_lg.py Projeto: AntoineOrgerit/Web-Scrapping

def jt_truelg_treatement(input_file, output_file, file_name):
    """
    Run jusText on ``input_file`` and write the kept paragraphs to ``output_file``.

    The language is not detected from the content: it is looked up in the
    ``../../resources/doc_lg.json`` mapping using the base name of
    ``file_name``. If that language has no jusText stoplist, the "English"
    stoplist is used instead.

    An input consisting of exactly one space is treated as an empty document
    and a single space is written to the output.

    :param input_file: readable file object holding the HTML to process
    :param output_file: writable file object receiving one ``<p>...</p>``
        line per paragraph returned by jusText
    :param file_name: path or name of the document; only its base name is
        used as the key into the language mapping
    :raises KeyError: if the base name of ``file_name`` is not in the mapping
    """
    # Read once and reuse the content instead of read / seek(0) / read.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    # The context manager closes the mapping file deterministically; the
    # bare open() passed to json.load() left the handle to the garbage
    # collector.
    with open("../../resources/doc_lg.json") as mapping_file:
        languages = json.load(mapping_file)

    language = languages[os.path.basename(file_name)]
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        # Newlines inside a paragraph are flattened so that every paragraph
        # occupies exactly one output line.
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")

Exemplo n.º 3

0

Exibir arquivo

def jt_langid_treatement(input_file, output_file):
    """
    Run jusText on ``input_file`` and write the kept paragraphs to ``output_file``.

    The language is detected from the content with ``langid.classify``; the
    first element of its result is resolved to a language name through the
    module-level ``languages`` lookup (presumably ``pycountry.languages`` --
    confirm against the file's imports). Any name containing "Greek" is
    collapsed to "Greek", and a name without a jusText stoplist falls back
    to "English".

    An input consisting of exactly one space is treated as an empty document
    and a single space is written to the output.

    :param input_file: readable file object holding the HTML to process
    :param output_file: writable file object receiving one ``<p>...</p>``
        line per paragraph returned by jusText
    """
    # Read once and reuse the content; the original re-read the whole file
    # three times with seek(0) calls in between.
    content = input_file.read()
    if content == " ":
        output_file.write(" ")
        return

    detected_code = langid.classify(content)[0]
    language = languages.get(alpha2=detected_code).name
    if "Greek" in language:
        language = "Greek"
    if language not in justext.get_stoplists():
        language = "English"

    paragraphs = justext.justext(content, justext.get_stoplist(language))

    for paragraph in paragraphs:
        # Newlines inside a paragraph are flattened so that every paragraph
        # occupies exactly one output line.
        output_file.write("<p>" + paragraph.text.replace("\n", " ") +
                          "</p>\n")

Exemplo n.º 4

0

Exibir arquivo

# MUFFLE_FLAG records whether contextlib.redirect_stderr could be imported,
# so later code can check the flag before using it.
try:
    from contextlib import redirect_stderr
    MUFFLE_FLAG = True
except ImportError:
    MUFFLE_FLAG = False

# third-party
from lxml import etree, html
from readability import Document
from readability.readability import Unparseable

# jusText is an optional dependency. When it is importable, JT_STOPLIST is
# the union of the stoplists of every language it ships; when it is not,
# both `justext` and `JT_STOPLIST` are set to None.
try:
    import justext
    JT_STOPLIST = set()
    for language in justext.get_stoplists():
        JT_STOPLIST.update(justext.get_stoplist(language))
except ImportError:
    justext = JT_STOPLIST = None

# own modules (package-relative imports)
from .htmlprocessing import convert_tags, prune_html
from .settings import JUSTEXT_LANGUAGES, MANUALLY_STRIPPED
from .utils import trim, HTML_PARSER
from .xml import TEI_VALID_TAGS

# Module-level logger named after this module.
# NOTE(review): `import logging` is not visible in this excerpt -- confirm it
# exists above.
LOGGER = logging.getLogger(__name__)

# XPath union expression matching each listed element anywhere in the tree.
# Presumably these are the elements removed during sanitization -- confirm at
# the use site.
SANITIZED_XPATH = '//aside|//audio|//button|//fieldset|//figure|//footer|//iframe|//img|//image|//input|//label|//link|//nav|//noindex|//noscript|//object|//option|//select|//source|//svg|//time'

Exemplo n.º 5

0

Exibir arquivo

Arquivo: bot.py Projeto: mill7r/AnchorBot

def guess_language(html):
    """Guess the language of ``html`` by counting stop-word matches.

    The text form of ``html`` is split into whitespace-separated tokens and,
    for every language jusText provides a stoplist for, the number of
    distinct tokens found in that stoplist is counted.

    :param html: the document; it is converted with ``str()`` before
        tokenizing
    :return: the name of the language whose stoplist matched the most
        distinct tokens
    :raises ValueError: if jusText reports no stoplists at all
    """
    # split() without an argument breaks on any run of whitespace. The
    # previous split(" ") left words next to newlines or tabs glued
    # together, so they could never match a stop word.
    tokens = set(str(html).split())
    hits = {
        lang: len(tokens.intersection(justext.get_stoplist(lang)))
        for lang in justext.get_stoplists()
    }
    return max(hits, key=hits.get)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: bot.py Projeto: MuslimConditions/AnchorBot

def guess_language(html):
    """Guess the language of ``html`` by counting stop-word matches.

    The text form of ``html`` is split into whitespace-separated tokens and,
    for every language jusText provides a stoplist for, the number of
    distinct tokens found in that stoplist is counted.

    :param html: the document; it is converted with ``str()`` before
        tokenizing
    :return: the name of the language whose stoplist matched the most
        distinct tokens
    :raises ValueError: if jusText reports no stoplists at all
    """
    # split() without an argument breaks on any run of whitespace. The
    # previous split(" ") left words next to newlines or tabs glued
    # together, so they could never match a stop word.
    tokens = set(str(html).split())
    hits = {
        lang: len(tokens.intersection(justext.get_stoplist(lang)))
        for lang in justext.get_stoplists()
    }
    return max(hits, key=hits.get)