Python cleanse_text示例，common.rpimod.wordproc.textparser.cleanse_text Python示例

示例#1

0

显示文件

文件： generic.py 项目： ditojohn/py3-raspi

def parse_word_definition(word, entryText):
    """Collect the definition strings found in a dictionary entry.

    The entry text is first passed through ``cparser.cleanse_text`` with the
    module's ``DICT_CLEAN_*`` pattern sets.  Each marker in
    ``DICT_MARKER_DEFINITION`` then supplies a (start, end) pair for
    ``cparser.find_enclosed_text``, and the matches are gathered in marker
    order.

    Args:
        word: The word being looked up.  It is not used by this function and
            is kept only so the signature stays unchanged for callers.
        entryText: Raw entry text to parse.

    Returns:
        A list holding the matches for every marker pair, in marker order.
    """
    _FUNC_NAME_ = "parse_word_definition"

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText, DICT_CLEAN_TEXT_PATTERNS,
                                      DICT_CLEAN_INNER_TEXT_PATTERNS,
                                      DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("sourceText", type(sourceText)))

    wordDefinitions = []
    for marker in DICT_MARKER_DEFINITION:
        # Extend in place rather than rebuilding the list for every marker.
        wordDefinitions.extend(
            cparser.find_enclosed_text(marker[0], marker[1], sourceText))

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("wordDefinitions",
                                            type(wordDefinitions)))
    # Pass the list itself; the earlier eval() of its name was unnecessary.
    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, wordDefinitions)

    return wordDefinitions

示例#2

0

显示文件

文件： generic.py 项目： ditojohn/raspi

def parse_word_clip(word, entryText):
    """Find the pronunciation word and audio clip URL in a dictionary entry.

    The entry text is cleansed with the module's ``DICT_CLEAN_*`` pattern
    sets, then searched for text enclosed by the
    ``DICT_MARKER_PRONUNCIATION_URL`` pair.  The first match is substituted
    into ``DICT_AUDIO_URL`` as ``PATH``.  The pronunciation word is only
    searched for when a URL match exists.

    Args:
        word: The word being looked up (not read by this function).
        entryText: Raw entry text to parse.

    Returns:
        A two-element list ``[pronunciationWord, pronunciationURL]``.  Any
        element that was not found is ``DICT_UNICODE_EMPTY_STR``.
    """
    _FUNC_NAME_ = "parse_word_clip"
    emptyStr = DICT_UNICODE_EMPTY_STR
    typeMsg = "{0} :: {1}"

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, typeMsg.format("entryText", type(entryText)))

    cleanText = cparser.cleanse_text(entryText, DICT_CLEAN_TEXT_PATTERNS, DICT_CLEAN_INNER_TEXT_PATTERNS, DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, typeMsg.format("sourceText", type(cleanText)))

    # List concatenation is kept from the original so the result is always a fresh list.
    urlMatches = [] + cparser.find_enclosed_text(DICT_MARKER_PRONUNCIATION_URL[0], DICT_MARKER_PRONUNCIATION_URL[1], cleanText)
    if len(urlMatches) == 0:
        # No clip URL: the pronunciation word is not looked up either.
        return [emptyStr, emptyStr]

    clipURL = DICT_AUDIO_URL.format(PATH=urlMatches[0])

    wordMatches = [] + cparser.find_enclosed_text(DICT_MARKER_PRONUNCIATION_WORD[0], DICT_MARKER_PRONUNCIATION_WORD[1], cleanText)
    clipWord = wordMatches[0] if len(wordMatches) > 0 else emptyStr

    return [clipWord, clipURL]

示例#3

0

显示文件

文件： generic.py 项目： ditojohn/raspi

def parse_word_definition(word, entryText):
    """Return the definition strings found in a dictionary entry.

    The entry text is cleansed with ``cparser.cleanse_text`` using the
    module's ``DICT_CLEAN_*`` pattern sets.  For every marker in
    ``DICT_MARKER_DEFINITION``, the text enclosed by ``marker[0]`` and
    ``marker[1]`` is extracted with ``cparser.find_enclosed_text``.

    Args:
        word: The word being looked up.  Unused here; retained so existing
            callers keep working.
        entryText: Raw entry text to parse.

    Returns:
        A list of all extracted matches, in marker order.
    """
    _FUNC_NAME_ = "parse_word_definition"

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText, DICT_CLEAN_TEXT_PATTERNS, DICT_CLEAN_INNER_TEXT_PATTERNS, DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "{0} :: {1}".format("sourceText", type(sourceText)))

    wordDefinitions = []
    for marker in DICT_MARKER_DEFINITION:
        # Grow the list in place instead of re-creating it on each pass.
        wordDefinitions.extend(cparser.find_enclosed_text(marker[0], marker[1], sourceText))

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "{0} :: {1}".format("wordDefinitions", type(wordDefinitions)))
    # The list is logged directly; eval() on the variable name was not needed.
    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, wordDefinitions)

    return wordDefinitions

示例#4

0

显示文件

文件： generic.py 项目： ditojohn/py3-raspi

def parse_word_clip(word, entryText):
    """Extract the pronunciation word and audio URL from a dictionary entry.

    After cleansing ``entryText`` with the module's ``DICT_CLEAN_*``
    patterns, the text enclosed by the ``DICT_MARKER_PRONUNCIATION_URL``
    pair is looked up.  Its first match fills the ``PATH`` field of
    ``DICT_AUDIO_URL``.  The ``DICT_MARKER_PRONUNCIATION_WORD`` pair is
    searched only when a URL match exists.

    Args:
        word: The word being looked up (not read by this function).
        entryText: Raw entry text to parse.

    Returns:
        ``[pronunciationWord, pronunciationURL]``.  Each element falls back
        to ``DICT_UNICODE_EMPTY_STR`` when nothing was found.
    """
    _FUNC_NAME_ = "parse_word_clip"
    blank = DICT_UNICODE_EMPTY_STR

    def _log_type(label, value):
        # Emit "<label> :: <type>" through the shared debug printer.
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                            "{0} :: {1}".format(label, type(value)))

    _log_type("entryText", entryText)

    cleaned = cparser.cleanse_text(
        entryText,
        DICT_CLEAN_TEXT_PATTERNS,
        DICT_CLEAN_INNER_TEXT_PATTERNS,
        DICT_CLEAN_OUTER_TEXT_PATTERNS,
    )

    _log_type("sourceText", cleaned)

    # Concatenation with an empty list is kept from the original code.
    foundURLs = [] + cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_URL[0],
        DICT_MARKER_PRONUNCIATION_URL[1],
        cleaned,
    )
    if len(foundURLs) == 0:
        # Without a clip URL the pronunciation word is not searched for.
        return [blank, blank]

    audioURL = DICT_AUDIO_URL.format(PATH=foundURLs[0])

    foundWords = [] + cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_WORD[0],
        DICT_MARKER_PRONUNCIATION_WORD[1],
        cleaned,
    )
    if len(foundWords) > 0:
        spokenWord = foundWords[0]
    else:
        spokenWord = blank

    return [spokenWord, audioURL]

示例#5

0

显示文件

文件： spellit_download_lists.py 项目： ditojohn/raspi

    # Fragment of a larger routine (enclosing loop/function not visible here):
    # downloads one word list, cleans the markup and writes the "basic" words
    # to a file. Written for Python 2 (print statement, `unicode`).
    print "\nProcessing language: " + listID

    # 1-based position of this list, zero-padded to three digits.
    listSeq = str(index + 1).zfill(3)
    
    # URL form of the list name: lower case, spaces replaced by underscores.
    listLang = listID.lower().replace(' ', '_')
    listURL = SB_WORD_LIST_URL[args.contestYear].format(LANG=listLang)

    # Fetch the page and decode the response body as cp1252.
    listURLResponse = connectionPool.request('GET', listURL)
    listURLData = listURLResponse.data.decode('cp1252')

    # NOTE(review): under Python 2 the decoded value is presumably already
    # `unicode`, so the `str` branch looks like it is never taken - confirm.
    if isinstance(listURLData, str):
        listRawText = unicode(listURLData, 'utf-8')
    else:
        listRawText = listURLData
    
    cleansedText = cparser.cleanse_text(listRawText, SB_CLEAN_TEXT_PATTERNS, SB_CLEAN_INNER_TEXT_PATTERNS, SB_CLEAN_OUTER_TEXT_PATTERNS)

    # File-name form of the list name: same as above but with hyphens.
    listLang = listID.lower().replace(' ', '-')

    # Process basic word list
    listType = "basic"
    listFileName = SB_WORD_LIST_OUT[args.contestYear].format(YEAR=listYear, SEQ=listSeq, LANG=listLang, TYPE=listType)

    # Outer patterns matching everything up to the "section word study" div
    # and everything from the first following </div>; presumably cleanse_text
    # strips these so only the section body remains - TODO confirm.
    sectionOuterTextPatterns = [
    [r'.*<div class="section word study">', r'</div>.*']
    ]
    sectionText = cparser.cleanse_text(cleansedText, SB_CLEAN_TEXT_PATTERNS, SB_CLEAN_INNER_TEXT_PATTERNS, sectionOuterTextPatterns)
    # Each word is the text between <li> and </li>; the patterns also take in
    # whitespace next to the tags.
    words = cparser.find_enclosed_text(r'<li>\s*', r'\s*</li>', sectionText)
    print "Writing " + listFileName
    cfile.write(listFileName, coutput.multiline_text(words))