def parse_word_definition(word, entryText):
    """Collect the definition strings found in a dictionary entry.

    The entry text is first passed through cparser.cleanse_text with the
    DICT_CLEAN_* pattern sets. The cleansed text is then searched once per
    marker in DICT_MARKER_DEFINITION, using the marker's first two items as
    the opening and closing delimiters for cparser.find_enclosed_text.

    Args:
        word: The word being looked up. It is not used by the parsing; the
            parameter is kept so existing callers keep working.
        entryText: The entry text to parse.

    Returns:
        A list holding every match returned by cparser.find_enclosed_text,
        in DICT_MARKER_DEFINITION order.
    """
    _FUNC_NAME_ = "parse_word_definition"
    wordDefinitions = []

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText,
                                      DICT_CLEAN_TEXT_PATTERNS,
                                      DICT_CLEAN_INNER_TEXT_PATTERNS,
                                      DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("sourceText", type(sourceText)))

    for marker in DICT_MARKER_DEFINITION:
        # extend() appends in place rather than building a new list for
        # every marker.
        wordDefinitions.extend(
            cparser.find_enclosed_text(marker[0], marker[1], sourceText))

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("wordDefinitions", type(wordDefinitions)))
    # The list is handed over directly; evaluating its variable name with
    # eval() produced the same object and is not needed.
    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, wordDefinitions)

    return wordDefinitions
def parse_word_clip(word, entryText):
    """Extract the pronunciation word and audio URL from a dictionary entry.

    The entry text is cleansed with cparser.cleanse_text and the DICT_CLEAN_*
    pattern sets, then searched with cparser.find_enclosed_text for the
    DICT_MARKER_PRONUNCIATION_URL and DICT_MARKER_PRONUNCIATION_WORD
    delimiter pairs. Only the first match of each search is used.

    Args:
        word: The word being looked up. It is not used by the parsing; the
            parameter is kept so existing callers keep working.
        entryText: The entry text to parse.

    Returns:
        A two-item list ``[pronunciationWord, pronunciationURL]``. Either
        item is DICT_UNICODE_EMPTY_STR when its search finds nothing.
    """
    _FUNC_NAME_ = "parse_word_clip"

    # Defaults returned when the corresponding marker is not found.
    pronunciationURL = DICT_UNICODE_EMPTY_STR
    pronunciationWord = DICT_UNICODE_EMPTY_STR

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText,
                                      DICT_CLEAN_TEXT_PATTERNS,
                                      DICT_CLEAN_INNER_TEXT_PATTERNS,
                                      DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("sourceText", type(sourceText)))

    pronunciationURLs = cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_URL[0],
        DICT_MARKER_PRONUNCIATION_URL[1],
        sourceText)
    if pronunciationURLs:
        # The first match fills the PATH field of the DICT_AUDIO_URL template.
        pronunciationURL = DICT_AUDIO_URL.format(PATH=pronunciationURLs[0])

    pronunciationWords = cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_WORD[0],
        DICT_MARKER_PRONUNCIATION_WORD[1],
        sourceText)
    if pronunciationWords:
        pronunciationWord = pronunciationWords[0]

    return [pronunciationWord, pronunciationURL]
def parse_word_definition(word, entryText):
    """Collect the definition strings found in a dictionary entry.

    The entry text is first passed through cparser.cleanse_text with the
    DICT_CLEAN_* pattern sets. The cleansed text is then searched once per
    marker in DICT_MARKER_DEFINITION, using the marker's first two items as
    the opening and closing delimiters for cparser.find_enclosed_text.

    Args:
        word: The word being looked up. It is not used by the parsing; the
            parameter is kept so existing callers keep working.
        entryText: The entry text to parse.

    Returns:
        A list holding every match returned by cparser.find_enclosed_text,
        in DICT_MARKER_DEFINITION order.
    """
    _FUNC_NAME_ = "parse_word_definition"
    wordDefinitions = []

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText,
                                      DICT_CLEAN_TEXT_PATTERNS,
                                      DICT_CLEAN_INNER_TEXT_PATTERNS,
                                      DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("sourceText", type(sourceText)))

    for marker in DICT_MARKER_DEFINITION:
        # extend() appends in place rather than building a new list for
        # every marker.
        wordDefinitions.extend(
            cparser.find_enclosed_text(marker[0], marker[1], sourceText))

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("wordDefinitions", type(wordDefinitions)))
    # The list is handed over directly; evaluating its variable name with
    # eval() produced the same object and is not needed.
    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, wordDefinitions)

    return wordDefinitions
def parse_word_clip(word, entryText):
    """Extract the pronunciation word and audio URL from a dictionary entry.

    The entry text is cleansed with cparser.cleanse_text and the DICT_CLEAN_*
    pattern sets, then searched with cparser.find_enclosed_text for the
    DICT_MARKER_PRONUNCIATION_URL and DICT_MARKER_PRONUNCIATION_WORD
    delimiter pairs. Only the first match of each search is used.

    Args:
        word: The word being looked up. It is not used by the parsing; the
            parameter is kept so existing callers keep working.
        entryText: The entry text to parse.

    Returns:
        A two-item list ``[pronunciationWord, pronunciationURL]``. Either
        item is DICT_UNICODE_EMPTY_STR when its search finds nothing.
    """
    _FUNC_NAME_ = "parse_word_clip"

    # Defaults returned when the corresponding marker is not found.
    pronunciationURL = DICT_UNICODE_EMPTY_STR
    pronunciationWord = DICT_UNICODE_EMPTY_STR

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("entryText", type(entryText)))

    sourceText = cparser.cleanse_text(entryText,
                                      DICT_CLEAN_TEXT_PATTERNS,
                                      DICT_CLEAN_INNER_TEXT_PATTERNS,
                                      DICT_CLEAN_OUTER_TEXT_PATTERNS)

    coutput.print_debug(ERR_DEBUG, _FUNC_NAME_,
                        "{0} :: {1}".format("sourceText", type(sourceText)))

    pronunciationURLs = cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_URL[0],
        DICT_MARKER_PRONUNCIATION_URL[1],
        sourceText)
    if pronunciationURLs:
        # The first match fills the PATH field of the DICT_AUDIO_URL template.
        pronunciationURL = DICT_AUDIO_URL.format(PATH=pronunciationURLs[0])

    pronunciationWords = cparser.find_enclosed_text(
        DICT_MARKER_PRONUNCIATION_WORD[0],
        DICT_MARKER_PRONUNCIATION_WORD[1],
        sourceText)
    if pronunciationWords:
        pronunciationWord = pronunciationWords[0]

    return [pronunciationWord, pronunciationURL]
# NOTE(review): this fragment reads index, listID, listYear, args and
# connectionPool without assigning them -- presumably it is the body of a
# per-language loop; confirm the enclosing indentation before applying.
# A parenthesised single-argument print emits the same text as the bare
# print statement.
print("\nProcessing language: " + listID)

# 1-based position, zero-padded to three digits.
listSeq = str(index + 1).zfill(3)

# The URL template receives the lower-cased name with spaces as underscores.
listLang = listID.lower().replace(' ', '_')
listURL = SB_WORD_LIST_URL[args.contestYear].format(LANG=listLang)

# Fetch the page and decode the response body as cp1252.
listURLResponse = connectionPool.request('GET', listURL)
listURLData = listURLResponse.data.decode('cp1252')

# A `str` payload is converted with unicode(..., 'utf-8'); any other value
# is used as it is.
listRawText = (unicode(listURLData, 'utf-8')
               if isinstance(listURLData, str)
               else listURLData)

cleansedText = cparser.cleanse_text(
    listRawText,
    SB_CLEAN_TEXT_PATTERNS,
    SB_CLEAN_INNER_TEXT_PATTERNS,
    SB_CLEAN_OUTER_TEXT_PATTERNS,
)

# The output file name uses hyphens, not underscores, in place of spaces.
listLang = listID.lower().replace(' ', '-')

# Process basic word list
listType = "basic"
listFileName = SB_WORD_LIST_OUT[args.contestYear].format(
    YEAR=listYear,
    SEQ=listSeq,
    LANG=listLang,
    TYPE=listType,
)

# Outer patterns covering everything up to the "section word study" div
# opening tag and everything from a closing </div> onwards.
sectionOuterTextPatterns = [
    [r'.*<div class="section word study">', r'</div>.*'],
]
sectionText = cparser.cleanse_text(
    cleansedText,
    SB_CLEAN_TEXT_PATTERNS,
    SB_CLEAN_INNER_TEXT_PATTERNS,
    sectionOuterTextPatterns,
)

# Each <li>...</li> item in the section is taken as one word entry.
words = cparser.find_enclosed_text(r'<li>\s*', r'\s*</li>', sectionText)

print("Writing " + listFileName)
cfile.write(listFileName, coutput.multiline_text(words))