def download(connectionPool, sourceURL, targetFileName):
    """Fetch ``sourceURL`` with a GET request and save the body to a file.

    Args:
        connectionPool: Object exposing ``request(method, url)`` whose result
            has a ``data`` attribute (presumably a urllib3 pool manager --
            confirm against callers).
        sourceURL: URL to download.
        targetFileName: Path of the file to create or overwrite; it is
            opened in binary mode.
    """
    coutput.print_watcher("sourceURL")

    # The request is made before the file is opened, so a failed download
    # does not leave an empty target file behind.
    fileData = connectionPool.request('GET', sourceURL).data

    # The context manager closes the handle even if write() raises.
    with open(targetFileName, "wb") as targetFile:
        targetFile.write(fileData)
def parse_xml(self, root, word):
    """Yield one ``CollegiateDictionaryEntry`` per ``<entry>`` under ``root``.

    Args:
        root: Parsed XML element that is searched with ``findall('entry')``.
        word: The looked-up word, passed through to every yielded entry.

    Yields:
        ``CollegiateDictionaryEntry(word, args)`` where ``args`` holds the
        headword, spelling, functional label, pronunciation, inflections,
        senses, sound fragments and illustration fragments of the entry.

    Raises:
        AttributeError: If an ``<entry>`` has no ``<hw>`` child.
    """
    _FUNC_NAME_ = "CollegiateDictionary.parse_xml"

    for entry in root.findall('entry'):
        # Look the headword up once; it feeds both 'headword' and 'spelling'.
        headword = entry.find('hw').text

        args = {}
        args['headword'] = headword
        # Raw string: "\*" in a plain literal is an invalid escape sequence.
        # The spelling is the headword with the '*' markers removed.
        args['spelling'] = re.sub(r"\*", "", headword)

        # <fl> and <pr> are optional; fall back to the empty-string constant.
        args['functional_label'] = getattr(entry.find('fl'), 'text', DICT_UNICODE_EMPTY_STR)
        args['pronunciation'] = getattr(entry.find('pr'), 'text', DICT_UNICODE_EMPTY_STR)
        coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, "args['pronunciation']")

        args['inflections'] = self._get_inflections(entry)
        args['senses'] = self._get_senses(entry)

        # Only elements that actually carry text are kept.
        args['sound_fragments'] = [e.text for e in entry.findall("sound/wav") if e.text]
        coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, "args['sound_fragments']")
        args['illustration_fragments'] = [e.text for e in entry.findall("art/bmp") if e.text]

        yield CollegiateDictionaryEntry(word, args)
def build_pronunciation(self, element, word_form):
    """Build a ``cdict.WordPronunciation`` from a ``<sound>`` element.

    Args:
        element: The ``<sound>`` element; its ``<wav>`` and ``<wpr>``
            descendants are read via ``find_all``.
        word_form: Word form the pronunciation belongs to; stored as
            ``form`` and, with ``*`` removed, as ``spelling``.

    Returns:
        The populated pronunciation, or ``None`` when no non-empty audio
        URL was obtained from a ``<wav>`` element.
    """
    wavElementText = cdict.DICT_UNICODE_EMPTY_STR
    wprElementText = cdict.DICT_UNICODE_EMPTY_STR

    # When a tag occurs more than once, the last occurrence wins.
    for subElement in element.find_all(['wav', 'wpr']):
        subElementText = subElement.get_text().strip()

        if subElement.name == 'wav':
            wavElementText = self.build_audio_url(subElementText)
            coutput.print_watcher('wavElementText')
        elif subElement.name == 'wpr':
            wprElementText = subElementText
            coutput.print_watcher('wprElementText')

    # Without an audio URL there is nothing to build.
    if wavElementText == cdict.DICT_UNICODE_EMPTY_STR:
        return None

    pronunciation = cdict.WordPronunciation(wavElementText)
    if wprElementText != cdict.DICT_UNICODE_EMPTY_STR:
        pronunciation.word_pronunciation = wprElementText
    pronunciation.form = word_form
    pronunciation.spelling = word_form.replace('*', '')
    return pronunciation
def build_entry_url(self, key_word):
    """Return the dictionary entry URL for ``key_word``.

    The word is passed through ``coutput.normalize`` and substituted for
    the ``WORD`` field of ``self.entry_url_format``; spaces in the
    resulting URL are replaced with ``%20``.
    """
    _FUNC_NAME_ = "DictionaryConfig.build_entry_url"
    coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'key_word')
    coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'coutput.normalize(key_word)')

    normalizedWord = coutput.normalize(key_word)
    entryURL = self.entry_url_format.format(WORD=normalizedWord)
    return entryURL.replace(u" ", u"%20")
def build_entry_url(self, key_word):
    """Return the dictionary entry URL for ``key_word``.

    ``self.entry_url_format`` is filled with the normalized word (``WORD``)
    and ``self.api_key`` (``KEY``); spaces in the resulting URL are
    replaced with ``%20``.
    """
    coutput.print_watcher('key_word')
    coutput.print_watcher('coutput.normalize(key_word)')

    normalizedWord = coutput.normalize(key_word)
    entryURL = self.entry_url_format.format(WORD=normalizedWord, KEY=self.api_key)
    return entryURL.replace(" ", "%20")
def _get_pronunciations(self, root):
    """Return the flattened contents of the entry's ``./pr`` element.

    The ``<pr>`` child of ``root`` is flattened with ``_flatten_tree``
    (``<it>`` excluded). An empty list is returned when ``root`` has no
    ``<pr>`` child.
    """
    _FUNC_NAME_ = "CollegiateDictionary._get_pronunciations"

    prons = root.find("./pr")
    pron_list = [] if prons is None else list(self._flatten_tree(prons, exclude=['it']))

    coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'pron_list')
    return pron_list
def override_definitions(self, source, entry_word, overrides): _FUNC_NAME_ = "SimplifiedWordEntry.override_definitions" if len(overrides) > 0: self.source = source self.entry_word = entry_word # Remove duplicate definitions for override in overrides: # Handle overrides that are marked special by the application using a prefix e.g. * override_text = re.sub(ur'(^[^\(a-zA-Z0-9]|[\. ]+$)', DICT_UNICODE_EMPTY_STR, override, flags=re.IGNORECASE) coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'override') coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'override_text') for definition in self.definitions: definition_text = re.sub(ur'(^[^\(a-zA-Z0-9]|[\. ]+$)', DICT_UNICODE_EMPTY_STR, definition, flags=re.IGNORECASE) coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'definition') coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'definition_text') if definition_text == override_text: self.definitions.remove(definition) coutput.print_debug(MOD_ERR_DEBUG, _FUNC_NAME_, "Removed duplicate definition") # Override definitions self.definitions = overrides + self.definitions
def build_sound_url(self, fragment):
    """Return the media URL for a sound file name.

    The file lives in a sub-directory chosen from the start of the name:
    ``number`` when it starts with a digit, ``gg`` or ``bix`` when it
    starts with one of those prefixes, otherwise its first character.

    Raises:
        IndexError: If ``fragment`` is an empty string.
    """
    _FUNC_NAME_ = "build_sound_url"
    coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'fragment')

    if re.search(r'^([0-9]+)', fragment):
        prefix = "number"
    else:
        special_prefix_match = re.search(r'^(gg|bix)', fragment)
        prefix = special_prefix_match.group(1) if special_prefix_match else fragment[0]

    base_url = "http://media.merriam-webster.com/soundc11"
    return "{0}/{1}/{2}".format(base_url, prefix, fragment)
def _get_pronunciations(self, root):
    """Return the regular and 'alternative' pronunciations of an entry.

    The ``./pr`` and ``./altpr`` children of ``root`` (when present) are
    flattened with ``_flatten_tree`` (``<it>`` excluded), in that order.

    Returns:
        List of pronunciation strings with leading/trailing commas and
        spaces stripped; empty when neither element exists.
    """
    _FUNC_NAME_ = "LearnersDictionary._get_pronunciations"

    pron_list = []
    for pronPath in ("./pr", "./altpr"):
        prons = root.find(pronPath)
        if prons is not None:
            pron_list.extend(self._flatten_tree(prons, exclude=['it']))

    # Watch the collected list; the previous argument was the unreplaced
    # template placeholder '<ReplaceText>'.
    coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'pron_list')
    return [p.strip(', ') for p in pron_list]
def download_entry(self, connection_pool, key_word):
    """Download the raw dictionary entry for ``key_word`` as text.

    Args:
        connection_pool: Object exposing ``request(method, url)`` whose
            result has a ``data`` attribute holding the response bytes.
        key_word: Word whose entry URL is built by
            ``self.config.build_entry_url``.

    Returns:
        The response body decoded as UTF-8.
    """
    entryURL = self.config.build_entry_url(key_word)
    connectionResponse = connection_pool.request('GET', entryURL)

    coutput.print_watcher("key_word")
    coutput.print_watcher("self.config.build_entry_url(key_word)")
    coutput.print_watcher("connectionResponse")

    # Perform unicode conversion
    coutput.print_watcher("connectionResponse.data")
    entryData = connectionResponse.data.decode('utf8')
    coutput.print_watcher("entryData")

    return entryData
def play_legacy(fileName, audioOutput, loopCount, loopDelaySec):
    """Play an audio file ``loopCount`` times through the pygame mixer.

    The audio output is switched with ``set_audio_output(audioOutput)``
    before playback and reset to ``'auto'`` after the last repetition.
    Any playback error is reported through ``coutput.print_err`` instead of
    being raised.

    Args:
        fileName: Path of the audio file to play.
        audioOutput: Output selector handed to ``set_audio_output``.
        loopCount: Number of times the file is played.
        loopDelaySec: Pause in seconds between two repetitions.

    Reference:
        https://www.pygame.org/docs/ref/mixer.html#pygame.mixer.init
        http://techqa.info/programming/question/27745134/how-can-i-extract-the-metadata-and-bitrate-info-from-a-audio/video-file-in-python
    """
    try:
        # Enable for RaspberryPi
        coutput.print_debug("Executing set_audio_output")
        set_audio_output(audioOutput)

        # The media info is only used for the diagnostics below; the mixer
        # is initialised with its defaults.
        coutput.print_debug("Executing mediainfo")
        fileInfo = mediainfo(fileName)

        coutput.print_watcher("fileName")
        coutput.print_watcher("fileInfo['sample_rate']")
        coutput.print_watcher("fileInfo['bits_per_sample']")
        coutput.print_watcher("fileInfo['channels']")

        for loopIndex in range(0, loopCount):
            # Syntax: init(frequency=22050, size=-16, channels=2, buffer=4096)
            pygame.mixer.init()

            coutput.print_debug("Executing pygame.mixer.music.load")
            pygame.mixer.music.load(fileName)
            coutput.print_debug("Executing pygame.mixer.music.play")
            pygame.mixer.music.play()

            # Poll with a short sleep rather than spinning at full CPU
            # while the clip is playing.
            while pygame.mixer.music.get_busy():
                time.sleep(0.01)

            # Introduce delay to ensure that the end of the audio is not
            # clipped during playback.
            time.sleep(0.06)

            coutput.print_debug("Executing pygame.mixer.stop")
            pygame.mixer.stop()
            coutput.print_debug("Executing pygame.mixer.quit")
            pygame.mixer.quit()

            # No pause after the final repetition.
            if loopIndex != (loopCount - 1):
                time.sleep(loopDelaySec)

        set_audio_output('auto')
    except Exception:
        # Playback is best effort, but KeyboardInterrupt/SystemExit must
        # not be swallowed the way a bare ``except:`` would.
        coutput.print_err("Unable to play audio from " + fileName)
        coutput.print_watcher("sys.exc_info()")
def get_audio_output(audioOutput):
    """Return the ALSA output setting for ``audioOutput`` on this host.

    The value is read from ``alsaAudioOutputConfig``, keyed first by
    ``platform.node()`` and then by the lower-cased ``audioOutput``.

    Raises:
        KeyError: If the host or the output name is not configured.

    Reference:
        https://wiki.archlinux.org/index.php/Advanced_Linux_Sound_Architecture/Troubleshooting#HDMI_Output_does_not_work
    """
    coutput.print_watcher("alsaAudioOutputConfig")
    coutput.print_watcher("platform.node()")
    coutput.print_watcher("audioOutput.lower()")
    coutput.print_watcher(
        "alsaAudioOutputConfig[platform.node()][audioOutput.lower()]")

    hostConfig = alsaAudioOutputConfig[platform.node()]
    return hostConfig[audioOutput.lower()]
def lookup_word(connectionPool, pronAudioOutput, pronLoopCount, pronLoopDelaySec, word, *lookupSource): _FUNC_NAME_ = "lookup_word" isError = False dictSources = [] if len(lookupSource) == 0: dictEntry = fetch_dictionary_entry(connectionPool, word) currentDefinitions = dictEntry[1] source = dictEntry[2] currentClipWord = dictEntry[3] currentClipURL = dictEntry[4] pronSource = dictEntry[5] display_dictionary_entry(connectionPool, pronAudioOutput, pronLoopCount, pronLoopDelaySec, word, currentDefinitions, source, currentClipWord, currentClipURL, pronSource) elif lookupSource[0].lower() == 'all' or lookupSource[0].lower() in DICT_SOURCES.keys(): if lookupSource[0].lower() == 'all': dictSources = dictSources + PRIORITIZED_DICT_SOURCES else: dictSources.append(DICT_SOURCES[lookupSource[0].lower()]) coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'dictSources') for dictSource in dictSources: source = dictSource.get_dictionary_source() pronSource = dictSource.get_dictionary_source() coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'source') dictEntryText = dictSource.get_dictionary_entry(connectionPool, word) currentDefinitions = dictSource.parse_word_definition(word, dictEntryText) [currentClipWord, currentClipURL] = dictSource.parse_word_clip(word, dictEntryText) display_dictionary_entry(connectionPool, pronAudioOutput, pronLoopCount, pronLoopDelaySec, word, currentDefinitions, source, currentClipWord, currentClipURL, pronSource) else: print "" displayMessage = "ERROR: Unable to lookup {WORD}. Dictionary source {SOURCE} not supported".format(WORD=word, SOURCE=lookupSource[0]) coutput.print_color(ERROR_TEXT_COLOR, displayMessage) print ""
def __init__(self, word, attrs):
    """Populate the entry from a dictionary of parsed attributes.

    Args:
        word: The word that was looked up.
        attrs: Mapping that may contain ``headword``, ``spelling``,
            ``functional_label``, ``pronunciation``, ``inflections``,
            ``senses``, ``sound_fragments`` and ``illustration_fragments``.
            Missing scalar keys become ``None``; missing fragment lists are
            treated as empty.
    """
    _FUNC_NAME_ = "CollegiateDictionaryEntry.__init__"

    self.word = word
    self.headword = attrs.get('headword')
    self.spelling = attrs.get('spelling')
    self.function = attrs.get('functional_label')

    self.pronunciation = attrs.get("pronunciation")
    coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, "self.pronunciation")

    self.inflections = attrs.get("inflections")
    self.senses = attrs.get("senses")

    # ``or []`` keeps a missing (None) fragment list from raising TypeError
    # when it is iterated.
    self.audio = [self.build_sound_url(f)
                  for f in attrs.get("sound_fragments") or []]
    self.illustrations = [self.build_illustration_url(f)
                          for f in attrs.get("illustration_fragments") or []]
def get_dictionary_entry(connectionPool, word):
    """Download the dictionary entry XML for ``word``.

    The URL is built from ``DICT_ENTRY_URL`` with ``word`` and ``DICT_KEY``,
    spaces are replaced with ``%20`` and the result is UTF-8 encoded before
    the GET request is issued.

    Returns:
        The response body, converted to ``unicode`` when it arrives as a
        ``str``; otherwise the body as received.
    """
    _FUNC_NAME_ = "get_dictionary_entry"

    # Download dictionary entry
    dictEntryURL = DICT_ENTRY_URL.format(WORD=word, KEY=DICT_KEY)
    # Handle URL strings in ascii
    dictEntryURL = dictEntryURL.replace(" ", "%20").encode('utf-8')
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'dictEntryURL')

    dictEntryResponse = connectionPool.request('GET', dictEntryURL)
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'dictEntryResponse.data')

    # Convert XML to unicode
    if isinstance(dictEntryResponse.data, str):
        return unicode(dictEntryResponse.data, 'utf-8')
    return dictEntryResponse.data
def build_cross_entries(self, element, entry_word):
    """Convert a cross-entry ``<cx>`` element into a ``cdict.WordInflection``.

    Args:
        element: The ``<cx>`` element; its ``<cl>`` and ``<ct>``
            descendants are read via ``find_all``.
        entry_word: Entry word used as the inflection form; the spelling is
            the same word with ``*`` removed.

    Returns:
        The inflection, with ``functional_label`` taken from ``<cl>`` (a
        trailing `` of`` removed) and senses built from each ``<ct>``.
    """
    wordInfl = cdict.WordInflection(entry_word)
    wordInfl.spelling = entry_word.replace('*', '')

    for crossElement in element.find_all(['cl', 'ct']):
        if crossElement.name == 'ct':
            wordInfl.senses.extend(self.build_senses(crossElement))
            continue

        if crossElement.name == 'cl':
            labelText = crossElement.get_text().strip()
            wordInfl.functional_label = re.sub(r' of$', '', labelText, flags=re.UNICODE)

    coutput.print_watcher('wordInfl')
    return wordInfl
def parse_word_definition(word, entryXML):
    """Extract the definitions of ``word`` from dictionary entry XML.

    Args:
        word: Word passed to ``CollegiateDictionary.lookup``.
        entryXML: Entry XML; a ``unicode`` value is UTF-8 encoded first.

    Returns:
        List of unicode strings of the form ``(<function>) <definition>``,
        one per sense of every entry. The list is empty when the lookup
        raises ``api.WordNotFoundException``.
    """
    _FUNC_NAME_ = "parse_word_definition"
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'entryXML')

    sourceXML = entryXML
    if isinstance(sourceXML, unicode):
        sourceXML = sourceXML.encode('utf-8')
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'sourceXML')

    wordDefinition = []
    dictionary = api.CollegiateDictionary(DICT_KEY)
    definitionTemplate = unicode("({0}) {1}", 'utf-8')

    try:
        for entry in dictionary.lookup(word, sourceXML):
            for sense in entry.senses:
                coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'sense.definition')
                strDefinition = definitionTemplate.format(entry.function, sense.definition)

                # Ensure that only unicode values are collected.
                if isinstance(strDefinition, str):
                    strDefinition = unicode(strDefinition, 'utf-8')
                wordDefinition.append(strDefinition)
    except api.WordNotFoundException:
        # Discard anything collected before the lookup failed.
        wordDefinition = []

    return wordDefinition
def play(fileName, audioOutput, loopCount, loopDelaySec):
    """Play an audio file ``loopCount`` times by shelling out to ffmpeg.

    The command is built once from ``get_audio_output(audioOutput)`` and
    ``fileName`` and executed with ``os.system`` for each repetition.
    Errors raised while looping are reported through ``coutput.print_err``
    instead of being raised; an error from ``get_audio_output`` propagates.

    Args:
        fileName: Path of the audio file to play.
        audioOutput: Output selector handed to ``get_audio_output``.
        loopCount: Number of times the file is played.
        loopDelaySec: Pause in seconds between two repetitions.

    Reference:
        https://realpython.com/playing-and-recording-sound-python/#playing-audio-files
        https://askubuntu.com/questions/115369/how-to-play-mp3-files-from-the-command-line
        https://www.ffmpeg.org/ffplay.html
        https://www.ffmpeg.org/ffmpeg-devices.html#Examples-8
    """
    # Use aplay -L to find audio output device. e.g. HDMI is plughw
    # NOTE(review): fileName is interpolated into a shell command unquoted;
    # names containing spaces or shell metacharacters will alter the
    # command. Confirm that callers only pass safe paths.
    playCommand = "ffmpeg -f alsa {outputdevice} -loglevel quiet -i {filename} 2>/dev/null".format(
        outputdevice=get_audio_output(audioOutput), filename=fileName)

    try:
        coutput.print_watcher("fileName")

        for loopIndex in range(0, loopCount):
            coutput.print_debug("Executing play")
            coutput.print_watcher("playCommand")
            os.system(playCommand)

            # No pause after the final repetition.
            if loopIndex != (loopCount - 1):
                time.sleep(loopDelaySec)
    except Exception:
        # Playback is best effort, but KeyboardInterrupt/SystemExit must
        # not be swallowed the way a bare ``except:`` would.
        coutput.print_err("Unable to play audio from " + fileName)
        coutput.print_watcher("sys.exc_info()")
def compare_word_form(self, key_word, entry_word):
    """Warn when the pronounced entry is a different form of the key word.

    Both words are tokenized with ``coutput.tokenize``. When the tokens
    differ a warning is printed, followed by a tip for the first rule in
    ``self.posRules`` whose ``regexPattern`` matches the key word token.
    """
    keyWordToken = coutput.tokenize(key_word)
    entryWordToken = coutput.tokenize(entry_word)
    coutput.print_watcher('keyWordToken')
    coutput.print_watcher('entryWordToken')

    # Same form: nothing to report.
    if keyWordToken == entryWordToken:
        return

    coutput.print_warn(
        "A different form of the word is being pronounced.")

    for posPattern in self.posRules:
        coutput.print_watcher("posPattern['form']")
        if not posPattern['regexPattern'].match(keyWordToken):
            continue

        # Only the first matching rule is reported.
        coutput.print_tip(
            "The {0} form ({1}) of the word is to be spelled.".format(
                posPattern['form'], posPattern['pattern']))
        break
def compare_word_form(self, key_word, entry_word):
    """Warn when the pronounced entry is a different form of the key word.

    Both words are tokenized with ``coutput.tokenize``. When the tokens
    differ a warning is printed, followed by a tip for the first rule in
    ``self.posRules`` whose ``regexPattern`` matches the key word token.
    """
    _FUNC_NAME_ = "DictionaryAssistant.compare_word_form"

    keyWordToken = coutput.tokenize(key_word)
    entryWordToken = coutput.tokenize(entry_word)
    coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'keyWordToken')
    coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'entryWordToken')

    # Same form: nothing to report.
    if keyWordToken == entryWordToken:
        return

    coutput.print_warn(
        "A different form of the word is being pronounced.")

    for posPattern in self.posRules:
        coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, "posPattern['form']")
        if not posPattern['regexPattern'].match(keyWordToken):
            continue

        # Only the first matching rule is reported.
        coutput.print_tip(
            "The {0} form ({1}) of the word is to be spelled.".format(
                posPattern['form'], posPattern['pattern']))
        break
def fetch_dictionary_entry(connectionPool, word):
    """Collect definitions and a pronunciation clip for ``word``.

    The sources in ``PRIORITIZED_DICT_SOURCES`` are queried in order. The
    first source that yields definitions supplies the definitions, and the
    first source that yields a clip word supplies the pronunciation; the
    two may come from different sources. Querying stops once both are
    found.

    Returns:
        ``[word, definitions, definitionSource, clipWord, clipURL,
        clipSource]``; parts that were not found are an empty list
        (definitions) or empty strings.
    """
    _FUNC_NAME_ = "fetch_dictionary_entry"

    wordDefinitions = []
    wordDefinitionSource = ""
    wordDefinitionFound = False

    pronunciationWord = ""
    pronunciationURL = ""
    pronunciationSource = ""
    wordPronunciationFound = False

    for dictSource in PRIORITIZED_DICT_SOURCES:
        dictEntryText = dictSource.get_dictionary_entry(connectionPool, word)
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'dictSource')

        if not wordDefinitionFound:
            currentDefinitions = dictSource.parse_word_definition(word, dictEntryText)
            if len(currentDefinitions) > 0:
                wordDefinitions = currentDefinitions
                wordDefinitionSource = dictSource.get_dictionary_source()
                wordDefinitionFound = True
                coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'wordDefinitionSource')

        if not wordPronunciationFound:
            currentClipWord, currentClipURL = dictSource.parse_word_clip(word, dictEntryText)
            if currentClipWord != "":
                pronunciationSource = dictSource.get_dictionary_source()
                pronunciationWord = currentClipWord
                pronunciationURL = currentClipURL
                wordPronunciationFound = True
                coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'pronunciationSource')

        # Nothing left to look for in lower-priority sources.
        if wordDefinitionFound and wordPronunciationFound:
            break

    return [word, wordDefinitions, wordDefinitionSource,
            pronunciationWord, pronunciationURL, pronunciationSource]
def set_simplified_word_entry(self):
    """Build ``self.simplified_word_entry`` from ``self.word_entries``.

    The entry (or entries) matching ``self.key_word`` is selected in three
    passes: every entry whose ``entry_word`` equals the key word; otherwise
    the first entry owning an inflection whose ``spelling`` equals the key
    word; otherwise the first entry. Pronunciation and respelling are taken
    from the matched inflection or from the first matched entry, and the
    etymologies and definitions of all matched entries (including their
    inflections) are consolidated without duplicates.

    When ``self.word_entries`` is empty a skeleton ``SimplifiedWordEntry``
    with an empty source and entry word is stored instead.
    """
    simplifiedWordEntry = None

    matchEntries = []
    matchInflection = None
    matchType = "none"
    matchEntryFound = False

    # Identify matching entry
    # Pass #1: Find matching entry word
    # (no break: every entry with a matching entry word is collected)
    if not matchEntryFound:
        for we in self.word_entries:
            if self.key_word == we.entry_word:
                matchEntries.append(we)
                matchEntryFound = True
                matchType = "entryword"

    # Pass #2: Find matching inflection
    # (stops at the first inflection whose spelling matches)
    if not matchEntryFound:
        for we in self.word_entries:
            for infl in we.inflections:
                if self.key_word == infl.spelling:
                    matchEntries.append(we)
                    matchInflection = infl
                    matchEntryFound = True
                    matchType = "inflection"
                    break

            if matchEntryFound:
                break

    # Pass #3: Default as first entry, if no match found
    if not matchEntryFound:
        for we in self.word_entries:
            matchEntries.append(we)
            matchEntryFound = True
            matchType = "default"
            break

    # Populate conformed entry attributes
    coutput.print_watcher('matchEntryFound')
    coutput.print_watcher('matchType')

    # If matching entry is found, populate pronunciation attributes
    if matchEntryFound:
        if matchType == "inflection":
            # The entry is presented under the inflection's spelling; the
            # functional label is chosen by coutput.coalesce from the
            # inflection's label and the owning entry's label.
            simplifiedWordEntry = SimplifiedWordEntry(
                matchEntries[0].source, self.key_word,
                matchInflection.spelling)
            simplifiedWordEntry.functional_label = coutput.coalesce(
                matchInflection.functional_label,
                matchEntries[0].functional_label)

            if matchInflection.pronunciation is not None:
                coutput.print_watcher(
                    'matchInflection.pronunciation.audio_url')
                simplifiedWordEntry.pronunciation = WordPronunciation(
                    matchInflection.pronunciation.audio_url)
                simplifiedWordEntry.pronunciation.word_pronunciation = matchInflection.pronunciation.word_pronunciation
                simplifiedWordEntry.pronunciation.form = matchInflection.pronunciation.form
                simplifiedWordEntry.pronunciation.spelling = matchInflection.pronunciation.spelling
                coutput.print_watcher(
                    'simplifiedWordEntry.pronunciation.word_pronunciation')
                coutput.print_watcher(
                    'simplifiedWordEntry.pronunciation.form')
                coutput.print_watcher(
                    'simplifiedWordEntry.pronunciation.spelling')

            if matchInflection.respelling is not None:
                simplifiedWordEntry.respelling = WordRespelling(
                    matchInflection.respelling.text,
                    matchInflection.respelling.source)
                simplifiedWordEntry.respelling.form = matchInflection.respelling.form
                simplifiedWordEntry.respelling.spelling = matchInflection.respelling.spelling

        else:
            # Entry-word or default match: copy pronunciation and
            # respelling from the first matched entry.
            simplifiedWordEntry = SimplifiedWordEntry(
                matchEntries[0].source, self.key_word,
                matchEntries[0].entry_word)
            simplifiedWordEntry.functional_label = matchEntries[0].functional_label

            coutput.print_watcher('matchEntries[0].pronunciation')
            # Deep copies keep the simplified entry independent of the
            # source word entry objects.
            simplifiedWordEntry.pronunciation = copy.deepcopy(
                matchEntries[0].pronunciation)
            simplifiedWordEntry.respelling = copy.deepcopy(
                matchEntries[0].respelling)

        # Consolidate etymology and senses (definitions and examples)
        etymologies = []
        definitions = []
        for we in matchEntries:
            coutput.print_watcher('we')
            if we.etymology != DICT_UNICODE_EMPTY_STR and we.etymology not in etymologies:
                etymologies.append(we.etymology)

            # Definitions are prefixed with "(<functional label>) " when
            # the entry has a label.
            flText = DICT_UNICODE_EMPTY_STR
            if we.functional_label != DICT_UNICODE_EMPTY_STR:
                flText = "({0}) ".format(we.functional_label)

            for sense in we.senses:
                defnText = flText + str(sense.definition)
                if defnText not in definitions:
                    definitions.append(defnText)

            # Handle inflections within matching entries
            for infl in we.inflections:
                flText = DICT_UNICODE_EMPTY_STR
                if infl.functional_label != DICT_UNICODE_EMPTY_STR:
                    flText = "({0}) ".format(infl.functional_label)

                for sense in infl.senses:
                    defnText = flText + str(sense.definition)
                    if defnText not in definitions:
                        definitions.append(defnText)

        simplifiedWordEntry.etymology = "; ".join(et for et in etymologies)
        simplifiedWordEntry.definitions = definitions[:]

    # Else if no matching entry is found, create a skeleton entry
    else:
        simplifiedWordEntry = SimplifiedWordEntry(DICT_UNICODE_EMPTY_STR, self.key_word, DICT_UNICODE_EMPTY_STR)

    # Set conformed entry
    self.simplified_word_entry = simplifiedWordEntry
    coutput.print_watcher('simplifiedWordEntry')
def parse_word_clip(word, entryXML):
    """Find a pronunciation audio clip for ``word`` in dictionary entry XML.

    Three passes are tried until a clip is found: an entry whose spelling
    equals the word, an inflection spelling that equals the word, and
    finally the first entry that has any audio.

    Args:
        word: Word to find a clip for.
        entryXML: Entry XML; a ``unicode`` value is UTF-8 encoded first.

    Returns:
        ``[audioClipWord, audioClip, audioClipPron]`` as unicode values;
        all three are the empty-string constant when no clip is found or
        the lookup raises ``api.WordNotFoundException``.
    """
    _FUNC_NAME_ = "parse_word_clip"
    searchWord = word
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'entryXML')

    sourceXML = entryXML
    if isinstance(sourceXML, unicode):
        sourceXML = sourceXML.encode('utf-8')
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'sourceXML')

    dictionary = api.CollegiateDictionary(DICT_KEY)

    wordFound = False
    audioClipFound = False
    audioClip = DICT_UNICODE_EMPTY_STR
    audioClipWord = DICT_UNICODE_EMPTY_STR
    audioClipPron = DICT_UNICODE_EMPTY_STR

    try:
        # Pass #1: Find matching headword spelling
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "Start of Pass #1")
        entries = dictionary.lookup(searchWord, sourceXML)
        for entry in entries:
            coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'entry.spelling')
            if searchWord == entry.spelling:
                # Only the first audio item of the matching entry is used.
                for audio in entry.audio:
                    audioClipWord = entry.spelling
                    audioClipPron = entry.pronunciation
                    wordFound = True
                    audioClip = audio
                    audioClipFound = True
                    if wordFound:
                        break
            if wordFound:
                break
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "End of Pass #1")
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'audioClipFound')
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'wordFound')

        # Pass #2: Find matching inflection
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "Start of Pass #2")
        if audioClipFound == False:
            # Start from a clean slate before the next pass.
            wordFound = False
            audioClip = DICT_UNICODE_EMPTY_STR
            audioClipWord = DICT_UNICODE_EMPTY_STR
            audioClipPron = DICT_UNICODE_EMPTY_STR
            entries = dictionary.lookup(searchWord, sourceXML)
            for entry in entries:
                coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'entry.spelling')
                for inflection in entry.inflections:
                    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'inflection.spellings')
                    for spelling in inflection.spellings:
                        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'searchWord')
                        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'spelling')
                        if searchWord == spelling:
                            # wordFound is set even when the inflection has
                            # no sound URL; audioClipFound then stays False
                            # and Pass #3 runs.
                            audioClipWord = spelling
                            wordFound = True
                            coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'inflection.sound_urls')
                            # Only the first sound URL is used.
                            for sound_url in inflection.sound_urls:
                                coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'sound_url')
                                audioClip = sound_url
                                audioClipFound = True
                                break
                        # NOTE(review): only two conditional breaks follow
                        # three nested loops, so the outermost entry loop
                        # appears to keep running after a match -- confirm
                        # the intended nesting.
                        if wordFound:
                            break
                    if wordFound:
                        break
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "End of Pass #2")
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'audioClipFound')
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'wordFound')

        # Pass #3: Find pronunciation for first entry, if no match found
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "Start of Pass #3")
        if audioClipFound == False:
            wordFound = False
            audioClip = DICT_UNICODE_EMPTY_STR
            audioClipWord = DICT_UNICODE_EMPTY_STR
            audioClipPron = DICT_UNICODE_EMPTY_STR
            entries = dictionary.lookup(searchWord, sourceXML)
            for entry in entries:
                # Unlike Pass #1, audioClipPron is not set in this pass.
                for audio in entry.audio:
                    audioClipWord = entry.spelling
                    wordFound = True
                    audioClip = audio
                    audioClipFound = True
                    if wordFound:
                        break
                if wordFound:
                    break
        coutput.print_debug(ERR_DEBUG, _FUNC_NAME_, "End of Pass #3")
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'audioClipFound')
        coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'wordFound')

    except api.WordNotFoundException:
        audioClip = DICT_UNICODE_EMPTY_STR
        audioClipWord = DICT_UNICODE_EMPTY_STR
        audioClipPron = DICT_UNICODE_EMPTY_STR

    # A matched word without audio is reported as "nothing found".
    if not audioClipFound:
        audioClip = DICT_UNICODE_EMPTY_STR
        audioClipWord = DICT_UNICODE_EMPTY_STR
        audioClipPron = DICT_UNICODE_EMPTY_STR

    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'searchWord')
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'audioClipWord')
    coutput.print_watcher(ERR_DEBUG, _FUNC_NAME_, 'audioClip')

    # Return audioClipWord and audioClip, if found
    # (byte strings are converted to unicode first)
    if isinstance(audioClipWord, str):
        audioClipWord = unicode(audioClipWord, 'utf-8')
    if isinstance(audioClip, str):
        audioClip = unicode(audioClip, 'utf-8')
    if isinstance(audioClipPron, str):
        audioClipPron = unicode(audioClipPron, 'utf-8')

    return [audioClipWord, audioClip, audioClipPron]
def override_entry(self, source, entry_word, overrides):
    """Apply override lines to this entry.

    Lines starting with ``#!`` are info lines of the form ``#!Name: value``.
    ``#!Etymology``, ``#!AudioURL``, ``#!Respelling``, ``#!Sentence`` and
    ``#!Examples`` update the matching attribute; every other info line is
    appended to the definitions as ``"<name>: <value>"``. ``#!Word``, when
    present, is also used as form and spelling of an overridden
    pronunciation or respelling, and ``#!Source``, when present, replaces
    ``source`` in the value added to ``self.source``. All remaining
    non-empty lines are override definitions: existing definitions that
    match one of them are dropped and the overrides are put in front.

    Two definitions match when they are equal after stripping one leading
    character that is not ``(`` or alphanumeric and any trailing periods or
    spaces.

    When ``overrides`` is empty the entry is left untouched.

    Args:
        source: Name of the override source.
        entry_word: New value for ``self.entry_word``.
        overrides: List of override lines.
    """
    if len(overrides) > 0:
        self.entry_word = entry_word

        overrideInfo = {}
        overrideDefinitions = []

        # Split the overrides into info lines and plain definitions.
        for override in overrides:
            if override.startswith('#!'):
                override_elements = override.split(':')
                override_name = override_elements[0].strip()
                override_value = re.sub('^#![a-zA-Z0-9]+: ', DICT_UNICODE_EMPTY_STR, override).strip()
                if override_value != DICT_UNICODE_EMPTY_STR:
                    overrideInfo[override_name] = override_value
            else:
                if override != DICT_UNICODE_EMPTY_STR:
                    overrideDefinitions.append(override)

        # Process info lines
        for key in overrideInfo:
            if key == '#!Etymology':
                self.etymology = overrideInfo[key]

            elif key == '#!AudioURL':
                if self.pronunciation is None:
                    self.pronunciation = WordPronunciation(
                        overrideInfo[key])
                else:
                    self.pronunciation.audio_url = overrideInfo[key]

                if '#!Word' in overrideInfo:
                    self.pronunciation.form = overrideInfo['#!Word']
                    self.pronunciation.spelling = overrideInfo['#!Word']

            elif key == '#!Respelling':
                if self.respelling is None:
                    self.respelling = WordRespelling(
                        overrideInfo[key], source)
                else:
                    self.respelling.source = self.respelling.source + ';' + source
                    self.respelling.text = overrideInfo[key]

                if '#!Word' in overrideInfo:
                    self.respelling.form = overrideInfo['#!Word']
                    self.respelling.spelling = overrideInfo['#!Word']

            elif key == '#!Sentence':
                self.usage = [overrideInfo[key]] + self.usage

            elif key == '#!Examples':
                self.examples = overrideInfo[key]

            else:
                self.definitions.append("{}: {}".format(
                    key, overrideInfo[key]))

        # Process #!Source info lines
        if '#!Source' in overrideInfo:
            altSource = overrideInfo['#!Source']
        else:
            altSource = source

        if self.source == DICT_UNICODE_EMPTY_STR:
            self.source = altSource
        else:
            self.source = self.source + ';' + altSource

        # Process definitions
        # Strips a special leading marker and trailing punctuation/spaces.
        normalizePattern = r'(^[^\(a-zA-Z0-9]|[\. ]+$)'

        # Remove duplicate definitions
        for override in overrideDefinitions:
            # Handle overrides that are marked special by the application using a prefix e.g. *
            override_text = re.sub(normalizePattern, DICT_UNICODE_EMPTY_STR, override, flags=re.IGNORECASE)
            coutput.print_watcher('override')
            coutput.print_watcher('override_text')

            # Build a filtered list instead of calling remove() on the list
            # being iterated: removing during iteration skips the element
            # that follows each removal, so adjacent duplicates survived.
            remainingDefinitions = []
            for definition in self.definitions:
                definition_text = re.sub(normalizePattern, DICT_UNICODE_EMPTY_STR, definition, flags=re.IGNORECASE)
                coutput.print_watcher('definition')
                coutput.print_watcher('definition_text')

                if definition_text == override_text:
                    coutput.print_debug("Removed duplicate definition")
                else:
                    remainingDefinitions.append(definition)
            self.definitions = remainingDefinitions

        # Override definitions
        self.definitions = overrideDefinitions + self.definitions
def set_word_entries(self):
    """Parse ``self.entry_raw_text`` into ``cdict.WordEntry`` objects.

    For every ``<entry>`` element a word entry is created from its ``<ew>``
    element and filled from the head word, functional label, etymology,
    respelling, pronunciation, illustration, definition and cross-entry
    elements. Inflections, run-on entries and variants are detached from
    the entry first and then added to the word entry as inflections. Each
    finished word entry is appended to ``self.word_entries``.
    """
    soup = BeautifulSoup(self.entry_raw_text, self.config.parser)
    # Passed to find_all() as a tag-name filter for the main entry elements.
    nameFilter = re.compile(r'(hw|fl|pr|et|sound|def|cx|art)')

    for entry in soup.find_all('entry'):
        # The string below is an unused expression kept as an inline
        # reference to the DTD definition of <entry>.
        """ <!ELEMENT entry (((subj?, art?, formula?, table?), hw, (pr?, pr_alt?, pr_ipa?, pr_wod?, sound?)*, (ahw, (pr, pr_alt?, pr_ipa?, pr_wod?, sound?)?)*, vr?), (fl?, in*, lb*, ((cx, (ss | us)*) | et)*, sl*), (dx | def)*, (list? | (uro*, dro*, ((pl, pt, sa?) | (note) | quote+)*)))> """

        # Capture and exclude miscellaneous entries from main entry:
        # * inflections <in>
        # * defined run-on entries <dro>
        # * undefined run-on entries <uro>
        # * variants <vr>
        miscElements = entry.find_all(['in', 'dro', 'uro', 'vr'])
        # extract() detaches the elements from the tree so that the
        # searches below do not pick up their children.
        [x.extract() for x in entry.findAll(['in', 'dro', 'uro', 'vr'])]

        coutput.print_debug("Process all <ew> elements")
        # NOTE(review): wordEntry is only bound when the entry contains an
        # <ew> element; otherwise the code below would reuse the previous
        # entry's object or fail -- confirm <ew> is always present.
        for element in entry.find_all('ew'):
            elementText = element.get_text().strip()
            wordEntry = cdict.WordEntry(self.config.name, elementText)

        for element in entry.find_all(nameFilter):
            elementText = element.get_text().strip()
            coutput.print_watcher('element.name')
            coutput.print_watcher('elementText')

            if element.name == 'hw':
                coutput.print_debug("Process <hw> element")
                wordEntry.head_word = elementText

            elif element.name == 'fl':
                coutput.print_debug("Process <fl> element")
                wordEntry.functional_label = elementText

            elif element.name == 'et':
                coutput.print_debug("Process <et> element")
                wordEntry.etymology = elementText

            elif element.name == 'pr':
                wordEntry.respelling = self.build_respelling(
                    element, wordEntry.entry_word)

            elif element.name == 'sound':
                wordEntry.pronunciation = self.build_pronunciation(
                    element, wordEntry.entry_word)

            elif element.name == 'art':
                wordEntry.illustrations.extend(
                    self.build_illustrations(element, wordEntry.entry_word))

            elif element.name == 'def':
                coutput.print_debug("Process <def> element")
                wordEntry.senses.extend(self.build_senses(element))

            elif element.name == 'cx':
                # Process cross-entry <cx> elements as inflections
                wordEntry.inflections.append(
                    self.build_cross_entries(element, wordEntry.entry_word))

        # Process previously captured misc. elements from main entry as inflections
        for miscElement in miscElements:
            # NOTE(review): winf is only bound when the misc element
            # contains an <if>, <ure>, <drp> or <va> child -- confirm that
            # is always the case.
            for element in miscElement.find_all(['if', 'ure', 'drp', 'va']):
                elementText = element.get_text().strip()
                winf = cdict.WordInflection(elementText)
                winf.spelling = elementText.replace('*', '')

                if element.name == 'ure':
                    winf.functional_label = "undefined run-on entry"
                elif element.name == 'drp':
                    winf.functional_label = "defined run-on phrase"
                elif element.name == 'va':
                    winf.functional_label = "variant form"

            for element in miscElement.find_all(
                    ['il', 'sound', 'pr', 'def']):
                # eval() is applied to the fixed expression strings
                # assigned just above each call, purely to print
                # "<expression> :: <value>" debug lines.
                DEBUG_VAR = "element.name"
                coutput.print_debug("{0} :: {1}".format(
                    DEBUG_VAR, eval(DEBUG_VAR)))
                elementText = element.get_text().strip()
                DEBUG_VAR = "elementText"
                coutput.print_debug("{0} :: {1}".format(
                    DEBUG_VAR, eval(DEBUG_VAR)))

                if element.name == 'il':
                    winf.functional_label = elementText
                elif element.name == 'pr':
                    winf.respelling = self.build_respelling(
                        element, winf.form)
                elif element.name == 'sound':
                    winf.pronunciation = self.build_pronunciation(
                        element, winf.form)
                elif element.name == 'def':
                    winf.senses.extend(self.build_senses(element))

            wordEntry.inflections.append(winf)

        coutput.print_watcher('wordEntry')
        self.word_entries.append(wordEntry)
# Main Program ################################################################ _FUNC_NAME_ = "main" dictConfig = cdictapi.DictionaryConfig() dictAssist = cdictassist.DictionaryAssistant(dictConfig) connectionPool = urllib3.PoolManager(10, headers=SDO_USER_AGENT) logEntries = cfile.read(SDO_LIST_FILE).splitlines() print "Downloading overrides ..." for entry in logEntries: coutput.print_watcher(SDO_ERR_DEBUG, _FUNC_NAME_, 'entry') logValues = entry.split(':') word = logValues[1] if not os.path.isfile(SDO_OVERRIDE_ENTRY_FILE.format(WORD=word)): cfile.write(SDO_OVERRIDE_ENTRY_FILE.format(WORD=word), dictAssist.download_entry(connectionPool, word)) wordEntry = cfile.read(SDO_OVERRIDE_ENTRY_FILE.format(WORD=word)) wordDictionary = cdictapi.DictionaryEntry(dictConfig, word, wordEntry) coutput.print_watcher(SDO_ERR_DEBUG, _FUNC_NAME_, 'wordEntry') SDO_ERR_DEFN_MISSING = False SDO_ERR_CLIP_MISSING = False
SB_ERR_DEBUG = False ################################################################ # Main Program ################################################################ _FUNC_NAME_ = "main" connectionPool = urllib3.PoolManager(10, headers=SDO_USER_AGENT) logEntries = cfile.read(SDO_LIST_FILE).splitlines() print "Downloading overrides ..." for entry in logEntries: coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'entry') logValues = entry.split(':') word = logValues[1] wordEntry = cdict.fetch_dictionary_entry(connectionPool, word) coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'wordEntry') SDO_ERR_DEFN_MISSING = False SDO_ERR_CLIP_MISSING = False print unicode("Word: {0}\t{1}", 'utf-8').format(word, logValues[2]) if SDO_ERR_DEFN_REGEX_PATTERN.match(logValues[2]): coutput.print_watcher(SB_ERR_DEBUG, _FUNC_NAME_, 'wordEntry[1]') if len(wordEntry[1]) > 0:
setRowElements = moduleTableElement.find_elements_by_xpath("//table/tbody/tr") setCounter = 0 setEntries = [] processFlag = False for setRowElement in setRowElements: setCounter = setCounter + 1 setColElement = setRowElement.find_element_by_xpath(".//td") setName = setColElement.text.strip().lower().replace(" ", "-") setID = "{:03d}-".format(setCounter) + setName setURL = setRowElement.find_element_by_xpath(".//a[contains(@href,'&bt=r') and not(contains(@href,'_test_'))]").get_property("href") print("Checking set {}.".format(setName)) coutput.print_watcher("setID") coutput.print_watcher("setURL") if len(APP_SELECT_SET_LIST) > 0: if setName in APP_SELECT_SET_LIST: processFlag = True else: processFlag = False else: if APP_START_SET_NM == APP_EMPTY_STRING: processFlag = True elif setName == APP_START_SET_NM: processFlag = True if setName == APP_STOP_SET_NM: processFlag = False
currOrigin = currOrigin + u'; ' + elementText elif u'ety-sl' in element['class']: elementText = element.get_text().strip() if currOrigin == cdict.DICT_UNICODE_EMPTY_STR: currOrigin = elementText else: currOrigin = currOrigin + elementText for entry in soup.find_all(self.config.is_required_element): if entry.name == u'div' and entry.has_attr(u'class') and any( re.compile(ur'entry-.*').match(x) for x in entry.attrs[u'class']): coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'entry') # Process head word: <h1 class="hword"> or <p class="hword"> for element in entry.find_all(class_="hword"): elementText = element.get_text().strip() currEntryWord = elementText #coutput.print_watcher(MOD_ERR_DEBUG, _FUNC_NAME_, 'currEntryWord') # Process functional label: <span class="fl"> for element in entry.find_all('span', class_="fl"): elementText = element.get_text().strip() elementText = re.sub(ur'[ ]*\(.*$', u'', elementText, flags=re.UNICODE) currFuncLabel = elementText