def parsedefinition(self, raw_definition, tonedchars_callback=None):
    """Parse a raw slash-delimited dictionary definition into structured data.

    raw_definition is a CEDICT-style string such as "/meaning 1/meaning 2/CL:...".
    tonedchars_callback, if given, converts a plain character string into a list
    of Word tokens; it defaults to wrapping the characters in a single Word(Text).

    Returns a pair (meanings, measurewords) where meanings is a list of
    token-lists (one per non-measure-word definition) and measurewords is a
    list of (characterswords, pinyinwords) pairs parsed from "CL:" entries.
    """
    log.info("Parsing the raw definition %s", raw_definition)

    # Default the toned characters callback to something sensible
    if tonedchars_callback is None:
        tonedchars_callback = lambda characters: [Word(Text(characters))]

    meanings, measurewords = [], []
    # Strip the leading/trailing slashes before splitting into definitions
    for definition in raw_definition.strip().lstrip("/").rstrip("/").split("/"):
        # Remove stray spaces
        definition = definition.strip()

        if definition.startswith("CL:"):
            # This "definition" is actually a comma-separated list of measure words
            # (FIX: dropped the unused `ismeasureword = True` dead assignment)
            for mw in definition[3:].strip().split(","):
                # Attempt to parse the measure word as structured data
                match = self.embeddedchineseregex.match(mw)
                if match is None:
                    log.info("Could not parse the apparent measure word %s", mw)
                    continue

                # They SHOULD have pinyin information
                characterswords, pinyinwords = self.formatmatch(match, tonedchars_callback)
                if characterswords is None or pinyinwords is None:
                    log.info("The measure word %s was missing some information in the dictionary", mw)
                    continue

                measurewords.append((characterswords, pinyinwords))
        else:
            # An ordinary meaning: interleave embedded Chinese matches with plain text
            words = []
            for ismatch, thing in utils.regexparse(self.embeddedchineseregex, definition):
                if ismatch:
                    # A match - we can append a representation of the words it contains
                    (characterwords, pinyinwords) = self.formatmatch(thing, tonedchars_callback)

                    # Put the resulting words right into the output in a human-readable format
                    words.extend(characterwords)
                    if pinyinwords is not None:
                        words.append(Word(Text(" - ")))
                        words.extend(pinyinwords)
                else:
                    # Just a string: append it as a list of tokens, trying to extract any
                    # otherwise-unmarked pinyin in the sentence for colorisation etc
                    words.append(Word(*tokenize(thing, forcenumeric=True)))

            meanings.append(words)

    return meanings, measurewords
def reformatmeaning(self, meaning): output = u"" for recognised, match in utils.regexparse(re.compile(ur"\(([0-9]+)\)"), meaning): if recognised: # Should reformat the number output += self.config.meaningnumber(int(match.group(1))) else: # Output is just unicode, append it directly output += match
def reformataudio(self, audio): output = u"" for recognised, match in utils.regexparse(re.compile(ur"\[sound:([^\]]*)\]"), audio): if recognised: # Must be a sound tag - leave it well alone output += match.group(0) else: # Process as if this non-sound tag were a reading, in order to turn it into some tags output += generateaudio(self.notifier, self.mediamanager, self.config, [model.Word(*model.tokenize(match))])
def parsedefinition(self, raw_definition, tonedchars_callback=None):
    """Parse a raw slash-delimited dictionary definition into structured data.

    raw_definition is a CEDICT-style string such as "/meaning 1/meaning 2/CL:...".
    tonedchars_callback, if given, converts a plain character string into a list
    of Word tokens; it defaults to wrapping the characters in a single Word(Text).

    Returns a pair (meanings, measurewords) where meanings is a list of
    token-lists (one per non-measure-word definition) and measurewords is a
    list of (characterswords, pinyinwords) pairs parsed from "CL:" entries.
    """
    log.info("Parsing the raw definition %s", raw_definition)

    # Default the toned characters callback to something sensible
    if tonedchars_callback is None:
        tonedchars_callback = lambda characters: [Word(Text(characters))]

    meanings, measurewords = [], []
    # Strip the leading/trailing slashes before splitting into definitions
    for definition in raw_definition.strip().lstrip("/").rstrip("/").split("/"):
        # Remove stray spaces
        definition = definition.strip()

        if definition.startswith("CL:"):
            # This "definition" is actually a comma-separated list of measure words
            # (FIX: dropped the unused `ismeasureword = True` dead assignment)
            for mw in definition[3:].strip().split(","):
                # Attempt to parse the measure word as structured data
                match = self.embeddedchineseregex.match(mw)
                if match is None:
                    log.info("Could not parse the apparent measure word %s", mw)
                    continue

                # They SHOULD have pinyin information
                characterswords, pinyinwords = self.formatmatch(match, tonedchars_callback)
                if characterswords is None or pinyinwords is None:
                    log.info("The measure word %s was missing some information in the dictionary", mw)
                    continue

                measurewords.append((characterswords, pinyinwords))
        else:
            # An ordinary meaning: interleave embedded Chinese matches with plain text
            words = []
            for ismatch, thing in utils.regexparse(self.embeddedchineseregex, definition):
                if ismatch:
                    # A match - we can append a representation of the words it contains
                    (characterwords, pinyinwords) = self.formatmatch(thing, tonedchars_callback)

                    # Put the resulting words right into the output in a human-readable format
                    words.extend(characterwords)
                    if pinyinwords is not None:
                        words.append(Word(Text(" - ")))
                        words.extend(pinyinwords)
                else:
                    # Just a string: append it as a list of tokens, trying to extract any
                    # otherwise-unmarked pinyin in the sentence for colorisation etc
                    words.append(Word(*tokenize(thing, forcenumeric=True)))

            meanings.append(words)

    return meanings, measurewords
def tokenizetext(text, forcenumeric): # To recognise pinyin amongst the rest of the text, for now just look for maximal # sequences of alphanumeric characters as defined by Unicode. This should catch # the pinyin, its tone marks, tone numbers (if any) and allow umlauts. tokens = [] for recognised, match in utils.regexparse(re.compile(u"(\w|:)+", re.UNICODE), text): if recognised: tokens.extend(tokenizeonewitherhua(match.group(0), forcenumeric=forcenumeric)) else: tokens.append(Text(match)) # TODO: could be much smarter about segmentation here. For example, we could use the # pinyin regex to split up run on groups of pinyin-like characters. return tokens