def get_xml_list_dict(self, directory, corpora):
    """
    Returns a dictionary with xml file list.

    The files are looked up with the pattern directory/*/*/*.* and
    grouped by character code.

    directory: root directory
    corpora: corpora list to restrict to. The corpus name of a file is
             the name of the directory two levels above it (i.e. the
             first level below the root directory).

    Returns a SortedDict. Keys are character codes (the part of the file
    name before the first dot, converted to int). Values are lists of
    file paths.

    Raises ValueError if the part of a file name before the first dot
    is not an integer.
    """
    xml_files = SortedDict()

    for path in glob.glob(os.path.join(directory, "*", "*", "*.*")):
        # Walk up with os.path rather than splitting on "/" so that the
        # corpus name is found whatever the platform's path separator is.
        corpus_dir = os.path.dirname(os.path.dirname(path))
        corpus_name = os.path.basename(corpus_dir)

        # exclude data which are not in the wanted corpora
        if corpus_name not in corpora:
            continue

        char_code = int(os.path.basename(path).split(".")[0])

        # Plain item assignment (not setdefault) so that any key
        # bookkeeping SortedDict does on assignment still happens.
        # NOTE(review): assumes SortedDict supports the "in" operator
        # like a regular dict - confirm against its definition.
        if char_code not in xml_files:
            xml_files[char_code] = []
        xml_files[char_code].append(path)

    return xml_files
class CharacterCollection(_XmlBase):
    """
    A collection of characters is composed of sets.

    Each set is identified by its name and can have zero, one, or more
    characters. Sets are stored in a SortedDict mapping the set name to
    the list of its characters.
    """

    # Document type definition of the character-collection XML format.
    # Declarations are separated by single spaces.
    DTD = (
        " <!ELEMENT character-collection (set*)>"
        " <!ELEMENT set (character*)>"
        " <!-- The name attribute identifies a set uniquely -->"
        " <!ATTLIST set name CDATA #REQUIRED>"
        " <!ELEMENT character (utf8?,width?,height?,strokes)>"
        " <!ELEMENT utf8 (#PCDATA)>"
        " <!ELEMENT width (#PCDATA)>"
        " <!ELEMENT height (#PCDATA)>"
        " <!ELEMENT strokes (stroke+)>"
        " <!ELEMENT stroke (point+)>"
        " <!ELEMENT point EMPTY>"
        " <!ATTLIST point x CDATA #REQUIRED>"
        " <!ATTLIST point y CDATA #REQUIRED>"
        " <!ATTLIST point timestamp CDATA #IMPLIED>"
        " <!ATTLIST point pressure CDATA #IMPLIED>"
        " <!ATTLIST point xtilt CDATA #IMPLIED>"
        " <!ATTLIST point ytilt CDATA #IMPLIED>"
        " "
    )

    def __init__(self):
        # set name -> list of characters
        self._characters = SortedDict()

    @staticmethod
    def from_character_directory(directory,
                                 extensions=("xml", "bz2", "gz"),
                                 recursive=True):
        """
        Creates a character collection from a directory containing
        individual character files.

        directory: directory to scan
        extensions: file name extensions (without the dot) of the files
                    to load. They are inserted as-is in a regular
                    expression.
        recursive: whether sub-directories are scanned too

        Each character is put in the set named after its utf8 value, or
        in the set "Unknown" if it has none. A character equal to one
        already present in its set is not added again. Files for which
        reading raises ValueError are ignored.
        """
        regexp = re.compile(r"\.(%s)$" % "|".join(extensions))
        charcol = CharacterCollection()

        for name in os.listdir(directory):
            full_path = os.path.join(directory, name)

            if os.path.isdir(full_path):
                # A directory is never a character file, even if its
                # name ends with one of the extensions.
                if recursive:
                    charcol += CharacterCollection.from_character_directory(
                        full_path, extensions)
                continue

            if not regexp.search(full_path):
                continue

            char = Character()
            is_gzip = full_path.endswith(".gz")
            is_bz2 = full_path.endswith(".bz2")

            try:
                char.read(full_path, gzip=is_gzip, bz2=is_bz2)
            except ValueError:
                continue  # ignore malformed XML files

            utf8 = char.get_utf8()
            if utf8 is None:
                utf8 = "Unknown"

            charcol.add_set(utf8)
            if char not in charcol.get_characters(utf8):
                charcol.append_character(utf8, char)

        return charcol

    def __add__(self, other):
        """
        Returns a new collection containing the sets of both operands.

        Within a set, a character equal to one already added is skipped.
        Neither operand is modified.
        """
        new = CharacterCollection()

        for charcol in (self, other):
            for set_name in charcol.get_set_list():
                new.add_set(set_name)
                # This is the list stored in "new", so it reflects the
                # characters appended below.
                characters = new.get_characters(set_name)

                for char in charcol.get_characters(set_name):
                    if char not in characters:
                        new.append_character(set_name, char)

        return new

    def add_set(self, set_name):
        """Creates an empty set, unless a set with that name exists."""
        if set_name not in self._characters:
            self._characters[set_name] = []

    def remove_set(self, set_name):
        """Removes a set and its characters. Unknown names are ignored."""
        if set_name in self._characters:
            del self._characters[set_name]

    def get_set_list(self):
        """Returns the names of the sets."""
        return self._characters.keys()

    def get_characters(self, set_name):
        """
        Returns the list of characters of a set.

        The list stored in the collection is returned (not a copy). An
        empty list is returned if the set does not exist.
        """
        if set_name in self._characters:
            return self._characters[set_name]
        return []

    def get_all_characters(self):
        """Returns a new list with the characters of all sets."""
        characters = []
        for set_name in self._characters.keys():
            characters += self._characters[set_name]
        return characters

    def set_characters(self, set_name, characters):
        """Sets the characters of a set, replacing any existing ones."""
        self._characters[set_name] = characters

    def append_character(self, set_name, character):
        """Appends a character to a set. The set is created if needed."""
        if set_name not in self._characters:
            self._characters[set_name] = []
        self._characters[set_name].append(character)

    def insert_character(self, set_name, i, character):
        """
        Inserts a character at index i of a set.

        If the set does not exist, it is created and the character
        becomes its only element.
        """
        if set_name not in self._characters:
            self._characters[set_name] = [character]
        else:
            self._characters[set_name].insert(i, character)

    def remove_character(self, set_name, i):
        """
        Removes the character at index i of a set.

        Nothing happens if the set does not exist or if i is past the
        end of the set.
        """
        if set_name in self._characters:
            if len(self._characters[set_name]) - 1 >= i:
                del self._characters[set_name][i]

    def remove_last_character(self, set_name):
        """Removes the last character of a set, if there is one."""
        if set_name in self._characters:
            if len(self._characters[set_name]) > 0:
                del self._characters[set_name][-1]

    def replace_character(self, set_name, i, character):
        """
        Replaces the character at index i of a set.

        Nothing happens if the set does not exist or if i is past the
        end of the set.
        """
        if set_name in self._characters:
            if len(self._characters[set_name]) - 1 >= i:
                self.remove_character(set_name, i)
                self.insert_character(set_name, i, character)

    def to_xml(self):
        """
        Returns the collection as an XML string.

        Set names and utf8 values are escaped so that characters such as
        "&", "<" or a double quote do not produce malformed XML.
        """
        from xml.sax.saxutils import escape

        parts = ["<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
                 "<character-collection>\n"]

        for set_name in self._characters.keys():
            # Format first, then escape, so that set names which are not
            # strings are still accepted. The double quote must be
            # escaped too since the value goes in an attribute.
            attr_value = escape("%s" % set_name, {"\"": "&quot;"})
            parts.append("<set name=\"%s\">\n" % attr_value)

            for character in self._characters[set_name]:
                parts.append(" <character>\n")

                utf8 = character.get_utf8()
                if utf8:
                    parts.append(" <utf8>%s</utf8>\n" % escape("%s" % utf8))

                for line in character.get_writing().to_xml().split("\n"):
                    parts.append(" %s\n" % line)

                parts.append(" </character>\n")

            parts.append("</set>\n")

        parts.append("</character-collection>\n")

        return "".join(parts)

    # Private...

    def _start_element(self, name, attrs):
        """
        Parser callback called when an element is opened.

        name: tag name
        attrs: attributes of the element (attribute name -> value)

        Raises ValueError if the first tag of the document is not
        <character-collection> or if a <set> has no name attribute.
        """
        self._tag = name

        if self._first_tag:
            self._first_tag = False
            if self._tag != "character-collection":
                raise ValueError(
                    "The very first tag should be <character-collection>")

        if self._tag == "set":
            if "name" not in attrs:
                raise ValueError("<set> should have a name attribute")

            self._curr_set_name = attrs["name"].encode("UTF-8")
            self._curr_chars = []

        if self._tag == "character":
            self._curr_char = Character()
            self._curr_writing = self._curr_char.get_writing()
            self._curr_width = None
            self._curr_height = None
            self._curr_utf8 = None

        if self._tag == "stroke":
            self._curr_stroke = Stroke()

        elif self._tag == "point":
            point = Point()

            for key in ("x", "y", "pressure", "xtilt", "ytilt", "timestamp"):
                if key in attrs:
                    value = attrs[key].encode("UTF-8")
                    if key in ("pressure", "xtilt", "ytilt"):
                        value = float(value)
                    else:
                        # x, y and timestamp are integers but may be
                        # written with a decimal part
                        value = int(float(value))
                else:
                    value = None

                setattr(point, key, value)

            self._curr_stroke.append_point(point)

    def _end_element(self, name):
        """
        Parser callback called when an element is closed.

        name: tag name
        """
        if name == "character-collection":
            # The document is finished: drop the parsing state.
            for attr in ["_tag", "_curr_char", "_curr_writing",
                         "_curr_width", "_curr_height", "_curr_utf8",
                         "_curr_stroke", "_curr_chars", "_curr_set_name"]:
                if attr in self.__dict__:
                    del self.__dict__[attr]

        if name == "set":
            self.set_characters(self._curr_set_name, self._curr_chars)

        if name == "character":
            if self._curr_utf8:
                self._curr_char.set_utf8(self._curr_utf8)
            if self._curr_width:
                self._curr_writing.set_width(self._curr_width)
            if self._curr_height:
                self._curr_writing.set_height(self._curr_height)
            self._curr_chars.append(self._curr_char)

        if name == "stroke":
            # Empty strokes are dropped.
            if len(self._curr_stroke) > 0:
                self._curr_writing.append_stroke(self._curr_stroke)
            # Reset the attribute actually used while parsing, so that
            # no stray attribute is left on the instance.
            self._curr_stroke = None

        self._tag = None

    def _char_data(self, data):
        """
        Parser callback called with the text found in the current element.

        data: text content

        Only the content of <utf8>, <width> and <height> is used.
        """
        if self._tag == "utf8":
            self._curr_utf8 = data.encode("UTF-8")

        if self._tag == "width":
            self._curr_width = int(data)
        elif self._tag == "height":
            self._curr_height = int(data)