Пример #1
0
    def get_xml_list_dict(self, directory, corpora):
        """
        Returns a dictionary with xml file list.
            keys are character codes.
            values are arrays of xml files.

        Files are searched exactly two levels below the root directory,
        i.e. <directory>/<corpus>/<sub-directory>/<code>.<extension>, and
        the character code is the part of the file name located before
        the first dot.

        directory: root directory
        corpora: corpora list to restrict to

        Raises ValueError if the part of a file name located before the
        first dot is not an integer.
        """
        xml_files = SortedDict()
        pattern = os.path.join(directory, "*", "*", "*.*")

        for path in glob.glob(pattern):
            # The corpus is the grand-parent directory of the file. It is
            # extracted with os.path rather than by splitting on "/" so
            # that it also works where the path separator is not "/".
            corpus_dir = os.path.dirname(os.path.dirname(path))
            corpus_name = os.path.basename(corpus_dir)

            # exclude data which are not in the wanted corpora
            if corpus_name not in corpora:
                continue

            char_code = int(os.path.basename(path).split(".")[0])

            # Explicit item assignment (rather than setdefault) so that
            # SortedDict's own item assignment logic, if any, is used.
            if char_code not in xml_files:
                xml_files[char_code] = []
            xml_files[char_code].append(path)

        return xml_files
Пример #2
0
class CharacterCollection(_XmlBase):
    """
    A collection of characters is composed of sets.
    Each set can have zero, one, or more characters.

    Sets are identified by their name. Characters of a set are kept in a
    list, in insertion order.
    """

    DTD = \
"""
<!ELEMENT character-collection (set*)>
<!ELEMENT set (character*)>

<!-- The name attribute identifies a set uniquely -->
<!ATTLIST set name CDATA #REQUIRED>

<!ELEMENT character (utf8?,width?,height?,strokes)>
<!ELEMENT utf8 (#PCDATA)>
<!ELEMENT width (#PCDATA)>
<!ELEMENT height (#PCDATA)>
<!ELEMENT strokes (stroke+)>
<!ELEMENT stroke (point+)>
<!ELEMENT point EMPTY>

<!ATTLIST point x CDATA #REQUIRED>
<!ATTLIST point y CDATA #REQUIRED>
<!ATTLIST point timestamp CDATA #IMPLIED>
<!ATTLIST point pressure CDATA #IMPLIED>
<!ATTLIST point xtilt CDATA #IMPLIED>
<!ATTLIST point ytilt CDATA #IMPLIED>
"""

    def __init__(self):
        # Maps a set name to the list of characters of that set.
        self._characters = SortedDict()

    @staticmethod
    def from_character_directory(directory,
                                 extensions=("xml", "bz2", "gz"),
                                 recursive=True):
        """
        Creates a character collection from a directory containing
        individual character files.

        Each character is stored in the set named after its utf8 value,
        or in the set "Unknown" when it has none. A character which is
        already present in its set is not added a second time.

        directory: directory to read the character files from
        extensions: file extensions (without the dot) to take into
                    account; they are inserted as is in a regular
                    expression
        recursive: whether to descend into sub-directories

        Files which make Character.read raise ValueError are ignored.
        """
        regexp = re.compile(r"\.(%s)$" % "|".join(extensions))
        charcol = CharacterCollection()

        for name in os.listdir(directory):
            full_path = os.path.join(directory, name)

            if os.path.isdir(full_path) and recursive:
                charcol += CharacterCollection.from_character_directory(
                               full_path, extensions)
            elif regexp.search(full_path):
                char = Character()
                is_gzip = full_path.endswith(".gz")
                is_bz2 = full_path.endswith(".bz2")

                try:
                    char.read(full_path, gzip=is_gzip, bz2=is_bz2)
                except ValueError:
                    continue # ignore malformed XML files

                utf8 = char.get_utf8()
                if utf8 is None:
                    utf8 = "Unknown"

                charcol.add_set(utf8)
                if char not in charcol.get_characters(utf8):
                    charcol.append_character(utf8, char)

        return charcol

    def __add__(self, other):
        """
        Returns a new collection containing the sets and characters of
        both operands. Within a set, a character which is already
        present is not added a second time. The operands are left
        untouched.
        """
        new = CharacterCollection()
        for charcol in (self, other):
            for set_name in charcol.get_set_list():
                new.add_set(set_name)
                # This is the list stored in new, so it also reflects
                # the characters appended during the loop below.
                characters = new.get_characters(set_name)
                for char in charcol.get_characters(set_name):
                    if char not in characters:
                        new.append_character(set_name, char)
        return new

    def add_set(self, set_name):
        """
        Creates an empty set, unless a set of that name already exists.
        """
        if set_name not in self._characters:
            self._characters[set_name] = []

    def remove_set(self, set_name):
        """
        Removes a set and its characters. Unknown sets are ignored.
        """
        if set_name in self._characters:
            del self._characters[set_name]

    def get_set_list(self):
        """
        Returns the set names.
        """
        return self._characters.keys()

    def get_characters(self, set_name):
        """
        Returns the character list of a set (the list stored in the
        collection, not a copy), or a new empty list if the set does
        not exist.
        """
        if set_name in self._characters:
            return self._characters[set_name]
        return []

    def get_all_characters(self):
        """
        Returns a new list with the characters of all sets.
        """
        characters = []
        for set_name in self._characters.keys():
            characters += self._characters[set_name]
        return characters

    def set_characters(self, set_name, characters):
        """
        Sets the character list of a set, replacing the previous one if
        the set already exists.
        """
        self._characters[set_name] = characters

    def append_character(self, set_name, character):
        """
        Appends a character to a set. The set is created if needed.
        """
        self.add_set(set_name)
        self._characters[set_name].append(character)

    def insert_character(self, set_name, i, character):
        """
        Inserts a character at position i of a set. If the set does not
        exist, it is created with the character as only element.
        """
        if set_name not in self._characters:
            self._characters[set_name] = [character]
        else:
            self._characters[set_name].insert(i, character)

    def remove_character(self, set_name, i):
        """
        Removes the character at position i of a set. Nothing happens
        if the set does not exist or if i is past the end of the set.
        """
        if set_name in self._characters:
            characters = self._characters[set_name]
            if i < len(characters):
                del characters[i]

    def remove_last_character(self, set_name):
        """
        Removes the last character of a set. Nothing happens if the set
        does not exist or is empty.
        """
        if set_name in self._characters:
            characters = self._characters[set_name]
            if len(characters) > 0:
                del characters[-1]

    def replace_character(self, set_name, i, character):
        """
        Replaces the character at position i of a set. Nothing happens
        if the set does not exist or if i is past the end of the set.
        """
        if set_name in self._characters:
            characters = self._characters[set_name]
            if i < len(characters):
                # Assign in place: removing then inserting at i would
                # put the character one slot too early for negative i.
                characters[i] = character

    def to_xml(self):
        """
        Returns the collection as an XML string following the structure
        described by DTD (the width and height elements, if any, come
        from the XML of each writing).

        The set names and utf8 values are escaped so that values
        containing &, < or > (and " for set names) still give
        well-formed XML.
        """
        from xml.sax.saxutils import escape

        parts = ["<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
                 "<character-collection>\n"]

        for set_name in self._characters.keys():
            # The attribute is delimited by double quotes, so they need
            # to be escaped too.
            name_attr = escape("%s" % set_name, {'"': "&quot;"})
            parts.append("<set name=\"%s\">\n" % name_attr)

            for character in self._characters[set_name]:
                parts.append("  <character>\n")

                utf8 = character.get_utf8()
                if utf8:
                    parts.append("    <utf8>%s</utf8>\n"
                                 % escape("%s" % utf8))

                # The XML of the writing is indented to be nested in
                # the character element.
                for line in character.get_writing().to_xml().split("\n"):
                    parts.append("    %s\n" % line)

                parts.append("  </character>\n")

            parts.append("</set>\n")

        parts.append("</character-collection>\n")

        return "".join(parts)

    # Private...

    def _start_element(self, name, attrs):
        """
        Handles an opening tag (presumably called by the XML parser set
        up by _XmlBase; to be confirmed).

        name: tag name
        attrs: mapping of the attributes of the tag

        Raises ValueError if the first tag is not <character-collection>,
        if a <set> has no name attribute or if a <point> is not inside a
        <stroke>.
        """
        self._tag = name

        # _first_tag is not initialised in this class; it is expected to
        # be set before parsing starts.
        if self._first_tag:
            self._first_tag = False
            if self._tag != "character-collection":
                raise ValueError(
                      "The very first tag should be <character-collection>")

        if self._tag == "set":
            if "name" not in attrs:
                raise ValueError("<set> should have a name attribute")

            self._curr_set_name = attrs["name"].encode("UTF-8")
            self._curr_chars = []

        if self._tag == "character":
            self._curr_char = Character()
            self._curr_writing = self._curr_char.get_writing()
            self._curr_width = None
            self._curr_height = None
            self._curr_utf8 = None

        if self._tag == "stroke":
            self._curr_stroke = Stroke()

        elif self._tag == "point":
            if getattr(self, "_curr_stroke", None) is None:
                raise ValueError("<point> should be inside a <stroke>")

            point = Point()

            for key in ("x", "y", "pressure", "xtilt", "ytilt", "timestamp"):
                if key in attrs:
                    value = attrs[key].encode("UTF-8")
                    if key in ("pressure", "xtilt", "ytilt"):
                        value = float(value)
                    else:
                        # x, y and timestamp are stored as integers
                        value = int(float(value))
                else:
                    value = None

                setattr(point, key, value)

            self._curr_stroke.append_point(point)

    def _end_element(self, name):
        """
        Handles a closing tag (presumably called by the XML parser set
        up by _XmlBase; to be confirmed).

        name: tag name
        """
        if name == "character-collection":
            # Parsing is over: get rid of the temporary parsing state.
            for s in ["_tag", "_curr_char", "_curr_writing", "_curr_width",
                      "_curr_height", "_curr_utf8", "_curr_stroke",
                      "_curr_chars", "_curr_set_name"]:
                if s in self.__dict__:
                    del self.__dict__[s]

        if name == "set":
            self.set_characters(self._curr_set_name, self._curr_chars)

        if name == "character":
            if self._curr_utf8:
                self._curr_char.set_utf8(self._curr_utf8)
            if self._curr_width:
                self._curr_writing.set_width(self._curr_width)
            if self._curr_height:
                self._curr_writing.set_height(self._curr_height)
            self._curr_chars.append(self._curr_char)

        if name == "stroke":
            # Empty strokes are dropped.
            if len(self._curr_stroke) > 0:
                self._curr_writing.append_stroke(self._curr_stroke)
            # No stroke is open anymore, so that a point located outside
            # a stroke cannot be added to the stroke which just ended.
            self._curr_stroke = None

        self._tag = None

    def _char_data(self, data):
        """
        Handles the text found inside the current tag (presumably called
        by the XML parser set up by _XmlBase; to be confirmed).

        data: text, possibly only a part of the text of the tag
        """
        if self._tag == "utf8":
            # The text of a tag may be delivered in several calls (expat
            # does so, e.g. around entities), so the parts are
            # accumulated instead of keeping the last one only.
            chunk = data.encode("UTF-8")
            if self._curr_utf8 is None:
                self._curr_utf8 = chunk
            else:
                self._curr_utf8 += chunk
        # NOTE(review): width and height still assume that their text is
        # delivered in one call.
        if self._tag == "width":
            self._curr_width = int(data)
        elif self._tag == "height":
            self._curr_height = int(data)