Exemplo n.º 1
0
    def endElement(self, name):
        """
        Signals the end of an element.
        The text and attributes buffered on the instance are turned into sentence
        objects here; when the sentence element closes, the feature generators
        are applied and the resulting sentence is written to the output.
        @param name: the name of the element
        @type name: str 
        """
        #join and trim into a local, so that the buffer itself is never replaced by a string:
        #it is reset to a list below and has to stay one for elements not handled here as well
        parsed_text = "".join(self.ss_text).strip()
        
        #all of the elements have to be declared here
        #for each element, create the objects and clear "buffers"
        if name == self.IN_TAG["src"]:
            self.src = SimpleSentence(parsed_text, self.ss_attributes)
            self.ss_text = []
        elif name == self.IN_TAG["tgt"]:
            self.tgt.append(SimpleSentence(parsed_text, self.ss_attributes))
            self.ss_text = []
        elif name == self.IN_TAG["sent"]:
            #when the judged sentence gets closed, all previously inserted data have to be converted to objects 
            parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

            #apply feature generators, each one hands back the sentence to be used from then on
            for fg in self.feature_generators:
                parallelsentence = fg.add_features_parallelsentence(parallelsentence)
            
            #NOTE(review): the source is taken from the buffer and not from the processed
            #parallelsentence; confirm that feature generators never replace the source object
            src = self.src

            #display modifications on output file
            XMLGenerator._write(self, "\n\t")
            XMLGenerator.startElement(self, name, parallelsentence.get_attributes())
                        
            XMLGenerator._write(self, "\n\t\t")
            XMLGenerator.startElement(self, self.OUT_TAG["src"], src.get_attributes())
            XMLGenerator.characters(self, src.get_string())
            XMLGenerator.endElement(self, self.OUT_TAG["src"])
            
            for tgt in parallelsentence.get_translations():
                XMLGenerator._write(self, "\n\t\t")
                XMLGenerator.startElement(self, self.OUT_TAG["tgt"], tgt.get_attributes())
                XMLGenerator.characters(self, tgt.get_string())
                XMLGenerator.endElement(self, self.OUT_TAG["tgt"])
            
            XMLGenerator._write(self, "\n\t")
            XMLGenerator.endElement(self, name)
    def endElement(self, name):
        """
        Signals the end of an element.
        The text and attributes buffered on the instance are turned into sentence
        objects here; when the sentence element closes, the feature generators
        are applied and the resulting sentence is written to the output.
        Additionally one line is written to self.tab_file per sentence: the string
        of the first translation whose "rank" attribute equals 1, or an empty line
        if there is no such translation.
        @param name: the name of the element
        @type name: str 
        """
        
        #get rid of annoying leading spaces
        self.ss_text = self.ss_text.strip()
        
        #all of the elements have to be declared here
        #for each element, create the objects and clear "buffers"
        if name == self.TAG_SRC:
            self.src = SimpleSentence(self.ss_text, self.ss_attributes)
            self.ss_text = u""
        elif name == self.TAG_TGT:
            self.tgt.append(SimpleSentence(self.ss_text, self.ss_attributes))
            self.ss_text = u""
        elif name == self.TAG_SENT:
            #when the judged sentence gets closed, all previously inserted data have to be converted to objects 
            parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)

            #apply feature generators, each one hands back the sentence to be used from then on
            for fg in self.feature_generators:
                parallelsentence = fg.add_features_parallelsentence(parallelsentence)
            
            src = parallelsentence.get_source()

            #display modifications on output file
            XMLGenerator._write(self, "\n\t")
            XMLGenerator.startElement(self, name, parallelsentence.get_attributes())
                        
            XMLGenerator._write(self, "\n\t\t")
            XMLGenerator.startElement(self, self.TAG_SRC, src.get_attributes())
            XMLGenerator.characters(self, src.get_string())
            XMLGenerator.endElement(self, self.TAG_SRC)
            
            #line for the tab file, stays empty if no translation is ranked first
            tab_entry = "\n"
            found_best = False

            for tgt in parallelsentence.get_translations():
                XMLGenerator._write(self, "\n\t\t")
                XMLGenerator.startElement(self, self.TAG_TGT, tgt.get_attributes())
                XMLGenerator.characters(self, tgt.get_string())
                XMLGenerator.endElement(self, self.TAG_TGT)

                #only the first translation with rank 1 goes to the tab file
                if int(tgt.get_attribute("rank")) == 1 and not found_best:
                    tab_entry = "%s\n" % tgt.get_string()
                    found_best = True 
                
            if not found_best:
                #parenthesised single argument: prints the same text on Python 2 and 3
                print("ERROR: didn't find best ranked sentence")
            XMLGenerator._write(self, "\n\t")
            XMLGenerator.endElement(self, name)
            self.tab_file.write(tab_entry)
Exemplo n.º 3
0
    def write_to_file(self, filename):
        '''
        XML output is written to the desired file.
        The document is first written to "<filename>.tmp", which is moved over
        the target only after the whole document has been written.
        @param filename: path of the file to be written
        @type filename: str
        '''
        def stringify(attributes):
            #attribute values are converted to strings before they are handed to the generator
            return dict([(k, str(v)) for k, v in attributes.iteritems()])

        def write_sentence(generator, tag, sentence):
            #one simple sentence as an element of its own, on a new double-indented line
            generator._write("\n\t\t")
            generator.startElement(tag, stringify(sentence.get_attributes()))
            generator.characters(c(sentence.get_string()))
            generator.endElement(tag)

        tempfilename = "%s.tmp" % filename 
        #the with block closes the temporary file even if writing fails half way
        with open(tempfilename, 'w') as f:
            generator = XMLGenerator(f, "utf-8")
            generator.startDocument()
            generator.startElement(self.TAG["doc"], {})

            for parallelsentence in self.parallelsentences:
                generator.characters("\n\t")
                generator.startElement(self.TAG["sent"], stringify(parallelsentence.get_attributes()))
                
                #the source is either one sentence or a tuple of sentences;
                #attributes are read per sentence, a tuple has none of its own
                src = parallelsentence.get_source()
                if isinstance(src, SimpleSentence):
                    write_sentence(generator, self.TAG["src"], src)
                elif isinstance(src, tuple):
                    for single_src in src:
                        write_sentence(generator, self.TAG["src"], single_src)
                
                translations = parallelsentence.get_translations()
                
                #shuffling happens in place, on the list handed out by the sentence
                if self.shuffle_translations:
                    shuffle(translations)
                
                if self.sort_attribute:
                    translations = sorted(translations, key=lambda tgt: tgt.get_attribute(self.sort_attribute))
                
                for tgt in translations:
                    write_sentence(generator, self.TAG["tgt"], tgt)
                
                #the reference is optional and skipped when its string is empty
                ref = parallelsentence.get_reference()
                if ref and ref.get_string() != "":
                    write_sentence(generator, self.TAG["ref"], ref)
                
                generator._write("\n\t")
                generator.endElement(self.TAG["sent"])

            generator.characters("\n")
            generator.endElement(self.TAG["doc"])
            generator.characters("\n")
            generator.endDocument()
        shutil.move(tempfilename, filename)
            
            
Exemplo n.º 4
0
class IncrementalJcml(object):
    """
    Write line by line incrementally on an XML file, without loading anything in the memory.
    Don't forget the close function. Object sentences cannot be edited after written
    """
    def __init__(self, filename, xmlformat=JcmlFormat):
        """
        Open a temporary file in the current directory and write the document head into it.
        @param filename: path the finished document is moved to by close()
        @type filename: str
        @param xmlformat: object whose TAG mapping provides the element names
        """
        self.TAG = xmlformat.TAG
        self.filename = filename
        #delete=False, because the file has to survive closing in order to be moved to its target
        self.file = tempfile.NamedTemporaryFile(mode='w',delete=False,suffix='.jcml', prefix='tmp_', dir='.')
        self.tempfilename = self.file.name
        self.generator = XMLGenerator(self.file, "utf-8")
        self.generator.startDocument()
        self.generator.startElement(self.TAG["doc"], {})

    def _write_simplesentence(self, tag, simplesentence):
        """
        Write one simple sentence as an element of its own, on a new double-indented line.
        @param tag: name of the element to be written
        @type tag: str
        @param simplesentence: sentence providing the attributes and the string
        """
        self.generator._write("\n\t\t")
        #convert all attribute values to string, otherwise it breaks
        attributes = dict([(key,str(val)) for key,val in simplesentence.get_attributes().iteritems()])
        self.generator.startElement(tag, attributes)
        self.generator.characters(c(simplesentence.get_string()))
        self.generator.endElement(tag)
        
    def add_parallelsentence(self, parallelsentence):
        """
        Append one parallel sentence, including its source(s), translations and
        non-empty reference, to the document.
        @param parallelsentence: the sentence to be written
        """
        self.generator.characters("\n\t")
        #convert all attribute values to string, otherwise it breaks
        attributes = dict([(key,str(val)) for key,val in parallelsentence.get_attributes().iteritems()])
        self.generator.startElement(self.TAG["sent"], attributes)
        
        #the source is either one sentence or a tuple of sentences
        src = parallelsentence.get_source()
        if isinstance(src, SimpleSentence):
            self._write_simplesentence(self.TAG["src"], src)
        elif isinstance(src, tuple):
            for single_src in src:
                self._write_simplesentence(self.TAG["src"], single_src)
        
        for tgt in parallelsentence.get_translations():
            self._write_simplesentence(self.TAG["tgt"], tgt)
        
        #the reference is optional and skipped when its string is empty
        ref = parallelsentence.get_reference()
        if ref and ref.get_string() != "":
            self._write_simplesentence(self.TAG["ref"], ref)
        
        self.generator._write("\n\t")
        self.generator.endElement(self.TAG["sent"])
        
    
    def close(self):
        """
        Finish the document, close the temporary file and move it to the target filename.
        """
        self.generator.characters("\n")
        self.generator.endElement(self.TAG["doc"])
        self.generator.characters("\n")
        self.generator.endDocument()
        self.file.close()
        shutil.move(self.tempfilename, self.filename)
Exemplo n.º 5
0
class SaxJCMLProcessor(XMLGenerator):
    """
    Handles the generation of features over an XML object formatted as JCML. 
    It does processing every time a parallel sentence including its contents has been declared.
    Processing of any other XML type should follow this example.
    """
    
    def __init__(self, out, feature_generators=None, size=100):
        """
        @param out: file object to receive processed changes
        @type out: file
        @param feature_generators: list of feature generators to be applied, none if omitted
        @type feature_generators: list
        @param size: only reported in a debug log message
        """
        
        #flags that show the current focus of the parsing
        self.is_parallelsentence = False 
        self.is_simplesentence = False
        self.passed_head = False  #annotations declaration can only be done before any sentence has been declared
        #the following variables function as a buffer, that gets filled as the elements are being parsed
        #when elements are ended, then objects are created
        self.ps_attributes = {} #attributes of the parallel sentence
        self.ss_attributes = {} #attributes of a simple sentence
        
        self.src = None
        self.tgt = []
        self.ref = None
        self.annotations = []
        
        #chunks of character data of the simple sentence being parsed
        self.ss_text = []
        
        self.set_tags()
        
        #a fresh list per instance, a default list in the signature would be shared by all of them
        if feature_generators is None:
            feature_generators = []
        self.feature_generators = feature_generators
        self.out = out
        self._encoding = "utf-8"
        #the base class initializer is not called; all output goes through this separate generator
        self.generator = XMLGenerator(out, "utf-8")
                
        self.counter = 0
        
        log.debug("File size given: {}. Loading progress bar.".format(size))
        
    def set_tags(self):
        """
        Handles the basic tags used for reading the simple XML format. 
        As tags are prone to changes, this can be done by changing values here, or overriding accordingly
        """
        self.TAG_DOC = "jcml"
        self.TAG_SENT = "judgedsentence"
        self.TAG_SRC = "src"
        self.TAG_TGT = "tgt"
        self.TAG_REF = "ref"
        self.TAG_ANNOTATIONS = "annotations"
        self.TAG_ANNOTATION = "annotation"
        
    def startDocument(self):
        """
        Writes the document head and opens the root element on the output
        """
        self.generator.startDocument()
        self.generator.startElement(self.TAG_DOC, {})

    def endDocument(self):
        """
        Closes the root element and finishes the output document
        """
        self.generator.endElement(self.TAG_DOC)
        self.generator.endDocument()
    
    def startElement(self, name, attrs=[]):
        """
        Signals the start of an element (simplesentence or parallelsentence)
        @param name: the name of the element
        @type name: str 
        @param attrs: of the element type as a string and the attrs parameter holds an object of the Attributes interface containing the attributes of the element.
        @type attrs: Attributes
        """
        if name == self.TAG_SENT:
            
            #empty up string and attribute buffer
            self.ss_text = []
            self.ps_attributes = {}
            self.tgt = []
            #the reference is optional, forget the one of the previous sentence,
            #otherwise a sentence without reference would be written with a foreign one
            self.ref = None
            for att_name in attrs.getNames():
                self.ps_attributes[att_name] = attrs.getValue(att_name)
            self.is_parallelsentence = True
        
        elif name in [self.TAG_SRC, self.TAG_TGT, self.TAG_REF]:
            
            #empty up string and attribute buffer
            self.ss_text = []
            self.ss_attributes = {}
            for att_name in attrs.getNames():
                self.ss_attributes[att_name] = attrs.getValue(att_name)
            self.is_simplesentence = True
        
                
    def characters(self, ch):
        """
        The Parser will call this method to report each chunk of character data. 
        We use it to store the string of the simplesentence
        @param ch: character being parsed
        @type ch: str 
        """
        #the flag is set by the first simple sentence and not cleared afterwards;
        #text collected outside of sentences is dropped when the next element resets the buffer
        if self.is_simplesentence :
            self.ss_text.append(c(ch))
            
    
    def _emit_sentence(self, tag, simplesentence):
        """
        Writes one simple sentence as an element of its own, on a new double-indented line
        @param tag: the name of the element to be written
        @type tag: str
        @param simplesentence: sentence providing the attributes and the string
        """
        self.generator._write(u"\n\t\t")
        attributes = dict_string(simplesentence.get_attributes())
        self.generator.startElement(tag, attributes)
        self.generator.characters(simplesentence.get_string())
        self.generator.endElement(tag)

    def endElement(self, name):
        """
        Signals the end of an element.
        The text and attributes buffered on the instance are turned into sentence
        objects here; when the judged sentence closes, the feature generators are
        applied and the resulting sentence is written to the output.
        @param name: the name of the element
        @type name: str 
        """
        #get rid of annoying leading spaces
        parsed_text = "".join(self.ss_text).strip()
        
        #all of the elements have to be declared here
        #for each element, create the objects and clear "buffers"
        if name == self.TAG_SRC:
            self.src = SimpleSentence(parsed_text, self.ss_attributes)
            self.ss_text = []
        elif name == self.TAG_REF:
            self.ref = SimpleSentence(parsed_text, self.ss_attributes)
            self.ss_text = []
        elif name == self.TAG_TGT:
            self.tgt.append(SimpleSentence(parsed_text, self.ss_attributes))
            self.ss_text = []
        elif name == self.TAG_SENT:
            #when the judged sentence gets closed, all previously inserted data have to be converted to objects 
            parallelsentence = ParallelSentence(self.src, self.tgt, self.ref, self.ps_attributes)
            log.debug("Parallelsentence {} complete".format(self.counter))

            #apply feature generators, each one hands back the sentence to be used from then on
            for fg in self.feature_generators:
                parallelsentence = fg.add_features_parallelsentence(parallelsentence)

            #display modifications on output file
            self.generator._write(u"\n\t")
            #same conversion as for src, tgt and ref below
            #(dict_string presumably turns the attribute values into strings; verify against its definition)
            self.generator.startElement(name, dict_string(parallelsentence.get_attributes()))
                        
            self._emit_sentence(self.TAG_SRC, parallelsentence.get_source())
            
            for tgt in parallelsentence.get_translations():
                self._emit_sentence(self.TAG_TGT, tgt)
            
            #the reference is optional; nothing is written for it, not even indentation, when missing
            ref = parallelsentence.get_reference()
            if ref:
                self._emit_sentence(self.TAG_REF, ref)

            #the closing tag always goes on a line of its own, aligned with the opening one
            self.generator._write(u"\n\t")
            self.generator.endElement(name)
            
            self.counter+=1
            if self.counter%100 == 0:
                log.info("{}: Processed {} sentences".format(self.out, self.counter))