Пример #1
0
 def __init__(self, datafile):
     self.holdings = PilotHoldings()
     self.data = common.XmlStreamReader("RECORD",datafile)
     for i, node in enumerate(self.data.elements()):
         #print i,node
         try:
             self.holdings.add_record(node)
         except Exception as e:
             print e
             print "Failed to add record to holdings"
     
     '''
Пример #2
0
class PilotDelegator:
    """
        To work with pilot data, its best for first create
        the data in memore for later manipulation rather than reading it 
        again and again from the xml.
        
        When dealing with a collection of records, encapsulation is very useful:
        I only want to know that a record has been created and validated, and that
        if it fails to do so, that I'm alerted so I can log it. 

    
    """
    cnt = Counter()
    def __init__(self, datafile):
        self.holdings = PilotHoldings()
        self.data = common.XmlStreamReader("RECORD",datafile)
        for i,node in enumerate(self.data.elements()):
            
            try:
                self.holdings.add_record(node)
            except Exception  as e:
                # this is where logging belongs
                print "----------Error------------"
                print e.message
                print e.node
                print "----------Error End---------"
                self.cnt[e.message]+=1
        print self.cnt.items()
        #self.holdings.pickle_it()
    def report(self):
        self.holdings.report('full')
    def test(self):
        # This will fail, so we must alther the holdings to make it pass
        self.holdings.test() 
Пример #3
0
 def __init__(self, datafile):
     self.holdings = PilotHoldings()
     self.data = common.XmlStreamReader("RECORD",datafile)
     for i,node in enumerate(self.data.elements()):
         
         try:
             self.holdings.add_record(node)
         except Exception  as e:
             # this is where logging belongs
             print "----------Error------------"
             print e.message
             print e.node
             print "----------Error End---------"
             self.cnt[e.message]+=1
     print self.cnt.items()
Пример #4
0
class PilotDelegator:
    """
        To work with pilot data, its best for first create
        the data in memore for later manipulation rather than reading it 
        from the xml file.
        
        When dealing with a collection of records, encapsulation is useful:
        I only want to know that a record has been created and validated, and that
        if it fails to do so, that I'm alerted so I can log it. 
 
    """
    brokenfile = open("/Users/peder/dev/goc/broken-pilot-records.xml", "w")
    brokenfile.write("<XML>\n")
    cnt = Counter()
    
    def __init__(self, datafile):
        self.holdings = PilotHoldings()
        self.data = common.XmlStreamReader("RECORD",datafile)
        for i, node in enumerate(self.data.elements()):
            #print i,node
            try:
                self.holdings.add_record(node)
            except Exception as e:
                print e
                print "Failed to add record to holdings"
        
        '''
        for i,node in enumerate(self.data.elements()):          
            try:
                self.holdings.add_record(node)
            except Exception  as e:
                print e
                self.cnt[e.type]+=1
                #self.__write_errors(e)
                pass
                
        print self.cnt.items()
        '''
   
        #self.holdings.pickle_it()
    #This is cheezy, I should be using a yield statement inside a generator for this
    #brokenfile.write("</XML>\n")
    
    def _write_errors(self,error):
        # this is where logging belongs
        print "--- {} Error ----".format(error.type)
        print error.message
        self.brokenfile.write(lxml.etree.tostring(error.node)+"\n")      
        
    def report(self):
        self.holdings.report('full')
        
        
    def test(self):
        # This will fail, so we must alther the holdings to make it pass
        self.holdings.test() 
    def match_languages(self):
        file = open("/Users/peder/dev/goc/matched-pilot-records.xml", "w")
        bifile = open("/Users/peder/dev/goc/bilingual-pilot-records.xml", "w")
        file.write("<XML>\n")
        bifile.write("<XML>\n")
        lang_counts = Counter()
        match_count = 0
        #print title[0]
        language_markers=[
                    (' - English Version',' - French Version'),
                    (' (in English)', ' (in French)'),
                    (' (In English)', ' (In French)'),
                    ('(- English)', '(- French)'),  
                     (' (English version)',' (French version)'),
                    (' (English Version)',' (French Version)')
                    ]
  
        ''' If one of these markers is in the data,
            then there is probably a french equivalent
        '''  
        for i,record in enumerate(self.holdings.records):   
            lang_counts[record.language]+=1               
            for marker in language_markers:
                if record.language == u'Bilingual (English and French) | Bilingue (Anglais et Fran\xe7ais)':
                    print i, record.title
                    bifile.write(lxml.etree.tostring(record.node)+"\n")
                    continue
                elif marker[0] in record.title:
                   
                    # Split the marker out of the record
                    split_title_en = record.title.split(marker[0])[0]
                    equivalent_title_french_record = split_title_en + marker[1]
                    #print equivalent_title_french_record
                    # Having this title should enable us to find the french record
                    # in the many cases where it's not alternating with the english
                    
                    french_record =  self.holdings.find_french_record(equivalent_title_french_record)

                    try:
                        #print marker
                        e = str(record.title.split(marker[0])[0])
                        f = str(french_record[0].title.split(marker[1])[0])
#                        print e
#                        print f
#                        print i, record.node 
#                        print i, french_record[0].node
                        if e == f:
                            #print "Match for  {}".format(marker[1])
#                           print i, "WE HAVE A MATCH " + match_count

                            #file.write(lxml.etree.tostring(record.node)+"\n")
                            #file.write(lxml.etree.tostring(french_record[0].node)+"\n")
                           
                            match_count +=1
                            break
                        
                    except:
                        pass
                        #print i, "NONE"
                        #print "No match for " + split_title_en
                        #raise
                        
                    #search holdings for the french record
                    #sys.exit()
                    break
        file.write("</XML>\n")
        file.close()
        bifile.write("</XML>\n")
        bifile.close()
        print "LANGUAGE COUNTS", lang_counts.items()
        print "MATCH COUNT ", match_count