Exemplo n.º 1
0
 def run(self, doPrint=False):
     """Write the CSV header, then one row per ``<dt>`` tag in the parsed HTML.

     For each ``<dt>`` the term text is taken from the tag itself and the
     definition text from its next sibling.  Stems and metaphones are
     computed for both and serialised as JSON.  Columns are filled
     positionally from ``self.converter.fields`` (indices 0-6); index 2 is
     always written as an empty string.

     ``doPrint`` is accepted but not used by this method.

     Any exception raised while computing metaphones or writing a row
     propagates to the caller.

     NOTE: relies on the Python 2 ``unicode`` builtin and ``str.decode``.
     """
     self.converter.writer.writeheader()
     fields = self.converter.fields
     for dt in self.getParsedHTML().findAll("dt"):
         dd = dt.findNextSibling()
         self.stripTags(dd)
         gla = stripAllHTML(self.fixUp(dt.getText(" ")))
         glaStems = self.getStems(gla, withUnicode=True)
         eng = stripAllHTML(unicode(self.fixUp(str(dd)).decode("utf-8")))
         engStems = self.getStems(eng)
         # Failures are deliberately not caught here: carrying on after a
         # failed metaphone lookup would build the row from unbound (first
         # iteration) or stale (later iterations) phone values.
         glaPhones = utils.getMetaphones(glaStems)
         engPhones = utils.getMetaphones(engStems)
         row = {
             fields[0]: gla,
             fields[1]: eng,
             fields[2]: "",
             fields[3]: json.dumps(glaStems),
             fields[4]: json.dumps(engStems),
             fields[5]: json.dumps(glaPhones),
             fields[6]: json.dumps(engPhones),
         }
         self.converter.writer.writerow(row)
Exemplo n.º 2
0
 def run(self, doPrint=False):
     """Write the CSV header, then one row per ``<dt>`` tag in the parsed HTML.

     For each ``<dt>`` the term text is taken from the tag itself and the
     definition text from its next sibling.  Stems and metaphones are
     computed for both and serialised as JSON.  Columns are filled
     positionally from ``self.converter.fields`` (indices 0-6); index 2 is
     always written as an empty string.

     ``doPrint`` is accepted but not used by this method.

     Any exception raised while computing metaphones or writing a row
     propagates to the caller.

     NOTE: relies on the Python 2 ``unicode`` builtin and ``str.decode``.
     """
     writer = self.converter.writer
     fields = self.converter.fields
     writer.writeheader()
     for dt in self.getParsedHTML().findAll("dt"):
         dd = dt.findNextSibling()
         self.stripTags(dd)
         gla = stripAllHTML(self.fixUp(dt.getText(" ")))
         glaStems = self.getStems(gla, withUnicode=True)
         eng = stripAllHTML(unicode(self.fixUp(str(dd)).decode("utf-8")))
         engStems = self.getStems(eng)
         # No try/except around these calls: swallowing an error here would
         # let the row below be built from unbound or stale phone values.
         glaPhones = utils.getMetaphones(glaStems)
         engPhones = utils.getMetaphones(engStems)
         writer.writerow({
             fields[0]: gla,
             fields[1]: eng,
             fields[2]: "",
             fields[3]: json.dumps(glaStems),
             fields[4]: json.dumps(engStems),
             fields[5]: json.dumps(glaPhones),
             fields[6]: json.dumps(engPhones),
         })
Exemplo n.º 3
0
 def run(self, doPrint=False):
     """Write the CSV header, then rows derived from each ``<tr>`` of the HTML.

     Rows without any ``<td>`` are skipped.  The first cell of a row is
     ignored; the remaining cells must be exactly three (terms, see-alsos,
     definition), otherwise the unpacking raises ``ValueError``.

     Every ``<span>`` in the terms cell contributes its text plus the
     results of ``utils.getWordPermutations`` on that text.  One CSV row is
     written per resulting term, with columns filled positionally from
     ``self.converter.fields`` (indices 0-6).

     ``doPrint`` is accepted but not used by this method.

     Any exception raised while computing metaphones or writing a row
     propagates to the caller.

     NOTE: relies on the Python 2 ``unicode`` builtin and ``str.decode``.
     """
     self.converter.writer.writeheader()
     fields = self.converter.fields
     rowTags = self.getParsedHTML(convertEntities=False).findAll("tr")
     for tr in rowTags:
         cells = tr.findAll("td")
         if not cells:
             continue
         # skip the first cell, which is page numbers from Pokorny's PIE
         # dictionary
         terms, seeAlsos, definition = cells[1:]
         pieTerms = []
         for term in terms.findAll("span"):
             pieTerms.append(term.text)
             pieTerms.extend(utils.getWordPermutations(term.text))
         # the see-also terms are added to the keywords too
         pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
         # clean up definitions text
         self.stripTags(definition)
         # put it all together
         for pie in pieTerms:
             pieStems = self.getStems(pie, withUnicode=True)
             for seeAlso in pieSeeAlsos:
                 pieStems.extend(self.getStems(seeAlso, withUnicode=True))
             eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
             engStems = self.getStems(eng)
             # Failures are deliberately not caught here: carrying on after
             # a failed metaphone lookup would build the row from unbound or
             # stale phone values.
             piePhones = utils.getMetaphones(
                 set(pieStems + pieTerms + pieSeeAlsos))
             engPhones = utils.getMetaphones(engStems)
             row = {
                 fields[0]: pie,
                 fields[1]: eng,
                 fields[2]: json.dumps(pieSeeAlsos),
                 fields[3]: json.dumps(pieStems),
                 fields[4]: json.dumps(engStems),
                 fields[5]: json.dumps(piePhones),
                 fields[6]: json.dumps(engPhones),
             }
             self.converter.writer.writerow(row)
Exemplo n.º 4
0
 def run(self, doPrint=False):
     """Write the CSV header, then rows derived from each ``<tr>`` of the HTML.

     Rows without any ``<td>`` are skipped.  The first cell of a row is
     ignored; the remaining cells must be exactly three (terms, see-alsos,
     definition), otherwise the unpacking raises ``ValueError``.

     Every ``<span>`` in the terms cell contributes its text plus the
     results of ``utils.getWordPermutations`` on that text.  One CSV row is
     written per resulting term, with columns filled positionally from
     ``self.converter.fields`` (indices 0-6).

     ``doPrint`` is accepted but not used by this method.

     Any exception raised while computing metaphones or writing a row
     propagates to the caller.

     NOTE: relies on the Python 2 ``unicode`` builtin and ``str.decode``.
     """
     writer = self.converter.writer
     fields = self.converter.fields
     writer.writeheader()
     for tr in self.getParsedHTML(convertEntities=False).findAll("tr"):
         cells = tr.findAll("td")
         if not cells:
             continue
         # skip the first cell, which is page numbers from Pokorny's PIE
         # dictionary
         terms, seeAlsos, definition = cells[1:]
         pieTerms = []
         for term in terms.findAll("span"):
             pieTerms.append(term.text)
             pieTerms.extend(utils.getWordPermutations(term.text))
         # the see-also terms are added to the keywords too
         pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
         # clean up definitions text
         self.stripTags(definition)
         # put it all together
         for pie in pieTerms:
             pieStems = self.getStems(pie, withUnicode=True)
             for seeAlso in pieSeeAlsos:
                 pieStems.extend(self.getStems(seeAlso, withUnicode=True))
             eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
             engStems = self.getStems(eng)
             # No try/except around these calls: swallowing an error here
             # would let the row below be built from unbound or stale phone
             # values.
             piePhones = utils.getMetaphones(
                 set(pieStems + pieTerms + pieSeeAlsos))
             engPhones = utils.getMetaphones(engStems)
             writer.writerow({
                 fields[0]: pie,
                 fields[1]: eng,
                 fields[2]: json.dumps(pieSeeAlsos),
                 fields[3]: json.dumps(pieStems),
                 fields[4]: json.dumps(engStems),
                 fields[5]: json.dumps(piePhones),
                 fields[6]: json.dumps(engPhones),
             })
Exemplo n.º 5
0
 def run(self):
     """Copy rows from the input CSV to the output CSV, adding keyword columns.

     Runs the parent class's ``run()`` first.  Then, for every row read from
     ``self.inFilename``, sets ``see-also`` to an empty string and fills the
     ``pcl-keywords``, ``eng-keywords``, ``pcl-metaphone`` and
     ``eng-metaphone`` columns with JSON-encoded values before writing the
     row to ``self.outFilename``.

     Any exception raised while computing metaphones propagates to the
     caller, so a row is never written without its metaphone columns.
     """
     super(AddProtoCelticKeywords, self).run()
     reader = unicsv.UnicodeReader(self.inFilename)
     fieldnames = collection.ProtoCelticDictionaryV1().fields
     writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
     writer.writeheader()
     for row in reader:
         pclOrig = row["pcl"].split()
         engOrig = row["eng"].split()
         pcl = utils.getUnicodeStems(pclOrig)
         eng = utils.getStems(engOrig)
         row["see-also"] = ""
         # NOTE(review): the keyword columns stem the already-stemmed
         # lists a second time -- confirm whether that is intended.
         row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
         row["eng-keywords"] = json.dumps(utils.getStems(eng))
         # metaphones are computed over the stems plus the original words
         pcl = pcl + pclOrig
         eng = eng + engOrig
         row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
         row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
         writer.writerow(row)
Exemplo n.º 6
0
 def run(self):
     """Copy rows from the input CSV to the output CSV, adding keyword columns.

     Runs the parent class's ``run()`` first.  Then, for every row read from
     ``self.inFilename``, sets ``see-also`` to an empty string and fills the
     ``pcl-keywords``, ``eng-keywords``, ``pcl-metaphone`` and
     ``eng-metaphone`` columns with JSON-encoded values before writing the
     row to ``self.outFilename``.

     Any exception raised while computing metaphones propagates to the
     caller, so a row is never written without its metaphone columns.
     """
     super(AddProtoCelticKeywords, self).run()
     reader = unicsv.UnicodeReader(self.inFilename)
     fieldnames = collection.ProtoCelticDictionaryV1().fields
     writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
     writer.writeheader()
     for row in reader:
         pclOrig = row["pcl"].split()
         engOrig = row["eng"].split()
         pcl = utils.getUnicodeStems(pclOrig)
         eng = utils.getStems(engOrig)
         row["see-also"] = ""
         # NOTE(review): the keyword columns stem the already-stemmed
         # lists a second time -- confirm whether that is intended.
         row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
         row["eng-keywords"] = json.dumps(utils.getStems(eng))
         # metaphones cover both the stems and the original words
         pcl = pcl + pclOrig
         eng = eng + engOrig
         # No try/except: a failure here must not leave the row half-filled.
         row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
         row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
         writer.writerow(row)
Exemplo n.º 7
0
 def test_getMetaphonesMultipleWords(self):
     """getMetaphones on a list of words returns the expected code list."""
     words = ["This", "is", "Baxter's", "favorite", "seat"]
     results = utils.getMetaphones(words)
     # assertEqual rather than the deprecated assertEquals alias
     self.assertEqual(results, ['0S', 'AS', 'FFRT', 'PKSTRRS', 'ST', 'TS'])
Exemplo n.º 8
0
 def test_getMetaphonesSentence(self):
     """getMetaphones on a whole sentence string returns the expected codes."""
     text = "This is Baxter's favorite seat"
     results = utils.getMetaphones(text)
     # assertEqual rather than the deprecated assertEquals alias
     self.assertEqual(results, ['0S', 'AS', 'FFRT', 'PKSTRRS', 'ST', 'TS'])
Exemplo n.º 9
0
 def test_getMetaphonesUnicodeWord(self):
     """getMetaphones on a single non-ASCII word returns the expected codes."""
     text = "φrāko"
     results = utils.getMetaphones(text)
     # assertEqual rather than the deprecated assertEquals alias
     self.assertEqual(results, ['0RK', 'TRK'])
Exemplo n.º 10
0
 def test_getMetaphonesWord(self):
     """getMetaphones on a single ASCII word returns a one-element list."""
     text = "Baxter"
     results = utils.getMetaphones(text)
     # assertEqual rather than the deprecated assertEquals alias
     self.assertEqual(results, ['PKSTR'])