def test01(self):
    """Check EncodingGuesser.guess() against the sample files in TESTDATA.

    Every sample file is handed to ``guess()`` in turn, and the result is
    compared with the value expected for that file (an encoding name, or
    ``None`` for the first sample).
    """
    guesser = EncodingGuesser()
    # (file name below TESTDATA, expected guess() result), checked in order.
    expectations = (
        ("gnosis-readme", None),
        ("cp850a.txt", "cp850"),
        ("cp850b.txt", "cp850"),
        ("README.TXT", "cp850"),
        ("cp1252a.txt", "cp1252"),
        ("cp1252b.txt", "cp1252"),
    )
    for basename, expected in expectations:
        sample_path = os.path.join(TESTDATA, basename)
        self.assertEqual(guesser.guess(sample_path), expected)
def __init__(self, vol):
    """Remember the volume and create an EncodingGuesser for this instance.

    vol -- stored unchanged as ``self.volume``.
    """
    # The base-class call below is commented out in the original code and
    # stays disabled here.
    #Task.__init__(self)
    self.volume = vol
    self.encodingGuesser = EncodingGuesser()
#self.status("%s : %d words",fileRow.name,len(tokens)) #print fileRow.path(), ".occurences.deleteAll()" fileRow.occurences.deleteAll() #self.occurences.query(file=deleteRows(file=fileRow) pos = 0 for token in tokens: pos += 1 self.status(fileRow.path() + ": " + token) word = self.words.peek(token) if word is None: word = self.words.appendRow(id=token) #elif word.ignore: # continue fileRow.occurences.appendRow(word=word, pos=pos)
# NOTE(review): the line above is the tail of a function whose ``def`` lies
# outside the visible part of the file; it is kept exactly as found.

# Module-level EncodingGuesser instance.
encodingGuesser = EncodingGuesser()


def get_reader(fullname):
    """Return the reader registered for the file extension of *fullname*.

    The extension, as split off by ``os.path.splitext``, is lower-cased and
    looked up in ``readers``.  When ``readers`` has no entry for it,
    ``non_reader`` is returned instead.
    """
    _, ext = os.path.splitext(fullname)
    try:
        return readers[ext.lower()]
    except KeyError:
        # ``except KeyError, e`` only parses on Python 2 and bound an unused
        # name; this spelling is valid on both Python 2 and Python 3.
        return non_reader


def read_content(sess, fileInstance, fullname):
    """Read *fullname* using the reader selected by get_reader().

    The reader is called as ``reader(sess, fileInstance, fullname)`` and its
    return value is passed back unchanged.
    """
    reader = get_reader(fullname)
    return reader(sess, fileInstance, fullname)