示例#1
0
    def test01(self):
        """Check the encoding guessed for each sample file under TESTDATA.

        A single EncodingGuesser instance is reused for all files, and the
        files are checked in the order listed below.
        """
        guesser = EncodingGuesser()
        # (sample file name, expected return value of guess())
        expectations = (
            ("gnosis-readme", None),
            ("cp850a.txt", "cp850"),
            ("cp850b.txt", "cp850"),
            ("README.TXT", "cp850"),
            ("cp1252a.txt", "cp1252"),
            ("cp1252b.txt", "cp1252"),
        )
        for basename, expected in expectations:
            sample = os.path.join(TESTDATA, basename)
            self.assertEqual(guesser.guess(sample), expected)
示例#2
0
 def test01(self):
     """Verify EncodingGuesser.guess() on the sample files in TESTDATA.

     One guesser instance serves every check; "gnosis-readme" must yield
     None, the other samples must yield "cp850" or "cp1252".
     """
     guesser = EncodingGuesser()

     def check(basename, expected):
         # Build the sample path and compare the guess with the expectation.
         path = os.path.join(TESTDATA, basename)
         self.assertEqual(guesser.guess(path), expected)

     check("gnosis-readme", None)
     for basename in ("cp850a.txt", "cp850b.txt", "README.TXT"):
         check(basename, "cp850")
     for basename in ("cp1252a.txt", "cp1252b.txt"):
         check(basename, "cp1252")
示例#3
0
 def __init__(self, vol):
     """Store the volume and create an encoding guesser for this instance.

     ``vol`` is kept as ``self.volume``; a new ``EncodingGuesser`` is kept
     as ``self.encodingGuesser``.
     """
     # Base-class initialisation is disabled here.
     #Task.__init__(self)
     self.volume = vol
     self.encodingGuesser = EncodingGuesser()
示例#4
0
class FileVisitor:  # (Task):
    """Load the words of a volume's ``.txt`` files into occurrence rows.

    NOTE(review): ``self.visit`` and ``self.status`` are called but not
    defined in this class; presumably they were inherited from the ``Task``
    base class that is commented out in the class header -- confirm before
    relying on this class (the original author also asked "used?").
    """

    def __init__(self, vol):
        """Store the volume and create an encoding guesser.

        :param vol: volume object; ``looper`` calls its ``getContext()`` and
            ``directories()`` and reads its ``path``.
        """
        # Task.__init__ is intentionally not called: Task is no longer a base.
        self.volume = vol
        self.encodingGuesser = EncodingGuesser()

    def looper(self, task):
        """Set up the table queries, clear the volume's directories and visit.

        :param task: stored as ``self.task``; not otherwise used here.
        """
        self.task = task
        sess = self.volume.getContext()
        # One query per table, kept on the instance.
        self.ftypes = sess.query(tables.FileType)
        self.files = sess.query(tables.File)
        self.dirs = sess.query(tables.Directory)
        self.words = sess.query(tables.Word)
        self.occurences = sess.query(tables.Occurence)
        # Delete every directory attached to the volume, then start visiting
        # at the volume's path.
        self.volume.directories().deleteAll()
        self.visit(self.volume.path, "")

    def visit_file(self, fileRow, name):
        """Tokenize a ``.txt`` file and load its words; report other files.

        The extension is matched case-insensitively in every branch, so
        ``X.DOC`` is reported as MS-Word just like ``x.doc``.

        :param fileRow: row handed on to ``loadWords``.
        :param name: file name; it is opened exactly as given.
        """
        ext = os.path.splitext(name)[1].lower()
        if ext == ".txt":
            self.status(name)
            # Close the handle explicitly instead of leaving it to the
            # garbage collector.
            stream = open(name)
            try:
                s = stream.read()
            finally:
                stream.close()
            coding = self.encodingGuesser.guess(name, s)
            self.status("%s: %s", name, coding)
            if coding:
                tokens = standardTokenizer(s.decode(coding))
            else:
                # No encoding was guessed: tokenize the content undecoded.
                tokens = standardTokenizer(s)
            self.loadWords(fileRow, tokens)
        elif ext == ".doc":
            # MS-Word import is not implemented; the file is only reported.
            self.status("Ignoring MS-Word %s.", name)
        else:
            self.status("Ignoring unknown file %s.", name)

    def loadWords(self, fileRow, tokens):
        """Replace the occurrences of ``fileRow`` with one row per token.

        Every token is looked up in ``self.words``; a missing word row is
        created with the token as its id. Positions start at 1.

        :param fileRow: row whose ``occurences`` are deleted and refilled.
        :param tokens: iterable of tokens, in file order.
        """
        fileRow.occurences.deleteAll()
        for index, token in enumerate(tokens):
            self.status(fileRow.path() + ": " + token)
            word = self.words.peek(token)
            if word is None:
                word = self.words.appendRow(id=token)
            fileRow.occurences.appendRow(word=word, pos=index + 1)
示例#5
0
        #self.status("%s : %d words",fileRow.name,len(tokens))
        #print fileRow.path(), ".occurences.deleteAll()"
        fileRow.occurences.deleteAll()
        #self.occurences.query(file=deleteRows(file=fileRow)
        pos = 0
        for token in tokens:
            pos += 1
            self.status(fileRow.path() + ": " + token)
            word = self.words.peek(token)
            if word is None:
                word = self.words.appendRow(id=token)
            #elif word.ignore:
            #    continue
            fileRow.occurences.appendRow(word=word, pos=pos)

# Module-level EncodingGuesser instance, created when this module is executed.
# NOTE(review): not referenced by get_reader/read_content below; presumably
# used by the individual reader functions -- confirm.
encodingGuesser = EncodingGuesser()


def get_reader(fullname):
    """Return the reader registered for the extension of ``fullname``.

    The extension is lower-cased before it is looked up in ``readers``.
    When no reader is registered for it, ``non_reader`` is returned.
    """
    ext = os.path.splitext(fullname)[1]
    try:
        return readers[ext.lower()]
    except KeyError:
        # The exception object is not needed; omitting the binding also
        # avoids the Python-2-only ``except KeyError, e`` spelling.
        return non_reader


def read_content(sess, fileInstance, fullname):
    """Call the reader chosen by ``get_reader(fullname)`` and return its result.

    The reader receives ``(sess, fileInstance, fullname)`` unchanged.
    """
    return get_reader(fullname)(sess, fileInstance, fullname)

示例#6
0
 def __init__(self,vol):
     """Store the volume and create an encoding guesser for this instance.

     ``vol`` is kept as ``self.volume``; a new ``EncodingGuesser`` is kept
     as ``self.encodingGuesser``.
     """
     # Base-class initialisation is disabled here.
     #Task.__init__(self)
     self.volume = vol
     self.encodingGuesser = EncodingGuesser()
示例#7
0
class FileVisitor: # (Task):
    """Load the words of a volume's ``.txt`` files into occurrence rows.

    NOTE(review): ``self.visit`` and ``self.status`` are called but not
    defined in this class; presumably they were inherited from the ``Task``
    base class that is commented out in the class header -- confirm before
    relying on this class (the original author also asked "used?").
    """

    def __init__(self, vol):
        """Store the volume and create an encoding guesser.

        :param vol: volume object; ``looper`` calls its ``getContext()`` and
            ``directories()`` and reads its ``path``.
        """
        # Task.__init__ is intentionally not called: Task is no longer a base.
        self.volume = vol
        self.encodingGuesser = EncodingGuesser()

    def looper(self, task):
        """Set up the table queries, clear the volume's directories and visit.

        :param task: stored as ``self.task``; not otherwise used here.
        """
        self.task = task
        sess = self.volume.getContext()
        # One query per table, kept on the instance.
        self.ftypes = sess.query(tables.FileType)
        self.files = sess.query(tables.File)
        self.dirs = sess.query(tables.Directory)
        self.words = sess.query(tables.Word)
        self.occurences = sess.query(tables.Occurence)
        # Delete every directory attached to the volume, then start visiting
        # at the volume's path.
        self.volume.directories().deleteAll()
        self.visit(self.volume.path, "")

    def visit_file(self, fileRow, name):
        """Tokenize a ``.txt`` file and load its words; report other files.

        The extension is matched case-insensitively in every branch, so
        ``X.DOC`` is reported as MS-Word just like ``x.doc``.

        :param fileRow: row handed on to ``loadWords``.
        :param name: file name; it is opened exactly as given.
        """
        ext = os.path.splitext(name)[1].lower()
        if ext == ".txt":
            self.status(name)
            # Close the handle explicitly instead of leaving it to the
            # garbage collector.
            stream = open(name)
            try:
                s = stream.read()
            finally:
                stream.close()
            coding = self.encodingGuesser.guess(name, s)
            self.status("%s: %s", name, coding)
            if coding:
                tokens = standardTokenizer(s.decode(coding))
            else:
                # No encoding was guessed: tokenize the content undecoded.
                tokens = standardTokenizer(s)
            self.loadWords(fileRow, tokens)
        elif ext == ".doc":
            # MS-Word import is not implemented; the file is only reported.
            self.status("Ignoring MS-Word %s.", name)
        else:
            self.status("Ignoring unknown file %s.", name)

    def loadWords(self, fileRow, tokens):
        """Replace the occurrences of ``fileRow`` with one row per token.

        Every token is looked up in ``self.words``; a missing word row is
        created with the token as its id. Positions start at 1.

        :param fileRow: row whose ``occurences`` are deleted and refilled.
        :param tokens: iterable of tokens, in file order.
        """
        fileRow.occurences.deleteAll()
        for index, token in enumerate(tokens):
            self.status(fileRow.path() + ": " + token)
            word = self.words.peek(token)
            if word is None:
                word = self.words.appendRow(id=token)
            fileRow.occurences.appendRow(word=word, pos=index + 1)