Python DocReader примеры использования

Язык программирования: Python

Пространство имен/Пакет: doc.reader

Класс/Тип: DocReader

Примеров на hotexamples.com: 4

Python DocReader — найдено 4 примера. Это лучшие примеры кода на Python для doc.reader.DocReader, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

DocReader(1)

read(1)

Пример #1

Показать файл

Файл: fileReader.py Проект: BloodyD/DocAnalyzer

    def save_file(self, filePath):
        """Read a .doc file and store its structure through ``self.dbi``.

        The decoded text is split into a file caption (everything up to the
        first ``NEW_LINE``) and paragraphs (separated by ``NEW_LINE`` followed
        by a form feed).  Inside a paragraph the first line is the paragraph
        caption; the remaining lines are grouped under the marginal number
        ("Randnummer") that precedes them.  Each group is registered with
        ``dbi.addRdNr`` and passed on to ``self.save_content``.  After every
        paragraph ``fileStatusUpdated`` is emitted with ``self.readChars`` and
        the total text length.

        :param filePath: path of the document to import.
        """
        def flush(rdNr, lines, paragraphId, fileId):
            # Persist the lines collected for one marginal number.  Lines that
            # appear before the first marginal number belong to none and are
            # dropped, exactly as before.
            if rdNr is None:
                return
            rdNrId = self.dbi.addRdNr(rdNr, paragraphId)
            self.save_content(lines, rdNrId, paragraphId, fileId)

        rawCont = DocReader(filePath).read().decode(CODING)

        self.readChars = 0
        size = len(rawCont)
        caption, __, content = rawCont.partition(NEW_LINE)
        paraSplitter = '%s\x0c' % (NEW_LINE)
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        inlineRdNr = re.compile(r"\d+\t\t")

        fileId = self.dbi.addFile(path.basename(filePath), caption)
        for para in content.split(paraSplitter):
            paraCaption, __, paraContent = para.partition(NEW_LINE)
            self.readChars += len(para) + len(paraSplitter)
            paragraphId = self.dbi.addParagraph(paraCaption, fileId)
            curRdNr = None
            rdNrContent = []
            for line in paraContent.split(NEW_LINE):
                # Normally marginal numbers inside text boxes are recognised
                # as a line of their own ...
                if line.isdigit():
                    flush(curRdNr, rdNrContent, paragraphId, fileId)
                    rdNrContent = []
                    curRdNr = int(line)
                # ... but when the documents were imported from odt, the
                # number is prepended to the line (followed by two tabs).
                elif inlineRdNr.match(line[:10]):
                    number, __, rest = line.partition("\t\t")
                    flush(curRdNr, rdNrContent, paragraphId, fileId)
                    rdNrContent = [rest.strip()]
                    curRdNr = int(number)
                else:
                    rdNrContent.append(line)
            # The group of the last marginal number has no successor to
            # trigger its flush, so store it here; otherwise it would be lost.
            flush(curRdNr, rdNrContent, paragraphId, fileId)
            self.fileStatusUpdated.emit(self.readChars, size)

Пример #2

Показать файл

Файл: fileAnalyzer.py Проект: BloodyD/DocAnalyzer

def readDocFile(fName):
    """Gather the ``capWords`` entries of every paragraph in a .doc file.

    The file is read with ``DocReader``, decoded as latin-1 and re-encoded as
    utf-8.  The text up to the first ``NEW_LINE`` is the caption; the rest is
    split into paragraphs on ``NEW_LINE`` followed by a form feed.  Each
    paragraph is wrapped in ``Paragraph`` and every value found in the
    ``capWords`` of its ``rdWordMap`` entries is folded into the result with
    ``addWord``.

    NOTE(review): the text is utf-8 encoded bytes while the paragraph
    separator is a unicode literal -- confirm this mix is intended.

    :param fName: path of the document to analyse.
    :return: the mapping accumulated by ``addWord`` (starts as an empty dict).
    """
    reader = DocReader(fName)
    latinText = reader.read().decode("latin-1")
    utf8Text = latinText.encode("utf-8")
    caption, _separator, body = utf8Text.partition(NEW_LINE)

    paragraphSeparator = u"%s\x0c" % (NEW_LINE)

    collected = {}
    for chunk in body.split(paragraphSeparator):
        paragraph = Paragraph(chunk, caption)
        for key in paragraph.rdWordMap:
            capitalWords = paragraph.rdWordMap[key].capWords
            for word in capitalWords.values():
                collected = addWord(collected, word)

    return collected

Пример #3

Показать файл

# Miette is "small sweet thing" in French

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Document to dump to stdout.  The commented-out lines below are alternative
# documents from ../tests/doc that can be swapped in by hand.
r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')

#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# Parenthesised form: a print statement with one operand on Python 2 and a
# function call on Python 3, so the script parses on both.
print(r.read())

Пример #4

Показать файл

Файл: miette.py Проект: BloodyD/DocAnalyzer

# Miette is "small sweet thing" in French

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Document whose extracted text is written to the file "out".  The
# commented-out lines are alternative documents from ../tests/doc that can be
# swapped in by hand.
# r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
r = DocReader('../tests/doc/P_089-104.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')

#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# The with-block closes the output file even if read() or write() raises.
with open("out", "w") as out:
    out.write(r.read())