Python extractText示例，odf.teletype.extractText Python示例

示例#1

0

显示文件

文件： testwhitespace.py 项目： abiapp/odfpy

 def test_extract_with_span(self):
     """Extracting the body of simplestyles.odt yields the text of all spans.

     The expected value shows the styled runs concatenated with no
     separator between them.
     """
     styles_odt = os.path.join(
         os.path.dirname(__file__), u"examples", u"simplestyles.odt")
     d = load(styles_odt)
     # extractText used to be called a second time with its result thrown
     # away; the single call inside the assertion is all the test needs.
     self.assertEqual(u'Plain textBoldItalicBold italicUnderlineUnderline italicUnderline bold italicKm2 - superscriptH2O - subscript', teletype.extractText(d.body))

示例#2

0

显示文件

文件： testwhitespace.py 项目： bufke/odfpy

 def test_extract_with_span(self):
     """Extracting ``d.text`` of simplestyles.odt yields the text of all spans.

     The expected value shows the styled runs concatenated with no
     separator between them.
     """
     styles_odt = os.path.join(
         os.path.dirname(__file__), "examples", "simplestyles.odt")
     d = load(styles_odt)
     # extractText used to be called a second time with its result thrown
     # away; the single call inside the assertion is all the test needs.
     self.assertEqual(u'Plain textBoldItalicBold italicUnderlineUnderline italicUnderline bold italicKm2 - superscriptH2O - subscript', teletype.extractText(d.text))

示例#3

0

显示文件

    def _get_conclusion_from_report(report):
        """Collect the conclusion lines of an ODF report into one string.

        Every paragraph whose stripped text starts with 'Заключение:'
        contributes its text with that marker removed and the surrounding
        whitespace stripped.

        Args:
            report: object whose ``path`` attribute is passed to ``load``.

        Returns:
            The collected pieces joined with '; ' (an empty string when no
            paragraph matches), or 'Не найдено' when the file is missing.
        """
        marker = 'Заключение:'
        try:
            doc = load(report.path)
        except FileNotFoundError:
            return 'Не найдено'

        conclusion = []
        for paragraph in doc.getElementsByType(text.P):
            # Extract once per paragraph; the text is needed both for the
            # prefix test and for the collected value.
            content = teletype.extractText(paragraph)
            if content.strip().startswith(marker):
                conclusion.append(content.replace(marker, '').strip())

        return '; '.join(conclusion)

示例#4

0

显示文件

文件： ConvertOdtByXML.py 项目： EliseyP/onik

    def convert_text_only(self):
        """Convert the text of the ODT document and save it as a new file.

        The text of every grandchild node of ``self.doc.body`` is extracted,
        passed through ``self.converter`` and stored as one paragraph of a
        fresh text document. The result is saved under the path of
        ``self.p_odt`` with its suffix replaced by ``self.extension``
        ('.all.odt' when ``self.extension`` is falsy).

        :return: None
        """
        source_body = self.doc.body
        converted_doc = OpenDocumentText()
        for section in source_body.childNodes:
            for node in section.childNodes:
                converted = self.converter(teletype.extractText(node))
                paragraph = text.P()
                teletype.addTextToElement(paragraph, converted)
                converted_doc.text.addElement(paragraph)

        # Replace the font in the styles.
        if self.style_font:
            self.set_font_for_all_styles()

        suffix = self.extension if self.extension else '.all.odt'
        target = self.p_odt.with_suffix(suffix)
        converted_doc.save(target.as_posix())

示例#5

0

显示文件

 def load_file(self, file):
     """Load an ODF document and keep the text of its longer paragraphs.

     ``self.allparas`` receives the text of every paragraph longer than
     20 characters, and ``self.is_valid`` is set to True.
     """
     document = load(file)
     extracted = [
         teletype.extractText(paragraph)
         for paragraph in document.getElementsByType(text.P)
     ]
     self.allparas = [content for content in extracted if len(content) > 20]
     self.is_valid = True

示例#6

0

显示文件

def getODTContent(filename):
    """Return the text of an ODF file's paragraphs joined by single spaces.

    Args:
        filename: document passed to ``load``.

    Returns:
        One string holding the text of every ``text.P`` element in the
        order returned by ``getElementsByType``.
    """
    document = load(filename)
    # A generator replaces the index loop and the accumulator that used to
    # shadow the builtin ``list``.
    return ' '.join(
        teletype.extractText(paragraph)
        for paragraph in document.getElementsByType(text.P))

示例#7

0

显示文件

def cargar_desde_carpeta(dataset):
    """Read the locally stored document of every item in *dataset*.

    For each item an id is obtained with ``extraer_id``. The file
    'doc/myfile_<id>.odt' is tried first; if that fails for any reason,
    'doc/myfile_<id>.pdf' is parsed with tika instead.

    Args:
        dataset: iterable of items accepted by ``extraer_id``.

    Returns:
        A list with one entry per item: the extracted text,
        "Archivo no encontrado" when neither file could be read, or
        'NADA' when the item has no id.
    """
    txt_declaracion = []
    for n, item in enumerate(dataset, start=1):
        print(n)  # running item count
        id_file = extraer_id(item)
        if not id_file:
            txt_declaracion.append('NADA')
            continue

        try:
            destination = 'doc/myfile_' + id_file + '.odt'
            textdoc = load(destination)
            texto = ' '.join(
                teletype.extractText(para)
                for para in textdoc.getElementsByType(text.P))
        except Exception:
            # Best-effort fallback to the PDF version. Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            try:
                from tika import parser
                destination = 'doc/myfile_' + id_file + '.pdf'
                file_data = parser.from_file(destination)
                texto = file_data['content']
            except Exception:
                texto = "Archivo no encontrado"
        txt_declaracion.append(texto)
    return txt_declaracion

示例#8

0

显示文件

文件： odf_output.py 项目： irsural/calibrator_pc

def __replace_text_in_odf_element(a_file, a_element_foo, a_replace_map: list):
    """Replace marks in the text of every element of the given type.

    Each element returned by ``a_file.getElementsByType(a_element_foo)`` is
    swapped for a new ``odf_text.P`` that carries the same ``stylename``,
    copies of the element's ``odf_text.S`` children that have a ``c``
    attribute, and the element's text with every ``mark[0]`` replaced by
    ``mark[1]``.

    :param a_file: document object providing ``getElementsByType`` and
        ``rebuild_caches``.
    :param a_element_foo: element type to look for.
    :param a_replace_map: sequence of pairs (text to find, replacement).
    """
    pending = {}
    for element in a_file.getElementsByType(a_element_foo):
        content = teletype.extractText(element)
        for mark in a_replace_map:
            content = content.replace(mark[0], mark[1])

        paragraph = odf_text.P()
        paragraph.setAttribute("stylename",
                               element.getAttribute("stylename"))

        # Without this, all whitespace at the start of lines would be lost.
        for space_element in element.getElementsByType(odf_text.S):
            count = space_element.getAttribute('c')
            if count is None:
                continue
            spacer = odf_text.S()
            spacer.setAttribute('c', count)
            paragraph.appendChild(spacer)

        paragraph.addText(content)
        pending[element] = paragraph

    # Swap only after the scan, so the tree is not modified while it is
    # being searched.
    for old, new in pending.items():
        old.parentNode.insertBefore(new, old)
        old.parentNode.removeChild(old)
        # Without this the node tree breaks.
        a_file.rebuild_caches(new.parentNode)

示例#9

0

显示文件

def openFile():
    """Ask the user for a file and show its content in a new text widget.

    '.txt' files are inserted as read; for '.odt' files the text of every
    paragraph is inserted one after another. Any other extension produces
    an information dialog. Cancelling the file dialog prints a message and
    returns without doing anything else.
    """
    filename = tk.filedialog.askopenfilename(initialdir='C:\\')

    if not filename:
        print("open file for reading is cancelled.")
        return

    # Text after the last dot (the whole name when there is no dot).
    filetype = filename.split(".")[-1]

    if filetype == "txt":
        try:
            # The context manager closes the file even when creating the
            # widget or reading fails; the explicit close() was skipped then.
            with open(filename, 'r') as file:
                t = create_newtext()
                t.insert('end', file.read())
        except IOError as e:
            print(e, filename)
    elif filetype == "odt":
        textdoc = load(filename)
        allparas = textdoc.getElementsByType(text.P)

        t = create_newtext()
        for paragraph in allparas:
            t.insert('end', teletype.extractText(paragraph))
    else:
        tk.messagebox.showinfo(
            title='Unsupported file type',
            message='This is unsupported file type, now the supported file type are .txt and .odt')

示例#10

0

显示文件

文件： BrowseFileSystem.py 项目： iVerb/QtTranslator

 def getODTText(self, file):  # reads the odt file
     """Store the concatenated paragraph text of *file* in ``self.FinalOutputText``.

     The paragraph texts are joined without any separator.
     """
     document = load(file)
     self.FinalOutputText = "".join(
         teletype.extractText(paragraph)
         for paragraph in document.getElementsByType(text.P))

示例#11

0

显示文件

def odt_analysis(file, pattern, path):
    """Count *pattern* in the paragraph text of an .odt file and report it.

    Paragraph text is lower-cased before counting while *pattern* is used
    as given (presumably already lower-case; confirm with the caller). The
    total is handed to ``show(path, file, counter)``.
    """
    document = load(file)
    counter = sum(
        teletype.extractText(paragraph).lower().count(pattern)
        for paragraph in document.getElementsByType(otext.P))
    show(path, file, counter)

示例#12

0

显示文件

 def __read_text_cell(self, cell):
     """Return the text of every paragraph in *cell*, one paragraph per line."""
     return '\n'.join(
         extractText(paragraph)
         for paragraph in cell.getElementsByType(P))

示例#13

0

显示文件

def parse_odt(response):
    """Extract the paragraph text of an ODF document held in a response.

    Args:
        response: object whose ``content`` attribute holds the document
            bytes.

    Returns:
        The text of every ``text.P`` element joined by single spaces.
    """
    # Write through the temporary file's own handle and keep everything in
    # its context manager, so the file is closed and removed
    # deterministically instead of whenever the object is collected.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(response.content)
        tmp.flush()  # load() reopens the file by name
        odtfile = opendocument.load(tmp.name)
        texts = odtfile.getElementsByType(text.P)
        return " ".join(teletype.extractText(t) for t in texts)

示例#14

0

显示文件

文件： ConvertOdtByXML.py 项目： EliseyP/onik

 def get_text(self):
     """Return the document text, one line per grandchild node of the body.

     The text of each node is followed by a newline, including the last.
     """
     lines = []
     for section in self.doc.body.childNodes:
         for node in section.childNodes:
             lines.append(f'{teletype.extractText(node)}\n')
     return ''.join(lines)

示例#15

0

显示文件

文件：錄音稿的odt轉csv.py 项目： SuiSiann/SuiSiann-KauTui

def 讀odt檔(檔名):
    """Read an .odt file and return the stripped text of each paragraph.

    When '.odt' does not occur anywhere in the file name, the process is
    terminated with exit status 0 instead of returning.
    """
    # NOTE(review): this is a substring test, not a suffix test, and it
    # exits the interpreter from inside a helper. Confirm this is intended.
    if '.odt' not in 檔名:
        exit(0)
    document = load(檔名)
    return [
        teletype.extractText(paragraph).strip()
        for paragraph in document.getElementsByType(text.P)
    ]

示例#16

0

显示文件

文件： odsr.py 项目： pyexcel/pyexcel-ods

 def __read_text_cell(self, cell):
     """Return the cell's paragraph text, one paragraph per line.

     Paragraphs whose parent element has the tag "annotation" are skipped.
     """
     lines = []
     for paragraph in cell.getElementsByType(P):
         _namespace, tag = paragraph.parentNode.qname
         if tag == str("annotation"):
             continue
         lines.append(extractText(paragraph))
     return "\n".join(lines)

示例#17

0

显示文件

def extraer_texto_odt(file_id):
    """Download the .odt file with the given id and return its text.

    The file is stored as 'doc/myfile_<file_id>.odt' through
    ``download_from_drive``. The text of its paragraphs is returned joined
    by single spaces.
    """
    destination = 'doc/myfile_' + file_id + '.odt'
    download_from_drive(file_id, destination)
    document = load(destination)
    return ' '.join(
        teletype.extractText(paragraph)
        for paragraph in document.getElementsByType(text.P))

示例#18

0

显示文件

 def noteline_identification(self, ligne):
     """Tell whether the text of *ligne* looks like a line of notes.

     The line qualifies when some character accepted by
     ``self.check_char_note`` is directly followed by a space or '#', or by
     a 'b' that is itself followed by a space, 'm'/'M' or '/'.

     Characters past the end of the text count as absent, so a candidate
     at the very end of the text does not match. It previously raised
     IndexError.

     Returns:
         True on the first match, False when no character matches.
     """
     text = teletype.extractText(ligne)
     for i, char in enumerate(text):
         if not self.check_char_note(char):
             continue
         # One-character slices give '' past the end instead of raising.
         following = text[i + 1:i + 2]
         if following in (" ", "#"):
             return True
         if following == "b":
             after = text[i + 2:i + 3]
             # lower() must be called: comparing the bound method itself to
             # "m" was always False, so 'm'/'M' could never match.
             if after == " " or after.lower() == "m" or after == "/":
                 return True
     return False

示例#19

0

显示文件

 def __read_text_cell(self, cell):
     """Join the text of the cell's paragraphs with newlines.

     A paragraph is left out when the tag of its parent element is
     "annotation".
     """
     def inside_annotation(paragraph):
         _namespace, tag = paragraph.parentNode.qname
         return tag == str("annotation")

     return "\n".join(
         extractText(paragraph)
         for paragraph in cell.getElementsByType(P)
         if not inside_annotation(paragraph))

示例#20

0

显示文件

文件： simple_extractor.py 项目： Beehivr/lambda-text-extractor

def odf_to_text(document_path, event_handler):
    """Return the paragraph text of an ODF document, one paragraph per line.

    Leading and trailing whitespace of the joined result is stripped.
    *event_handler* is accepted but not used by this function.
    """
    from odf.opendocument import load as odf_load
    from odf import text as odf_text
    from odf import teletype as odf_teletype

    doc = odf_load(document_path)
    joined = '\n'.join(
        odf_teletype.extractText(p)
        for p in doc.getElementsByType(odf_text.P))
    return joined.strip()

示例#21

0

显示文件

文件： testwhitespace.py 项目： wahlmzr/odfpy

    def test_extract(self):
        """Convert the first paragraph of serious_poem.odt to plain text.

        The expected string contains runs of spaces, tabs and newlines that
        extractText has to produce for that paragraph.
        """
        poem_odt = os.path.join(os.path.dirname(__file__), u"examples",
                                u"serious_poem.odt")
        d = load(poem_odt)
        allparas = d.getElementsByType(P)
        # For reference, the paragraph's XML encodes its whitespace with
        # <text:s text:c="N"/>, <text:tab/> and <text:line-break/> elements.
        # It used to be kept here in an unused local variable.

        self.assertEqual(
            u"The boy stood    on the burning deck,\n\tWhence all\tbut\t\thim had fled.\nThe flames   that lit\tthe battle's\twreck,\n           Shone o'er him, round the dead.   ",
            teletype.extractText(allparas[0]))

示例#22

0

显示文件

    def odftext(self):
        """Extract texts from .odt/.ods/.odp files.

        Returns the text of every paragraph, each followed by one space,
        with every apostrophe (') replaced by ‘.
        """
        document = odfload(self.path)
        pieces = [
            teletype.extractText(para) + ' '
            for para in document.getElementsByType(odftext.P)
        ]
        return ''.join(pieces).replace("'", "‘")

示例#23

0

显示文件

文件： testwhitespace.py 项目： bufke/odfpy

    def test_extract(self):
        """Convert the first paragraph of serious_poem.odt to plain text.

        The expected string contains runs of spaces, tabs and newlines that
        extractText has to produce for that paragraph.
        """
        poem_odt = os.path.join(os.path.dirname(__file__), "examples", "serious_poem.odt")
        d = load(poem_odt)
        allparas = d.getElementsByType(P)
        # For reference, the paragraph's XML encodes its whitespace with
        # <text:s text:c="N"/>, <text:tab/> and <text:line-break/> elements.
        # It used to be kept here in an unused local variable.

        self.assertEqual(
            u"The boy stood    on the burning deck,\n\tWhence all\tbut\t\thim had fled.\nThe flames   that lit\tthe battle's\twreck,\n           Shone o'er him, round the dead.   ",
            teletype.extractText(allparas[0]),
        )

示例#24

0

显示文件

    def switch_notes(self, qte):
        """Rewrite every note line with notes produced by ``gamme.switch_note``.

        For each index stored in ``self.lignes`` the text of the matching
        entry of ``self.allText`` is extracted, each note of
        ``self.notes[i]`` is converted with
        ``self.gamme.switch_note(note, self.bemol, qte)``, and the line is
        rebuilt with the converted notes substituted in order. The entry in
        ``self.allText`` is then replaced by a new ``text.P`` carrying the
        same ``stylename``. ``self.reset_notes()`` is called at the end.

        Progress is printed to stdout along the way.

        :param qte: value forwarded unchanged to ``self.gamme.switch_note``.
        """

        for i in range(len(self.lignes)):
            # line of notes
            old_text = teletype.extractText(self.allText[self.lignes[i]])
            newer_text = ""
            new_notes = []

            # Converted counterpart of every note of this line, in order.
            for j in range(len(self.notes[i].notes)):
                new_notes.append(
                    self.gamme.switch_note(self.notes[i].notes[j], self.bemol,
                                           qte).note)

            print("NOTESET")
            self.notes[i].show_noteset()

            print("NEW NOTESET : " + str(new_notes))

            # Index of the next note expected in the old text.
            cpt = 0

            # Stops one character early so old_text[j + 1] is always valid;
            # the last character of the line is therefore never copied.
            for j in range(len(old_text) - 1):
                # Matches when the current character, or the current and the
                # next character together, equal the next expected note.
                if old_text[j] != " " and old_text[j] != "/" and (
                        old_text[j] == self.notes[i].notes[cpt].note or
                    (old_text[j] + old_text[j + 1])
                        == self.notes[i].notes[cpt].note):
                    print("NEW NOTE")
                    newer_text += new_notes[cpt]
                    cpt += 1
                    # All notes placed: append a space and drop the rest of
                    # the old line.
                    if (cpt == len(new_notes)):
                        newer_text += " "
                        break
                else:
                    # Copy everything except 'b' and '#', which removes the
                    # accidental characters of the old notes.
                    if (old_text[j] != "b" and old_text[j] != "#"):
                        newer_text += old_text[j]

            print("OLD TEXT  : " + old_text)
            print("NEW TEXT : " + newer_text)

            # Replacement paragraph that keeps the style of the old element.
            new_S = text.P()
            new_S.setAttribute(
                "stylename",
                self.allText[self.lignes[i]].getAttribute("stylename"))
            new_S.addText(newer_text)
            self.allText[self.lignes[i]] = new_S



        self.reset_notes()

示例#25

0

显示文件

    def __init__(self, ligne):
        """Collect the notes that occur in the text of *ligne*.

        Every character found in ``self.gamme.notes`` becomes a ``Note``.
        A directly following 'b' or '#' is attached to it.

        :param ligne: element whose text is read with
            ``teletype.extractText``.
        """
        self.notes = []
        self.gamme = Gamme()
        text = teletype.extractText(ligne)

        for i, char in enumerate(text):
            if char not in self.gamme.notes:
                continue
            # A slice gives '' when the note is the last character;
            # indexing text[i + 1] raised IndexError there.
            accidental = text[i + 1:i + 2]
            new_note = char
            if accidental == "b":
                new_note += "b"
            elif accidental == "#":
                new_note += "#"
            self.notes.append(Note(new_note))

示例#26

0

显示文件

文件： extract_text.py 项目： christinataft/gender-violence

def get_text_odt(file_path):
    """Extract the text from an ODT file.

    Args:
        file_path (str): local path to file

    Returns:
        String with the text of every paragraph, joined by single spaces
    """
    document = load(file_path)
    return ' '.join(
        teletype.extractText(para)
        for para in document.getElementsByType(odf_text.P))

示例#27

0

显示文件

def read_files(p):
    """Return the text content of the file at path *p*, chosen by extension.

    Supported extensions:

    * ``.txt``, ``.py``, ``.cpp``, ``.c``: the file is read as text.
    * ``.pdf``: the text of every page (PyPDF2), joined by newlines.
    * ``.docx``: the text of every paragraph, joined by newlines.
    * ``.odt``: the text of every paragraph, joined by newlines.

    Args:
        p: path of the file to read.

    Returns:
        The extracted text, or None when the extension is not supported.
    """
    _, file_extension = os.path.splitext(p)

    if file_extension in (".txt", ".py", ".cpp", ".c"):
        with open(p, 'r') as file:
            return file.read()

    if file_extension == ".pdf":
        # TODO: needs tuning, sometimes does not work.
        # TODO: a textract/tesseract fallback for empty results is still
        # not working; slate3k was only tried experimentally.
        with open(p, 'rb') as file:
            pdf_reader = PyPDF2.PdfFileReader(file)
            pages = [
                pdf_reader.getPage(i).extractText()
                for i in range(pdf_reader.numPages)
            ]
        return '\n'.join(pages)

    if file_extension == ".docx":
        return '\n'.join(para.text for para in docx.Document(p).paragraphs)

    if file_extension == ".odt":
        textdoc = load(p)
        return '\n'.join(
            teletype.extractText(para)
            for para in textdoc.getElementsByType(text.P))

    # Unsupported extension: the result was already None, now it is explicit.
    return None

示例#28

0

显示文件

文件： odf_output.py 项目： irsural/calibrator_pc

def __fill_odf_table(a_file, a_tables_to_draw: List[TableToDraw]):
    """Expand the "%insert_table__" marker row of each table with data rows.

    In every ``odf_table.Table`` of ``a_file`` the first row whose extracted
    text equals "%insert_table__" is removed. For each entry of
    ``a_tables_to_draw`` a header row, an optional frequency row and the
    rows of points are then added to that table through the
    ``__add_*`` helpers. The cell and paragraph styles are taken from the
    marker row.

    :param a_file: document object providing ``getElementsByType`` and
        ``rebuild_caches``.
    :param a_tables_to_draw: tables to render; the attributes
        ``signal_type``, ``limit``, ``error_limit`` and ``points`` are read.
    """
    for table in a_file.getElementsByType(odf_table.Table):
        for table_row in table.getElementsByType(odf_table.TableRow):
            if teletype.extractText(table_row) == "%insert_table__":

                # Styles are read from the marker row before it is removed.
                cell_style = table_row.getElementsByType(
                    odf_table.TableCell)[0].getAttribute("stylename")
                text_style = table_row.getElementsByType(
                    odf_text.P)[0].getAttribute("stylename")
                row_length_in_cells = __get_table_columns_count(table)

                # Remove the marker row
                table_row.parentNode.removeChild(table_row)

                for table_to_draw in a_tables_to_draw:
                    table_header = [
                        "Тип сигнала: " + table_to_draw.signal_type,
                        "Предел измерения: " + table_to_draw.limit,
                        "Допустимая погрешность: " + table_to_draw.error_limit
                    ]

                    __add_row_with_texts_to_table(table, None, cell_style,
                                                  table_header,
                                                  row_length_in_cells)

                    for frequency in table_to_draw.points.keys():
                        # A frequency row is only added for non-zero values.
                        if int(frequency) != 0:
                            __add_row_with_text_to_table(
                                table, None, cell_style,
                                ' '.join(["Частота:",
                                          str(frequency),
                                          "Гц"]), row_length_in_cells)

                        for points in table_to_draw.points[frequency]:
                            points_row = __add_row_to_table(table)
                            for point in points:
                                __add_cell_to_row(points_row, cell_style,
                                                  text_style, str(point))

                            # Pad the row to the full width of the table.
                            for empty_cell in range(row_length_in_cells -
                                                    len(points)):
                                # So that empty cells are not merged into one
                                __add_cell_to_row(points_row, cell_style,
                                                  text_style, "")

                # Without this the node tree will break
                # NOTE(review): table_row was removed from its parent above;
                # confirm what table_row.parentNode holds at this point.
                a_file.rebuild_caches(table_row.parentNode)
                # Only the first marker row of each table is processed.
                break

示例#29

0

显示文件

    def switch_type(self):
        """Rewrite every note line using the other note table of the gamme.

        For each index in ``self.lignes`` the notes of ``self.notes[i]`` are
        looked up with ``self.get_char_index`` and replaced by the entry at
        the same index of ``self.gamme.notes_diese`` (when ``self.bemol`` is
        true) or ``self.gamme.notes_bemol`` (otherwise). The line is rebuilt
        with the new notes and stored in ``self.allText`` as a new
        ``text.P`` carrying the same ``stylename``.

        ``self.bemol`` is flipped once after all lines are processed, and
        ``self.reset_notes()`` is called at the end. Progress is printed to
        stdout.
        """
        # Table the replacement notes are taken from, fixed for the whole
        # call. Flipping self.bemol inside the loop converted alternating
        # lines with opposite settings.
        if self.bemol:
            target_notes = self.gamme.notes_diese
        else:
            target_notes = self.gamme.notes_bemol

        for i in range(len(self.lignes)):
            old_text = teletype.extractText(self.allText[self.lignes[i]])
            new_text = ""

            new_notes = []

            # Index of the next note expected in the old text.
            cpt = 0

            for j in range(len(self.notes[i].notes)):
                ind = self.get_char_index(self.notes[i].notes[j].note)
                new_notes.append(target_notes[ind])

            print("NOTES : " + str(new_notes))

            # Stops one character early so old_text[j + 1] is always valid.
            for j in range(len(old_text) - 1):
                if old_text[j] != " " and old_text[j] != "/" and (
                        old_text[j] == self.notes[i].notes[cpt].note or
                    (old_text[j] + old_text[j + 1])
                        == self.notes[i].notes[cpt].note):
                    print("NEW NOTE")
                    new_text += new_notes[cpt]
                    cpt += 1
                    # All notes placed: the rest of the old line is dropped.
                    if (cpt == len(new_notes)):
                        new_text += " "
                        break
                else:
                    # Copy everything except the accidental characters.
                    if (old_text[j] != "b" and old_text[j] != "#"):
                        new_text += old_text[j]

            print("OLD TEXT : " + old_text)
            print("NEW TEXT : " + new_text)

            new_S = text.P()
            new_S.setAttribute(
                "stylename",
                self.allText[self.lignes[i]].getAttribute("stylename"))
            new_S.addText(new_text)

            self.allText[self.lignes[i]] = new_S

        # Flip once for the whole document, and only when lines were
        # actually converted.
        if self.lignes:
            self.bemol = not self.bemol

        self.reset_notes()

示例#30

0

显示文件

文件： ConvertOdtByXML.py 项目： EliseyP/onik

def get_text_from_odt(_odt, save_blank=None) -> str:
    """Return the text of an odt document.

    The text of every grandchild node of the document body is extracted
    and followed by a newline character.

    :param _odt: document passed to ``load``.
    :param save_blank: whether gaps between paragraphs should be kept as
        newlines; accepted but not used yet.
    :return: text
    """
    # TODO: save_blank - as in pandoc.
    doc = load(_odt)
    return ''.join(
        f'{teletype.extractText(node)}\n'
        for section in doc.body.childNodes
        for node in section.childNodes)

示例#31

0

显示文件

def con(filenames, code, counter, subdir):
    """Convert ODF files to numbered UTF-8 text files under 'input/<subdir>'.

    Each file's paragraph texts are concatenated without a separator and
    written to 'input/<subdir>/<code><counter>.txt'. The counter is
    incremented before each file and zero-padded to four digits.

    Args:
        filenames: iterable of documents passed to ``load``.
        code: prefix of the output file names (converted with ``str``).
        counter: number of the last file written so far.
        subdir: sub-directory of 'input' that receives the files.

    Returns:
        The counter after the last file written.
    """
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs('input/' + subdir, exist_ok=True)
    for file in filenames:
        counter = counter + 1
        textfile = load(file)
        content = ''.join(
            teletype.extractText(paragraph)
            for paragraph in textfile.getElementsByType(text.P))
        out_path = ("input/" + subdir + '/' + str(code) +
                    str(counter).zfill(4) + ".txt")
        # The context manager closes the file even when the write fails.
        with open(out_path, 'w', encoding="utf-8") as save_file:
            save_file.write(content)

    return counter

示例#32

0

显示文件

def clean_file(file_path):
    """Return the relevant paragraph lines of an ODF document.

    Collection starts with the first paragraph containing 'DISC 1' and
    stops at the first paragraph containing 'END OF MINIDISCS', wherever
    that occurs. Empty lines and lines containing any entry of
    ``skip_words`` are left out.
    """
    doc = load(file_path)
    started = False
    line_items = []
    for paragraph in doc.getElementsByType(text.P):
        line = teletype.extractText(paragraph)
        if 'END OF MINIDISCS' in line:
            break
        if not started and 'DISC 1' in line:
            started = True
        if not started or not line:
            continue
        if any(a_word in line for a_word in skip_words):
            continue
        line_items.append(line)

    return line_items

示例#33

0

显示文件

文件： class_set.py 项目： Labitoku/Tarpipion

    def __init__(self, ligne):
        """Build the list of notes found in the text of *ligne*.

        Each character contained in ``self.gamme.notes`` is turned into a
        ``Note``. When it is directly followed by 'b' or '#', that
        character is made part of the note.

        :param ligne: element whose text is read with
            ``teletype.extractText``.
        """

        self.notes = []
        self.gamme = Gamme()
        text = teletype.extractText(ligne)

        for i, txt in enumerate(text):

            if txt in self.gamme.notes:
                # Slicing gives '' for a note at the very end of the text,
                # where text[i + 1] raised IndexError.
                suffix = text[i + 1:i + 2]
                new_note = txt + suffix if suffix in ("b", "#") else txt

                self.notes.append(Note(new_note))

示例#34

0

显示文件

文件： calgen.py 项目： y-salnikov/calendar_gen

def main():
    """Generate a calendar spreadsheet from the output of ``ncal``.

    Runs ``ncal -h`` (with ``-m <argv[1]>`` when a command line argument is
    given), rearranges its output into the global ``grid`` of two-character
    day cells and stores the stripped first output line in the global
    ``month_year``. Then loads "template_ru.ods", passes the text of every
    paragraph to ``replace_tmpl`` and, when that returns something other
    than None, swaps the paragraph for a new one with the same stylename and
    the returned text. The result is saved under a name built from
    ``month_year``.

    NOTE(review): uses ``xrange`` and mixes tabs with spaces for
    indentation, so this appears to target Python 2 - confirm before
    running it under Python 3.
    """
    global grid
    global month_year
    if len(sys.argv)==1:	cmd_line="ncal -h"
    else: cmd_line="ncal -h -m %s" %(sys.argv[1])
    cal_prc=subprocess.Popen(cmd_line.split(' '),stdout=subprocess.PIPE)
    cal_str_lst_tr=cal_prc.communicate()[0].split('\n')
    # The first output line is kept as is; the next seven lines are
    # rearranged: new line c is built from the three-character slice
    # number c of each of them.
    cal_str_lst=[cal_str_lst_tr[0]]
    for c in xrange(7):
	line=u""
	for l in xrange(7):
	    st=(cal_str_lst_tr[l+1].decode("utf-8")+u"   ")[c*3:(c*3+3)]
	    line=line+st
	cal_str_lst.append(line)
    month_year=cal_str_lst[0].strip()
    header=cal_str_lst[1].split(' ')
    header=[i for i in header if len(i)>0]
    # Lines longer than 20 characters are cut into seven two-character
    # cells (the first two characters of every three-character column).
    for n in cal_str_lst:
	ned=[]
	if (len(n)>20):
	    for d in xrange(7):
		day=''+n[3*d]+n[(3*d)+1]
		ned.append(day)
	    grid.append(ned)
    # The first two collected rows are discarded.
    grid=grid[2:]
    cal=load("template_ru.ods")
    texts = cal.getElementsByType(text.P)
    s=len(texts)
    # Replace every paragraph for which replace_tmpl returns a new text.
    for i in range(s):
	old_text = teletype.extractText(texts[i]).encode("utf-8")
	new_text =replace_tmpl(old_text)
	if new_text!=None:
	    new_S = text.P()
	    new_S.setAttribute("stylename",texts[i].getAttribute("stylename"))
	    new_S.addText(new_text)
	    texts[i].parentNode.insertBefore(new_S,texts[i])
	    texts[i].parentNode.removeChild(texts[i])
    cal.save("календарь на %s.ods" %(month_year))

示例#35

0

显示文件

文件： import_documents.py 项目： s-alexey/orange3-text

 def read_file(self):
     """Store the document's paragraph texts, joined by spaces, in ``self.content``."""
     document = load(self.path)
     paragraph_texts = [
         teletype.extractText(paragraph)
         for paragraph in document.getElementsByType(text.P)
     ]
     self.content = " ".join(paragraph_texts)