Exemplo n.º 1
0
def parseDocx(inDoc):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    parseDocx is a derivative of <https://github.com/mickmaccana/python-docx>

    Side effect: 'temp/temp.txt' is opened in 'w+' mode (the 'temp'
    directory must already exist) and, every time a paragraph is collected,
    the repr() of the list gathered so far is appended to that file.

    :param inDoc: path or file object accepted by zipfile.ZipFile.
    :return: list with the joined text of each non-empty paragraph.
    """
    import zipfile
    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        # cElementTree no longer exists on Python 3.9+.
        from xml.etree.ElementTree import XML
        print("Running in compatibility mode")

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    with open('temp/temp.txt', 'w+') as temp:
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(inDoc) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
                # NOTE(review): this dumps the whole list on every append,
                # kept as-is because the file content is observable output.
                temp.write(repr(paragraphs))
        return paragraphs
Exemplo n.º 2
0
def get_raw_text(pthFile):
    """
    Take the path to a Word (.docx) file and return a list containing the
    text of each non-empty paragraph of the document.

    The list of text runs of every non-empty paragraph is also printed to
    stdout as it is found.
    """
    # Constants used to iterate over the XML tree
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # Read the main document part; the context manager closes the archive
    # even when the member is missing.
    with zipfile.ZipFile(pthFile) as docWordDoc:
        xmlContent = docWordDoc.read('word/document.xml')
    treeXML = XML(xmlContent)

    lstParagraphs = []  # output list with the paragraphs of the text
    # Walk every paragraph element and concatenate its non-empty text runs.
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for parParagraph in treeXML.iter(PARA):
        lstTexts = [nodElement.text
                    for nodElement in parParagraph.iter(TEXT)
                    if nodElement.text]
        if lstTexts:
            # Function-call form prints the same on Python 2 and 3.
            print(lstTexts)
            lstParagraphs.append(''.join(lstTexts))

    return lstParagraphs
Exemplo n.º 3
0
def get_docx_text(path=os.getcwd() + '\\word_samples'):
    """
    Take the path of a docx file as argument, return the text in unicode.

    Text is collected, in this order, from word/header2.xml,
    word/document.xml and word/footer2.xml. Header paragraphs are prefixed
    with "Header : " and footer paragraphs with "Footer : ". The header and
    footer parts are skipped when the archive does not contain them; a
    missing word/document.xml still raises KeyError.

    Non-empty paragraphs are joined with a blank line between them.
    """
    contentToRead = ["header2.xml", "document.xml", "footer2.xml"]
    prefixes = {"header2.xml": "Header : ", "footer2.xml": "Footer : "}
    paragraphs = []

    # The context manager closes the archive even if a read or parse fails.
    with zipfile.ZipFile(path) as document:
        available = set(document.namelist())
        for xmlfile in contentToRead:
            member = 'word/{}'.format(xmlfile)
            # Not every document carries these specific header/footer parts.
            if xmlfile != "document.xml" and member not in available:
                continue
            tree = XML(document.read(member))
            prefix = prefixes.get(xmlfile, "")
            # Element.iter() replaces getiterator(), removed in Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT) if node.text
                ]
                if texts:
                    paragraphs.append(prefix + ''.join(texts))
    return '\n\n'.join(paragraphs)
Exemplo n.º 4
0
    def docxml_to_text(self, filename):
        """Return the text of a .docx file, one section per paragraph.

        Every paragraph element (self.PARA) contributes one entry made of
        its non-empty text runs (self.TEXT); paragraphs without text
        contribute an empty entry. Entries are joined with a blank line.

        :param filename: path or file object accepted by zipfile.ZipFile.
        :return: the joined text as a single string.
        """
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(filename) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        # Element.iter() replaces getiterator(), removed in Python 3.9.
        # Empty paragraphs are kept on purpose (they yield '').
        sections = [
            ''.join(node.text for node in section.iter(self.TEXT) if node.text)
            for section in tree.iter(self.PARA)
        ]

        return '\n\n'.join(sections)
Exemplo n.º 5
0
def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.

    Every <param name=... value=...> element is collected into a dict (the
    first occurrence of a name wins), and the dict is then extended with the
    keys 'rtmp_url', 'auth_url', 'api_url', 'categories_url' and
    'captions_url'.

    :param soup: XML config document (string or bytes).
    :return: dict of parameters plus the derived *_url keys.
    :raises KeyError: if 'auth', 'api', 'categories' or 'captions' is absent.
    """
    xml = XML(soup)
    params = dict()
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the
    # auth result as well. Hence .get(): a missing field yields None instead
    # of raising KeyError.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']

    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params
Exemplo n.º 6
0
def docx_extractor(path, vectors=False):
    """Extract the creator and the numbered paragraphs of a .docx file.

    :param path: path or file object accepted by zipfile.ZipFile.
    :param vectors: when true, also build a vector for every paragraph with
        vectorizer(text, lang=detect(text)).
    :return: (creator, doc) or, when vectors is true, (creator, doc, vector).
        doc and vector are dicts keyed by the 1-based paragraph number as a
        string; doc values are the paragraph text passed through fix_text().
        creator is "Unknown" when the metadata cannot be read.
    """
    # The context manager closes the archive even if a read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
        ## GET METADATA
        # use lxml to parse the xml file we are interested in
        try:
            core = lxml.etree.fromstring(document.read('docProps/core.xml'))
            # retrieve creator
            ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
            creator = core.xpath('//dc:creator', namespaces=ns)[0].text
        except Exception:
            # Metadata is best-effort: missing part, bad XML or no creator.
            creator = "Unknown"
    tree = XML(xml_content)

    doc = {}
    vector = {}
    paragraph_nb = 1
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            text = ''.join(texts)
            key = str(paragraph_nb)
            doc[key] = fix_text(text)
            if vectors:
                vector[key] = vectorizer(text, lang=detect(text))
            # Only non-empty paragraphs consume a number.
            paragraph_nb += 1

    if vectors:
        return creator, doc, vector
    return creator, doc
Exemplo n.º 7
0
def epilepsy_docx_xml_to_txt(
        path,
        n_xml,
        docx_xml_to_txt_save_path="L:\\word_docs\\epilepsy_docx_xml_to_txt\\"):
    """
    Take the path of a docx file as argument, return the text in unicode.
    Run this if epilepsy_docx() isn't able to read the name.
    This should automatically read tables anyway.

    The extracted text is also handed to
    save_as_txt(path, text, docx_xml_to_txt_save_path).

    :param path: path of the .docx file.
    :param n_xml: running counter; returned incremented by one.
    :param docx_xml_to_txt_save_path: destination passed to save_as_txt.
    :return: tuple (text, n_xml + 1); text has the non-empty paragraphs
        separated by blank lines.
    """

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [n.text for n in paragraph.iter(TEXT) if n.text]
        if texts:
            paragraphs.append(''.join(texts))

    pt_txt_xml = '\n\n'.join(paragraphs)
    save_as_txt(path, pt_txt_xml, docx_xml_to_txt_save_path)

    n_xml += 1
    return pt_txt_xml, n_xml
Exemplo n.º 8
0
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Each <title start=... end=...> element becomes one numbered SRT entry.
    The last ':' of the start/end timestamps is turned into ',' and '|' in
    the caption text becomes a line break. A <title> with no text produces
    an entry with an empty caption.

    :param soup: captions XML as bytes.
    :return: the SRT document as a string.
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: same pattern, without the invalid "\w" escape.
        soup = re.sub(br"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    pieces = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for i, title in enumerate(xml.iter('title'), 1):
        start = title.get('start')
        ids = start.rfind(':')
        end = title.get('end')
        ide = end.rfind(':')
        pieces.append(str(i) + '\n')
        pieces.append(start[:ids] + ',' + start[ids+1:] + ' --> ' + end[:ide] + ',' + end[ide+1:] + '\n')
        # title.text is None for an empty element.
        pieces.append((title.text or '').replace('|', '\n') + '\n\n')

    return ''.join(pieces)
Exemplo n.º 9
0
def get_docx_table(path):
    """
    Find the table rows inside the .docx file and return them as a list of
    rows, each row being a list with the text of its cells.

    Inside a cell, the non-empty paragraphs are joined with a newline.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    rows = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for xml_row in tree.iter(TR):
        row = []
        for xml_cell in xml_row.iter(TC):
            # Each cell consists of one or more paragraphs
            cell_paragraphs = []
            for paragraph in xml_cell.iter(PARA):
                paragraph_text = "".join(
                    node.text for node in paragraph.iter(TEXT) if node.text)
                if paragraph_text:
                    cell_paragraphs.append(paragraph_text)
            row.append("\n".join(cell_paragraphs))
        rows.append(row)
    return rows
Exemplo n.º 10
0
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Each <title start=... end=...> element becomes one numbered SRT entry.
    The part after the last ':' of each timestamp is used as the fraction:
    it is cut or right-padded with zeros to three characters and placed
    after a ','. '|' in the caption text becomes a line break. A <title>
    with no text produces an entry with an empty caption.

    :param soup: captions XML as bytes.
    :return: the SRT document as a string.
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: same pattern, without the invalid "\w" escape.
        soup = re.sub(br"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    pieces = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for i, title in enumerate(xml.iter('title'), 1):
        (start, startfract) = title.get('start').rsplit(':', 1)
        (end, endfract) = title.get('end').rsplit(':', 1)
        pieces.append('{}\n'.format(i))
        pieces.append('{},{:0<3.3} --> {},{:0<3.3}\n'.format(start, startfract, end, endfract))
        # title.text is None for an empty element.
        pieces.append((title.text or '').replace('|', '\n') + '\n\n')

    return ''.join(pieces)
def get_email_para(path):
    """Style the paragraph(s) that precede the first e-mail address.

    The document text is scanned for the first e-mail-like token. The text
    that comes before it in its paragraph is then searched for in the
    python-docx paragraphs; every paragraph containing it gets the
    'Correspondence' style and the document is saved as
    "<name>_PROCESSED.docx".

    :param path: path of the .docx file.
    :return: True if at least one paragraph was restyled and the document
        saved, False otherwise (including when no e-mail address is found).
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))

    text = ''.join(' ' + paragraph for paragraph in paragraphs)
    match = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    if not match:
        # Without an address there is nothing to anchor on; indexing
        # match[0] used to raise IndexError here.
        return False
    email = match[0]

    result = ''
    for paragraph in paragraphs:
        if email in paragraph:
            # Text before the address; the last matching paragraph wins.
            result = paragraph.split(email)[0]

    document = Document(path)
    reresult = False
    for paragraph in list(document.paragraphs):
        if result in paragraph.text:
            paragraph.style = 'Correspondence'
            reresult = True
    if reresult:
        # Saved once after all styles are applied (same final file as saving
        # after every match).
        # NOTE(review): `file` is not a parameter of this function; it is
        # presumably a module-level name holding the file name -- confirm.
        document.save(file.split(".")[0] + "_PROCESSED.docx")
    return reresult
Exemplo n.º 12
0
def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.

    :param soup: XML config document (string or bytes) made of
        <param name=... value=...> elements; the first occurrence of a
        name wins.
    :return: dict with the keys 'rtmp_url', 'rtmp_host', 'rtmp_app',
        'auth_url', 'api_url', 'categories_url' and 'captions_url'. The
        three rtmp_* values are None when 'server_streaming' is absent.
    :raises KeyError: if 'auth', 'api', 'categories' or 'captions' is absent.
    """

    xml = XML(soup)
    params = dict()
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the
    # auth result as well. Hence .get(): a missing field yields None values
    # instead of raising KeyError.
    rtmp_url = params.get('server_streaming')
    if rtmp_url is None:
        rtmp_host = None
        rtmp_app = None
    else:
        # "rtmp://host/app".split('/') -> ['rtmp:', '', 'host', 'app']
        rtmp_chunks = rtmp_url.split('/')
        rtmp_host = rtmp_chunks[2]
        rtmp_app = rtmp_chunks[3]

    return {
        'rtmp_url': rtmp_url,
        'rtmp_host': rtmp_host,
        'rtmp_app': rtmp_app,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': params['categories'],
        'captions_url': params['captions'],
    }
Exemplo n.º 13
0
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Each <title start=... end=...> element becomes one numbered SRT entry.
    The part after the last ':' of each timestamp is used as the fraction:
    it is cut or right-padded with zeros to three characters and placed
    after a ','. '|' in the caption text becomes a line break. A <title>
    with no text produces an entry with an empty caption.

    :param soup: captions XML as bytes.
    :return: the SRT document as a string.
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: same pattern, without the invalid "\w" escape.
        soup = re.sub(br"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    entries = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for i, title in enumerate(xml.iter('title'), 1):
        (start, startfract) = title.get('start').rsplit(':', 1)
        (end, endfract) = title.get('end').rsplit(':', 1)
        entries.append('{}\n'.format(i))
        entries.append('{},{:0<3.3} --> {},{:0<3.3}\n'.format(
            start, startfract, end, endfract))
        # title.text is None for an empty element.
        entries.append((title.text or '').replace('|', '\n') + '\n\n')

    return ''.join(entries)
Exemplo n.º 14
0
def get_raw_text(pthFile):
    """
    Take the path to a Word (.docx) file and return a list containing the
    text of each non-empty paragraph of the document.

    The list of text runs of every non-empty paragraph is also printed to
    stdout as it is found.
    """
    # Constants used to iterate over the XML tree
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # Read the main document part; the context manager closes the archive
    # even when the member is missing.
    with zipfile.ZipFile(pthFile) as docWordDoc:
        xmlContent = docWordDoc.read('word/document.xml')
    treeXML = XML(xmlContent)

    lstParagraphs = []  # output list with the paragraphs of the text
    # Walk every paragraph element and concatenate its non-empty text runs.
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for parParagraph in treeXML.iter(PARA):
        lstTexts = [
            nodElement.text for nodElement in parParagraph.iter(TEXT)
            if nodElement.text
        ]
        if lstTexts:
            # Function-call form prints the same on Python 2 and 3.
            print(lstTexts)
            lstParagraphs.append(''.join(lstTexts))

    return lstParagraphs
Exemplo n.º 15
0
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.

    The tag and attributes of every direct child of the document root are
    printed to stdout as a debugging aid. Non-empty paragraphs are joined
    with '----' followed by a blank line.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    # Debug dump of the root's children, taken from the tree parsed above
    # rather than from a stale 'data.xml' on disk that this function never
    # writes. The %-format prints the same on Python 2 and 3.
    for child in tree:
        print("%s %s" % (child.tag, child.attrib))

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))

    return '----\n\n'.join(paragraphs)
def docx_to_text(document_path, event_handler):
    """Return the text of a .docx file, preferring python-docx.

    First the body is walked with python-docx: paragraphs contribute their
    text and tables contribute one line per row with cells joined by ' | '.
    Blocks are joined with a blank line and the result is stripped.

    If that raises, the exception is logged (with event_handler.key) and
    the text is extracted straight from word/document.xml instead: the
    non-empty paragraphs joined with a blank line.

    :param document_path: path of the .docx file.
    :param event_handler: object whose ``key`` is used in the log message.
    :return: the extracted text.
    """
    global logger

    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in Table(child, doc_body).rows))
        #end for

        return '\n\n'.join(blocks).strip()

    except Exception:
        # Best effort: log and fall through to the raw XML extraction.
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))
    #end try

    # Extract it from the XML
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')

    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        # cElementTree no longer exists on Python 3.9+.
        from xml.etree.ElementTree import XML

    tree = XML(xml_content)

    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [
            node.text for node in paragraph.iter(DOCX_TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    #end for

    return '\n\n'.join(paragraphs)
    def getText(self, path):
        """
        Take the path of a docx file as argument, return the text in unicode.

        The result contains, separated by blank lines: the non-empty body
        paragraphs, the marker 'TABLE_INFORMATION_ROW_WISE', one line per
        non-empty table row (texts joined by a space), the marker
        'INFORMATION_EXTRACTED_FROM_HEADER' and, when word/header1.xml can
        be read and parsed, its non-empty paragraphs.
        """
        # The context manager closes the archive on every path.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
            try:
                header_content = document.read('word/header1.xml')
            except Exception:
                # The header part is optional; extraction is best-effort.
                header_content = None
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(''.join(texts))

        paragraphs.append('TABLE_INFORMATION_ROW_WISE')

        for tablerow in tree.iter(TABLE_ROW):
            texts = [
                node.text for node in tablerow.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(' '.join(texts))

        paragraphs.append('INFORMATION_EXTRACTED_FROM_HEADER')

        if header_content is not None:
            try:
                header_tree = XML(header_content)
            except Exception:
                # An unparsable header is ignored, as before.
                header_tree = None
            if header_tree is not None:
                for paragraph in header_tree.iter(PARA):
                    texts = [
                        node.text for node in paragraph.iter(TEXT)
                        if node.text
                    ]
                    if texts:
                        paragraphs.append(''.join(texts))

        return '\n\n'.join(paragraphs)
Exemplo n.º 18
0
def get_docx_tables(path):
    """
    Find the tables inside the .docx file and yield them one by one.

    This is a generator: the archive is only opened when iteration starts,
    and each yielded item is the XML element matching the TBL tag.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for tbl in tree.iter(TBL):
        yield tbl
Exemplo n.º 19
0
 def get_docx_text(self, path):
     """Return the non-empty paragraphs of a .docx file as a list of strings.

     :param path: path or file object accepted by zipfile.ZipFile.
     """
     # The context manager closes the archive even if the read fails.
     with zipfile.ZipFile(path) as document:
         xml_content = document.read('word/document.xml')
     tree = XML(xml_content)
     paragraphs = []
     # Element.iter() replaces getiterator(), removed in Python 3.9.
     for paragraph in tree.iter(para_tag):
         texts = [node.text for node in paragraph.iter(text_tag) if node.text]
         if texts:
             paragraphs.append(''.join(texts))
     return paragraphs
Exemplo n.º 20
0
def get_docx_text(path):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    :param path: path or file object accepted by zipfile.ZipFile.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))
    return paragraphs
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return its non-empty
    paragraphs as a list of UTF-8 encoded byte strings.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text.encode('utf-8')
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            # The runs are encoded bytes, so the separator must be bytes
            # too; '' would raise TypeError on Python 3.
            paragraphs.append(b''.join(texts))

    return paragraphs



 
# def get_docx_text(path):
# 	"""
# 	Take the path of a docx file as argument, return the text in unicode
# 	in the form of a list.
# 	"""
# 	document = zipfile.ZipFile(path)
# 	xml_content = document.read('word/document.xml')
# 	document.close()
# 	tree = XML(xml_content)
 
# 	sections = []
# 	for section in tree.getiterator(SECT):

# 		paragraphs = []
# 		for paragraph in section.getiterator(PARA):
# 			print 'para'
# 			texts = [node.text.encode('utf-8')
# 					 for node in paragraph.getiterator(TEXT)
# 					 if node.text]
# 			if texts:
# 				paragraphs.append(''.join(texts))

# 		print str(paragraphs)

# 		if paragraphs:
# 			sections.append(''.join(paragraphs))

 
# 	return sections    
Exemplo n.º 22
0
    def returnFileText(self, xmlHeader='word/document.xml'):
        '''
        Returns the file's text in a large string
        Usage:
            returnFileText()
        Returns:
            'Lorem ipsum ....'

        The file at self.filePathName is read according to self.fileFormat:
        'pdf' is converted page by page, 'doc' is read as latin-1 text and
        'docx' yields the non-empty paragraphs of the xmlHeader archive
        member joined by blank lines.

        For 'docx', the Exception class itself is returned (not raised)
        when the XML cannot be processed. An unsupported fileFormat raises
        ValueError.
        '''

        if self.fileFormat == 'pdf':
            rsrcmgr = PDFResourceManager()
            retstr = io.StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the PDF file handle.
            with open(self.filePathName, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
            # Read once after the loop, so a PDF without pages gives ''
            # instead of leaving the result unbound.
            text = retstr.getvalue()
        elif self.fileFormat == 'doc':
            with open(self.filePathName, encoding='latin-1') as fp:
                text = fp.read()
        elif self.fileFormat == 'docx':
            WORD_NAMESPACE = \
                '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
            PARA = WORD_NAMESPACE + 'p'
            TEXT = WORD_NAMESPACE + 't'
            # The context manager closes the archive even if the read fails.
            with zipfile.ZipFile(self.filePathName) as document:
                xml_content = document.read(xmlHeader)
            try:
                tree = XML(xml_content)
                paragraphs = []
                # Element.iter() replaces getiterator(), removed in 3.9.
                for paragraph in tree.iter(PARA):
                    texts = [
                        node.text for node in paragraph.iter(TEXT)
                        if node.text
                    ]
                    if texts:
                        paragraphs.append(''.join(texts))
            except Exception:
                # NOTE(review): the Exception class is returned as an error
                # sentinel; kept because callers may test for it.
                return Exception
            text = '\n\n'.join(paragraphs)
        else:
            raise ValueError(
                'Unsupported file format: {!r}'.format(self.fileFormat))

        return text
Exemplo n.º 23
0
def get_docx_text(path):
    """Return the text of a .docx file.

    Non-empty paragraphs are joined with a blank line between them.

    :param path: path or file object accepted by zipfile.ZipFile.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 24
0
def get_doc_text(path):
    """
    Take the path of a docx or an odt file as argument, return the text in
    unicode as a list of strings.

    For docx the list holds the non-empty paragraphs. For odt it holds the
    text of every text node of content.xml, in element order; the odt
    branch also prints some diagnostics to stdout.

    Raises Warning for any other file suffix.
    """
    if path.endswith("docx"):
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        return paragraphs

    elif path.endswith("odt"):
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('content.xml')
        doc = xml.dom.minidom.parseString(xml_content)
        print(" doc: ", doc)
        print("doc::end")
        #
        # we get here all elements Headers, text and table components:
        #
        paras = doc.getElementsByTagName("*")
        print("I have ", len(paras), " paragraphs ")
        paragraphs = []
        for p in paras:
            for ch in p.childNodes:
                if ch.nodeType == ch.TEXT_NODE:
                    paragraphs.append(''.join(ch.wholeText))
        print(paragraphs)
        return paragraphs
    else:
        print()
        raise Warning("only docx and odt files are handled")
Exemplo n.º 25
0
def extract_docx_content():
    """
    Extract text content from the .docx files of the claims directory.
    The .docx file type is in xml format, so
    this function makes use of the
    xml.etree.cElementTree python module.

    The working directory is changed to the input directory, and every
    '*.docx' file found there is parsed: the text runs of each non-empty
    paragraph are joined with a space and lower-cased, and the paragraphs
    are concatenated into one string per file. Files that fail to parse are
    reported on stdout and collected in a removal list.

    NOTE(review): the visible code builds these results but returns
    nothing (the original docstring promised a pandas DataFrame) -- confirm
    whether the remainder of the function is missing.
    """
    # - pu claims data directory
    path_input = 'Z:\\xxxx'

    # -  Namespace information needed to extract content
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # - create a dict to store processed docx file
    processed_docx_files = {}
    os.chdir(path_input)

    # - collect all .docx files
    docx_files = glob.glob('*.docx')

    # - files to remove
    remove_list = []

    # - algorithm to extract the .docx data
    for docx in docx_files:
        try:
            # The context manager closes the archive even if the read fails.
            with zipfile.ZipFile(docx) as document:
                xml_content = document.read('word/document.xml')
            tree = XML(xml_content)

            # - load data into paragraphs list
            paragraphs = []
            # Element.iter() replaces getiterator(), removed in Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT)
                    if node.text
                ]
                # - convert texts into a single text string
                if texts:
                    paragraphs.append(' '.join(texts).lower())

            # - update the processed_docx_files dict
            processed_docx_files[docx] = ''.join(paragraphs)
        except Exception:
            # Best effort per file; Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            print("An error occured trying to parse {}".format(docx))
            remove_list.append(docx)
Exemplo n.º 26
0
def read_docx(file,document,path,trie):
    """Index the text of an already opened .docx archive.

    The text of all non-empty paragraphs is concatenated (with every
    occurrence of '\\u7460' removed), passed to
    string_spilt(text, path, trie), and the document length
    len(file) + len(text) is recorded with trie.insert_doc_len(path, ...).

    :param file: sized object whose length is added to the document length.
    :param document: open zip archive; it is closed by this function.
    :param path: key under which the document is indexed.
    :param trie: index object providing insert_doc_len().
    """
    try:
        xml_content = document.read('word/document.xml')
    finally:
        # Close the archive even if the member is missing.
        document.close()
    tree = XML(xml_content)
    chunks = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = ''.join(
            node.text.replace('\u7460', "")
            for node in paragraph.iter(TEXT)
            if node.text)
        if texts:
            chunks.append(str(texts))
    paragraphs = ''.join(chunks)
    string_spilt(paragraphs,path,trie)
    trie.insert_doc_len(path,len(file)+len(paragraphs))
Exemplo n.º 27
0
    def parse(self, path_to_document):
        """Read a .docx file and store its text in self.text.

        The non-empty paragraphs are concatenated without any separator.

        :param path_to_document: path or file object accepted by
            zipfile.ZipFile.
        """
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(path_to_document) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(''.join(texts))

        self.text = ''.join(paragraphs)
Exemplo n.º 28
0
def get_doc_text(path):
    """
    Take the path of a docx or an odt file as argument, return the text in
    unicode as a list of strings.

    For docx the list holds the non-empty paragraphs. For odt it holds the
    text of every text node of content.xml, in element order; the odt
    branch also prints some diagnostics to stdout.

    Raises Warning for any other file suffix.
    """
    if path.endswith("docx"):
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        return paragraphs

    elif path.endswith("odt"):
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('content.xml')
        doc = xml.dom.minidom.parseString(xml_content)
        print(" doc: ",doc)
        print("doc::end")
        #
        # we get here all elements Headers, text and table components:
        #
        paras = doc.getElementsByTagName("*")
        print("I have ", len(paras), " paragraphs ")
        paragraphs = []
        for p in paras:
            for ch in p.childNodes:
                if ch.nodeType == ch.TEXT_NODE:
                    paragraphs.append(''.join(ch.wholeText))
        print(paragraphs)
        return paragraphs
    else:
        print()
        raise Warning("only docx and odt files are handled")
Exemplo n.º 29
0
def docx_do_docx(azip, afile):
    """Extract the text of an open .docx archive and pass it on.

    The non-empty paragraphs of word/document.xml are joined with a blank
    line and handed to text_do_data(text, afile). The archive is not
    closed here.

    :param azip: open zip archive providing read().
    :param afile: forwarded unchanged to text_do_data().
    """
    word_namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    par = word_namespace + "p"
    txt = word_namespace + "t"

    xml_content = azip.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append("".join(texts))

    text = "\n\n".join(paragraphs)
    text_do_data(text, afile)
Exemplo n.º 30
0
def get_docx_text(path):
    """Take the path of a docx file as argument, return the text in unicode.

    Non-empty paragraphs are joined with a blank line between them.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 31
0
def docx_do_docx(azip, afile):
    """Extract the text of an open .docx archive and pass it on.

    The non-empty paragraphs of word/document.xml are joined with a blank
    line and handed to text_do_data(text, afile). The archive is not
    closed here.

    :param azip: open zip archive providing read().
    :param afile: forwarded unchanged to text_do_data().
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
Exemplo n.º 32
0
def get_docx_text(file_path):
    """Take the path of a docx file as argument, return the text in unicode.

    Non-empty paragraphs are joined with a blank line between them.
    """
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(file_path) as document:
        xml_content = document.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append("".join(texts))
    # Type String
    return "\n\n".join(paragraphs)
Exemplo n.º 33
0
 def __init__(self,filePath):
     """Load a .docx file and derive its text representations.

     Populates:
       - self.paragraphs: each non-empty paragraph is appended;
       - self.sentenceList: extended with each paragraph split at a
         '.', '!' or '?' followed by one or two whitespace characters and
         an upper-case letter;
       - self.data: the paragraphs joined by blank lines;
       - self.words: every run of word characters/apostrophes in self.data;
       - self.filteredData: the words joined by single spaces.

     NOTE(review): self.paragraphs and self.sentenceList must already exist
     when this runs (presumably as class attributes) -- confirm.

     :param filePath: path or file object accepted by zipfile.ZipFile.
     """
     # The context manager closes the archive even if the read fails.
     with zipfile.ZipFile(filePath) as document:
         xml_content = document.read('word/document.xml')
     tree = XML(xml_content)

     # Compiled once, outside the loop; raw string avoids the invalid "\s"
     # escape while keeping the same pattern.
     sentenceEnders = re.compile(r'[.!?][\s]{1,2}(?=[A-Z])')

     # Element.iter() replaces getiterator(), removed in Python 3.9.
     for paragraph in tree.iter(self.PARA):
         texts = [node.text for node in paragraph.iter(self.TEXT) if node.text]
         if texts:
             paragraph_text = ''.join(texts)
             self.paragraphs.append(paragraph_text)
             # Rebinding (not +=) keeps the original semantics: a new list
             # is assigned on the instance.
             self.sentenceList = self.sentenceList + sentenceEnders.split(paragraph_text)

     self.data = '\n\n'.join(self.paragraphs)
     self.words = re.findall(r"[\w']+", self.data)
     self.filteredData = ' '.join(self.words)
Exemplo n.º 34
0
def readWord(filename):
    """Return the text runs of a .docx file as one string.

    For every non-empty paragraph, the str() of its list of text runs
    followed by a comma is appended to the result.

    Any exception is printed and swallowed; in that case None is returned.

    :param filename: path or file object accepted by zipfile.ZipFile.
    """
    try:
        # The context manager closes the archive even if the read fails.
        with zipfile.ZipFile(filename) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        chunks = []
        # Element.iter() replaces getiterator(), removed in Python 3.9.
        for paragraph in tree.iter(PARA):
            texts = [
                node.text for node in paragraph.iter(TEXT) if node.text
            ]
            if texts:
                chunks.append(str(texts) + ',')
        return ''.join(chunks)
    except Exception as e:
        print('ReadWord exception', e)
Exemplo n.º 35
0
def docx_do_docx(azip, afile):
    """Extract the paragraph text of an opened .docx archive and pass it on.

    Args:
        azip: Open archive object exposing ``read(name)`` (presumably a
            ``zipfile.ZipFile``); it is not closed here.
        afile: Passed through unchanged to ``text_do_data``.
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    tree = XML(azip.read('word/document.xml'))

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(par):
        # A paragraph's text may be spread over several text nodes.
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    # Paragraphs are separated by a blank line in the forwarded text.
    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
Exemplo n.º 36
0
def parse_docx(path, paragraphs):
    """Append the paragraphs of a .docx file to a list and return the text.

    ``PARA`` and ``TEXT`` are module-level tag names defined elsewhere in the
    file (presumably the WordprocessingML paragraph and text tags).

    Args:
        path: Path of the .docx archive to read.
        paragraphs: List that is extended in place with the text of every
            non-empty paragraph of the document.

    Returns:
        All entries of ``paragraphs`` (including any that were already in the
        list on entry) joined by blank lines.
    """
    # The context manager closes the archive even when ``read`` raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 37
0
    def getContents(self):
        """Read the paragraphs of the document held in ``self.my_docx``.

        Reads ``word/document.xml`` from ``self.my_docx``, closes that archive
        and stores the text of every non-empty paragraph in
        ``self.text_in_paragraphs``.

        Returns:
            The list ``self.text_in_paragraphs``, one string per paragraph.

        Note:
            ``self.my_docx`` is closed by this call (presumably it is a
            ``zipfile.ZipFile``), so it must be reopened before reading again.
        """
        try:
            xml_content = self.my_docx.read('word/document.xml')
        finally:
            # Close the archive even when the member cannot be read.
            self.my_docx.close()
        tree = XML(xml_content)

        self.text_in_paragraphs = []
        # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``
        # consistently for both the paragraph and the text traversal.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                self.text_in_paragraphs.append(''.join(texts))
        return self.text_in_paragraphs
Exemplo n.º 38
0
def docxparse(inDocx, outDocx):
    """Write the paragraph text of a .docx file to a text file.

    ``PARA`` and ``TEXT`` are module-level tag names defined elsewhere in the
    file (presumably the WordprocessingML paragraph and text tags).

    Args:
        inDocx: Path of the .docx archive to read.
        outDocx: Path of the text file to create or overwrite; it receives
            one line per non-empty paragraph.
    """
    # Parse the input first so that an unreadable archive does not leave a
    # truncated output file behind.
    with zipfile.ZipFile(inDocx) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    # A separate name keeps the ``outDocx`` path from being shadowed.
    with open(outDocx, "w+") as out_file:
        out_file.write('\n'.join(paragraphs))
Exemplo n.º 39
0
def get_docx_text(fileName):
    """Extract the plain text of a .docx file.

    Args:
        fileName: Path of the .docx archive to read.

    Returns:
        The text of every non-empty paragraph in ``word/document.xml``, with
        paragraphs separated by a blank line.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The archive was previously never closed; the context manager releases
    # the file handle as soon as the member has been read.
    with zipfile.ZipFile(fileName) as zipFile:
        document = zipFile.read('word/document.xml')
    documentTree = XML(document)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in documentTree.iter(PARA):
        # A paragraph's text may be spread over several text nodes.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
Exemplo n.º 40
0
    def get_docx_text(path):
        """Take the path of a docx file as argument, return the text in unicode.

        Args:
            path: Path of the .docx archive to read.

        Returns:
            The text of every non-empty paragraph in ``word/document.xml``,
            with paragraphs separated by a blank line.
        """
        WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARA = WORD_NAMESPACE + 'p'
        TEXT = WORD_NAMESPACE + 't'

        # The context manager closes the archive even when ``read`` raises.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        return '\n\n'.join(paragraphs)
Exemplo n.º 41
0
    def getContents(self):
        """Collect the paragraph texts of the archive in ``self.my_docx``.

        ``word/document.xml`` is read from ``self.my_docx``, the archive is
        closed, and the text of each non-empty paragraph is stored in
        ``self.text_in_paragraphs``.

        Returns:
            The list ``self.text_in_paragraphs``, one string per paragraph.

        Note:
            This call closes ``self.my_docx`` (presumably a
            ``zipfile.ZipFile``); reopen it before reading again.
        """
        try:
            xml_content = self.my_docx.read('word/document.xml')
        finally:
            # Release the archive even if the member is missing or unreadable.
            self.my_docx.close()
        tree = XML(xml_content)

        self.text_in_paragraphs = []
        # ``Element.getiterator()`` was removed in Python 3.9; ``iter()`` is
        # now used for both the paragraph and the text traversal.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                self.text_in_paragraphs.append(''.join(texts))
        return self.text_in_paragraphs
Exemplo n.º 42
0
def get_docx_text(fileName):
    """Take the path of a docx file as argument, return the text in unicode.

    ``PARA`` and ``TEXT`` are module-level tag names defined elsewhere in the
    file (presumably the WordprocessingML paragraph and text tags).

    Args:
        fileName: Path of the .docx archive to read.

    Returns:
        The text of every non-empty paragraph in ``word/document.xml``, with
        paragraphs separated by a blank line.
    """
    # The context manager closes the archive even when ``read`` raises. The
    # leftover debug dump of the raw XML to stdout has been dropped.
    with zipfile.ZipFile(fileName) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 43
0
def parse_captions(soup):
    """Convert custom iView captions into SRT format.

    Args:
        soup: The caption XML document as text; every ``title`` element is
            expected to carry ``start`` and ``end`` attributes.

    Returns:
        The SRT text. For each ``title`` element, in document order, it
        contains a 1-based counter line, a ``start --> end`` timing line and
        the caption text (``|`` becomes a line break), followed by a blank
        line. An input without ``title`` elements yields ``''``.

    Raises:
        AttributeError: If a ``title`` element lacks ``start`` or ``end``.
    """
    def to_srt_time(stamp):
        # SRT uses a comma where the source uses its last colon.
        cut = stamp.rfind(':')
        return stamp[:cut] + ',' + stamp[cut + 1:]

    cues = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for number, title in enumerate(XML(soup).iter('title'), 1):
        # An empty <title/> has ``text`` set to None; emit an empty caption
        # instead of failing on ``None.replace``.
        caption = (title.text or '').replace('|', '\n')
        start = to_srt_time(title.get('start'))
        end = to_srt_time(title.get('end'))
        cues.append('{0}\n{1} --> {2}\n{3}\n\n'.format(number, start, end, caption))
    # Joining once avoids repeated string concatenation in the loop.
    return ''.join(cues)
def get_docx_text(path, options):
    """Extract .docx text, merging paragraphs into bullet-delimited entries.

    Consecutive paragraphs are concatenated into one entry; a new entry is
    started whenever a paragraph contains a ``BULLETNUM`` element. ``PARA``,
    ``TEXT`` and ``BULLETNUM`` are module-level tag names defined elsewhere
    in the file (``BULLETNUM`` is presumably the numbering-properties tag --
    confirm).

    Args:
        path: Path of the .docx archive to read.
        options: Integer; entries are grouped in chunks of ``options + 1``.

    Returns:
        A 2-tuple ``(text, groups)``: ``text`` is all entries joined by blank
        lines, and ``groups`` is a list of lists holding at most
        ``options + 1`` consecutive entries each. A final entry is always
        appended, so a document without paragraphs yields ``['']``.
    """
    # The context manager closes the archive even when ``read`` raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    para = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        alltext = [node.text for node in paragraph.iter(TEXT) if node.text]
        # Only the presence of a bullet marker matters, not how many exist.
        has_bullet = next(paragraph.iter(BULLETNUM), None) is not None
        if has_bullet and para:
            # A bullet starts a new entry: flush what was collected so far.
            paragraphs.append(''.join(para))
            para = []
        para.append(''.join(alltext))
    paragraphs.append(''.join(para))  # flush the last entry

    group_size = options + 1
    groups = [paragraphs[i:i + group_size]
              for i in range(0, len(paragraphs), group_size)]
    return '\n\n'.join(paragraphs), groups
Exemplo n.º 45
0
def get_docx_text(path):
    """Take the path of a docx file as argument, return the text in unicode.

    Args:
        path: Path of the .docx archive to read.

    Returns:
        The text of every non-empty paragraph in ``word/document.xml``, with
        paragraphs separated by a blank line.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The context manager closes the archive even when ``read`` raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        # A paragraph's text may be spread over several text nodes.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 46
0
def get_docx_text(path):
    """Take the path of a docx file as argument, return its paragraphs.

    Args:
        path: Path of the .docx archive to read.

    Returns:
        A list with the text of every non-empty paragraph found in
        ``word/document.xml``, in document order.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Qualified tag names of the WordprocessingML paragraph and text elements.
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The context manager closes the archive even when ``read`` raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        # Keep only text nodes that actually carry text.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
Exemplo n.º 47
0
def _docx_to_txt(path):
    """Take the path of a docx file as argument, return the text in unicode.

    ``PARA`` and ``TEXT`` are module-level tag names defined elsewhere in the
    file (presumably the WordprocessingML paragraph and text tags).

    Args:
        path: Path of the .docx archive to read.

    Returns:
        The text of every non-empty paragraph in ``word/document.xml``, with
        paragraphs separated by a blank line.

    Raises:
        BadZipFile: If ``path`` is not a valid zip archive; the path is
            printed before the exception is re-raised.
    """
    try:
        document = zipfile.ZipFile(path)
    except BadZipFile:
        print (path)
        print (" is not a valid zip file")
        # A bare ``raise`` re-raises with the original traceback intact.
        raise
    # The context manager closes the archive even when ``read`` raises.
    with document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Exemplo n.º 48
0
Arquivo: Conv.py Projeto: Thormod/Unal
    def prepare_docx(self,source):
        """Read the .docx, extract its text and write it out line by line.

        Every paragraph of ``word/document.xml`` becomes one line of a UTF-8
        text file: non-empty paragraphs are passed through
        ``self.pre_process`` first (presumably where split words are
        repaired -- confirm), empty paragraphs become empty lines.

        Args:
            source: Path of the .docx file; its last five characters
                (expected to be ``.docx``) are replaced by ``_txt.txt`` to
                build the output path.

        Returns:
            The path of the text file that was written.
        """
        # The context manager closes the archive even when ``read`` raises.
        with zipfile.ZipFile(source) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)
        output = str(source[:-5]) + '_txt.txt'

        # ``with`` guarantees the output file is closed if processing fails.
        with open(output, 'wb') as t:
            # ``Element.getiterator()`` was removed in Python 3.9.
            for paragraph in tree.iter(PARA):
                texts = [node.text for node in paragraph.iter(TEXT) if node.text]
                # ``texts`` is a list, so comparing it with strings was always
                # true; truthiness is the only meaningful check.
                if texts:
                    t.write(self.pre_process(''.join(texts)).encode('utf-8'))
                # Every paragraph, empty or not, ends its own output line.
                t.write(b'\n')

        return output
def get_docx_comments(path, anonymous):
    """Take the path of a docx file, return the comments text in unicode.

    ``COMMENT``, ``ID``, ``DATE``, ``AUTHOR``, ``PARA`` and ``TEXT`` are
    module-level names defined elsewhere in the file.

    Args:
        path: Path of the .docx archive to read.
        anonymous: Currently unused by this function.
            NOTE(review): presumably meant to suppress author names --
            confirm the intended behaviour before relying on it.

    Returns:
        For every non-empty paragraph of every comment, a header
        ``"<n>. Author: <author>  Date:<date>"`` followed by the paragraph
        text, all joined by blank lines. ``<n>`` is the comment id plus one.

    Raises:
        KeyError: If the archive has no ``word/comments.xml`` member, or a
            comment lacks one of the ``ID``/``DATE``/``AUTHOR`` attributes.
    """
    # The context manager closes the archive even when ``read`` raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read(r'word/comments.xml')
    tree = XML(xml_content)

    paragraphs = []
    # ``Element.getiterator()`` was removed in Python 3.9; use ``iter()``.
    for comment in tree.iter(COMMENT):
        # Bump the id so numbering shown to readers does not start at zero;
        # the local name avoids shadowing the builtin ``id``.
        number = int(comment.attrib[ID]) + 1
        dt = comment.attrib[DATE]
        author = comment.attrib[AUTHOR]
        header = '{0}. Author: {1}  Date:{2}'.format(number, author, dt)

        for paragraph in comment.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                # The header is repeated before each paragraph of a comment.
                paragraphs.append(header)
                paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)