def __init__(self, inputFilePath):
    """Load a .docx template and split its paragraphs into questions.

    The archive at ``inputFilePath`` is opened as a zip file. Every member
    except ``word/document.xml`` is handed to ``self.zipper`` unchanged;
    ``word/document.xml`` is parsed into ``self.docXMLRoot``. Deep copies
    of all ``w:p`` elements are stored in ``self.paragraphs`` and the
    originals are then removed from the document body. Finally the copies
    are grouped into ``Question`` objects: a question starts at a paragraph
    whose text contains "@" but not "@&", and ends at the first *following*
    paragraph whose text contains "$" (both delimiters included).

    Args:
        inputFilePath: Path (or file-like object) accepted by ``ZipFile``.

    Raises:
        IndexError: If a question start marker has no closing "$"
            paragraph after it.
        AttributeError: If the archive has no ``word/document.xml`` member
            (``self.docXMLRoot`` is only assigned when that member is
            seen) -- NOTE(review): assumes no class-level default exists.
    """
    # NOTE(review): this parser is stored but not passed to any
    # ``etree.fromstring`` call in this method, so ``remove_blank_text``
    # is not applied to the documents parsed here -- confirm intent.
    self._parser = etree.XMLParser(encoding='utf-8', remove_blank_text=True)
    self.zipper = customZipper.Zipper()
    self.blueprint = z.ZipFile(inputFilePath)
    self.prefix = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    self.paragraphs = []
    self.questions = []
    # Empty paragraph element kept as a template for blank lines.
    self.blank = etree.fromstring(
        """<w:p xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:pPr><w:pStyle w:val="Normal"/><w:ind w:left="0" w:right="0" w:hanging="0"/><w:rPr><w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman"/><w:sz w:val="20"/><w:lang w:val="tr-TR"/></w:rPr></w:pPr><w:r><w:rPr><w:sz w:val="20"/><w:lang w:val="tr-TR"/></w:rPr></w:r></w:p>"""
    )

    # Traverse the archived docx file: parse document.xml, and pass every
    # other member through to the zipper untouched.
    for fName in self.blueprint.namelist():
        with self.blueprint.open(fName) as f:
            if fName == 'word/document.xml':
                self.docXMLRoot = etree.fromstring(f.read())
            else:
                self.zipper.addFile(fName, f.read())

    self.body = self.docXMLRoot.find(self.prefix + "body")

    # Snapshot the paragraph elements once so the tree is never mutated
    # while it is being iterated.
    originals = list(self.docXMLRoot.iter(self.prefix + "p"))
    self.paragraphs = [copy.deepcopy(paragraph) for paragraph in originals]

    # Clear the document.
    # NOTE(review): ``iter`` also yields paragraphs nested deeper than the
    # body (e.g. inside tables); ``self.body.remove`` would raise for
    # those because they are not direct children -- confirm templates
    # never contain such paragraphs.
    for paragraph in originals:
        self.body.remove(paragraph)

    # Group paragraphs into questions. An explicit index is used because
    # paragraphs consumed by a question must be skipped; reassigning the
    # loop variable of a ``for ... in range(...)`` loop has no effect.
    total = len(self.paragraphs)
    start = 0
    while start < total:
        text = self.getParagraphText(self.paragraphs[start])
        if "@" in text and "@&" not in text:
            # The terminator is searched for strictly after the start
            # paragraph, even if the start paragraph itself contains "$".
            end = start + 1
            while end < total and "$" not in self.getParagraphText(
                    self.paragraphs[end]):
                end += 1
            if end >= total:
                # Same exception type the unbounded scan used to raise,
                # but with a message that identifies the problem.
                raise IndexError(
                    "question starting at paragraph %d has no closing "
                    "'$' paragraph" % start
                )
            self.questions.append(
                Question.Question(self.paragraphs[start:end + 1]))
            # Resume after the closing paragraph.
            start = end
        start += 1