def process(self, form):
    """Convert the uploaded office file to CNXML, save it and validate it.

    The file named by ``self.original_filename`` is converted to ``.odt``
    first (via ``self._convert_to_odt``) unless it already has that
    extension. The ODT is then transformed, cleaned, saved into
    ``self.save_dir`` and validated.

    Returns the rendered conversion-error page on ``ConversionError``, the
    rendered ``templates/error.pt`` page (after writing the traceback to the
    zip file together with *form*) on any other exception, and otherwise an
    ``HTTPFound`` redirect to the route returned by ``self.nextStep()``.
    """
    try:
        base_name, suffix = os.path.splitext(self.original_filename)
        odt_path = str(base_name) + '.odt'

        # Anything that is not already ODT goes through the office converter.
        if suffix != '.odt':
            self._convert_to_odt(base_name)

        # Transform the ODT and persist the CNXML plus every extracted file.
        tree, extracted, _errors = transform(odt_path)
        serialized = etree.tostring(tree)
        cnxml = clean_cnxml(serialized)
        save_cnxml(self.save_dir, cnxml, extracted.items())

        # Validation (jing) happens only after the files are on disk.
        validate_cnxml(cnxml)
    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)
    except Exception:
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': tb}, request=self.request)

    self.request.session.flash(self.message)
    next_url = self.request.route_url(self.nextStep())
    return HTTPFound(location=next_url)
def process(self):
    """Convert the uploaded office file to CNXML, save it and validate it.

    ``self.original_filename`` is converted to ``.odt`` through
    ``self._convert_to_odt`` when its extension is anything else; the ODT is
    then transformed, cleaned, saved into ``self.save_dir`` and validated.

    Returns the rendered conversion-error page on ``ConversionError``, the
    rendered ``templates/error.pt`` page (after the traceback has been
    written to the zip file) on any other exception, and otherwise an
    ``HTTPFound`` redirect to the route returned by ``self.nextStep()``.
    """
    try:
        stem, ext = os.path.splitext(self.original_filename)
        odt_path = str(stem) + '.odt'

        # Non-ODT uploads are converted to ODT first.
        needs_conversion = ext != '.odt'
        if needs_conversion:
            self._convert_to_odt(stem)

        # Transform, then store the CNXML together with the extracted files.
        tree, extracted, _errors = transform(odt_path)
        cnxml = clean_cnxml(etree.tostring(tree))
        save_cnxml(self.save_dir, cnxml, extracted.items())

        # Validate with jing once everything has been saved.
        validate_cnxml(cnxml)
    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)
    except Exception:
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': tb}, request=self.request)

    self.request.session.flash(self.message)
    target = self.request.route_url(self.nextStep())
    return HTTPFound(location=target)
def test_odt(self):
    """Transform every ``.odt`` fixture and diff the result with its ``.cnxml``.

    For each ``.odt`` file in ``test_folder_name + 'odt/'`` the document is
    transformed, cleaned and validated, written to ``<name>.tmp``, passed
    through ``remove_ids`` and compared with ``<name>.cnxml`` using the
    external ``diff`` program. Differences are written to ``<name>.diff``;
    anything ``diff`` prints on stderr is written to ``<name>.err``.

    Nothing is asserted: results are only reported with ``print``.
    """
    odt_dir = test_folder_name + 'odt/'
    # Only .odt files are fixtures; the directory also holds generated
    # .cnxml/.tmp/.diff/.err files.
    odt_files = [name for name in os.listdir(odt_dir)
                 if os.path.splitext(name)[1] == '.odt']

    for f in odt_files:
        original_filename = odt_dir + f
        filename = os.path.splitext(original_filename)[0]
        valid_filename = filename + '.cnxml'
        output_filename = filename + '.tmp'
        diff_filename = filename + '.diff'
        err_filename = filename + '.err'

        try:
            # Existence/readability probe; close right away so the handle
            # is not leaked.
            open(valid_filename, 'r').close()
        except IOError:
            print('Missing valid file (' + valid_filename +
                  ') for testing ' + original_filename)
            # NOTE(review): this stops the whole test at the first missing
            # reference file; confirm whether `continue` was intended.
            return

        tree, files, errors = transform(original_filename)
        cnxml = clean_cnxml(etree.tostring(tree))
        validate_cnxml(cnxml)
        with open(output_filename, 'w') as output:
            output.write(cnxml)
        remove_ids(output_filename)

        # stderr must be piped as well, otherwise communicate() returns None
        # for it and the error branch below can never run.
        process = subprocess.Popen(
            ['diff', valid_filename, output_filename],
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        diff_text, err_text = process.communicate()

        # The pipes deliver raw bytes, so the report files are opened in
        # binary mode.
        if diff_text:
            with open(diff_filename, 'wb') as diff_output:
                diff_output.write(diff_text)
            print(
                'Differences in the testing of ' + original_filename +
                ', information on those differences has been placed in ' +
                diff_filename)
        elif err_text:
            with open(err_filename, 'wb') as err_output:
                err_output.write(err_text)
            print(
                'Error(s) occurred while attempting to test for differences in CNXML output of '
                + original_filename +
                ', information on these errors are in ' + err_filename)
def convert(self, data, outdata, **kwargs):
    """Convert an uploaded office document to CNXML and store it in *outdata*.

    Required keyword arguments:
        original_file_name: name of the uploaded file.
        user_name: name of the uploading user.

    The upload is first written to disk with ``self.writeToGood``. Whenever
    a step fails, the stored file is handed to ``self.moveToBad`` before the
    error is raised; on success ``self.cleanup`` is called on it.

    Returns *outdata* with its data set to a ``StringIO`` holding the
    pretty-printed CNXML and its sub-objects set to the files returned by
    ``odt2cnxml.transform``.

    Raises:
        OOoImportError: for ``.xml`` input, when the office conversion
            returns no data, or when the transform raises OOoImportError.
        Any exception raised by ``self.convertWordToOOo`` is re-raised as is.
    """
    ### JCC TODO: all the back and forth about whether the data is a
    ### file or data should be streamlined, if possible

    strOriginalFileName = kwargs['original_file_name']
    strUserName = kwargs['user_name']
    zLOG.LOG("OOo2CNXML Transform", zLOG.INFO,
             "Original file name is : \"" + strOriginalFileName + "\". User is : \"" + strUserName + "\"")

    # write the file to disk; attempt to harvest to central location else put in /tmp
    strFileName = self.writeToGood(data, strUserName, strOriginalFileName)

    if strOriginalFileName.endswith('.xml'):
        zLOG.LOG("OOo2CNXML Transform", zLOG.INFO, "Input file is a .xml file.  Terminate import.")
        # importing .xml file sometime blows up the OOo server and lacks a use case so we punt.
        self.moveToBad(strFileName)
        # Call form of raise: same behaviour as `raise Cls, "msg"`, but
        # valid syntax on Python 3 too.
        raise OOoImportError("Could not convert .xml file.  Please try another file type.")

    # OOo convert a doc file into an XML file embedded in a zip file.
    try:
        binOOoData = self.convertWordToOOo(strFileName)
    except:
        # Deliberately catches everything: the file is moved aside and the
        # original exception is re-raised unchanged.
        self.moveToBad(strFileName)
        raise

    if len(binOOoData) == 0:
        zLOG.LOG("OOo2CNXML Transform", zLOG.INFO,
                 "Open Office does not return anything.  The Open Office server may not be running.")
        # don't know for sure if the conversion failed, so do we leave
        # the harvested word file in the GOOD directory or do we leave
        # the word file in the BAD directory?  Choosing to keep the GOOD
        # as pristine as possible at the current time.
        self.moveToBad(strFileName)
        raise OOoImportError("Could not convert file")

    fileOOo = StringIO(binOOoData)
    try:
        elCnxml, filesDict, errors = odt2cnxml.transform(fileOOo)
        from lxml import etree
        strCnxml = etree.tostring(elCnxml, pretty_print=True)
    except OOoImportError:
        # toCnxml() wrote log messages
        self.moveToBad(strFileName)
        raise OOoImportError("Generated CNXML is invalid")

    fileCnxmlClean = StringIO(strCnxml)
    outdata.setData(fileCnxmlClean)

    # Add images
    objects = filesDict  # {}
    outdata.setSubObjects(objects)

    self.cleanup(strFileName)

    return outdata
def test_doc(self):
    """Convert every ``.doc`` fixture to CNXML and diff it with its ``.cnxml``.

    For each ``.doc`` file in ``test_folder_name + 'doc/'`` the document is
    converted to ``<name>.odt`` by running soffice with the ``SaveAsOOO``
    macro, then transformed, cleaned and validated, written to
    ``<name>.tmp``, passed through ``remove_ids`` and compared with
    ``<name>.cnxml`` using the external ``diff`` program. Differences are
    written to ``<name>.diff``; anything ``diff`` prints on stderr is written
    to ``<name>.err``.

    Nothing is asserted: results are only reported with ``print``.
    """
    doc_dir = test_folder_name + 'doc/'
    # Only .doc files are fixtures; the directory also holds generated files.
    doc_files = [name for name in os.listdir(doc_dir)
                 if os.path.splitext(name)[1] == '.doc']

    cwd = os.getcwd()
    for f in doc_files:
        original_filename = doc_dir + f
        filename = os.path.splitext(original_filename)[0]
        valid_filename = filename + '.cnxml'
        output_filename = filename + '.tmp'
        diff_filename = filename + '.diff'
        err_filename = filename + '.err'
        odt_filename = filename + '.odt'

        # The macro receives absolute source and target paths.
        command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + cwd + '/' + original_filename + ',' + cwd + '/' + odt_filename + ')"'
        os.system(command)

        try:
            # Existence/readability probe; close right away so the handle
            # is not leaked.
            open(valid_filename, 'r').close()
        except IOError:
            print('Missing valid file (' + valid_filename + ') for testing ' + original_filename)
            # NOTE(review): this stops the whole test at the first missing
            # reference file; confirm whether `continue` was intended.
            return

        tree, files, errors = transform(odt_filename)
        cnxml = clean_cnxml(etree.tostring(tree))
        validate_cnxml(cnxml)
        with open(output_filename, 'w') as output:
            output.write(cnxml)
        remove_ids(output_filename)

        # stderr must be piped as well, otherwise communicate() returns None
        # for it and the error branch below can never run.
        process = subprocess.Popen(
            ['diff', valid_filename, output_filename],
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        diff_text, err_text = process.communicate()

        # The pipes deliver raw bytes, so the report files are opened in
        # binary mode.
        if diff_text:
            with open(diff_filename, 'wb') as diff_output:
                diff_output.write(diff_text)
            print('Differences in the testing of ' + original_filename + ', information on those differences has been placed in ' + diff_filename)
        elif err_text:
            with open(err_filename, 'wb') as err_output:
                err_output.write(err_text)
            print('Error(s) occurred while attempting to test for differences in CNXML output of ' + original_filename + ', information on these errors are in ' + err_filename)
def test_bolditalic(self):
    """Check that ``bolditalic.odt`` transforms to the stored reference CNXML.

    Both sides have their ``id="..."`` attributes removed before comparison;
    the reference additionally has newlines and whitespace between tags
    removed.
    """
    odtfile = os.path.join(dirname, 'data', 'bolditalic.odt')
    # Read the reference inside a context manager so the handle is closed.
    with open(os.path.join(dirname, 'data', 'bolditalic.cnxml')) as reference:
        validcnxml = reference.read()

    # remove newlines and indentation and ids
    # (raw strings: same patterns as before, without invalid escape sequences)
    validcnxml = validcnxml.replace('\n', '')
    validcnxml = re.sub(r'>\s+<', '><', validcnxml)
    validcnxml = re.sub(r'id=".*?"', '', validcnxml)

    cnxml, images, errors = transform(odtfile)
    cnxml = etree.tostring(cnxml)
    # strip ids
    cnxml = re.sub(r'id=".*?"', '', cnxml)

    self.assertEqual(cnxml, validcnxml)
def test_odt(self):
    """Transform every ``.odt`` fixture and diff the result with its ``.cnxml``.

    For each ``.odt`` file in ``test_folder_name + 'odt/'`` the document is
    transformed, cleaned and validated, written to ``<name>.tmp``, passed
    through ``remove_ids`` and compared with ``<name>.cnxml`` using the
    external ``diff`` program. Differences are written to ``<name>.diff``;
    anything ``diff`` prints on stderr is written to ``<name>.err``.

    Nothing is asserted: results are only reported with ``print``.
    """
    odt_dir = test_folder_name + 'odt/'
    # Only .odt files are fixtures; the directory also holds generated files.
    odt_files = [name for name in os.listdir(odt_dir)
                 if os.path.splitext(name)[1] == '.odt']

    for f in odt_files:
        original_filename = odt_dir + f
        filename = os.path.splitext(original_filename)[0]
        valid_filename = filename + '.cnxml'
        output_filename = filename + '.tmp'
        diff_filename = filename + '.diff'
        err_filename = filename + '.err'

        try:
            # Existence/readability probe; close right away so the handle
            # is not leaked.
            open(valid_filename, 'r').close()
        except IOError:
            print('Missing valid file (' + valid_filename + ') for testing ' + original_filename)
            # NOTE(review): this stops the whole test at the first missing
            # reference file; confirm whether `continue` was intended.
            return

        tree, files, errors = transform(original_filename)
        cnxml = clean_cnxml(etree.tostring(tree))
        validate_cnxml(cnxml)
        with open(output_filename, 'w') as output:
            output.write(cnxml)
        remove_ids(output_filename)

        # stderr must be piped as well, otherwise communicate() returns None
        # for it and the error branch below can never run.
        process = subprocess.Popen(
            ['diff', valid_filename, output_filename],
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        diff_text, err_text = process.communicate()

        # The pipes deliver raw bytes, so the report files are opened in
        # binary mode.
        if diff_text:
            with open(diff_filename, 'wb') as diff_output:
                diff_output.write(diff_text)
            print('Differences in the testing of ' + original_filename + ', information on those differences has been placed in ' + diff_filename)
        elif err_text:
            with open(err_filename, 'wb') as err_output:
                err_output.write(err_text)
            print('Error(s) occurred while attempting to test for differences in CNXML output of ' + original_filename + ', information on these errors are in ' + err_filename)
def process_import(save_dir_path, original_filepath, filename, download_url):
    """Convert an office file to an HTML preview zip inside *save_dir_path*.

    Steps: convert *original_filepath* to ``<filename>.odt`` with the JOD
    converter, transform the ODT to CNXML, render the CNXML as an HTML
    preview, and write ``<filename>.zip`` containing ``index.html`` plus
    every file returned by the transform. An existing zip of the same name
    is first renamed to ``<filename>.zip~``.

    Args:
        save_dir_path: directory that receives the ``.odt`` and ``.zip``.
        original_filepath: path of the uploaded office document.
        filename: base name (no extension) used for the generated files.
        download_url: returned unchanged on success.

    Raises:
        IOError: if the converted ``.odt`` cannot be opened.
        Any exception raised by the converter propagates unchanged.
    """
    # convert from other office format to odt
    odt_filename = '%s.odt' % filename
    odt_filepath = str(os.path.join(save_dir_path, odt_filename))

    # run jod service; errors propagate with their original traceback
    # (wrapping the call in `except ...: raise e` added nothing).
    converter = JOD.DocumentConverterClient()
    converter.convert(original_filepath, 'odt', odt_filepath)

    # check the converted file exists and is readable; IOError propagates
    with open(odt_filepath, 'r'):
        pass

    # convert to cnxml
    tree, files, errors = transform(odt_filepath)
    cnxml = clean_cnxml(etree.tostring(tree))

    # convert to html
    html = cnxml_to_htmlpreview(cnxml)

    # produce zipfile in memory; close it even if a write fails
    ram = StringIO()
    zip_archive = zipfile.ZipFile(ram, 'w')
    try:
        zip_archive.writestr('index.html', html)
        for fname, fdata in files.items():
            zip_archive.writestr(fname, fdata)
    finally:
        zip_archive.close()

    # save zipfile, keeping one backup of a previous result
    zip_file_path = os.path.join(save_dir_path, '%s.zip' % filename)
    if os.path.exists(zip_file_path):
        os.rename(zip_file_path, zip_file_path + '~')
    with open(zip_file_path, 'wb') as f:
        f.write(ram.getvalue())

    return download_url
def office_to_cnxml(pathToOfficeFile, verbose=True):
    """Transform an office document into CNXML.

    Files whose name does not end in ``.odt`` (case-insensitive) are first
    converted to ``/tmp/temp.odt`` by running soffice with the ``SaveAsOOO``
    macro. *verbose* is accepted but not used.

    Returns a ``(cnxml_string, files)`` tuple, where *files* is the second
    value returned by ``odt2cnxml.transform``.
    """
    import os

    # Work with an absolute path from here on.
    source_path = os.path.abspath(pathToOfficeFile)

    # Everything from the last "." onwards is treated as the extension.
    suffix = source_path[source_path.rfind("."):].lower()
    if suffix != ".odt":
        # NOTE(review): fixed temp path is shared by every call; confirm
        # that concurrent conversions are not expected.
        converted_path = "/tmp/temp.odt"
        # [1:-1] drops the first and last character of the escaped name,
        # presumably the surrounding quotes; verify against escape_system.
        macro_args = escape_system(source_path)[1:-1] + "," + converted_path
        os.system(
            '/usr/bin/soffice -headless -nologo -nofirststartwizard "macro:///Standard.Module1.SaveAsOOO('
            + macro_args
            + ')"'
        )
        source_path = converted_path

    # Convert to CNXML
    xml, files, _errors = odt2cnxml.transform(source_path)
    return etree.tostring(xml), files
quit() filename=sys.argv[1] name, extension = os.path.splitext(filename) if(extension == '.odt' or extension == '.doc'): if(extension == '.doc'): doc_folder = os.getcwd()+'/'+os.path.dirname(name) os.system('./converters/doc2odt -o '+doc_folder+' '+os.getcwd()+'/'+filename) #command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd()+'/'+filename + ',' + os.getcwd()+'/'+name+'.odt' + ')"' #os.system(command) filename=name+'.odt' valid_filename=name+'.cnxml' tree, files, errors = transform(filename) cnxml = clean_cnxml(etree.tostring(tree)) output=open(valid_filename,'w') output.write(cnxml) output.close() remove_ids(valid_filename) if(extension == '.doc'): os.remove(os.getcwd()+'/'+name+'.odt') elif(extension == '.tex'): valid_filename=name+'.cnxml' fp = open(filename, 'r') latex_archive = fp.read() fp.close() # LaTeX 2 CNXML transformation
def test_doc(self): doc_files = os.listdir(test_folder_name + 'doc/') i = 0 # Find only .odt files in the testing folder for odt while (i < len(doc_files)): f = doc_files[i] filename, extension = os.path.splitext(f) if (extension != '.doc'): doc_files.remove(f) else: i = i + 1 for f in doc_files: original_filename = test_folder_name + 'doc/' + f filename, extension = os.path.splitext(original_filename) valid_filename = filename + '.cnxml' output_filename = filename + '.tmp' doc_filename = original_filename diff_filename = filename + '.diff' err_filename = filename + '.err' odt_filename = filename + '.odt' command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd( ) + '/' + original_filename + ',' + os.getcwd( ) + '/' + odt_filename + ')"' os.system(command) try: open(valid_filename, 'r') except IOError as e: print('Missing valid file (' + valid_filename + ') for testing ' + original_filename) return tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) validate_cnxml(cnxml) output = open(output_filename, 'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen( ['diff', valid_filename, output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if (std_output[0] != None and len(std_output[0]) != 0): diff_output = open(diff_filename, 'w') diff_output.write(std_output[0]) diff_output.close() print( 'Differences in the testing of ' + original_filename + ', information on those differences has been placed in ' + diff_filename) elif (std_output[1] != None and len(std_output[1]) != 0): err_output = open(err_filename, 'w') err_output.write(std_output[1]) err_output.close() print( 'Error(s) occurred while attempting to test for differences in CNXML output of ' + original_filename + ', information on these errors are in ' + err_filename)
def choose_view(request): check_login(request) templatePath = 'templates/choose.pt' form = Form(request, schema=UploadSchema) field_list = [('upload', 'File')] # clear the session if 'transformerror' in request.session: del request.session['transformerror'] if 'title' in request.session: del request.session['title'] # Check for successful form completion if form.validate(): try: # Catch-all exception block # Create a directory to do the conversions now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') # TODO: This has a good chance of being unique, but even so... temp_dir_name = '%s-%s' % (request.session['username'], now_string) save_dir = os.path.join( request.registry.settings['transform_dir'], temp_dir_name ) os.mkdir(save_dir) # Keep the info we need for next uploads. Note that this # might kill the ability to do multiple tabs in parallel, # unless it gets offloaded onto the form again. request.session['upload_dir'] = temp_dir_name if form.data['upload'] is not None: request.session['filename'] = form.data['upload'].filename # Google Docs Conversion # if we have a Google Docs ID and Access token. 
if form.data['gdocs_resource_id']: gdocs_resource_id = form.data['gdocs_resource_id'] gdocs_access_token = form.data['gdocs_access_token'] form.data['gdocs_resource_id'] = None form.data['gdocs_access_token'] = None (request.session['title'], request.session['filename']) = \ process_gdocs_resource(save_dir, \ gdocs_resource_id, \ gdocs_access_token) # HTML URL Import: elif form.data.get('url_text'): url = form.data['url_text'] form.data['url_text'] = None # Build a regex for Google Docs URLs regex = re.compile("^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/") r = regex.search(url) # Take special action for Google Docs URLs if r: gdocs_resource_id = r.groups()[0] (request.session['title'], request.session['filename']) = \ process_gdocs_resource(save_dir, "document:" + gdocs_resource_id) else: # download html: #html = urllib2.urlopen(url).read() # Simple urlopen() will fail on mediawiki websites like e.g. Wikipedia! import_opener = urllib2.build_opener() import_opener.addheaders = [('User-agent', 'Mozilla/5.0')] try: import_request = import_opener.open(url) html = import_request.read() # transformation cnxml, objects, html_title = htmlsoup_to_cnxml( html, bDownloadImages=True, base_or_source_url=url) request.session['title'] = html_title cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) # Keep the info we need for next uploads. Note that # this might kill the ability to do multiple tabs in # parallel, unless it gets offloaded onto the form # again. request.session['filename'] = "HTML Document" validate_cnxml(cnxml) except urllib2.URLError, e: request['errors'] = ['The URL %s could not be opened' %url,] response = { 'form': FormRenderer(form), } return render_to_response(templatePath, response, request=request) # Office, CNXML-ZIP or LaTeX-ZIP file else: # Save the original file so that we can convert, plus keep it. 
original_filename = os.path.join( save_dir, form.data['upload'].filename.replace(os.sep, '_')) saved_file = open(original_filename, 'wb') input_file = form.data['upload'].file shutil.copyfileobj(input_file, saved_file) saved_file.close() input_file.close() form.data['upload'] = None # Check if it is a ZIP file with at least index.cnxml or a LaTeX file in it try: zip_archive = zipfile.ZipFile(original_filename, 'r') is_zip_archive = ('index.cnxml' in zip_archive.namelist()) # Do we have a latex file? if not is_zip_archive: # incoming latex.zip must contain a latex.tex file, where "latex" is the base name. (latex_head, latex_tail) = os.path.split(original_filename) (latex_root, latex_ext) = os.path.splitext(latex_tail) latex_basename = latex_root latex_filename = latex_basename + '.tex' is_latex_archive = (latex_filename in zip_archive.namelist()) except zipfile.BadZipfile: is_zip_archive = False is_latex_archive = False # ZIP package from previous conversion if is_zip_archive: # Unzip into transform directory zip_archive.extractall(path=save_dir) # Rename ZIP file so that the user can download it again os.rename(original_filename, os.path.join(save_dir, 'upload.zip')) # Read CNXML with open(os.path.join(save_dir, 'index.cnxml'), 'rt') as fp: cnxml = fp.read() # Convert the CNXML to XHTML for preview html = cnxml_to_htmlpreview(cnxml) with open(os.path.join(save_dir, 'index.xhtml'), 'w') as index: index.write(html) cnxml = clean_cnxml(cnxml) validate_cnxml(cnxml) # LaTeX elif is_latex_archive: f = open(original_filename) latex_archive = f.read() # LaTeX 2 CNXML transformation cnxml, objects = latex_to_cnxml(latex_archive, original_filename) cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) validate_cnxml(cnxml) # OOo / MS Word Conversion else: # Convert from other office format to odt if needed odt_filename = original_filename filename, extension = os.path.splitext(original_filename) if(extension != '.odt'): odt_filename= '%s.odt' % filename 
command = '/usr/bin/soffice -headless -nologo -nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + escape_system(original_filename)[1:-1] + ',' + odt_filename + ')"' os.system(command) try: fp = open(odt_filename, 'r') fp.close() except IOError as io: raise ConversionError("%s not found" % original_filename) # Convert and save all the resulting files. tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) save_cnxml(save_dir, cnxml, files.items()) # now validate with jing validate_cnxml(cnxml)
quit() filename = sys.argv[1] name, extension = os.path.splitext(filename) if (extension == '.odt' or extension == '.doc'): if (extension == '.doc'): doc_folder = os.getcwd() + '/' + os.path.dirname(name) os.system('./converters/doc2odt -o ' + doc_folder + ' ' + os.getcwd() + '/' + filename) #command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd()+'/'+filename + ',' + os.getcwd()+'/'+name+'.odt' + ')"' #os.system(command) filename = name + '.odt' valid_filename = name + '.cnxml' tree, files, errors = transform(filename) cnxml = clean_cnxml(etree.tostring(tree)) output = open(valid_filename, 'w') output.write(cnxml) output.close() remove_ids(valid_filename) if (extension == '.doc'): os.remove(os.getcwd() + '/' + name + '.odt') elif (extension == '.tex'): valid_filename = name + '.cnxml' fp = open(filename, 'r') latex_archive = fp.read() fp.close() # LaTeX 2 CNXML transformation