def fileuploadIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded image/PDF file and print whether output exists.

    The file is passed to ``prepareIMG``. If that leaves an ``ocrWdummy.txt``
    in the ocropy work directory, ``citeExtract.pl`` is run on it in
    ``extract_citations`` mode, ``createBibstruct`` builds the output XML and
    the XML plus two HTML files are copied into ``OUTPUT_FOLDER + filename``.
    Afterwards the run is logged with ``writeLog``, the upload is deleted and
    a "Finished"/"Error" line is printed depending on whether the output XML
    exists.

    Args:
        UPLOAD_FOLDER: Path prefix that is concatenated directly with
            ``filename`` to locate the upload.
        OUTPUT_FOLDER: Path prefix; results go to ``OUTPUT_FOLDER + filename``.
        settings: Sequence whose item 2 is converted with ``int()`` and used
            as the column number.
        filename: Name of the uploaded file; the part up to the first ``_``
            is stripped for the printed status line.
    """
    columnNumber = int(settings[2])

    work_root = ocropy + "/processedFiles/"
    if not os.path.exists(work_root):
        os.makedirs(work_root)
    work_dir = work_root + filename

    # PDFs skip the image sanity check; other files are opened with Image
    # and their colour list is inspected.
    is_pdf = '.' in filename and filename.rsplit('.', 1)[1] == 'pdf'
    colors = 0
    if not is_pdf:
        colors = Image.open(UPLOAD_FOLDER + filename).getcolors()

    if is_pdf or colors is None or len(colors) != 0:
        # process the image file and extract its text
        prepareIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, filename, columnNumber)

        ocr_text = work_dir + '/ocrWdummy.txt'
        # stop if no text was found
        if os.path.exists(ocr_text):
            # With shell=True only the first list item is run as the command
            # line (POSIX); the second item goes to the shell itself, so the
            # XML is collected from stdout below.
            extractor = subprocess.Popen(
                ["./citeExtract.pl -m extract_citations " + ocr_text,
                 LOCDB + "tmp/" + filename + "_ParsIMG.xml"],
                shell=True,
                stdout=subprocess.PIPE,
                cwd=parsCit)
            parscit_output = extractor.communicate()[0]

            # "tmp/" is relative to the current working directory.
            tmp_xml = "tmp/" + filename + "_ParsIMG.xml"
            with open(tmp_xml, 'w') as tmp_handle:
                tmp_handle.write(parscit_output)

            bibstruct_soup = createBibstruct(filename)

            # delete tmp files
            os.remove(tmp_xml)

            work_xml = work_dir + '/xmloutput.xml'
            with open(work_xml, 'w') as xml_handle:
                xml_handle.write(bibstruct_soup.encode('utf-8'))

            result_dir = OUTPUT_FOLDER + filename
            copyfile(work_xml,
                     result_dir + "/Output" + filename + ".xml")
            copyfile(work_dir + "/temp.html",
                     result_dir + "/filenameTemp.html")
            copyfile(work_dir + "/tempcorrection.html",
                     result_dir + "/filenameTempcorrection.html")

            # remove tmp data images
            # NOTE(review): nesting level of this cleanup is inferred from a
            # whitespace-collapsed source - confirm against the original.
            shutil.rmtree(work_dir)

    writeLog(filename, ["IMG", columnNumber], True)
    os.remove(UPLOAD_FOLDER + filename)

    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded text/PDF file and write its output XML.

    Steps: ``prepareText`` preprocesses the upload, ``citeExtract.pl`` is run
    on the resulting ``_Textdummy.txt`` and its stdout is stored as
    ``tmp/<filename>_ParsText.xml``, ``createBibstruct`` builds the output
    document and, for names ending in "pdf", the ``BibStructured`` tags
    returned by ``processfileGrobid`` are appended to its ``algorithm`` tag.
    Both intermediate files are moved to ``processed-files/``, the XML is
    written to ``OUTPUT_FOLDER + filename + "/Output" + filename + ".xml"``,
    the run is logged, the upload is deleted and a status line is printed and
    passed to ``writeUserLog``.

    Args:
        UPLOAD_FOLDER: Path prefix concatenated directly with ``filename``.
        OUTPUT_FOLDER: Path prefix; a ``<filename>/`` directory is created
            below it for the result.
        settings: Sequence whose item 1 contains "True" to select
            ``extract_citations`` mode (otherwise ``extract_meta``).
        filename: Name of the uploaded file.
    """
    Txt_Dummy = "True" in settings[1]
    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_citations" if Txt_Dummy else "extract_meta"

    # With shell=True only the first list item is run as the command line
    # (POSIX); the second item goes to the shell itself, so the XML is
    # collected from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ], shell=True, stdout=subprocess.PIPE, cwd=parsCit)
    parscitstring = p.communicate()[0]

    # "tmp/" is relative to the current working directory.
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    if filename[-3:].lower() == "pdf":
        output_grobid = processfileGrobid(UPLOAD_FOLDER, filename)
        algotag3 = outputxmlsoup.algorithm
        for curr in output_grobid.find_all('BibStructured'):
            algotag3.append(curr)

    # Keep the intermediate files. The parser output is the ".xml" written
    # above; the previous "_ParsText.txt" name never matched an existing file.
    os.system("mv " + LOCDB + "tmp/" + filename + '_Textdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_Textdummy.txt')
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsText.xml ' + LOCDB +
              "processed-files/" + filename + '_ParsText.xml')

    # Create the result directory exactly once; a second unconditional
    # os.makedirs() on the same path raises OSError.
    output_dir = OUTPUT_FOLDER + filename
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    outputfile = output_dir + "/Output" + filename + '.xml'
    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    writeLog(filename, ["TXT", Txt_Dummy], True)
    os.remove(UPLOAD_FOLDER + filename)

    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
        writeUserLog("Error inputfile : " + outputfilename)
def fileupload():
    """Handle the upload request: store the files, process them, return XML.

    For a POST request every entry of the ``files`` field that passes
    ``check_file_extension`` is saved to ``UPLOAD_FOLDER`` under a
    timestamp-prefixed secure name. The saved names are then handed to
    ``processFile`` and the result of ``createResultView`` is returned as a
    ``text/xml`` response.

    Returns:
        A redirect to the request URL when the ``files`` field is missing or
        contains an empty filename, the XML ``Response`` after processing, or
        the string "Error" for non-POST requests.
    """
    if request.method != 'POST':
        return "Error"

    if 'files' not in request.files:
        flash('No file part')
        return redirect(request.url)

    # requested files and processing mode are retrieved from the form;
    # Settings is a module-level list shared with the processing code
    fileList = request.files.getlist('files')
    Settings[0] = "IMG" if request.form.get('pdfFlag') else "TXT"
    Settings[1] = "True" if request.form.get('Txt_Dummy') else "False"

    filenameFP_List = []
    for uploadedFile in fileList:
        if uploadedFile.filename == '':
            flash('No selected inputfile')
            return redirect(request.url)
        # files with a disallowed extension are skipped
        if uploadedFile and check_file_extension(uploadedFile.filename,
                                                 ALLOWED_EXTENSIONS):
            filenameFP = secure_filename(uploadedFile.filename)
            print('Uploaded inputfile : ' + filenameFP)
            writeUserLog("Uploaded inputfile : " + filenameFP)
            # adding timestamp
            ts = datetime.now().strftime('%Y%m%d%H%M%S')
            filenameFP = ts + "_" + filenameFP
            writeLog(filenameFP, Settings, False)
            uploadedFile.save(os.path.join(UPLOAD_FOLDER, filenameFP))
            filenameFP_List.append(filenameFP)

    # sync process: the response is built after processing has run.
    # The former "autoview" template branch sat behind this return and could
    # never execute, so it has been removed together with its unused locals.
    processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                filenameFP_List)
    result = createResultView(OUTPUT_FOLDER, filenameFP_List)
    return Response(result, content_type='text/xml; charset=utf-8')
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded text file and write its output XML.

    ``prepareText`` preprocesses the upload, ``citeExtract.pl`` is run on the
    resulting ``_Textdummy.txt`` and its stdout is stored as
    ``tmp/<filename>_ParsText.xml``; ``createBibstruct`` then builds the
    output document. Both tmp files are deleted, the XML is written to
    ``OUTPUT_FOLDER + filename + "/Output" + filename + ".xml"``, the run is
    logged with ``writeLog``, the upload is removed and a "Finished"/"Error"
    line is printed.

    Args:
        UPLOAD_FOLDER: Path prefix concatenated directly with ``filename``.
        OUTPUT_FOLDER: Path prefix; a ``<filename>/`` directory is created
            below it for the result.
        settings: Sequence whose item 1 contains "True" to select
            ``extract_citations`` mode (otherwise ``extract_meta``).
        filename: Name of the uploaded file.
    """
    Txt_Dummy = "True" in settings[1]
    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_citations" if Txt_Dummy else "extract_meta"

    # With shell=True only the first list item is run as the command line
    # (POSIX); the second item goes to the shell itself, so the XML is
    # collected from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ], shell=True, stdout=subprocess.PIPE, cwd=parsCit)
    parscitstring = p.communicate()[0]

    # "tmp/" is relative to the current working directory.
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    # delete tmp files
    os.remove("tmp/" + filename + '_Textdummy.txt')
    os.remove("tmp/" + filename + '_ParsText.xml')

    # Create the per-file output folder first and write the XML into it, so
    # the file lands at the same path that is checked below. Previously the
    # XML went to OUTPUT_FOLDER directly and the check used the undefined
    # name `output_folder`.
    output_dir = OUTPUT_FOLDER + filename
    if not os.path.exists(output_dir):
        os.makedirs(output_dir + "/")
    outputfile = output_dir + '/Output' + filename + ".xml"

    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    writeLog(filename, ["TXT", Txt_Dummy], True)
    os.remove(UPLOAD_FOLDER + filename)

    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
def fileupload():
    """Handle the upload request: store the files, process them, merge output.

    The request is dumped to stdout first. For a POST request the form
    fields ``pdfFlag``, ``Txt_Dummy``, ``colBool`` and ``colNumb`` are copied
    into the module-level ``Settings`` list, every entry of the ``files``
    field that passes ``check_file_extension`` is saved to ``UPLOAD_FOLDER``
    under a timestamp-prefixed secure name, and the saved names are handed to
    ``processFile``.

    Returns:
        A redirect to the request URL when the ``files`` field is missing,
        contains an empty filename, or ``colNumb`` is not an integer; the
        result of ``mergeOutputXML`` after processing; or the string "Error"
        for non-POST requests.
    """
    print("")
    print("")
    print("----------------------------------------")
    # two spaces after the colon reproduce the separator of the former
    # comma-style print statement
    print("request:  %s" % (request,))
    print("request.files:  %s" % (request.files,))
    print("request.url:  %s" % (request.url,))
    print("request.form:  %s" % (request.form,))
    print("----------------------------------------")
    print("")
    print("")

    if request.method != 'POST':
        return "Error"

    if 'files' not in request.files:
        flash('No file part')
        return redirect(request.url)

    # requested files and processing mode are retrieved from the form
    fileList = request.files.getlist('files')
    Settings[0] = "IMG" if request.form.get('pdfFlag') else "TXT"
    Settings[1] = "True" if request.form.get('Txt_Dummy') else "False"

    if request.form.get('colBool'):
        # colNumb comes straight from the client; reject a missing or
        # non-numeric value the same way other bad input is rejected instead
        # of letting int() abort the request.
        try:
            Settings[2] = str(int(request.form.get('colNumb')) - 1)
        except (TypeError, ValueError):
            flash('Invalid column number')
            return redirect(request.url)
    else:
        Settings[2] = "0"

    filenameFP_List = []
    for uploadedFile in fileList:
        if uploadedFile.filename == '':
            flash('No selected inputfile')
            return redirect(request.url)
        # files with a disallowed extension are skipped
        if uploadedFile and check_file_extension(uploadedFile.filename,
                                                 ALLOWED_EXTENSIONS):
            filenameFP = secure_filename(uploadedFile.filename)
            print('Uploaded inputfile : ' + filenameFP)
            # adding timestamp
            ts = datetime.now().strftime('%m%d%H%M%S')
            filenameFP = ts + "_" + filenameFP
            writeLog(filenameFP, Settings, False)
            uploadedFile.save(os.path.join(UPLOAD_FOLDER, filenameFP))
            filenameFP_List.append(filenameFP)

    processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                filenameFP_List)
    return mergeOutputXML(OUTPUT_FOLDER, filenameFP_List)
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):
    """Process one uploaded XML file and write its output XML.

    The upload is parsed with BeautifulSoup and passed to ``prepareXML``.
    ``citeExtract.pl`` is run in ``extract_citations`` mode on the
    ``_XMLdummy.txt`` file and its stdout is stored as
    ``tmp/<filename>_ParsXML.xml``; ``createBibstruct`` then builds the
    output document. Both tmp files are deleted, the XML is written to
    ``OUTPUT_FOLDER + filename + "/Output" + filename + ".xml"``, the run is
    logged, the upload is removed and a status line is printed and passed to
    ``writeUserLog``.

    Args:
        UPLOAD_FOLDER: Path prefix concatenated directly with ``filename``.
        OUTPUT_FOLDER: Path prefix; a ``<filename>/`` directory is created
            below it for the result.
        filename: Name of the uploaded file.
    """
    upload_path = UPLOAD_FOLDER + filename

    # open input file
    with open(upload_path, 'r') as source:
        xmlsoup = bs.BeautifulSoup(source.read(), 'xml')
    prepareXML(xmlsoup, filename)

    # With shell=True only the first list item is run as the command line
    # (POSIX); the second item goes to the shell itself, so the XML is
    # collected from stdout below.
    extractor = subprocess.Popen(
        ["./citeExtract.pl -m extract_citations " + LOCDB + "tmp/" +
         filename + "_XMLdummy.txt ",
         LOCDB + "tmp/" + filename + "_ParsXML.xml"],
        shell=True,
        stdout=subprocess.PIPE,
        cwd=parsCit)
    parscit_output = extractor.communicate()[0]

    # "tmp/" is relative to the current working directory.
    parsed_tmp = "tmp/" + filename + "_ParsXML.xml"
    with open(parsed_tmp, 'w') as tmp_handle:
        tmp_handle.write(parscit_output)

    bibstruct_soup = createBibstruct(xmlsoup, filename)

    # delete tmp files
    os.remove("tmp/" + filename + '_XMLdummy.txt')
    os.remove(parsed_tmp)

    result_dir = OUTPUT_FOLDER + filename
    os.makedirs(result_dir)
    outputfile = result_dir + "/Output" + filename + '.xml'
    with open(outputfile, 'w') as xml_handle:
        xml_handle.write(bibstruct_soup.encode('utf-8'))

    writeLog(filename, ["XML"], True)
    os.remove(upload_path)

    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        status = "Finished inputfile : " + outputfilename
    else:
        status = "Error inputfile : " + outputfilename
    print(status)
    writeUserLog(status)
def fileupload():
    """Handle the upload request: store the files and enqueue their processing.

    For a POST request the form fields ``pdfFlag`` and ``Txt_Dummy`` are
    copied into the module-level ``Settings`` list and every entry of the
    ``files`` field is saved to ``LOCDB + UPLOAD_FOLDER`` under a
    timestamp-prefixed secure name. ``processFile`` is then enqueued on ``q``
    with the saved names.

    Returns:
        The id of the enqueued job on success; a redirect to the request URL
        when the ``files`` field is missing or contains an empty filename; an
        error string when a file has a disallowed extension or enqueueing
        fails; or the string "Error" for non-POST requests.
    """
    import traceback

    if debugMode.lower() == "yes":
        print("")
        print("####################################")
        print("In fileupload()")
        print("####################################")
        print("")

    if request.method != 'POST':
        return "Error"

    # checks the upload request parameters and stores the files
    if 'files' not in request.files:
        flash('No file part')
        return redirect(request.url)

    # requested files and processing mode are retrieved from the form
    fileList = request.files.getlist('files')
    Settings[0] = "IMG" if request.form.get('pdfFlag') else "TXT"
    Settings[1] = "True" if request.form.get('Txt_Dummy') else "False"

    filenameFP_List = []
    for uploadedFile in fileList:
        if uploadedFile.filename == '':
            flash('No selected inputfile')
            return redirect(request.url)
        if not (uploadedFile and check_file_extension(uploadedFile.filename,
                                                      ALLOWED_EXTENSIONS)):
            return "Error: Invalid file extension..."

        filenameFP = secure_filename(uploadedFile.filename)
        print('Uploaded inputfile : ' + filenameFP)
        print("")
        writeUserLog("Uploaded inputfile : " + filenameFP)
        # adding timestamp
        ts = datetime.now().strftime('%Y%m%d%H%M%S')
        filenameFP = ts + "_" + filenameFP
        writeLog(filenameFP, Settings, False)
        uploadedFile.save(os.path.join(LOCDB + UPLOAD_FOLDER, filenameFP))
        filenameFP_List.append(filenameFP)

    # Processing is asynchronous: only the job id goes back to the client.
    # The former synchronous result view and the "autoview" template branch
    # sat behind unconditional returns and could never execute, so they were
    # removed together with their unused locals.
    try:
        job = q.enqueue_call(func=processFile,
                             args=(
                                 UPLOAD_FOLDER,
                                 OUTPUT_FOLDER,
                                 MAX_PROCESSES,
                                 Settings,
                                 filenameFP_List,
                             ),
                             result_ttl=8000,
                             timeout=80000)
        job_id = job.get_id()
    except Exception:
        # Keep the client-facing message, but leave a traceback on stderr
        # instead of discarding the cause; KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        traceback.print_exc()
        return "An Error occured during file processing..."

    print(job_id)
    return job_id
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):
    """Process one uploaded XML or HTML file and write its output XML.

    The upload is parsed with BeautifulSoup. Names ending in "xml" go through
    ``prepareXML``/``createBibstruct``; everything else goes through
    ``prepareHTML``/``createBibstructHTML`` and, when ``prepareHTML`` returns
    ``True``, the ``BibStructured`` tags from ``mapHTML`` are appended to the
    ``algorithm`` tag of the output. ``citeExtract.pl`` is run in
    ``extract_citations`` mode on the ``_XMLdummy.txt`` file and its stdout
    is stored as ``tmp/<filename>_ParsXML.xml``. Both intermediate files are
    moved to ``processed-files/``, the XML is written below
    ``LOCDB + OUTPUT_FOLDER + filename``, the run is logged, the upload is
    removed and a status line is printed and passed to ``writeUserLog``.

    Args:
        UPLOAD_FOLDER: Path prefix concatenated directly with ``filename``.
        OUTPUT_FOLDER: Path prefix (appended to ``LOCDB``); a ``<filename>/``
            directory is created below it for the result.
        filename: Name of the uploaded file.
    """
    # Decide the input type once; the second check used to slice
    # filename[:-3] (everything but the extension) and so never matched.
    is_xml = filename[-3:].lower() == "xml"
    mapOutputsoup = None

    # open input file
    with open(UPLOAD_FOLDER + filename, 'r') as f:
        xmlsoup = bs.BeautifulSoup(f.read(), 'xml')

    if is_xml:
        prepareXML(xmlsoup, filename)
    else:
        response = prepareHTML(xmlsoup, filename)
        if response == True:
            mapOutputsoup = mapHTML(xmlsoup, filename)

    # With shell=True only the first list item is run as the command line
    # (POSIX); the second item goes to the shell itself, so the XML is
    # collected from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m extract_citations " + LOCDB + "tmp/" + filename +
        "_XMLdummy.txt ", LOCDB + "tmp/" + filename + "_ParsXML.xml"
    ], shell=True, stdout=subprocess.PIPE, cwd=parsCit)
    parscitstring = p.communicate()[0]

    # "tmp/" is relative to the current working directory.
    with open("tmp/" + filename + "_ParsXML.xml", 'w') as f:
        f.write(parscitstring)

    if is_xml:
        outputxmlsoup = createBibstruct(xmlsoup, filename)
    else:
        outputxmlsoup = createBibstructHTML(filename)

    if mapOutputsoup is not None:
        algotag2 = outputxmlsoup.algorithm
        for currtag in mapOutputsoup.find_all('BibStructured'):
            algotag2.append(currtag)

    # keep the intermediate files
    os.system("mv " + LOCDB + "tmp/" + filename + '_XMLdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_XMLdummy.txt')
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsXML.xml ' + LOCDB +
              "processed-files/" + filename + '_ParsXML.xml')

    # Write and verify the very same path; the check used to omit the LOCDB
    # prefix that the write uses.
    output_dir = LOCDB + OUTPUT_FOLDER + filename
    os.makedirs(output_dir)
    outputfile = output_dir + "/Output" + filename + '.xml'
    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    writeLog(filename, ["XML"], True)
    os.remove(UPLOAD_FOLDER + filename)

    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
        writeUserLog("Error inputfile : " + outputfilename)