def segmentReference():
    if request.method == 'POST':
        filename = request.form.get('filename').encode('utf8')
        coordinates = request.form.get('coordinates')
        if debugMode.lower() == "yes":
            print "filename:", filename
            print "coordinates:", type(coordinates)
        coordinates = coordinates.strip().split(' ')
        if debugMode.lower() == "yes":
            print "coordinates:", coordinates
        coordinates = map(int, coordinates)
        if filename and coordinates:
            # For single filename only
            filename = [filename]
            tempFilesList = []
            for currFile1 in filename:
                for currFile2 in os.listdir("images"):
                    if currFile1[:-4] in currFile2:
                        tempFilesList.append(currFile2)
            latest_filename = findLatest(tempFilesList)
            try:
                result = coordinatesLookup(latest_filename, coordinates)
                return Response(result, content_type='text/xml; charset=utf-8')
            except ValueError:
                return "Error processing the given coordinates"
    return 'Error: Results not found'
def getimage():
    # checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        # requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        securefname = secure_filename(fileList[0].filename)
        if os.path.splitext(securefname)[1].lower() == ".pdf":
            filenameFP_List = []
            for uploadedFile in fileList:
                if uploadedFile.filename == '':
                    flash('No selected inputfile')
                    return redirect(request.url)
                if uploadedFile and check_file_extension(
                        uploadedFile.filename, ALLOWED_EXTENSIONS):
                    filenameFP = secure_filename(uploadedFile.filename)
                    filenameFP_List.append(filenameFP)
            allFilenames3 = os.listdir("images")
            allFilenames3 = natsort.natsorted(allFilenames3)
            filenameFP_List = natsort.natsorted(filenameFP_List)
            tempFilesList = []
            for currFile1 in filenameFP_List:
                for currFile2 in allFilenames3:
                    if currFile1[:-4] in currFile2:
                        tempFilesList.append(currFile2)
            if debugMode.lower() == "yes":
                print "tempFilesList:", len(tempFilesList)
            tempFilesList = filterCropFiles(tempFilesList)
            if len(tempFilesList) != 0:
                latestFileNames = findLatest(tempFilesList)
                if debugMode.lower() == "yes":
                    print "latestFileNames:", latestFileNames
                currDir = os.getcwd()
                os.chdir(LOCDB + "images/")
                memory_file = BytesIO()
                with zipfile.ZipFile(memory_file, 'w') as zf:
                    for individualFile in latestFileNames:
                        zf.write(individualFile)
                os.chdir(currDir)
                memory_file.seek(0)
                return send_file(memory_file,
                                 attachment_filename=currFile1[:-4] + '.zip',
                                 as_attachment=True)
            else:
                return "Error: No Files Found..."
        else:
            return "Wrong File Type: Please upload a pdf file"
    return "Error: Results not found"
def createResultView(OUTPUT_FOLDER, files_list, mode=1):
    resultList = []
    missList = []
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In createResultView()"
        print "files_list:", files_list
        print "mode:", mode
        print "####################################"
        print ""
    # find files
    for file in files_list:
        found = False
        for folder in os.listdir("output"):
            if mode == 1:
                if file == folder:
                    resultList.append(folder)
                    found = True
                elif file[:-4] == folder:
                    resultList.append(folder)
                    found = True
            else:
                if file[:-4] in folder:
                    resultList.append(folder)
                    found = True
                elif file[:-4] == folder:
                    resultList.append(folder)
                    found = True
        if not found:
            missList.append(file)
    resultList = natsort.natsorted(resultList)
    if debugMode.lower() == "yes":
        print ""
        print "resultList (before):", resultList
    resultList = findLatest(resultList)
    resultList = natsort.natsorted(resultList)
    if debugMode.lower() == "yes":
        print ""
        print "resultList (after):", resultList
    complete_output = "<?xml version=\"1.0\" encoding=\"utf-8\"?><LOCDBViewResults>\n<FilesFound>\n"
    for r in resultList:
        complete_output += "<filename>" + r + "</filename>\n"
    complete_output += "</FilesFound>\n<FilesNotFound>"
    for m in missList:
        complete_output += "<filename>" + m + "</filename>\n"
    complete_output += "</FilesNotFound>\n"
    complete_output += mergeOutputXML(OUTPUT_FOLDER, resultList)
    complete_output += "</LOCDBViewResults>"
    return complete_output
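# A minimal usage sketch for createResultView(); the upload name below is
# hypothetical and only illustrates the expected call shape (mode=2 matches
# output folders that contain the filename stem, mode=1 requires an exact
# match).
def _createResultView_example():
    uploads = ["20180101120000_paper.pdf"]  # hypothetical timestamped upload
    return createResultView(OUTPUT_FOLDER, natsort.natsorted(uploads), 2)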
def fileview():
    # checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        # requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        filenameFP_List = []
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print "filenameFP:", filenameFP
                filenameFP_List.append(filenameFP)
        if debugMode.lower() == "yes":
            print "filenameFP_List:", filenameFP_List
            print "OUTPUT_FOLDER:", OUTPUT_FOLDER
        filenameFP_List = natsort.natsorted(filenameFP_List)
        result = createResultView(OUTPUT_FOLDER, filenameFP_List, 2)
        return Response(result, content_type='text/xml; charset=utf-8')
    return "Error: Results not found"
def filterCropFiles(latestFileNames):
    filesFound = []
    crop_names = []
    non_crop_names = []
    # split the list into "-crop" variants and their originals
    for curr1 in latestFileNames:
        if "crop" in curr1:
            crop_names.append(curr1)
        else:
            non_crop_names.append(curr1)
    compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
    if debugMode.lower() == "yes":
        print "crop_names:", crop_names
        print ""
        print "non_crop_names:", non_crop_names
        print ""
    # keep non-crop files that have a matching crop variant
    for curr2 in non_crop_names:
        found = False
        for curr3 in crop_names:
            if curr2[:-4] in curr3:
                found = True
        if found:
            filesFound.append(curr2)
    # fall back to every remaining non-crop file
    if not compare(non_crop_names, filesFound):
        for curr2 in non_crop_names:
            if not (curr2 in filesFound):
                filesFound.append(curr2)
    return filesFound
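# A minimal sketch of filterCropFiles(); the filenames below are made up.
# Non-crop files with a matching "-crop" variant are kept first, and the
# fallback then adds the remaining non-crop files.
def _filterCropFiles_example():
    files = ["20180101120000_page-1.png",
             "20180101120000_page-1-crop.png",
             "20180101120000_page-2.png"]
    # -> ["20180101120000_page-1.png", "20180101120000_page-2.png"]
    return filterCropFiles(files)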
def prepareXML(xmlsoup, filename):
    if debugMode.lower() == "yes":
        print "////////////////////////////////////"
        print "          In prepareXML()           "
        print "////////////////////////////////////"
    # loads xml as bs structure and extracts all citations
    xmltags = xmlsoup.find_all('Citation')
    refList = []
    for bib in xmltags:
        for node in bib.find_all('BibUnstructured'):
            ref = '[' + bib.get('ID') + '] ' + ''.join(node.find_all(text=True))
            refList.append(ref)
    # reads dummy text
    with open('dummy.txt', 'r') as dummy:
        text = dummy.read()
    # stores the dummy text and references as XMLdummy.txt
    with open("tmp/" + filename + '_XMLdummy.txt', 'w') as dummyxml:
        dummyxml.write(text + '\n')
        dummyxml.write("REFERENCES\n\n")
        for ref in refList:
            dummyxml.write(ref.replace("\n", "").encode('utf-8') + '\n')
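# A minimal sketch of calling prepareXML(); the citation markup below is a
# made-up stand-in for the real input, and writing requires dummy.txt and the
# tmp/ folder to exist.
def _prepareXML_example():
    sample = ('<Citation ID="CR1"><BibUnstructured>Doe, J. (2001). '
              'A title. A Journal.</BibUnstructured></Citation>')
    prepareXML(bs.BeautifulSoup(sample, 'xml'), "20180101120000_paper")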
def findLatest(files_list):
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In findLatest()"
        print "files_list:", files_list
        print "####################################"
        print ""
    newList = []
    # filenames are prefixed with a timestamp ("<YYYYMMDDHHMMSS>_<name>"),
    # so the lexicographically largest prefix is the most recent one
    latest_timestamp = files_list[0].split("_", 1)[0]
    old_timestamp = latest_timestamp
    for curr in files_list:
        new_timestamp = curr.split("_", 1)[0]
        if debugMode.lower() == "yes":
            print "new_timestamp:", new_timestamp
        if new_timestamp >= old_timestamp:
            latest_timestamp = new_timestamp
            old_timestamp = new_timestamp
    if debugMode.lower() == "yes":
        print "latest_timestamp:", latest_timestamp
    # keep only the files that carry the latest timestamp prefix
    for curr in files_list:
        temp_timestamp = curr.split("_", 1)[0]
        if temp_timestamp == latest_timestamp:
            newList.append(curr)
    return newList
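# A minimal sketch of findLatest(), assuming the "<YYYYMMDDHHMMSS>_<name>"
# prefix added in fileupload(); the filenames below are made up.
def _findLatest_example():
    sample = ["20180101120000_paper.pdf",
              "20180102093000_paper.jpg",
              "20180102093000_paper-crop.jpg"]
    # both files carrying the newest timestamp prefix are kept
    return findLatest(sample)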
def upload_corrections():
    if request.method == 'POST':
        allowed_extensions = ['jpg', 'png', 'jpeg', 'tif']
        f = request.files.getlist('files')
        if debugMode.lower() == "yes":
            print "files:", f
        for curr in f:
            securedFilename = secure_filename(curr.filename)
            # compare against the full extension so "jpeg" is not truncated to "peg"
            extension = os.path.splitext(securedFilename)[1].lower().lstrip('.')
            if extension == "xml":
                curr.save(os.path.join(annotationsDir, securedFilename))
            elif extension in allowed_extensions:
                curr.save(os.path.join(imagesDir, securedFilename))
            else:
                return "Unsupported File Found: Please use jpg, png, jpeg, tif and xml extensions only."
        return 'File uploaded successfully'
def fileupload():
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In fileupload()"
        print "####################################"
        print ""
    # checks the upload request parameters and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        # requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        if request.form.get('pdfFlag'):
            Settings[0] = "IMG"
        else:
            Settings[0] = "TXT"
        if request.form.get('Txt_Dummy'):
            Settings[1] = "True"
        else:
            Settings[1] = "False"
        autoview = False
        if request.form.get('autoviewResults'):
            autoview = True
        filenameFP_List = []
        filenameString = ""
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print 'Uploaded inputfile : ' + filenameFP
                print ""
                writeUserLog("Uploaded inputfile : " + filenameFP)
                # prefix the upload with a timestamp so findLatest() can pick
                # the most recent results later
                ts = datetime.now().strftime('%Y%m%d%H%M%S')
                filenameFP = ts + "_" + filenameFP
                writeLog(filenameFP, Settings, False)
                uploadedFile.save(os.path.join(LOCDB + UPLOAD_FOLDER, filenameFP))
                filenameFP_List.append(filenameFP)
                filenameString += filenameFP + "\n"
            else:
                return "Error: Invalid file extension..."
        try:
            # enqueue the processing job asynchronously via RQ
            job = q.enqueue_call(func=processFile,
                                 args=(UPLOAD_FOLDER, OUTPUT_FOLDER,
                                       MAX_PROCESSES, Settings,
                                       filenameFP_List, ),
                                 result_ttl=8000,
                                 timeout=80000)
            print(job.get_id())
            return job.get_id()
            # synchronous processing path (unreachable while the job id is
            # returned above)
            filenameFP_List = natsort.natsorted(filenameFP_List)
            result = createResultView(OUTPUT_FOLDER, filenameFP_List)
            return Response(result, content_type='text/xml; charset=utf-8')
        except Exception:
            return "An error occurred during file processing..."
        if autoview:
            return render_template("form_submitocr.html", waiting="1",
                                   filesText=filenameString)
        else:
            return updateHTML()
    return "Error"
def processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings, filenameList):
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In processFile()"
        print "####################################"
        print ""
    if not os.path.exists("tmp/"):
        os.makedirs("tmp/")
    # select correct processing method
    xml = set(['xml', 'htm', 'html'])
    img = set(['pdf', 'png', 'jpg', 'jpeg', 'tif'])
    text = set(['doc', 'docx', 'odt', 'txt'])
    i = 0
    while i < len(filenameList):
        filename = filenameList[i]
        # convert html input to pdf first
        if (filename[-3:].lower() == "htm") or (filename[-4:].lower() == "html"):
            if filename[-3:].lower() == "htm":
                tempName = filename[:-3] + "pdf"
                os.system("wkhtmltopdf " + UPLOAD_FOLDER + filename + " " +
                          UPLOAD_FOLDER + filename[:-3] + "pdf")
            elif filename[-4:].lower() == "html":
                tempName = filename[:-4] + "pdf"
                os.system("wkhtmltopdf " + UPLOAD_FOLDER + filename + " " +
                          UPLOAD_FOLDER + filename[:-4] + "pdf")
            filenameList.append(tempName)
        # convert pdf input to one jpg per page
        if (Settings[0] == "IMG") and (filename[-3:].lower() == "pdf"):
            reader = pyPdf.PdfFileReader(open(LOCDB + UPLOAD_FOLDER + filename))
            pages = reader.getNumPages()
            if pages == 1:
                if debugMode.lower() == "yes":
                    print "Single Page PDF"
                os.system("convert -density 300 " + LOCDB + UPLOAD_FOLDER +
                          filename + " -quality 90 " + LOCDB + UPLOAD_FOLDER +
                          filename[:-4] + ".jpg")
                os.system("mv " + LOCDB + UPLOAD_FOLDER + filename + " " +
                          LOCDB + "processed-files/" + filename)
                #os.remove(LOCDB + UPLOAD_FOLDER + filename)
                del filenameList[i]
                filenameList.append(filename[:-4] + ".jpg")
                continue
            elif pages > 1:
                os.system("convert -density 300 " + LOCDB + UPLOAD_FOLDER +
                          filename + " -quality 90 " + LOCDB + UPLOAD_FOLDER +
                          filename[:-4] + ".jpg")
                os.system("mv " + LOCDB + UPLOAD_FOLDER + filename + " " +
                          LOCDB + "processed-files/" + filename)
                #os.remove(LOCDB + UPLOAD_FOLDER + filename)
                f = []
                for (dirpath, dirnames, filenames) in os.walk(LOCDB + UPLOAD_FOLDER):
                    f.extend(filenames)
                    break
                # collect the generated per-page jpgs ("<name>-<page>.jpg")
                for curr3 in f:
                    chunks = curr3.split("-")
                    temp_name = curr3[:0 - len(chunks[len(chunks) - 1]) - 1]
                    if temp_name == filename[:-4]:
                        filenameList.append(curr3)
                del filenameList[i]
                continue
        i += 1
    i = 0
    if MAX_PROCESSES == 0:
        MAX_PROCESSES = len(filenameList)
    pool = Pool(processes=MAX_PROCESSES)
    while i < len(filenameList):
        filename = filenameList[i]
        if check_file_extension(filename, xml):
            Settings[0] = "XML"
            pool.apply_async(fileuploadXML,
                             (UPLOAD_FOLDER, OUTPUT_FOLDER, filename, ))
        if check_file_extension(filename, img):
            if check_file_extension(filename, set(['pdf'])) and "TXT" in Settings[0]:
                Settings[0] = "TXT"
                pool.apply_async(fileuploadText,
                                 (UPLOAD_FOLDER, OUTPUT_FOLDER, Settings, filename, ))
            else:
                Settings[0] = "IMG"
                pool.apply(fileuploadIMG,
                           (UPLOAD_FOLDER, OUTPUT_FOLDER, Settings, filename, ))
        if check_file_extension(filename, text):
            Settings[0] = "TXT"
            pool.apply_async(fileuploadText,
                             (UPLOAD_FOLDER, OUTPUT_FOLDER, Settings, filename, ))
        i += 1
    # sync process
    pool.close()
    pool.join()
    filenameList = natsort.natsorted(filenameList)
    result = createResultView(OUTPUT_FOLDER, filenameList)
    return Response(result, content_type='text/xml; charset=utf-8')
def mapHTML(parseHtmlsoup, filename):
    print "////////////////////////////////////"
    print "            In mapHTML()            "
    print "////////////////////////////////////"
    soup = bs.BeautifulSoup("<algorithm></algorithm>", 'xml')
    algotag = soup.algorithm
    algotag['fname'] = filename.split("_", 1)[1]
    reftags = parseHtmlsoup.find_all("cite")
    print "len(reftags):", len(reftags)
    # Old Format
    if len(reftags) != 0:
        for curr1 in reftags:
            bibsoup = bs.BeautifulSoup('<BibStructured></BibStructured>', 'xml')
            bibtag = bibsoup.BibStructured
            bibtag['detector'] = "Mapping"
            bibtag['namer'] = "Mapping"
            if debugMode.lower() == "yes":
                print ""
            # concatenate the plain text of all direct children
            children = curr1.findChildren(recursive=False)
            temp_string = ""
            for child in children:
                if child.string != None:
                    temp_string += child.string + " "
            temp_string = temp_string.strip()
            if debugMode.lower() == "yes":
                print "temp_string:", temp_string
            # authors
            authorssoup = bs.BeautifulSoup("<authors></authors>", 'xml')
            temp_authorstag = authorssoup.authors
            authorstags = curr1.find_all("span", {"class": "cit-auth"})
            for curr2 in authorstags:
                authorsoup = bs.BeautifulSoup("<author></author>", 'xml')
                authortag = authorsoup.author
                if debugMode.lower() == "yes":
                    print "Author:", curr2.span.string
                authortag.string = curr2.span.string
                temp_authorstag.append(authortag)
            if len(authorssoup.authors) != 0:
                bibtag.append(temp_authorstag)
            # publication date
            datesoup = bs.BeautifulSoup("<date></date>", 'xml')
            datetag = datesoup.date
            pubDatetags = curr1.find_all("span", {"class": "cit-pub-date"})
            for curr2 in pubDatetags:
                if debugMode.lower() == "yes":
                    print "Date:", curr2.string
                    print "datetag.string:", datetag.string
                if curr2.string:
                    datetag.string = curr2.string
                    bibtag.append(datetag)
            # title (falls back to the source field)
            titlesoup = bs.BeautifulSoup("<title></title>", 'xml')
            titletag = titlesoup.title
            titletags = curr1.find_all("span", {"class": "cit-article-title"})
            if len(titletags) == 0:
                titletags = curr1.find_all("span", {"class": "cit-source"})
            for curr2 in titletags:
                if debugMode.lower() == "yes":
                    print "Title:", curr2.string
                if curr2.string:
                    titletag.string = curr2.string
                    bibtag.append(titletag)
            # publisher location
            locationsoup = bs.BeautifulSoup("<location></location>", 'xml')
            locationtag = locationsoup.location
            locationtags = curr1.find_all("span", {"class": "cit-publ-loc"})
            for curr2 in locationtags:
                if debugMode.lower() == "yes":
                    print "Location:", curr2.string
                if curr2.string:
                    locationtag.string = curr2.string
                    bibtag.append(locationtag)
            # publisher name
            publishersoup = bs.BeautifulSoup("<publisher></publisher>", 'xml')
            publishertag = publishersoup.publisher
            publishertags = curr1.find_all("span", {"class": "cit-publ-name"})
            for curr2 in publishertags:
                if debugMode.lower() == "yes":
                    print "Publisher:", curr2.string
                if curr2.string:
                    publishertag.string = curr2.string
                    bibtag.append(publishertag)
            # journal
            journalsoup = bs.BeautifulSoup("<journal></journal>", 'xml')
            journaltag = journalsoup.journal
            journaltags = curr1.find_all("span", {"class": "cit-jnl-abbrev"})
            for curr2 in journaltags:
                if debugMode.lower() == "yes":
                    print "Journal:", curr2.string
                if curr2.string:
                    journaltag.string = curr2.string
                    bibtag.append(journaltag)
            # volume
            volumesoup = bs.BeautifulSoup("<volume></volume>", 'xml')
            volumetag = volumesoup.volume
            volumetags = curr1.find_all("span", {"class": "cit-vol"})
            for curr2 in volumetags:
                if debugMode.lower() == "yes":
                    print "volume:", curr2.string
                if curr2.string:
                    volumetag.string = curr2.string
                    bibtag.append(volumetag)
            # page range
            pagessoup = bs.BeautifulSoup("<pages></pages>", 'xml')
            pagestag = pagessoup.pages
            fpagetags = curr1.find_all("span", {"class": "cit-fpage"})
            lpagetags = curr1.find_all("span", {"class": "cit-lpage"})
            for curr2, curr3 in zip(fpagetags, lpagetags):
                if debugMode.lower() == "yes":
                    print "Pages:", curr2.string + '-' + curr3.string
                if curr2.string:
                    pagestag.string = curr2.string + '-' + curr3.string
                    bibtag.append(pagestag)
            algotag.append(bibtag)
    return soup