def deletehighlights(request):
    """Strip highlights from the selected PDFs and return them zipped.

    On POST the ``documents/cleanpdf`` folder under ``settings.MEDIA_DIR`` is
    recreated empty, ``gui.DeleteHighlights`` is run on the documents named by
    the ``doc`` POST values, and the folder is zipped into
    ``CleanedPDFs<timestamp>.zip`` inside ``settings.BASE_DIR``. The archive is
    sent back as an attachment; the archive and the working folder are then
    removed on a best-effort basis.

    Args:
        request: Incoming Django request. Each ``doc`` POST value is appended
            to ``settings.BASE_DIR`` to form the document path.

    Returns:
        An ``HttpResponse`` carrying the zip archive, or an empty 405 response
        for any method other than POST.
    """
    if request.method != 'POST':
        # A view must always return a response; refuse other verbs explicitly.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    path = os.path.join(settings.MEDIA_DIR, 'documents', 'cleanpdf')
    # Start from an empty folder; it is fine if it does not exist yet.
    shutil.rmtree(path, ignore_errors=True)
    os.mkdir(path)

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come straight from the client and are
    # concatenated onto BASE_DIR without validation -- confirm they cannot
    # point outside the project directory.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # Presumably writes the cleaned copies into `path` (that folder is what
    # gets zipped below) -- confirm against the gui module.
    gui.DeleteHighlights(absolutedocumentlist)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    archive_name = 'CleanedPDFs' + timestr
    # Create the archive at an explicit location rather than relying on the
    # process working directory happening to be BASE_DIR.
    archive_path = shutil.make_archive(
        os.path.join(settings.BASE_DIR, archive_name), 'zip', path)
    with open(archive_path, 'rb') as archive:
        data = archive.read()

    response = HttpResponse(data, content_type='application/zip')
    response['Content-Disposition'] = (
        'attachment; filename=' + archive_name + '.zip')

    # Best-effort cleanup: the payload is already in memory, so a failure to
    # delete temporary files must not break the download.
    try:
        os.remove(archive_path)
    except OSError:
        pass
    shutil.rmtree(path, ignore_errors=True)
    return response
def extracthighlights(request):
    """Export the highlights of the selected PDFs as an Excel download.

    On POST, runs ``gui.ExtractHighlights`` on the documents named by the
    ``doc`` POST values, then serves ``AvaliableHL<timestamp>.xlsx`` from
    ``settings.BASE_DIR`` as an attachment and deletes it afterwards.

    Args:
        request: Incoming Django request. Each ``doc`` POST value is appended
            to ``settings.BASE_DIR`` to form the document path.

    Returns:
        An ``HttpResponse`` carrying the workbook, or an empty 405 response
        for any method other than POST.

    Raises:
        FileNotFoundError: If the workbook does not exist after the export ran.
    """
    if request.method != 'POST':
        # A view must always return a response; refuse other verbs explicitly.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    timestr = time.strftime("%Y%m%d-%H%M%S")
    filename = 'AvaliableHL' + timestr + '.xlsx'
    path = os.path.join(settings.BASE_DIR, filename)
    # Drop a stale file of the same name; usually there is none.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come straight from the client and are
    # concatenated onto BASE_DIR without validation -- confirm they cannot
    # point outside the project directory.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # NOTE(review): the expected file name embeds a timestamp taken before the
    # export runs; presumably gui.ExtractHighlights derives the same name --
    # confirm, since a tick of the clock in between would break the match.
    gui.ExtractHighlights(absolutedocumentlist)

    # Opening directly surfaces a missing export as FileNotFoundError (with
    # the path in the message) instead of failing later on an unbound name.
    with open(path, 'rb') as excel:
        data = excel.read()

    response = HttpResponse(
        data,
        content_type=
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    response['Content-Disposition'] = 'attachment; filename=' + filename

    # Best-effort cleanup: the payload is already in memory.
    try:
        os.remove(path)
    except OSError:
        pass
    return response
def ocrpdf(request):
    """Serve the OCR'd copy of a stored document as a PDF download.

    Looks up the ``FileDocument`` whose primary key is given by the ``key``
    query parameter, runs ``gui.ocr_pdf_if_not_searchable`` on its file, and
    returns ``<name>_OCR.pdf`` read from ``settings.BASE_DIR``, where
    ``<name>`` is the stored file's base name without its extension.

    Args:
        request: Incoming Django request; must carry a ``key`` query parameter.

    Returns:
        An ``HttpResponse`` with the OCR'd PDF as an attachment.

    Raises:
        KeyError: If the ``key`` query parameter is missing.
        FileDocument.DoesNotExist: If no document has that primary key.
    """
    pk = request.GET['key']
    doc = FileDocument.objects.get(pk=pk)
    path = doc.file_field.path
    gui.ocr_pdf_if_not_searchable(path)

    # Derive the output name with os.path so it works for any path separator
    # and extension length, and compute it once for both uses below.
    # Presumably the gui call writes <name>_OCR.pdf into BASE_DIR -- confirm.
    stem = os.path.splitext(os.path.basename(path))[0]
    ocr_name = stem + "_OCR.pdf"

    fs = FileSystemStorage(location=settings.BASE_DIR)
    with fs.open(ocr_name) as pdfocr:
        response = HttpResponse(pdfocr.read(), content_type='application/pdf')
    response['Content-Disposition'] = 'attachment; filename=' + ocr_name
    return response
def exportdetailstoexcel(request):
    """Run the analysis on the selected PDFs and return it as an Excel file.

    On POST, removes any previous ``DATA.xlsx`` in ``settings.BASE_DIR``, runs
    ``gui.analyse_file_webapp`` with the options stored in the session, passes
    the result's ``'d'`` entry to ``gui.arrangeAliases``, calls
    ``gui.ExportDetailstoExcel`` and serves ``DATA.xlsx`` as an attachment
    named ``DATA<timestamp>.xlsx``. The file is deleted afterwards.

    Args:
        request: Incoming Django request. Each ``doc`` POST value is appended
            to ``settings.BASE_DIR``; the session must hold ``overlap``,
            ``filtername``, ``prioritydict`` and ``searchtextflag``.

    Returns:
        An ``HttpResponse`` carrying the workbook, or an empty 405 response
        for any method other than POST.

    Raises:
        KeyError: If one of the required session keys is missing.
        FileNotFoundError: If ``DATA.xlsx`` does not exist after the export.
    """
    if request.method != 'POST':
        # A view must always return a response; refuse other verbs explicitly.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    timestr = time.strftime("%Y%m%d-%H%M%S")
    # NOTE(review): the fixed file name is shared by all requests, so two
    # concurrent exports could overwrite each other's workbook.
    path = os.path.join(settings.BASE_DIR, 'DATA.xlsx')
    # Drop the previous export so a stale file is never served.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come straight from the client and are
    # concatenated onto BASE_DIR without validation -- confirm they cannot
    # point outside the project directory.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    overlap = request.session['overlap']
    filtername = request.session['filtername']
    prioritydict = request.session['prioritydict']
    searchtextflag = request.session['searchtextflag']

    resultDict = gui.analyse_file_webapp(absolutedocumentlist, filtername,
                                         overlap, prioritydict,
                                         searchtextflag)
    gui.arrangeAliases(resultDict['d'], False)
    # Presumably writes DATA.xlsx into BASE_DIR -- confirm against gui.
    gui.ExportDetailstoExcel()

    # Opening directly surfaces a missing export as FileNotFoundError (with
    # the path in the message) instead of failing later on an unbound name.
    with open(path, 'rb') as excel:
        data = excel.read()

    response = HttpResponse(
        data,
        content_type=
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    response['Content-Disposition'] = (
        'attachment; filename=DATA' + timestr + '.xlsx')

    # Best-effort cleanup: the payload is already in memory.
    try:
        os.remove(path)
    except OSError:
        pass
    return response
def savedocdict2word(request):
    """Build the chronology Word document and return it as a download.

    Removes any previous ``Chronology.docx`` in ``settings.BASE_DIR``, runs
    ``gui.saveDocDict2Word`` on the documents named by the ``doc`` POST
    values, loads the resulting file with ``Document`` and saves it into the
    response as ``Chronology_<timestamp>.docx``.

    Args:
        request: Incoming Django request. Each ``doc`` POST value is appended
            to ``settings.BASE_DIR`` to form the document path.

    Returns:
        An ``HttpResponse`` with the Word document as an attachment.

    Raises:
        FileNotFoundError: If ``Chronology.docx`` does not exist after the
            export ran.
    """
    path = os.path.join(settings.BASE_DIR, 'Chronology.docx')
    # Drop the previous export so a stale file is never served.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come straight from the client and are
    # concatenated onto BASE_DIR without validation -- confirm they cannot
    # point outside the project directory.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # Presumably writes Chronology.docx into BASE_DIR -- confirm against gui.
    gui.saveDocDict2Word(absolutedocumentlist)

    # Fail with a clear message rather than on an unbound `document` below.
    if not os.path.exists(path):
        raise FileNotFoundError(
            'Chronology document was not created: ' + path)
    document = Document(path)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    response = HttpResponse(
        content_type=
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )
    response['Content-Disposition'] = (
        'attachment; filename=Chronology_' + timestr + '.docx')
    # The response object is used as the file-like target of the save.
    document.save(response)
    return response
def exporttopdf(request):
    """Export the highlighted PDFs as a zip download.

    On POST the ``documents/highlight`` folder under ``settings.MEDIA_DIR`` is
    recreated empty and ``gui.ExporttoPDF`` is run with the options stored in
    the session. Every file in the folder is temporarily renamed so that a
    timestamp precedes its ``.pdf`` suffix, the folder is zipped into
    ``HighlightedPDFs.zip`` inside ``settings.BASE_DIR``, and the files are
    renamed back to their original names.

    Args:
        request: Incoming Django request. The session must hold ``overlap``,
            ``prioritydict``, ``InvColorDictLabelstoColors`` and
            ``searchtextflag``.

    Returns:
        An ``HttpResponse`` carrying the zip archive, or an empty 405 response
        for any method other than POST.

    Raises:
        KeyError: If one of the required session keys is missing.
    """
    if request.method != 'POST':
        # A view must always return a response; refuse other verbs explicitly.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    path = os.path.join(settings.MEDIA_DIR, 'documents', 'highlight')
    # Start from an empty folder; it is fine if it does not exist yet.
    shutil.rmtree(path, ignore_errors=True)
    os.mkdir(path)

    overlap = request.session['overlap']
    prioritydict = request.session['prioritydict']
    # NOTE(review): this sets module-level state on `gui`, which is shared by
    # every request handled in the same process.
    gui.InvColorDictLabelstoColors = request.session[
        'InvColorDictLabelstoColors']
    searchtextflag = request.session['searchtextflag']
    # Presumably writes the highlighted PDFs into `path` (that folder is what
    # gets zipped below) -- confirm against the gui module.
    gui.ExporttoPDF(overlap, prioritydict, searchtextflag)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    suffix = '.pdf'
    renamed = []  # (original name, stamped name) pairs, for exact restoring
    try:
        for name in os.listdir(path):
            # Strip only a trailing ".pdf"; a regex like ".pdf" would also
            # eat matches in the middle of the name.
            stem = name[:-len(suffix)] if name.endswith(suffix) else name
            stamped = stem + timestr + suffix
            os.rename(os.path.join(path, name), os.path.join(path, stamped))
            renamed.append((name, stamped))

        # Create the archive at an explicit location rather than relying on
        # the process working directory happening to be BASE_DIR.
        archive_path = shutil.make_archive(
            os.path.join(settings.BASE_DIR, 'HighlightedPDFs'), 'zip', path)
        with open(archive_path, 'rb') as archive:
            data = archive.read()
    finally:
        # Put the original names back even if archiving failed.
        for name, stamped in renamed:
            os.rename(os.path.join(path, stamped), os.path.join(path, name))

    response = HttpResponse(data, content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename=HighlightedPDFs.zip'
    return response
def _normalise_date_text(raw):
    """Return *raw* padded into a fuller date string for parsing.

    Incomplete dates are completed with the placeholder year 2020 (and day 1
    for a bare month); two-digit years written with a leading apostrophe are
    expanded to 19xx or 20xx. Anything else is returned unchanged.
    """
    months = ('january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december')
    lowered = raw.lower()
    # A bare month name or any fragment of one, e.g. "March" or "sept".
    if any(lowered in month for month in months):
        return raw + " 1 2020"
    # "<day> <month>" with nothing after it: the year is missing.
    if (not re.match(r"[0-9]{1,2} [A-Za-z]* ", raw)
            and re.match(r"[0-9]{1,2} [A-Za-z]*", raw)
            and '\n' not in raw):
        return raw + " 2020"
    # Apostrophe years: a non-zero first digit is read as 19xx, else 20xx.
    if re.match(r"'[1-9]{1,2}", raw):
        return raw.replace("'", "19")
    if re.match(r"'[0-9]{1,2}", raw):
        return raw.replace("'", "20")
    return raw


def _sort_number_entries(numbers):
    """Drop non-numeric keys from *numbers* and return it sorted by value."""
    for key in list(numbers):
        try:
            float(key)
        except ValueError:
            # Also covers keys with more than one ".", e.g. "1.2.3".
            numbers.pop(key)
    return dict(sorted(numbers.items(), key=lambda item: float(item[0])))


def _sort_date_entries(dates):
    """Re-key *dates* as ``dd-mm-YYYY`` and return them in date order.

    Each key is parsed through R's ``parse_date_time`` (lubridate). Keys that
    cannot be converted are dropped, mirroring how non-numeric NUMBER keys are
    handled.

    Returns:
        A ``(sorted_dates, datekeeper)`` tuple, where ``datekeeper`` maps each
        new ``dd-mm-YYYY`` key back to the original text.
    """
    datekeeper = {}
    if dates:
        # Load the R library once, not once per date.
        r('library("lubridate")')
    for raw in list(dates):
        padded = _normalise_date_text(raw)
        # The text comes from analysed documents; escape it so it cannot
        # break out of the R string literal it is spliced into.
        escaped = padded.replace('\\', '\\\\').replace('"', '\\"')
        stamp = r(
            'parse_date_time("' + escaped + '", orders = c("ymd", "dmy", '
            '"mdy", "bdy", "bY", "b", "Yb", "Y"))[1]')[0]
        try:
            # Assumes the R call yields seconds since the epoch, with an
            # unparseable date arriving as a non-finite value -- TODO confirm.
            new_key = datetime.utcfromtimestamp(stamp).strftime('%d-%m-%Y')
        except (ValueError, OverflowError, OSError):
            dates.pop(raw)
            continue
        datekeeper[new_key] = raw
        # NOTE(review): two texts that resolve to the same day overwrite each
        # other here -- confirm whether their values should be merged.
        dates[new_key] = dates.pop(raw)
    ordered = dict(
        sorted(dates.items(),
               key=lambda item: datetime.strptime(item[0], '%d-%m-%Y')))
    return ordered, datekeeper


def analysisresult(request):
    """Run the shared analysis task and render the result page.

    On POST, calls ``gui.analyse_file_webapp_shared_task`` with the options
    stored in the session and the first ``resulttask`` POST value, passes the
    result's ``'d'`` entry to ``gui.arrangeAliases`` and, when the session's
    ``sortdata`` equals 1, sorts the categories and the entries inside each
    one: ``NUMBER`` numerically, ``DATE`` chronologically, all others by key.
    ``request.session['datekeeper']`` is reset and, after date sorting, holds
    the mapping from normalised date keys back to the original text. Finally
    ``gui.ExporttoPDF`` is called and ``analysisresult.html`` is rendered.

    Args:
        request: Incoming Django request. The session must hold ``sortdata``,
            ``filtername``, ``overlap``, ``prioritydict``, ``documentslist``,
            ``absolutedocumentlist`` and ``searchtextflag``.

    Returns:
        The rendered result page, or an empty 405 response for any method
        other than POST.

    Raises:
        IndexError: If no ``resulttask`` POST value was sent.
        KeyError: If one of the required session keys is missing.
    """
    if request.method != 'POST':
        # A view must always return a response; refuse other verbs explicitly.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    resulttask = request.POST.getlist('resulttask')[0]
    sortdata = request.session['sortdata']
    filtername = request.session['filtername']
    overlap = request.session['overlap']
    prioritydict = request.session['prioritydict']
    documentslist = request.session['documentslist']
    absolutedocumentlist = request.session['absolutedocumentlist']
    request.session['datekeeper'] = {}

    # NOTE(review): the order here is (documents, overlap, filtername, ...),
    # while exportdetailstoexcel passes (documents, filtername, overlap, ...)
    # to gui.analyse_file_webapp -- confirm both match the gui signatures.
    resultDict = gui.analyse_file_webapp_shared_task(
        absolutedocumentlist, overlap, filtername, prioritydict, resulttask)
    gui.arrangeAliases(resultDict['d'], False)

    if sortdata == 1:  # Checking if sortdata is set to true
        resultDict['d'] = dict(
            sorted(resultDict['d'].items(), key=lambda item: item[0]))
        categories = resultDict['d']
        for key in list(categories):
            if key == 'NUMBER':
                categories[key] = _sort_number_entries(categories[key])
            elif key == 'DATE':
                categories[key], datekeeper = _sort_date_entries(
                    categories[key])
                request.session['datekeeper'] = datekeeper
            else:
                categories[key] = dict(
                    sorted(categories[key].items(),
                           key=lambda item: item[0]))

    ## TODO: Move this to backend too
    # if filtername == 1:  # Checking if filtername is set to true
    #     for key, value in resultDict['d'].items():
    #         if key == 'PERSON':
    #             for name in list(resultDict['d'][key]):
    #                 if re.sub(" ", "", name).isalpha() == False:
    #                     resultDict['d'][key].pop(name)
    #                 elif pp.tag(name)[1] != 'Person':
    #                     resultDict['d'][key].pop(name)
    #                 elif len(name) < 4:
    #                     resultDict['d'][key].pop(name)

    searchtextflag = request.session['searchtextflag']
    gui.ExporttoPDF(overlap, prioritydict, searchtextflag)
    return render(
        request,
        os.path.join(TEMPLATE_DIR_PDFSCANNER, "analysisresult.html"), {
            'resultdict': resultDict['d'],
            'documentslist': documentslist,
            'listofkeys': list(resultDict['d'].keys()),
            "NumberOfCat": len(resultDict['d'].keys()),
            'NumberOfValues': numberofvalues(resultDict['d']),
            'NumberOfFiles': len(absolutedocumentlist)
        })