Пример #1
0
def deletehighlights(request):
    """Run highlight removal on the posted documents and return a zip download.

    Expects a POST whose ``doc`` field lists document paths relative to
    ``settings.BASE_DIR``. The ``documents/cleanpdf`` directory under
    ``settings.MEDIA_DIR`` is emptied, ``gui.DeleteHighlights`` is called with
    the absolute paths, and whatever that directory then contains is zipped
    and sent back as an attachment named ``CleanedPDFs<timestamp>.zip``.

    Returns:
        HttpResponse: the zip archive, or a 405 response for non-POST requests.
    """
    if request.method != 'POST':
        # The view only implements POST. Falling through used to return None,
        # which Django rejects; reply 405 Method Not Allowed instead.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    path = os.path.join(settings.MEDIA_DIR, 'documents', 'cleanpdf')
    # Start from an empty output directory. Cleanup stays best-effort, and
    # makedirs(exist_ok=True) tolerates both a directory that survived the
    # cleanup and a missing parent directory.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come from the client and are appended to
    # BASE_DIR unchecked -- confirm they are validated upstream.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # Presumably writes the cleaned PDFs into `path` -- confirm in gui.
    gui.DeleteHighlights(absolutedocumentlist)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    # Build the archive explicitly inside BASE_DIR and use the path that
    # make_archive returns, rather than assuming the process working
    # directory happens to be BASE_DIR.
    archive_path = shutil.make_archive(
        os.path.join(settings.BASE_DIR, "CleanedPDFs" + timestr), 'zip', path)
    try:
        with open(archive_path, 'rb') as archive:
            payload = archive.read()
    finally:
        # The archive name is unique per request, so remove it once read;
        # otherwise one zip per download accumulates on disk.
        try:
            os.remove(archive_path)
        except OSError:
            pass
        shutil.rmtree(path, ignore_errors=True)

    response = HttpResponse(payload, content_type='application/zip')
    response[
        'Content-Disposition'] = 'attachment; filename=CleanedPDFs' + timestr + '.zip'
    return response
Пример #2
0
def extracthighlights(request):
    """Run highlight extraction on the posted documents and return the workbook.

    Expects a POST whose ``doc`` field lists document paths relative to
    ``settings.BASE_DIR``. After calling ``gui.ExtractHighlights`` the view
    reads ``AvaliableHL<timestamp>.xlsx`` from ``settings.BASE_DIR``, returns
    it as an attachment and deletes the file.

    Returns:
        HttpResponse: the workbook; a 500 response if the workbook was not
        found after extraction; a 405 response for non-POST requests.
    """
    if request.method != 'POST':
        # The view only implements POST. Falling through used to return None,
        # which Django rejects; reply 405 Method Not Allowed instead.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    timestr = time.strftime("%Y%m%d-%H%M%S")
    filename = 'AvaliableHL' + timestr + '.xlsx'
    path = os.path.join(settings.BASE_DIR, filename)
    # Best-effort removal of a stale workbook with the same name.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come from the client and are appended to
    # BASE_DIR unchecked -- confirm they are validated upstream.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # NOTE(review): the workbook is only found if gui.ExtractHighlights names
    # it with the same second-resolution timestamp as `timestr` -- confirm.
    gui.ExtractHighlights(absolutedocumentlist)

    try:
        with open(path, "rb") as excel:
            data = excel.read()
    except FileNotFoundError:
        # Previously `data` was simply unbound here and the view crashed with
        # an unrelated-looking UnboundLocalError.
        return HttpResponse("Highlight workbook was not generated.",
                            status=500)

    response = HttpResponse(
        data,
        content_type=
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    response['Content-Disposition'] = 'attachment; filename=' + filename
    # The content is already in memory; cleanup of the file is best-effort.
    try:
        os.remove(path)
    except OSError:
        pass
    return response
Пример #3
0
def ocrpdf(request):
    """Return the OCR version of a stored document as a PDF download.

    The document is looked up by the ``key`` query parameter, its file path is
    passed to ``gui.ocr_pdf_if_not_searchable``, and the file named
    ``<basename minus last four characters>_OCR.pdf`` is then opened from
    ``settings.BASE_DIR`` and returned as an attachment.
    """
    document = FileDocument.objects.get(pk=request.GET['key'])
    source_path = document.file_field.path

    # Presumably produces the *_OCR.pdf file opened below -- confirm in gui.
    gui.ocr_pdf_if_not_searchable(source_path)

    # Last path component with its final four characters (e.g. ".pdf")
    # stripped, plus the OCR suffix.
    ocr_name = source_path.split("/")[-1][:-4] + "_OCR.pdf"

    storage = FileSystemStorage(location=settings.BASE_DIR)
    with storage.open(ocr_name) as pdfocr:
        response = HttpResponse(pdfocr, content_type='application/pdf')
        response['Content-Disposition'] = 'attachment; filename=' + ocr_name
    return response
Пример #4
0
def exportdetailstoexcel(request):
    """Analyse the posted documents and return the detail workbook.

    Expects a POST whose ``doc`` field lists document paths relative to
    ``settings.BASE_DIR``; analysis options are read from the session keys
    ``overlap``, ``filtername``, ``prioritydict`` and ``searchtextflag``.
    After the ``gui`` export call the view reads ``DATA.xlsx`` from
    ``settings.BASE_DIR``, returns it as ``DATA<timestamp>.xlsx`` and deletes
    the file.

    Returns:
        HttpResponse: the workbook; a 500 response if ``DATA.xlsx`` was not
        found after the export; a 405 response for non-POST requests.

    Raises:
        KeyError: if one of the session keys listed above is missing.
    """
    if request.method != 'POST':
        # The view only implements POST. Falling through used to return None,
        # which Django rejects; reply 405 Method Not Allowed instead.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    timestr = time.strftime("%Y%m%d-%H%M%S")
    # NOTE(review): the on-disk name is fixed, so concurrent requests share
    # this file -- confirm whether that can happen in deployment.
    path = os.path.join(settings.BASE_DIR, 'DATA.xlsx')
    # Best-effort removal of a workbook left over from an earlier export.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come from the client and are appended to
    # BASE_DIR unchecked -- confirm they are validated upstream.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    overlap = request.session['overlap']
    filtername = request.session['filtername']
    prioritydict = request.session['prioritydict']
    searchtextflag = request.session['searchtextflag']
    resultDict = gui.analyse_file_webapp(absolutedocumentlist, filtername,
                                         overlap, prioritydict,
                                         searchtextflag)
    gui.arrangeAliases(resultDict['d'], False)
    # Presumably writes DATA.xlsx into BASE_DIR -- confirm in gui.
    gui.ExportDetailstoExcel()

    try:
        with open(path, "rb") as excel:
            data = excel.read()
    except FileNotFoundError:
        # Previously `data` was simply unbound here and the view crashed with
        # an unrelated-looking UnboundLocalError.
        return HttpResponse("Detail workbook was not generated.", status=500)

    response = HttpResponse(
        data,
        content_type=
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    response[
        'Content-Disposition'] = 'attachment; filename=DATA' + timestr + '.xlsx'
    # The content is already in memory; cleanup of the file is best-effort.
    try:
        os.remove(path)
    except OSError:
        pass
    return response
Пример #5
0
def savedocdict2word(request):
    """Build the chronology document for the posted files and return it.

    Reads document paths (relative to ``settings.BASE_DIR``) from the ``doc``
    POST field, calls ``gui.saveDocDict2Word`` with the absolute paths, then
    loads ``Chronology.docx`` from ``settings.BASE_DIR`` and streams it back
    as ``Chronology_<timestamp>.docx``.

    Returns:
        HttpResponse: the Word document, or a 500 response if
        ``Chronology.docx`` does not exist after the ``gui`` call.
    """
    path = os.path.join(settings.BASE_DIR, 'Chronology.docx')
    # Best-effort removal of a document left over from an earlier request so
    # a stale file is never served.
    try:
        os.remove(path)
    except OSError:
        pass

    documentslist = request.POST.getlist('doc')
    # NOTE(review): 'doc' values come from the client and are appended to
    # BASE_DIR unchecked -- confirm they are validated upstream.
    absolutedocumentlist = [settings.BASE_DIR + s for s in documentslist]
    # Presumably writes Chronology.docx into BASE_DIR -- confirm in gui.
    gui.saveDocDict2Word(absolutedocumentlist)

    if not os.path.exists(path):
        # Previously `document` was simply unbound here and the view crashed
        # with an unrelated-looking UnboundLocalError.
        return HttpResponse("Chronology document was not generated.",
                            status=500)
    document = Document(path)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    response = HttpResponse(
        content_type=
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )
    response[
        'Content-Disposition'] = 'attachment; filename=Chronology_' + timestr + '.docx'
    # The response object is used as the file-like target of the save.
    document.save(response)

    return response
Пример #6
0
def exporttopdf(request):
    """Export highlighted PDFs and return them as a zip download.

    On POST the ``documents/highlight`` directory under ``settings.MEDIA_DIR``
    is emptied and ``gui.ExporttoPDF`` is called with options read from the
    session (``overlap``, ``prioritydict``, ``searchtextflag``, and
    ``InvColorDictLabelstoColors``, which is assigned onto the ``gui``
    module). Every entry then found in that directory is temporarily renamed
    to carry a timestamp, the directory is zipped to
    ``HighlightedPDFs.zip`` in ``settings.BASE_DIR``, the original names are
    restored, and the archive is returned as an attachment.

    Returns:
        HttpResponse: the zip archive, or a 405 response for non-POST requests.

    Raises:
        KeyError: if one of the session keys listed above is missing.
    """
    if request.method != 'POST':
        # The view only implements POST. Falling through used to return None,
        # which Django rejects; reply 405 Method Not Allowed instead.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    path = os.path.join(settings.MEDIA_DIR, 'documents', 'highlight')
    # Start from an empty output directory. Cleanup stays best-effort, and
    # makedirs(exist_ok=True) tolerates both a directory that survived the
    # cleanup and a missing parent directory.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)

    overlap = request.session['overlap']
    prioritydict = request.session['prioritydict']
    # NOTE(review): this is module-level state on `gui`, shared by every
    # request handled in this process -- confirm that is acceptable.
    gui.InvColorDictLabelstoColors = request.session[
        'InvColorDictLabelstoColors']
    searchtextflag = request.session['searchtextflag']
    # Presumably writes the highlighted PDFs into `path` -- confirm in gui.
    gui.ExporttoPDF(overlap, prioritydict, searchtextflag)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    renamed = []  # (original path, timestamped path) for every renamed entry
    try:
        for name in os.listdir(path):
            # Strip only a trailing ".pdf". The earlier re.sub(".pdf", ...)
            # treated the dot as a wildcard and matched anywhere in the name,
            # so e.g. "mypdf_report.pdf" lost characters from its stem.
            stem = name[:-len('.pdf')] if name.endswith('.pdf') else name
            original = os.path.join(path, name)
            stamped = os.path.join(path, stem + timestr + '.pdf')
            os.rename(original, stamped)
            renamed.append((original, stamped))

        # Build the archive explicitly inside BASE_DIR and use the path that
        # make_archive returns, rather than assuming the process working
        # directory happens to be BASE_DIR.
        archive_path = shutil.make_archive(
            os.path.join(settings.BASE_DIR, "HighlightedPDFs"), 'zip', path)
        with open(archive_path, 'rb') as archive:
            payload = archive.read()
    finally:
        # Put back the exact original names, even if archiving failed, instead
        # of deriving them by deleting the timestamp text from the new names.
        for original, stamped in renamed:
            os.rename(stamped, original)

    response = HttpResponse(payload, content_type='application/zip')
    response[
        'Content-Disposition'] = 'attachment; filename=HighlightedPDFs.zip'
    return response
Пример #7
0
_MONTH_NAMES = (
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december'
)


def _is_float_text(text):
    """Return True when ``float(text)`` succeeds."""
    try:
        float(text)
    except (TypeError, ValueError):
        return False
    return True


def _complete_partial_date(text):
    """Pad an incomplete date string so that it can be parsed as a full date.

    The rules are checked in order and the first match wins:
    - text that is a substring of a month name gets " 1 2020" appended;
    - "<1-2 digits> <letters>" with nothing after it gets " 2020" appended;
    - text starting with an apostrophe and a digit has its apostrophes
      replaced by "19" (first digit 1-9) or "20" (first digit 0).
    Anything else is returned unchanged.
    """
    lowered = text.lower()
    if any(lowered in month for month in _MONTH_NAMES):
        return text + " 1 2020"
    # [A-Za-z] rather than [A-z]: the latter also matches "[", "\", "]", "^",
    # "_" and "`", which sit between "Z" and "a" in ASCII.
    if (not re.match(r"[0-9]{1,2} [A-Za-z]* ", text)
            and re.match(r"[0-9]{1,2} [A-Za-z]*", text)
            and '\n' not in text):
        return text + " 2020"
    if re.match(r"'[1-9]{1,2}", text):
        return re.sub("'", "19", text)
    if re.match(r"'[0-9]{1,2}", text):
        return re.sub("'", "20", text)
    return text


def _parse_date_key(text):
    """Parse *text* with lubridate and return it as 'dd-mm-YYYY', or None.

    None is returned when the value coming back from R cannot be turned into
    a timestamp (presumably lubridate yields NA for unparseable input --
    confirm).
    """
    # The text originates from analysed documents, so escape it before
    # splicing it into R source: a stray quote or backslash would otherwise
    # break the expression or be evaluated as R code.
    literal = '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'
    stamp = r(
        'parse_date_time(' + literal +
        ', orders = c("ymd", "dmy", "mdy", "bdy", "bY", "b", "Yb", "Y"))[1]'
    )[0]
    try:
        return datetime.utcfromtimestamp(stamp).strftime('%d-%m-%Y')
    except (TypeError, ValueError, OverflowError, OSError):
        return None


def _rekey_dates(dates):
    """Rename the keys of *dates* in place to 'dd-mm-YYYY'.

    Entries whose key cannot be parsed are removed, mirroring how
    non-numeric NUMBER keys are discarded. Returns a dict mapping each new
    key to the original text it replaced.
    """
    datekeeper = {}
    if not dates:
        return datekeeper
    r('library("lubridate")')  # load once instead of once per key
    for original in list(dates):
        new_key = _parse_date_key(_complete_partial_date(original))
        entry = dates.pop(original)
        if new_key is None:
            continue
        # NOTE(review): two texts that resolve to the same day share a key, so
        # the later entry replaces the earlier one -- confirm this is wanted.
        datekeeper[new_key] = original
        dates[new_key] = entry
    return datekeeper


def analysisresult(request):
    """Run the shared analysis task and render the result page.

    On POST the task name is taken from the ``resulttask`` field and the
    analysis options from the session (``sortdata``, ``filtername``,
    ``overlap``, ``prioritydict``, ``documentslist``,
    ``absolutedocumentlist``, ``searchtextflag``). When ``sortdata`` equals 1
    the categories and their entries are sorted: NUMBER numerically (keys
    that are not numbers are dropped), DATE chronologically after its keys
    are normalised to 'dd-mm-YYYY' (unparseable keys are dropped and the
    new-to-original key mapping is stored in ``session['datekeeper']``), and
    every other category by key. ``gui.ExporttoPDF`` is then called and
    ``analysisresult.html`` is rendered.

    Returns:
        HttpResponse: the rendered page, or a 405 response for non-POST
        requests.

    Raises:
        IndexError: if the POST data has no ``resulttask`` value.
        KeyError: if one of the session keys listed above is missing.
    """
    if request.method != 'POST':
        # The view only implements POST. Falling through used to return None,
        # which Django rejects; reply 405 Method Not Allowed instead.
        response = HttpResponse(status=405)
        response['Allow'] = 'POST'
        return response

    resulttask = request.POST.getlist('resulttask')[0]
    sortdata = request.session['sortdata']
    filtername = request.session['filtername']
    overlap = request.session['overlap']
    prioritydict = request.session['prioritydict']
    documentslist = request.session['documentslist']
    absolutedocumentlist = request.session['absolutedocumentlist']
    request.session['datekeeper'] = {}

    resultDict = gui.analyse_file_webapp_shared_task(
        absolutedocumentlist, overlap, filtername, prioritydict,
        resulttask)
    gui.arrangeAliases(resultDict['d'], False)

    if sortdata == 1:  # sorting requested
        resultDict['d'] = dict(
            sorted(resultDict['d'].items(), key=lambda item: item[0]))
        for key in list(resultDict['d']):
            entries = resultDict['d'][key]
            if key == 'NUMBER':
                # Pruning happens in place on the dict returned by gui, as
                # before, in case gui keeps a reference to it.
                for text in list(entries):
                    if not _is_float_text(text):
                        entries.pop(text)
                resultDict['d'][key] = dict(
                    sorted(entries.items(),
                           key=lambda item: float(item[0])))
            elif key == 'DATE':
                # Re-keying is likewise done in place.
                request.session['datekeeper'] = _rekey_dates(entries)
                resultDict['d'][key] = dict(
                    sorted(entries.items(),
                           key=lambda item: datetime.strptime(
                               item[0], '%d-%m-%Y')))
            else:
                resultDict['d'][key] = dict(
                    sorted(entries.items(), key=lambda item: item[0]))

    ## TODO: Move name filtering (filtername == 1) to the backend too
    searchtextflag = request.session['searchtextflag']
    gui.ExporttoPDF(overlap, prioritydict, searchtextflag)

    return render(
        request,
        os.path.join(TEMPLATE_DIR_PDFSCANNER, "analysisresult.html"), {
            'resultdict': resultDict['d'],
            'documentslist': documentslist,
            'listofkeys': list(resultDict['d'].keys()),
            "NumberOfCat": len(resultDict['d']),
            'NumberOfValues': numberofvalues(resultDict['d']),
            'NumberOfFiles': len(absolutedocumentlist)
        })