Пример #1
0
def segmentReference():
    if request.method == 'POST':
        filename = request.form.get('filename').encode('utf8')
        coordinates = request.form.get('coordinates')
        if debugMode.lower() == "yes":
            print "filename:", filename
            print "coordinates:", type(coordinates)
        coordinates = coordinates.strip().split(' ')

        if debugMode.lower() == "yes":
            print "coordinates:", coordinates
        coordinates = map(int, coordinates)

        if filename and coordinates:
            # For single filename only
            filename = [filename]
            tempFilesList = []
            for currFile1 in filename:
                for currFile2 in os.listdir("images"):
                    if currFile1[:-4] in currFile2:
                        tempFilesList.append(currFile2)
            latest_filename = findLatest(tempFilesList)

            try:
                result = coordinatesLookup(latest_filename, coordinates)
                return Response(result, content_type='text/xml; charset=utf-8')
            except ValueError:
                return "Error processing the given coordinates"

    return 'Error: Results not found'
Пример #2
0
def getimage():
    # checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)

        # requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        securefname = secure_filename(fileList[0].filename)

        if os.path.splitext(securefname)[1].lower() == ".pdf":
            filenameFP_List = []
            for uploadedFile in fileList:
                if uploadedFile.filename == '':
                    flash('No selected inputfile')
                    return redirect(request.url)
                if uploadedFile and check_file_extension(
                        uploadedFile.filename, ALLOWED_EXTENSIONS):
                    filenameFP = secure_filename(uploadedFile.filename)
                    filenameFP_List.append(filenameFP)

            allFilenames3 = os.listdir("images")
            allFilenames3 = natsort.natsorted(allFilenames3)
            filenameFP_List = natsort.natsorted(filenameFP_List)
            tempFilesList = []
            for currFile1 in filenameFP_List:
                for currFile2 in allFilenames3:
                    if currFile1[:-4] in currFile2:
                        tempFilesList.append(currFile2)

                if debugMode.lower() == "yes":
                    print "tempFilesList:", len(tempFilesList)

                tempFilesList = filterCropFiles(tempFilesList)
                if len(tempFilesList) != 0:
                    latestFileNames = findLatest(tempFilesList)

                    if debugMode.lower() == "yes":
                        print "latestFileNames:", latestFileNames

                    currDir = os.getcwd()
                    os.chdir(LOCDB + "images/")
                    memory_file = BytesIO()
                    with zipfile.ZipFile(memory_file, 'w') as zf:
                        for individualFile in latestFileNames:
                            zf.write(individualFile)
                    os.chdir(currDir)
                    memory_file.seek(0)
                    return send_file(memory_file,
                                     attachment_filename=currFile1[:-4] +
                                     '.zip',
                                     as_attachment=True)
                else:
                    return "Error: No Files Found..."
        else:
            return "Wrong File Type: Please upload a pdf file"

    return "Error: Results not found"
def createResultView(OUTPUT_FOLDER, files_list, mode=1):
    resultList = []
    missList = []

    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In createResultView()"
        print "files_list:", files_list
        print "mode:", mode
        print "####################################"
        print ""
    #find files
    for file in files_list:
        found = False
        for folder in os.listdir("output"):
            if mode == 1:
                if file == folder:
                    resultList.append(folder)
                    found = True
                elif file[:-4] == folder:
                    resultList.append(folder)
                    found = True
            else:
                if file[:-4] in folder:
                    resultList.append(folder)
                    found = True
                elif file[:-4] == folder:
                    resultList.append(folder)
                    found = True
        if not found:
            missList.append(file)

    resultList = natsort.natsorted(resultList)
    if debugMode.lower() == "yes":
        print ""
        print "resultList (before):", resultList

    resultList = findLatest(resultList)
    resultList = natsort.natsorted(resultList)
    if debugMode.lower() == "yes":
        print ""
        print "resultList (after):", resultList

    complete_output = "<?xml version=\"1.0\" encoding=\"utf-8\"?><LOCDBViewResults>\n<FilesFound>\n"
    for r in resultList:
        complete_output += "<filename>" + r + "</filename>\n"

    complete_output += "</FilesFound>\n<FilesNotFound>"

    for m in missList:
        complete_output += "<filename>" + m + "</filename>\n"

    complete_output += "</FilesNotFound>\n"
    complete_output += mergeOutputXML(OUTPUT_FOLDER, resultList)
    complete_output += "</LOCDBViewResults>"

    return complete_output
Пример #4
0
def fileview():
    #checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        #requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        filenameFP_List = []
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print "filenameFP:", filenameFP
                filenameFP_List.append(filenameFP)

        if debugMode.lower() == "yes":
            print "filenameFP_List:", filenameFP_List
            print "OUTPUT_FOLDER:", OUTPUT_FOLDER
        filenameFP_List = natsort.natsorted(filenameFP_List)
        result = createResultView(OUTPUT_FOLDER, filenameFP_List, 2)

        return Response(result, content_type='text/xml; charset=utf-8')

    return "Error: Results not found"
def filterCropFiles(latestFileNames):
    filesFound = []
    crop_names = []
    non_crop_names = []
    for curr1 in latestFileNames:
        if "crop" in curr1:
            crop_names.append(curr1)
        else:
            non_crop_names.append(curr1)

    compare = lambda x, y: collections.Counter(x) == collections.Counter(y)

    if debugMode.lower() == "yes":
        print "crop_names:", crop_names
        print ""
        print "non_crop_names:", non_crop_names
        print ""

    for curr2 in non_crop_names:
        found = False
        for curr3 in crop_names:
            if curr2[:-4] in curr3:
                found = True
        if found == True:
            filesFound.append(curr2)

    if compare(non_crop_names, filesFound) == False:
        for curr2 in non_crop_names:
            if not (curr2 in filesFound):
                filesFound.append(curr2)

    return filesFound
Пример #6
0
def prepareXML(xmlsoup, filename):
    if debugMode.lower() == "yes":
        print "////////////////////////////////////"
        print "         In prepareXML()            "
        print "////////////////////////////////////"

    #loads xml as bs structure and extracts all citations
    xmltags = xmlsoup.find_all('Citation')
    refList = []
    for bib in xmltags:
        for node in bib.find_all('BibUnstructured'):
            ref = '[' + bib.get('ID') + '] ' + ''.join(
                node.find_all(text=True))
            refList.append(ref)

    #reads dummy text
    with open('dummy.txt', 'r') as dummy:
        text = dummy.read()

    #stores the dummy text and references as XMLdummy.txt
    with open("tmp/" + filename + '_XMLdummy.txt', 'w') as dummyxml:
        dummyxml.write(text + '\n')
        dummyxml.write("REFERENCES\n\n")
        for ref in refList:
            dummyxml.write(ref.replace("\n", "").encode('utf-8') + '\n')
def findLatest(files_list):
    '''files_list = sorted(files_list)
    if len(files_list) > 1:
        counter = 0
        for x in files_list:
            print "counter:",counter
            print x[-10:-5]
            if ("-crop" == str(x[-10:-5])) or ("-crop" == str(x[-11:-6])):
                print x
                del files_list[counter]
            else:
                counter += 1
    '''
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In findLatest()"
        print "files_list:", files_list
        print "####################################"
        print ""

    newList = []
    latest_timestamp = files_list[0].split("_", 1)[0]
    for i, curr in enumerate(files_list):
        new_timestamp = curr.split("_", 1)[0]
        print "new_timestamp:", new_timestamp

        if ((i > 0) and
            (new_timestamp >= old_timestamp)) or (len(files_list) == 1):
            latest_timestamp = new_timestamp

        old_timestamp = new_timestamp

    if debugMode.lower() == "yes":
        print "latest_timestamp:", latest_timestamp

    for curr in files_list:
        temp_timestamp = curr.split("_", 1)[0]

        if temp_timestamp == latest_timestamp:
            newList.append(curr)

    return newList
Пример #8
0
def upload_corrections():
    if request.method == 'POST':
        allowed_extensions = ['jpg', 'png', 'jpeg', 'tif']

        f = request.files.getlist('files')
        if debugMode.lower() == "yes":
            print "files:", f
        for curr in f:
            securedFilename = secure_filename(curr.filename)
            if securedFilename[-3:].lower() == "xml":
                curr.save(os.path.join(annotationsDir + securedFilename))
            elif securedFilename[-3:].lower() in allowed_extensions:
                curr.save(os.path.join(imagesDir + securedFilename))
            else:
                return "Unsupported File Found: Please use jpg, png, jpeg, tif and xml extensions only."

        return 'File uploaded successfully'
Пример #9
0
def fileupload():
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In fileupload()"
        print "####################################"
        print ""

    #checks the upload request parameters and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        #requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        if request.form.get('pdfFlag'):
            Settings[0] = "IMG"
        else:
            Settings[0] = "TXT"
        if request.form.get('Txt_Dummy'):
            Settings[1] = "True"
        else:
            Settings[1] = "False"

        autoview = False
        if request.form.get('autoviewResults'):
            autoview = True
        filenameFP_List = []
        filenameString = ""
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print 'Uploaded inputfile : ' + filenameFP
                print ""
                writeUserLog("Uploaded inputfile : " + filenameFP)
                #adding timestamp
                ts = datetime.now().strftime('%Y%m%d%H%M%S')
                filenameFP = ts + "_" + filenameFP
                writeLog(filenameFP, Settings, False)
                uploadedFile.save(
                    os.path.join(LOCDB + UPLOAD_FOLDER, filenameFP))
                filenameFP_List.append(filenameFP)
                filenameString += filenameFP + "\n"
            else:
                return "Error: Invalid file extension..."
        try:
            job = q.enqueue_call(func=processFile,
                                 args=(
                                     UPLOAD_FOLDER,
                                     OUTPUT_FOLDER,
                                     MAX_PROCESSES,
                                     Settings,
                                     filenameFP_List,
                                 ),
                                 result_ttl=8000,
                                 timeout=80000)
            print(job.get_id())

            return job.get_id()

            #sync process
            filenameFP_List = natsort.natsorted(filenameFP_List)
            result = createResultView(OUTPUT_FOLDER, filenameFP_List)
            return Response(result, content_type='text/xml; charset=utf-8')
        except:
            return "An Error occured during file processing..."

        if autoview:
            return render_template("form_submitocr.html",
                                   waiting="1",
                                   filesText=filenameString)
        else:
            return updateHTML()

    return "Error"
def processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                filenameList):
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In processFile()"
        print "####################################"
        print ""
    if not os.path.exists("tmp/"):
        os.makedirs("tmp/")

    #select correct processing method
    xml = set(['xml', 'htm', 'html'])
    img = set(['pdf', 'png', 'jpg', 'jpeg', 'tif'])
    text = set(['doc', 'docx', 'odt', 'txt'])
    i = 0
    while i < len(filenameList):
        filename = filenameList[i]

        if (filename[-3:].lower() == "htm") or (filename[-4:].lower()
                                                == "html"):
            if filename[-3:].lower() == "htm":
                tempName = filename[:-3] + "pdf"
                os.system("wkhtmltopdf " + UPLOAD_FOLDER + filename + " " +
                          UPLOAD_FOLDER + filename[:-3] + "pdf")
            elif filename[-4:].lower() == "html":
                tempName = filename[:-4] + "pdf"
                os.system("wkhtmltopdf " + UPLOAD_FOLDER + filename + " " +
                          UPLOAD_FOLDER + filename[:-4] + "pdf")
            filenameList.append(tempName)

        if (Settings[0] == "IMG") and (filename[-3:].lower() == "pdf"):
            reader = pyPdf.PdfFileReader(open(LOCDB + UPLOAD_FOLDER +
                                              filename))
            pages = reader.getNumPages()
            if pages == 1:
                if debugMode.lower() == "yes":
                    print "Single Page PDF"
                os.system("convert -density 300 " + LOCDB + UPLOAD_FOLDER +
                          filename + " -quality 90 " + LOCDB + UPLOAD_FOLDER +
                          filename[:-4] + ".jpg")
                os.system("mv " + LOCDB + UPLOAD_FOLDER + filename + " " +
                          LOCDB + "processed-files/" + filename)
                #os.remove(LOCDB+UPLOAD_FOLDER+filename)
                del filenameList[i]
                filenameList.append(filename[:-4] + ".jpg")
                continue
            elif pages > 1:
                os.system("convert -density 300 " + LOCDB + UPLOAD_FOLDER +
                          filename + " -quality 90 " + LOCDB + UPLOAD_FOLDER +
                          filename[:-4] + ".jpg")
                os.system("mv " + LOCDB + UPLOAD_FOLDER + filename + " " +
                          LOCDB + "processed-files/" + filename)
                #os.remove(LOCDB+UPLOAD_FOLDER+filename)
                f = []
                for (dirpath, dirnames,
                     filenames) in os.walk(LOCDB + UPLOAD_FOLDER):
                    f.extend(filenames)
                    break
                for curr3 in f:
                    chunks = curr3.split("-")
                    temp_name = curr3[:0 - len(chunks[len(chunks) - 1]) - 1]
                    if temp_name == filename[:-4]:
                        filenameList.append(curr3)
                del filenameList[i]
                continue
        i += 1

    i = 0
    if MAX_PROCESSES == 0:
        MAX_PROCESSES = len(filenameList)
    pool = Pool(processes=MAX_PROCESSES)
    while i < len(filenameList):
        filename = filenameList[i]
        if check_file_extension(filename, xml):
            Settings[0] = "XML"
            pool.apply_async(fileuploadXML, (
                UPLOAD_FOLDER,
                OUTPUT_FOLDER,
                filename,
            ))

        if check_file_extension(filename, img):
            if check_file_extension(filename, set(
                ['pdf'])) and "TXT" in Settings[0]:
                Settings[0] = "TXT"
                pool.apply_async(fileuploadText, (
                    UPLOAD_FOLDER,
                    OUTPUT_FOLDER,
                    Settings,
                    filename,
                ))
            else:
                Settings[0] = "IMG"
                pool.apply(fileuploadIMG, (
                    UPLOAD_FOLDER,
                    OUTPUT_FOLDER,
                    Settings,
                    filename,
                ))
        if check_file_extension(filename, text):
            Settings[0] = "TXT"
            pool.apply_async(fileuploadText, (
                UPLOAD_FOLDER,
                OUTPUT_FOLDER,
                Settings,
                filename,
            ))
        i += 1

    #sync process
    pool.close()
    pool.join()

    filenameList = natsort.natsorted(filenameList)
    result = createResultView(OUTPUT_FOLDER, filenameList)
    return Response(result, content_type='text/xml; charset=utf-8')
Пример #11
0
def mapHTML(parseHtmlsoup, filename):
    print "////////////////////////////////////"
    print "          In mapHTML()              "
    print "////////////////////////////////////"

    soup = bs.BeautifulSoup("<algorithm></algorithm>", 'xml')
    algotag = soup.algorithm
    algotag['fname'] = filename.split("_", 1)[1]

    reftags = parseHtmlsoup.find_all("cite")
    print "len(reftags):", len(reftags)

    # Old Format
    if len(reftags) != 0:
        for curr1 in reftags:
            bibsoup = bs.BeautifulSoup('<BibStructured></BibStructured>',
                                       'xml')
            bibtag = bibsoup.BibStructured
            bibtag['detector'] = "Mapping"
            bibtag['namer'] = "Mapping"
            if debugMode.lower() == "yes":
                print ""
            children = curr1.findChildren(recursive=False)
            temp_string = ""
            for child in children:
                if child.string != None:
                    temp_string += child.string + " "

            temp_string = temp_string.strip()
            if debugMode.lower() == "yes":
                print "temp_string:", temp_string

            authorssoup = bs.BeautifulSoup("<authors></authors>", 'xml')
            temp_authorstag = authorssoup.authors
            authorstags = curr1.find_all("span", {"class": "cit-auth"})
            for curr2 in authorstags:
                authorsoup = bs.BeautifulSoup("<author></author>", 'xml')
                authortag = authorsoup.author
                if debugMode.lower() == "yes":
                    print "Author:", curr2.span.string
                authortag.string = curr2.span.string
                temp_authorstag.append(authortag)
            if len(authorssoup.authors) != 0:
                bibtag.append(temp_authorstag)
            datesoup = bs.BeautifulSoup("<date></date>", 'xml')
            datetag = datesoup.date
            pubDatetags = curr1.find_all("span", {"class": "cit-pub-date"})
            for curr2 in pubDatetags:
                if debugMode.lower() == "yes":
                    print "Date:", curr2.string
                    print "datetag.string:", datetag.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    datetag.string = curr2.string
                    bibtag.append(datetag)

            titlesoup = bs.BeautifulSoup("<title></title>", 'xml')
            titletag = titlesoup.title
            titletags = curr1.find_all("span", {"class": "cit-article-title"})
            if len(titletags) == 0:
                titletags = curr1.find_all("span", {"class": "cit-source"})
            for curr2 in titletags:
                if debugMode.lower() == "yes":
                    print "Title:", curr2.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    titletag.string = curr2.string
                    bibtag.append(titletag)

            locationsoup = bs.BeautifulSoup("<location></location>", 'xml')
            locationtag = locationsoup.location
            locationtags = curr1.find_all("span", {"class": "cit-publ-loc"})
            for curr2 in locationtags:
                if debugMode.lower() == "yes":
                    print "Location:", curr2.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    locationtag.string = curr2.string
                    bibtag.append(locationtag)

            publishersoup = bs.BeautifulSoup("<publisher></publisher>", 'xml')
            publishertag = publishersoup.publisher
            publishertags = curr1.find_all("span", {"class": "cit-publ-name"})
            for curr2 in publishertags:
                if debugMode.lower() == "yes":
                    print "Publisher:", curr2.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    publishertag.string = curr2.string
                    bibtag.append(publishertag)

            journalsoup = bs.BeautifulSoup("<journal></journal>", 'xml')
            journaltag = journalsoup.journal
            journaltags = curr1.find_all("span", {"class": "cit-jnl-abbrev"})
            for curr2 in journaltags:
                if debugMode.lower() == "yes":
                    print "Journal:", curr2.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    journaltag.string = curr2.string
                    bibtag.append(journaltag)

            volumesoup = bs.BeautifulSoup("<volume></volume>", 'xml')
            volumetag = volumesoup.volume
            volumetags = curr1.find_all("span", {"class": "cit-vol"})
            for curr2 in volumetags:
                if debugMode.lower() == "yes":
                    print "volume:", curr2.string
                if (curr2.string != None) or (len(curr2.string) != 0):
                    volumetag.string = curr2.string
                    bibtag.append(volumetag)

            pagessoup = bs.BeautifulSoup("<pages></pages>", 'xml')
            pagestag = pagessoup.pages
            fpagetags = curr1.find_all("span", {"class": "cit-fpage"})
            lpagetags = curr1.find_all("span", {"class": "cit-lpage"})
            for curr2, curr3 in zip(fpagetags, lpagetags):
                if debugMode.lower() == "yes":
                    print "Pages:", curr2.string + '-' + curr3.string

                if (curr2.string != None) or (len(curr2.string) != 0):
                    pagestag.string = curr2.string + '-' + curr3.string
                    bibtag.append(pagestag)
            algotag.append(bibtag)
    return soup