def handleUploadWorkSpace(self):
    """
    This function takes care of the session when you upload a workspace (.lexos) file.

    Args:
        None

    Returns:
        None
    """
    # save the uploaded .lexos file into the workspace staging area
    savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    savefile = os.path.join(savePath, str(self.nextID) + '.zip')
    try:
        os.makedirs(savePath)
    except OSError:  # the staging dir may already exist
        pass
    # context manager guarantees the handle is closed even if the write fails
    with open(savefile, 'wb') as f:
        f.write(request.data)

    # clean the session folder
    shutil.rmtree(session_manager.session_folder())

    # extract the zip into the staging area, then copy it into the session folder
    with zipfile.ZipFile(savefile) as zf:
        zf.extractall(savePath)
    NewSessionPath = os.path.join(savePath, constants.WORKSPACE_UPLOAD_DIR)
    general_functions.copydir(NewSessionPath, session_manager.session_folder())

    # remove temp
    os.remove(savefile)
    shutil.rmtree(savePath)
def generateRWmatrix(dataList):
    """
    Generates rolling windows graph raw data matrix.

    Args:
        dataList: a list of rows of cell values; column j across all rows
                  becomes one line of the output CSV.

    Returns:
        Output file path and extension.
    """
    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    # transpose dataList: collect each column's cells as a list and join once,
    # instead of quadratic string += accumulation; each output line keeps the
    # original trailing delimiter
    columns = [[] for _ in xrange(len(dataList[0]))]
    for rowData in dataList:
        for j, cell in enumerate(rowData):
            columns[j].append(str(cell) + deliminator)

    with open(outFilePath, 'w') as outFile:
        for column in columns:
            outFile.write(''.join(column) + '\n')

    return outFilePath, extension
def __init__(self, originalFilename, fileName, fileString, fileID):
    """ Constructor
    Creates a new LexosFile object from the information passed in, and performs some preliminary processing.

    Args:
        originalFilename: Name of the file as originally uploaded by the user.
        fileName: File name of the originally uploaded file.
        fileString: Contents of the file's text.
        fileID: The ID to assign to the new file.

    Returns:
        The newly constructed LexosFile object.
    """
    self.id = fileID  # Starts out without an id - later assigned one from FileManager
    self.originalSourceFilename = originalFilename
    self.name = fileName
    self.contentsPreview = self.generatePreview(fileString)
    # savePath must be set before saveContents(), which writes to it
    self.savePath = pathjoin(session_manager.session_folder(), constants.FILECONTENTS_FOLDER, str(self.id) + '.txt')
    self.saveContents(fileString)

    self.active = True
    self.classLabel = ''

    # label is the filename minus its extension; the extension drives type detection
    splitName = self.name.split('.')
    self.label = '.'.join(splitName[:-1])

    self.setTypeFrom(splitName[-1], fileString)

    # detect markup tags and Project Gutenberg boilerplate for later scrubbing
    self.hasTags = self.checkForTags(fileString)
    self.isGutenberg = self.checkForGutenberg(fileString)

    self.options = {}
def handleUploadWorkSpace(self):
    """
    This function takes care of the session when you upload a workspace (.lexos) file.

    Args:
        None

    Returns:
        None
    """
    # save .lexos file
    savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    savefile = os.path.join(savePath, str(self.nextID) + '.zip')
    try:
        os.makedirs(savePath)
    except OSError:  # staging dir may already exist
        pass
    with open(savefile, 'wb') as f:
        f.write(request.data)

    # clean the session folder
    shutil.rmtree(session_manager.session_folder())

    # extract the zip
    upload_session_path = os.path.join(
        constants.UPLOAD_FOLDER, str(self.nextID) + '_upload_work_space_folder')
    with zipfile.ZipFile(savefile) as zf:
        zf.extractall(upload_session_path)
    general_functions.copydir(upload_session_path, session_manager.session_folder())

    # remove temp
    shutil.rmtree(savePath)
    shutil.rmtree(upload_session_path)

    try:
        # if there is no file content folder make one.
        # this dir will be lost during download(zip) if your original file
        # content folder does not contain anything.
        os.makedirs(
            os.path.join(session_manager.session_folder(), constants.FILECONTENTS_FOLDER))
    except OSError:
        # WindowsError is a subclass of OSError, so catching OSError covers both
        # platforms; naming WindowsError raises NameError on non-Windows systems
        pass
def getTopWordCSV(TestResult, TestMethod):
    """
    Write the generated topword results to an output CSV file.

    Args:
        TestResult: Analysis Result generated by either generateKWTopwords() or GenerateZTestTopWord()
        TestMethod: 'pzClass' - proportional z-test for class, 'pzAll' - proportional z-test for all,
                    'KW' - Kruskal Wallis test for class

    Returns:
        Path of the generated CSV file
    """
    # make the path
    ResultFolderPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.makedirs(ResultFolderPath)  # attempt to make the save path directory
    except OSError:
        pass
    SavePath = os.path.join(ResultFolderPath, constants.TOPWORD_CSV_FILE_NAME)

    delimiter = ','
    CSVcontent = ''

    if TestMethod == 'pzClass':
        CSVcontent = 'Proptional-Z test for Class \n'  # add a header
        for key in TestResult:
            TableLegend = 'File: ' + key[0] + 'compare to Class: ' + key[1] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in TestResult[key]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'pzAll':
        CSVcontent = 'Proptional-Z test for all \n'  # add a header
        for File in TestResult:
            TableLegend = 'File: ' + File[0] + delimiter
            TableTopWord = 'TopWord, '
            TableZscore = 'Z-score, '
            for data in File[1]:
                TableTopWord += data[0] + delimiter
                TableZscore += str(data[1]) + delimiter
            CSVcontent += TableLegend + TableTopWord + '\n' + delimiter + TableZscore + '\n'

    if TestMethod == 'KW':
        CSVcontent = 'Kruckal-Wallis test for Class \n'  # add a header
        TableTopWord = 'TopWord, '
        TableZscore = 'Z-score, '
        for data in TestResult:
            TableTopWord += data[0] + delimiter
            TableZscore += str(data[1]) + delimiter
        CSVcontent += TableTopWord + '\n' + TableZscore + '\n'

    with open(SavePath, 'w') as f:
        # encode explicitly: topwords may contain non-ASCII characters, and
        # writing a unicode string to a byte-mode file raises UnicodeEncodeError
        f.write(CSVcontent.encode('utf-8'))

    return SavePath
def getTopWordCSV(TestResult, TestMethod):
    """
    Write the generated topword results to an output CSV file.

    Args:
        TestResult: Analysis Result generated by either generateKWTopwords() or GenerateZTestTopWord()
        TestMethod: 'pzClass' - proportional z-test for class, 'pzAll' - proportional z-test for all,
                    'KW' - Kruskal Wallis test for class

    Returns:
        Path of the generated CSV file
    """
    # ensure the results folder exists before computing the save path
    ResultFolderPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER)
    try:
        os.makedirs(ResultFolderPath)
    except OSError:
        pass
    SavePath = os.path.join(ResultFolderPath, constants.TOPWORD_CSV_FILE_NAME)

    delimiter = ','
    pieces = []  # accumulate CSV fragments; joined once at the end

    if TestMethod == 'pzClass':
        pieces.append('Proptional-Z test for Class \n')
        for key in TestResult:
            legend = 'File: ' + key[0] + 'compare to Class: ' + key[1] + delimiter
            wordCells = ['TopWord, ']
            scoreCells = ['Z-score, ']
            for data in TestResult[key]:
                wordCells.append(data[0] + delimiter)
                scoreCells.append(str(data[1]) + delimiter)
            pieces.append(legend + ''.join(wordCells) + '\n' +
                          delimiter + ''.join(scoreCells) + '\n')

    if TestMethod == 'pzAll':
        pieces.append('Proptional-Z test for all \n')
        for File in TestResult:
            legend = 'File: ' + File[0] + delimiter
            wordCells = ['TopWord, ']
            scoreCells = ['Z-score, ']
            for data in File[1]:
                wordCells.append(data[0] + delimiter)
                scoreCells.append(str(data[1]) + delimiter)
            pieces.append(legend + ''.join(wordCells) + '\n' +
                          delimiter + ''.join(scoreCells) + '\n')

    if TestMethod == 'KW':
        pieces.append('Kruckal-Wallis test for Class \n')
        wordCells = ['TopWord, ']
        scoreCells = ['Z-score, ']
        for data in TestResult:
            wordCells.append(data[0] + delimiter)
            scoreCells.append(str(data[1]) + delimiter)
        pieces.append(''.join(wordCells) + '\n' + ''.join(scoreCells) + '\n')

    CSVcontent = ''.join(pieces)
    with open(SavePath, 'w') as f:
        f.write(CSVcontent.encode('utf-8'))

    return SavePath
def dendrogramimage():
    """
    Reads the png image of the dendrogram and displays it on the web browser.

    *dendrogramimage() linked to in analysis.html, displaying the dendrogram.png

    Note: Returns a response object with the dendrogram png to flask and
    eventually to the browser.
    """
    # Called from analysis.html once session['dengenerated'] is truthy;
    # the image was written to the session results folder by the dendrogram step.
    dendrogramPng = pathjoin(session_manager.session_folder(),
                             constants.RESULTS_FOLDER,
                             constants.DENDROGRAM_FILENAME)
    return send_file(dendrogramPng)
def zipWorkSpace(self):
    """
    Sends a zip file containing a pickle file of the session and the session folder.

    Args:
        None

    Returns:
        the path of the zipped workspace
    """
    # initialize the save path
    savepath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    # 'idSuffix' instead of 'id' to avoid shadowing the builtin
    idSuffix = str(self.nextID % 10000)  # take the last 4 digits
    workspacefilepath = os.path.join(constants.UPLOAD_FOLDER,
                                     idSuffix + '_' + constants.WORKSPACE_FILENAME)

    # remove unnecessary content in the workspace (cached result files)
    try:
        shutil.rmtree(os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER))
    except OSError:  # results folder may not exist; best-effort cleanup
        pass

    # remove a previous workspace zip and staging dir to resolve conflicts
    try:
        os.remove(workspacefilepath)
    except OSError:
        pass
    try:
        shutil.rmtree(savepath)
    except OSError:
        pass

    general_functions.copydir(session_manager.session_folder(), savepath)
    session_manager.save(savepath)  # save session in the workspace folder

    # zip the dir; the context manager closes the archive even on error
    with zipfile.ZipFile(workspacefilepath, 'w') as zipf:
        general_functions.zipdir(savepath, zipf)

    # remove the original dir
    shutil.rmtree(savepath)

    return workspacefilepath
def kmeansimage():
    """
    Reads the png image of the kmeans and displays it on the web browser.

    *kmeansimage() linked to in analysis.html, displaying the kmeansimage.png

    Note: Returns a response object with the kmeansimage png to flask and
    eventually to the browser.
    """
    # Called from kmeans.html once session['kmeansdatagenerated'] is truthy;
    # the graph was written to the session results folder by the kmeans step.
    kmeansPng = pathjoin(session_manager.session_folder(),
                         constants.RESULTS_FOLDER,
                         constants.KMEANS_GRAPH_FILENAME)
    return send_file(kmeansPng)
def handleUploadWorkSpace(self):
    """
    This function takes care of the session when you upload a workspace (.lexos) file.

    Args:
        None

    Returns:
        None
    """
    # save the uploaded .lexos archive into the workspace staging area
    savePath = os.path.join(constants.UPLOAD_FOLDER, constants.WORKSPACE_DIR)
    savefile = os.path.join(savePath, str(self.nextID) + '.zip')
    try:
        os.makedirs(savePath)
    except OSError:  # staging dir may already exist
        pass
    with open(savefile, 'wb') as f:  # context manager closes the handle on error too
        f.write(request.data)

    # clean the session folder
    shutil.rmtree(session_manager.session_folder())

    # extract the zip into a per-upload temp dir, then copy into the session folder
    upload_session_path = os.path.join(constants.UPLOAD_FOLDER,
                                       str(self.nextID) + '_upload_work_space_folder')
    with zipfile.ZipFile(savefile) as zf:
        zf.extractall(upload_session_path)
    general_functions.copydir(upload_session_path, session_manager.session_folder())

    # remove temp
    shutil.rmtree(savePath)
    shutil.rmtree(upload_session_path)

    try:
        # if there is no file content folder make one.
        # this dir will be lost during download(zip) if your original file
        # content folder does not contain anything.
        os.makedirs(os.path.join(session_manager.session_folder(),
                                 constants.FILECONTENTS_FOLDER))
    except OSError:
        # OSError covers WindowsError on Windows; naming WindowsError directly
        # raises NameError on other platforms
        pass
def saveFileManager(fileManager):
    """
    Saves the file manager to the hard drive.

    Args:
        fileManager: File manager object to be saved.

    Returns:
        None
    """
    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)
    # use a context manager so the handle is flushed and closed deterministically
    # instead of leaking it until garbage collection
    with open(fileManagerPath, 'wb') as managerFile:
        pickle.dump(fileManager, managerFile)
def generateCSV(filemanager):
    """
    Generates a CSV file from the active files.

    Args:
        filemanager: the FileManager holding the active files.

    Returns:
        The filepath where the CSV was saved, and the chosen extension (.csv or .tsv) for the file.
    """
    transpose = request.form['csvorientation'] == 'filerow'
    useTSV = request.form['csvdelimiter'] == 'tab'
    extension = '.tsv' if useTSV else '.csv'

    countMatrix = generateCSVMatrix(filemanager)
    delimiter = '\t' if useTSV else ','

    # replace newlines and tabs with space to avoid messing output sheet format
    # (single pass instead of two separate list rebuilds)
    countMatrix[0] = [item.replace('\t', ' ').replace('\n', ' ') for item in countMatrix[0]]

    # replace comma with Chinese comma to avoid messing format for .csv output file
    if delimiter == ',':
        newComma = u'\uFF0C'.encode('utf-8')
        countMatrix[0] = [item.replace(',', newComma) for item in countMatrix[0]]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)

    # class labels: header cell plus one label per active file
    classLabelList = ["Class Label"]
    for lFile in filemanager.files.values():
        if lFile.active:
            classLabelList.append(lFile.classLabel)

    # Write results to output file, and write class labels depending on transpose
    with open(outFilePath, 'w') as outFile:
        for i, row in enumerate(countMatrix):
            rowStr = delimiter.join([str(x) for x in row])
            if transpose:
                # one file per row: append that file's class label to its line
                rowStr += delimiter + classLabelList[i]
            outFile.write(rowStr + '\n')
        if not transpose:
            # one file per column: class labels form a trailing row
            outFile.write(delimiter.join(classLabelList) + '\n')
    # (redundant close() inside the with-block removed)

    return outFilePath, extension
def scrubContents(self, savingChanges):
    """
    Scrubs the contents of the file according to the options chosen by the user, saves the changes or
    doesn't, and returns a preview of the changes either way.

    Args:
        savingChanges: Boolean saying whether or not to save the changes made.

    Returns:
        Returns a preview string of the possibly changed file.
    """
    # collect the 'usecache…' form keys; the suffix after 'usecache' names the option
    cache_options = []
    for key in request.form.keys():
        if 'usecache' in key:
            cache_options.append(key[len('usecache'):])

    if 'scrub' not in self.options:
        self.options['scrub'] = {}
    scrubOptions = self.getScrubOptions()

    textString = self.loadContents()

    # delegate all scrubbing to the scrubber module; previewing is the inverse
    # of saving, so a dry run never commits cache side effects
    textString = scrubber.scrub(textString,
                                gutenberg=self.isGutenberg,
                                lower=scrubOptions['lowercasebox'],
                                punct=scrubOptions['punctuationbox'],
                                apos=scrubOptions['aposbox'],
                                hyphen=scrubOptions['hyphensbox'],
                                amper=scrubOptions['ampersandbox'],
                                digits=scrubOptions['digitsbox'],
                                tags=scrubOptions['tagbox'],
                                whiteSpace=scrubOptions['whitespacebox'],
                                spaces=scrubOptions['spacesbox'],
                                tabs=scrubOptions['tabsbox'],
                                newLines=scrubOptions['newlinesbox'],
                                opt_uploads=request.files,
                                cache_options=cache_options,
                                cache_folder=session_manager.session_folder() + '/scrub/',
                                previewing=not savingChanges)

    if savingChanges:
        self.saveContents(textString)
        self.saveScrubOptions()

        # renew the preview from the saved contents
        self.contentsPreview = self.generatePreview()
        textString = self.contentsPreview

    return textString
def __init__(self):
    """
    Constructor: Creates an empty file manager.

    Args:
        None

    Returns:
        FileManager object with no files.
    """
    self.files = {}  # maps file IDs to file objects
    self.nextID = 0  # ID handed to the next uploaded file

    # make sure the session's file-contents folder exists for saved texts
    contentsFolder = pathjoin(session_manager.session_folder(),
                              constants.FILECONTENTS_FOLDER)
    makedirs(contentsFolder)
def updateWorkspace(self):
    """
    Updates the whole work space.

    Args:
        None

    Returns:
        None
    """
    # re-point every managed file at its contents inside the current session folder
    for managedFile in self.files.values():
        managedFile.savePath = pathjoin(session_manager.session_folder(),
                                        constants.FILECONTENTS_FOLDER,
                                        str(managedFile.id) + '.txt')

    # update the session
    session_manager.load()
def __init__(self):
    """
    Constructor: Creates an empty file manager.

    Args:
        None

    Returns:
        FileManager object with no files.
    """
    # create the per-session folder that will hold uploaded file contents
    sessionContents = pathjoin(session_manager.session_folder(),
                               constants.FILECONTENTS_FOLDER)
    makedirs(sessionContents)

    self.files = {}  # file ID -> file object
    self.nextID = 0  # next ID to assign on upload
def generateStatistics(filemanager):
    """
    Calls analyze/information to get the information about each file and the whole corpus.

    Args:
        filemanager: the FileManager with the files to analyze.

    Returns:
        FileInfoList: a list of tuples of (file id, file information)
            (see analyze/information.py File_Information.returnstatistics() for more)
        corpusInfoDict: the statistics information about the whole corpus
            (see analyze/information.py Corpus_Information.returnstatistics() for more)
    """
    checkedLabels = request.form.getlist('segmentlist')
    ids = set(filemanager.files.keys())
    checkedLabels = set(map(int, checkedLabels))  # convert the checkedLabels into int

    # 'fileID' instead of 'id' to avoid shadowing the builtin
    for fileID in ids - checkedLabels:  # if the id is not in checked list
        filemanager.files[fileID].disable()  # make that file inactive in order to getMatrix

    FileInfoList = []
    folderpath = os.path.join(session_manager.session_folder(),
                              constants.RESULTS_FOLDER)  # folder path for storing graphs and plots
    try:
        os.mkdir(folderpath)  # attempt to make folder to store graphs/plots
    except OSError:  # folder already exists
        pass

    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showDeleted, \
        onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False,
                                        greyWord=greyWord, showGreyWord=showDeleted,
                                        MFW=MFW, cull=culling)

    WordLists = general_functions.matrixtodict(countMatrix)
    Files = list(filemanager.getActiveFiles())  # avoids shadowing builtin 'file'

    for i in range(len(Files)):
        # countMatrix row 0 is the header, so data rows are offset by one;
        # the first cell of each data row is the segment label
        templabel = countMatrix[i + 1][0]
        fileinformation = information.File_Information(WordLists[i], templabel)
        FileInfoList.append((Files[i].id, fileinformation.returnstatistics()))

    corpusInformation = information.Corpus_Information(WordLists, Files)  # make a new object called corpus
    corpusInfoDict = corpusInformation.returnstatistics()

    return FileInfoList, corpusInfoDict
def generateRWmatrixPlot(dataPoints, legendLabelsList):
    """
    Generates rolling windows graph raw data matrix.

    Args:
        dataPoints: a list of series, each a list of [x, y] points
        legendLabelsList: list whose first element is a '#'-separated legend string

    Returns:
        Output file path and extension.
    """
    extension = '.csv'
    deliminator = ','

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'RWresults' + extension)

    # one output row per point index, plus the legend row at index 0
    maxlen = (max(len(series) for series in dataPoints) + 1) if dataPoints else 1

    # build each row as a list of fragments and join once, instead of
    # quadratic string += accumulation
    rows = [[] for _ in xrange(maxlen)]
    legendLabelsList[0] = legendLabelsList[0].split('#')
    rows[0] = [(deliminator + deliminator).join(legendLabelsList[0]) + deliminator + deliminator]

    for series in dataPoints:
        for j, point in enumerate(series, 1):
            rows[j].append(str(point[0]) + deliminator + str(point[1]) + deliminator)

    with open(outFilePath, 'w') as outFile:
        for row in rows:
            outFile.write(''.join(row) + '\n')

    return outFilePath, extension
def loadFileManager():
    """
    Loads the file manager for the specific session from the hard drive.

    Args:
        None

    Returns:
        The file manager object for the session.
    """
    fileManagerPath = os.path.join(session_folder(), constants.FILEMANAGER_FILENAME)

    # encryption (disabled placeholder)
    # if constants.FILEMANAGER_KEY != '':
    #     fileManagerPath = general_function.decryptFile(path=fileManagerPath, key=constants.FILEMANAGER_KEY)

    # use a context manager so the handle is closed deterministically instead of
    # leaking it until garbage collection
    with open(fileManagerPath, 'rb') as managerFile:
        # NOTE(review): pickle.load is unsafe on untrusted data — confirm the
        # session folder can only contain files this app wrote itself
        fileManager = pickle.load(managerFile)

    # encryption (disabled placeholder)
    # if constants.FILEMANAGER_KEY != '':
    #     os.remove(fileManagerPath)

    return fileManager
def generateSimsCSV(filemanager):
    """
    Generates a CSV file from the calculating similarity.

    Args:
        filemanager: the FileManager with the documents to compare.

    Returns:
        The filepath where the CSV was saved, and the chosen extension .csv for the file.
    """
    extension = '.csv'
    cosineSims, DocumentName = generateSimilarities(filemanager)
    delimiter = ','

    # both values are '***'-separated strings; splitting leaves a trailing
    # empty element, hence the len-1 loop bound below
    cosineSims = cosineSims.split("***")
    DocumentName = DocumentName.split("***")

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)
    outFilePath = pathjoin(folderPath, 'results' + extension)
    compFileId = request.form['uploadname']

    with open(outFilePath, 'w') as outFile:
        outFile.write("Similarity Rankings:" + '\n')
        outFile.write("\'The module used to produce this ranking employs Latent Semantic Analysis to generate unique\n vectors for each document. The cosine angle between your comparison document's vector and the vector\n of each document of your corpus is calculated and these values are then compared. Cosine similarity\n measures can be between 0 and 1 and the higher the value the closer the comparison document's vector is to that\n document's vector as opposed to the other documents' vectors." + '\n')
        outFile.write("Selected Comparison Document: " + delimiter +
                      str(filemanager.getActiveLabels()[int(compFileId.encode("utf-8"))]) + '\n')
        outFile.write("Rank," + "Document," + "Cosine Similarity" + '\n')
        for i in range(0, (len(cosineSims) - 1)):
            outFile.write(str(i + 1) + delimiter + DocumentName[i] +
                          delimiter + cosineSims[i] + '\n')
    # (statement semicolons and the redundant close() inside 'with' removed)

    return outFilePath, extension
def generateMCJSONObj(filemanager):
    """
    Generates a JSON object for multicloud when working with a mallet .txt file.

    Args:
        filemanager: the FileManager; request.form['analysistype'] selects between
            analyzing the user's files and converting an uploaded topic file.

    Returns:
        An object, formatted in the JSON that d3 needs, either a list or a dictionary.
    """
    contentPath = os.path.join(session_manager.session_folder(), constants.FILECONTENTS_FOLDER,
                               constants.MALLET_INPUT_FILE_NAME)
    outputPath = os.path.join(session_manager.session_folder(), constants.RESULTS_FOLDER,
                              constants.MALLET_OUTPUT_FILE_NAME)
    try:
        makedirs(pathjoin(session_manager.session_folder(),
                          constants.RESULTS_FOLDER))  # attempt to make the result dir
    except OSError:
        pass  # result dir already exists

    if request.form['analysistype'] == 'userfiles':
        JSONObj = generateJSONForD3(filemanager, mergedSet=False)
    else:  # request.form['analysistype'] == 'topicfile'
        # extract the uploaded filename from the FileStorage repr
        topicString = str(request.files['optuploadname'])
        topicString = re.search(r"'(.*?)'", topicString)
        topicString = topicString.group(1)

        if topicString != '':
            request.files['optuploadname'].save(contentPath)

        with open(contentPath, 'r') as f:
            content = f.read()  # reads content from the upload file

        if content.startswith('#doc source pos typeindex type topic'):
            # --- begin converting a Mallet file into the file d3 can understand ---
            typeTopicPairs = []
            with open(contentPath) as f:
                # Skip the three header lines
                for _ in xrange(3):
                    next(f)
                # Create a list of type:topic combinations
                for line in f:
                    line = re.sub('\s+', ' ', line)
                    # Make sure the number of columns is correct
                    try:
                        doc, source, pos, typeindex, wordType, topic = line.rstrip().split(' ')
                        typeTopicPairs.append(wordType + ':' + topic)
                    except ValueError:  # wrong number of fields on this row
                        raise Exception(
                            "Your source data cannot be parsed into a regular number of "
                            "columns. Please ensure that there are no spaces in your file "
                            "names or file paths. It may be easiest to open the "
                            "output_state file in a spreadsheet using a space as the "
                            "delimiter and text as the field type. Data should only be "
                            "present in columns A to F. Please fix any misaligned data "
                            "and run this script again.")

            # Count the number of times each type:topic combo appears
            from collections import defaultdict
            topicCount = defaultdict(int)
            for pair in typeTopicPairs:
                topicCount[pair] += 1

            # Populate topicCounts mapping word type -> 'topic:count topic:count ...';
            # dict membership replaces the old O(n) 'words' list scan
            topicCounts = {}
            for k, v in topicCount.iteritems():
                wordType, topic = k.split(':')
                tc = topic + ":" + str(int(v))
                if wordType in topicCounts:
                    topicCounts[wordType] = topicCounts[wordType] + " " + tc
                else:
                    topicCounts[wordType] = tc

            # Add a word ID per line
            out = ""
            for i, (k, v) in enumerate(topicCounts.iteritems()):
                out += str(i) + " " + k + " " + v + "\n"

            # Write the output file
            with open(outputPath, 'w') as f:
                f.write(out)  # Python will convert \n to os.linesep
            # --- end converting a Mallet file into the file d3 can understand ---
        else:
            # already in the json form d3 expects — just copy it into the output folder
            with open(outputPath, 'w') as f:
                f.write(content)

        JSONObj = multicloud_topic.topicJSONmaker(outputPath)

    return JSONObj
def generateDendrogram(filemanager):
    """
    Generates dendrogram image and PDF from the active files.

    Args:
        filemanager: the FileManager with the active files.

    Returns:
        Total number of PDF pages, ready to calculate the height of the embedded PDF on screen
    """
    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, \
        onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=useTfidf,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=useFreq,
                                        greyWord=greyWord, showGreyWord=showGreyWord,
                                        MFW=MFW, cull=culling)

    # Gets options from request.form and uses options to generate the dendrogram
    # (with the legends) in a PDF file
    orientation = str(request.form['orientation'])
    title = request.form['title']
    pruningForm = request.form['pruning']
    pruning = int(pruningForm) if pruningForm else 0
    linkage = str(request.form['linkage'])
    metric = str(request.form['metric'])

    augmentedDendrogram = ('augmented' in request.form and
                           request.form['augmented'] == 'on')
    showDendroLegends = ('dendroLegends' in request.form and
                         request.form['dendroLegends'] == 'on')

    # strip the header row and the label column, keeping only the counts
    totalWords = len(countMatrix[0])
    dendroMatrix = [dataRow[1:totalWords] for dataRow in countMatrix[1:]]

    distanceList = dendrogrammer.getDendroDistances(linkage, metric, dendroMatrix)
    legend = getDendrogramLegend(filemanager, distanceList)

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if not os.path.isdir(folderPath):
        makedirs(folderPath)

    # segment names (first cell of every matrix row, header included)
    tempLabels = [matrixRow[0] for matrixRow in countMatrix]

    pdfPageNumber = dendrogrammer.dendrogram(orientation, title, pruning, linkage,
                                             metric, tempLabels, dendroMatrix, legend,
                                             folderPath, augmentedDendrogram,
                                             showDendroLegends)
    return pdfPageNumber
def hierarchy():
    """
    Handles the functionality on the hierarchy page. It analyzes the various texts and displays a dendrogram.

    Note: Returns a response object (often a render_template call) to flask and eventually to the browser.
    """
    fileManager = managers.utility.loadFileManager()
    leq = '≤'.decode('utf-8')  # unicode less-than-or-equal sign used in the threshold range strings

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
        if 'hierarchyoption' not in session:
            session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('hierarchy.html', labels=labels, thresholdOps=thresholdOps)

    if 'dendro_download' in request.form:
        # The 'Download Dendrogram' button is clicked on hierarchy.html.
        # sends pdf file to downloads folder.
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".pdf" if request.form['title'] != '' else 'dendrogram.pdf'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        # NOTE(review): RESULTS_FOLDER + "dendrogram.pdf" is string concatenation,
        # not a path join — this assumes RESULTS_FOLDER ends with a separator; confirm.
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.pdf"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'dendroSVG_download' in request.form:
        # The 'Download SVG' button: regenerate, then send the SVG variant.
        utility.generateDendrogram(fileManager)
        attachmentname = "den_" + request.form['title'] + ".svg" if request.form['title'] != '' else 'dendrogram.svg'
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return send_file(pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER + "dendrogram.svg"),
                         attachment_filename=attachmentname, as_attachment=True)

    if 'getdendro' in request.form:
        # The 'Get Dendrogram' button is clicked on hierarchy.html.
        # NOTE(review): this unpack expects utility.generateDendrogram to return a
        # 9-tuple; other call sites above discard the return — confirm the utility
        # version in this tree matches.
        pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold = utility.generateDendrogram(
            fileManager)
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()

        # human-readable threshold ranges shown in the UI, e.g. "0 ≤ t ≤ max"
        inconsistentOp = "0 " + leq + " t " + leq + " " + str(inconsistentMax)
        maxclustOp = "2 " + leq + " t " + leq + " " + str(maxclustMax)
        distanceOp = str(distanceMin) + " " + leq + " t " + leq + " " + str(distanceMax)
        monocritOp = str(monocritMin) + " " + leq + " t " + leq + " " + str(monocritMax)
        thresholdOps = {"inconsistent": inconsistentOp, "maxclust": maxclustOp,
                        "distance": distanceOp, "monocrit": monocritOp}

        managers.utility.saveFileManager(fileManager)
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        return render_template('hierarchy.html', labels=labels, pdfPageNumber=pdfPageNumber,
                               score=score, inconsistentMax=inconsistentMax,
                               maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax,
                               monocritMin=monocritMin, threshold=threshold,
                               thresholdOps=thresholdOps)
def generateKMeansVoronoi(filemanager):
    """
    Generates a table of cluster_number and file name from the active files.

    Args:
        filemanager: the FileManager with the active files.

    Returns:
        kmeansIndex.tolist(): a list of index of the closest center of the file
        silttScore: a float of silhouette score based on KMeans algorithm
        fileNameStr: a string of file names, separated by '#'
        KValue: an int of the number of K from input
        (also colorChart, finalPointsList, finalCentroidsList, textData, maxVal
        as produced by KMeans.getKMeansVoronoi)
    """
    ngramSize, useWordTokens, useFreq, useTfidf, normOption, greyWord, showGreyWord, onlyCharGramsWithinWords, MFW, culling = filemanager.getMatrixOptions()

    countMatrix = filemanager.getMatrix(useWordTokens=useWordTokens, useTfidf=False,
                                        normOption=normOption,
                                        onlyCharGramsWithinWords=onlyCharGramsWithinWords,
                                        ngramSize=ngramSize, useFreq=False, greyWord=greyWord,
                                        showGreyWord=showGreyWord, MFW=MFW, cull=culling)

    # strip the header row and the label column in place, leaving pure counts
    del countMatrix[0]
    for row in countMatrix:
        del row[0]

    matrix = np.array(countMatrix)

    # Gets options from request.form and uses options to generate the K-mean results
    KValue = len(filemanager.getActiveFiles()) / 2  # default K value (Python 2 integer division)
    max_iter = 300  # default number of iterations
    initMethod = request.form['init']
    n_init = 300
    tolerance = 1e-4

    if (request.form['nclusters'] != '') and (int(request.form['nclusters']) != KValue):
        KValue = int(request.form['nclusters'])
    if (request.form['max_iter'] != '') and (int(request.form['max_iter']) != max_iter):
        max_iter = int(request.form['max_iter'])
    if request.form['n_init'] != '':
        n_init = int(request.form['n_init'])
    if request.form['tolerance'] != '':
        tolerance = float(request.form['tolerance'])

    metric_dist = request.form['KMeans_metric']

    # file labels for the plot, honoring any per-file renames submitted in the form
    fileNameList = []
    for lFile in filemanager.files.values():
        if lFile.active:
            if request.form["file_" + str(lFile.id)] == lFile.label:
                fileNameList.append(lFile.label.encode("utf-8"))
            else:
                newLabel = request.form["file_" + str(lFile.id)].encode("utf-8")
                fileNameList.append(newLabel)

    # '#'-joined label string consumed by the front end
    fileNameStr = fileNameList[0]
    for i in range(1, len(fileNameList)):
        fileNameStr += "#" + fileNameList[i]

    folderPath = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
    if (not os.path.isdir(folderPath)):
        makedirs(folderPath)

    kmeansIndex, silttScore, colorChart, finalPointsList, finalCentroidsList, textData, maxVal = KMeans.getKMeansVoronoi(
        matrix, KValue, max_iter, initMethod, n_init, tolerance, metric_dist, fileNameList)

    return kmeansIndex, silttScore, fileNameStr, KValue, colorChart, finalPointsList, finalCentroidsList, textData, maxVal