def getTheAnalysedData():
    """Return the stored cross-analysis for a single cell of the analysis grid.

    The POST body is JSON whose ``columnToAnalyse`` entry is a string made of
    five fields joined with ``':;:'``, in this order: column value, column
    name, row value, row name, file name.  The file name is used as the
    ``_id`` of the document in the ``dataframeAnalysis`` collection.

    Returns:
        A JSON string.  On success it has ``status: 'ok'`` plus ``columnData``
        (data stored under column name / column value, looked up along the row
        name), ``rowData`` (data stored under row name / row value, looked up
        along the column name), ``rowHeader`` and ``columnHeader``.  Either
        data entry is ``null`` when its header is absent from the analysis.
        When the descriptor is malformed or no analysis is stored for the
        file, ``status`` is ``'error'`` and ``errorMessage`` explains why.
        For non-POST requests nothing is returned (``None``), as before.

    Raises:
        KeyError: if the request lacks ``columnToAnalyse`` or the stored
            analysis lacks the requested value / cross-column entry.
    """
    logger.info('got the request')
    if request.method == 'POST':
        data = json.loads(request.data)
        dataList = data['columnToAnalyse'].split(':;:')
        if len(dataList) < 5:
            # Previously this surfaced as an IndexError (HTTP 500).
            return json.dumps({
                'status': 'error',
                'errorMessage': 'columnToAnalyse must hold 5 fields '
                                'separated by ":;:"'
            })
        columnvalue, columnName, rowValue, rowName, fileName = dataList[:5]

        collectionDb = mongoDb.getMongoCollectionClient(
            host='localhost',
            port=27017,
            dbName='informationRetreival',
            collectionName='dataframeAnalysis')
        dataDb = collectionDb.find_one({"_id": fileName})

        # find_one() yields None for an unknown file, and a document may not
        # carry 'processed' yet; both used to crash the handler.
        analysedData = ((dataDb or {}).get('processed') or {}).get(
            'analysedData')
        if analysedData is None:
            return json.dumps({
                'status': 'error',
                'errorMessage': 'no analysed data stored for {}'.format(
                    fileName)
            })

        # Defaults keep the response well-formed when a header is missing;
        # the original left these names unbound in that case.
        modeDataOfCellRow = None
        modeDataOfCellColumn = None

        if rowName in analysedData:
            analysisOfRowHeader = analysedData[rowName]['analysis']
            modeDataOfCellRow = analysisOfRowHeader[rowValue][
                'dataAlongDifferentColumn'][columnName]

        if columnName in analysedData:
            analysisOfColumnHeader = analysedData[columnName]['analysis']
            modeDataOfCellColumn = analysisOfColumnHeader[columnvalue][
                'dataAlongDifferentColumn'][rowName]

        return json.dumps({
            'status': 'ok',
            'columnData': modeDataOfCellColumn,
            'rowData': modeDataOfCellRow,
            'rowHeader': rowName,
            'columnHeader': columnName
        })
import pika,logging,json import informationRetreval from databaseAndQueue import mongoDb logging.basicConfig(format='%(levelname)s:%(asctime)s:%(message)s', level=logging.INFO) logger = logging.getLogger(__name__) collectionDb = mongoDb.getMongoCollectionClient(host='localhost',port=27017, dbName='informationRetreival',collectionName='dataframeAnalysis') collectionDbHeaderInfo = mongoDb.getMongoCollectionClient(host='localhost',port=27017, dbName='informationRetreival',collectionName='dataframeHeadAndDtype') def removeTheKey(mapOfInfo,keyToRemove): mapOfInfoWithoutdataFrame = {} for key in mapOfInfo: innerDict = mapOfInfo[key] value = innerDict.pop(keyToRemove,None) mapOfInfoWithoutdataFrame[key] = innerDict return mapOfInfoWithoutdataFrame def informationRetrieval(ch, method, properties, body): dataFromQueue = json.loads(body) fileName = dataFromQueue['fileName'] filePath = dataFromQueue['filePath'] dataFrameObj = informationRetreval.getTheDataFrame(filePath) dataDb = collectionDbHeaderInfo.find_one({"fileName": fileName}) columnUsefullMap = {} blackListColumns = [] for dataOfColumn in dataDb['headers']:
def upload_file_browse():
    """Store an uploaded CSV, record its headers and queue it for analysis.

    For a POST carrying a non-empty ``file`` part the handler:

    1. saves the file under ``static/csv/`` using ``secure_filename``;
    2. reads it with pandas and builds a list of
       ``(column name, dtype string, usable flag)`` tuples, where the flag is
       whether the column is in the result of ``getTheModeOfHeaders``;
    3. if no ``dataframeAnalysis`` document exists for the file name yet,
       publishes ``{'filePath', 'fileName'}`` to the ``informationRetreival``
       queue and inserts one document into each of the two Mongo collections.

    Returns:
        A JSON string: ``{'status': 'ok', 'fileName', 'csvHeaders'}`` on
        success, otherwise ``{'status': 'error', 'errorMessage'}``.  The
        error form is also returned when the request is not a POST or the
        file part is empty (the original raised NameError there).
    """
    logger.info('got the request')
    folder = os.path.abspath("static/csv/")

    # Defaults so the final log line and return never hit unbound names.
    mapOfNameToDtype = []
    jsonToReturn = {'status': 'error', 'errorMessage': 'no file was uploaded'}

    if request.method == 'POST':
        uploadedFile = request.files['file']
        if uploadedFile:
            try:
                filename = secure_filename(uploadedFile.filename)
                pathOfSavedFile = os.path.join(folder, filename)
                uploadedFile.save(pathOfSavedFile)
                logger.info('downloaded the file')

                dataFrameObj = pd.read_csv(pathOfSavedFile)
                columnNameToUse = getTheModeOfHeaders(dataFrameObj)
                dtypeByColumn = dict(dataFrameObj.dtypes)
                mapOfNameToDtype = [
                    (str(nameOfColumn), str(dtypeByColumn[nameOfColumn]),
                     nameOfColumn in columnNameToUse)
                    for nameOfColumn in dtypeByColumn
                ]

                collectionDbAnalysis = mongoDb.getMongoCollectionClient(
                    host='localhost',
                    port=27017,
                    dbName='informationRetreival',
                    collectionName='dataframeAnalysis')
                collectionDbHeaderAndType = mongoDb.getMongoCollectionClient(
                    host='localhost',
                    port=27017,
                    dbName='informationRetreival',
                    collectionName='dataframeHeadAndDtype')

                # Only files not seen before are queued and recorded.
                dataDb = collectionDbAnalysis.find_one({"_id": filename})
                if dataDb is None:
                    # publishing data to queue
                    dataToPublish = {
                        'filePath': pathOfSavedFile,
                        'fileName': filename
                    }
                    channel = getQueueObj()
                    channel.basic_publish(
                        exchange='',
                        routing_key='informationRetreival',
                        body=json.dumps(dataToPublish),
                        properties=pika.BasicProperties(
                            delivery_mode=2,  # make message persistent
                        ))
                    collectionDbAnalysis.insert_one({
                        '_id': filename,
                        'filePath': pathOfSavedFile
                    })
                    collectionDbHeaderAndType.insert_one({
                        'fileName': filename,
                        'headers': mapOfNameToDtype
                    })

                jsonToReturn = {
                    'status': 'ok',
                    'fileName': filename,
                    'csvHeaders': mapOfNameToDtype
                }
            except Exception as e:
                logger.exception(e)
                # An exception instance is not JSON serialisable; send text.
                jsonToReturn = {'status': 'error', 'errorMessage': str(e)}

    logger.info('returing the data {}'.format(mapOfNameToDtype))
    return json.dumps(jsonToReturn)
def doingRangeAnalysis():
    """Re-run the column analysis on the rows that fall inside given ranges.

    The POST body is JSON with ``fileName``, ``columnNametoAnalyse`` and
    ``columnRangeData`` (a mapping ``column -> {'min': ..., 'max': ...}``).
    The CSV is re-read from ``static/csv/``, a ``'min <= column <= max'``
    expression is built per column and joined with ``'&'``, and the result is
    handed to ``informationRetreval.doRangeAnalysis``.

    Returns:
        A JSON string with ``status``, ``columnModeData``, ``columnValues``,
        ``columnHeaders``, ``fileName``, ``distribution``, ``numericValue``
        and ``rangeValue``.  ``columnHeaders`` is just the analysed column
        when the analysis yields no values.  For non-POST requests nothing is
        returned (``None``), as before.

    Raises:
        KeyError / TypeError: if the request lacks a field or Mongo has no
            header or analysis document for the file.
    """
    logger.info('got the request')
    if request.method == 'POST':
        folder = os.path.abspath("static/csv/")
        data = json.loads(request.data)
        fileName = data['fileName']
        rangeData = data['columnRangeData']
        columnNameToAnalyse = data['columnNametoAnalyse']

        # fileName comes from the client: keep only its last path component
        # so the read cannot leave the upload folder.  Names produced by the
        # upload handler contain no separators, so they are unaffected.
        pathOfSavedFile = os.path.join(folder, os.path.basename(fileName))
        logger.info('downloaded the file')
        dataFrameObj = pd.read_csv(pathOfSavedFile)

        # SECURITY NOTE(review): column names and bounds from the request are
        # interpolated verbatim into this expression.  If doRangeAnalysis
        # evaluates it (e.g. via DataFrame.query) that is an injection
        # vector - confirm and validate against the stored headers.
        pandasQuery = '&'.join(
            '{} <= {} <= {}'.format(rangeData[columnNameInRangeData]['min'],
                                    columnNameInRangeData,
                                    rangeData[columnNameInRangeData]['max'])
            for columnNameInRangeData in rangeData)

        mapOfInfo = informationRetreval.doRangeAnalysis(
            dataFrameObj, pandasQuery, columnNameToAnalyse, [])

        collectionDb = mongoDb.getMongoCollectionClient(
            host='localhost',
            port=27017,
            dbName='informationRetreival',
            collectionName='dataframeAnalysis')
        collectionDbHeaderAndType = mongoDb.getMongoCollectionClient(
            host='localhost',
            port=27017,
            dbName='informationRetreival',
            collectionName='dataframeHeadAndDtype')
        dataDb = collectionDb.find_one({"_id": fileName})
        headerTypeForFile = collectionDbHeaderAndType.find_one(
            {"fileName": fileName})

        # Numeric, usable columns other than the analysed one.
        headerType = [
            (f[0], f[1]) for f in headerTypeForFile['headers']
            if ('int' in f[1] or 'float' in f[1])
            and f[0] != columnNameToAnalyse
            and f[2] == True  # noqa: E712 - keep the stored-flag comparison
        ]
        mapOfHeaderAndType = {}
        if headerType:
            # Convert is defined elsewhere; presumably it fills the dict from
            # the (name, dtype) pairs - TODO confirm.
            mapOfHeaderAndType = Convert(headerType, mapOfHeaderAndType)
        useRange = len(mapOfHeaderAndType) > 0

        if useRange:
            # Snapshot the keys: the values are replaced while looping.
            for numericColumnName in list(mapOfHeaderAndType):
                dataForTheNumericColumn = dataDb['processed'][
                    'analysedData'][numericColumnName]['analysis']
                # Stored keys use ':;:' in place of the decimal point.
                mapOfHeaderAndType[numericColumnName] = sorted(
                    float(k.replace(':;:', '.'))
                    for k in dataForTheNumericColumn)

        uniqueValueNameMApToMode = {}
        columnDistributionData = {}
        # Default avoids a NameError when mapOfInfo is empty.
        columnHeaders = []
        for uniqueColumnVal in mapOfInfo:
            uniqueValueNameMApToMode[uniqueColumnVal] = {}
            modeDataOfColumn = mapOfInfo[uniqueColumnVal]
            dataAlongColumns = modeDataOfColumn['dataAlongDifferentColumn']
            # list() keeps .insert() below working on Python 3 dict views.
            columnHeaders = list(dataAlongColumns.keys())
            for columnValueToAnalyse in dataAlongColumns:
                modeData = dataAlongColumns[columnValueToAnalyse]['mode']
                columnDistributionData[uniqueColumnVal] = modeDataOfColumn[
                    'totalNumValues']
                if modeData is not None:
                    valueToSetForColumn = {
                        'mode': modeData,
                        'samples': [],
                        'relevant': True
                    }
                else:
                    valueToSetForColumn = {
                        'mode': None,
                        'samples': None,
                        'relevant': False
                    }
                uniqueValueNameMApToMode[uniqueColumnVal][
                    columnValueToAnalyse] = valueToSetForColumn

        logger.info('got the data from mongo db for {}'.format(fileName))
        columnHeaders.insert(0, columnNameToAnalyse)
        return json.dumps({
            'status': 'ok',
            'columnModeData': uniqueValueNameMApToMode,
            'columnValues': list(uniqueValueNameMApToMode.keys()),
            'columnHeaders': columnHeaders,
            'fileName': fileName,
            'distribution': columnDistributionData,
            'numericValue': useRange,
            'rangeValue': mapOfHeaderAndType
        })
def getTheParsedDataFromDb():
    """Return the stored per-value analysis of one column of an uploaded file.

    The POST body is JSON with ``nameOfFile`` (the ``_id`` of the document in
    ``dataframeAnalysis``) and ``columnToAnalyse``.  For every distinct value
    stored under that column, the mode recorded along each other column is
    collected, together with the value's ``totalNumValues``.  For the other
    numeric, usable columns the sorted list of stored values is returned as
    range information.

    Returns:
        A JSON string.  On success it has ``status: 'ok'`` plus
        ``columnModeData``, ``columnValues``, ``columnHeaders``, ``fileName``,
        ``distribution``, ``numericValue`` and ``rangeValue``; when the column
        is not part of the analysis the data entries are empty and
        ``columnHeaders`` holds only the requested column.  When Mongo has no
        header document or no analysis for the file, ``status`` is ``'error'``
        with an ``errorMessage``.  For non-POST requests nothing is returned
        (``None``), as before.

    Raises:
        KeyError: if the request lacks ``nameOfFile`` / ``columnToAnalyse``.
    """
    logger.info('got the request')
    if request.method == 'POST':
        data = json.loads(request.data)
        fileName = data['nameOfFile']
        columnName = data['columnToAnalyse']

        collectionDb = mongoDb.getMongoCollectionClient(
            host='localhost',
            port=27017,
            dbName='informationRetreival',
            collectionName='dataframeAnalysis')
        collectionDbHeaderAndType = mongoDb.getMongoCollectionClient(
            host='localhost',
            port=27017,
            dbName='informationRetreival',
            collectionName='dataframeHeadAndDtype')
        dataDb = collectionDb.find_one({"_id": fileName})
        headerTypeForFile = collectionDbHeaderAndType.find_one(
            {"fileName": fileName})

        # find_one() yields None for an unknown file, and a document may not
        # carry 'processed' yet; both used to crash the handler.
        analysedData = ((dataDb or {}).get('processed') or {}).get(
            'analysedData')
        if headerTypeForFile is None or analysedData is None:
            return json.dumps({
                'status': 'error',
                'errorMessage': 'no analysed data stored for {}'.format(
                    fileName)
            })

        # Numeric, usable columns other than the analysed one.
        headerType = [
            (f[0], f[1]) for f in headerTypeForFile['headers']
            if ('int' in f[1] or 'float' in f[1])
            and f[0] != columnName
            and f[2] == True  # noqa: E712 - keep the stored-flag comparison
        ]
        mapOfHeaderAndType = {}
        if headerType:
            # Convert is defined elsewhere; presumably it fills the dict from
            # the (name, dtype) pairs - TODO confirm.
            mapOfHeaderAndType = Convert(headerType, mapOfHeaderAndType)
        useRange = len(mapOfHeaderAndType) > 0

        if useRange:
            # Snapshot the keys: the values are replaced while looping.
            for numericColumnName in list(mapOfHeaderAndType):
                dataForTheNumericColumn = analysedData[numericColumnName][
                    'analysis']
                # Stored keys use ':;:' in place of the decimal point.
                mapOfHeaderAndType[numericColumnName] = sorted(
                    float(k.replace(':;:', '.'))
                    for k in dataForTheNumericColumn)

        uniqueValueNameMApToMode = {}
        columnDistributionData = {}
        # Default avoids a NameError when the column is absent or empty.
        columnHeaders = []
        if columnName in analysedData:
            dataForTheColumnName = analysedData[columnName]['analysis']
            for uniqueColumnVal in dataForTheColumnName:
                uniqueValueNameMApToMode[uniqueColumnVal] = {}
                modeDataOfColumn = dataForTheColumnName[uniqueColumnVal]
                dataAlongColumns = modeDataOfColumn[
                    'dataAlongDifferentColumn']
                # list() keeps .insert() below working on Python 3 dict views.
                columnHeaders = list(dataAlongColumns.keys())
                for columnValueToAnalyse in dataAlongColumns:
                    modeData = dataAlongColumns[columnValueToAnalyse]['mode']
                    columnDistributionData[uniqueColumnVal] = (
                        modeDataOfColumn['totalNumValues'])
                    if modeData is not None:
                        valueToSetForColumn = {
                            'mode': modeData,
                            'samples': [],
                            'relevant': True
                        }
                    else:
                        valueToSetForColumn = {
                            'mode': None,
                            'samples': None,
                            'relevant': False
                        }
                    uniqueValueNameMApToMode[uniqueColumnVal][
                        columnValueToAnalyse] = valueToSetForColumn

        logger.info('got the data from mongo db for {}'.format(fileName))
        columnHeaders.insert(0, columnName)
        return json.dumps({
            'status': 'ok',
            'columnModeData': uniqueValueNameMApToMode,
            'columnValues': list(uniqueValueNameMApToMode.keys()),
            'columnHeaders': columnHeaders,
            'fileName': fileName,
            'distribution': columnDistributionData,
            'numericValue': useRange,
            'rangeValue': mapOfHeaderAndType
        })