# Imports inferred from usage in the functions below (assumed module layout).
# Project-local helpers referenced here (utils, train, test, tokenize,
# normalize_corpus, stopword_list, engwords, get_mdl, categories, loadTfidfFile,
# loadEncFile, loadmodel, loadcsr_matrix, featuresconcat, Work, settings, log)
# are expected to come from elsewhere in the repository.
import os
import pickle
import sys
import traceback

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
import sparse_dot_topn.sparse_dot_topn as ct
from polyfuzz import PolyFuzz


def vector_trans(pData, pDesc, pModelName, pRootDir, pFromDir, pToDir):
    try:
        pData[pDesc].fillna("unknown", inplace=True)
        print('Started vector for Sample')
        # Character n-gram TF-IDF over the ticket description column.
        vec = TfidfVectorizer(ngram_range=(1, 3), stop_words="english",
                              analyzer='char', tokenizer=tokenize, min_df=3,
                              max_df=0.9, strip_accents='unicode', use_idf=1,
                              smooth_idf=1, sublinear_tf=1)
        vec.fit(pData[pDesc])
        x = vec.transform(pData[pDesc])
        # Persist the fitted vectorizer under <root>\<model>\<model[6:]>_Vector.
        vec_dir = pRootDir + '\\' + str(pModelName) + '\\' + str(pModelName[6:]) + '_Vector'
        if not os.path.exists(vec_dir):
            os.makedirs(vec_dir)
        vec_loc = vec_dir + '\\' + str(pModelName[6:]) + ".vector.pkl"
        with open(vec_loc, 'wb') as f:
            pickle.dump(vec, f)
        print('Completed vector for Sample')
    except OSError as e:
        print(traceback.format_exc())
        print("*** ERROR[002]: %s " % e.strerror)
        utils.movefile(pFromDir, pToDir)
        return -1
    return x, vec
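
# Illustrative call, as a minimal sketch only -- the DataFrame, column name, model
# name, and directory paths are assumed placeholders, not values from the original
# pipeline:
#
#   tickets = pd.read_excel(r'C:\Data\tickets.xlsx')
#   x, vec = vector_trans(tickets, 'Sample', 'Model_Support', r'C:\Models',
#                         r'C:\Data\Training', r'C:\Data\Failed')
#   # x is the TF-IDF document-term matrix and vec the fitted vectorizer, which is
#   # also pickled for reuse at prediction time.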

def preprocess(pData, pTktDesc, pTrainDir, pFailedDir, ewords):
    try:
        pData = pData.applymap(str)
        # pData = col_keyword(pData, pTktDesc, pCol)
        # pData = pData.dropna(subset=['Sample'])
        pData = pData.dropna(subset=[pTktDesc])
        norm_corpus = normalize_corpus(corpus=pData[pTktDesc],
                                       html_stripping=True,
                                       contraction_expansion=True,
                                       accented_char_removal=True,
                                       text_lower_case=True,
                                       text_lemmatization=False,
                                       text_stemming=False,
                                       special_char_removal=True,
                                       remove_digits=True,
                                       custm_stpwrds=False,
                                       stopword_removal=True,
                                       ewords=ewords,
                                       stopwords=stopword_list,
                                       eng_words=engwords)
    except Exception as e:
        utils.movefile(pTrainDir, pFailedDir)
        print(traceback.format_exc())
        print('Error occurred due to template', e)
        return -1
    pData['Sample'] = norm_corpus
    return 0, pData
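
# Sketch of calling the preprocessing entry point (assumed inputs; normalize_corpus,
# stopword_list, and engwords are project-local helpers referenced above):
#
#   rc, clean_df = preprocess(raw_df, 'Description', r'C:\Data\Training',
#                             r'C:\Data\Failed', ewords=['outlook', 'vpn'])
#   # clean_df carries the normalised text in a new 'Sample' column, which the
#   # training and similarity functions below consume.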

def similaritymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir):
    try:
        pMatches = []
        pTestData['Intent'], pTestData['Confidence_Level'] = 'Nan', 'Nan'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique()
        # TF-IDF over character n-grams, then a 1-nearest-neighbour lookup of each
        # test description against the unique training descriptions.
        vectorizer = TfidfVectorizer(min_df=1, analyzer='char_wb', lowercase=False)
        tfidf = vectorizer.fit_transform(pTrainDataDescUnq)
        nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
        queryTFIDF_ = vectorizer.transform(pTestData[pDesc].values)
        distances, indices = nbrs.kneighbors(queryTFIDF_)
        pTestDataDescList = list(pTestData[pDesc].values)  # need to convert back to a list
        for i, j in enumerate(indices):
            pTemp = [distances[i][0], pTrainDataDesc.values[j][0][0], pTestDataDescList[i]]
            pMatches.append(pTemp)
        pMatchesDf = pd.DataFrame(pMatches, columns=['Confidence_Level', 'Matched name', 'Original name'])
        for i in range(len(pTestData)):
            pTestData['Intent'][i] = pTrainData[np.where(pTrainData[pDesc] == pMatchesDf['Matched name'][i],
                                                         True, False)]['Intent'].values[0]
            pTestData['Confidence_Level'][i] = pMatchesDf['Confidence_Level'][i]
    except Exception as e:
        print('*** ERROR[003]: Error in similarity main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0, pTestData
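
# Minimal sketch of the nearest-neighbour matching flow above (assumed column names
# and directories, not part of the original module):
#
#   rc, scored = similaritymain(train_df, test_df, 'Level1', 'Level2', 'Sample',
#                               r'C:\Data\Training', r'C:\Data\Failed')
#   # On success rc == 0 and scored carries 'Intent' (copied from the closest
#   # training description) and 'Confidence_Level' (the kneighbors distance, so
#   # lower means a closer match).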

def awesome_cossim_top(A, B, ntop, pFromDir, pToDir, lower_bound=0):
    try:
        # Force A and B to CSR; if they already are CSR there is no overhead.
        A = A.tocsr()
        B = B.tocsr()
        M, _ = A.shape
        _, N = B.shape
        idx_dtype = np.int32
        nnz_max = M * ntop
        indptr = np.zeros(M + 1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)
        # sparse_dot_topn keeps only the ntop largest values per row of A * B
        # that are at least lower_bound.
        ct.sparse_dot_topn(
            M, N,
            np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    except Exception as e:
        print('*** ERROR[001]: Error in similarity calculating matrix: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return csr_matrix((data, indices, indptr), shape=(M, N))
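
# Self-contained sketch of the top-n sparse cosine-similarity pattern used above,
# on toy data (an assumed example, not from the pipeline):
#
#   docs = ["reset password", "password reset request", "disk space alert"]
#   tf = TfidfVectorizer(analyzer='char_wb').fit(docs)
#   A = tf.transform(docs)                       # query rows
#   B = tf.transform(docs).transpose().tocsr()   # corpus, transposed so A*B is doc-by-doc
#   top2 = awesome_cossim_top(A, B, ntop=2, pFromDir='.', pToDir='.', lower_bound=0.1)
#   # top2 is an (n_queries x n_docs) CSR matrix keeping only the two largest
#   # similarities per row that exceed lower_bound.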

def createModel(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir, nTickets,
                pFromDir, pToDir, features, pFeature):
    try:
        x, vec = vector_trans(pData, pDesc, pModelName, pRootDir, pFromDir, pToDir)
        if pFeature:
            # Augment the text vectors with one-hot encoded categorical features.
            x = concatfeatures(pData, features, pRootDir, pModelName, pFromDir, pToDir, pVec=x)
        print('Number of Tickets for training :', len(pData))
        pTrainData, __ = traindata(pData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainData['Intent'] = pTrainData['Intent'].astype('category')
        # Train only intents that have more than nTickets examples.
        counts = pTrainData['Intent'].value_counts()
        pLabel = [k for k in counts.keys() if counts[k] > int(nTickets)]
        pTrainData = pd.concat([pTrainData, pd.get_dummies(pTrainData['Intent'])], axis=1)
        pTrainData.drop(['Intent'], axis=1, inplace=True)
        for name in pLabel:
            print('Creating vector for intent: ', name)
            m, r = get_mdl(x, pTrainData[name])
            safe_name = str(name).replace('/', 'or').replace(':', 'or')
            for foldername in ['_Csr_matrix', '_Model']:
                folder = pRootDir + '\\' + str(pModelName) + '\\' + str(pModelName[6:]) + foldername
                if not os.path.exists(folder):
                    os.makedirs(folder)
                if foldername == '_Model':
                    model_loc = folder + '\\' + safe_name + ".model.pkl"
                    with open(model_loc, 'wb') as f:
                        pickle.dump(m, f)
                else:
                    r_loc = folder + '\\' + safe_name + ".npz"
                    sparse.save_npz(r_loc, r)
    except OSError as e:
        print(traceback.format_exc())
        print("*** ERROR[003] : %s" % e.strerror)
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0
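
# Illustrative training call (a sketch under assumed argument values; get_mdl is
# the project-local per-intent model builder referenced above):
#
#   rc = createModel(train_df, 'Sample', 'Level1', 'Level2', 'Model_Support',
#                    r'C:\Models', nTickets=10, pFromDir=r'C:\Data\Training',
#                    pToDir=r'C:\Data\Failed', features=['Priority'], pFeature=False)
#   # One binary model (.model.pkl) and one weight matrix (.npz) are saved per
#   # intent that has more than nTickets training examples.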

def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Intent'], pTestData['Confidence_Level'] = 'Nan', 'Nan'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values)  # need to convert back to a list
        # PolyFuzz n-best TF-IDF matching of test descriptions against the unique
        # training descriptions.
        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest=int(Nbest))
        pMatchesDf = model.get_matches()
        IntCol = ["To"]
        for i in range(1, int(Nbest) - 1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestData['Intent' + '__' + str(i)] = 'NaN'
        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestData['Confidence_Level' + '__' + str(k)] = 'NaN'
        # Map each matched training description back to its intent.
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            if col != "To":
                for j in range(len(pTestData)):
                    if pMatchesDf[col][j] is not None:
                        pTestData['Intent' + '__' + str(i)][j] = pTrainData[np.where(
                            pTrainData[pDesc] == pMatchesDf[col][j], True, False)]['Intent'].values[0]
            else:
                for j in range(len(pTestData)):
                    if pMatchesDf[col][j] is not None:
                        pTestData['Intent'][j] = pTrainData[np.where(
                            pTrainData[pDesc] == pMatchesDf[col][j], True, False)]['Intent'].values[0]
        # Copy the corresponding similarity scores.
        for l in range(len(SimCol)):
            col = str(SimCol[l])
            if col != "Similarity":
                for m in range(len(pTestData)):
                    if pMatchesDf[col][m] is not None:
                        pTestData['Confidence_Level' + '__' + str(l)][m] = pMatchesDf[col][m]
            else:
                for m in range(len(pTestData)):
                    if pMatchesDf[col][m] is not None:
                        pTestData['Confidence_Level'][m] = pMatchesDf[col][m]
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0, pTestData
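
# Sketch of the PolyFuzz n-best matching call above (assumed inputs; Nbest controls
# how many BestMatch__i / Similarity__i columns are consumed from get_matches()):
#
#   rc, scored = similaritypolymain(train_df, test_df, 'Level1', 'Level2', 'Sample',
#                                   r'C:\Data\Training', r'C:\Data\Failed', Nbest=3)
#   # scored gains 'Intent'/'Confidence_Level' for the best match plus
#   # 'Intent__i'/'Confidence_Level__i' columns for the remaining candidates.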

def delete(self):
    content_dir = self.get_content_directory()
    log.info("Delete called on work [%d]" % self.id)
    # delete() unsets the ID attribute, which we need in order to clear out the directory.
    saved_id = self.id
    super(Work, self).delete()
    fname = "saved-work-%d.zip" % saved_id
    self.id = saved_id
    if hasattr(settings, "ATTIC"):
        try:
            target = os.path.join(settings.ATTIC, fname)
            log.info("Saving work to %s" % target)
            archive = utils.recreate_ingest_package(self)
            utils.movefile(archive, target)
        except OSError as ose:
            log.error("Error encountered saving deleted work (id:%s) to %s: %r"
                      % (saved_id, target, ose))

def traindata(pData, pDesc, pLevel1, pLevel2, pFromDir, pToDir):
    try:
        pData[pDesc] = pData[pDesc].astype('str')
        pData[pLevel1] = pData[pLevel1].astype('str')
        pData[pLevel2] = pData[pLevel2].astype('str')
        pData = pData.dropna(subset=[pLevel1, pLevel2])
        # The intent label is the two classification levels joined by '__'.
        pData['Intent'] = pData[[pLevel1, pLevel2]].agg('__'.join, axis=1).astype('category')
        pLabel = pData['Intent'].cat.categories.tolist()
    except Exception as e:
        print('*** ERROR[002]: Error in similarity transform train data: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return pData, pLabel
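
# Minimal sketch of the label construction above (assumed toy data):
#
#   df = pd.DataFrame({'Sample': ['reset password', 'disk full'],
#                      'Level1': ['Access', 'Infra'],
#                      'Level2': ['Password', 'Storage']})
#   df, labels = traindata(df, 'Sample', 'Level1', 'Level2', r'C:\In', r'C:\Failed')
#   # labels -> ['Access__Password', 'Infra__Storage']; df['Intent'] holds the same
#   # joined categories used later for one-vs-rest training.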

def maintrain(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir, nTickets,
              pFromDir, pToDir, pSheetName, features, pFeature):
    if not set([pDesc, pLevel1, pLevel2]).issubset(pData.columns):
        utils.movefile(pFromDir, pToDir)
        __, pFailedData = utils.Filelist(pToDir, pSheetName)
        print('*** ERROR[001]: Loading XLS - Could be due to using non-standard template ***',
              str(pFailedData.columns))
        return -1, pData
    try:
        pData = pData.dropna(subset=[pDesc, pLevel1, pLevel2], how='any')
        train.createModel(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir,
                          nTickets, pFromDir, pToDir, features, pFeature)
    except Exception as e:
        print('*** ERROR[002]: Error in Train main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1
    return 0

def concatfeatures(pData, features, pRootDir, pModelName, pFromDir, pToDir, pVec):
    try:
        # One-hot encode the categorical feature columns and append them to the
        # TF-IDF matrix passed in as pVec.
        encoder = OneHotEncoder(categories="auto", handle_unknown='ignore')
        Train_encoded = encoder.fit_transform(pData[features])
        x = hstack([pVec, Train_encoded]).tocsr()
        # Persist the fitted encoder under <root>\<model>\<model[6:]>_Ohe_encode.
        enc_dir = pRootDir + '\\' + str(pModelName) + '\\' + str(pModelName[6:]) + '_Ohe_encode'
        if not os.path.exists(enc_dir):
            os.makedirs(enc_dir)
        enc_loc = enc_dir + '\\' + str(pModelName[6:]) + ".ohe.pkl"
        with open(enc_loc, 'wb') as f:
            pickle.dump(encoder, f)
    except Exception as e:
        print('*** ERROR[001]: Error in Training: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return x
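
# Sketch of combining the TF-IDF matrix with one-hot encoded categorical columns
# (assumed feature names; pVec is the matrix returned by vector_trans above):
#
#   x, vec = vector_trans(train_df, 'Sample', 'Model_Support', r'C:\Models',
#                         r'C:\Data\Training', r'C:\Data\Failed')
#   x_full = concatfeatures(train_df, ['Priority', 'Queue'], r'C:\Models',
#                           'Model_Support', r'C:\Data\Training', r'C:\Data\Failed',
#                           pVec=x)
#   # x_full hstacks the text vectors with the encoded features; the fitted
#   # OneHotEncoder is pickled for reuse at prediction time.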

def maintest(pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2, pModelName,
             pRootDir, pFromDir, pToDir, pSheetName, sim, features, pFeature):
    if not set([pDesc, pTicketId]).issubset(pData.columns):
        utils.movefile(pFromDir, pToDir)
        __, pFailedData = utils.Filelist(pToDir, pSheetName)
        print('*** ERROR[003]: Loading XLS - Could be due to using non-standard template ***',
              str(pData.columns))
        return -1, pData
    try:
        pData = pData.dropna(subset=[pDesc, pTicketId], how='any')
        _, TestOutputData, pClassNames, pVec = test.intentpred(
            pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2, pModelName,
            pRootDir, pFromDir, pToDir, sim, features, pFeature)
    except Exception as e:
        print('*** ERROR[004]: Error in Test main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1
    return 0, TestOutputData, pClassNames, pVec

def intentpred(pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2, pModelName,
               pRootDir, pFromDir, pToDir, sim, features, pFeatures):
    try:
        if 'Confidence_Level' not in pData:
            pData['Confidence_Level'] = float(pThSim) + 1
        if sim:
            # Only predict tickets whose similarity confidence fell below the threshold.
            pDataTh = pData[np.where(pData['Confidence_Level'] < float(pThSim), True, False)]
        else:
            pDataTh = pData
        print('Length of file for Prediction after similarity:', pDataTh.shape[0])
        # Category names and the saved TF-IDF vectorizer for this model.
        oCategoryNames = categories(pRootDir, pModelName)
        vec = loadTfidfFile(pRootDir, pModelName)
        if len(pDataTh) > 0:
            pDataTh[pDesc].fillna("unknown", inplace=True)
            pDataTh[pTicketId] = pDataTh[pTicketId].astype('category')
            preds = np.zeros((len(pDataTh), len(oCategoryNames)))
            if pFeatures:
                oTktVec = vec.transform(pDataTh[pDesc].astype(str))
                encoder = loadEncFile(pRootDir, pModelName)
                tkt_desc = featuresconcat(pDataTh, features, encoder, pVec=oTktVec)
            else:
                tkt_desc = vec.transform(pDataTh[pDesc].astype(str))
            # One binary model per intent; collect the per-class probabilities.
            for index, name in enumerate(oCategoryNames):
                print('Calculating prediction of intent', name)
                estimator = loadmodel(pRootDir, pModelName, name)
                r = loadcsr_matrix(pRootDir, pModelName, name)
                preds[:, index] = estimator.predict_proba(tkt_desc.multiply(r))[:, 1]
            pintentdf = pd.DataFrame(preds, columns=oCategoryNames)
            pintentdf['Confidence_Level'] = pintentdf[oCategoryNames].max(axis=1)
            pintentdf['Intent'] = pintentdf[oCategoryNames].idxmax(axis=1)
            # Anything below the prediction threshold is labelled 'Others'.
            pintentdf['Intent'] = np.where(pintentdf['Confidence_Level'] > float(pTh),
                                           pintentdf['Intent'], 'Others')
            pDataTh.reset_index(drop=True, inplace=True)
            pintentdf.reset_index(drop=True, inplace=True)
            pintentdf = pd.concat([pDataTh[pTicketId], pintentdf], axis=1)
            pintentdf = pintentdf[[pTicketId, 'Confidence_Level', 'Intent']]
            pData.loc[pData[pTicketId].isin(pintentdf[pTicketId]),
                      ['Confidence_Level', 'Intent']] = pintentdf[['Confidence_Level', 'Intent']].values
            pData[['Level1', 'Level2']] = pData.Intent.str.split("__", expand=True)
        else:
            pData['Confidence_Level'] = pData['Confidence_Level'].astype('float')
            pData[['Level1', 'Level2']] = pData.Intent.str.split("__", expand=True)
    except Exception as e:
        print('*** ERROR[001]: intentpred ***', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1, pData
    return 0, pData, oCategoryNames, vec
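
# End-to-end prediction sketch (assumed argument values; the thresholds arrive as
# strings elsewhere in the pipeline, hence the float() casts above):
#
#   rc, scored, class_names, vec = intentpred(
#       test_df, 'Sample', pTh='0.3', pThSim='0.8', pTicketId='TicketId',
#       pLevel1='Level1', pLevel2='Level2', pModelName='Model_Support',
#       pRootDir=r'C:\Models', pFromDir=r'C:\Data\Test', pToDir=r'C:\Data\Failed',
#       sim=True, features=[], pFeatures=False)
#   # Rows already resolved by similarity (Confidence_Level >= pThSim) are left
#   # untouched; the rest are scored by the per-intent models, and anything below
#   # pTh is labelled 'Others'.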