Example #1
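These snippets are excerpts, so module-level imports and project helpers (tokenize, normalize_corpus, stopword_list, engwords, utils, traindata, get_mdl, and the load* functions) are not shown. For the scikit-learn/pandas parts, the imports are roughly the following; the sparse_dot_topn alias is an assumption matching the older API that exposes the Cython kernel as ct:

import os
import pickle
import sys
import traceback

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder

import sparse_dot_topn.sparse_dot_topn as ct  # older sparse_dot_topn releases
from polyfuzz import PolyFuzz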
def vector_trans(pData, pDesc, pModelName, pRootDir, pFromDir, pToDir):
    try:
        pData[pDesc].fillna("unknown", inplace=True)
        print('Started vector for Sample ')
        # NOTE: with analyzer='char', scikit-learn ignores stop_words and
        # tokenizer; both are kept here as in the original example.
        vec = TfidfVectorizer(ngram_range=(1, 3),
                              stop_words="english",
                              analyzer='char',
                              tokenizer=tokenize,
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True)
        vec.fit(pData[pDesc])
        x = vec.transform(pData[pDesc])
        vec_dir = os.path.join(pRootDir, str(pModelName),
                               str(pModelName[6:]) + '_Vector')
        if not os.path.exists(vec_dir):
            os.makedirs(vec_dir)
        vec_loc = os.path.join(vec_dir, str(pModelName[6:]) + ".vector.pkl")
        with open(vec_loc, 'wb') as fh:
            pickle.dump(vec, fh)
        print('completed vector for Sample ')

    except OSError as e:
        print(traceback.format_exc())
        print("*** ERROR[002]: %s" % e.strerror)
        utils.movefile(pFromDir, pToDir)
        return -1
    return x, vec
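A minimal, self-contained sketch of the same vectorise-and-persist step (the column name 'Description' and the output file name are placeholders; the custom tokenize callable is omitted because scikit-learn ignores tokenizer and stop_words whenever analyzer='char'):

import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame({"Description": ["printer is offline", "cannot reset password", None]})
df["Description"] = df["Description"].fillna("unknown")
vec = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',
                      strip_accents='unicode', sublinear_tf=True)
x = vec.fit_transform(df["Description"])          # sparse document-term matrix
with open("Description.vector.pkl", "wb") as fh:  # persist for reuse at test time
    pickle.dump(vec, fh)
print(x.shape)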
def preprocess(pData, pTktDesc, pTrainDir, pFailedDir, ewords):
    try:
        pData = pData.applymap(str)
        # pData = col_keyword(pData, pTktDesc, pCol)
        # pData = pData.dropna(subset = ['Sample'])
        pData = pData.dropna(subset=[pTktDesc])
        norm_corpus = normalize_corpus(corpus=pData[pTktDesc],
                                       html_stripping=True,
                                       contraction_expansion=True,
                                       accented_char_removal=True,
                                       text_lower_case=True,
                                       text_lemmatization=False,
                                       text_stemming=False,
                                       special_char_removal=True,
                                       remove_digits=True,
                                       custm_stpwrds=False,
                                       stopword_removal=True,
                                       ewords=ewords,
                                       stopwords=stopword_list,
                                       eng_words=engwords)
    except Exception as e:
        utils.movefile(pTrainDir, pFailedDir)
        print(traceback.format_exc())
        print('Error occurred due to template', e)
        return -1

    pData['Sample'] = norm_corpus
    return (0, pData)
def similaritymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir):
    try:
        pMatches = []
        pTestData['Intent'], pTestData['Confidence_Level'] = 'NaN', 'NaN'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique()
        vectorizer = TfidfVectorizer(min_df=1, analyzer='char_wb', lowercase=False)
        tfidf = vectorizer.fit_transform(pTrainDataDescUnq)
        nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
        queryTFIDF_ = vectorizer.transform(pTestData[pDesc].values)
        distances, indices = nbrs.kneighbors(queryTFIDF_)
        pTestDataDescList = list(pTestData[pDesc].values) #need to convert back to a list

        for i, j in enumerate(indices):
            pTemp = [distances[i][0], pTrainDataDesc.values[j][0][0], pTestDataDescList[i]]
            pMatches.append(pTemp)
        pMatchesDf = pd.DataFrame(pMatches, columns=['Confidence_Level', 'Matched name', 'Original name'])
        
        for i in range(len(pTestData)):
            mask = pTrainData[pDesc] == pMatchesDf['Matched name'][i]
            pTestData.loc[pTestData.index[i], 'Intent'] = pTrainData[mask]['Intent'].values[0]
            pTestData.loc[pTestData.index[i], 'Confidence_Level'] = pMatchesDf['Confidence_Level'][i]
    except Exception as e:
        print('*** ERROR[003]: Error in similarity main function: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)
    return(0, pTestData)
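The matching core of similaritymain, reduced to a runnable sketch with toy strings standing in for ticket descriptions:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

train = ["reset password", "printer offline"]
test = ["password resett", "printer is offline"]
vec = TfidfVectorizer(min_df=1, analyzer='char_wb', lowercase=False)
tfidf = vec.fit_transform(train)
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
distances, indices = nbrs.kneighbors(vec.transform(test))
for dist, idx, query in zip(distances, indices, test):
    print(query, '->', train[idx[0]], 'distance', dist[0])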
def awesome_cossim_top(A, B, ntop, pFromDir, pToDir, lower_bound=0):
    try:
        # Force A and B to CSR format.
        # If they are already CSR, there is no overhead.
        A = A.tocsr()
        B = B.tocsr()
        M, _ = A.shape
        _, N = B.shape
     
        idx_dtype = np.int32
     
        nnz_max = M*ntop
     
        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    except Exception as e:
        print('*** ERROR[001]: Error in similarity calculating matrix: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)

    return csr_matrix((data, indices, indptr), shape=(M, N))
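A hypothetical usage sketch for the helper above. Rows produced by TfidfVectorizer are L2-normalised, so A.dot(B) here is cosine similarity; the directory arguments are placeholders, since they are only used on failure:

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
A = vec.fit_transform(["password reset", "printer offline"])    # queries
B = vec.transform(["reset my password", "printer is down"]).T   # candidates, transposed
matches = awesome_cossim_top(A, B, ntop=1, pFromDir='.', pToDir='.', lower_bound=0.1)
print(matches.toarray())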
Example #5
def createModel(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir, nTickets,
                pFromDir, pToDir, features, pFeature):
    try:
        if pFeature:
            x, vec = vector_trans(pData, pDesc, pModelName, pRootDir, pFromDir,
                                  pToDir)
            x = concatfeatures(pData,
                               features,
                               pRootDir,
                               pModelName,
                               pFromDir,
                               pToDir,
                               pVec=vec)
        else:
            x, vec = vector_trans(pData, pDesc, pModelName, pRootDir, pFromDir,
                                  pToDir)

        print('Number of Tickets for training :', len(pData))
        pTrainData, __ = traindata(pData, pDesc, pLevel1, pLevel2, pFromDir,
                                   pToDir)
        pTrainData['Intent'] = pTrainData['Intent'].astype('category')
        counts = pTrainData['Intent'].value_counts()
        pLabel = [k for k in counts.keys() if counts[k] > int(nTickets)]
        pTrainData = pd.concat(
            [pTrainData, pd.get_dummies(pTrainData['Intent'])], axis=1)
        pTrainData.drop(['Intent'], axis=1, inplace=True)

        for index, name in enumerate(pLabel):
            print('Creating vector for intent: ', name)
            m, r = get_mdl(x, pTrainData[name])
            pFolderName = ['_Csr_matrix', '_Model']
            for foldername in pFolderName:
                folder = os.path.join(pRootDir, str(pModelName),
                                      str(pModelName[6:]) + foldername)
                if not os.path.exists(folder):
                    os.makedirs(folder)
                safe_name = str(name).replace('/', 'or').replace(':', 'or')
                if foldername == '_Model':
                    model_loc = os.path.join(folder, safe_name + ".model.pkl")
                    with open(model_loc, 'wb') as fh:
                        pickle.dump(m, fh)
                else:
                    r_loc = os.path.join(folder, safe_name + ".npz")
                    sparse.save_npz(r_loc, r)

    except OSError as e:
        print(traceback.format_exc())
        print("*** ERROR[003]: %s" % e.strerror)
        utils.movefile(pFromDir, pToDir)
        return -1
    return (0)
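The label-selection and one-hot step from createModel in isolation, on toy intents (nTickets is the minimum example count per intent, as above):

import pandas as pd

df = pd.DataFrame({'Intent': ['A', 'A', 'B', 'A', 'B', 'C']})
nTickets = 1
counts = df['Intent'].value_counts()
labels = [k for k in counts.keys() if counts[k] > nTickets]  # intents with enough data
df = pd.concat([df, pd.get_dummies(df['Intent'])], axis=1)   # one 0/1 column per intent
df.drop(['Intent'], axis=1, inplace=True)
print(labels)                  # ['A', 'B']
print(df.columns.tolist())     # ['A', 'B', 'C']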
def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Intent'], pTestData['Confidence_Level'] = 'NaN', 'NaN'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values) #need to convert back to a list
        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest=int(Nbest))
        pMatchesDf = model.get_matches()

        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestData['Intent' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            target = 'Intent' if col == "To" else 'Intent' + '__' + str(i - 1)
            for j in range(len(pTestData)):
                if pMatchesDf[col][j] is not None:
                    mask = pTrainData[pDesc] == pMatchesDf[col][j]
                    pTestData.loc[pTestData.index[j], target] = pTrainData[mask]['Intent'].values[0]

        for l in range(len(SimCol)):
            col = str(SimCol[l])
            target = ('Confidence_Level' if col == "Similarity"
                      else 'Confidence_Level' + '__' + str(l - 1))
            for m in range(len(pTestData)):
                if pMatchesDf[col][m] is not None:
                    pTestData.loc[pTestData.index[m], target] = pMatchesDf[col][m]
            
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return (0, pTestData)
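A minimal PolyFuzz sketch of the single-best-match case (pip install polyfuzz). The nbest keyword used above is version-dependent; the basic API below returns one match per input string:

from polyfuzz import PolyFuzz

model = PolyFuzz("TF-IDF")
model.match(["printer broke", "resett password"],      # from-list (test descriptions)
            ["printer is broken", "reset password"])   # to-list (training descriptions)
print(model.get_matches())  # DataFrame with From, To and Similarity columns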
Example #7
def delete(self):
    content_dir = self.get_content_directory()
    log.info("Delete called on work [%d]" % self.id)
    # delete() unsets the ID attribute, which we need in order to clear
    # out the directory, so save it first.
    saved_id = self.id
    super(Work, self).delete()
    fname = "saved-work-%d.zip" % saved_id
    self.id = saved_id
    if hasattr(settings, "ATTIC"):
        try:
            target = os.path.join(settings.ATTIC, fname)
            log.info("Saving work to %s" % target)
            archive = utils.recreate_ingest_package(self)
            utils.movefile(archive, target)
        except OSError as ose:
            log.error("Error encountered saving deleted work (id:%s) to %s: %r"
                      % (saved_id, target, ose))
def traindata(pData, pDesc, pLevel1, pLevel2, pFromDir, pToDir):
    try:
        pData[pDesc] = pData[pDesc].astype('str')
        pData[pLevel1] = pData[pLevel1].astype('str')
        pData[pLevel2] = pData[pLevel2].astype('str')
        pData = pData.dropna(subset=[pLevel1, pLevel2])
        pData['Intent'] = pData[[pLevel1, pLevel2]].agg('__'.join, axis=1).astype('category')
        pLabel = pData['Intent'].cat.categories.tolist()
        
    except Exception as e:
        print('*** ERROR[002]: Error in similarity transform train data: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)
    return pData, pLabel
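The Intent construction on toy data, for reference:

import pandas as pd

df = pd.DataFrame({'Level1': ['HW', 'SW'], 'Level2': ['Printer', 'Login']})
df['Intent'] = df[['Level1', 'Level2']].agg('__'.join, axis=1).astype('category')
print(df['Intent'].tolist())  # ['HW__Printer', 'SW__Login']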
def maintrain(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir, nTickets,
              pFromDir, pToDir, pSheetName, features, pFeature):
    if not set([pDesc, pLevel1, pLevel2]).issubset(pData.columns):
        utils.movefile(pFromDir, pToDir)
        __, pFailedData = utils.Filelist(pToDir, pSheetName)
        print(
            '*** ERROR[001]: Loading XLS - Could be due to using non-standard template ***',
            str(pFailedData.columns))
        return (-1, pData)
    try:
        pData = pData.dropna(subset=[pDesc, pLevel1, pLevel2], how='any')
        train.createModel(pData, pDesc, pLevel1, pLevel2, pModelName, pRootDir,
                          nTickets, pFromDir, pToDir, features, pFeature)

    except Exception as e:
        print('*** ERROR[002]: Error in Train main function: ',
              sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0)
Example #10
def concatfeatures(pData, features, pRootDir, pModelName, pFromDir, pToDir,
                   pVec):
    try:
        encoder = OneHotEncoder(categories="auto", handle_unknown='ignore')
        Train_encoded = encoder.fit_transform(pData[features])
        x = hstack([pVec, Train_encoded]).tocsr()
        enc_dir = os.path.join(pRootDir, str(pModelName),
                               str(pModelName[6:]) + '_Ohe_encode')
        if not os.path.exists(enc_dir):
            os.makedirs(enc_dir)
        enc_loc = os.path.join(enc_dir, str(pModelName[6:]) + ".ohe.pkl")
        with open(enc_loc, 'wb') as fh:
            pickle.dump(encoder, fh)

    except Exception as e:
        print('*** ERROR[001]: Error in Training: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return x
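A self-contained sketch of the feature-concatenation idea (column names are placeholders): TF-IDF text vectors stacked horizontally with one-hot-encoded categoricals into a single CSR matrix:

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'Description': ['vpn down', 'disk full'],
                   'Priority': ['P1', 'P2']})
text_vec = TfidfVectorizer().fit_transform(df['Description'])
ohe = OneHotEncoder(handle_unknown='ignore').fit(df[['Priority']])
x = hstack([text_vec, ohe.transform(df[['Priority']])]).tocsr()
print(x.shape)  # (2, n_text_features + n_categories)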
def maintest(pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2,
             pModelName, pRootDir, pFromDir, pToDir, pSheetName, sim, features,
             pFeature):
    if not set([pDesc, pTicketId]).issubset(pData.columns):
        utils.movefile(pFromDir, pToDir)
        __, pFailedData = utils.Filelist(pToDir, pSheetName)
        print(
            '*** ERROR[003]: Loading XLS - Could be due to using non-standard template ***',
            str(pFailedData.columns))
        return (-1, pData)
    try:
        pData = pData.dropna(subset=[pDesc, pTicketId], how='any')
        _, TestOutputData, pClassNames, pVec = test.intentpred(
            pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2, pModelName,
            pRootDir, pFromDir, pToDir, sim, features, pFeature)

    except Exception as e:
        print('*** ERROR[004]: Error in Test main function: ',
              sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0, TestOutputData, pClassNames, pVec)
def intentpred(pData, pDesc, pTh, pThSim, pTicketId, pLevel1, pLevel2,
               pModelName, pRootDir, pFromDir, pToDir, sim, features,
               pFeatures):

    try:
        if 'Confidence_Level' not in pData:
            pData['Confidence_Level'] = float(pThSim) + 1
        if sim:
            pDataTh = pData[pData['Confidence_Level'] < float(pThSim)]
        else:
            pDataTh = pData
        print('Length of file for Prediction after similarity:',
              pDataTh.shape[0])
        if len(pDataTh) > 0:
            pDataTh[pDesc] = pDataTh[pDesc].fillna("unknown")
            oCategoryNames = categories(pRootDir, pModelName)
            pDataTh[pTicketId] = pDataTh[pTicketId].astype('category')
            preds = np.zeros((len(pDataTh), len(oCategoryNames)))

            vec = loadTfidfFile(pRootDir, pModelName)

            if pFeatures:
                oTktVec = vec.transform(pDataTh[pDesc].astype(str))
                encoder = loadEncFile(pRootDir, pModelName)
                tkt_desc = featuresconcat(pDataTh,
                                          features,
                                          encoder,
                                          pVec=oTktVec)
            else:
                tkt_desc = vec.transform(pDataTh[pDesc].astype(str))

            for index, name in enumerate(oCategoryNames):
                print('Calculating prediction of intent', name)
                estimator = loadmodel(pRootDir, pModelName, name)
                r = loadcsr_matrix(pRootDir, pModelName, name)
                preds[:, index] = estimator.predict_proba(tkt_desc.multiply(r))[:, 1]
            pintentdf = pd.DataFrame(preds, columns=oCategoryNames)
            pintentdf['Confidence_Level'] = pintentdf[oCategoryNames].max(
                axis=1)
            pintentdf['Intent'] = pintentdf[oCategoryNames].idxmax(axis=1)
            pintentdf['Intent'] = np.where(
                pintentdf['Confidence_Level'] > float(pTh),
                pintentdf['Intent'], 'Others')
            pDataTh.reset_index(drop=True, inplace=True)
            pintentdf.reset_index(drop=True, inplace=True)
            pintentdf = pd.concat([pDataTh[pTicketId], pintentdf], axis=1)
            pintentdf = pintentdf[[pTicketId, 'Confidence_Level', 'Intent']]
            pData.loc[pData[pTicketId].isin(pintentdf[pTicketId]),
                      ['Confidence_Level', 'Intent']] = pintentdf[[
                          'Confidence_Level', 'Intent'
                      ]].values
        else:
            pData['Confidence_Level'] = pData['Confidence_Level'].astype(
                'float')

        pData[['Level1', 'Level2']] = pData.Intent.str.split("__", expand=True)

    except Exception as e:
        print('*** ERROR[001]: intentpred ***', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return (-1, pData)
    return (0, pData, oCategoryNames, vec)
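A sketch of the prediction-assembly step above: per-intent probabilities collected column-wise, the row maximum becoming the confidence and the argmax the intent, with a fallback to 'Others' below the threshold (toy numbers; 0.5 stands in for pTh):

import numpy as np
import pandas as pd

names = ['HW__Printer', 'SW__Login']
preds = np.array([[0.9, 0.1], [0.3, 0.4]])
df = pd.DataFrame(preds, columns=names)
df['Confidence_Level'] = df[names].max(axis=1)
df['Intent'] = df[names].idxmax(axis=1)
df['Intent'] = np.where(df['Confidence_Level'] > 0.5, df['Intent'], 'Others')
df[['Level1', 'Level2']] = df['Intent'].str.split('__', expand=True)
print(df[['Intent', 'Confidence_Level', 'Level1', 'Level2']])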