def predictLabels(cv,tfidf,clf,df_toBePredictedData):
    '''
    Passes the to-be-predicted dataset through the NLP pipeline (CountVectorizer, TfidfTransformer).
    Predicts and returns the labels for the input data in the form of a DataFrame.

    Parameters :
    cv : Fitted CountVectorizer model
    tfidf : Fitted TfidfTransformer model
    clf : Trained classifier model
    df_toBePredictedData (DataFrame) : To Be Predicted Data (unlabelled data)

    Returns :
    df_toBePredictedData (DataFrame) : Updated To Be Predicted Data, including the prediction probabilities for the different labels
    '''
    
    predictData = np.array(df_toBePredictedData.loc[:,['req_1','req_2']])
    #logs.writeLog(str(df_toBePredictedData))
    
    predict_counts = cv.transform(predictData)
    predict_tfidf = tfidf.transform(predict_counts)
    predict_labels = clf.predict(predict_tfidf)
    predict_prob = clf.predict_proba(predict_tfidf)
    
    logs.writeLog ("\nTotal Labels Predicted : "+ str(len(predict_labels)))

    df_toBePredictedData['predictedProb'] = predict_prob.tolist() 
    df_toBePredictedData['maxProb'] = np.amax(predict_prob,axis=1)
    
    return df_toBePredictedData    
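#A minimal standalone sketch (toy values, not from the original module) of how a predict_proba
#matrix maps onto the 'predictedProb' and 'maxProb' columns appended above.
import numpy as np
import pandas as pd

toy_prob = np.array([[0.8, 0.2],     #class probabilities for the first combination
                     [0.4, 0.6]])    #class probabilities for the second combination
df_toy = pd.DataFrame({'req_1': ['a b', 'c d'], 'req_2': ['e f', 'g h']})
df_toy['predictedProb'] = toy_prob.tolist()      #full probability list per row
df_toy['maxProb'] = np.amax(toy_prob, axis=1)    #0.8 and 0.6
print(df_toy[['predictedProb', 'maxProb']])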
def validateClassifier(cv,tfidf,clf_model,df_validationSet):
    '''
    Passes the validation dataset (unseen data) through the NLP pipeline (CountVectorizer, TfidfTransformer).
    Calculates the accuracy and other metrics to evaluate the performance of the model on the validation set (unseen data).

    Parameters :
    cv : Fitted CountVectorizer model
    tfidf : Fitted TfidfTransformer model
    clf_model : Trained classifier model
    df_validationSet (DataFrame) : Validation Data (unseen data)

    Returns :
    clf_val_score, f1, precision, recall (float) : Accuracy, F1 score, precision and recall on the validation data
    '''
    
    predictData = np.array(df_validationSet.loc[:,['req_1','req_2']])
    actualLabels = np.array(df_validationSet.loc[:,'Label']).astype('int')
    predict_counts = cv.transform(predictData)
    predict_tfidf = tfidf.transform(predict_counts)
    
    predict_labels = clf_model.predict(predict_tfidf)
    clf_val_score = clf_model.score(predict_tfidf,actualLabels)

    f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
    precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
    recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
    
    labelClasses = list(set(actualLabels))   #np.array(y_train).astype('int')
    logs.writeLog ("\n\nClassification Report On Validation Set: \n\n"+str(classification_report(actualLabels,predict_labels)))
    cm = confusion_matrix(actualLabels,predict_labels,labels=labelClasses)    
    logs.writeLog ("\n\nConfusion Matrix : \n"+str(cm)+"\n")
    
    return clf_val_score,f1,precision,recall
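#A minimal standalone sketch (toy labels) of the macro-averaged metrics and confusion matrix
#reported above, using only sklearn.metrics.
import numpy as np
from sklearn.metrics import (f1_score, precision_score, recall_score,
                             classification_report, confusion_matrix)

actualLabels = np.array([1, 0, 1, 1, 0])
predict_labels = np.array([1, 0, 0, 1, 0])
print(round(f1_score(actualLabels, predict_labels, average='macro'), 2))
print(round(precision_score(actualLabels, predict_labels, average='macro'), 2))
print(round(recall_score(actualLabels, predict_labels, average='macro'), 2))
print(classification_report(actualLabels, predict_labels))
print(confusion_matrix(actualLabels, predict_labels, labels=list(set(actualLabels))))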
def main():
    #Ignore Future warnings if any occur.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    pd.set_option('display.max_columns',
                  500)  #To make sure all the columns are visible in the logs.
    pd.set_option('display.width', 1000)

    #initialize directory which contains all the data and which will contain logs and outputs
    currentFileDir = os.getcwd()

    #Reads run time arguments
    args = logs.getArguments(currentFileDir + "/ALParams.txt")
    comments = args.loc[0, 'comments']

    #Creates Logs folder structure
    logFilePath, OFilePath = logs.createLogs(currentFileDir + "/Logs", args)

    df_rqmtComb, df_Analysis = learnTargetLabel(args)

    #Adds the Analysis DataFrame to Output File
    logs.addOutputToExcel(df_Analysis,
                          "\nAnalysis of  Label Classification  : \n")

    #logs.updateResults(df_rqmtComb,args)   #Update Results in excel....

    logs.writeLog("\nOutput Analysis is available at : " + str(OFilePath))
    logs.writeLog("\nLogs are available at : " + str(logFilePath))
def leastConfidenceSampling(df_uncertain):
    df_uncertain['lconf'] = 1 - df_uncertain['maxProb']
    df_uncertain = df_uncertain.sort_values(by=['lconf'], ascending=False)
    logs.writeLog("\n\nLeast Confidence Calculations..." +
                  str(len(df_uncertain)) + " Rows\n" + str(df_uncertain[:10]))
    #logs.writeLog(str(df.index.values[0]))
    sampleIndex = df_uncertain.index.values[0]
    return sampleIndex
def entropySampling(df_uncertain):
    df_uncertain['entropy'] = [
        entropy(x) for x in df_uncertain['predictedProb']
    ]
    #logs.writeLog(str(df))
    df_uncertain = df_uncertain.sort_values(by=['entropy'], ascending=False)
    logs.writeLog("\n\nEntropy Calculations..." + str(len(df_uncertain)) +
                  " Rows\n" + str(df_uncertain[:10]))
    #logs.writeLog(str(df.index.values[0]))
    sampleIndex = df_uncertain.index.values[0]
    return sampleIndex
def minMarginSampling(df_uncertain):
    df_uncertain['Margin'] = [
        max(x) - min(x) for x in df_uncertain['predictedProb']
    ]
    #logs.writeLog(str(df))
    df_uncertain = df_uncertain.sort_values(by=['Margin'], ascending=True)
    logs.writeLog("\n\nMin Margin Calculations..." + str(len(df_uncertain)) +
                  " Rows\n" + str(df_uncertain[:10]))
    #logs.writeLog(str(df.index.values[0]))
    sampleIndex = df_uncertain.index.values[0]
    return sampleIndex
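#A minimal standalone sketch (toy probabilities) comparing the three uncertainty scores used by
#the sampling functions above: least confidence (1 - maxProb), entropy, and the max-minus-min
#margin. Each strategy returns the index of the most uncertain row.
import pandas as pd
from scipy.stats import entropy

probs = [[0.90, 0.05, 0.05], [0.40, 0.35, 0.25], [0.55, 0.30, 0.15]]
df_toy = pd.DataFrame({'predictedProb': probs})
df_toy['maxProb'] = [max(p) for p in probs]
df_toy['lconf'] = 1 - df_toy['maxProb']                 #higher value = less confident
df_toy['entropy'] = [entropy(p) for p in probs]         #higher value = more uncertain
df_toy['Margin'] = [max(p) - min(p) for p in probs]     #smaller value = more uncertain
print(df_toy.sort_values(by='lconf', ascending=False).index.values[0])    #least confidence pick
print(df_toy.sort_values(by='entropy', ascending=False).index.values[0])  #entropy pick
print(df_toy.sort_values(by='Margin', ascending=True).index.values[0])    #min margin pick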
Example #7
def predictLabels(cv, tfidf, clf, df_toBePredictedData, targetLabel):
    '''
    The Count Vectorizer (cv) applies Bag of Words to the features, then the TF-IDF transformation is applied.
    Predicts and returns the labels for the input data in the form of a DataFrame.
    '''
    #predictData = np.array(df_toBePredictedData.loc[:,['req1','req2','BLabelled','MLabelled']])
    df_toBePredictedData['req'] = df_toBePredictedData[
        'req1'] + " " + df_toBePredictedData['req2']
    predictData = np.array(df_toBePredictedData.loc[:, 'req'])
    #logs.writeLog(str(df_toBePredictedData))

    predict_counts = cv.transform(predictData)
    predict_tfidf = tfidf.transform(predict_counts)
    predict_labels = clf.predict(predict_tfidf)
    predict_prob = clf.predict_proba(predict_tfidf)
    #predict_classes = clf.classes_
    #logs.writeLog(str(predict_classes))
    #logs.writeLog (str(type(predict_prob)))
    #logs.writeLog (str(predict_prob.shape))
    #clf_pred_score = round(np.mean(predict_labels == actualLabels),2)

    logs.writeLog("\nTotal Labels Predicted : " + str(len(predict_labels)))

    #f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
    #precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
    #recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)

    #print ("\nClassification Report : \n",classification_report(actualLabels,predict_labels))
    #cm = confusion_matrix(actualLabels,predict_labels,labels=[0,1])
    #print ("\nConfusion Matrix : \n",cm)
    #tn,fp,fn,tp = cm.ravel()
    #acc = round((tn+tp)/(tn+fp+fn+tp),2)
    #print ("\nAccuracy : ",acc)

    if targetLabel == 'BinaryClass':
        #Save the prediction results.... predictedProb saves the prediction probabilities in a list form for each prediction.
        #df_predictionResults = pd.DataFrame({'req1_id':df_toBePredictedData.loc[:,'req1_id'],'req1':df_toBePredictedData.loc[:],'req2':predictData[:,1],'BinaryClass':predict_labels[:],'MultiClass':0,'predictedProb':predict_prob.tolist(),'BLabelled':predictData[:,2],'MLabelled':predictData[:,3]})  #added 0 as dummy value to MultiClass because we are predicting BinaryClass
        df_toBePredictedData['BinaryClass'] = predict_labels[:]
        df_toBePredictedData['MultiClass'] = 0
        df_toBePredictedData['predictedProb'] = predict_prob.tolist()
        df_toBePredictedData['maxProb'] = np.amax(predict_prob, axis=1)
    else:
        #Save the prediction results.... predictedProb saves the prediction probabilities in a list form for each prediction.
        #df_predictionResults = pd.DataFrame({'req1':predictData[:,0],'req2':predictData[:,1],'BinaryClass':1,'MultiClass':predict_labels[:],'predictedProb':predict_prob.tolist(),'BLabelled':predictData[:,2],'MLabelled':predictData[:,3]})  #added 1 as Binary Class because we do MultiClass prediction only for dependent combinations.
        df_toBePredictedData['BinaryClass'] = 1
        df_toBePredictedData['MultiClass'] = predict_labels[:]
        df_toBePredictedData['predictedProb'] = predict_prob.tolist()
        df_toBePredictedData['maxProb'] = np.amax(predict_prob, axis=1)

    return df_toBePredictedData  #f1,precision,recall,clf_pred_score,acc
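#A minimal standalone sketch (toy data) of the inputs this variant expects: a DataFrame with
#'req1'/'req2' columns and a cv/tfidf/clf pipeline fitted on the concatenated requirement text,
#mirroring what createClassifier (further below) produces. All names and texts here are invented.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

df_train = pd.DataFrame({'req1': ['system shall log events', 'user shall reset password'],
                         'req2': ['events shall be stored', 'password shall expire weekly'],
                         'BinaryClass': [1, 0]})
train_text = df_train['req1'] + " " + df_train['req2']
cv = CountVectorizer(lowercase=False, stop_words='english')
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(cv.fit_transform(np.array(train_text)))
clf = RandomForestClassifier().fit(X_train, df_train['BinaryClass'])

df_new = pd.DataFrame({'req1': ['system shall log errors'],
                       'req2': ['errors shall be stored']})
new_text = df_new['req1'] + " " + df_new['req2']
predict_prob = clf.predict_proba(tfidf.transform(cv.transform(np.array(new_text))))
print(predict_prob, np.amax(predict_prob, axis=1))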
Example #8
def minMarginSampling(df_uncertain):

    #Sort each prediction probability list in descending order.
    df_uncertain['sorted'] = df_uncertain['predictedProb'].apply(
        lambda x: sorted(x, reverse=True))
    df_uncertain['first'] = [x[0] for x in df_uncertain['sorted']]
    df_uncertain['second'] = [x[1] for x in df_uncertain['sorted']]
    df_uncertain['Margin'] = df_uncertain['first'] - df_uncertain['second']

    df_uncertain = df_uncertain.sort_values(by=['Margin'], ascending=True)
    logs.writeLog("\n\nMin Margin Calculations..." + str(len(df_uncertain)) +
                  " Rows\n" + str(df_uncertain[:10]))
    #logs.writeLog(str(df.index.values[0]))
    sampleIndex = df_uncertain.index.values[0]
    return sampleIndex
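#A small standalone sketch (toy probabilities) of the top-two margin computed above; with more
#than two classes it differs from the max-minus-min margin used in the earlier variant.
probs = [0.5, 0.375, 0.125]
top_two = sorted(probs, reverse=True)
print(top_two[0] - top_two[1])    #0.125 : margin between the two most likely classes
print(max(probs) - min(probs))    #0.375 : max-minus-min variant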
Example #9
def main():
    warnings.simplefilter(
        action='ignore',
        category=FutureWarning)  #Ignore Future warnings if any occur.

    #initialize directory which contains all the data and which will contain logs and outputs
    currentFileDir = os.getcwd()
    args = logs.getArguments(currentFileDir + "/ALParams.txt")

    #args=get_args()  #Get all the command line arguments
    #options = vars(args)  #Stores the arguments as dictionary ; used in logs
    ifileName = args.loc[0, 'input']
    #clf = args.classifier
    comments = args.loc[0, 'comments']
    dependencyTypeNeeded = args.loc[0, 'dependencyTypeNeeded']

    logFilePath, OFilePath = logs.createLogs(
        currentFileDir + "/static/data/Logs", args, comments
    )  #Creates the log file, default value is os.getcwd()+"/static/data/logs/" ; user still can provide his own logPath if needed.

    df_rqmtData = getData(currentFileDir + "/static/data/" + ifileName)
    logs.writeLog("\n\nData Fetched from the input file : " +
                  str(len(df_rqmtData)) + " Rows \n" + str(df_rqmtData[:10]))

    logs.writeLog("\n\nStep 1 :- Learning BinaryClass Label\n")
    df_rqmtComb, df_BinaryAnalysis, thresholdConf = learnTargetLabel(
        args, df_rqmtData, 'BinaryClass')

    logs.addOutputToExcel(
        df_BinaryAnalysis,
        "\nAnalysis of BinaryClass Label Classification (Threshold Prob " +
        str(thresholdConf) + ") : \n")
    input("Hit Enter to Proceed....")

    if dependencyTypeNeeded == 'y':
        df_rqmtComb.drop(columns=['req'], inplace=True)
        logs.writeLog("\n\nStep 2 :- Learning MultiClass Label\n")
        df_rqmtCombDependent = df_rqmtComb[
            df_rqmtComb['BinaryClass'] ==
            1]  #Filtering only the Dependent Requirement Combinations for low level classification
        df_rqmtCombInDependent = df_rqmtComb[df_rqmtComb['BinaryClass'] == 0]

        df_rqmtCombDependent['MLabelled'] = df_rqmtCombDependent[
            'MLabelled'].replace(
                ' ', 'A'
            )  #Mark the intelligently annotated combinations as unlabelled. (But what about the manual annotations done?)
        logs.writeLog(
            "Following is the data set to be used for MultiClass classification : "
            + str(len(df_rqmtCombDependent)) + " Rows\n" +
            str(df_rqmtCombDependent[:10]))

        if len(df_rqmtCombDependent[df_rqmtCombDependent['MLabelled'] ==
                                    'A']) > 0:
            df_rqmtCombUpdated, df_MultiAnalysis, thresholdConf = learnTargetLabel(
                args, df_rqmtCombDependent, 'MultiClass')
            logs.addOutputToExcel(
                df_MultiAnalysis,
                "Analysis of MultiClass Label Classification (Threshold Prob "
                + str(thresholdConf) + ") : \n")
            df_rqmtComb = pd.concat(
                [df_rqmtCombUpdated, df_rqmtCombInDependent], axis=0)
            #logs.updateResults(df_rqmtResults,args)   #Update Results in excel....
        else:
            logs.writeLog(
                "\n\nThere are no dependent combinations, so it is not possible to determine the dependency types."
            )

    logs.updateResults(df_rqmtComb, args)  #Update Results in excel....

    logs.writeLog("\nOutput Analysis is available at : " + str(OFilePath))
    logs.writeLog("\nLogs are available at : " + str(logFilePath))
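#A hypothetical sketch of the one-row args DataFrame that logs.getArguments is expected to return
#from ALParams.txt. The column names are taken from the reads scattered through these examples;
#the values and the file format are assumptions for illustration only.
import pandas as pd

args_sketch = pd.DataFrame([{
    'input': 'RequirementCombinations.csv',    #hypothetical input file name (ifileName)
    'comments': 'demo run',
    'dependencyTypeNeeded': 'y',                #'y' triggers the MultiClass step
    'classifier': 'RF',                         #RF / NB / SVM (see createClassifier)
    'testsize': '0.2',
    'initManualAnnotAvail': 'n',
    'trainingCount': '10',
    'samplingType': 'leastConfidence',          #leastConfidence / minMargin / entropy
    'manualAnnotationsCount': '5'
}])
print(args_sketch.loc[0, 'classifier'])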
Example #10
def learnTargetLabel(args, df_rqmts, targetLabel):
    #Based on scenario, user can provide the initial manual annotations in the input file and mark them as 'M' in Labelled Column
    #If the user decides to provide the manual annotations on the go. Then inorder to provide the initial Manual Annotations which will form the training set,
    #User can provide the trainingCount number of annotations and the labelled data will be marked as 'M' in Labelled column.

    if targetLabel == "BinaryClass":
        labelColumn = "BLabelled"
    else:
        labelColumn = "MLabelled"

    #LabelledCombinations (Manually Annotated Set)
    df_manuallyAnnotatedSet = shuffle(df_rqmts[df_rqmts[labelColumn] == 'M'])

    #Dump the labelled/Manually annotated data combinations in results.csv file
    #logs.createAnnotationsFile(df_manuallyAnnotatedSet)

    if args.loc[0, 'initManualAnnotAvail'].lower(
    ) == 'n':  #Create Initial Training Set only if the ManualAnnotationAvailabilityFlag is 'n'
        df_rqmts = createInitialTrainingSet(
            df_rqmts, int(args.loc[0, 'trainingCount']), targetLabel
        )  #User provides manual annotations for trainingCount(default=10) requirement combinations.
        logs.writeLog(
            "Combined Data Set after doing the initial Manual Annotations : \n"
            + str(df_rqmts))

    ###############################################################################################################################

    df_manuallyAnnotatedSet = shuffle(df_rqmts[df_rqmts[labelColumn] == 'M'])
    #Splitting the initially manually annotated data points into train and validation sets.
    validationSetCount = int(len(df_manuallyAnnotatedSet) *
                             .2)  #retain 20% for testing (unseen data)
    df_validationSet = df_manuallyAnnotatedSet[:validationSetCount]
    logs.writeLog("\nSeparating Validation Set : " +
                  str(len(df_validationSet)) + " Rows\n" +
                  str(df_validationSet[:10]))

    df_validationSet.reset_index(drop=True, inplace=True)
    df_rqmts = df_rqmts[~df_rqmts.isin(df_validationSet)].dropna(
    )  #Remove the selected combinations which formed the Validation Set.

    ###############################################################################################################################

    #Initial Analysis of the data available.
    iteration = 0

    manuallyAnnotatedCount = len(df_rqmts[df_rqmts[labelColumn] == 'M'])
    intelligentlyAnnotatedCount = len(df_rqmts[df_rqmts[labelColumn] == 'I'])
    toBeAnnotatedCount = len(df_rqmts[df_rqmts[labelColumn] == 'A'])

    if targetLabel == 'BinaryClass':
        #Create a dataframe to store the results after each iteration of active Learning.
        df_resultTracker = pd.DataFrame(columns=[
            'Iteration', 'ManuallyAnnotated', 'IntelligentlyAnnotated',
            'ToBeAnnotated', 'TrainingSize', 'TestSize', 'ValidationSize',
            'ClassifierTestScore', 'ClassifierValidationScore',
            'DependentCount', 'IndependentCount', 'f1Score', 'precisionScore',
            'recallScore'
        ])

        #Number of combinations which have been manually or intelligently labelled as dependent or independent
        dependentCount = len(
            df_rqmts[(df_rqmts['BinaryClass'].isin(['1.0', '1']))
                     & (df_rqmts[labelColumn].isin(['M', 'I']))])
        independentCount = len(
            df_rqmts[(df_rqmts['BinaryClass'].isin(['0.0', '0']))
                     & (df_rqmts[labelColumn].isin(['M', 'I']))])

        #Add the initial analysis to the analysis dataFrame created.
        df_resultTracker = df_resultTracker.append(
            {
                'Iteration': iteration,
                'ManuallyAnnotated': manuallyAnnotatedCount,
                'IntelligentlyAnnotated': intelligentlyAnnotatedCount,
                'ToBeAnnotated': toBeAnnotatedCount,
                'TrainingSize': '-',
                'TestSize': '-',
                'ValidationSize': validationSetCount,
                'ClassifierTestScore': '-',
                'ClassifierValidationScore': '-',
                'DependentCount': dependentCount,
                'IndependentCount': independentCount,
                'f1Score': '-',
                'precisionScore': '-',
                'recallScore': '-'
            },
            ignore_index=True)

    else:
        #Create a dataframe to store the results after each iteration of active Learning...  #Added ORCount,ANDCount etc columns to keep a track of different dependency types
        #df_resultTracker = pd.DataFrame(columns=['Iteration','ManuallyAnnotated','IntelligentlyAnnotated','ToBeAnnotated','TrainingSize','TestSize','ValidationSize','ClassifierTestScore','ClassifierValidationScore','AndCount','ORCount','RequiresCount','SimilarCount','CannotSayCount','f1Score','precisionScore','recallScore'])
        #df_resultTracker = pd.DataFrame(columns=['Iteration','ManuallyAnnotated','IntelligentlyAnnotated','ToBeAnnotated','TrainingSize','TestSize','ValidationSize','ClassifierTestScore','ClassifierValidationScore','RequiresCount','SimilarCount','OtherCount','f1Score','precisionScore','recallScore'])
        df_resultTracker = pd.DataFrame(columns=[
            'Iteration', 'ManuallyAnnotated', 'IntelligentlyAnnotated',
            'ToBeAnnotated', 'TrainingSize', 'TestSize', 'ValidationSize',
            'ClassifierTestScore', 'ClassifierValidationScore',
            'RequiresCount', 'RefinesCount', 'ConflictsCount', 'f1Score',
            'precisionScore', 'recallScore'
        ])

        df_rqmts['MultiClass'].replace(to_replace=" ", value="", inplace=True)

        #Number of combinations which have been manually or intelligently labelled for different dependency types
        #andCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==1) & (df_rqmts[labelColumn].isin(['M','I']))])
        #orCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==2) & (df_rqmts[labelColumn].isin(['M','I']))])
        #requiresCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==3) & (df_rqmts[labelColumn].isin(['M','I']))])
        #similarCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==4) & (df_rqmts[labelColumn].isin(['M','I']))])
        #cannotSayCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==5) & (df_rqmts[labelColumn].isin(['M','I']))])
        #otherCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==6) & (df_rqmts[labelColumn].isin(['M','I']))])

        requiresCount = len(
            df_rqmts[(df_rqmts['MultiClass'].isin(['1.0', '1']))
                     & (df_rqmts[labelColumn].isin(['M', 'I']))])
        refinesCount = len(
            df_rqmts[(df_rqmts['MultiClass'].isin(['2.0', '2']))
                     & (df_rqmts[labelColumn].isin(['M', 'I']))])
        conflictsCount = len(
            df_rqmts[(df_rqmts['MultiClass'].isin(['3.0', '3']))
                     & (df_rqmts[labelColumn].isin(['M', 'I']))])

        #Add the initial analysis to the analysis dataFrame created.
        #df_resultTracker = df_resultTracker.append({'Iteration':iteration,'ManuallyAnnotated':manuallyAnnotatedCount,'IntelligentlyAnnotated':intelligentlyAnnotatedCount,'ToBeAnnotated':toBeAnnotatedCount,'TrainingSize':'-','TestSize':'-','ValidationSize':validationSetCount,'ClassifierTestScore':'-','ClassifierValidationScore':'-','AndCount':andCount,'ORCount':orCount,'RequiresCount':requiresCount,'SimilarCount':similarCount,'CannotSayCount':cannotSayCount,'f1Score':'-','precisionScore':'-','recallScore':'-'},ignore_index=True)
        #df_resultTracker = df_resultTracker.append({'Iteration':iteration,'ManuallyAnnotated':manuallyAnnotatedCount,'IntelligentlyAnnotated':intelligentlyAnnotatedCount,'ToBeAnnotated':toBeAnnotatedCount,'TrainingSize':'-','TestSize':'-','ValidationSize':validationSetCount,'ClassifierTestScore':'-','ClassifierValidationScore':'-','RequiresCount':requiresCount,'SimilarCount':similarCount,'OtherCount':otherCount,'f1Score':'-','precisionScore':'-','recallScore':'-'},ignore_index=True)
        df_resultTracker = df_resultTracker.append(
            {
                'Iteration': iteration,
                'ManuallyAnnotated': manuallyAnnotatedCount,
                'IntelligentlyAnnotated': intelligentlyAnnotatedCount,
                'ToBeAnnotated': toBeAnnotatedCount,
                'TrainingSize': '-',
                'TestSize': '-',
                'ValidationSize': validationSetCount,
                'ClassifierTestScore': '-',
                'ClassifierValidationScore': '-',
                'RequiresCount': requiresCount,
                'RefinesCount': refinesCount,
                'ConflictsCount': conflictsCount,
                'f1Score': '-',
                'precisionScore': '-',
                'recallScore': '-'
            },
            ignore_index=True)

    logs.writeLog("\n\nInitial Data Analysis : \n" + str(df_resultTracker) +
                  "\n")

    confidence = 0  #initial value of confidence; user will be providing value after looking for the probability distribution.

    while True:

        iteration += 1

        logs.writeLog("\n\nIteration : " + str(iteration) + "\n")
        logs.writeLog("\nSplitting Labelled Data and Unlabelled Data\n")

        df_labelledData = df_rqmts[df_rqmts[labelColumn].isin(
            ['M', 'I'])]  #Training Data
        logs.writeLog('\nLabelled Data : ' + str(len(df_labelledData)) +
                      ' Rows \n' + str(df_labelledData[:10]) + "\n")

        df_unlabelledData = df_rqmts[df_rqmts[labelColumn] == 'A']  #Test Data
        logs.writeLog('\nUnlabelled Data : ' + str(len(df_unlabelledData)) +
                      ' Rows \n' + str(df_unlabelledData[:10]) + "\n")

        if len(df_labelledData) == 0:
            err = "There are no labelled data points to train the classifier. Either manually annotate them in the input file or set the initManualAnnotAvail flag to 'y' in the arguments."
            logs.writeLog("Error! : " + str(err))
            raise Exception(err)

        if len(
                df_unlabelledData
        ) == 0:  #If there are no more ToBeAnnotated Combinations then Exit..
            logs.writeLog("There are no more Unlabelled Data Points....")
            df_rqmts = pd.concat([df_rqmts, df_validationSet],
                                 axis=0,
                                 ignore_index=True)
            return df_rqmts, df_resultTracker, confidence

        if iteration >= 11:  #After 10 iterations, ask user if he/she wants to continue active learner....
            while True:
                logs.writeLog(
                    "Exceeded the iteration limit. Still there are " +
                    str(len(df_unlabelledData)) +
                    " combinations to be Labelled. Do you wish to continue Annotating? Enter 'y'/'n'"
                )
                userInput = input()
                logs.writeLog("User's input : " + str(userInput))
                if userInput.lower() == 'n':
                    logs.writeLog("Stopping Condition Reached...")
                    df_rqmts = pd.concat([df_rqmts, df_validationSet],
                                         axis=0,
                                         ignore_index=True)
                    return df_rqmts, df_resultTracker, confidence
                elif userInput.lower() == 'y':
                    logs.writeLog("Continuing with Iteration " +
                                  str(iteration))
                    break
                else:
                    logs.writeLog("Invalid Input. Allowed Values -- y / n")
                    continue

        logs.writeLog("\n" + 100 * "-")
        logs.writeLog("\nCreating Classifier....")
        countVectorizer, tfidfTransformer, classifier, classifierTestScore, trainSize, testSize, f1Score, precisionScore, recallScore = createClassifier(
            args.loc[0, 'classifier'], float(args.loc[0, 'testsize']),
            df_labelledData, targetLabel)

        ############################################################################################################################
        logs.writeLog("\n\nValidating Classifier...")
        classifierValidationScore = validateClassifier(countVectorizer,
                                                       tfidfTransformer,
                                                       classifier,
                                                       df_validationSet,
                                                       targetLabel)
        logs.writeLog("\n\nClassifier Validation Set Score : " +
                      str(classifierValidationScore))
        ############################################################################################################################
        logs.writeLog("\n" + 100 * "-")

        input("\n\nHit Enter to Proceed....")

        logs.writeLog("\n\nPredicting Labels....")
        df_predictionResults = predictLabels(countVectorizer, tfidfTransformer,
                                             classifier, df_unlabelledData,
                                             targetLabel)
        logs.writeLog('\nPrediction Results :  ' +
                      str(len(df_predictionResults)) + " Rows \n" +
                      str(df_predictionResults[:10]))

        df_finalPredictions, confidence = analyzePredictions(
            args, df_predictionResults, targetLabel, confidence)
        logs.writeLog("\n\nFinal Predictions : " +
                      str(len(df_finalPredictions)) + " Rows \n" +
                      str(df_finalPredictions[:10]))

        df_updatedDatabase = pd.concat([df_labelledData, df_finalPredictions],
                                       axis=0,
                                       ignore_index=True)
        logs.writeLog("\n\nUpdated Database : " +
                      str(len(df_updatedDatabase)) + " Rows \n" +
                      str(df_updatedDatabase[:10]))
        df_rqmts = df_updatedDatabase

        #Update the Results and add them to Result Tracker

        if targetLabel == 'BinaryClass':
            manuallyAnnotatedCount = len(
                df_rqmts[df_rqmts['BLabelled'] == 'M'])
            intelligentlyAnnotatedCount = len(
                df_rqmts[df_rqmts['BLabelled'] == 'I'])
            toBeAnnotatedCount = len(df_rqmts[df_rqmts['BLabelled'] == 'A'])

            dependentCount = len(
                df_rqmts[(df_rqmts['BinaryClass'].astype('int') == 1)
                         & (df_rqmts['BLabelled'].isin(['M', 'I']))])
            independentCount = len(
                df_rqmts[(df_rqmts['BinaryClass'].astype('int') == 0)
                         & (df_rqmts['BLabelled'].isin(['M', 'I']))])

            df_resultTracker = df_resultTracker.append(
                {
                    'Iteration': iteration,
                    'ManuallyAnnotated': manuallyAnnotatedCount,
                    'IntelligentlyAnnotated': intelligentlyAnnotatedCount,
                    'ToBeAnnotated': toBeAnnotatedCount,
                    'TrainingSize': trainSize,
                    'TestSize': testSize,
                    'ValidationSize': validationSetCount,
                    'ClassifierTestScore': classifierTestScore,
                    'ClassifierValidationScore': classifierValidationScore,
                    'DependentCount': dependentCount,
                    'IndependentCount': independentCount,
                    'f1Score': f1Score,
                    'precisionScore': precisionScore,
                    'recallScore': recallScore
                },
                ignore_index=True)
            logs.writeLog("\n\nAnalysis DataFrame : \n" +
                          str(df_resultTracker))

        else:
            manuallyAnnotatedCount = len(
                df_rqmts[df_rqmts['MLabelled'] == 'M'])
            intelligentlyAnnotatedCount = len(
                df_rqmts[df_rqmts['MLabelled'] == 'I'])
            toBeAnnotatedCount = len(df_rqmts[df_rqmts['MLabelled'] == 'A'])

            #andCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==1) & (df_rqmts['MLabelled'].isin(['M','I']))])
            #orCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==2) & (df_rqmts['MLabelled'].isin(['M','I']))])
            #requiresCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==3) & (df_rqmts['MLabelled'].isin(['M','I']))])
            #similarCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==4) & (df_rqmts['MLabelled'].isin(['M','I']))])
            #cannotSayCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==5) & (df_rqmts['MLabelled'].isin(['M','I']))])
            #otherCount = len(df_rqmts[(df_rqmts['MultiClass'].astype('int')==6) & (df_rqmts['MLabelled'].isin(['M','I']))])

            requiresCount = len(
                df_rqmts[(df_rqmts['MultiClass'].astype('int') == 1)
                         & (df_rqmts[labelColumn].isin(['M', 'I']))])
            refinesCount = len(
                df_rqmts[(df_rqmts['MultiClass'].astype('int') == 2)
                         & (df_rqmts[labelColumn].isin(['M', 'I']))])
            conflictsCount = len(
                df_rqmts[(df_rqmts['MultiClass'].astype('int') == 3)
                         & (df_rqmts[labelColumn].isin(['M', 'I']))])

            df_resultTracker = df_resultTracker.append(
                {
                    'Iteration': iteration,
                    'ManuallyAnnotated': manuallyAnnotatedCount,
                    'IntelligentlyAnnotated': intelligentlyAnnotatedCount,
                    'ToBeAnnotated': toBeAnnotatedCount,
                    'TrainingSize': trainSize,
                    'TestSize': testSize,
                    'ValidationSize': validationSetCount,
                    'ClassifierTestScore': classifierTestScore,
                    'ClassifierValidationScore': classifierValidationScore,
                    'RequiresCount': requiresCount,
                    'RefinesCount': refinesCount,
                    'ConflictsCount': conflictsCount,
                    'f1Score': f1Score,
                    'precisionScore': precisionScore,
                    'recallScore': recallScore
                },
                ignore_index=True)
            #df_resultTracker = df_resultTracker.append({'Iteration':iteration,'ManuallyAnnotated':manuallyAnnotatedCount,'IntelligentlyAnnotated':intelligentlyAnnotatedCount,'ToBeAnnotated':toBeAnnotatedCount,'TrainingSize':trainSize,'TestSize':testSize,'ValidationSize':validationSetCount,'ClassifierTestScore':classifierTestScore,'ClassifierValidationScore':classifierValidationScore,'RequiresCount':requiresCount,'SimilarCount':similarCount,'OtherCount':otherCount,'f1Score':f1Score,'precisionScore':precisionScore,'recallScore':recallScore},ignore_index=True)

            logs.writeLog("\n\nAnalysis DataFrame : \n" +
                          str(df_resultTracker))

    #Merge the Validation Set back into the prediction set so that all the original combinations are returned.
    df_rqmts = pd.concat([df_rqmts, df_validationSet],
                         axis=0,
                         ignore_index=True)
    return df_rqmts, df_resultTracker, confidence
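#A hypothetical sketch of the requirement-combination DataFrame this function operates on. The
#column names are taken from the code above; the rows and values are invented for illustration.
import pandas as pd

df_rqmts_sketch = pd.DataFrame([
    {'req1_id': 'R1', 'req1': 'system shall log events',
     'req2_id': 'R2', 'req2': 'events shall be stored',
     'BinaryClass': 1, 'MultiClass': 1, 'BLabelled': 'M', 'MLabelled': 'A'},
    {'req1_id': 'R1', 'req1': 'system shall log events',
     'req2_id': 'R3', 'req2': 'password shall expire weekly',
     'BinaryClass': 0, 'MultiClass': 0, 'BLabelled': 'A', 'MLabelled': 'A'},
])
print(df_rqmts_sketch)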
Example #11
def createInitialTrainingSet(df_data, count, label):
    '''
    Randomly selects requirement combinations, gets them annotated by the user, and marks them as Labelled - 'M'.
    Returns the updated dataset.
    '''
    i = 0  #Counter Variable
    logs.writeLog(
        "\nPlease annotate the following requirement combinations...(For Training Set)"
    )

    while i < count:
        logs.writeLog("\nCombination " + str(i + 1) + " .....")

        #The requirement combination is updated and added to the dataFrame
        if label == 'BinaryClass':
            selection = df_data[df_data['BLabelled'] == 'A'].sample(
                1
            )  #Samples one requirement combination which is To Be annotated ie. Marked as 'A'
            selectedIndex = selection.index.values[0]
            selection.reset_index(inplace=True)

            #Get the requirements
            req1_id = selection.loc[0, 'req1_id']
            req2_id = selection.loc[0, 'req2_id']
            req1 = selection.loc[0, 'req1']
            req2 = selection.loc[0, 'req2']

            #logs.writeLog("Removed Index : "+str(selectedIndex))
            df_data.drop(
                index=selectedIndex, inplace=True
            )  #Drops the particular requirement combination from the original DataFrame

            df_userAnnot = pd.DataFrame(columns=df_data.columns)
            userAnnot = getManualAnnotation(
                req1_id, req2_id, req1, req2, label
            )  #User provides the annotation for the requirement combination

            if userAnnot == "exit":
                raise Exception('\nExited the Program successfully!')

            df_userAnnot = df_userAnnot.append(
                {
                    'req1_id': req1_id,
                    'req1': req1,
                    'req2_id': req2_id,
                    'req2': req2,
                    'BinaryClass': userAnnot,
                    'MultiClass': 0,
                    'BLabelled': 'M',
                    'MLabelled': 'A'
                },
                ignore_index=True
            )  #Added MultiClass as 0 because when we are learning BinaryClass... MultiClass can contain a dummy value.
            logs.createAnnotationsFile(df_userAnnot)

            df_data = pd.concat(
                [df_data, df_userAnnot], axis=0
            )  #Manually Annotated Values are concatenated with the original dataset and the resultant is returned.

        else:
            print(df_data)
            selection = df_data[df_data['MLabelled'] == 'A'].sample(
                1
            )  #Samples one requirement combination which is To Be annotated ie. Marked as 'A'
            selectedIndex = selection.index.values[0]
            selection.reset_index(inplace=True)

            #Get the requirements
            req1_id = selection.loc[0, 'req1_id']
            req2_id = selection.loc[0, 'req2_id']
            req1 = selection.loc[0, 'req1']
            req2 = selection.loc[0, 'req2']

            #logs.writeLog("Removed Index : "+str(selectedIndex))
            df_data.drop(
                index=selectedIndex, inplace=True
            )  #Drops the particular requirement combination from the original DataFrame

            df_userAnnot = pd.DataFrame(columns=df_data.columns)
            userAnnot = getManualAnnotation(
                req1_id, req2_id, req1, req2, label
            )  #User provides the annotation for the requirement combination

            if userAnnot == "exit":
                raise Exception('\nExited the Program successfully!')

            df_userAnnot = df_userAnnot.append(
                {
                    'req1_id': req1_id,
                    'req1': req1,
                    'req2_id': req2_id,
                    'req2': req2,
                    'BinaryClass': 1,
                    'MultiClass': userAnnot,
                    'BLabelled': 'M',
                    'MLabelled': 'M'
                },
                ignore_index=True
            )  #Added BinaryClass as 1 because we are learning the MultiClass labels only for the dependent Combinations (for which BinaryClass is 1)
            logs.createAnnotationsFile(df_userAnnot)

            df_data = pd.concat(
                [df_data, df_userAnnot], axis=0
            )  #Manually Annotated Values are concatenated with the original dataset and the resultant is returned.

        i = i + 1

    logs.writeLog("Initial Manual Annotations completed.")

    return df_data
Example #12
def createClassifier(clf, splitratio, df_labelledData, targetLabel):
    '''
    Creates and returns the CountVectorizer, TfidfTransformer, classifier, classifier test score, the lengths of the train and test sets, and the F1, precision and recall scores.
    '''
    # NOTE: NOT DELETING THE COMMENTED CODE SNIPPET AS IT MIGHT BE NEEDED IN FUTURE...

    #Convert numpy array of the training dataset.
    #trainData = np.array(trainData)
    #X_train = trainData[:,:-2]  #Keep Features aka requirement details in X_train
    #logs.writeLog("X_train : "+str(X_train))
    #y_train = trainData[:,-2].astype('int') #Keep the labels aka BinaryClass in y_train; update the datatype as int.
    #logs.writeLog("y_train : "+str(y_train))
    #print ("Inside Create Classifier")
    #print ("df_labelledData length : ",str(len(df_labelledData)))
    logs.writeLog("\nPerforming Balancing of Data....\n")
    df_labelledData[targetLabel] = df_labelledData[targetLabel].astype(
        'int')  #Making sure the values are integer only and not float...
    #######################################DATA BALANCING########################################################
    #Create empty dataframes to store the Balanced Combinations .... Making sure an equal number of combinations corresponding to each label is available in the train and test sets.
    df_testSet = pd.DataFrame(columns=df_labelledData.columns)
    df_trainSet = pd.DataFrame(columns=df_labelledData.columns)

    stats = df_labelledData[targetLabel].value_counts(
    )  #Returns a Series with the count of each distinct TargetLabel value.
    #print("Stats : "+str(stats))
    min_value_count = stats.min(
    )  #Calculate minimum value count out of all labels.... will extract this number of combinations of each label type.
    #Calculate the Test Size and Train Size... number of combinations to be sampled for each LABEL type.
    test_size = int(min_value_count * splitratio) if (
        int(min_value_count * splitratio) >= 1
    ) else 1  #added if else condition in case test size is less than 1. then minimum size should be 1.
    train_size = min_value_count - test_size
    #For each type of label
    for key in stats.keys():
        #Sample out some values for Test Set
        df_sampleTest = df_labelledData[df_labelledData[targetLabel] ==
                                        key].sample(test_size)
        df_labelledData = df_labelledData[
            ~df_labelledData.isin(df_sampleTest)].dropna(
            )  #Remove Sampled Values from original data set.
        df_testSet = pd.concat([df_testSet, df_sampleTest],
                               axis=0)  #Add sampled values into the Test Set

        #Sample out some values for Train Set
        df_sampleTrain = df_labelledData[df_labelledData[targetLabel] ==
                                         key].sample(train_size)
        df_labelledData = df_labelledData[
            ~df_labelledData.isin(df_sampleTrain)].dropna(
            )  #Remove Sampled Values from original data set.
        df_trainSet = pd.concat([df_trainSet, df_sampleTrain],
                                axis=0)  #Add sampled values into the Train Set

    #Shuffle both Train and Test Set....
    df_trainSet = shuffle(df_trainSet)
    df_testSet = shuffle(df_testSet)

    #Split Train Test Sets into X_train,y_train,X_test,y_test   (Similar to Train Test Split....)
    #X_train = df_trainSet.loc[:,['req1','req2']] #Special characters are included in this as input is a list.
    #y_train = df_trainSet.loc[:,targetLabel]
    #X_test = df_testSet.loc[:,['req1','req2']]
    #y_test = df_testSet.loc[:,targetLabel]

    #Combining the requirement 1 and requirement 2 in a single column.
    df_trainSet['req'] = df_trainSet['req1'] + " " + df_trainSet['req2']
    df_testSet['req'] = df_testSet['req1'] + " " + df_testSet['req2']

    X_train = df_trainSet.loc[:, 'req']
    y_train = df_trainSet.loc[:, targetLabel]
    X_test = df_testSet.loc[:, 'req']
    y_test = df_testSet.loc[:, targetLabel]

    #############################################################################################################

    #Train / Test Split (80/20)
    #X_train,X_test,y_train,y_test = train_test_split(df_labelledData.loc[:,['req1','req2']],df_labelledData.loc[:,targetLabel],test_size=splitratio)
    #labelledData.iloc[:,:-2]  --> ['req1','req2]   labelledData.iloc[:,-2]  -->  ['BinaryClass' / 'MuliClass']

    logs.writeLog("\nTraining Set Size : " + str(len(X_train)))
    logs.writeLog("\nTrain Set Value Count : \n" +
                  str(df_trainSet[targetLabel].value_counts()))

    logs.writeLog("\nTest Set Size : " + str(len(X_test)))
    logs.writeLog("\nTest Set Value Count : \n" +
                  str(df_testSet[targetLabel].value_counts()))

    logs.writeLog("\n\nTraining Model.....")
    #Initialize Count Vectorizer which in a way performs Bag of Words on X_train
    #count_vect = CountVectorizer(tokenizer=lambda doc: doc, analyzer=split_into_lemmas, lowercase=False, stop_words='english')
    count_vect = CountVectorizer(lowercase=False, stop_words='english')
    X_train_counts = count_vect.fit_transform(np.array(X_train))

    #feature_names = count_vect.get_feature_names()  #--- Can be used for analysis if needed.
    #print ("\nFeature names : ", feature_names)
    #print (len(feature_names))
    #print ("\nBag Of Words :\n" ,repr(X_train_counts))
    #print (X_train_counts.toarray())
    #print (X_train_counts.toarray().shape)
    #input ("...............")

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    #print ("\nAfter TFIDF Transformation: \n",repr(X_train_tfidf))

    X_test_counts = count_vect.transform(np.array(X_test))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    #Random Forest Classifier Creation
    if clf == "RF":
        clf_model = RandomForestClassifier().fit(
            X_train_tfidf,
            np.array(y_train).astype('int'))

        #Cross Validation Code Snippet...
        #clf_rdf = RandomForestClassifier()
        #scores = cross_val_score(clf_rdf,X_train_tfidf,y_train,cv=5)
        #logs.writeLog ("\nRandom Forest Classifier Cross Validation Score : "+str(scores.mean()))

        #predicted = clf_rdf.predict(X_test_tfidf)
        #print ("Prediction quality:" + str(np.mean(predicted == y_test)))

    #Naive Bayes Classifier Creation
    elif clf == "NB":
        clf_model = MultinomialNB().fit(X_train_tfidf,
                                        np.array(y_train).astype('int'))

    #Support Vector Machine Classifier Creation.
    elif clf == "SVM":
        clf_model = SVC(C=1.0,
                        cache_size=200,
                        class_weight=None,
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma=1.0,
                        kernel='rbf',
                        max_iter=-1,
                        probability=True,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=False).fit(X_train_tfidf,
                                           np.array(y_train).astype('int'))
    '''
    #Ensemble Creation
    elif clf == "ensemble":
        training = zip(X_train_tfidf,np.array(y_train).astype('int'))
        names = ['Random Forest','Naive Bayes','Support Vector Classifier']
        classifiers = [RandomForestClassifier(),MultinomialNB(),SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False)]
        models = zip(names,classifiers)
        print (type(models))
        clf_model = VotingClassifier(estimators=models,voting='hard',n_jobs=-1)
        clf_model = clf_model.fit(X_train_tfidf,np.array(y_train).astype('int')) #n_jobs = -1 makes allows models to be created in parallel (using all the cores, else we can mention 2 for using 2 cores)
    '''
    predict_labels = clf_model.predict(X_test_tfidf)
    actualLabels = np.array(y_test).astype('int')
    labelClasses = list(set(actualLabels))  #np.array(y_train).astype('int')
    #print ("labelClasses : ",labelClasses)
    clf_test_score = clf_model.score(X_test_tfidf, actualLabels)
    logs.writeLog("\n" + clf + " Classifier Test Score : " +
                  str(clf_test_score))

    #print (predict_labels)
    #print (actualLabels)

    f1 = round(f1_score(actualLabels, predict_labels, average='macro'), 2)
    precision = round(
        precision_score(actualLabels, predict_labels, average='macro'), 2)
    recall = round(recall_score(actualLabels, predict_labels, average='macro'),
                   2)
    #logs.writeLog("\n\nF1 Score : "+str(f1))
    #logs.writeLog("\n\nPrecision Score : "+str(precision))
    #logs.writeLog("\n\nRecall Score : "+str(recall))
    logs.writeLog("\n\nClassification Report : \n\n" +
                  str(classification_report(actualLabels, predict_labels)))
    cm = confusion_matrix(actualLabels, predict_labels, labels=labelClasses)
    logs.writeLog("\n\nConfusion Matrix : \n" + str(cm) + "\n")
    #tn,fp,fn,tp = cm.ravel()
    #acc = round((tn+tp)/(tn+fp+fn+tp),2)
    #logs.writeLog ("\n\nAccuracy : "+str(acc))
    return count_vect, tfidf_transformer, clf_model, clf_test_score, len(
        X_train), len(X_test), f1, precision, recall
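#A minimal standalone sketch (toy labels) of the per-label balancing above: the smallest class
#count caps how many rows of every label go into the test and train splits.
import pandas as pd

df_toy = pd.DataFrame({'BinaryClass': [1, 1, 1, 1, 0, 0], 'req': ['a', 'b', 'c', 'd', 'e', 'f']})
stats = df_toy['BinaryClass'].value_counts()    #1 -> 4 rows, 0 -> 2 rows
min_value_count = stats.min()                   #2
splitratio = 0.2
test_size = int(min_value_count * splitratio) if int(min_value_count * splitratio) >= 1 else 1
train_size = min_value_count - test_size        #1 test row and 1 train row per label here
print(stats.to_dict(), test_size, train_size)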
Example #13
def annotUncertainSamples(args, df_uncertainSamples, targetLabel):
    '''
    Based on the queryType, the most uncertain samples are selected and forwarded to the manual annotator for annotation.
    '''
    df_manuallyAnnotated = pd.DataFrame(
        columns=df_uncertainSamples.columns
    )  #Create an empty Dataframe to store the manually annotated Results

    queryType = args.loc[0, 'samplingType']

    iteration = 0
    while iteration < int(
            args.loc[0, 'manualAnnotationsCount']
    ):  #while iteration is less than number of annotations that need to be done.
        if (len(df_uncertainSamples) > 0):
            logs.writeLog("\n\n Iteration : " + str(iteration + 1))
            if queryType == 'leastConfidence':
                indexValue = leastConfidenceSampling(df_uncertainSamples)
            elif queryType == 'minMargin':
                indexValue = minMarginSampling(df_uncertainSamples)
            elif queryType == 'entropy':
                indexValue = entropySampling(df_uncertainSamples)

            sample = df_uncertainSamples.loc[indexValue, :]
            logs.writeLog("\n\nMost Uncertain Sample : \n" + str(sample))

            df_userAnnot = pd.DataFrame(columns=[
                'req1_id', 'req2_id', 'req1', 'req2', 'BinaryClass',
                'MultiClass', 'BLabelled', 'MLabelled'
            ])
            userAnnot = getManualAnnotation(
                sample['req1_id'], sample['req2_id'], sample['req1'],
                sample['req2'], targetLabel
            )  #Passes the requirements to the user and requests annotation.

            if userAnnot == "exit":
                #Dump df_trainingSet into Annotations.csv (These are the manual annotations done before active learning actually starts)
                raise Exception('\nExited the Program successfully!')

            #Remove the selected sample from the original dataframe
            df_uncertainSamples.drop(index=indexValue, inplace=True)
            df_uncertainSamples.reset_index(inplace=True, drop=True)
            #logs.writeLog(str(df))

            if targetLabel == "BinaryClass":
                #Add the newly annotated combination in the manuallyAnnotatedDf
                df_userAnnot = df_userAnnot.append(
                    {
                        'req1_id': sample['req1_id'],
                        'req1': sample['req1'],
                        'req2_id': sample['req2_id'],
                        'req2': sample['req2'],
                        'BinaryClass': userAnnot,
                        'MultiClass': 0,
                        'BLabelled': 'M',
                        'MLabelled': 'A'
                    },
                    ignore_index=True
                )  #Added MultiClass as 0 because when we are learning BinaryClass... MultiClass can contain a dummy value.
                logs.createAnnotationsFile(df_userAnnot)

                df_manuallyAnnotated = pd.concat(
                    [df_manuallyAnnotated, df_userAnnot])
                #logs.writeLog("Manually Annotated DataFrame : \n"+str(manuallyAnnotatedDf))
            else:
                #Add the newly annotated combination in the manuallyAnnotatedDf
                df_userAnnot = df_userAnnot.append(
                    {
                        'req1_id': sample['req1_id'],
                        'req1': sample['req1'],
                        'req2_id': sample['req2_id'],
                        'req2': sample['req2'],
                        'BinaryClass': 1,
                        'MultiClass': userAnnot,
                        'BLabelled': sample['BLabelled'],
                        'MLabelled': 'M'
                    },
                    ignore_index=True
                )  #Added BinaryClass as 1 because MultiClass labels are learnt only for the dependent combinations (for which BinaryClass is 1).
                logs.createAnnotationsFile(df_userAnnot)

                df_manuallyAnnotated = pd.concat(
                    [df_manuallyAnnotated, df_userAnnot])
                #logs.writeLog("Manually Annotated DataFrame : \n"+str(manuallyAnnotatedDf))

        iteration += 1

    #Remove all the extra columns. df now contains only combinations marked 'A'
    df_uncertainSamples = df_uncertainSamples[[
        'req1_id', 'req1', 'req2_id', 'req2', 'BinaryClass', 'MultiClass',
        'BLabelled', 'MLabelled'
    ]]
    #logs.writeLog(str(df_uncertainSamples))

    #Remove all the extra columns. df now contains only combinations marked 'M'
    df_manuallyAnnotated = df_manuallyAnnotated[[
        'req1_id', 'req1', 'req2_id', 'req2', 'BinaryClass', 'MultiClass',
        'BLabelled', 'MLabelled'
    ]]
    logs.writeLog("\n\nManually Annotated Combinations... " +
                  str(len(df_manuallyAnnotated)) + "Rows \n" +
                  str(df_manuallyAnnotated[:10]))

    return pd.concat([df_manuallyAnnotated, df_uncertainSamples], axis=0)
Example #14
def analyzePredictions(args, df_predictions, targetLabel, confidence):
    '''
    Calculates the maximum probability value from the prediction probabilities of all the classes
    Labels the combinations as 'I' aka intelligently annotated for which the maximum probability value is greater than the confidence
    Labels doubtful combinations as 'A'
    Calls annotUncertainSamples function, to annotate most uncertain samples manually.
    Returns final dataframe containing all the combinations after marking them 'M','I','A' as the case may be.
    '''

    if targetLabel == "BinaryClass":
        labelColumn = "BLabelled"
    else:
        labelColumn = "MLabelled"

    logs.writeLog(
        "\n\nAnalyzing Predictions... Based on the Maximum Probability Value for each combination."
    )

    if confidence == 0:
        #probabilityBins = df_predictions['maxProb'].value_counts()
        bins = np.arange(0, 1.1, 0.1)
        probabilityBins = pd.cut(
            df_predictions['maxProb'],
            bins=bins).value_counts().sort_index(ascending=False)
        while True:
            try:
                logs.writeLog(
                    "\n\nFollowing is the observed maximum probability distribution : \n"
                    + str(probabilityBins))
                logs.writeLog(
                    "\n\nPlease select the threshold probability value to mark the predictions confident : "
                )
                confidence = float(input(""))
                logs.writeLog("User Input : " + str(confidence))
                if confidence > 1.0:
                    logs.writeLog(
                        "\n\nInvalid Input. Please provide a valid value.\n")
                else:
                    break
            except ValueError:
                logs.writeLog(
                    "\n\nVALUE ERROR! ---- Invalid Input. Please provide a valid value."
                )
    logs.writeLog(
        "\n\nLooking for Confident Predictions.... for which Confidence Value >= "
        + str(confidence))
    df_ConfidentPredictions = df_predictions[(
        df_predictions['maxProb'] >= float(confidence)
    )]  #Mark the values as confident if the maxProb is greater than confidence value.
    df_ConfidentPredictions[labelColumn] = 'I'
    logs.writeLog("\n\nConfident Predictions : " +
                  str(len(df_ConfidentPredictions)) + " Rows\n" +
                  str(df_ConfidentPredictions[:10]))

    df_ConfidentPredictions = df_ConfidentPredictions[[
        'req1_id', 'req1', 'req2_id', 'req2', 'BinaryClass', 'MultiClass',
        'BLabelled', 'MLabelled'
    ]]  #Remove the extra columns.

    #Predictions which are not part of the ConfidentPredictions
    logs.writeLog("\n\nSegregating the doubtful predictions...")
    df_doubtfulPredictions = df_predictions[~df_predictions.isin(
        df_ConfidentPredictions
    )].dropna(
    )  #Make sure the BinaryClass/MultiClass of To Be labelled are not np.NaN; give it any dummy value
    df_doubtfulPredictions[
        labelColumn] = 'A'  #Mark the doubtful Predictions as 'A' - to be Annotated....
    logs.writeLog("\n\nDoubtful Predictions : " +
                  str(len(df_doubtfulPredictions)) + " Rows\n" +
                  str(df_doubtfulPredictions[:10]))

    #Create an empty DataFrame
    df_AnnotatedData = pd.DataFrame()
    df_results = pd.DataFrame()

    if df_doubtfulPredictions.shape[
            0] > 0:  #Manual Annotation is needed only if there is a doubtful Prediction.

        #Call annotUncertainSamples Function; Asks user to manually annotate annotCount number of samples where sample are selected with the queryType.
        df_AnnotatedData = annotUncertainSamples(args, df_doubtfulPredictions,
                                                 targetLabel)
        logs.writeLog("\n\nAnnotated Data : " + str(len(df_AnnotatedData)) +
                      " Rows \n" + str(df_AnnotatedData[:10]))

    #Merge the results.
    df_results = pd.concat([df_ConfidentPredictions, df_AnnotatedData], axis=0)
    #resultsDf.rename(columns={'predictedLabel':targetLabel},inplace=True)
    df_results.reset_index(inplace=True, drop=True)

    return df_results, confidence
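#A minimal standalone sketch (toy probabilities) of the confidence analysis above: bin maxProb
#with pd.cut to inspect the distribution, then split confident and doubtful predictions at a
#chosen threshold.
import numpy as np
import pandas as pd

df_pred = pd.DataFrame({'maxProb': [0.95, 0.62, 0.88, 0.55, 0.71]})
bins = np.arange(0, 1.1, 0.1)
print(pd.cut(df_pred['maxProb'], bins=bins).value_counts().sort_index(ascending=False))

confidence = 0.8
df_confident = df_pred[df_pred['maxProb'] >= confidence]    #would be marked 'I'
df_doubtful = df_pred[df_pred['maxProb'] < confidence]      #would be marked 'A'
print(len(df_confident), len(df_doubtful))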
Example #15
def getManualAnnotation(req1_id, req2_id, req1, req2, target):
    '''
    The user gets the two requirements and is expected to provide the annotation for the combination.
    The target can be BinaryClass or MultiClass based on the arguments provided by the user.
    Returns the annotation.
    '''
    if target == 'BinaryClass':
        while True:  #While loop to make sure the user provides proper input.

            logs.writeLog(
                "\n\nAre the following Requirements Dependent or Not?")
            logs.writeLog("\n\nRequirement 1 (" + req1_id + ") : " + str(req1))
            logs.writeLog("\nRequirement 2 (" + req2_id + ") : " + str(req2))
            logs.writeLog(
                "\nPlease enter 1 for Dependent, 0 for Independent   :   ")
            userAnnotation = input("")
            if userAnnotation in ['1', '0']:
                logs.writeLog("\nValue provided by the user :- " +
                              str(userAnnotation.lower()))
                return userAnnotation
            elif userAnnotation.lower() == "exit":
                logs.writeLog("\nValue provided by the user :- " +
                              str(userAnnotation.lower()))
                return "exit"
                #raise Exception ('\nExited the Program successfully!')
            else:
                logs.writeLog("\nUser Annotation : " + userAnnotation.lower())
                logs.writeLog("Invalid Input. Allowed Values -- 1 / 0")
                logs.writeLog(
                    "In order to exit from the program, enter 'exit'.")
                continue
    else:
        while True:  #While loop to make sure the user provides proper input.

            logs.writeLog(
                "\n\nPlease provide the dependency type for the following requirements."
            )
            logs.writeLog("\n\nRequirement 1 (" + req1_id + ") : " + str(req1))
            logs.writeLog("\nRequirement 2 (" + req2_id + ") : " + str(req2))
            #logs.writeLog ("\nPlease select one of the following choices. \n1 - AND\n2 - OR \n3 - Requires \n4 - Similar \n5 - Cannot Say \nEnter your Choice here :   ")   #Removed 0 - Independent
            #logs.writeLog ("\nPlease select one of the following choices. \n3 - Requires \n4 - Similar \n6 - Others \nEnter your Choice here :   ")   #Removed 0 - Independent
            logs.writeLog(
                "\nPlease select one of the following choices. \n1 - Requires \n2 - Reflects \n3 - Conflicts \nEnter your Choice here :   "
            )

            userAnnotation = input("")
            #if userAnnotation in ['1','2','3','4','5']:
            #if userAnnotation in ['3','4','6']:
            if userAnnotation in ['1', '2', '3']:
                logs.writeLog("\nValue provided by the user :- " +
                              str(userAnnotation.lower()))
                return userAnnotation
            elif userAnnotation.lower() == "exit":
                logs.writeLog("\nValue provided by the user :- " +
                              str(userAnnotation.lower()))
                return "exit"
                #raise Exception ('\nExited the Program successfully!')
            else:
                logs.writeLog("\nValue provided by the user :- " +
                              str(userAnnotation.lower()))
                #logs.writeLog ("Invalid Input. Allowed Values -- 1 / 2 / 3 / 4 / 5 ")
                #logs.writeLog ("Invalid Input. Allowed Values -- 3 / 4 / 6 ")
                logs.writeLog("Invalid Input. Allowed Values -- 1 / 2 / 3 ")
                logs.writeLog("To exit the program, enter 'exit'.")
                continue
    return None
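#Illustrative usage sketch only (hypothetical helper, not part of the original
#script): shows how getManualAnnotation() is expected to be driven and how its
#return values map to labels. The requirement ids and texts are made up.
def _exampleManualAnnotationUsage():
    annotation = getManualAnnotation(
        'REQ-1', 'REQ-2', 'The system shall log every login attempt.',
        'Audit records shall be retained for 90 days.', 'BinaryClass')
    if annotation == 'exit':
        return None  #caller is expected to stop the annotation loop
    #For BinaryClass: '1' = Dependent, '0' = Independent.
    #For MultiClass : '1' = Requires, '2' = Reflects, '3' = Conflicts.
    return int(annotation)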
def learnTargetLabel(args):
    '''
    Active Learning iterative process
    1. Prepare Data
    2. Create Classifier
    3. Evaluate Classifier
    4. Select Uncertain Samples and get them annotated by Oracle
    5. Update Data Set (Merge newly annotated samples to original dataset) 
    6. Repeat steps 1-5 until stopping condition is reached.

    Parameters : 
    args (dataframe) : Run-time arguments in a dataframe.

    Returns :
    df_rqmts (dataframe) : Updated / Final requirements dataset, including the prediction values from the last iteration of the Active Learning process.
    df_resultTracker (dataframe) : Results for tracking purpose

    '''
    #Read run time arguments
    idir = os.getcwd() + args.loc[0, 'input']
    splitratio = float(args.loc[0, 'testsize'])
    maxIterations = int(args.loc[0, 'maxIterations'])
    resamplingTechnique = args.loc[0, 'resampling']

    logs.writeLog("Fetching data from the input directory.")
    #Read To be Annotated, Training, Test and Validation Sets generated after executing splitData.py
    try:
        df_tobeAnnotated = pd.read_csv(idir + "/ToBeAnnotated.csv")
        df_training = pd.read_csv(idir + "/TrainingSet.csv")
        df_test = pd.read_csv(idir + "/TestSet.csv")
        df_manuallyAnnotated = pd.concat([df_training, df_test])
        df_validation = pd.read_csv(idir + "/ValidationSet.csv")

        #Combines all requirement combinations in a single DataFrame
        df_rqmts = pd.concat([df_manuallyAnnotated, df_tobeAnnotated])

    except FileNotFoundError as err:
        logs.writeLog(
            "File Not Found! Please provide correct path of the directory containing Training, Test, Validation, ToBeAnnotated and Manually Annotated DataSet."
        )
        print(err)
        exit()

    #Create a dataframe to track the results
    #df_resultTracker = pd.DataFrame(columns=['Iteration','ManuallyAnnotated','ToBeAnnotated','TrainingSize','TestSize','ValidationSize','ClassifierTestScore','ClassifierValidationScore','IndependentCount','RequiresCount','SimilarCount','BlocksCount','t_5FoldCVScore','t_10FoldCVScore','t_f1Score','t_precisionScore','t_recallScore','v_f1Score','v_precisionScore','v_recallScore','v_5FoldCVScore','v_10FoldCVScore'])
    df_resultTracker = pd.DataFrame(columns=[
        'Iteration', 'ManuallyAnnotated', 'ToBeAnnotated', 'TrainingSize',
        'TestSize', 'ValidationSize', 'ClassifierTestScore',
        'ClassifierValidationScore', 'IndependentCount', 'RequiresCount',
        'SimilarCount', 'BlocksCount', 't_f1Score', 't_precisionScore',
        't_recallScore', 'v_f1Score', 'v_precisionScore', 'v_recallScore'
    ])

    iteration = 0

    while True:
        iteration += 1
        logs.writeLog("\n" + 100 * "-")
        logs.writeLog("\n\nIteration : " + str(iteration) + "\n")

        #For first iteration of active learning use the data available in df_train, df_test.
        #For subsequent iterations, recreate the training and test sets, as new data points are annotated by the manual annotator at the end of each iteration.
        if iteration > 1:
            df_manuallyAnnotated = df_rqmts[
                df_rqmts['AnnotationStatus'] == 'M'].copy()  #Training Data (copy() avoids a SettingWithCopyWarning on the cast below)
            df_manuallyAnnotated['Label'] = df_manuallyAnnotated[
                'Label'].astype('int')
            logs.writeLog("df_manuallyAnnotated len : " +
                          str(len(df_manuallyAnnotated)))
            #if resamplingTechnique  == "under_sampling":
            logs.writeLog(
                "\nSplitting the Training/Test Set into training and test set - "
                + str(1 - splitratio) + "/" + str(splitratio) + " split.")
            df_training, df_test = splitDataSet(df_manuallyAnnotated,
                                                1 - splitratio,
                                                balancedClass=False)
            #        else:

            df_tobeAnnotated = df_rqmts[df_rqmts['AnnotationStatus'] != 'M']

        logs.writeLog("\nCreating Classifier...")
        countVectorizer, tfidfTransformer, classifier, classifierTestScore, t_f1Score, t_precisionScore, t_recallScore = clf_model.createClassifier(
            args.loc[0, 'classifier'], df_training, df_test,
            resamplingTechnique)

        #logs.writeLog ("\n\nEvaluating 5 fold and 10 fold Cross Validation Scores (Test Set)...")
        #t_cf5_score,t_cf10_score = clf_model.Crossfoldvalidation(countVectorizer,tfidfTransformer,classifier,pd.concat([df_training,df_test]))

        #logs.writeLog("\n\n5 fold Cross Validation Score : "+str(t_cf5_score))
        #logs.writeLog("\n\n10 fold Cross Validation Score : "+str(t_cf10_score))

        logs.writeLog("\n\nValidating Classifier...")
        classifierValidationScore, v_f1Score, v_precisionScore, v_recallScore = clf_model.validateClassifier(
            countVectorizer, tfidfTransformer, classifier, df_validation)
        logs.writeLog("\n\nClassifier Validation Set Score : " +
                      str(classifierValidationScore))

        #logs.writeLog ("\n\nEvaluating 5 fold and 10 fold Cross Validation Scores (Validation Set)...")
        #v_cf5_score,v_cf10_score = clf_model.Crossfoldvalidation(countVectorizer,tfidfTransformer,classifier,df_validation)

        #logs.writeLog("\n\n5 fold Cross Validation Score : "+str(v_cf5_score))
        #logs.writeLog("\n\n10 fold Cross Validation Score : "+str(v_cf10_score))

        #Update Analysis DataFrame (For tracking purpose)
        df_training['Label'] = df_training['Label'].astype('int')
        df_test['Label'] = df_test['Label'].astype('int')
        independentCount = len(df_training[df_training['Label'] == 0]) + len(
            df_test[df_test['Label'] == 0])
        requiresCount = len(df_training[df_training['Label'] == 1]) + len(
            df_test[df_test['Label'] == 1])
        similarCount = len(df_training[df_training['Label'] == 2]) + len(
            df_test[df_test['Label'] == 2])
        blocksCount = len(df_training[df_training['Label'] == 3]) + len(
            df_test[df_test['Label'] == 3])

        #df_resultTracker = df_resultTracker.append({'Iteration':iteration,'ManuallyAnnotated':len(df_manuallyAnnotated),'ToBeAnnotated':len(df_tobeAnnotated),'TrainingSize':len(df_training),'TestSize':len(df_test),'ValidationSize':len(df_validation),'ClassifierTestScore':classifierTestScore,'ClassifierValidationScore':classifierValidationScore,'IndependentCount':independentCount,'RequiresCount':requiresCount,'SimilarCount':similarCount,'BlocksCount':blocksCount,'t_5FoldCVScore':t_cf5_score,'t_10FoldCVScore':t_cf10_score,'t_f1Score':t_f1Score,'t_precisionScore':t_precisionScore,'t_recallScore':t_recallScore,'v_5FoldCVScore':v_cf5_score,'v_10FoldCVScore':v_cf10_score,'v_f1Score':v_f1Score,'v_precisionScore':v_precisionScore,'v_recallScore':v_recallScore},ignore_index=True)
        df_resultTracker = df_resultTracker.append(
            {
                'Iteration': iteration,
                'ManuallyAnnotated': len(df_manuallyAnnotated),
                'ToBeAnnotated': len(df_tobeAnnotated),
                'TrainingSize': len(df_training),
                'TestSize': len(df_test),
                'ValidationSize': len(df_validation),
                'ClassifierTestScore': classifierTestScore,
                'ClassifierValidationScore': classifierValidationScore,
                'IndependentCount': independentCount,
                'RequiresCount': requiresCount,
                'SimilarCount': similarCount,
                'BlocksCount': blocksCount,
                't_f1Score': t_f1Score,
                't_precisionScore': t_precisionScore,
                't_recallScore': t_recallScore,
                'v_f1Score': v_f1Score,
                'v_precisionScore': v_precisionScore,
                'v_recallScore': v_recallScore
            },
            ignore_index=True)

        logs.writeLog("\n\nAnalysis DataFrame : \n" + str(df_resultTracker))

        logs.writeLog("\n\nPredicting Labels....")
        df_predictionResults = clf_model.predictLabels(countVectorizer,
                                                       tfidfTransformer,
                                                       classifier,
                                                       df_tobeAnnotated)

        logs.writeLog("\n\nFinding Uncertain Samples and Annotating them.....")
        df_finalPredictions = annotate.analyzePredictions(
            args, df_predictionResults)

        logs.writeLog("\n\nMerging Newly Labelled Data Samples....")
        df_rqmts = pd.concat([df_training, df_test, df_finalPredictions],
                             axis=0,
                             ignore_index=True)
        #Remove unwanted columns
        df_rqmts = df_rqmts[[
            'comboId', 'req1Id', 'req1', 'req_1', 'req2Id', 'req2', 'req_2',
            'Label', 'AnnotationStatus'
        ]]

        if iteration >= maxIterations:
            logs.writeLog(
                "\n\nStopping Condition Reached... Exiting the program.")
            break

    #Merge Validation Set back to the prediction set to ensure all the 19699 combinations are returned.
    df_rqmts = pd.concat([df_rqmts, df_validation], axis=0, ignore_index=True)

    return df_rqmts, df_resultTracker
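#Hypothetical stand-in only: the real splitDataSet() used above is defined
#elsewhere in this project and its exact behaviour is not shown here. This
#sketch assumes its second argument is the training fraction and that
#balancedClass=True means a stratified split on the 'Label' column.
def _splitDataSetSketch(df_annotated, trainRatio, balancedClass=False):
    from sklearn.model_selection import train_test_split
    stratifyOn = df_annotated['Label'] if balancedClass else None
    df_train, df_test = train_test_split(df_annotated,
                                         train_size=trainRatio,
                                         stratify=stratifyOn,
                                         random_state=0)
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True)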
def analyzePredictions(args, df_predictions):
    '''
    Analyzes the predictions, samples the most uncertain data points, queries the oracle (original database/file) for their labels and updates the dataframe accordingly.
    '''
    df_manuallyAnnotated = pd.DataFrame(columns=[
        'comboId', 'req1Id', 'req1', 'req_1', 'req2Id', 'req2', 'req_2',
        'Label', 'AnnotationStatus'
    ])  #Create an empty Dataframe to store the manually annotated Results

    queryType = args.loc[0, 'samplingType']

    iteration = 0
    #Loop until the requested number of manual annotations has been collected.
    while iteration < int(args.loc[0, 'manualAnnotationsCount']):
        if (len(df_predictions) > 0):
            logs.writeLog("\n\nIteration : " + str(iteration + 1))
            if queryType == 'leastConfidence':
                indexValue = leastConfidenceSampling(df_predictions)
            elif queryType == 'minMargin':
                indexValue = minMarginSampling(df_predictions)
            elif queryType == 'entropy':
                indexValue = entropySampling(df_predictions)

            sample = df_predictions.loc[indexValue, :]
            logs.writeLog("\n\nMost Uncertain Sample : \n" + str(sample))

            df_userAnnot = pd.DataFrame(columns=[
                'comboId', 'req1Id', 'req1', 'req_1', 'req2Id', 'req2',
                'req_2', 'Label', 'AnnotationStatus'
            ])

            df_userAnnot = df_userAnnot.append(
                {
                    'comboId': sample['comboId'],
                    'req1Id': sample['req1Id'],
                    'req1': sample['req1'],
                    'req_1': sample['req_1'],
                    'req2Id': sample['req2Id'],
                    'req2': sample['req2'],
                    'req_2': sample['req_2'],
                    'Label': sample['Label'],
                    'AnnotationStatus': 'M'
                },
                ignore_index=True)  #Added AnnotationStatus as M
            #logs.createAnnotationsFile(df_userAnnot)

            #Remove the selected sample from the original dataframe
            df_predictions.drop(index=indexValue, inplace=True)
            df_predictions.reset_index(inplace=True, drop=True)

            df_manuallyAnnotated = pd.concat(
                [df_manuallyAnnotated, df_userAnnot])

        iteration += 1

    #Remove all the extra columns. df now contains only combinations marked 'A'
    df_predictions = df_predictions[[
        'comboId', 'req1Id', 'req1', 'req_1', 'req2Id', 'req2', 'req_2',
        'Label', 'AnnotationStatus'
    ]]

    df_manuallyAnnotated = df_manuallyAnnotated[[
        'comboId', 'req1Id', 'req1', 'req_1', 'req2Id', 'req2', 'req_2',
        'Label', 'AnnotationStatus'
    ]]
    logs.writeLog("\n\nManually Annotated Combinations... " +
                  str(len(df_manuallyAnnotated)) + "Rows \n" +
                  str(df_manuallyAnnotated[:10]))

    return pd.concat([df_manuallyAnnotated, df_predictions], axis=0)
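#Hypothetical sketches only: leastConfidenceSampling(), minMarginSampling() and
#entropySampling() used above are defined elsewhere in this project. The
#versions below (using the module-level numpy import) assume each row of
#df_predictions carries a 'predictedProb' list of per-class probabilities and
#that the DataFrame has a default RangeIndex; they return the index of the most
#uncertain row under the standard active-learning query strategies.
def _leastConfidenceSketch(df_predictions):
    probs = np.array(df_predictions['predictedProb'].tolist())
    return int(np.argmin(probs.max(axis=1)))  #smallest top-class probability

def _minMarginSketch(df_predictions):
    probs = np.sort(np.array(df_predictions['predictedProb'].tolist()), axis=1)
    return int(np.argmin(probs[:, -1] - probs[:, -2]))  #smallest top-2 margin

def _entropySketch(df_predictions):
    probs = np.array(df_predictions['predictedProb'].tolist())
    entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
    return int(np.argmax(entropy))  #largest predictive entropy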
def createClassifier(clf,df_trainSet,df_testSet,resampling_type):
    '''
    Passes the dataset via NLP Pipeline (Count Vectorizer , TFIDF Transformer)
    Performs Synthetic Minority Over-sampling (SMOTE) after the TFIDF transformation (ONLY when resampling_type is over_sampling)
    Trains the classifier (Random Forest / Naive Bayes / SVM / Ensemble using Voting Classifier)

    Parameters : 
    clf (str) : Name of classifier (options - RF, NB, SVM , ensemble)
    df_trainSet (DataFrame) : Training Data
    df_testSet (DataFrame) : Test Data
    resampling_type (str) : Resampling strategy; 'over_sampling' applies SMOTE to the training data

    Returns : 
    count_vect : Count Vectorizer Model
    tfidf_transformer : TFIDF Transformer Model
    clf_model : Trained Model 
    clf_test_score (float) : Accuracy achieved on Test Set 
    f1/precision/recall (float) : F1, Precision and Recall scores (macro average)
    '''
    
    #logs.writeLog("\nSplitting the Training/Test Set into training and test set - "+str(1-splitratio)+"/"+str(splitratio)+" split.")
    #df_trainSet,df_testSet = balanceDataSet(df_annotatedSet,"Label",1-splitratio)

    #df_trainSet = shuffle(df_trainSet)
    #df_testSet = shuffle(df_testSet)

    #Convert dataframes to numpy array's
    X_train = df_trainSet.loc[:,['req_1','req_2']]  #Using req_1,req_2 rather than req1,req2 because req_1,req_2 have been cleaned - lower case+punctuations
    y_train = df_trainSet.loc[:,'Label'].astype("int")
    X_test = df_testSet.loc[:,['req_1','req_2']]
    y_test = df_testSet.loc[:,'Label'].astype("int")

    logs.writeLog("\nTraining Set Size : "+str(len(X_train)))
    logs.writeLog("\nTrain Set Value Count : \n"+str(df_trainSet['Label'].value_counts()))

    logs.writeLog("\nTest Set Size : "+str(len(X_test)))
    logs.writeLog("\nTest Set Value Count : \n"+str(df_testSet['Label'].value_counts()))
    
    logs.writeLog("\n\nTraining Model....")
    
    #Perform Bag of Words
    count_vect = CountVectorizer(tokenizer=my_tokenizer,lowercase=False)
    X_train_counts = count_vect.fit_transform(np.array(X_train))
    
    #Transform a count matrix to a normalized tf or tf-idf representation.
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf= tfidf_transformer.fit_transform(X_train_counts)
    
    #######################################################################################
    if resampling_type == "over_sampling":
        logs.writeLog("\n\nValue Count for each class in training set."+str(Counter(y_train)))
        
        logs.writeLog("\n\nPerforming Over Sampling")
        sm = SMOTE(random_state=0)
        X_train_tfidf, y_train = sm.fit_resample(X_train_tfidf, y_train)
        logs.writeLog("\n\nValue Count for each class in training set."+str(Counter(y_train)))
    #######################################################################################
    X_test_counts = count_vect.transform(np.array(X_test))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    
    #Initiate Classifiers
    rf_model = RandomForestClassifier(random_state=0)
    nb_model = MultinomialNB()
    svm_model = SVC(random_state = 0, probability=True)  #predict_proba not available if probability = False

    #Random Forest Classifier Creation
    if clf == "RF" :
        clf_model = rf_model.fit(X_train_tfidf, np.array(y_train).astype('int'))
        
    #Naive Bayes Classifier Creation
    elif clf == "NB":
        clf_model = nb_model.fit(X_train_tfidf,np.array(y_train).astype('int'))

    #Support Vector Machine Classifier Creation.
    elif clf == "SVM":
        clf_model = svm_model.fit(X_train_tfidf,np.array(y_train).astype('int'))
    
    #Ensemble Creation
    elif clf == "ensemble":
        #Predict_proba works only when Voting = 'soft'
        #n_jobs = -1 would train the estimators in parallel on all cores (or e.g. 2 to use two cores); n_jobs = 1 keeps training sequential
        clf_model = VotingClassifier(estimators=[('RF', rf_model), ('NB', nb_model),('SVM',svm_model)], voting='soft',n_jobs=1)  
        clf_model.fit(X_train_tfidf,np.array(y_train).astype('int'))

    #Predict labels
    predict_labels = clf_model.predict(X_test_tfidf)
    actualLabels = np.array(y_test).astype('int')
    labelClasses = list(set(actualLabels))   #np.array(y_train).astype('int')
    
    #Calculate Classifier Test Accuracy and other important metrics
    clf_test_score = clf_model.score(X_test_tfidf,actualLabels)
    logs.writeLog ("\n"+clf+" Classifier Test Score : "+str(clf_test_score))
    
    f1 = round(f1_score(actualLabels, predict_labels,average='macro'),2)
    precision = round(precision_score(actualLabels, predict_labels,average='macro'),2)
    recall = round(recall_score(actualLabels, predict_labels,average='macro'),2)
    
    logs.writeLog ("\n\nClassification Report on Test Set: \n\n"+str(classification_report(actualLabels,predict_labels)))
    cm = confusion_matrix(actualLabels,predict_labels,labels=labelClasses)    
    logs.writeLog ("\n\nConfusion Matrix : \n"+str(cm)+"\n")
    

    return count_vect, tfidf_transformer, clf_model,clf_test_score,f1,precision,recall
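#Illustrative usage sketch only (toy, hypothetical data -- not from the real
#dataset, relies on the module-level pandas import and the project's
#my_tokenizer handling the two-column rows as in createClassifier above):
#shows the input shape createClassifier() expects, i.e. cleaned requirement
#texts in 'req_1'/'req_2' and integer labels in 'Label'. Passing anything other
#than 'over_sampling' as resampling_type simply skips SMOTE.
def _createClassifierUsageSketch():
    df_toy = pd.DataFrame({
        'req_1': ['the system shall log all logins', 'reports shall be exportable'],
        'req_2': ['audit records shall be retained', 'export shall support csv'],
        'Label': [1, 0]
    })
    cv, tfidf, model, testScore, f1, precision, recall = createClassifier(
        'RF', df_toy, df_toy, 'none')
    return model, testScore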