Exemplo n.º 1
0
def SvmGetClassifierInputs(ctx, featuresMaps, outClassifierInputs):
    """Build SVM-light style input lines, one per raw CSV comment.

    Each output line is "<sign><label> <idx>:<value> ..." where the label
    and the feature values are mapped from {0, 1} to {-1, +1}; a feature
    absent from a comment's map is emitted with value 0.

    ctx                 -- context object carrying mRawCsvComments
    featuresMaps        -- per-comment feature dicts, parallel to mRawCsvComments
    outClassifierInputs -- output list, cleared and filled in place
    """
    logging.getLogger("Svm").info("get classifier inputs")
    outClassifierInputs[:] = []

    # Union of every feature key seen across all comments.
    featuresKeys = set()
    for featureVector in featuresMaps:
        featuresKeys.update(featureVector.keys())

    # Fix: iterate the keys in sorted order. Iterating the raw set yields
    # an arbitrary order, so the same feature could get a different index
    # on different runs, making trained models incompatible with later
    # classification inputs.
    orderedFeatureKeys = sorted(featuresKeys)

    for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
        # @TODO: Classify "Thumbs Down!"
        # Map label {0, 1} -> {-1, +1} as the SVM format expects.
        svmType = -1 + 2 * MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
        inputsCollector = [SvmUtilGetStrSign(svmType) + str(svmType)]
        for itrFeature, featureKey in enumerate(orderedFeatureKeys):
            if featureKey in featuresMaps[itrComment]:
                # Present feature: map {0, 1} -> {-1, +1}.
                featureValue = -1 + 2 * int(
                    featuresMaps[itrComment][featureKey])
            else:
                # Absent feature: neutral value.
                featureValue = 0

            # SVM feature indices are 1-based.
            inputsCollector.append(" " + str(itrFeature + 1) + ":" +
                                   str(featureValue))

        outClassifierInputs.append("".join(inputsCollector))
    assert (len(outClassifierInputs) == len(ctx.mRawCsvComments))
Exemplo n.º 2
0
def CAR_get_comment_labels(ctx):
    """Return the label of each raw CSV comment, in input order."""
    logging.getLogger("CAR").info("get comment labels")
    return [
        MinerMiscUtils.getCommentLabel(commentDict)
        for commentDict in ctx.mRawCsvComments
    ]
def NaiveBayesGetClassifierInputs( ctx, featuresMaps, outClassifierInputs, bTrain ):
    """Fill outClassifierInputs with (featuresMap, label) pairs in place.

    When bTrain is truthy the label is the string form of the comment's
    label; otherwise the placeholder "?" marks the label as unknown.
    """
    logging.getLogger("NaiveBayes").info( "get classifier inputs" )
    del outClassifierInputs[:]
    for idx, commentDict in enumerate(ctx.mRawCsvComments):
        if bTrain:
            label = str(MinerMiscUtils.getCommentLabel(commentDict))
        else:
            label = "?"
        outClassifierInputs.append((featuresMaps[idx], label))
Exemplo n.º 4
0
def NaiveBayesGetClassifierInputs(ctx, featuresMaps, outClassifierInputs,
                                  bTrain):
    """Populate outClassifierInputs in place with (featuresMap, label) pairs.

    Training mode (bTrain truthy) uses the comment's real label as a
    string; otherwise "?" is used as the unknown-label placeholder.
    """
    logging.getLogger("NaiveBayes").info("get classifier inputs")
    outClassifierInputs[:] = [
        (featuresMaps[i],
         str(MinerMiscUtils.getCommentLabel(commentDict)) if bTrain else "?")
        for i, commentDict in enumerate(ctx.mRawCsvComments)
    ]
Exemplo n.º 5
0
def SvmGetClassifierInputs( ctx, featuresMaps, outClassifierInputs ):
    """Build SVM-light style input lines (one per comment) in place.

    Each line is "<sign><label> <idx>:<value> ..." with the label and the
    feature values mapped from {0, 1} to {-1, +1}; features missing from
    a comment's map are emitted as 0.
    """
    logging.getLogger("Svm").info( "get classifier inputs" )
    outClassifierInputs[:] = []

    # Union of all feature keys across every comment.
    featuresKeys = set()
    for featureVector in featuresMaps:
        featuresKeys.update(featureVector.keys())

    # Fix: sort the keys before enumerating them. A raw set has arbitrary
    # iteration order, so the same feature could receive a different
    # 1-based index on different runs, breaking compatibility between
    # training and classification inputs.
    orderedKeys = sorted( featuresKeys )

    for itrComment, rawCsvCommentDict in enumerate( ctx.mRawCsvComments ):
        # @TODO: Classify "Thumbs Down!"
        # Map label {0, 1} -> {-1, +1}.
        svmType = -1 + 2 * MinerMiscUtils.getCommentLabel( rawCsvCommentDict )
        inputsCollector = [SvmUtilGetStrSign( svmType ) + str(svmType)]
        for itrFeature, featureKey in enumerate( orderedKeys ):
            if featureKey in featuresMaps[itrComment]:
                featureValue = -1 + 2 * int( featuresMaps[itrComment][ featureKey ] )
            else:
                featureValue = 0  # absent feature -> neutral value

            # SVM feature indices are 1-based.
            inputsCollector.append( " " + str( itrFeature + 1 ) + ":" + str(featureValue) )

        outClassifierInputs.append( "".join( inputsCollector ) )
    assert( len( outClassifierInputs ) == len( ctx.mRawCsvComments ) )
def addFeaturesDist( ctx, outFeaturesMaps ):
    """Add a boolean "Dist--<label>" feature to every features map.

    For each class label (0/1) a centroid over the comments' feature
    vectors is built (or loaded from a pickle cache), then each comment's
    Euclidean distance to both centroids is compared against the
    per-label standard deviation of those distances.
    NOTE: Python 2 code (print statement, dict.iteritems).
    """
    logging.getLogger("Features").info( "Distance" )

    # Centroids for each class label
    centroids = [{}, {}]    
    centroidsCacheFileName = "centroidsCache.txt"
    if ( MinerMiscUtils.fileExists(centroidsCacheFileName)):
        # Load from cache!
        # NOTE(review): file opened in text mode although dumped with "wb"
        # below, and the cache is never invalidated when inputs change —
        # confirm both are intended.
        centroids = pickle.load( open( centroidsCacheFileName ) )
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate( ctx.mRawCsvComments ):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[ itrComment ].iteritems():
                if ( type( value ) is str ):
                    # Debug trace: feature values are expected to be numeric here.
                    print "BREAK = " + key + " = " + value + "\n"
                if ( key in centroids[label]):
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value
                
                # Ensure the other label's centroid has the same key set
                # (missing keys default to 0.0).
                for altLabel in range( len(centroids ) ):
                    if ( altLabel != label ):
                        if key not in centroids[ altLabel ]:
                            centroids[ altLabel ][key] = 0.0
        
        # Average the centroids
        # NOTE(review): divides by the TOTAL number of feature maps, not by
        # the number of comments in each label — a per-label count looks
        # intended; confirm.
        for centroid in centroids:
            for key, value in centroid.iteritems():
                value /= len( outFeaturesMaps )
                centroid[ key ] = value
        
        # Cache the centroids to disk
        pickle.dump( centroids, open( centroidsCacheFileName, "wb" ) )
        
    # Determine distance from both centroids
    distances = [ [], [] ]          # per-label distance of every comment
    averageDistance = [ 0.0, 0.0 ]    
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0  # features missing from the map count as 0
                if ( centroidKey in featuresMap ):
                    commentValue = featuresMap[ centroidKey ]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt( totalSqDist )
            distances[label].append( totalDist )
            averageDistance[label] += totalDist 
    
    for label in range( len( averageDistance ) ):
        averageDistance[ label ] /= len( outFeaturesMaps )
    
    # Determine standard deviation
    averageStdDev = [ 0, 0 ]
    for label, labelDistances in enumerate( distances ):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[ label ]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[ label ] += sqDistFromMean
    
    for label in range( len( averageStdDev ) ):
        averageStdDev[ label ] /= len( outFeaturesMaps )
        averageStdDev[ label ]  = math.sqrt( averageStdDev[ label ] )
        
    # Map all feature vectors as being closer or farther from std dev
    # True when the comment lies farther from the label's centroid than one
    # standard deviation of the distances.
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate( averageStdDev ):
            featuresMap[ "Dist--"+str(label) ] = distances[ label ][ itrComment ] > stdDev
Exemplo n.º 7
0
def addFeaturesDist(ctx, outFeaturesMaps):
    """Add a boolean "Dist--<label>" feature to every features map.

    Builds (or loads from a pickle cache) a centroid of the feature
    vectors for each class label (0/1), then flags each comment by
    whether its Euclidean distance to a centroid exceeds the per-label
    standard deviation of those distances.
    NOTE: Python 2 code (print statement, dict.iteritems).
    """
    logging.getLogger("Features").info("Distance")

    # Centroids for each class label
    centroids = [{}, {}]
    centroidsCacheFileName = "centroidsCache.txt"
    if (MinerMiscUtils.fileExists(centroidsCacheFileName)):
        # Load from cache!
        # NOTE(review): opened in text mode although written with "wb"
        # below; the cache is also never invalidated — confirm.
        centroids = pickle.load(open(centroidsCacheFileName))
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[itrComment].iteritems():
                if (type(value) is str):
                    # Debug trace: feature values are expected to be numeric.
                    print "BREAK = " + key + " = " + value + "\n"
                if (key in centroids[label]):
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value

                # Keep both centroids over the same key set; missing keys
                # default to 0.0.
                for altLabel in range(len(centroids)):
                    if (altLabel != label):
                        if key not in centroids[altLabel]:
                            centroids[altLabel][key] = 0.0

        # Average the centroids
        # NOTE(review): divides by the TOTAL number of feature maps rather
        # than each label's comment count — a per-label count looks
        # intended; confirm.
        for centroid in centroids:
            for key, value in centroid.iteritems():
                value /= len(outFeaturesMaps)
                centroid[key] = value

        # Cache the centroids to disk
        pickle.dump(centroids, open(centroidsCacheFileName, "wb"))

    # Determine distance from both centroids
    distances = [[], []]  # per-label distance of every comment
    averageDistance = [0.0, 0.0]
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0  # missing features count as 0
                if (centroidKey in featuresMap):
                    commentValue = featuresMap[centroidKey]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt(totalSqDist)
            distances[label].append(totalDist)
            averageDistance[label] += totalDist

    for label in range(len(averageDistance)):
        averageDistance[label] /= len(outFeaturesMaps)

    # Determine standard deviation
    averageStdDev = [0, 0]
    for label, labelDistances in enumerate(distances):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[label]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[label] += sqDistFromMean

    for label in range(len(averageStdDev)):
        averageStdDev[label] /= len(outFeaturesMaps)
        averageStdDev[label] = math.sqrt(averageStdDev[label])

    # Map all feature vectors as being closer or farther from std dev
    # True when the comment lies more than one standard deviation from the
    # label's centroid.
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate(averageStdDev):
            featuresMap["Dist--" +
                        str(label)] = distances[label][itrComment] > stdDev
Exemplo n.º 8
0
def CAR_get_comment_labels(ctx):
    """Collect and return the label of every raw CSV comment, in order."""
    logging.getLogger("CAR").info("get comment labels")
    labels = []
    for commentDict in ctx.mRawCsvComments:
        labels.append(MinerMiscUtils.getCommentLabel(commentDict))
    return labels