Example #1
from scipy import sparse

import datasets  # project-local module with dataset loading helpers
import features  # project-local module with feature calculation helpers


def calculateFeaturesAndLabelsForDatasets():
    trainingPositiveDataset = datasets.loadTrainingPositiveDatasetFromFile()
    trainingNegativeDataset = datasets.loadTrainingNegativeDatasetFromFile()

    print("Generating labels...")
    # Positive samples are labeled 1, negative samples 0.
    labels = [1] * len(trainingPositiveDataset)
    labels.extend([0] * len(trainingNegativeDataset))

    featuresToUse = features.TEST_FEATURES

    # Compute features for one sample to learn the feature vector length.
    testFeatures = features.calcFeatures(trainingPositiveDataset[0], featuresToUse)
    print(testFeatures)
    samplesQuantity = len(trainingPositiveDataset) + len(trainingNegativeDataset)
    featuresQuantity = len(testFeatures)
    print("Total samples: " + str(samplesQuantity))
    print("Features quantity: " + str(featuresQuantity))

    print("Creating LIL sparse matrix for features...")
    # LIL format is efficient for incremental row-by-row construction.
    featuresForSamples = sparse.lil_matrix((samplesQuantity, featuresQuantity))

    currentSample = 0
    print("Calculating features for positive data...")
    for word in trainingPositiveDataset:
        featuresForSamples[currentSample] = features.calcFeatures(word, featuresToUse)[:]
        currentSample += 1

    print("Calculating features for negative data...")
    for word in trainingNegativeDataset:
        featuresForSamples[currentSample] = features.calcFeatures(word, featuresToUse)[:]
        currentSample += 1

    print("Converting LIL sparse matrix to CSR sparse matrix...")
    # CSR is better suited for training (fast row slicing and arithmetic).
    featuresForSamplesSparse = sparse.csr_matrix(featuresForSamples)
    return featuresForSamplesSparse, labels
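
The CSR matrix and label list returned above are later read back through datasets.loadFeatures() and datasets.loadLabels(). A minimal sketch of a possible save step, assuming scipy's save_npz plus pickle and hypothetical file names (the project's actual storage format is not shown in this fragment):

import pickle

from scipy import sparse

# Hypothetical save counterpart to datasets.loadFeatures()/loadLabels();
# file names and formats here are assumptions, not the project's API.
featuresForSamples, labels = calculateFeaturesAndLabelsForDatasets()
sparse.save_npz("features.npz", featuresForSamples)  # CSR matrix to .npz
with open("labels.pickle", "wb") as labelsFile:
    pickle.dump(labels, labelsFile)                  # label list to pickle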
def calculateWordProbability(lettersProbabilities, lettersLimit, word):
    # Multiply the probabilities of the first lettersLimit letters
    # (or fewer, if the word is shorter than the limit).
    probability = 1.0
    for position in range(0, min(lettersLimit, len(word))):
        letter = word[position]
        #probability *= lettersProbabilities[position][letter] / maxProbabilities[position]
        value = lettersProbabilities[position][letter]
        #value = max(sys.float_info.epsilon, value)
        #probability += math.log(value)
        probability *= value
    #return math.exp(probability / min(FIRST_LETTERS_TO_COUNT,len(word)))
    #return (probability / min(FIRST_LETTERS_TO_COUNT,len(word)))

    return probability


PROBABILITY_THRESHOLDS = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 0.2]
dataset = datasets.loadTrainingPositiveDatasetFromFile()

lettersProbabilities = calculateLettersProbabilitiesForDataset(dataset)


def wordProbabilityClassifier(lettersProbabilities, word, lettersLimit=3):
    probability = calculateWordProbability(lettersProbabilities, lettersLimit,
                                           word)
    return [probability]


def wordProbabilityReverseFeature(word):
    result = []
    # 'limits' is assumed to be defined elsewhere in the project
    # (a list of letter limits to evaluate).
    for limit in limits:
        result.extend(wordProbabilityClassifier(lettersProbabilities, word,
                                                lettersLimit=limit))
    return result
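
calculateLettersProbabilitiesForDataset is called above but not shown in this fragment. A minimal sketch, assuming it returns one letter-to-relative-frequency mapping per position; the indexing lettersProbabilities[position][letter] and the FIRST_LETTERS_TO_COUNT name come from the code above, while the value 3 is an assumption:

from collections import Counter, defaultdict

FIRST_LETTERS_TO_COUNT = 3  # assumed value; only the name appears above


def calculateLettersProbabilitiesForDataset(dataset):
    # Count how often each letter appears at each of the first positions.
    counters = [Counter() for _ in range(FIRST_LETTERS_TO_COUNT)]
    for word in dataset:
        for position in range(min(FIRST_LETTERS_TO_COUNT, len(word))):
            counters[position][word[position]] += 1

    # Convert counts to relative frequencies; defaultdict(float) yields
    # probability 0.0 for letters never seen at that position.
    lettersProbabilities = []
    for counter in counters:
        total = float(sum(counter.values()))
        positionProbabilities = defaultdict(float)
        for letter, count in counter.items():
            positionProbabilities[letter] = count / total
        lettersProbabilities.append(positionProbabilities)
    return lettersProbabilities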
Example #6
import time

import datasets  # project-local module with dataset loading helpers

# ADABOOST_DECISION_TREES_MODEL, constructWeightsForPositiveSamples and
# fitModel are defined elsewhere in the project and are not shown here.
MODEL_TO_USE = ADABOOST_DECISION_TREES_MODEL


print("Loading features...")
featuresForSamples = datasets.loadFeatures()
print("Samples: " + str(featuresForSamples.shape[0]))

print("Loading labels...")
labels = datasets.loadLabels()

# Expected class balance in the data the model will actually be applied to.
positivePartInActualDataset = 0.5
negativePartInActualDataset = 1.0 - positivePartInActualDataset

# Weight positive samples so the effective class balance seen in training
# matches the expected balance in the actual dataset.
positiveSamples = len(datasets.loadTrainingPositiveDatasetFromFile())
negativeSamples = len(datasets.loadTrainingNegativeDatasetFromFile())
positiveWeight = (negativeSamples * negativePartInActualDataset * 1.0 /
                  (positiveSamples * positivePartInActualDataset))

print("Positive weight: " + str(positiveWeight))

weights = constructWeightsForPositiveSamples(labels, positiveWeight)

print("Training model...")
start = time.time()

trainedModel = fitModel(MODEL_TO_USE, featuresForSamples, labels, weights)

end = time.time()
print("Completed in " + str(end - start))