Example #1
 def createTrainingSupervisedDataSet(self,msrcImages , scale , keepClassDistTrain):
     print "\tSplitting MSRC data into train, test, valid data sets."
     splitData = pomio.splitInputDataset_msrcData(msrcImages, scale, keepClassDistTrain)
     
     print "\tNow generating features for each training image."
     trainData = FeatureGenerator.processLabeledImageData(splitData[0], ignoreVoid=True)
     features = trainData[0]
     numDataPoints = np.shape(features)[0]
     numFeatures = np.shape(features)[1]
     labels = trainData[1]
     numLabels = np.size(labels) #!!error! nb unique labels, or max label
     assert numDataPoints == numLabels , "Number of feature data points and number of labels not equal!"
     
     dataSetTrain = ClassificationDataSet(numFeatures , numClasses)
     
     print "\tNow adding all data points to the ClassificationDataSet..."
     for idx in range(0,numDataPoints):
         feature = trainData[0][idx]
         label =  trainData[1][idx]
         
         binaryLabels = np.zeros(numClasses)
         # to cope with the removal of void class (idx 13)
         if label < voidClass:
             binaryLabels[label] = 1
         else:
             binaryLabels[label-1] = 1
             
         dataSetTrain.addSample(feature , binaryLabels) 
 
     print "\tAdded" , np.size(trainData) , " labeled data points to DataSet."
     return dataSetTrain
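
A note on the label remapping above: because the void class (index 13 in this dataset) is dropped from training, any label above the void index has to be shifted down by one before one-hot encoding, or the target vector would carry a permanently unused column. A minimal, self-contained sketch of that remapping (the values of voidClass and numClasses are assumptions for illustration):

import numpy as np

voidClass = 13   # assumed void-class index, as used in the example above
numClasses = 23  # assumed number of non-void classes

def encodeLabel(label):
    # One-hot encode a class label, skipping the removed void class.
    binaryLabels = np.zeros(numClasses)
    if label < voidClass:
        binaryLabels[label] = 1
    else:
        binaryLabels[label - 1] = 1
    return binaryLabels

assert encodeLabel(12)[12] == 1  # below void: index unchanged
assert encodeLabel(14)[13] == 1  # above void: shifted down by one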
Example #2
    def main(self):
        [dataMat, labelMat
         ] = FeatureGenerator.FeatureParser(self.messageBook, self.limitBook,
                                            self.timeFrame,
                                            self.useCachedFiles).parse()
        print "Generated Features of size: (%d,%d)" % dataMat.shape
        self.labelMat = labelMat

        count = 0

        self.predictMat = numpy.zeros((labelMat.shape))

        for obs in dataMat:  #an observation is a row in the dataMat
            self.cache.add(obs)
            if self.cache.needUpdate():
                print "huh"
                self.model = self.trainer.train(self.cache.getData())
            prediction = self.model.predict(obs)
            self.predictMat[count, 0] = prediction
            count += 1
            if count % 1000 == 0:
                print "On Obs: %d" % count
            self.trader.trade(Trader.DataPt(obs), prediction)

        self.printAnalysis()
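
The cache and trainer objects are constructed elsewhere in this class, so they are not shown. A minimal sketch of the rolling-window cache interface the loop appears to rely on (the method names add/needUpdate/getData come from the call sites above; the windowing and refresh policy are assumptions):

import numpy

class Cache(object):
    # Rolling buffer of recent observations; flags when the model is stale.
    def __init__(self, window=5000, refreshEvery=1000):
        self._data = []
        self._window = window
        self._refreshEvery = refreshEvery
        self._sinceUpdate = refreshEvery  # force a training pass up front

    def add(self, obs):
        self._data.append(obs)
        if len(self._data) > self._window:
            self._data.pop(0)  # drop the oldest observation
        self._sinceUpdate += 1

    def needUpdate(self):
        return self._sinceUpdate >= self._refreshEvery

    def getData(self):
        self._sinceUpdate = 0
        return numpy.asarray(self._data)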
Example #3
    def __init__(self, config, db=None, stats=None, dmodel=None, load_path=None, w2v=None):
        if type(config) == str or type(config) == unicode:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config

        self._dmodel = dmodel

        print "GBRT params:", self._config['hyper_patameters']
        if load_path is None:
            self._model = GradientBoostingClassifier(loss=self._config['hyper_patameters']['loss'],
                                                     learning_rate=self._config['hyper_patameters']['learning_rate'],
                                                     n_estimators=self._config['hyper_patameters']['n_estimators'],
                                                     max_depth=self._config['hyper_patameters']['max_depth'],
                                                     max_features=None)
        else:
            self.loadModel(load_path)

        self._feature_generator = \
            FeatureGenerator(mention_features=self._config['features']['mention_features'],
                             entity_features=self._config['features']['entity_features'],
                             yamada_embedding_path=ProjectSettings.getPath()[0] +
                                                   self._config['features']['yamada_embedding_path'],
                             stats=stats,
                             db=db,
                             dmodel=dmodel,
                             w2v=w2v)
        self._train_X = []
        self._train_Y = []
        self._db = db
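
The constructor accepts either an already-parsed dict or a path to a JSON file. A sketch of a config carrying the keys this example actually reads (all values are placeholders, and the misspelled 'hyper_patameters' key is kept exactly as the code expects it):

config = {
    "hyper_patameters": {          # key spelling as used by the code above
        "loss": "deviance",
        "learning_rate": 0.1,
        "n_estimators": 100,
        "max_depth": 3
    },
    "features": {
        "mention_features": [],    # placeholder feature lists
        "entity_features": [],
        "yamada_embedding_path": "embeddings/yamada.bin"  # placeholder path
    }
}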
Example #4
    def createTrainingSupervisedDataSet(self, msrcImages, scale,
                                        keepClassDistTrain):
        print "\tSplitting MSRC data into train, test, valid data sets."
        splitData = pomio.splitInputDataset_msrcData(msrcImages, scale,
                                                     keepClassDistTrain)

        print "\tNow generating features for each training image."
        trainData = FeatureGenerator.processLabeledImageData(splitData[0],
                                                             ignoreVoid=True)
        features = trainData[0]
        numDataPoints = np.shape(features)[0]
        numFeatures = np.shape(features)[1]
        labels = trainData[1]
        numLabels = np.size(labels)  #!!error! nb unique labels, or max label
        assert numDataPoints == numLabels, "Number of feature data points and number of labels not equal!"

        dataSetTrain = ClassificationDataSet(numFeatures, numClasses)

        print "\tNow adding all data points to the ClassificationDataSet..."
        for idx in range(0, numDataPoints):
            feature = trainData[0][idx]
            label = trainData[1][idx]

            binaryLabels = np.zeros(numClasses)
            # to cope with the removal of void class (idx 13)
            if label < voidClass:
                binaryLabels[label] = 1
            else:
                binaryLabels[label - 1] = 1

            dataSetTrain.addSample(feature, binaryLabels)

        print "\tAdded", np.size(trainData), " labeled data points to DataSet."
        return dataSetTrain
Example #5
def train(xml_file, con_file, dep_file, alg, concept, classifier_pickle):
	aus = get_annotation_units(xml_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	aus = instance_filter(aus, None, True, concept)
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])
	print fss_n_lists[0][0][1]
	classifier = nltk.MaxentClassifier.train(fss_n_lists[0], alg, trace=0, max_iter=1000)
	print len(classifier.labels()), classifier.labels()
	pickle_out = open(classifier_pickle, 'wb')
	pickle.dump(classifier, pickle_out)
	pickle_out.close()
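Example #6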
def featureGen():
    """
    Desc : Generate Features
    """
    print('\n ********** Feature Generator ***********')
    fileName='D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Train_Review.csv'
    mapFile='D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Business_File.csv'
    writeFile='D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Generated_Features.csv'
    oFeature=feature.featureGen()
    oFeature.getBusinesSToCato(mapFile)
    oFeature.generateFeature(fileName,writeFile)
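Example #7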
def featureGen():
    """
    Desc : Generate Features
    """
    print('\n ********** Feature Generator ***********')
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Train_Review.csv'
    mapFile = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Business_File.csv'
    writeFile = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Generated_Features.csv'
    oFeature = feature.featureGen()
    oFeature.getBusinesSToCato(mapFile)
    oFeature.generateFeature(fileName, writeFile)
Example #8
    def __init__(self,
                 config,
                 db=None,
                 stats=None,
                 models_as_features=None,
                 load_path=None,
                 w2v=None,
                 base_path="../"):
        if type(config) == str or type(config) == unicode:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config

        self.models_as_features = models_as_features

        print "GBRT params:", self._config['hyper_patameters']
        if load_path is None:
            self._model = GradientBoostingClassifier(
                loss=self._config['hyper_patameters']['loss'],
                learning_rate=self._config['hyper_patameters']
                ['learning_rate'],
                n_estimators=self._config['hyper_patameters']['n_estimators'],
                max_depth=self._config['hyper_patameters']['max_depth'],
                max_features=None)
        else:
            self.loadModel(load_path)

        self._feature_generator = \
            FeatureGenerator(feature_names=self._config['features']['feature_names'],
                             yamada_embedding_path=base_path +
                                                   self._config['features']['yamada_embedding_path'],
                               #                             yamada_id2title_path=ProjectSettings.getPath()[0] +
                               #                                                   self._config['features']['yamada_title2id_path'],
                             stats=stats,
                             db=db,
                             models_as_features=self.models_as_features,
                             w2v=w2v)
        self._train_X = []
        self._train_Y = []
        self._db = db
Example #9
def computePixelFeatures(rgbImage, ftype):
    res = []

    if ftype == 'classic':
        res = FeatureGenerator.generatePixelFeaturesForImage(rgbImage)
    else:
        raise Exception('Invalid feature type "%s"' % ftype)

    assert res.shape[1] > 0
    assert res.shape[0] == rgbImage.shape[0] * rgbImage.shape[1]
    assert np.all(np.isfinite(res))
    return res
Example #10
def computePixelFeatures( rgbImage, ftype ):
  res = []

  if ftype == 'classic':
    res = FeatureGenerator.generatePixelFeaturesForImage( rgbImage )
  else:
    raise Exception('Invalid feature type "%s"' % ftype)

  assert res.shape[1] > 0
  assert res.shape[0] == rgbImage.shape[0]*rgbImage.shape[1]
  assert np.all( np.isfinite( res ) )
  return res
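Example #11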
def predictSuperPixelLabels(classifier, image,numberSuperPixels, superPixelCompactness):
    print "\n**Computing super pixel labelling for input image"
    
    # Get superpixels
    spgraph = SuperPixels.computeSuperPixelGraph(image,'slic',[numberSuperPixels, superPixelCompactness])
    imgSuperPixelsMask = spgraph.m_labels
    imgSuperPixels = spgraph.m_nodes
    numberImgSuperPixels = len(imgSuperPixels)
    print "**Image contains", numberImgSuperPixels, "superpixels"
    
    # Get superpixel features
    superPixelFeatures = FeatureGenerator.generateSuperPixelFeatures(image, imgSuperPixelsMask, None)
    assert np.shape(superPixelFeatures)[0] == numberImgSuperPixels, "Number of superpixels in feature array != number super pixels in image!:: " + str(np.shape(superPixelFeatures)[0]) + " vs. " + str(numberImgSuperPixels)

    superPixelLabels = classifier.predict( superPixelFeatures )
    
    return (superPixelLabels, spgraph)
Example #12
def generateImagePredictionClassDist(rgbImage,
                                     classifier,
                                     requireAllClasses=True):
    """This image takes an RGB image as an (i,j,3) numpy array, a scikit-learn classifier and produces probability distribution over each pixel and class.
    Returns an (i,j,N) numpy array where N= total number of classes for use in subsequent modelling."""

    # TODO Broaden to cope with more classifiers :)
    #assert (str(type(classifier)) == "<class 'sklearn.linear_model.logistic.LogisticRegression'>") , "Check classifier type value:: " + str(type(classifier))
    testClassifier = None

    imageDimensions = rgbImage[:, :, 0].shape
    nbCols = imageDimensions[1]
    nbRows = imageDimensions[0]
    #params = classifier.get_params(deep=True)

    #print "Classifier paras::" , params

    # Take image, generate features, use classifier to predict labels, ensure normalised dist and shape to (i,j,N) np.array

    # generate predictions for the image
    # todo: replace with features.computePixelFeatures JRS
    imagePixelFeatures = FeatureGenerator.generatePixelFeaturesForImage(
        rgbImage)
    #print imagePixelFeatures
    predictedPixelLabels = classifier.predict(imagePixelFeatures)
    predictionProbs = classifier.predict_proba(imagePixelFeatures)
    print "\nShape of predicted labels::", np.shape(predictedPixelLabels)
    print "\nShape of prediction probs::", np.shape(predictionProbs)
    numClasses = pomio.getNumClasses()

    assert not requireAllClasses or \
        (np.shape(predictionProbs)[1] == numClasses or \
             np.shape(predictionProbs)[1] == numClasses+1) , \
             "Classifer prediction does not match all classes (23 or 24):: " + \
             str(np.shape(predictionProbs)[1])
    print predictionProbs

    #!!predictionProbs = np.reshape(predictionProbs, (nbCols, nbRows, numClasses ))
    print 'reshaping to ', (nbCols, nbRows, predictionProbs.shape[1])
    predictionProbs = np.reshape(predictionProbs,
                                 (nbRows, nbCols, predictionProbs.shape[1]))

    return predictionProbs
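
The reshape at the end is easy to get backwards: predict_proba returns one row per pixel in the row-major order the features were generated in, so the target shape must be (nbRows, nbCols, N) rather than (nbCols, nbRows, N); the commented-out "!!" line above records exactly that earlier bug. A small numpy sketch of why the axis order matters:

import numpy as np

nbRows, nbCols, nbClasses = 2, 3, 4
# Fake per-pixel probabilities: flat row i*nbCols + j belongs to pixel (i, j).
probs = np.arange(nbRows * nbCols * nbClasses).reshape(nbRows * nbCols, nbClasses)

cube = np.reshape(probs, (nbRows, nbCols, nbClasses))
# Pixel (1, 2) is flat row 1*3 + 2 = 5; the row-major reshape preserves that.
assert np.all(cube[1, 2, :] == probs[5, :])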
Example #13
def classify(txt_file, con_file, dep_file, concept, classifier_pickle, output_file):
	#print >> sys.stderr, "1"
	aus = get_annotation_units_from_txt(txt_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])

	#print >> sys.stderr, "2"
	pickle_in = open(classifier_pickle, 'rb')
	classifier = pickle.load(pickle_in)
	pickle_in.close()
	#labels = classifier.labels()

	#print >> sys.stderr, "3"
	fout = codecs.open(output_file, mode='w', encoding='utf-8')
	for fs, l in fss_n_lists[0]:
		prob_dist = classifier.prob_classify(fs)
		label = prob_dist.max()
		#print >> fout, '\t'.join(['%s\t%f' % (x, prob_dist.prob(x)) for x in labels])
		print >> fout, '%s\t%f' % (label, prob_dist.prob(label))
def generateImagePredictionClassDist(rgbImage, classifier, requireAllClasses=True):
    """This image takes an RGB image as an (i,j,3) numpy array, a scikit-learn classifier and produces probability distribution over each pixel and class.
    Returns an (i,j,N) numpy array where N= total number of classes for use in subsequent modelling."""
    
    # TODO Broaden to cope with more classifiers :)
    #assert (str(type(classifier)) == "<class 'sklearn.linear_model.logistic.LogisticRegression'>") , "Check classifier type value:: " + str(type(classifier)) 
    testClassifier = None
    
    imageDimensions = rgbImage[:,:,0].shape
    nbCols = imageDimensions[1]
    nbRows = imageDimensions[0]
    #params = classifier.get_params(deep=True)
    
    #print "Classifier paras::" , params
    
    # Take image, generate features, use classifier to predict labels, ensure normalised dist and shape to (i,j,N) np.array
    
    # generate predictions for the image
        # todo: replace with features.computePixelFeatures JRS
    imagePixelFeatures = FeatureGenerator.generatePixelFeaturesForImage(rgbImage)
    #print imagePixelFeatures
    predictedPixelLabels = classifier.predict(imagePixelFeatures)
    predictionProbs = classifier.predict_proba(imagePixelFeatures)
    print "\nShape of predicted labels::" , np.shape(predictedPixelLabels)
    print "\nShape of prediction probs::" , np.shape(predictionProbs)
    numClasses = pomio.getNumClasses()
    
    assert not requireAllClasses or \
        (np.shape(predictionProbs)[1] == numClasses or \
             np.shape(predictionProbs)[1] == numClasses+1) , \
             "Classifer prediction does not match all classes (23 or 24):: " + \
             str(np.shape(predictionProbs)[1])
    print predictionProbs
    
    #!!predictionProbs = np.reshape(predictionProbs, (nbCols, nbRows, numClasses ))
    print 'reshaping to ', (nbCols, nbRows, predictionProbs.shape[1] )
    predictionProbs = np.reshape(predictionProbs, (nbRows, nbCols, predictionProbs.shape[1] ))
    
    return predictionProbs
Example #15
def ARFFPrinter(aus, concept, outFile):
    #fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])
    featuresets = FeatureGenerator.get_featuresets(aus, concept)

    # calculation for header
    attDict = dict()
    for featureset in featuresets:
        for key, value in featureset[0].items():
            try: attDict[key].add(value)
            except KeyError: attDict[key] = set([value])

    attributes = attDict.keys()

    fout = open(outFile, 'w')
    
    # print header 
    print >> fout, "@relation %s"%concept
    for attribute in attributes: 
        if attribute.startswith('contain-'): dataType = '{True, False}'
        else: dataType = 'string'
        print >> fout, '@attribute "%s" %s'%(re.sub('"','\\"',attribute), dataType)
    if concept == 'CCS': classes = 'unidentifiable normalTOcancer cancerTOnormal'
    elif concept == 'PT': classes = 'observation causality'
    else: raise ValueError
    print >> fout, '@attribute %s {%s}'%(concept, classes)

    #print data
    print >> fout, "@data"
    for featureset in featuresets:
        dataLine = ""
        for attribute in attributes:
            try: dataLine += '"'+re.sub('"','\\"',unicode(featureset[0][attribute]).encode('ascii','ignore'))+'"'+','
            except KeyError: dataLine += 'False'+','
        dataLine += featureset[1]
        print >> fout, dataLine
        

    fout.close()
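
For reference, the file this function writes has the standard ARFF layout. A hand-written sketch of what a tiny CCS output could look like (the attribute names and data values are invented for illustration; the quoting mirrors the print statements above):

@relation CCS
@attribute "contain-mutation" {True, False}
@attribute "head-word" string
@attribute CCS {unidentifiable normalTOcancer cancerTOnormal}
@data
"True","p53",normalTOcancer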
Example #16
def predictSuperPixelLabels(classifier, image, numberSuperPixels,
                            superPixelCompactness):
    print "\n**Computing super pixel labelling for input image"

    # Get superpixels
    spgraph = SuperPixels.computeSuperPixelGraph(
        image, 'slic', [numberSuperPixels, superPixelCompactness])
    imgSuperPixelsMask = spgraph.m_labels
    imgSuperPixels = spgraph.m_nodes
    numberImgSuperPixels = len(imgSuperPixels)
    print "**Image contains", numberImgSuperPixels, "superpixels"

    # Get superpixel features
    superPixelFeatures = FeatureGenerator.generateSuperPixelFeatures(
        image, imgSuperPixelsMask, None)
    assert np.shape(
        superPixelFeatures
    )[0] == numberImgSuperPixels, "Number of superpixels in feature array != number super pixels in image!:: " + str(
        np.shape(superPixelFeatures)[0]) + " vs. " + str(numberImgSuperPixels)

    superPixelLabels = classifier.predict(superPixelFeatures)

    return (superPixelLabels, spgraph)
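Example #17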
def predictSuperPixelLabels(classifier, image,numberSuperPixels,  \
                                superPixelCompactness, makeProbabilities ):
    print "\n**Computing super pixel labelling for input image"
    outProbs = None

    # Get superpixels
    spgraph = superPixels.computeSuperPixelGraph(image,'slic',[numberSuperPixels, superPixelCompactness])
    imgSuperPixelsMask = spgraph.m_labels
    imgSuperPixels = spgraph.m_nodes
    numberImgSuperPixels = len(imgSuperPixels)
    print "**Image contains", numberImgSuperPixels, "superpixels"
    
    # Get superpixel features
    # todo: replace with features.computeSuperPixelFeatures JRS
    superPixelFeatures = FeatureGenerator.generateSuperPixelFeatures(image, imgSuperPixelsMask, None)
    assert np.shape(superPixelFeatures)[0] == numberImgSuperPixels, "Number of superpixels in feature array != number super pixels in image!:: " + str(np.shape(superPixelFeatures)[0]) + " vs. " + str(numberImgSuperPixels)

    superPixelLabels = classifier.predict( superPixelFeatures )
    
    if makeProbabilities:
        outProbs = classification.classProbsOfFeatures(
            superPixelFeatures, classifier, requireAllClasses=False
        )
    return (superPixelLabels, spgraph, outProbs)
Example #18
        print "\nProcessing " + str(scale*100) + \
            "% of MSRC data on a 60/20/20 split serialised for easier file IO"
        splitData = pomio.splitInputDataset_msrcData(
            msrcImages,
            datasetScale=scale,
            keepClassDistForTraining=True,
            trainSplit=0.6,
            validationSplit=0.2,
            testSplit=0.2)

        validationDataset = splitData[1]
        testDataset = splitData[2]

        if doVal:
            print "Processing validation data::"
            validationData = FeatureGenerator.processLabeledImageData(
                validationDataset, ignoreVoid=True)

        if doTest:
            print "Processing test data::"
            testingData = FeatureGenerator.processLabeledImageData(
                testDataset, ignoreVoid=True)
    else:
        # Just training data
        splitData = [msrcImages, None, None]

    # prepare training data
    trainDataset = splitData[0]

    trainLabels = None

    for idx in range(0, np.size(trainDataset)):
Example #19

if __name__ == "__main__":

    # Create network
    print "*Creating neural net"
    net = createDefaultNeuralNet()

    msrcData = "/home/amb/dev/mrf/data/MSRC_ObjCategImageDatabase_v2"

    print "\n*Creating training dataset"
    labeledData = createTrainingSupervisedDataSet(msrcData, 0.05, True)

    print "\n*Training network via backpropogation"
    trainingResult = trainNetworkBackprop(net, labeledData)

    net = trainingResult[0]
    trainer = trainingResult[1]

    predictImage = pomio.msrc_loadImages(msrcData)[1]

    print "\n*Read in an image from the MSRC dataset::", np.shape(
        predictImage.m_img)
    # todo: replace with features.computePixelFeatures JRS
    imageFeatures = FeatureGenerator.generatePixelFeaturesForImage(
        predictImage.m_img)

    print "\n*Using neural net to predict class label::"
    prediction = predictClass(imageFeatures, net)
    print prediction
Example #20
    print('Averaging weights...')
    for i in range(0, len(prev_weights)):
        prev_weights[i].divide(counter)

    return prev_weights


if __name__ == '__main__':
    train = sys.argv[1]
    test = sys.argv[2]
    out = sys.argv[3]

    T = 3

    print('Extracting training features...')
    sentences = FeatureGenerator.process_file(train)
    weights = []

    n_additional_feats = len(sentences[0][0].additional_feats)

    print('Training...')
    for tag in range(0, len(NERTag.__members__)):
        new_weight = Weight.Weight(len(NERTag.__members__))
        weights.append(new_weight)

    weights = train_viterbi_avg_perceptron(weights, T, sentences)
    #weights = train_perceptron(weights, T, sentences)

    accuracy = 0
    n_sentences = 0
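Example #21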
def getSuperPixelData(msrcImages,numberSuperPixels, superPixelCompactness):
    
    # Should probably make this a call to pomio in case the ordering changes in the future...
    voidClassLabel = pomio.getVoidIdx()
    
    numberImages = len(msrcImages)    
    
    # for each image:
    #   determine superpixel label (discard if void)
    #   compute superpixel features of valid superpixels
    #   append features to cumulative array of all super pixel features
    #   append label to array of all labels
    
    superPixelFeatures = None
    superPixelLabels = np.array([], int) # used for superpixel labels
    numberVoidSuperPixels = 0   # keep track of void superpixels

    nbClasses = pomio.getNumClasses()
    classAdjCounts = np.zeros( (nbClasses, nbClasses) )
    adjCountsTotal = 0
    adjVoidCountsTotal = 0

    for imgIdx in range(0, numberImages):
    
        superPixelIgnoreList = np.array([], int) # this is used to skip over the superpixel in feature processing
    
        print "\n**Processing Image#" , (imgIdx + 1) , " of" , numberImages
    
        # get raw image and ground truth labels
        img = msrcImages[imgIdx].m_img
        imgPixelLabels = msrcImages[imgIdx].m_gt
        
        # create superpixel map and graph for image
        spgraph = SuperPixels.computeSuperPixelGraph( img, 'slic', [numberSuperPixels, superPixelCompactness] )
        imgSuperPixelMask = spgraph.m_labels
        imgSuperPixels = spgraph.m_nodes
        numberImgSuperPixels = spgraph.getNumSuperPixels()
    
        # create superpixel exclude list & superpixel label array
        allSPClassLabels = []
        for spIdx in range(0, numberImgSuperPixels):
            
            superPixelValue = imgSuperPixels[spIdx]
            #print "\tINFO: Processing superpixel =", superPixelValue , " of" , numberImgSuperPixels, " in image"
            
            
            # Assume superpixel labels are sequence of integers
            superPixelValueMask = (imgSuperPixelMask == superPixelValue ) # Boolean array for indexing superpixel-pixels
            superPixelLabel = assignClassLabelToSuperPixel(superPixelValueMask, imgPixelLabels)
            allSPClassLabels.append( superPixelLabel)

            if(superPixelLabel == voidClassLabel):
            
                # add to ignore list, increment void count & do not add to superpixel label array
                superPixelIgnoreList = np.append(superPixelIgnoreList, superPixelValue)
                numberVoidSuperPixels = numberVoidSuperPixels + 1
                
            else:
                superPixelLabels = np.append(superPixelLabels, superPixelLabel)
        
        assert len(allSPClassLabels) == numberImgSuperPixels
        (theseClassAdjCounts,adjVoidCount,adjCount) = spgraph.countClassAdjacencies( nbClasses, allSPClassLabels )
        classAdjCounts     += theseClassAdjCounts
        adjCountsTotal     += adjCount
        adjVoidCountsTotal += adjVoidCount

        # Now we have the superpixel labels, and an ignore list of void superpixels - time to get the features!
        imgSuperPixelFeatures = FeatureGenerator.generateSuperPixelFeatures(img, imgSuperPixelMask, excludeSuperPixelList=superPixelIgnoreList)
        
        if superPixelFeatures is None:
            superPixelFeatures = imgSuperPixelFeatures
        else:
            # stack the superpixel features into a single list
            superPixelFeatures = np.vstack( [ superPixelFeatures, imgSuperPixelFeatures ] )
    
    
    assert np.shape(superPixelFeatures)[0] == np.shape(superPixelLabels)[0] , "Number of samples != number labels"
    print "\n**Processed total of" , numberImages, "images"
    print "  %d out of %d adjacencies were ignored due to void (%.2f %%)" % \
        (adjVoidCountsTotal, adjCountsTotal, \
             100.0*adjVoidCountsTotal/adjCountsTotal)

    # Now return the results
    return [ superPixelFeatures, superPixelLabels, classAdjCounts ]
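Example #22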
    adjProbs += 10.0
    # Now turn it into normalised probabilities.
    # todo: hey but this is not normalised for default class probability!
    adjProbs /= adjProbs.sum()
    # transform
    adjProbs = -np.log( adjProbs )
else:
    adjProbs = None

# prefer to merge regions with high degree
if args.nbrPotentialMethod == 'adjacencyAndDegreeSensitive':
    assert adjProbs is not None, 'You asked for neighbour potential method "%s", but no adjacency probs specified'\
        % args.nbrPotentialMethod

print 'Computing superpixel features...'
ftrs = FeatureGenerator.generateSuperPixelFeatures( imgRGB, spix.m_labels, [] )

print 'Computing class probabilities...'
classProbs = bonzaClass.classProbsOfFeatures(ftrs,clfr,\
                                                 requireAllClasses=False)

if args.verbose:
    plt.interactive(1)

    if adjProbs is not None:
        plt.figure()
        plt.imshow(np.log(1+adjProbs), cmap=cm.get_cmap('gray'), interpolation='none')
        plt.title('Adjacency probabilities')
        plt.waitforbuttonpress()

    plt.figure()
Example #23
def n_fold_test(n_folds, xml_file, con_file, dep_file, alg, concept, classification_method, multiple_cancer_terms, unique_pmids, dup_pmids_in_one_fold, classifier_pickle):
	# instance filtering according to the options
	aus = get_annotation_units(xml_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	aus = instance_filter(aus, classification_method, multiple_cancer_terms, concept)

	# divide into n sets
	n_lists = fold_divider(n_folds, aus, unique_pmids, dup_pmids_in_one_fold)

	# convert annotation units into feature sets
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), n_lists)
	print fss_n_lists[0][0][1]

	# N-fold cross validation
	results = []
	classifiers = []
	#threads = []
	for i in range(n_folds):
		results.append(0)
		classifiers.append(0)

	start = time.time()
	for i in range(len(fss_n_lists)):
		one_fold_test(i, fss_n_lists, results, classifiers, alg)
		#threads.append(threading.Thread(target=one_fold_test, args=(i, fss_n_lists, results, classifiers, alg)))
		#threads[i].start()
	#for i in range(len(fss_n_lists)):
	#	threads[i].join()
	
	print '#fold\taccuracy\ttrain_time\ttest_time'
	for i in range(len(fss_n_lists)):
		print 'fold_%s\t%s\t%s\t%s'%(i, results[i][0], results[i][1], results[i][2]), results[i][3]

	acc_sum, t_train_sum, t_test_sum = reduce(lambda x, y: (x[0]+y[0],x[1]+y[1],x[2]+y[2]),results)
	print 'average\t%s\t%s\t%s\t' % (float(acc_sum/n_folds), float(t_train_sum/n_folds), float(t_test_sum/n_folds))
	print 'total elapsed time: %d' % (time.time()-start)

	# for excel
	print 'accuracy'
	for i in range(len(fss_n_lists)):
		print results[i][0]
	print float(acc_sum/n_folds)

	classes = list()
	for i in range(len(results[0][3])):
		classes.append(results[0][3][i][0])
	for clas in classes:
		print clas
		print 'precision'
		for i in range(len(fss_n_lists)):
			for numbers in results[i][3]:
				if numbers[0] == clas: print numbers[1]
		print 'recall'
		for i in range(len(fss_n_lists)):
			for numbers in results[i][3]:
				if numbers[0] == clas: print numbers[2]
		print 'f'
		for i in range(len(fss_n_lists)):
			for numbers in results[i][3]:
				if numbers[0] == clas: print numbers[3]
	
	
	pickle_out = open(classifier_pickle, 'wb')
	pickle.dump(classifiers, pickle_out)
	pickle_out.close()
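Example #24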
ex = data[0]


plt.figure()
plt.imshow(ex.m_img)
plt.title('original image')

plt.figure()
clrs = [[z/255.0 for z in c[1]] for c in pomio.msrc_classToRGB]
pomio.showLabels( ex.m_gt )
plt.title('ground truth labels' )

print 'unique class labels: ', np.unique(ex.m_gt)

# generate features
imagePixelFeatures = FeatureGenerator.generatePixelFeaturesForImage(ex.m_img)
# For each feature, how many distinct values?
for i in range(imagePixelFeatures.shape[1]):
    print "Feature %d has %d distinct values" \
        % (i, len(np.unique( imagePixelFeatures[:,i])) )

# Plot a selection of features
sel = np.arange(26,30)
# sel = range(80,86)

# colours have to be on range 0-1
plt.figure()
# just plot some of the data for clarity
nbpts = 2000
ptsz = 5
whichPts = np.random.choice( imagePixelFeatures.shape[0], nbpts, replace=False)
Example #25
class GBRTModel:
    def __init__(self, config, db=None, stats=None, dmodel=None, load_path=None, w2v=None):
        if type(config) == str or type(config) == unicode:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config

        self._dmodel = dmodel

        print "GBRT params:", self._config['hyper_patameters']
        if load_path is None:
            self._model = GradientBoostingClassifier(loss=self._config['hyper_patameters']['loss'],
                                                     learning_rate=self._config['hyper_patameters']['learning_rate'],
                                                     n_estimators=self._config['hyper_patameters']['n_estimators'],
                                                     max_depth=self._config['hyper_patameters']['max_depth'],
                                                     max_features=None)
        else:
            self.loadModel(load_path)

        self._feature_generator = \
            FeatureGenerator(mention_features=self._config['features']['mention_features'],
                             entity_features=self._config['features']['entity_features'],
                             yamada_embedding_path=ProjectSettings.getPath()[0] +
                                                   self._config['features']['yamada_embedding_path'],
                             stats=stats,
                             db=db,
                             dmodel=dmodel,
                             w2v=w2v)
        self._train_X = []
        self._train_Y = []
        self._db = db

    def getPredictor(self):
        return PointwisePredict(self)

    def predict(self, mention, candidate1, candidate2=None):
        if candidate2 is not None:
            raise "Unsupported operation"
        # create feature_vec from mention and candidate and predic prob for pointwise predictor
        feature_vec = self._feature_generator.getPointwiseFeatures(mention, candidate1)
        Y = self._model.predict_proba(np.asarray(feature_vec).reshape(1, -1))
        with open('feature_set.txt', 'a') as f:
            f.write('     -> ' + str(Y[0][1]) + '\n')
        return Y[0][1]

    def train(self, mention, candidate1, candidate2, correct):
        '''
        Gathers mention and candidate features into the training arrays
        :param mention:
        :param candidate1: the candidate being scored
        :param candidate2: unused, expected to be None
        :param correct: the gold entity for the mention
        :return: only accumulates _train_X / _train_Y
        '''
        self._train_X.append(self._feature_generator.getPointwiseFeatures(mention, candidate1))
        self._train_Y.append(1.0 if correct == candidate1 else 0.0)

    def finalize(self):
        '''
        trains the model over accumulated _train_df
        :return:
        '''

        trainX = np.array(self._train_X)
        trainy = np.array(self._train_Y)
        print "fitting gbrt model (", len(self._train_Y), "samples)"
        self._model.fit(trainX, trainy)

    def saveModel(self, fname):
        pickle.dump(self._model, open(fname + ".gbrtmodel", "wb"))

    def loadModel(self, fname):
        self._model = pickle.load(open(fname + ".gbrtmodel", "rb"))
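Example #26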
# get particular image we like
ex = data[0]

plt.figure()
plt.imshow(ex.m_img)
plt.title('original image')

plt.figure()
clrs = [[z / 255.0 for z in c[1]] for c in pomio.msrc_classToRGB]
pomio.showLabels(ex.m_gt)
plt.title('ground truth labels')

print 'unique class labels: ', np.unique(ex.m_gt)

# generate features
imagePixelFeatures = FeatureGenerator.generatePixelFeaturesForImage(ex.m_img)
# For each feature, how many distinct values?
for i in range(imagePixelFeatures.shape[1]):
    print "Feature %d has %d distinct values" \
        % (i, len(np.unique( imagePixelFeatures[:,i])) )

# Plot a selection of features
sel = np.arange(12, 29, 2)
# sel = range(80,86)

# colours have to be on range 0-1
plt.figure()
# just plot some of the data for clarity
nbpts = 2000
ptsz = 5
whichPts = np.random.choice(imagePixelFeatures.shape[0], nbpts, replace=False)
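Example #27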
    adjProbs += 10.0
    # Now turn it into normalised probabilities.
    # todo: hey but this is not normalised for default class probability!
    adjProbs /= adjProbs.sum()
    # transform
    adjProbs = -np.log(adjProbs)
else:
    adjProbs = None

# prefer to merge regions with high degree
if args.nbrPotentialMethod == 'adjacencyAndDegreeSensitive':
    assert adjProbs is not None, 'You asked for neighbour potential method "%s", but no adjacency probs specified'\
        % args.nbrPotentialMethod

print 'Computing superpixel features...'
ftrs = FeatureGenerator.generateSuperPixelFeatures(imgRGB, spix.m_labels, [])

print 'Computing class probabilities...'
classProbs = bonzaClass.classProbsOfFeatures(ftrs,clfr,\
                                                 requireAllClasses=False)

if args.verbose:
    plt.interactive(1)

    if adjProbs is not None:
        plt.figure()
        plt.imshow(np.log(1 + adjProbs),
                   cmap=cm.get_cmap('gray'),
                   interpolation='none')
        plt.title('Adjacency probabilities')
        plt.waitforbuttonpress()
Example #28
def getSuperPixelData(msrcImages, numberSuperPixels, superPixelCompactness):

    # Should probably make this a call to pomio in case the ordering changes in the future...
    voidClassLabel = pomio.getVoidIdx()

    numberImages = len(msrcImages)

    # for each image:
    #   determine superpixel label (discard if void)
    #   compute superpixel features of valid superpixels
    #   append features to cumulative array of all super pixel features
    #   append label to array of all labels

    superPixelFeatures = None
    superPixelLabels = np.array([], int)  # used for superpixel labels
    numberVoidSuperPixels = 0  # keep track of void superpixels

    nbClasses = pomio.getNumClasses()
    classAdjCounts = np.zeros((nbClasses, nbClasses))
    adjCountsTotal = 0
    adjVoidCountsTotal = 0

    for imgIdx in range(0, numberImages):

        superPixelIgnoreList = np.array(
            [], int
        )  # this is used to skip over the superpixel in feature processing

        print "\n**Processing Image#", (imgIdx + 1), " of", numberImages

        # get raw image and ground truth labels
        img = msrcImages[imgIdx].m_img
        imgPixelLabels = msrcImages[imgIdx].m_gt

        # create superpixel map and graph for image
        spgraph = SuperPixels.computeSuperPixelGraph(
            img, 'slic', [numberSuperPixels, superPixelCompactness])
        imgSuperPixelMask = spgraph.m_labels
        imgSuperPixels = spgraph.m_nodes
        numberImgSuperPixels = spgraph.getNumSuperPixels()

        # create superpixel exclude list & superpixel label array
        allSPClassLabels = []
        for spIdx in range(0, numberImgSuperPixels):

            superPixelValue = imgSuperPixels[spIdx]
            #print "\tINFO: Processing superpixel =", superPixelValue , " of" , numberImgSuperPixels, " in image"

            # Assume superpixel labels are sequence of integers
            superPixelValueMask = (
                imgSuperPixelMask == superPixelValue
            )  # Boolean array for indexing superpixel-pixels
            superPixelLabel = assignClassLabelToSuperPixel(
                superPixelValueMask, imgPixelLabels)
            allSPClassLabels.append(superPixelLabel)

            if (superPixelLabel == voidClassLabel):

                # add to ignore list, increment void count & do not add to superpixel label array
                superPixelIgnoreList = np.append(superPixelIgnoreList,
                                                 superPixelValue)
                numberVoidSuperPixels = numberVoidSuperPixels + 1

            else:
                superPixelLabels = np.append(superPixelLabels, superPixelLabel)

        assert len(allSPClassLabels) == numberImgSuperPixels
        (theseClassAdjCounts, adjVoidCount,
         adjCount) = spgraph.countClassAdjacencies(nbClasses, allSPClassLabels)
        classAdjCounts += theseClassAdjCounts
        adjCountsTotal += adjCount
        adjVoidCountsTotal += adjVoidCount

        # Now we have the superpixel labels, and an ignore list of void superpixels - time to get the features!
        imgSuperPixelFeatures = FeatureGenerator.generateSuperPixelFeatures(
            img, imgSuperPixelMask, excludeSuperPixelList=superPixelIgnoreList)

        if superPixelFeatures is None:
            superPixelFeatures = imgSuperPixelFeatures
        else:
            # stack the superpixel features into a single list
            superPixelFeatures = np.vstack(
                [superPixelFeatures, imgSuperPixelFeatures])

    assert np.shape(superPixelFeatures)[0] == np.shape(
        superPixelLabels)[0], "Number of samples != number labels"
    print "\n**Processed total of", numberImages, "images"
    print "  %d out of %d adjacencies were ignored due to void (%.2f %%)" % \
        (adjVoidCountsTotal, adjCountsTotal, \
             100.0*adjVoidCountsTotal/adjCountsTotal)

    # Now return the results
    return [superPixelFeatures, superPixelLabels, classAdjCounts]
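
A side note on the accumulation pattern: np.append and np.vstack inside the loop copy the whole accumulated array once per image, which is quadratic overall. A short sketch of the usual alternative, collecting per-image blocks in Python lists and stacking once at the end (the block sizes here are made up):

import numpy as np

# Fake per-image feature blocks of varying sizes, 3 features each.
blocks = [np.random.rand(n, 3) for n in (5, 2, 7)]

featureList = []
labelList = []
for blk in blocks:
    featureList.append(blk)                        # no array copy per image
    labelList.append(np.zeros(blk.shape[0], int))  # placeholder labels

superPixelFeatures = np.vstack(featureList)        # single copy at the end
superPixelLabels = np.concatenate(labelList)
assert superPixelFeatures.shape[0] == superPixelLabels.shape[0]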
Example #29
    



if __name__ == "__main__":
    
    # Create network
    print "*Creating neural net"
    net = createDefaultNeuralNet()
    
    msrcData = "/home/amb/dev/mrf/data/MSRC_ObjCategImageDatabase_v2"
    
    print "\n*Creating training dataset"
    labeledData = createTrainingSupervisedDataSet(msrcData, 0.05, True) 
    
    print "\n*Training network via backpropogation"
    trainingResult = trainNetworkBackprop(net, labeledData)
    
    net = trainingResult[0]
    trainer = trainingResult[1]
    
    predictImage = pomio.msrc_loadImages(msrcData)[1]
    
    print "\n*Read in an image from the MSRC dataset::" , np.shape(predictImage.m_img)
    imageFeatures = FeatureGenerator.generatePixelFeaturesForImage(predictImage.m_img)
    
    print "\n*Using neural net to predict class label::"
    prediction = predictClass(imageFeatures, net)
    print prediction
    
Example #30
class GBRTModel:
    def __init__(self,
                 config,
                 db=None,
                 stats=None,
                 models_as_features=None,
                 load_path=None,
                 w2v=None):
        if type(config) == str or type(config) == unicode:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config

        self.models_as_features = models_as_features

        print "GBRT params:", self._config['hyper_patameters']
        if load_path is None:
            self._model = GradientBoostingClassifier(
                loss=self._config['hyper_patameters']['loss'],
                learning_rate=self._config['hyper_patameters']
                ['learning_rate'],
                n_estimators=self._config['hyper_patameters']['n_estimators'],
                max_depth=self._config['hyper_patameters']['max_depth'],
                max_features=None)
        else:
            self.loadModel(load_path)

        self._feature_generator = \
            FeatureGenerator(feature_names=self._config['features']['feature_names'],
                             yamada_embedding_path=ProjectSettings.getPath()[0] +
                                                   self._config['features']['yamada_embedding_path'],
                               #                             yamada_id2title_path=ProjectSettings.getPath()[0] +
                               #                                                   self._config['features']['yamada_title2id_path'],
                             stats=stats,
                             db=db,
                             models_as_features=self.models_as_features,
                             w2v=w2v)
        self._train_X = []
        self._train_Y = []
        self._db = db

    def getPredictor(self):
        return PointwisePredict(self)

    def predict(self, mention, candidates):
        max_score = -1
        max_entity = None
        ret = dict()
        for candidate in candidates:
            # create feature_vec from mention and candidate and predict prob for pointwise predictor
            feature_vec = self._feature_generator.getFeatureVector(
                mention, candidate)
            Y = self._model.predict_proba(
                np.asarray(feature_vec).reshape(1, -1))
            with open('feature_set.txt', 'a') as f:
                f.write('     -> ' + str(Y[0][1]) + '\n')
            ret[candidate] = Y[0][1]
        return ret

    def train(self, mention, candidate, is_correct):
        '''
        Gathers mention and candidate features into the training arrays
        :param mention:
        :param candidate: the candidate being scored
        :param is_correct: whether candidate is the gold entity for mention
        :return: only accumulates _train_X / _train_Y
        '''
        self._train_X.append(
            self._feature_generator.getFeatureVector(mention, candidate))
        self._train_Y.append(1 if is_correct else 0)

    def is_trainable(self, candidate):
        return True

    def finalize(self):
        '''
        trains the model over accumulated _train_df
        :return:
        '''

        trainX = np.array(self._train_X)
        trainy = np.array(self._train_Y)
        print "fitting gbrt model (", len(self._train_Y), "samples)"
        self._model.fit(trainX, trainy.reshape(trainy.shape[0], ))

    def saveModel(self, fname):
        pickle.dump(self._model, open(fname + ".gbrtmodel", "wb"))

    def loadModel(self, fname):
        self._model = pickle.load(open(fname + ".gbrtmodel", "rb"))
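
A hedged usage sketch for the class above (config is a dict with the 'hyper_patameters' and 'features' keys read in the constructor; db, stats, and the training_pairs iterable are placeholders, while the method names come from the class itself):

model = GBRTModel(config, db=db, stats=stats)

# Accumulate labelled (mention, candidate) pairs, then fit once.
for mention, candidate, is_correct in training_pairs:  # placeholder iterable
    model.train(mention, candidate, is_correct)
model.finalize()
model.saveModel('gbrt_run1')        # writes gbrt_run1.gbrtmodel

predictor = model.getPredictor()    # PointwisePredict wrapper for ranking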
Example #31
        splitData = pomio.splitInputDataset_msrcData(
            msrcImages,
            datasetScale=scale,
            keepClassDistForTraining=True,
            trainSplit=0.6,
            validationSplit=0.2,
            testSplit=0.2
            )
        
        validationDataset = splitData[1]
        testDataset = splitData[2]
        
        
        if doVal:
            print "Processing validation data::"
            validationData = FeatureGenerator.processLabeledImageData(validationDataset, ignoreVoid=True)
        
        if doTest:
            print "Processing test data::"
            testingData = FeatureGenerator.processLabeledImageData(testDataset, ignoreVoid=True)
    else:
        # Just training data
        splitData = [msrcImages,None,None]

    # prepare training data
    trainDataset = splitData[0]
    
    trainLabels = None
    
    for idx in range(0, np.size(trainDataset)):
        if trainLabels is None: