Example #1
def TestPredictionIntensity():
    C = 8192
    gamma = 0.0001220703125
    windowSize = 12
    df = ReadCSVPandas('Close_Values.csv')

    fe = FeatureExtractor()
    feature = fe.ExtractIntensity(df['BVMF:BBDC4'])

    fig = plt.figure()
    ax = plt.subplot(111)
    ax.plot(feature, color='r')
    ax.plot(df['BVMF:BBDC4'], color='k')
    plt.show()
    ts = np.array(feature)

    trainingPeriod = windowSize * 30
    testingPeriod = 5

    svm = PredSVM(ts[0:trainingPeriod], 'rbf', C, gamma)
    svm.GenerateTrainingDataset(windowSize)
    svm.Train()
    svm.GridSearch()

    testY = svm.PredictNextN(testingPeriod)
    PlotResults(ts[trainingPeriod:trainingPeriod + testingPeriod], testY)

    plt.show()
Example #2
def test_epsilonNeighbor():
    x_scipySparse = None; train_set_x = None; numInstances = 0; numFeatures = 0;
    if((os.path.exists("input_scipySparse.obj"))):
        print "loading sparse data from pickled file..."
        f = open("input_scipySparse.obj", 'r')
        x_scipySparse = cPickle.load(f)
        f.close()
        numInstances, numFeatures = x_scipySparse.shape
        
    else: 
        print "extracting features and building sparse data..."
        fe = FeatureExtractor()  
        fe.extractFeatures()
        train_set_x = fe.instanceList
        featureDict = fe.featDict   
        numInstances = len(train_set_x)
        numFeatures = len(featureDict)        
        x_lil = sp.lil_matrix((numInstances,numFeatures), dtype='float32') # the data is presented as a sparse matrix 
        i = -1; v = -1;
        try:
            for i,instance in enumerate(train_set_x):
                for v in instance.input:
                    x_lil[i, v] = 1
        except:
            print "i=",i," v=",v
        x_scipySparse = x_lil.tocsc()
        f = open("input_scipySparse.obj", 'w')
        cPickle.dump(x_scipySparse, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    epsilonNeighbor(x_scipySparse)
Example #3
    def process(self, directory, output, feature_type):

        start = datetime.datetime.now()

        for root, subFolders, filenames in os.walk(directory):

            for filename in fnmatch.filter(filenames, self.h5Regex):

                candfile = os.path.join(root, filename)
                cand = Candidate.load_hdf5(str(candfile))

                fe = FeatureExtractor()

                features = fe.getfeatures(cand, feature_type)
                features.append("?")

                self.storeFeature(features,candfile)

        outputText = ""

        for f in self.FeatureStore:

            outputText += f + "\n"

        outputFile = open(output, 'a')
        outputFile.write(str(outputText))
        outputFile.close()

        end = datetime.datetime.now()
        print 'Processing time = ',str(end-start)
Example #4
def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            relationships = fe.process_classify(line)

            for r in relationships:
                rel = r[0]
                shingles = r[1]

                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)

                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence, types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
Example #5
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()

            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)

            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)

            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
Example #6
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()

            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)

            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)

            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
Example #7
class QClassifierImpl:
    """
    A wrapper for question classifier
    """

    def __init__(self, train_data_path, pred_qs = None):
        """
        Constructor
        """
        logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data
        """
        self.extractor.load(path = self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions = self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get type for a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
Example #8
def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()

            relationships = fe.process_classify(line)

            for r in relationships:
                rel = r[0]
                shingles = r[1]

                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)

                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence,
                                    types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)

        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
Example #9
    def test_extract_features_4x4_returns_correct_dimensions_and_colour(self):
        input_image_df = pd.read_csv(io.StringIO("label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,pixel11,pixel12,pixel13,pixel14,pixel15\n0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0"))
        feature_extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 1)

        features = feature_extractor.extract_features(input_image_df)

        self.assertEqual((1,15), features.shape)
        self.assertTrue(pd.DataFrame([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]).compare(features).empty)
Example #10
 def __init__(self):
     print('Initializing detector...')
     self.classifier = LinearSVC(verbose=True)
     self.X_scaler = StandardScaler()
     self.feature_extractor = FeatureExtractor(color_space='YCrCb',
                                               orient=9,
                                               hog_channel='ALL')
     self.last_detections = deque(maxlen=20)
Example #11
def get_input_file_features(img_filepath, feature_detector='orb'):
    if feature_detector not in ('fast', 'orb'):
        warnings.warn(
            '\nFeature detector Warning: No feature detector selected, \
\'fast\' feature detection will be applied')
    params = {'feature_detector': feature_detector, 'desc_vector': 'orb'}
    img_features = FeatureExtractor(img_filepath)
    sample_descr_vector = img_features.feature_extractor()
    return sample_descr_vector
Example #12
 def __init__(self, fileloc, clf, ss=None):
     self.clf = clf
     self.standardizer = ss
     self.file_loc = fileloc
     self.file_handler = FileHandler(self.file_loc)
     self.file_handler.set_file_extensions((".wav"))
     self.file_handler.create_all_file_list()
     self.file_handler.split_train_test()
     self.extractor = FeatureExtractor()
Example #13
 def test_identity(self):
     feature_extractor = FeatureExtractor(self.data_set.train_df,
                                          scaler=None)
     feature_extractor.fit()
     X_coded, y = feature_extractor.eval()
     self.assertEqual(X_coded.shape[0], self.data_set.train_df.shape[0])
     self.assertEqual(X_coded.shape[1] + 1, self.data_set.train_df.shape[1])
     self.assertTrue(
         np.array_equal(X_coded, self.data_set.train_df.iloc[:, :178]))
     self.assertTrue(np.array_equal(y, self.data_set.train_df.iloc[:, 178]))
Example #14
def prepare_full_feature():
    image_base_path = "../images/kyoto/"
    model_path = "/home/ge/tests/vgg16_weights.h5"
    FeatureExtractor.initialize(model_path)
    images = np.zeros((500, 4096))
    for i in range(500):
        img = imread(image_base_path + str(i) + ".jpg")
        feature = FeatureExtractor.feature(img)
        images[i, :] = feature

    np.save("../mid-data/full_feature.npy", images)
Example #15
 def __init__(self,debugFlag):
     """
     Default constructor.
     
     Parameters:
     
     debugFlag     -    the debugging flag. If set to True, then detailed
                        debugging messages will be printed to the terminal
                        during execution.
     """
     FeatureExtractor.__init__(self,debugFlag)
Example #16
    def test_extract_features_x10_full_size_returns_correct_features(self):
        input_image_df = pd.read_csv("Data/train.csv", nrows=1)
        feature_extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 2)

        features = feature_extractor.extract_features(input_image_df)

        self.assertEqual((1, 75), features.shape)
        self.assertTrue(pd.DataFrame([[0, 15.93750, 0, 0, 63.75, 0, 0, 63.75, 0, 1, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       0, ]]).iloc[0,:].compare(features.iloc[0,:]).empty)
Example #17
 def add(self, ts, pack):
     if self.slot_id == 0:
         self.slot_id = ts
     if not self.accumulate:
         self._writer.log_record(self.slot_id, 0, self._dev_fea)  # # call writer
         self.slot_id = ts
         del self._dev_fea
         self._dev_fea = DevFeatures.copy_from_devdata(self._oracle)
         self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
         return
     self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
     pass
Example #18
def prepare_feature():
    model_path = "/home/ge/tests/vgg16_weights.h5"
    proposal_path = "../mid-data/proposals.npy"
    image_base_path = "../images/kyoto/"
    feature_path = "../mid-data/feature/"

    proposals = np.load(proposal_path)
    FeatureExtractor.initialize(model_path)
    for i in range(10, 500):
        img = imread(image_base_path + str(i) + ".jpg")
        feature = FeatureExtractor.iterate_feature(proposals[i], img, axis=1)
        np.save(feature_path + str(i) + ".npy", feature)
Example #19
def init():
    bot_id = '1437569240:AAEd2sZ0faC1EwPvQGJPPW4xf7ohP1hTzV8'
    updater = Updater(bot_id)
    updater.setPhotoHandler(imageHandler)

    QualityChecker.init()
    ShoeDetector.init()
    FeatureExtractor.init()
    data_structure = Indexer.build_data_structure(config.DATASET_PATH)
    Matcher.init(data_structure)

    print("Bot is running...")
    updater.start()
Example #20
    def classificationValidation(self,test_list, kmeans_path, kernel, C, gamma):
        '''
        Main classification validation function to validate the model on a test set.
        :param test_list: list of paths where the test images are held.
        :param kmeans_path: path to the saved KMeans classifier used for the binarization task.
        :param kernel, C, gamma: hyper-parameters for the SVC classifier.
        '''
        if gamma == None:
            clf = SVC(C=C,kernel=kernel)
        else: 
            clf = SVC(C=C,gamma=gamma,kernel=kernel)

        print "kernel: " + kernel
        print "gamma: " + str(gamma)
        print "C: " + str(C)

        clf.fit(self.X,self.y)

        results_vector = []
        y_true = []
        cl=0

        k_means = joblib.load(kmeans_path)

        [m,num_of_clusters] = np.shape(self.X)

        for path in test_list:
            for item in os.listdir(path): 
                p = path + "/" + item
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = np.zeros(num_of_clusters)
                raw_vector = fe.computeFeatureVector()
                Km_vector = k_means.predict(raw_vector) 
                for k in range(len(Km_vector)):
                    feature_vector[Km_vector[k]] = feature_vector[Km_vector[k]] + 1 

                res = clf.predict(feature_vector)
                
                # Debugging                    
                if res[0] == 1:
                    print p + " is not a foram!"
                if res[0] == 0:
                    print p + " is a foram!"

                y_true.append(cl)
                results_vector.append(res[0])
            cl = cl + 1

        print "confusion_matrix"
        print confusion_matrix(y_true,results_vector)
Example #21
def processFile(x, dir_name, files):
    print dir_name
    print files
    for f in files:
        path = os.path.join(dir_name, f)
        if f.startswith('.') or not os.path.isfile(path):
            continue
        print 'Processing ' + path
        fe = FeatureExtractor()
        fe.setup(path)
        extractedFeatures = fe.getAllFeatures()
        sys.stdout.write('Finished extracting features!' + "\n")
        data = {f: extractedFeatures}
        pickle.dump(data, open("extracted_features.pickle", "a"))
Example #22
    def extract_features(self, vehicles, non_vehicles):
        '''
        Extract features for the two lists containing vehicle and non-vehicle image paths respectively
        :param vehicles: list of paths to vehicle images
        :param non_vehicles: list of paths to non-vehicle images
        :return: scaled_X: normalised feature vector, y: true labels (1 = vehicle, 0 = non-vehicle)
        '''
        '''Load training set images and extract features'''
        self.feature_extractor = FeatureExtractor()

        print("Loading images and extracting features...")
        t = time.time()
        vehicle_features = self.feature_extractor.extract_features(
            vehicles,
            cspace=CSPACE,
            spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
            hist_bins=HIST_BIN,
            hist_range=HIST_RANGE,
            hog_cell_per_block=HOG_CELL_PER_BLOCK,
            hog_channel=HOG_CHANNEL,
            hog_pix_per_cell=HOG_PIX_PER_CELL,
            hog_orient=HOG_ORIENT_BINS)

        non_vehicle_features = self.feature_extractor.extract_features(
            non_vehicles,
            cspace=CSPACE,
            spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
            hist_bins=HIST_BIN,
            hist_range=HIST_RANGE,
            hog_cell_per_block=HOG_CELL_PER_BLOCK,
            hog_channel=HOG_CHANNEL,
            hog_pix_per_cell=HOG_PIX_PER_CELL,
            hog_orient=HOG_ORIENT_BINS)

        # Create an array stack of all feature vectors and scale the resulting feature vector
        X = np.vstack(
            (vehicle_features, non_vehicle_features)).astype(np.float64)
        self.X_scaler = StandardScaler().fit(X)
        scaled_X = self.X_scaler.transform(X)

        # Define the labels vector (1 = vehicle, 0 = non-vehicle)
        y = np.hstack((np.ones(len(vehicle_features)),
                       np.zeros(len(non_vehicle_features))))
        t2 = time.time()

        print('Number of features: {}'.format(scaled_X.shape[1]))
        print('Feature extraction time: {}'.format(round(t2 - t, 2)))

        return scaled_X, y
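# Usage sketch (not part of the original example): `detector` is assumed to be an
# instance of the class above, and the glob patterns are placeholder dataset paths.
import glob

vehicles = glob.glob('data/vehicles/*/*.png')
non_vehicles = glob.glob('data/non-vehicles/*/*.png')

# Returns the scaled feature matrix and the label vector (1 = vehicle, 0 = non-vehicle).
scaled_X, y = detector.extract_features(vehicles, non_vehicles)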
Example #23
def run():
        connection = PgSQL.connect(user = "******", database = ClassifierConfig.DatabaseName );
        db = DocumentsDatabase(connection, 
                               WorkingDbConfig['DocTagsTable'], 
                               WorkingDbConfig['RawDocTable'], 
                               WorkingDbConfig['TagsTable'], 
                               WorkingDbConfig['DocumentsTable'] );
        extractor = FeatureExtractor(True);
        docIds = [1,2,3,4,5];
# TODO: documents that breaks parser
 #       docIds = [247, 1070198, 619547];
#        docIds = [247,145698,42027];
        docs = db.getDocumentsContent(docIds);
        for (id, header, content, tags) in docs:
                print extractor.processText(id, header, content);
Example #24
 def add(self, ts, pack):
     if self.slot_id == 0:
         self.slot_id = ts
     if ts > self.slot_id + self.sd or ts < self.slot_id:
         self._writer.log_record(self.slot_id, self.sd,
                                 self._dev_fea)  # # call writer
         self.slot_id = ts
         del self._dev_fea
         self._dev_fea = DevFeatures.copy_from_devdata(self._oracle)
         self._dev_fea = FeatureExtractor().extract_feas(
             self._dev_fea, ts, pack)
         return
     self._dev_fea = FeatureExtractor().extract_feas(
         self._dev_fea, ts, pack)
     pass
Example #25
def _test():
    darknet = FeatureExtractor(is_training=True, img_size=None, model='yolov2')
    darknet.model.summary()

    mobilenet = FeatureExtractor(is_training=True,
                                 img_size=None,
                                 model='mobilenet')
    mobilenet.model.summary()

    densenet = FeatureExtractor(
        is_training=True,
        img_size=None,
        model='densenet',
        model_path='../weights/feature_extractor/densenet201.h5')
    densenet.model.summary()
Example #26
    def extract_and_predict(self, image, featureSet=None):
        # Use the classifier's configured feature set
        if not featureSet:
            featureSet = self.featureSet

        extractedFeatures = FeatureExtractor.extract(image, featureSet)
        return self.model.predict([extractedFeatures])
Example #27
    def load_model(self, name):
        '''
        Load a trained model from disc

        :param name: name of the model ("_model.pkl" will be added to the name)
        '''
        self.__init__(self.trained_model_path)

        # Load the trained classifier and the scaler
        self.clf = joblib.load(self.trained_model_path + '/' + name +
                               '_model.pkl')
        self.X_scaler = joblib.load(self.trained_model_path + '/' + name +
                                    '_scaler.pkl')
        self.feature_extractor = FeatureExtractor()

        self.trained = True
Example #28
    def createClassificationTrainingFromDataset(self, dataset_name, labels_list, path_list):
        '''
        Creates a new training set to work on from given path list and labels.
        Notice that path_list and labels_list are intended to be lists of the same length; see tests in __main__ for examples.
        :param dataset_name: the name of the data set
        :param path_list: a list of paths from which the images are collected.
        :param labels_list: a list of labels to use for the images collected from the corresponding path (i.e. the first label corresponds to the first path in the path list).
        '''

        base_path = "binData/"

        labels = []
        trainingData = []
        classes = []
        cl = 0

        ### Building the feature matrix.
        for i, path in enumerate(path_list):

            labels.append(labels_list[i])
            print labels_list[i]

            for item in os.listdir(path):
                p = path + "/" + item
                print p # DEBUG
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = fe.computeFeatureVector()
                if len(trainingData) == 0:
                    trainingData = feature_vector
                else:
                    trainingData = np.vstack((trainingData, feature_vector))
                classes.append(cl)

            print "vstack Kmeans Classifier: "
            print np.shape(trainingData)

            cl = cl + 1

        # Convert to an array only after all paths have been processed; converting
        # inside the loop would break the classes.append() calls on later iterations.
        classes = np.array(classes)

        ### DEBUG 
        print np.shape(trainingData)
        print np.shape(classes)

        ### SAVING THE DATASETS TO NPZ FORMAT
        np.savez(os.path.join(base_path, dataset_name), trainingData, labels, classes)
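# Usage sketch (not part of the original example): `trainer`, the directory names
# and the labels are placeholders; the call saves binData/foram_dataset.npz.
trainer.createClassificationTrainingFromDataset(
    dataset_name="foram_dataset",
    labels_list=["foram", "not_foram"],
    path_list=["images/foram", "images/not_foram"],
)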
Example #29
    def run(self):
        rev = ReviewParser(
            open(settings.reviews_path + ReviewParser.map_cid_to_name(self.cid), "rb"),
            review_files[choice].split(".")[-1],
        )
        rev.parse()

        print "Mining", len(rev.reviews), "reviews"

        text = rev.get_raw_text()

        f = FeatureExtractor(text, ReviewParser.map_cid_to_name(self.cid))

        self.features = f.get_frequent_features(self.min_support)

        for ftr in self.features:
            self.ratings[ftr[0]] = {"positive": 0, "negative": 0, "neutral": 0}

        o = OpinionSentenceFinder(self.features, f.feature_sentences)

        # Extract all sentences which express some opinion
        opinion_sents = map(
            lambda y: y["opinion_sent"], filter(lambda x: len(x["opinion_sent"]) > 1, o.feature_sentences)
        )

        temp = []

        for os in opinion_sents:
            temp.extend(os)

        opinion_sents = temp

        for ftr, sentiment in opinion_sents:
            if sentiment[0] is True:
                self.ratings[ftr]["positive"] += 1
            elif sentiment[0] is False:
                self.ratings[ftr]["negative"] += 1
            else:
                self.ratings[ftr]["neutral"] += 1

        pp = pprint.PrettyPrinter(indent=4)
        print "Is this a %s?" % f.product_category
        print "%d features are interesting" % len(self.features)

        # pp.pprint(opinion_sents)
        pp.pprint(self.ratings)
Example #30
 def region_similarity(cls, img1, img2, proposal1, proposal2):
     feature1 = FeatureExtractor.iterate_feature(proposal1, img1, axis=1)
     feature2 = FeatureExtractor.iterate_feature(proposal2, img2, axis=1)
     sim_mat = distance.cdist(feature1, feature2, "cosine")
     feature_map1 = DiscriminativeDetector.hog_feature(img1)
     feature_map2 = DiscriminativeDetector.hog_feature(img2)
     dis_tensor1 = DiscriminativeDetector.batch_gen_dis_map(feature_map1)
     dis_tensor2 = DiscriminativeDetector.batch_gen_dis_map(feature_map2)
     dis_mat1 = DiscriminativeDetector.batch_dis_detector(dis_tensor1,
                                                          proposal1,
                                                          axis=1)
     dis_mat2 = DiscriminativeDetector.batch_dis_detector(dis_tensor2,
                                                          proposal2,
                                                          axis=1)
     dis_mat = np.dot(dis_mat1, dis_mat2.T)
     res = sim_mat * dis_mat
     res = np.amax(res, axis=1)
     return res
Example #31
    def getDescriptors(self, path, featureExtraType):
        '''
        get all descriptors from images in the path
        :param path: the image's path
        :param featureExtraType: the feature type:sift or surf
        :return: all images' descriptors
        '''
        featureExtra = FeatureExtractor()
        descriptors = []
        for p in path:
            image = cv2.imread(p)
            if (featureExtraType.upper() == "SIFT"):
                dsc = featureExtra.getSiftFeature(image)
            if (featureExtraType.upper() == "SURF"):
                dsc = featureExtra.getSurfFeature(image)
            descriptors.append(dsc)

        return descriptors
Example #32
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)
    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package, stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu, stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir, training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data, preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package, vectorizer_name=parameters.vectorizer, training_mode=parameters.training_mode, use_lda=parameters.use_lda, document_adjustment_code=parameters.document_adjustment_code, remove_adjectives=parameters.remove_adjectives, synonyms_file=parameters.synonyms_file, features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data, preprocessed_data_file=parameters.preprocessed_data_file)
    logger.info("Splitting dataset into training and test subsets.")    
    train_test_split(y, parameters.test_subset_size, parameters.preprocessed_data_file, parameters.force_subsets_regeneration)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)
    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs, training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
Example #33
    def __init__(self):
        self.classifier = None
        self.className = "NB"   # other options in the future: MaxEnt, DT
        self.featSets = ["BoW"] # other options: combination of BoW, LocalCol, PoS
        self.training = []      # original train instances  
        self.trainFeatures = [] # train features
        self.test = []          # original test instances
        self.testFeatures = []  # test features

        self.featExtractor = FeatureExtractor()
Example #34
    def run(self):

        # Call the ReviewsExtractor class to read the csv file
        # and extract and concatenate the reviews
        rev = ReviewsExtractor()
        rev.extract_review_content()
        total_content = rev.get_concatenated_reviews()

        f = FeatureExtractor(total_content)

        self.features = f.get_frequent_features_list(self.min_support)

        o = OpinionSentenceCollector(self.features, f.feature_sentences)

        for feature in o.opinion_features:
            self.ratings[feature] = {'positive': 0, 'negative': 0, 'neutral': 0, 'total_reviews': 0, 'negative_review': '', 'positive_review': ''}

        for feature, sentiment_score, sentence in o.opinion_sentences:
            self.ratings[feature]['total_reviews'] += 1
            if sentiment_score > 0:
                self.ratings[feature]['positive'] += 1
                self.ratings[feature]['positive_review'] = sentence
            elif sentiment_score < 0:
                self.ratings[feature]['negative'] += 1
                self.ratings[feature]['negative_review'] = sentence
            else:
                self.ratings[feature]['neutral'] += 1

        for feature in o.opinion_features:
            self.final_features.append((feature,self.ratings[feature]['total_reviews']))

        self.sorted_features = sorted(set(self.final_features), key=lambda x: x[1], reverse=True)

        pp = pprint.PrettyPrinter(indent=4)

        print self.sorted_features
        print len(self.sorted_features)

        for index in range(0, 10):
            iter_feature = self.sorted_features[index][0]
            print "Feature: ", iter_feature
            pp.pprint(self.ratings[iter_feature])
Example #35
def test_signature(test_dir):
    F = []
    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(test_dir)
    count = 1.0
    for ifile in img_files:
        print("Extracting features, " +
              str(round(count / len(img_files) * 100, 1)) + "% done ...")
        count += 1
        signature = Image.open(ifile)
        processed = improc.preprocess(signature)
        F.append(ftextr.extract_features(processed))

    F = np.array(F)
    F.dump(test_dir + "feature_dump")
Example #36
	def run(self):
		rev = ReviewParser(open(settings.reviews_path + ReviewParser.map_cid_to_name(self.cid), 'rb',), review_files[choice].split('.')[-1])
		rev.parse()

		print "Mining", len(rev.reviews), "reviews"

		text = rev.get_raw_text()

		f = FeatureExtractor(text, ReviewParser.map_cid_to_name(self.cid))

		self.features = f.get_frequent_features(self.min_support)
		
		for ftr in self.features:
			self.ratings[ftr[0]] = {'positive': 0, 'negative': 0, 'neutral': 0}

		o = OpinionSentenceFinder(self.features, f.feature_sentences)

		#Extract all sentences which express some opinion
		opinion_sents = map(lambda y: y['opinion_sent'], filter(lambda x: len(x['opinion_sent']) > 1, o.feature_sentences))
		
		temp = []
	
		for os in opinion_sents:
			temp.extend(os)		

		opinion_sents = temp
		
		for ftr, sentiment in opinion_sents:
			if sentiment[0] is True:
				self.ratings[ftr]['positive'] += 1
			elif sentiment[0] is False:
				self.ratings[ftr]['negative'] += 1
			else:
				self.ratings[ftr]['neutral'] += 1

		pp = pprint.PrettyPrinter(indent = 4)
		print "Is this a %s?" % f.product_category
		print "%d features are interesting" % len(self.features)

		#pp.pprint(opinion_sents)
		pp.pprint(self.ratings)
Example #37
def main():
    webpages_dir = os.path.join(util.ROOT, 'data/weps2007_data_1.1/traininig/web_pages')
    fe = FeatureExtractor()
    ff = FeatureFilter()
    for name in os.listdir(webpages_dir):
        print 'begin clustering %s' % name
        reader = FileReader(webpages_dir, name)
        description = reader.read_description()
        pc = PersonCorpus(name)
        fm = FeatureMapper()
        for rank in description:
            doc_meta = {}
            html_path = os.path.join(webpages_dir, name, 'raw', rank, 'index.html')
            content = text_extract(html_path)
            features, wordcount = fe.extract(content)
            doc_meta['word_num'] = wordcount
            good_features = ff.filter(features)
            vec = FeatureVector(good_features, fm)
            pc.add_vector(vec)
        pc.compute_matrix()
        pc.dump_matrix()
Example #38
def runKWS(query, imagePath, svgPath):
    svc = SVGCropper()
    fe = FeatureExtractor()
    result = []
    #set threshold here
    threshold = 40
    output = ""
    print("Cropping the segments")

    keywordsList = svc.cropWords(imagePath, svgPath)
    fq = fe.getFeatureVector(query)

    dists = []
    for keyword in keywordsList:
        f = fe.getFeatureVector(keyword[0])
        dist, path = fastdtw(f, fq, dist=euclidean)
        print "distance from ", keyword[1], " : ", dist
        dists.append(dist)
        output += keyword[1] + "," + str(dist) + " "

    return output
Example #39
    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='qclassifier.log',
            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']
Example #40
def main(config_filename, port):
    global _text_field, _class_field, _preprocessor, _feature_extractor
    limit_port = 1024
    if port <= limit_port:
        print("Please, indicate a port higher than %s." % (limit_port))
        quit()
    logger.disabled = True
    parameters = Parameters(config_filename, training_mode=False)
    _text_field = parameters.excel_column_with_text_data
    _class_field = parameters.excel_column_with_classification_data
    _preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package, stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu, stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir, training_mode=parameters.training_mode)
    _feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package, vectorizer_name=parameters.vectorizer, training_mode=parameters.training_mode, use_lda=parameters.use_lda, document_adjustment_code=parameters.document_adjustment_code, remove_adjectives=parameters.remove_adjectives, synonyms_file=parameters.synonyms_file, features_file=parameters.features_file)
    app.run(host='0.0.0.0', port=port, debug=False) # host='0.0.0.0' allows access from any network.
Example #41
class NaiveBayesAnalyzer:
    def __init__(self, dict):
        self._dict = dict
        self._fe = FeatureExtractor()

    def train(self):
        train_data = []
        for k, v in self._dict.items():
            train_data = train_data + [
                (self._fe.default_feature_extractor(f), k) for f in v
            ]

        self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data)

    def analyze(self, text):
        feats = self._fe.default_feature_extractor(text)
        prob_dist = self._classifier.prob_classify(feats)

        classification = prob_dist.max()
        # print(classification)
        # for k in self._dict.keys():
        #     print (k, prob_dist.prob(k))
        return classification
Example #42
    def __init__(self, train_data_path, pred_qs = None):
        """
        Constructor
        """
        logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']
Example #43
    args = parser.parse_args()
    
    if not os.path.exists(args.extractor):
        print "Path to extractor '%s' not found" % args.extractor
        sys.exit(-1)
    
    if args.path_to_audio is None:
        args.path_to_audio = "audio/"+args.collection_name
    
    if args.path_to_audio.endswith("/"):
        args.path_to_audio = args.path_to_audio[:-1]
    
    if not os.path.exists(args.path_to_audio):
        print "Path to audio '%s' not found" % args.path_to_audio
        sys.exit(-1)
    
    if args.path_to_features is None:
        if not os.path.exists("features"):
            os.mkdir("features")
        args.path_to_features = "features/"+args.collection_name
    
    if args.path_to_features.endswith("/"):
        args.path_to_features = args.path_to_features[:-1]
    if not os.path.exists(args.path_to_features[:args.path_to_features.rfind("/")]):
        print "Path to features '%s' not found" % args.path_to_features
        sys.exit(-1)
    
    print args
    extractor = FeatureExtractor(args.extractor)
    extractor.extract(args.path_to_audio, args.path_to_features, args.audio_filetype, args.replace_features)
Example #44
class WSD:

    def __init__(self):
        self.classifier = None
        self.className = "NB"   # other options in the future: MaxEnt, DT
        self.featSets = ["BoW"] # other options: combination of BoW, LocalCol, PoS
        self.training = []      # original train instances  
        self.trainFeatures = [] # train features
        self.test = []          # original test instances
        self.testFeatures = []  # test features

        self.featExtractor = FeatureExtractor()


    def setTrain(self, instances):
        self.training = instances
        self.trainFeatures = []

    def setTest(self, instances):
        self.test = instances
        self.testFeatures = []

    def setClassifier(self, className):
        self.className = className

    def setFeatureSet(self, featSets):
        self.featSets = featSets


    def learn(self):
        # check if variables are initialized
        if len(self.training) == 0:
            sys.stderr.write("No training assigned\n")
            return 0

        if len(self.trainFeatures) == 0:
            sys.stderr.write("[Time] %s : Extracting training features\n" % time.asctime())
            self.trainFeatures = [(self.getFeatures(instance), instance[1]) for (instance) in self.training]
        else:
            sys.stderr.write("[Time] %s : Features already extracted\n" % time.asctime())

        if self.className == "NB":
            sys.stderr.write("[Time] %s : Learning a Naive Bayes classifier\n" % time.asctime())
            self.classifier = nltk.NaiveBayesClassifier.train(self.trainFeatures)

        if self.className == "MaxEnt":
            sys.stderr.write("[Time] %s : Learning a Maximum Entropy classifier\n" % time.asctime())
            #self.classifier = nltk.classify.MaxentClassifier.train(self.trainFeatures, "IIS", trace=3, max_iter=100)
            self.classifier = nltk.classify.MaxentClassifier.train(self.trainFeatures, "IIS", trace=3, max_iter=30)

        if self.className == "DT":
            sys.stderr.write("[Time] %s : Learning a Decission Tree classifier\n" % time.asctime())
            self.classifier = nltk.classify.DecisionTreeClassifier.train(self.trainFeatures, entropy_cutoff=0, support_cutoff=0)

        if self.className == "NB_sklearn":
            sys.stderr.write(
                "[Time] %s : Learning a Multinomial Naive Bayes (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = MultinomialNB()
            self.classifier.fit(X, y)

        if self.className == "DT_sklearn":
            sys.stderr.write(
                "[Time] %s : Learning a Decision Tree (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier =  DecisionTreeClassifier(random_state=0)
            self.classifier.fit(X, y)

        if self.className == "MaxEnt_sklearn":
            sys.stderr.write("[Time] %s : Learning a Logistic Regression (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = LogisticRegression()
            self.classifier.fit(X, y)

        if self.className == "SVM_sklearn":
            sys.stderr.write("[Time] %s : Learning a Linear Support Vector Machine (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = LinearSVC(C=1.0)
            self.classifier.fit(X, y)
            
        sys.stderr.write("[Time] %s : Learning finished\n" % time.asctime())
        #self.classifier.show_most_informative_features(20)


    def predict(self):
        if self.classifier == None:
            sys.stderr.write("[ERROR] No classifier learnt")
            return 0
        if len(self.test) == 0:
            sys.stderr.write("[ERROR] No test assigned")
            return 0
        
        if len(self.testFeatures) == 0:
            sys.stderr.write("[Time] %s : Extracting test features\n" % time.asctime())
            self.testFeatures = [(self.getFeatures(instance), instance[1]) for (instance) in self.test]
        else:
            sys.stderr.write("[Time] %s : Test features aldready extracted\n" % time.asctime())

        sys.stderr.write("[Time] %s : Predictions on test\n" % time.asctime())
        if self.className == "MaxEnt_sklearn" or self.className == "SVM_sklearn" or self.className == "DT_sklearn" or self.className == "NB_sklearn":
            X, y = self.featExtractor.convert2sklearn(self.testFeatures, train=False)
            predictions = self.classifier.predict(X)
        else:
            predictions = [self.classifier.classify(feats[0]) for feats in self.testFeatures]
        
        return predictions


    def accuracy(self, preds=None, gold=None):
        if preds == None:
            if len(self.testFeatures) == 0:
                if len(self.test) == 0:
                    sys.stderr.write("[ERROR] No test assigned")
                    return 0
                sys.stderr.write("[Time] %s : Extracting test features\n" % time.asctime())
                self.testFeatures = self.getFeatures(self.test)

            if self.className == "MaxEnt_sklearn" or self.className == "SVM_sklearn" or self.className == "DT_sklearn" or self.className == "NB_sklearn":
                X_test, y_test = self.featExtractor.convert2sklearn(self.testFeatures, train=False)
                acc = self.classifier.score(X_test, y_test)
            else:
                acc = nltk.classify.accuracy(self.classifier, self.testFeatures)

            return acc

        else:
            correct = [l == r for (l, r) in zip(gold, preds)]
            if correct:
                return float(sum(correct))/len(correct)
            else:
                return 0


    def getFeatures(self, instance):
        features = {}
        for ft in self.featSets:
            if ft == "BoW":
                features.update(self.featExtractor.getBoW(instance))
            if ft == "SPoS":
                features.update(self.featExtractor.getSurroundPoS(instance))
            if ft == "LCOL":
                features.update(self.featExtractor.getLocalCollocations(instance))
        return features


    def saveModel(self, fileName):
        try:
            f = open(fileName, 'wb')
            save = {
                'classifier': self.classifier,
                'className': self.className,
                'featSets': self.featSets
            }
            pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
            f.close()
        except Exception as e:
            print('Unable to save data to', fileName, ':', e)
            raise

        statinfo = os.stat(fileName)
        sys.stderr.write('[INFO] Saved model in %s\n' % fileName)
        sys.stderr.write('[INFO] Compressed pickle size: ' + str(statinfo.st_size) + "\n")


    def loadModel(self, fileName):
        with open(fileName, 'rb') as f:
            save = pickle.load(f)
            self.classifier = save['classifier']
            self.className = save['className']
            self.featSets = save['featSets']
            del save  # hint to help gc free up memory
        
        sys.stderr.write("[INFO] Loaded model name: %s\n" % self.className)
        sys.stderr.write("[INFO] Loaded model features set: " + self.featsSets + "\n")
Example #45
from ReviewParser import ReviewParser
from FeatureExtractor import FeatureExtractor

review_file = ['Apple_iPhone_4.csv', 'Blackberry_Torch_9800.csv', 'Nikon_D90.csv', 'Canon_ELPH_300_HS.csv']
rev = ReviewParser(open('../data/reviews/' + review_file[3], 'rb',), 'CSV')

rev.parse()

text = rev.get_raw_text()

f = FeatureExtractor(text)

print f.get_frequent_features(5)

"""
#tokenize_patterns = ['[Nn]ikon ?[dD][0-9]+', '([0-9]+ ?mm)', '(auto[ -_]?focus)', '(Apple)[ ]?(iphone)??[0-5]?[gs]*']
features = [w.lower() for (w,t) in tags if t.startswith('N') and t != 'NNP']

features = p.stemmer(features)

dist = nltk.FreqDist(features)

obs = [ob for ob in dist.iteritems()]

logfile = open('/tmp/log.txt', 'w')
logfile.write("".join(str(obs)).replace("), (", ")\n("))
logfile.close()
"""
Example #46
try: 
	min_support = int(sys.argv[1])
except:
	min_support = 5

reviews_path = '../data/reviews/'
review_files = check_output(['ls', '-1', reviews_path]).split()

for review_file in review_files:
	print review_files.index(review_file), ' ' + review_file

choice = int(input('#'))
if choice not in xrange(0, len(review_files)):
	print 'Error'
	exit(-1)

rev = ReviewParser(open(reviews_path + review_files[choice], 'rb',), review_files[choice].split('.')[-1])

rev.parse()

text = rev.get_raw_text()

f = FeatureExtractor(text, review_files[choice])

print "Based on ", len(rev.reviews), " reviews"

features = f.get_frequent_features(min_support)

features = f.prune_features(features, 3)
print "Is this a %s?" % f.product_category
Example #47
test_b = prp_binary_dataf(test_q)

# prepare corpus features
# Here we suppose all the raw data is stored in the corpus dir; the raw data is:
# wiki: enwiki-20160113-pages-articles.xml
# ck12: OEBPS dir that contains files extracted from Concepts_b_v8_vdt.epub
# ck12: CK-12-Biology-Concepts_b_v143_e4x_s1.text: the downloaded version is a pdf; use an online converter to generate the text
# ck12: CK-12-Chemistry-Basic_b_v143_vj3_s1.text
# ck12: CK-12-Earth-Science-Concepts-For-High-School_b_v114_yui_s1.text
# ck12: CK-12-Life-Science-Concepts-For-Middle-School_b_v126_6io_s1.text
# ck12: CK-12-Physical-Science-Concepts-For-Middle-School_b_v119_bwr_s1.text
# ck12: CK-12-Physics-Concepts-Intermediate_b_v56_ugo_s1.text
data_pkl_file = None
norm_scores_default = False
if data_pkl_file is None:
    fext = FeatureExtractor(base_dir = base_dir, recalc = False, norm_scores_default = norm_scores_default, print_level = 2)

    # prepare word set, which is to derive all the unique 1-gram and 2-gram from train, valid and test
    fext.prepare_word_sets(corpus_dir = corpus_dir, train_b = train_b, valid_b = None, test_b = None)

    # prepare ck12html corpus: this function will go into CK12/OEBPS dir, find all x.html file where x is a number
    # extract all the text while ignore sections such as 'explore more', 'review', 'practice', 'references'
    fext.prepare_ck12html_corpus(corpus_dir = corpus_dir)

    # prepare ck12text corpus: this function will go into CK12 dir, find all .text file, which are 6 textbooks
    # extract relevant text from all Chapters of each book
    fext.prepare_ck12text_corpus(corpus_dir = corpus_dir)

    # prepare simplewiki corpus: this function will go into simplewiki dir, find the simplewiki-20151102-pages-articles.xml
    # extract text from all categories found if the page contains at least some uncommon words from train_b and test_b
    fext.prepare_simplewiki_corpus(corpus_dir, train_b, valid_b)
Example #48
"""
Created on May 8, 2013

@author: bhanu
"""
from FeatureExtractor import FeatureExtractor
import os

if __name__ == "__main__":

    fe = FeatureExtractor()
    fe.load_dup_dict()
#    fe = FeatureExtractor()
#    if(os.path.exists("instanceList.obj")):
#        fe.load_data()
#    else:
#        fe.extractFeatures()
#
#    fe.rolf()
Example #49
        path = os.path.join(dir_name, f)
        if f.startswith('.') or not os.path.isfile(path):
            continue
        print 'Processing ' + path
        fe = FeatureExtractor()
        fe.setup(path)
        extractedFeatures = fe.getAllFeatures()
        sys.stdout.write('Finished extracting features!' + "\n")
        data = {f: extractedFeatures}
        pickle.dump(data, open("extracted_features.pickle", "a"))

if __name__ == '__main__':
    # process existing audio files
    # os.path.walk('../data', processFile, 0)

    fe = FeatureExtractor();
    path = sys.argv[1]
    fe.setup(path)
    extractedFeatures = fe.getAllFeatures();
    currFile = {path: extractedFeatures}

    # load features that were extracted from existing files 
    fl = FeatureLoader()
    dirname, filename = os.path.split(os.path.abspath(__file__))
    ground_truth = os.path.join(dirname, 'extracted_features.pickle')
    data = fl.load(ground_truth)

    # compare features between uploaded file and existing files
    fc = FeatureComparator()
    diffList = [] 
    currFileName = currFile.iterkeys().next()
Example #50
def test_AutoEncoder(learning_rate=0.1, training_epochs=15,
            batch_size=20):

    """

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNosing
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

  
    """


    x_scipySparse = None; train_set_x = None; numInstances = 0; numFeatures = 0;
    if((os.path.exists("input_scipySparse.obj"))):
        print "loading sparse data from pickled file..."
        f = open("input_scipySparse.obj", 'r')
        x_scipySparse = cPickle.load(f)
        f.close()
        numInstances, numFeatures = x_scipySparse.shape
        
    else: 
        print "extracting features and building sparse data..."
        fe = FeatureExtractor()  
        fe.extractFeatures()
        train_set_x = fe.instanceList
        featureDict = fe.featDict   
        numInstances = len(train_set_x)
        numFeatures = len(featureDict)        
        x_lil = sp.lil_matrix((numInstances,numFeatures), dtype='float32') # the data is presented as a sparse matrix 
        i = -1; v = -1;
        try:
            for i,instance in enumerate(train_set_x):
                for v in instance.input:
                    x_lil[i, v] = 1
        except:
            print "i=",i," v=",v
        x_scipySparse = x_lil.tocsc()
        f = open("input_scipySparse.obj", 'w')
        cPickle.dump(x_scipySparse, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    

    # compute number of mini-batches for training, validation and testing
    n_train_batches = numInstances / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    #x = sparse.basic.as_sparse_variable(x_scipySparse, 'x')
    x = theano.shared(x_scipySparse, borrow=True)

    
    ####################################
    # BUILDING THE MODEL               #
    ####################################

    print "building the model..."
    rng = numpy.random.RandomState(123)

    ae = AutoEncoder(numpy_rng=rng, input=x, n_visible=numFeatures, n_hidden=10, n_trainExs=numInstances)

    cost, updates = ae.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_ae = theano.function([index], cost, updates=updates,
         givens={x: train_set_x[index * batch_size:
                                (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    print "starting training..."
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_ae(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)
    print "training completed in : ", training_time
Example #51
    def createKmeansTrainingDataset(self,kmeans_data, dataset_name, kmeans_name, path_list, labels_list, num_of_clusters):
        '''
        Create a training set for KMeans with regression.
        :param kmeans_data: path to the training matrix obtained using the createClassificationTrainingFromDataset method on the HOLDOUT set.
        :param kmeans_name: the name under which the KMeans classifier is saved and pickled.
        :param dataset_name: the name of the NEW dataset created using the KMeans classifier on the training images, i.e. clustering the feature vectors.
        :param path_list: list of paths where the training set is located.
        :param labels_list: the list of labels for the samples in the training set.
        :param num_of_clusters: the number of clusters for the KMeans classifier.
        '''
        npzfile = np.load(kmeans_data)
        KmeansData = npzfile['arr_0']
        Kmeanslabels = npzfile['arr_1']
        Kmeansclasses = npzfile['arr_2']

        k_means = cluster.KMeans(n_clusters=num_of_clusters)
        k_means.fit(KmeansData)

        base_path = "binData/"

        labels = labels_list
        trainingData = []
        classes = []
        cl=0

        ### Building the feature matrix.
        for i, path in enumerate(path_list):
            
            print labels_list[i]

            for item in os.listdir(path):
                p = path + "/" + item
                print p # DEBUG
                im = cv.imread(p)
                fe = FeatureExtractor(im)
                feature_vector = np.zeros(num_of_clusters)
                raw_vector = fe.computeFeatureVector()
                Km_vector = k_means.predict(raw_vector) 
                for j in range(len(Km_vector)):
                    feature_vector[Km_vector[j]] = feature_vector[Km_vector[j]] + 1 
                trainingData.append(feature_vector)
                classes.append(cl)
            
            # Here we multiply the number of POSITIVE samples in the training set so that
            # the unbalanced "Foram vs. Not-Foram" problem becomes balanced.
            if i == 0:
                print "working on positive samples"
                print "Original training size: (should be 68 by 10)"
                print np.shape(trainingData)
                print np.shape(classes)

                for k in range(9):
                    trainingData = np.vstack((trainingData, trainingData))
                    classes = np.hstack((classes,classes))
                
                print "After Multipling Positive Samples by 8"
                print np.shape(trainingData)
                print np.shape(classes)
                
                trainingData = trainingData.tolist()
                classes = classes.tolist()
            
            cl = cl + 1
            
        ### DEBUG 
        print "final shape: (should be 54,000~ by 10):"
        print np.shape(trainingData)

        ### SAVING THE DATASETS TO NPZ FORMAT
        joblib.dump(k_means, os.path.join(base_path, kmeans_name), compress=9)
        np.savez(os.path.join(base_path, dataset_name), trainingData, labels_list, classes)
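# Usage sketch (not part of the original example): `trainer` and all names below are
# placeholders. It builds a 10-cluster visual vocabulary from a previously saved
# holdout feature matrix and re-encodes the training images as cluster histograms.
trainer.createKmeansTrainingDataset(
    kmeans_data="binData/holdout_features.npz",
    dataset_name="kmeans_training_set",
    kmeans_name="kmeans_10_clusters",
    path_list=["images/foram", "images/not_foram"],
    labels_list=["foram", "not_foram"],
    num_of_clusters=10,
)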