def TestPredictionIntensity():
    C = 8192
    gamma = 0.0001220703125
    windowSize = 12
    df = ReadCSVPandas('Close_Values.csv')
    fe = FeatureExtractor()
    feature = fe.ExtractIntensity(df['BVMF:BBDC4'])

    fig = plt.figure()
    ax = plt.subplot(111)
    ax.plot(feature, color='r')
    ax.plot(df['BVMF:BBDC4'], color='k')
    plt.show()

    ts = np.array(feature)
    trainingPeriod = windowSize * 30
    testingPeriod = 5
    svm = PredSVM(ts[0:trainingPeriod], 'rbf', C, gamma)
    svm.GenerateTrainingDataset(windowSize)
    svm.Train()
    svm.GridSearch()
    testY = svm.PredictNextN(testingPeriod)
    PlotResults(ts[trainingPeriod:trainingPeriod + testingPeriod], testY)
    plt.show()
def test_epsilonNeighbor():
    x_scipySparse = None
    train_set_x = None
    numInstances = 0
    numFeatures = 0

    if os.path.exists("input_scipySparse.obj"):
        print "loading sparse data from pickled file..."
        f = open("input_scipySparse.obj", 'r')
        x_scipySparse = cPickle.load(f)
        f.close()
        numInstances, numFeatures = x_scipySparse.shape
    else:
        print "extracting features and building sparse data..."
        fe = FeatureExtractor()
        fe.extractFeatures()
        train_set_x = fe.instanceList
        featureDict = fe.featDict
        numInstances = len(train_set_x)
        numFeatures = len(featureDict)
        # the data is presented as a sparse matrix
        x_lil = sp.lil_matrix((numInstances, numFeatures), dtype='float32')
        i = -1
        v = -1
        try:
            for i, instance in enumerate(train_set_x):
                for v in instance.input:
                    x_lil[i, v] = 1
        except:
            print "i=", i, " v=", v
        x_scipySparse = x_lil.tocsc()
        f = open("input_scipySparse.obj", 'w')
        cPickle.dump(x_scipySparse, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    epsilonNeighbor(x_scipySparse)
def process(self, directory, output, feature_type):
    start = datetime.datetime.now()

    for root, subFolders, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, self.h5Regex):
            candfile = os.path.join(root, filename)
            cand = Candidate.load_hdf5(str(candfile))
            fe = FeatureExtractor()
            features = fe.getfeatures(cand, feature_type)
            features.append("?")
            self.storeFeature(features, candfile)

    outputText = ""
    for f in self.FeatureStore:
        outputText += f + "\n"

    outputFile = open(output, 'a')
    outputFile.write(str(outputText))
    outputFile.close()

    end = datetime.datetime.now()
    print 'Processing time = ', str(end - start)
def classify(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    count = 0
    classified_relationships = []
    print multiprocessing.current_process(), "started"
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print multiprocessing.current_process(), count, " processed, remaining ", queue.qsize()
            relationships = fe.process_classify(line)
            for r in relationships:
                rel = r[0]
                shingles = r[1]
                # compute signatures
                sigs = MinHash.signature(shingles.getvalue().split(), n_sigs)
                # find closest neighbours
                types = lsh.classify(sigs)
                if types is not None:
                    classified_r = (rel.e1, rel.e2, rel.sentence, types.encode("utf8"))
                else:
                    classified_r = (rel.e1, rel.e2, rel.sentence, "None")
                classified_relationships.append(classified_r)
        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(classified_relationships)
            break
def extract_features(queue, lsh, child_conn, n_sigs):
    fe = FeatureExtractor()
    relationships = []
    count = 0
    while True:
        try:
            line = queue.get_nowait()
            count += 1
            if count % 1000 == 0:
                print count, " processed, remaining ", queue.qsize()
            rel_id, rel_type, e1, e2, sentence = line.split('\t')
            rel_id = int(rel_id.split(":")[1])
            shingles = fe.process_index(sentence, e1, e2)
            try:
                shingles = shingles.getvalue().strip().split(' ')
            except AttributeError, e:
                print line
                print shingles
                sys.exit(-1)
            sigs = MinHash.signature(shingles, n_sigs)
            lsh.index(rel_type, rel_id, sigs)
            relationships.append((rel_type, rel_id, sigs, shingles))
        except Queue.Empty:
            print multiprocessing.current_process(), "Queue is Empty"
            child_conn.send(relationships)
            break
class QClassifierImpl:
    """
    A wrapper for the question classifier
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor
        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='qclassifier.log',
                            filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train using all of the given data
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions=self.extractor.questions)
        assert len(self.labels) == len(self.features)

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get the type of a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train the model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
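# Minimal usage sketch for QClassifierImpl. The training-data path and the example
# question are hypothetical placeholders; the expected label is just one of the
# categories listed in self.cate above, not a guaranteed result.
if __name__ == '__main__':
    qc = QClassifierImpl(train_data_path='data/train_questions.txt')
    qc.train()
    print qc.get_type('How many people live in Lisbon?')  # e.g. 'Number'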
def test_extract_features_4x4_returns_correct_dimensions_and_colour(self):
    input_image_df = pd.read_csv(io.StringIO("label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,pixel11,pixel12,pixel13,pixel14,pixel15\n0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0"))
    feature_extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 1)
    features = feature_extractor.extract_features(input_image_df)
    self.assertEqual((1, 15), features.shape)
    self.assertTrue(pd.DataFrame([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]).compare(features).empty)
def __init__(self):
    print('Initializing detector...')
    self.classifier = LinearSVC(verbose=True)
    self.X_scaler = StandardScaler()
    self.feature_extractor = FeatureExtractor(color_space='YCrCb', orient=9, hog_channel='ALL')
    self.last_detections = deque(maxlen=20)
def get_input_file_features(img_filepath, feature_detector='orb'):
    # The original condition used `is not 'fast' or is not 'orb'`, which is always
    # True; warn only when neither supported detector was selected.
    if feature_detector not in ('fast', 'orb'):
        warnings.warn(
            '\nFeature detector Warning: No feature detector selected, '
            '\'fast\' feature detection will be applied')
    params = {'feature_detector': feature_detector, 'desc_vector': 'orb'}
    img_features = FeatureExtractor(img_filepath)
    sample_descr_vector = img_features.feature_extractor()
    return sample_descr_vector
def __init__(self, fileloc, clf, ss=None):
    self.clf = clf
    self.standardizer = ss
    self.file_loc = fileloc
    self.file_handler = FileHandler(self.file_loc)
    self.file_handler.set_file_extensions((".wav"))
    self.file_handler.create_all_file_list()
    self.file_handler.split_train_test()
    self.extractor = FeatureExtractor()
def test_identity(self):
    feature_extractor = FeatureExtractor(self.data_set.train_df, scaler=None)
    feature_extractor.fit()
    X_coded, y = feature_extractor.eval()
    self.assertEqual(X_coded.shape[0], self.data_set.train_df.shape[0])
    self.assertEqual(X_coded.shape[1] + 1, self.data_set.train_df.shape[1])
    self.assertTrue(np.array_equal(X_coded, self.data_set.train_df.iloc[:, :178]))
    self.assertTrue(np.array_equal(y, self.data_set.train_df.iloc[:, 178]))
def prepare_full_feature():
    image_base_path = "../images/kyoto/"
    model_path = "/home/ge/tests/vgg16_weights.h5"
    FeatureExtractor.initialize(model_path)
    images = np.zeros((500, 4096))
    for i in range(500):
        img = imread(image_base_path + str(i) + ".jpg")
        feature = FeatureExtractor.feature(img)
        images[i, :] = feature
    np.save("../mid-data/full_feature.npy", images)
def __init__(self, debugFlag):
    """
    Default constructor.

    Parameters:
    debugFlag - the debugging flag. If set to True, detailed debugging messages
                will be printed to the terminal during execution.
    """
    FeatureExtractor.__init__(self, debugFlag)
def test_extract_features_x10_full_size_returns_correct_features(self):
    input_image_df = pd.read_csv("Data/train.csv", nrows=1)
    feature_extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 2)
    features = feature_extractor.extract_features(input_image_df)
    self.assertEqual((1, 75), features.shape)
    self.assertTrue(pd.DataFrame([[0, 15.93750, 0, 0, 63.75, 0, 0, 63.75, 0, 1, 255, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]]).iloc[0, :].compare(features.iloc[0, :]).empty)
def add(self, ts, pack):
    if self.slot_id == 0:
        self.slot_id = ts
    if not self.accumulate:
        # call writer: flush the current slot, then start a new one
        self._writer.log_record(self.slot_id, 0, self._dev_fea)
        self.slot_id = ts
        del self._dev_fea
        self._dev_fea = DevFeatures.copy_from_devdata(self._oracle)
        self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
        return
    self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
def prepare_feature():
    model_path = "/home/ge/tests/vgg16_weights.h5"
    proposal_path = "../mid-data/proposals.npy"
    image_base_path = "../images/kyoto/"
    feature_path = "../mid-data/feature/"
    proposals = np.load(proposal_path)
    FeatureExtractor.initialize(model_path)
    for i in range(10, 500):
        img = imread(image_base_path + str(i) + ".jpg")
        feature = FeatureExtractor.iterate_feature(proposals[i], img, axis=1)
        np.save(feature_path + str(i) + ".npy", feature)
def init():
    bot_id = '1437569240:AAEd2sZ0faC1EwPvQGJPPW4xf7ohP1hTzV8'
    updater = Updater(bot_id)
    updater.setPhotoHandler(imageHandler)
    QualityChecker.init()
    ShoeDetector.init()
    FeatureExtractor.init()
    data_structure = Indexer.build_data_structure(config.DATASET_PATH)
    Matcher.init(data_structure)
    print("Bot is running...")
    updater.start()
def classificationValidation(self, test_list, kmeans_path, kernel, C, gamma):
    '''
    Main classification validation function to validate the model.

    :param test_list: list of paths where the test images are held.
    :param kmeans_path: path to the pickled KMeans classifier used for the binarization task.
    :param kernel, C, gamma: SVC parameters.
    '''
    if gamma is None:
        clf = SVC(C=C, kernel=kernel)
    else:
        clf = SVC(C=C, gamma=gamma, kernel=kernel)
    print "kernel: " + kernel
    print "gamma: " + str(gamma)
    print "C: " + str(C)
    clf.fit(self.X, self.y)

    results_vector = []
    y_true = []
    cl = 0
    k_means = joblib.load(kmeans_path)
    [m, num_of_clusters] = np.shape(self.X)

    for path in test_list:
        for item in os.listdir(path):
            p = path + "/" + item
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            feature_vector = np.zeros(num_of_clusters)
            raw_vector = fe.computeFeatureVector()
            Km_vector = k_means.predict(raw_vector)
            for k in range(len(Km_vector)):
                feature_vector[Km_vector[k]] = feature_vector[Km_vector[k]] + 1
            res = clf.predict(feature_vector)
            # Debugging
            if res[0] == 1:
                print p + " is not a foram!"
            if res[0] == 0:
                print p + " is a foram!"
            y_true.append(cl)
            results_vector.append(res[0])
        cl = cl + 1

    print "confusion_matrix"
    print confusion_matrix(y_true, results_vector)
def processFile(x, dir_name, files):
    print dir_name
    print files
    for f in files:
        path = os.path.join(dir_name, f)
        if f.startswith('.') or not os.path.isfile(path):
            continue
        print 'Processing ' + path
        fe = FeatureExtractor()
        fe.setup(path)
        extractedFeatures = fe.getAllFeatures()
        sys.stdout.write('Finished extracting features!' + "\n")
        data = {f: extractedFeatures}
        pickle.dump(data, open("extracted_features.pickle", "a"))
def extract_features(self, vehicles, non_vehicles):
    '''
    Extract features for the two lists containing vehicle and non-vehicle image paths respectively.

    :param vehicles: list of paths to vehicle images
    :param non_vehicles: list of paths to non-vehicle images
    :return: scaled_X: normalised feature vectors, y: true labels (1 = vehicle, 0 = non-vehicle)
    '''
    # Load training set images and extract features
    self.feature_extractor = FeatureExtractor()
    print("Loading images and extracting features...")
    t = time.time()
    vehicle_features = self.feature_extractor.extract_features(
        vehicles,
        cspace=CSPACE,
        spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
        hist_bins=HIST_BIN,
        hist_range=HIST_RANGE,
        hog_cell_per_block=HOG_CELL_PER_BLOCK,
        hog_channel=HOG_CHANNEL,
        hog_pix_per_cell=HOG_PIX_PER_CELL,
        hog_orient=HOG_ORIENT_BINS)
    non_vehicle_features = self.feature_extractor.extract_features(
        non_vehicles,
        cspace=CSPACE,
        spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
        hist_bins=HIST_BIN,
        hist_range=HIST_RANGE,
        hog_cell_per_block=HOG_CELL_PER_BLOCK,
        hog_channel=HOG_CHANNEL,
        hog_pix_per_cell=HOG_PIX_PER_CELL,
        hog_orient=HOG_ORIENT_BINS)

    # Create an array stack of all feature vectors and scale the resulting feature vector
    X = np.vstack((vehicle_features, non_vehicle_features)).astype(np.float64)
    self.X_scaler = StandardScaler().fit(X)
    scaled_X = self.X_scaler.transform(X)

    # Define the labels vector (1 = vehicle, 0 = non-vehicle)
    y = np.hstack((np.ones(len(vehicle_features)), np.zeros(len(non_vehicle_features))))

    t2 = time.time()
    print('Number of features: {}'.format(scaled_X.shape[1]))
    print('Feature extraction time: {}'.format(round(t2 - t, 2)))
    return scaled_X, y
def run():
    connection = PgSQL.connect(user="******", database=ClassifierConfig.DatabaseName)
    db = DocumentsDatabase(connection,
                           WorkingDbConfig['DocTagsTable'],
                           WorkingDbConfig['RawDocTable'],
                           WorkingDbConfig['TagsTable'],
                           WorkingDbConfig['DocumentsTable'])
    extractor = FeatureExtractor(True)

    docIds = [1, 2, 3, 4, 5]
    # TODO: documents that break the parser
    # docIds = [247, 1070198, 619547]
    # docIds = [247, 145698, 42027]

    docs = db.getDocumentsContent(docIds)
    for (id, header, content, tags) in docs:
        print extractor.processText(id, header, content)
def add(self, ts, pack):
    if self.slot_id == 0:
        self.slot_id = ts
    if ts > self.slot_id + self.sd or ts < self.slot_id:
        # the timestamp falls outside the current slot: call the writer, then start a new slot
        self._writer.log_record(self.slot_id, self.sd, self._dev_fea)
        self.slot_id = ts
        del self._dev_fea
        self._dev_fea = DevFeatures.copy_from_devdata(self._oracle)
        self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
        return
    self._dev_fea = FeatureExtractor().extract_feas(self._dev_fea, ts, pack)
def _test():
    darknet = FeatureExtractor(is_training=True, img_size=None, model='yolov2')
    darknet.model.summary()

    mobilenet = FeatureExtractor(is_training=True, img_size=None, model='mobilenet')
    mobilenet.model.summary()

    densenet = FeatureExtractor(
        is_training=True,
        img_size=None,
        model='densenet',
        model_path='../weights/feature_extractor/densenet201.h5')
    densenet.model.summary()
def extract_and_predict(self, image, featureSet=None):
    # Use the classifier's configured feature set unless one is given explicitly
    if not featureSet:
        featureSet = self.featureSet
    extractedFeatures = FeatureExtractor.extract(image, featureSet)
    return self.model.predict([extractedFeatures])
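# Hypothetical call site for extract_and_predict: `detector` stands for an instance of the
# surrounding classifier class and `frame` for an image already loaded elsewhere; both names
# are placeholders. Omitting featureSet falls back to the instance's configured self.featureSet.
prediction = detector.extract_and_predict(frame)
prediction_explicit = detector.extract_and_predict(frame, featureSet=detector.featureSet)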
def load_model(self, name):
    '''
    Load a trained model from disc.

    :param name: name of the model ("_model.pkl" will be added to the name)
    '''
    self.__init__(self.trained_model_path)

    # Load the trained classifier and the scaler
    self.clf = joblib.load(self.trained_model_path + '/' + name + '_model.pkl')
    self.X_scaler = joblib.load(self.trained_model_path + '/' + name + '_scaler.pkl')
    self.feature_extractor = FeatureExtractor()
    self.trained = True
def createClassificationTrainingFromDataset(self, dataset_name, labels_list, path_list):
    '''
    Creates a new training set from the given path list and labels.
    Note that path_list and labels_list are expected to be lists of the same length;
    see the tests in __main__ for examples.

    :param dataset_name: the name of the data set
    :param path_list: a list of paths from which the images are collected.
    :param labels_list: a list of labels to use for the images collected from the corresponding path
                        (i.e. the first label corresponds to the first path in the path list).
    '''
    base_path = "binData/"
    labels = []
    trainingData = []
    classes = []
    cl = 0

    # Building the feature matrix.
    for i, path in enumerate(path_list):
        labels.append(labels_list[i])
        print labels_list[i]
        for item in os.listdir(path):
            p = path + "/" + item
            print p  # DEBUG
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            feature_vector = fe.computeFeatureVector()
            if len(trainingData) == 0:
                trainingData = feature_vector
            else:
                # np.vstack returns a new array, so the result must be assigned
                trainingData = np.vstack((trainingData, feature_vector))
            classes.append(cl)
            print "vstack Kmeans Classifier: "
            print np.shape(trainingData)
        cl = cl + 1

    # convert once all labels are collected (appending to an ndarray would fail)
    classes = np.array(classes)

    # DEBUG
    print np.shape(trainingData)
    print np.shape(classes)

    # Save the datasets in NPZ format
    np.savez(os.path.join(base_path, dataset_name), trainingData, labels, classes)
def run(self):
    rev = ReviewParser(
        open(settings.reviews_path + ReviewParser.map_cid_to_name(self.cid), "rb"),
        review_files[choice].split(".")[-1],
    )
    rev.parse()
    print "Mining", len(rev.reviews), "reviews"
    text = rev.get_raw_text()
    f = FeatureExtractor(text, ReviewParser.map_cid_to_name(self.cid))
    self.features = f.get_frequent_features(self.min_support)
    for ftr in self.features:
        self.ratings[ftr[0]] = {"positive": 0, "negative": 0, "neutral": 0}
    o = OpinionSentenceFinder(self.features, f.feature_sentences)

    # Extract all sentences which express some opinion
    opinion_sents = map(
        lambda y: y["opinion_sent"],
        filter(lambda x: len(x["opinion_sent"]) > 1, o.feature_sentences)
    )
    temp = []
    for os in opinion_sents:
        temp.extend(os)
    opinion_sents = temp

    for ftr, sentiment in opinion_sents:
        if sentiment[0] is True:
            self.ratings[ftr]["positive"] += 1
        elif sentiment[0] is False:
            self.ratings[ftr]["negative"] += 1
        else:
            self.ratings[ftr]["neutral"] += 1

    pp = pprint.PrettyPrinter(indent=4)
    print "Is this a %s?" % f.product_category
    print "%d features are interesting" % len(self.features)
    # pp.pprint(opinion_sents)
    pp.pprint(self.ratings)
def region_similarity(cls, img1, img2, proposal1, proposal2):
    feature1 = FeatureExtractor.iterate_feature(proposal1, img1, axis=1)
    feature2 = FeatureExtractor.iterate_feature(proposal2, img2, axis=1)
    sim_mat = distance.cdist(feature1, feature2, "cosine")

    feature_map1 = DiscriminativeDetector.hog_feature(img1)
    feature_map2 = DiscriminativeDetector.hog_feature(img2)
    dis_tensor1 = DiscriminativeDetector.batch_gen_dis_map(feature_map1)
    dis_tensor2 = DiscriminativeDetector.batch_gen_dis_map(feature_map2)
    dis_mat1 = DiscriminativeDetector.batch_dis_detector(dis_tensor1, proposal1, axis=1)
    dis_mat2 = DiscriminativeDetector.batch_dis_detector(dis_tensor2, proposal2, axis=1)
    dis_mat = np.dot(dis_mat1, dis_mat2.T)

    res = sim_mat * dis_mat
    res = np.amax(res, axis=1)
    return res
def getDescriptors(self, path, featureExtraType):
    '''
    Get all descriptors from the images in the given paths.

    :param path: the image paths
    :param featureExtraType: the feature type, "sift" or "surf"
    :return: all images' descriptors
    '''
    featureExtra = FeatureExtractor()
    descriptors = []
    for p in path:
        image = cv2.imread(p)
        if featureExtraType.upper() == "SIFT":
            dsc = featureExtra.getSiftFeature(image)
        if featureExtraType.upper() == "SURF":
            dsc = featureExtra.getSurfFeature(image)
        descriptors.append(dsc)
    return descriptors
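# Hypothetical call for getDescriptors: the instance name and image paths are placeholders,
# and whether SIFT/SURF are available depends on the installed OpenCV build.
paths = ['dataset/img_0001.jpg', 'dataset/img_0002.jpg']
sift_descriptors = bow_builder.getDescriptors(paths, 'sift')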
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)

    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please, provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package,
                                    stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu,
                                    stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir,
                                    training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data,
                                preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()

    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                         vectorizer_name=parameters.vectorizer,
                                         training_mode=parameters.training_mode,
                                         use_lda=parameters.use_lda,
                                         document_adjustment_code=parameters.document_adjustment_code,
                                         remove_adjectives=parameters.remove_adjectives,
                                         synonyms_file=parameters.synonyms_file,
                                         features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data,
                                                   preprocessed_data_file=parameters.preprocessed_data_file)

    logger.info("Splitting dataset into training and test subsets.")
    train_test_split(y, parameters.test_subset_size, parameters.preprocessed_data_file,
                     parameters.force_subsets_regeneration)

    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)

    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs,
            training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
def __init__(self):
    self.classifier = None
    self.className = "NB"       # other options in the future: MaxEnt, DT
    self.featSets = ["BoW"]     # other options: combination of BoW, LocalCol, PoS
    self.training = []          # original train instances
    self.trainFeatures = []     # train features
    self.test = []              # original test instances
    self.testFeatures = []      # test features
    self.featExtractor = FeatureExtractor()
def run(self):
    # Call the ReviewsExtractor class to read the CSV file
    # and extract/concatenate the reviews
    rev = ReviewsExtractor()
    rev.extract_review_content()
    total_content = rev.get_concatenated_reviews()

    f = FeatureExtractor(total_content)
    self.features = f.get_frequent_features_list(self.min_support)
    o = OpinionSentenceCollector(self.features, f.feature_sentences)

    for feature in o.opinion_features:
        self.ratings[feature] = {'positive': 0, 'negative': 0, 'neutral': 0,
                                 'total_reviews': 0, 'negative_review': '', 'positive_review': ''}

    for feature, sentiment_score, sentence in o.opinion_sentences:
        self.ratings[feature]['total_reviews'] += 1
        if sentiment_score > 0:
            self.ratings[feature]['positive'] += 1
            self.ratings[feature]['positive_review'] = sentence
        elif sentiment_score < 0:
            self.ratings[feature]['negative'] += 1
            self.ratings[feature]['negative_review'] = sentence
        else:
            self.ratings[feature]['neutral'] += 1

    for feature in o.opinion_features:
        self.final_features.append((feature, self.ratings[feature]['total_reviews']))
    self.sorted_features = sorted(set(self.final_features), key=lambda x: x[1], reverse=True)

    pp = pprint.PrettyPrinter(indent=4)
    print self.sorted_features
    print len(self.sorted_features)
    for index in range(0, 10):
        iter_feature = self.sorted_features[index][0]
        print "Feature: ", iter_feature
        pp.pprint(self.ratings[iter_feature])
def test_signature(test_dir):
    F = []
    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(test_dir)
    count = 1.0
    for ifile in img_files:
        print("Extracting features, " + str(round(count / len(img_files) * 100, 1)) + "% done ...")
        count += 1
        signature = Image.open(ifile)
        processed = improc.preprocess(signature)
        F.append(ftextr.extract_features(processed))

    F = np.array(F)
    F.dump(test_dir + "feature_dump")
def main():
    webpages_dir = os.path.join(util.ROOT, 'data/weps2007_data_1.1/traininig/web_pages')
    fe = FeatureExtractor()
    ff = FeatureFilter()
    for name in os.listdir(webpages_dir):
        print 'begin clustering %s' % name
        reader = FileReader(webpages_dir, name)
        description = reader.read_description()
        pc = PersonCorpus(name)
        fm = FeatureMapper()
        for rank in description:
            doc_meta = {}
            html_path = os.path.join(webpages_dir, name, 'raw', rank, 'index.html')
            content = text_extract(html_path)
            features, wordcount = fe.extract(content)
            doc_meta['word_num'] = wordcount
            good_features = ff.filter(features)
            vec = FeatureVector(good_features, fm)
            pc.add_vector(vec)
        pc.compute_matrix()
        pc.dump_matrix()
def runKWS(query, imagePath, svgPath):
    svc = SVGCropper()
    fe = FeatureExtractor()
    result = []
    # set threshold here
    threshold = 40
    output = ""

    print("Cropping the segments")
    keywordsList = svc.cropWords(imagePath, svgPath)
    fq = fe.getFeatureVector(query)

    dists = []
    for keyword in keywordsList:
        f = fe.getFeatureVector(keyword[0])
        dist, path = fastdtw(f, fq, dist=euclidean)
        print "distance from ", keyword[1], " : ", dist
        dists.append(dist)
        output += keyword[1] + "," + str(dist) + " "
    return output
def __init__(self, train_data_path, pred_qs=None):
    """
    Constructor
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename='qclassifier.log',
        filemode='w')
    reload(sys)
    sys.setdefaultencoding('utf8')
    self.clf = None
    self.path = train_data_path
    self.pred_qs = pred_qs
    self.extractor = FeatureExtractor()
    self.features = None
    self.labels = None
    self.vectorizer = None
    self.cate = ['Person', 'Number', 'Location', 'Other']
def main(config_filename, port):
    global _text_field, _class_field, _preprocessor, _feature_extractor
    limit_port = 1024
    if port <= limit_port:
        print("Please, indicate a port higher than %s." % (limit_port))
        quit()
    logger.disabled = True
    parameters = Parameters(config_filename, training_mode=False)
    _text_field = parameters.excel_column_with_text_data
    _class_field = parameters.excel_column_with_classification_data
    _preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package,
                                 stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu,
                                 stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir,
                                 training_mode=parameters.training_mode)
    _feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                          vectorizer_name=parameters.vectorizer,
                                          training_mode=parameters.training_mode,
                                          use_lda=parameters.use_lda,
                                          document_adjustment_code=parameters.document_adjustment_code,
                                          remove_adjectives=parameters.remove_adjectives,
                                          synonyms_file=parameters.synonyms_file,
                                          features_file=parameters.features_file)
    app.run(host='0.0.0.0', port=port, debug=False)  # host='0.0.0.0' allows access from any network.
class NaiveBayesAnalyzer:

    def __init__(self, dict):
        self._dict = dict
        self._fe = FeatureExtractor()

    def train(self):
        train_data = []
        for k, v in self._dict.items():
            train_data = train_data + [(self._fe.default_feature_extractor(f), k) for f in v]
        self._classifier = nltk.classify.NaiveBayesClassifier.train(train_data)

    def analyze(self, text):
        feats = self._fe.default_feature_extractor(text)
        prob_dist = self._classifier.prob_classify(feats)
        classification = prob_dist.max()
        # print(classification)
        # for k in self._dict.keys():
        #     print(k, prob_dist.prob(k))
        return classification
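# Minimal usage sketch for NaiveBayesAnalyzer, assuming FeatureExtractor.default_feature_extractor
# turns a text into the NLTK-style feature dict used in train()/analyze() above. The category
# names and sample sentences below are made up for illustration only.
training_texts = {
    'sports': ['the team won the match', 'a great goal in the second half'],
    'finance': ['the stock market fell today', 'the bank raised interest rates'],
}
analyzer = NaiveBayesAnalyzer(training_texts)
analyzer.train()
print(analyzer.analyze('shares dropped after the earnings call'))  # expected: 'finance'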
args = parser.parse_args()

if not os.path.exists(args.extractor):
    print "Path to extractor '%s' not found" % args.extractor
    sys.exit(-1)

if args.path_to_audio is None:
    args.path_to_audio = "audio/" + args.collection_name
if args.path_to_audio.endswith("/"):
    args.path_to_audio = args.path_to_audio[:-1]
if not os.path.exists(args.path_to_audio):
    print "Path to audio '%s' not found" % args.path_to_audio
    sys.exit(-1)

if args.path_to_features is None:
    if not os.path.exists("features"):
        os.mkdir("features")
    args.path_to_features = "features/" + args.collection_name
if args.path_to_features.endswith("/"):
    args.path_to_features = args.path_to_features[:-1]
if not os.path.exists(args.path_to_features[:args.path_to_features.rfind("/")]):
    print "Path to features '%s' not found" % args.path_to_features
    sys.exit(-1)

print args

extractor = FeatureExtractor(args.extractor)
extractor.extract(args.path_to_audio, args.path_to_features, args.audio_filetype, args.replace_features)
class WSD:

    def __init__(self):
        self.classifier = None
        self.className = "NB"       # other options in the future: MaxEnt, DT
        self.featSets = ["BoW"]     # other options: combination of BoW, LocalCol, PoS
        self.training = []          # original train instances
        self.trainFeatures = []     # train features
        self.test = []              # original test instances
        self.testFeatures = []      # test features
        self.featExtractor = FeatureExtractor()

    def setTrain(self, instances):
        self.training = instances
        self.trainFeatures = []

    def setTest(self, instances):
        self.test = instances
        self.testFeatures = []

    def setClassifier(self, className):
        self.className = className

    def setFeatureSet(self, featSets):
        self.featSets = featSets

    def learn(self):
        # check if variables are initialized
        if len(self.training) == 0:
            sys.stderr.write("No training assigned\n")
            return 0
        if len(self.trainFeatures) == 0:
            sys.stderr.write("[Time] %s : Extracting training features\n" % time.asctime())
            self.trainFeatures = [(self.getFeatures(instance), instance[1]) for instance in self.training]
        else:
            sys.stderr.write("[Time] %s : Features already extracted\n" % time.asctime())

        if self.className == "NB":
            sys.stderr.write("[Time] %s : Learning a Naive Bayes classifier\n" % time.asctime())
            self.classifier = nltk.NaiveBayesClassifier.train(self.trainFeatures)
        if self.className == "MaxEnt":
            sys.stderr.write("[Time] %s : Learning a Maximum Entropy classifier\n" % time.asctime())
            # self.classifier = nltk.classify.MaxentClassifier.train(self.trainFeatures, "IIS", trace=3, max_iter=100)
            self.classifier = nltk.classify.MaxentClassifier.train(self.trainFeatures, "IIS", trace=3, max_iter=30)
        if self.className == "DT":
            sys.stderr.write("[Time] %s : Learning a Decision Tree classifier\n" % time.asctime())
            self.classifier = nltk.classify.DecisionTreeClassifier.train(self.trainFeatures, entropy_cutoff=0, support_cutoff=0)
        if self.className == "NB_sklearn":
            sys.stderr.write("[Time] %s : Learning a Multinomial Naive Bayes (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = MultinomialNB()
            self.classifier.fit(X, y)
        if self.className == "DT_sklearn":
            sys.stderr.write("[Time] %s : Learning a Decision Tree (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = DecisionTreeClassifier(random_state=0)
            self.classifier.fit(X, y)
        if self.className == "MaxEnt_sklearn":
            sys.stderr.write("[Time] %s : Learning a Logistic Regression (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = LogisticRegression()
            self.classifier.fit(X, y)
        if self.className == "SVM_sklearn":
            sys.stderr.write("[Time] %s : Learning a Linear Support Vector Machine (scikit-learn) classifier\n" % time.asctime())
            X, y = self.featExtractor.convert2sklearn(self.trainFeatures)
            self.classifier = LinearSVC(C=1.0)
            self.classifier.fit(X, y)

        sys.stderr.write("[Time] %s : Learning finished\n" % time.asctime())
        # self.classifier.show_most_informative_features(20)

    def predict(self):
        if self.classifier is None:
            sys.stderr.write("[ERROR] No classifier learnt")
            return 0
        if len(self.test) == 0:
            sys.stderr.write("[ERROR] No test assigned")
            return 0
        if len(self.testFeatures) == 0:
            sys.stderr.write("[Time] %s : Extracting test features\n" % time.asctime())
            self.testFeatures = [(self.getFeatures(instance), instance[1]) for instance in self.test]
        else:
            sys.stderr.write("[Time] %s : Test features already extracted\n" % time.asctime())

        sys.stderr.write("[Time] %s : Predictions on test\n" % time.asctime())
        if self.className in ("MaxEnt_sklearn", "SVM_sklearn", "DT_sklearn", "NB_sklearn"):
            X, y = self.featExtractor.convert2sklearn(self.testFeatures, train=False)
            predictions = self.classifier.predict(X)
        else:
            predictions = [self.classifier.classify(feats[0]) for feats in self.testFeatures]
        return predictions

    def accuracy(self, preds=None, gold=None):
        if preds is None:
            if len(self.testFeatures) == 0:
                if len(self.test) == 0:
                    sys.stderr.write("[ERROR] No test assigned")
                    return 0
                sys.stderr.write("[Time] %s : Extracting test features\n" % time.asctime())
                self.testFeatures = self.getFeatures(self.test)
            if self.className in ("MaxEnt_sklearn", "SVM_sklearn", "DT_sklearn", "NB_sklearn"):
                X_test, y_test = self.featExtractor.convert2sklearn(self.testFeatures, train=False)
                acc = self.classifier.score(X_test, y_test)
            else:
                acc = nltk.classify.accuracy(self.classifier, self.testFeatures)
            return acc
        else:
            correct = [l == r for (l, r) in zip(gold, preds)]
            if correct:
                return float(sum(correct)) / len(correct)
            else:
                return 0

    def getFeatures(self, instance):
        features = {}
        for ft in self.featSets:
            if ft == "BoW":
                features.update(self.featExtractor.getBoW(instance))
            if ft == "SPoS":
                features.update(self.featExtractor.getSurroundPoS(instance))
            if ft == "LCOL":
                features.update(self.featExtractor.getLocalCollocations(instance))
        return features

    def saveModel(self, fileName):
        try:
            f = open(fileName, 'wb')
            save = {
                'classifier': self.classifier,
                'className': self.className,
                'featSets': self.featSets
            }
            pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
            f.close()
        except Exception as e:
            print('Unable to save data to', fileName, ':', e)
            raise
        statinfo = os.stat(fileName)
        sys.stderr.write('[INFO] Saved model in %s' % fileName)
        sys.stderr.write('[INFO] Compressed pickle size: ' + str(statinfo.st_size) + "\n")

    def loadModel(self, fileName):
        with open(fileName, 'rb') as f:
            save = pickle.load(f)
            self.classifier = save['classifier']
            self.className = save['className']
            self.featSets = save['featSets']
            del save  # hint to help gc free up memory
        sys.stderr.write("[INFO] Loaded model name: %s\n" % self.className)
        sys.stderr.write("[INFO] Loaded model features set: " + str(self.featSets) + "\n")
from ReviewParser import ReviewParser
from FeatureExtractor import FeatureExtractor

review_file = ['Apple_iPhone_4.csv', 'Blackberry_Torch_9800.csv', 'Nikon_D90.csv', 'Canon_ELPH_300_HS.csv']

rev = ReviewParser(open('../data/reviews/' + review_file[3], 'rb'), 'CSV')
rev.parse()
text = rev.get_raw_text()
f = FeatureExtractor(text)
print f.get_frequent_features(5)

"""
#tokenize_patterns = ['[Nn]ikon ?[dD][0-9]+', '([0-9]+ ?mm)', '(auto[ -_]?focus)', '(Apple)[ ]?(iphone)??[0-5]?[gs]*']
features = [w.lower() for (w, t) in tags if t.startswith('N') and t != 'NNP']
features = p.stemmer(features)
dist = nltk.FreqDist(features)
obs = [ob for ob in dist.iteritems()]
logfile = open('/tmp/log.txt', 'w')
logfile.write("".join(str(obs)).replace("), (", ")\n("))
logfile.close()
"""
try:
    min_support = int(sys.argv[1])
except:
    min_support = 5

reviews_path = '../data/reviews/'
review_files = check_output(['ls', '-1', reviews_path]).split()
for review_file in review_files:
    print review_files.index(review_file), ' ' + review_file

choice = int(input('#'))
if choice not in xrange(0, len(review_files)):
    print 'Error'
    exit(-1)

rev = ReviewParser(open(reviews_path + review_files[choice], 'rb'), review_files[choice].split('.')[-1])
rev.parse()
text = rev.get_raw_text()
f = FeatureExtractor(text, review_files[choice])
print "Based on ", len(rev.reviews), " reviews"
features = f.get_frequent_features(min_support)
features = f.prune_features(features, 3)
print "Is this a %s?" % f.product_category
test_b = prp_binary_dataf(test_q)

# prepare corpus features
# Here we assume all the raw data is stored in the corpus dir; the raw data is:
# wiki: enwiki-20160113-pages-articles.xml
# ck12: OEBPS dir that contains files extracted from Concepts_b_v8_vdt.epub
# ck12: CK-12-Biology-Concepts_b_v143_e4x_s1.text (the downloaded version is a pdf; an online converter was used to generate the text)
# ck12: CK-12-Chemistry-Basic_b_v143_vj3_s1.text
# ck12: CK-12-Earth-Science-Concepts-For-High-School_b_v114_yui_s1.text
# ck12: CK-12-Life-Science-Concepts-For-Middle-School_b_v126_6io_s1.text
# ck12: CK-12-Physical-Science-Concepts-For-Middle-School_b_v119_bwr_s1.text
# ck12: CK-12-Physics-Concepts-Intermediate_b_v56_ugo_s1.text
data_pkl_file = None
norm_scores_default = False

if data_pkl_file is None:
    fext = FeatureExtractor(base_dir=base_dir, recalc=False,
                            norm_scores_default=norm_scores_default, print_level=2)
    # prepare word sets, i.e. derive all the unique 1-grams and 2-grams from train, valid and test
    fext.prepare_word_sets(corpus_dir=corpus_dir, train_b=train_b, valid_b=None, test_b=None)
    # prepare ck12html corpus: go into the CK12/OEBPS dir, find every x.html file where x is a number,
    # and extract all the text while ignoring sections such as 'explore more', 'review', 'practice', 'references'
    fext.prepare_ck12html_corpus(corpus_dir=corpus_dir)
    # prepare ck12text corpus: go into the CK12 dir, find all .text files (the 6 textbooks)
    # and extract the relevant text from all chapters of each book
    fext.prepare_ck12text_corpus(corpus_dir=corpus_dir)
    # prepare simplewiki corpus: go into the simplewiki dir, find simplewiki-20151102-pages-articles.xml
    # and extract text from all categories found if the page contains at least some uncommon words from train_b and test_b
    fext.prepare_simplewiki_corpus(corpus_dir, train_b, valid_b)
""" Created on May 8, 2013 @author: bhanu """ from FeatureExtractor import FeatureExtractor import os if __name__ == "__main__": fe = FeatureExtractor() fe.load_dup_dict() # fe = FeatureExtractor() # if(os.path.exists("instanceList.obj")): # fe.load_data() # else: # fe.extractFeatures() # # fe.rolf()
if __name__ == '__main__':
    # process existing audio files
    # os.path.walk('../data', processFile, 0)
    fe = FeatureExtractor()
    path = sys.argv[1]
    fe.setup(path)
    extractedFeatures = fe.getAllFeatures()
    currFile = {path: extractedFeatures}

    # load features that were extracted from existing files
    fl = FeatureLoader()
    dirname, filename = os.path.split(os.path.abspath(__file__))
    ground_truth = os.path.join(dirname, 'extracted_features.pickle')
    data = fl.load(ground_truth)

    # compare features between uploaded file and existing files
    fc = FeatureComparator()
    diffList = []
    currFileName = currFile.iterkeys().next()
def test_AutoEncoder(learning_rate=0.1, training_epochs=15, batch_size=20):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNoising AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training
    """
    x_scipySparse = None
    train_set_x = None
    numInstances = 0
    numFeatures = 0

    if os.path.exists("input_scipySparse.obj"):
        print "loading sparse data from pickled file..."
        f = open("input_scipySparse.obj", 'r')
        x_scipySparse = cPickle.load(f)
        f.close()
        numInstances, numFeatures = x_scipySparse.shape
    else:
        print "extracting features and building sparse data..."
        fe = FeatureExtractor()
        fe.extractFeatures()
        train_set_x = fe.instanceList
        featureDict = fe.featDict
        numInstances = len(train_set_x)
        numFeatures = len(featureDict)
        # the data is presented as a sparse matrix
        x_lil = sp.lil_matrix((numInstances, numFeatures), dtype='float32')
        i = -1
        v = -1
        try:
            for i, instance in enumerate(train_set_x):
                for v in instance.input:
                    x_lil[i, v] = 1
        except:
            print "i=", i, " v=", v
        x_scipySparse = x_lil.tocsc()
        f = open("input_scipySparse.obj", 'w')
        cPickle.dump(x_scipySparse, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    # compute the number of mini-batches for training, validation and testing
    n_train_batches = numInstances / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    # x = sparse.basic.as_sparse_variable(x_scipySparse, 'x')
    x = theano.shared(x_scipySparse, borrow=True)

    ####################################
    #       BUILDING THE MODEL         #
    ####################################
    print "building the model..."
    rng = numpy.random.RandomState(123)
    ae = AutoEncoder(numpy_rng=rng, input=x, n_visible=numFeatures, n_hidden=10,
                     n_trainExs=numInstances)
    cost, updates = ae.get_cost_updates(corruption_level=0., learning_rate=learning_rate)
    train_ae = theano.function([index], cost, updates=updates,
                               givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############
    # go through training epochs
    print "starting training..."
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_ae(batch_index))
        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = end_time - start_time
    print "training completed in : ", training_time
def createKmeansTrainingDataset(self, kmeans_data, dataset_name, kmeans_name, path_list, labels_list, num_of_clusters):
    '''
    Create a training set for KMeans with regression.

    :param kmeans_data: the training matrix obtained using createClassificationTrainingFromDataset on the HOLDOUT set.
    :param kmeans_name: the name under which the KMeans classifier will be pickled and saved.
    :param dataset_name: the name of the NEW dataset created by running the KMeans classifier on the training set,
                         i.e. clustering the feature vectors.
    :param path_list: list of paths where the training set is located.
    :param labels_list: the list of labels for the samples in the training set.
    :param num_of_clusters: the number of clusters for the KMeans classifier.
    '''
    npzfile = np.load(kmeans_data)
    KmeansData = npzfile['arr_0']
    Kmeanslabels = npzfile['arr_1']
    Kmeansclasses = npzfile['arr_2']

    k_means = cluster.KMeans(n_clusters=num_of_clusters)
    # fit on the loaded holdout matrix (the original passed the file name here)
    k_means.fit(KmeansData)

    base_path = "binData/"
    labels = labels_list
    trainingData = []
    classes = []
    cl = 0

    # Building the feature matrix.
    for i, path in enumerate(path_list):
        print labels_list[i]
        for item in os.listdir(path):
            p = path + "/" + item
            print p  # DEBUG
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            feature_vector = np.zeros(num_of_clusters)
            raw_vector = fe.computeFeatureVector()
            Km_vector = k_means.predict(raw_vector)
            for j in range(len(Km_vector)):
                feature_vector[Km_vector[j]] = feature_vector[Km_vector[j]] + 1
            trainingData.append(feature_vector)
            classes.append(cl)

        # Here we multiply the number of POSITIVE samples in the training set so that the
        # unbalanced "Foram vs. Not-Foram" problem becomes balanced.
        if i == 0:
            print "working on positive samples"
            print "Original training size: (should be 68 by 10)"
            print np.shape(trainingData)
            print np.shape(classes)
            for k in range(9):
                trainingData = np.vstack((trainingData, trainingData))
                classes = np.hstack((classes, classes))
            print "After Multiplying Positive Samples"
            print np.shape(trainingData)
            print np.shape(classes)
            trainingData = trainingData.tolist()
            classes = classes.tolist()
        cl = cl + 1

    # DEBUG
    print "final shape: (should be 54,000~ by 10):"
    print np.shape(trainingData)

    # Save the KMeans classifier and the dataset in NPZ format
    joblib.dump(k_means, os.path.join(base_path, kmeans_name), compress=9)
    np.savez(os.path.join(base_path, dataset_name), trainingData, labels_list, classes)