def dist_sqrt_area_rand_triangle(data):
    mesh = data["poly_data"]
    verts_list = FeatureExtractor.generate_random_ints(0, len(mesh.points) - 1, (FeatureExtractor.number_vertices_sampled, 3))
    triangle_areas = PSBDataset._get_cell_areas(mesh.points, verts_list)
    sqrt_areas = np.sqrt(triangle_areas)
    del verts_list
    return {"hist_sqrt_area_rand_three_verts": FeatureExtractor.make_bins(sqrt_areas, FeatureExtractor.number_bins)}
Example #2
    def preprocessing(self, pca=False, tsne=False, umap=False):

        feature_extractor = FeatureExtractor()
        self.x_all = feature_extractor.fit_transform(self.x_all)
        self.x_train = feature_extractor.fit_transform(self.x_train)
        self.x_test = feature_extractor.fit_transform(self.x_test)
        self.x_all_trans_no_pca = np.copy(self.x_all)

        # Apply dimensionality reduction
        scaler = StandardScaler()
        self.x_all = scaler.fit_transform(self.x_all)
        self.x_train = scaler.fit_transform(self.x_train)
        self.x_test = scaler.fit_transform(self.x_test)

        if pca or tsne or umap:
            self.x_all = self.apply_Dim_Reduction(self.x_all,
                                                  apply_pca=pca,
                                                  apply_tSNE=tsne,
                                                  apply_umap=umap)
            self.x_train = self.apply_Dim_Reduction(self.x_train,
                                                    apply_pca=pca,
                                                    apply_tSNE=tsne,
                                                    apply_umap=umap)
            self.x_test = self.apply_Dim_Reduction(self.x_test,
                                                   apply_pca=pca,
                                                   apply_tSNE=tsne,
                                                   apply_umap=umap)

        # Visualization of the data
        if self.visualize:
            self.visualize_inputs_(self.x_all_trans_no_pca)
            self.visualize_pca_inputs()
            self.visualize_tsne_inputs(self.x_all.shape[0])
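The method above calls fit_transform separately on the full, train, and test splits; the more common scikit-learn pattern is to fit preprocessing on the training split only and reuse it for the test split. A minimal sketch of that pattern, independent of the class above (names are illustrative):

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def scale_and_reduce(x_train, x_test, n_components=50):
    # Fit the scaler and PCA on the training split only, then apply them to the test split.
    scaler = StandardScaler().fit(x_train)
    x_train_s, x_test_s = scaler.transform(x_train), scaler.transform(x_test)
    pca = PCA(n_components=n_components).fit(x_train_s)
    return pca.transform(x_train_s), pca.transform(x_test_s)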
Example #3
def getClf():
    records = []

    for cla, file in [(0, "frameOf1.frame"), (1, "frameOf2.frame")]:
        de = Deserialization(file)
        frames = de.frames[100:-100]

        featureExtractor = FeatureExtractor()
        for frame in frames:
            frame_record = featureExtractor.getFeature(frame, cla)
            records.append(frame_record)

    data = np.array(records)

    features = data[:, :-1]
    labels = data[:, -1]

    print features.shape
    print labels.shape
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=42)

    clf = svm.SVC()
    clf.fit(X_train, y_train)

    predicted_test = clf.predict(X_test)

    print np.mean(predicted_test == y_test)

    return clf
 def __init__(self, config):
     self.layer = config["layer"]
     self.input_layer = config["input_layer"]
     self.feature_extractor = FeatureExtractor(
         config["path_to_deploy_file"],
         config["path_to_model_file"],
         input_layer_name=self.input_layer)
Example #5
 def __init__(self, config, image_paths):
     self.layer = config["model_layer"]
     self.search_index_path = config["search_index_path"]
     self.feature_extractor = FeatureExtractor(config["path_to_model_file"],
                                               embedding_layer=self.layer)
     self.image_paths = image_paths
     self.search_index = AnnoyIndex(4096, metric="euclidean")
Example #6
def initialise_everything():
    print('''This procedure can take several hours to finish.
    The program will now run:
     - Normalisation pipeline over the shape database. (~3hrs)
     - Feature extraction over shape database. (~2hrs)\n
     Are you sure you want to continue (y/n)?\n
    ''')
    choice = input(">> ")
    if choice == "n" or choice == "no":
        return

    with open('config.json') as f:
        data = json.load(f)
    path_psd = data["DATA_PATH_PSB"]
    path_normed = data["DATA_PATH_NORMED"]
    path_feature = data["FEATURE_DATA_FILE"]
    db = PSBDataset()
    if len(os.listdir(path_psd)) == 0:
        print("No valid dataset found.\nPoint to a valid dataset.")
        return
    else:
        prompt_for_class_files(path_psd)
        choice = input(
            "Do you wish to go back to the menu to change the current classification settings? (y/n)\n>> "
        )
        if choice == "n":
            return
    if not os.path.isfile(path_normed):
        print("No valid normalised dataset found.\nRunning normalisation.")
        norm = Normalizer(db)
        norm.run_full_pipeline()
    if not os.path.isfile(path_feature):
        print("No valid feature file found.\nRun feature extraction.")
        FE = FeatureExtractor(db)
        FE.run_full_pipeline()
def main():
    dataset_path = "/path/to/Caltech-101"
    modelzoo_path = "/path/to/VGG16"
    
    # create an instance
    convnet = FeatureExtractor(
            prototxt_path=os.path.join(modelzoo_path, "vgg16_deploy.prototxt"),
            caffemodel_path=os.path.join(modelzoo_path, "vgg16.caffemodel"),
            target_layer_name="fc7",
            image_size=224,
            mean_values=[103.939, 116.779, 123.68])
    
    # header
    f = open("caltech101_vggnet_fc7_features.csv", "w")
    header = ["filepath"]
    for i in xrange(4096):
        header.append("feat%d" % (i+1))
    header = ",".join(header) + "\n"
    f.write(header)
    
    # extract features
    categories = os.listdir(dataset_path)
    for category in pyprind.prog_bar(categories):
        file_names = os.listdir(os.path.join(dataset_path, category))
        for file_name in file_names:
            img = cv2.imread(os.path.join(dataset_path, category, file_name))
            feat = convnet.transform(img)
            feat_str = [os.path.join(category, file_name)]
            for value in feat:
                feat_str.append(str(value))
            row = ",".join(feat_str)
            f.write("%s\n" % row)
            f.flush()

    f.close()
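To consume the CSV written above, the features can be read back into a matrix; a small sketch assuming the file layout produced by this script (and '/'-style paths inside the filepath column):

import pandas as pd

df = pd.read_csv("caltech101_vggnet_fc7_features.csv")
categories = df["filepath"].str.split("/").str[0]   # category folder name per row
features = df.filter(like="feat").to_numpy()         # (n_images, 4096) fc7 matrix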
class Indexer(object):
    def __init__(self, config):
        self.layer = config["layer"]
        self.input_layer = config["input_layer"]
        self.feature_extractor = FeatureExtractor(
            config["path_to_deploy_file"],
            config["path_to_model_file"],
            input_layer_name=self.input_layer)
        #self.image_paths = image_paths

    def index_batch(self, batch_size, start_index=0, stop_index=None):
        batches = [
            self.image_paths[x:x + batch_size]
            for x in range(0, len(self.image_paths), batch_size)
        ]
        batch_num = 0
        if not stop_index:
            stop_index = len(batches)
        batches = batches[start_index:stop_index]
        for batch in batches:
            batch_num += 1
            print("Indexing batch ", batch_num, len(batch))
            fv_dict = self.feature_extractor.extract_batch(batch,
                                                           layer=self.layer)
            self.write_to_lmdb(fv_dict)

    def index(self, img):
        fv = self.feature_extractor.extract_from_img(img, layer=self.layer)
        return fv

    def write_to_lmdb(self, fv_dict):
        env = self.connection
        with env.begin(write=True) as txn:
            for k in fv_dict:
                txn.put(k.encode('ascii'), fv_dict[k].tostring())
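write_to_lmdb reads an LMDB environment from self.connection, which is never created in this snippet; a hypothetical setup with the lmdb package could look like the following (path, map size, and key are placeholders):

import lmdb

env = lmdb.open("feature_index.lmdb", map_size=10 * 1024 ** 3)  # ~10 GB address space
with env.begin(write=True) as txn:
    # keys are image paths, values are the serialised feature vectors (fv.tostring() above)
    txn.put(b"images/cat_001.jpg", b"\x00" * 16)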
Example #9
def extract_features(json_file):
    class_id = json_file.split('_')[-1]
    class_id = int(class_id[0])
    feature_extractor = FeatureExtractor(json_file)
    feature_list = feature_extractor.extract_features()

    return feature_list, class_id
Example #10
 def __init__(self, data_type, mode, debug_limit):
     log_csv_path = '{0}/../data/{1}/log_{1}.csv'.format(
         base_dir, data_type)
     feature_path = '{0}/../data/feature/user_feature_{1}.csv'.format(
         base_dir, data_type)
     FeatureExtractor.__init__(self, mode, log_csv_path, feature_path,
                               debug_limit)
Example #11
    def __init__(self, vocab, options):
        global dy
        import dynet as dy
        from feature_extractor import FeatureExtractor
        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)
        self.activations = {
            'tanh':
            dy.tanh,
            'sigmoid':
            dy.logistic,
            'relu':
            dy.rectify,
            'tanh3':
            (lambda x: dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))
        }
        self.activation = self.activations[options.activation]
        self.costaugFlag = options.costaugFlag
        self.feature_extractor = FeatureExtractor(self.model, options, vocab)
        self.labelsFlag = options.labelsFlag
        mlp_in_dims = options.lstm_output_size * 2

        self.unlabeled_MLP = biMLP(self.model, mlp_in_dims,
                                   options.mlp_hidden_dims,
                                   options.mlp_hidden2_dims, 1,
                                   self.activation)
        if self.labelsFlag:
            self.labeled_MLP = biMLP(self.model, mlp_in_dims,
                                     options.mlp_hidden_dims,
                                     options.mlp_hidden2_dims,
                                     len(self.feature_extractor.irels),
                                     self.activation)

        self.proj = options.proj
    def __init__(self):
        self.extractor = FeatureExtractor()

        self.inited = False
        self.last_less_sharp_points = None
        self.last_less_flat_points = None
        self.last_position = np.eye(4)
Example #13
    def __init__(self, words, pos, rels, cpos, langs, w2i, ch, options):

        global dy
        import dynet as dy  # import here so we don't load Dynet if just running parser.py --help for example

        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)

        self.activations = {'tanh': dy.tanh, 'sigmoid': dy.logistic, 'relu':
                            dy.rectify, 'tanh3': (lambda x:
                            dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))}
        self.activation = self.activations[options.activation]

        self.oracle = options.oracle


        self.headFlag = options.headFlag
        self.rlMostFlag = options.rlMostFlag
        self.rlFlag = options.rlFlag
        self.k = options.k

        #dimensions depending on extended features
        self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)
        self.feature_extractor = FeatureExtractor(self.model,options,words,rels,langs,w2i,ch,self.nnvecs)
        self.irels = self.feature_extractor.irels


        mlp_in_dims = options.lstm_output_size*2*self.nnvecs*(self.k+1)
        self.unlabeled_MLP = MLP(self.model, 'unlabeled', mlp_in_dims, options.mlp_hidden_dims,
                                 options.mlp_hidden2_dims, 4, self.activation)
        self.labeled_MLP = MLP(self.model, 'labeled' ,mlp_in_dims, options.mlp_hidden_dims,
                               options.mlp_hidden2_dims,2*len(self.irels)+2,self.activation)
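For concreteness, the MLP input width here is the concatenated BiLSTM output for each of the nnvecs vectors of the k+1 configuration items in scope; with illustrative sizes (these values are assumptions, not from the source):

lstm_output_size = 125                 # assumed BiLSTM size per direction
nnvecs = 3                             # head vector + left/right-most dependent vectors
k = 3                                  # stack/buffer items fed to the MLP
mlp_in_dims = lstm_output_size * 2 * nnvecs * (k + 1)   # 125 * 2 * 3 * 4 = 3000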
Example #14
 def __init__(self, x, y):
     # instantiate feature extractor
     self.fe = FeatureExtractor()
     self.x_text = x
     self.X = self.fe.fit(x, y)
     self.Y = y
     input_shape = self.X.shape[1:]
     # instantiate keras model
     self.model = mlp_model(input_shape)
     self.optimizer = keras.optimizers.Adam(lr=1e-3)
     # Create callback for early stopping on validation loss. If the loss does
     # not decrease in two consecutive tries, stop training.
     self.callbacks = [keras.callbacks.EarlyStopping(
         monitor='val_loss', patience=2)]
     self.model.compile(optimizer=self.optimizer, loss="binary_crossentropy",  metrics=[precision_m, recall_m])
     self.history = self.model.fit(
         self.X,
         self.Y,
         epochs=10,
         callbacks=self.callbacks,
         validation_split=0.1,
         verbose=0,  # Logs once per epoch.
         batch_size=32)
     self.precision = self.history.history['val_precision_m'][-1]
     self.recall = self.history.history['val_recall_m'][-1]
Example #15
 def __init__(self, mode, data_type, log_csv_path, feature_path,
              debug_limit):
     self.db = SimpleCourseDB()
     self.db.build()
     print 'finished building course DB!'
     FeatureExtractor.__init__(self, mode, data_type, log_csv_path,
                               feature_path, debug_limit)
Example #16
    def predict_svm(self, example):
        '''
        :param example: str (example comment)
        :return: str (constructiveness prediction for the example)

        Description:
        Given a comment example, example, this class method returns whether the comment
        is constructive or not based on the trained model for constructiveness.
        '''

        # Build a feature vector for the example
        example_df = pd.DataFrame.from_dict({
            'pp_comment_text': [example],
            'constructive': ['?']
        })
        print(example_df)
        fe = FeatureExtractor(example_df)
        fe.extract_features()
        feats_df = fe.get_features_df()

        # Get the prediction score and find the winner
        prediction = self.svm_pipeline.predict(feats_df)[0]
        prediction_winner = 'Non-constructive' if prediction == 0 else 'Constructive'

        return prediction_winner.upper()
Example #17
 def __init__(self, texts=None, n=16, step_size=1, k=100, kmeans_args=None):
     self.n = n
     self.step_size = step_size
     self.k = k
     self.kmeans = None
     self.kmeans_args = kmeans_args
     FeatureExtractor.__init__(self)
Example #18
    def __init__(self,
                 input_stream,
                 output_stream,
                 w=-1,
                 h=-1,
                 fps=-1,
                 frames=-1,
                 force_gray=False,
                 repetitions=1,
                 options=None,
                 resume=False,
                 reset_stream_when_resuming=False):
        self.input_stream = input_stream
        self.output_stream = output_stream
        self.repetitions = repetitions
        self.__completed_repetitions = 0
        self.__start_time = None
        self.__elapsed_time = None
        self.__rho = options['rho']
        self.steps = 0.0
        self.measured_fps = 0.0
        self.save_scores_only = options['save_scores_only']
        options['stream'] = self.input_stream
        self.input_stream.set_options(w, h, fps, force_gray, frames)
        self.fe = FeatureExtractor(
            w, h, options,
            resume)  # here is the TensorFlow based feature extractor!
        self.blink_steps = []

        if resume:
            out("RESUMING...")
            self.load(reset_stream_when_resuming)
Example #19
 def __init__(self, prototype_dict, output_folder, opt):
     self.prototypes = prototype_dict
     self.opt = opt
     self.feature_extractor = FeatureExtractor(None)
     self.feature_vector_protoypes = self.calc_FV_protoypes()
     self.output_folder = output_folder
     self.metrics = {"precision": [], "recall": [], "f1": []}
Example #20
    def __init__(self,
                 training_data_path,
                 colour_space,
                 num_orientations,
                 pixels_per_cell,
                 cells_per_block,
                 hog_channel,
                 spatial_size,
                 hist_bins,
                 toggle_spatial_features=True,
                 toggle_histogram_features=True,
                 toggle_hog_features=True):

        self.orientations = num_orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block

        self.feature_extractor = FeatureExtractor(
            colour_space, num_orientations, pixels_per_cell, cells_per_block,
            hog_channel, spatial_size, hist_bins, toggle_spatial_features,
            toggle_histogram_features, toggle_hog_features)

        self.classifier = CarClassifier(training_data_path,
                                        self.feature_extractor)
        self.fleet = VehicleFleet()
        self.heatmap = None
        self.frames = 0
        self.labels = None
def dist_two_rand_verts(data):
    distances = []
    mesh = data["poly_data"]
    indices_tuples = FeatureExtractor.generate_random_ints(0, len(mesh.points) - 1, (FeatureExtractor.number_vertices_sampled, 2))
    verts_tuples = [mesh.points[tup] for tup in indices_tuples]
    distances = np.linalg.norm(np.abs(np.diff(np.array(verts_tuples), axis=1)).reshape(-1, 3), axis=1)
    del indices_tuples
    return {"hist_rand_dist_two_verts": FeatureExtractor.make_bins(distances, FeatureExtractor.number_bins)}
Example #22
 def test_stem_words(self):
     f = FeatureExtractor()
     s = f.stem_words({
         'connect': 1,
         'connected': 1,
         'connecting': 1,
         'connection': 1
     })
     self.assertEqual(s, {'connect': 4})
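The stem_words implementation is not included in this section; a minimal sketch that would satisfy the test above, using NLTK's Porter stemmer (which maps connect/connected/connecting/connection to the stem 'connect'):

from collections import Counter
from nltk.stem import PorterStemmer

def stem_words(word_counts):
    # Merge the counts of all words that share the same Porter stem.
    stemmer = PorterStemmer()
    merged = Counter()
    for word, count in word_counts.items():
        merged[stemmer.stem(word)] += count
    return dict(merged)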
def dist_bar_vert(data):
    distances = []
    mesh = data["poly_data"]
    bary_center = mesh.center
    indices = FeatureExtractor.generate_random_ints(0, len(mesh.points) - 1, (FeatureExtractor.number_vertices_sampled, 1))
    rand_verts = mesh.points[indices]
    distances = np.linalg.norm(np.abs(rand_verts.reshape(-1, 3) - bary_center), axis=1)
    del indices
    return {"hist_dist_bar_vert": FeatureExtractor.make_bins(distances, FeatureExtractor.number_bins)}
Example #24
 def __init__(self, mode, data_type, log_csv_path, feature_path, label_path, debug_limit):
     FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
     labels = {}
     with open(label_path, 'r') as r:
         for line in r:
             eid, dropout = line.strip().split(',')
             if str.isdigit(eid):
                 labels[int(eid)] = int(dropout)
     self.labels = labels
Example #25
 def __init__(self, data_path, train_length=2500):
     fe = FeatureExtractor(data_path)
     (self.trainX, self.trainY, self.testX, self.testY, self.eng_tokenizer,
      self.hindi_tokenizer) = fe.get_train_test_data(train_length)
     self.l = fe.l
     self.eng_vocab_size = fe.eng_vocab_size
     self.hindi_vocab_size = fe.hindi_vocab_size
     self.eng_length = fe.eng_length
     self.hindi_length = fe.hindi_length
Example #26
    def predict_from_ngram(self, ngram):
        """Predict class from a ngram.

        args:
            ngram (str): n-gram
        """
        feat_extr = FeatureExtractor(self.config_path)
        feat_val_list = [val for val in feat_extr.iter_feature_values(ngram)]
        return self.predict_from_feat_val_list(feat_val_list)
Example #27
def main_training():
    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    tr_tweets_loader = LabeledTweetsLoader(TRAINING_INPUT_FILENAME)
    tr_labeled_tweets = tr_tweets_loader.parse_tokens_and_labels(
        tr_tweets_loader.load_lines())

    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)

    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    tr_feature_vectors = []  # 2D array of feature vectors
    for labeled_tweet in tr_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        tr_feature_vectors.append(feature_vector)
    tr_network_input = np.array(tr_feature_vectors)
    tr_targets = [labeled_tweet[1] for labeled_tweet in tr_labeled_tweets]
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_targets)

    dev_tweets_loader = LabeledTweetsLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_tweets_loader.parse_tokens_and_labels(
        dev_tweets_loader.load_lines())
    dev_feature_vectors = []  # 2D array of feature vectors
    for labeled_tweet in dev_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        dev_feature_vectors.append(feature_vector)
    dev_network_input = np.array(dev_feature_vectors)
    dev_targets = [labeled_tweet[1] for labeled_tweet in dev_labeled_tweets]
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_targets)

    # Every epoch is cheap (< 1ms), so we don't need the ability to continue training from a previous model.
    print("Commencing new training run")
    model_creator = ModelCreator(vu)
    model = model_creator.create_two_dense_model(hidden_layer_size=HIDDEN_SIZE)

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    model.fit(tr_network_input,
              tr_targets_one_hot_encoded,
              batch_size=32,
              epochs=MAX_EPOCHS,
              validation_data=(dev_network_input, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])
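rectangular_targets_to_one_hot is not shown; a plausible stand-in that turns the list of target labels into the one-hot matrix the Keras model expects (the class list would come from whatever VocabUtil exposes, which is an assumption here):

import numpy as np

def targets_to_one_hot(targets, classes):
    # One row per target, with a single 1.0 in the column of that target's class.
    index = {label: i for i, label in enumerate(classes)}
    one_hot = np.zeros((len(targets), len(classes)), dtype=np.float32)
    for row, label in enumerate(targets):
        one_hot[row, index[label]] = 1.0
    return one_hot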
Example #28
 def __init__(self, prefix='_p_', min_df=1, max_per=1.0, binarize=False, transform=None, replace_num='#',
              source=None, subdir=None, pseudotype=None, splits_file=None, stage='training', suffix='',
              lower=True, scale_factor=None):
     name = 'pkl'
     assert transform != 'tfidf'
     FeatureExtractor.__init__(self, name=name, prefix=prefix, min_df=min_df, max_per=max_per, binarize=binarize,
                               transform=transform, replace_num=replace_num, source=source, subdir=subdir,
                               pseudotype=pseudotype, splits_file=splits_file, stage=stage, suffix=suffix,
                               lower=lower, scale_factor=scale_factor)
Example #29
 def __init__(self):
     self.root_path_ = os.path.split(os.path.realpath(__file__))[0]
     self.model_path_ = os.path.join(self.root_path_, 'test.pkl')
     self.clf_ = joblib.load(self.model_path_)
     self.fte_ = FeatureExtractor()
     self.sal_ = SALineupWrapper()
     self.sal_opt_args_ = ' --productname=sc --script-malware=true --loglevel=all'
     self.sal_path_ = os.path.join(self.sal_.get_path(), 'salineup')
     self.behavior_path_ = os.path.join(self.root_path_, 'behavior')
Example #30
 def __init__(self,
              movie_dict=None,
              act_set=None,
              slot_set=None,
              db=None,
              corpus=None,
              train=True,
              _reload=False,
              n_hid=100,
              batch=128,
              ment=0.,
              inputtype='full',
              upd=10,
              sl='e2e',
              rl='e2e',
              pol_start=600,
              lr=0.005,
              N=1,
              tr=2.0,
              ts=0.5,
              max_req=2,
              frac=0.5,
              name=None):
     self.movie_dict = movie_dict
     self.act_set = act_set
     self.slot_set = slot_set
     self.database = db
     self.max_turn = dialog_config.MAX_TURN
     self.training = train
     self.feat_extractor = FeatureExtractor(corpus, self.database.path, N=N)
     out_size = len(dialog_config.inform_slots) + 1
     in_size = len(self.feat_extractor.grams) + len(
         dialog_config.inform_slots)
     slot_sizes = [
         self.movie_dict.lengths[s] for s in dialog_config.inform_slots
     ]
     self._init_model(in_size, out_size, slot_sizes, self.database, \
             n_hid=n_hid, learning_rate_sl=lr, batch_size=batch, ment=ment, inputtype=inputtype, \
             sl=sl, rl=rl)
     self._name = name
     if _reload: self.load_model(dialog_config.MODEL_PATH + self._name)
     if train: self.save_model(dialog_config.MODEL_PATH + self._name)
     self._init_experience_pool(batch)
     self.episode_count = 0
     self.recent_rewards = deque([], 1000)
     self.recent_successes = deque([], 1000)
     self.recent_turns = deque([], 1000)
     self.recent_loss = deque([], 10)
     self.discount = 0.99
     self.num_updates = 0
     self.pol_start = pol_start
     self.tr = tr
     self.ts = ts
     self.max_req = max_req
     self.frac = frac
     self.upd = upd
Example #31
    def __init__(self, vocab, options):

        # import here so we don't load Dynet if just running parser.py --help for example
        from multilayer_perceptron import MLP
        from feature_extractor import FeatureExtractor
        global dy
        import dynet as dy

        global LEFT_ARC, RIGHT_ARC, SHIFT, SWAP
        LEFT_ARC, RIGHT_ARC, SHIFT, SWAP = 0, 1, 2, 3

        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model, alpha=options.learning_rate)

        self.activations = {
            'tanh':
            dy.tanh,
            'sigmoid':
            dy.logistic,
            'relu':
            dy.rectify,
            'tanh3':
            (lambda x: dy.tanh(dy.cwise_multiply(dy.cwise_multiply(x, x), x)))
        }
        self.activation = self.activations[options.activation]

        self.oracle = options.oracle

        self.headFlag = options.headFlag
        self.rlMostFlag = options.rlMostFlag
        self.rlFlag = options.rlFlag
        self.k = options.k
        self.recursive_composition = options.use_recursive_composition
        #ugly hack

        #dimensions depending on extended features
        self.nnvecs = (1 if self.headFlag else
                       0) + (2 if self.rlFlag or self.rlMostFlag else
                             0) + (1 if self.recursive_composition else 0)
        self.feature_extractor = FeatureExtractor(self.model, options, vocab,
                                                  self.nnvecs)
        self.irels = self.feature_extractor.irels

        if options.no_bilstms > 0:
            mlp_in_dims = options.lstm_output_size * 2 * self.nnvecs * (
                self.k + 1)
        else:
            mlp_in_dims = options.lstm_input_size * self.nnvecs * (self.k + 1)

        self.unlabeled_MLP = MLP(self.model, 'unlabeled', mlp_in_dims,
                                 options.mlp_hidden_dims,
                                 options.mlp_hidden2_dims, 4, self.activation)
        self.labeled_MLP = MLP(self.model, 'labeled', mlp_in_dims,
                               options.mlp_hidden_dims,
                               options.mlp_hidden2_dims,
                               2 * len(self.irels) + 2, self.activation)
Example #32
def launch(cfg_path):
    print('[INFO]', 'Starting ...')
    print('[INFO]', 'Loading config')

    json_cfg = load_config(cfg_path)
    print('[DEBUG]', json_cfg)
    json_cfg['config_file'] = os.path.basename(cfg_path)

    extractor = FeatureExtractor(json_cfg)
    extractor.start()
Example #33
 def __init__(self, mode, data_type, log_csv_path, enrollment_path,
              label_path, module_path, feature_path, debug_limit):
     self.db = SimpleCourseDB(mode, data_type, log_csv_path,
                              enrollment_path, label_path, module_path,
                              feature_path, debug_limit)
     self.db.build()
     print 'finished building course DB!'
     log_csv_path = base_dir + '/../../data/log_train.csv'
     FeatureExtractor.__init__(self, mode, data_type, log_csv_path,
                               feature_path, debug_limit)
Example #34
    def __init__(self):
        self.feature_extractor = FeatureExtractor()
        self.frames = 10  # num of frames to aggregate heatmaps over
        self.heatmaps = []  # collection of heatmaps over past 10 frames
        self.cummulative_heatmap = np.zeros((720, 1280)).astype(np.float64)  # cumulative heat map over 10 frames

        self.cars_detected = 0  # count of cars detected in this frame
        self.contours_detected = []
def main():
    caffe_alexnet_path = "/path/to/caffe-modelzoo/AlexNet"
    caffe_vgg16_path = "/path/to/caffe-modelzoo/VGG16"
    caffe_googlenet_path = "/path/to/caffe-modelzoo/GoogleNet"
    keys_path = "/path/to/dataset/keys.txt"
    data_path = "/path/to/dataset/images"
    dst_path = "/path/to/dataset/features.npy"

    modelname = "VGG16"

    # load pre-trained model
    if modelname == "AlexNet":
        if not os.path.exists(os.path.join(caffe_alexnet_path, "imagenet_mean.npy")):
            convert_mean_file(caffe_alexnet_path)
        convnet = FeatureExtractor(
                prototxt_path=os.path.join(caffe_alexnet_path, "alexnet_deploy.prototxt"),
                caffemodel_path=os.path.join(caffe_alexnet_path, "alexnet.caffemodel"),
                target_layer_name="fc6",
                image_size=227,
                mean_path=os.path.join(caffe_alexnet_path, "imagenet_mean.npy")
                )
    elif modelname == "VGG16":
        convnet = FeatureExtractor(
                prototxt_path=os.path.join(caffe_vgg16_path, "vgg16_deploy.prototxt"),
                caffemodel_path=os.path.join(caffe_vgg16_path, "vgg16.caffemodel"),
                target_layer_name="fc6",
                image_size=224,
                mean_values=[103.939, 116.779, 123.68]
                )
    elif modelname == "GoogleNet":
        googlenet = FeatureExtractor(
                prototxt_path=os.path.join(caffe_googlenet_path, "googlenet_deploy.prototxt"),
                caffemodel_path=os.path.join(caffe_googlenet_path, "googlenet.caffemodel"),
                target_layer_name="pool5/7x7_s1",
                image_size=224,
                mean_values=[104.0, 117.0, 123.0]
                )
    else:
        print "Unknown model name: %s" % modelname
        sys.exit(-1)
    
    # data list
    keys = load_keys(keys_path)
    
    # feature extraction
    feats = []
    for key in keys:
        img = cv2.imread(os.path.join(data_path, key))
        assert img is not None
        feat = convnet.transform(img)
        feats.append(feat)
    feats = np.asarray(feats)
    np.save(dst_path, feats)

    print "Done."
Example #36
def train_model(X_df, y_array, skf_is):
    fe = FeatureExtractor()
    fe.fit(X_df, y_array)
    X_array = fe.transform(X_df)
    # Regression
    train_is, _ = skf_is
    X_train_array = np.array([X_array[i] for i in train_is])
    y_train_array = np.array([y_array[i] for i in train_is])
    reg = Regressor()
    reg.fit(X_train_array, y_train_array)
    return fe, reg
Example #37
 def makefeatures(self, sents_list, ppindexlist):
     """
     ARGS
         sent_list: [[s1word1,s1word2,...], [s2word1,s2word2,...],...]
     RETURNS
         _features: a list of feature set (dict)
     """
     _features = []
     for sent, ppindex in zip(sents_list, ppindexlist):
         fe = FeatureExtractor(sent, ppindex, "succ")
         _features.append(fe.features())
     return _features
Example #38
def generate_seti(filenames, for_test=False):
  files = []
  for filename in filenames:
    for fname in glob.glob(filename):
      files.append(fname)
  print 'logs_to_seti reading from files: %s' % (str(files))
  setis = []
  # Read each file where each row represents a training example.
  for fname in files:
    num_lines = 0
    num_invalid_lines = 0
    num_bad_entry_lines = 0
    bad_entry_lines = []
    # Read examples from file.
    with open(fname, 'rb') as csvfile:
      reader = csv.reader(csvfile)
      reader.next() # ignore header
      i = 0
      invalid_lines = []
      for csv_line in reader:
        num_lines += 1
        bad_line, reason = is_bad_line(csv_line)
        if bad_line:
          num_invalid_lines += 1
          continue
        #try:
        renter_form, err = _to_renter_form(csv_line)
        if renter_form is None:
          print err
          num_bad_entry_lines += 1
          bad_entry_lines.append(csv_line)
          continue
        fe = FeatureExtractor(for_test=for_test)
        seti = fe.to_seti(renter_form)
        setis.append(seti)
        #except Exception as e:
        #  num_invalid_lines += 1
        #  invalid_lines.append(i)
        #  print 'e: %s' % (str(e))
        #  PrintException()
        #  print 'Could not parse line %d. %d cols. \n%s' % (i, len(csv_line), csv_line)
        i += 1
    # Finished handling file.
    print 'File: %s' % fname
    valid_lines = num_lines-num_invalid_lines-num_bad_entry_lines
    print 'Num lines: %d. Valid: %d. Invalid: %d. Bad entry: %d' % (num_lines, valid_lines, num_invalid_lines, num_bad_entry_lines)

  if len(setis) == 0:
    raise Exception('No setis generated!')
  return setis
def processDir(corpusName, mailCorpus, maildir):
    mailIterator = mailCorpus.getFilesList(maildir)
    mailStorage = MailStorage(corpusName)
    featureExtractor = FeatureExtractor()
    progress = ProgressDisplay(len(mailIterator), 'Processing emails')

    # Output files are named 1 to numMails
    index = 1
    for mail in mailIterator:
        processed = processMail(maildir, mail, mailCorpus)
        features = featureExtractor.process(processed)
        mailStorage.store(features, str(index))

        index += 1
        progress.update()
Example #40
 def extract_data(self, id, extraction_method, label_type):
     extractor = FeatureExtractor()
     feature_vector = extractor.extract_feature_vector(id, extraction_method)
     
     if label_type == 'compiler':
         label = self.extract_compiler_label(id)                 # for compiler estimation
     elif label_type == 'optimization_level':
         label = self.extract_optimization_level_label(id)       # for optimization level estimation
     elif label_type == 'test':
         return feature_vector                                   # for test data
     else:
         sys.stderr.write('Unknown label type specified')
         sys.exit()
     
     return label, feature_vector
Example #41
 def update_database_from_file(self,
                               file_name,
                               asm_file_path,
                               gdl_file_path,
                               compiler=None,
                               optimization_level=None):
     file_name += '_' + compiler + '_' + optimization_level
     
     parser = IDAFileParser()
     extractor = FeatureExtractor()
     db_constructor = DatabaseConstructor()
     
     # Update file_name table
     db_constructor.insert_file_name(file_name)
     
     # Update instruction_sequence table
     instruction_list = parser.extract_instruction(asm_file_path)
     db_constructor.insert_instruction_sequence(file_name, instruction_list)
     
     # Update instruction_code_block table
     code_block_list = parser.extract_code_block(asm_file_path)
     db_constructor.insert_code_block(file_name, code_block_list)
     
     # Update opcode_variety table
     opcode_list = parser.extract_opcode(asm_file_path)
     db_constructor.append_opcode_variety(opcode_list)
     
     # Update bigram_variety table
     bigram_list = extractor.extract_ngram_list(opcode_list, 2)
     db_constructor.append_bigram_variety(bigram_list)
     
     # Update trigram_variety table
     trigram_list = extractor.extract_ngram_list(opcode_list, 3)
     db_constructor.append_trigram_variety(trigram_list)
     
     # Update api table
     api_list = parser.extract_api(gdl_file_path)
     db_constructor.insert_api(file_name, api_list)
     
     # Update api_variety table
     db_constructor.append_api_variety(api_list)
     
     if compiler is not None:
         # Update compiler_information table
         db_constructor.insert_compiler_information(file_name, compiler)
     if optimization_level is not None:
         # Update optimization_level_information table
         db_constructor.insert_optimization_level_information(file_name, optimization_level)
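extract_ngram_list presumably slides a window of length n over the opcode sequence; a sketch of that assumed behaviour:

def extract_ngram_list(opcodes, n):
    # Every run of n consecutive opcodes, e.g. n=2 yields opcode bigrams.
    return [tuple(opcodes[i:i + n]) for i in range(len(opcodes) - n + 1)]

# extract_ngram_list(["mov", "add", "jmp"], 2) -> [("mov", "add"), ("add", "jmp")]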
    def predict_model(self, model_file=None, output_file=None, output_probability_file=None):
        """
        Predict classes on self.data and output to output_file
        :param model_file: Model file to read model in from. Otherwise looks for self.classifier
        :param output_file: File to save predictions in
        :param output_probability_file: File to save predicted probabilities in
        :return: predicted classes (array)
        """
        if not self.classifier:
            if not model_file:
                raise Exception("No model to predict with.")
            else:
                with open(model_file, "rb") as f:
                    self.classifier = pickle.load(f)

        if self.data is None:
            raise Exception("Trying to predict using model with no data loaded.")

        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()

        self.predictions = self.classifier.predict(feature_matrix)

        if output_file is not None:
            np.savetxt(output_file, self.predictions, delimiter=",", fmt="%d")

        if output_probability_file is not None:
            pred_probs = self.classifier.predict_proba(feature_matrix)
            np.savetxt(output_probability_file, pred_probs, delimiter=",", fmt="%.3f")

        return self.predictions
    def train_model(self, model_out_file):
        """
        Extract the features from self.data and train the classifier. Output pickled model to model_out_file
        :param model_out_file:
        :return: None
        """
        if self.data is None:
            raise Exception("Trying to train model without any data.")

        sys.stderr.write("Extracting features from data.\n")

        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()

        labels = np.array([0 if lab == "Romantic" else 1 for lab in self.data["is_romantic"]])

        sys.stderr.write("Training classifier.\n")

        self.classifier = LogisticRegression() if self.classifier_type == "logit" else DecisionTreeClassifier()
        self.classifier.fit(feature_matrix, labels)

        sys.stderr.write("Saving classifier.\n")

        with open(model_out_file, "w") as f:
            pickle.dump(self.classifier, f)
Example #44
class TestFeatureExtractor(unittest.TestCase):
    '''
    Unit tests for the FeatureExtractor class. Runs simple tests to ensure that
    the feature vector we get back is of the right length and has frequency
    data that makes sense. More tests should be added.
    ''' 
    def setUp(self):
        '''Sets up the test by constructing feature vectors to get tested'''       
        self.record1 = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
            IUPAC.protein),
            id="YP_025292.1", name="HokC",
            description="toxic membrane protein, small")        
        self.seq1 = self.record1.seq
        self.feature_extractor = FeatureExtractor()  
        self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)
        
    def test_feature_vector_length(self):
        '''Tests that the feature vector is 400 elements long'''
        self.assertEqual(len(self.feature_vector1), 400, msg="Feature vector not 400 long")
        
    def test_dipeptide_frequency_sum(self):
        '''Tests that the dipeptide frequencies sum to 1'''
        checksum = 0.0
        for i in range(0,400):
            checksum += self.feature_vector1[i]
        self.assertAlmostEqual(checksum, 1.0, places=5, msg="Frequencies don't sum to 1")
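The 400-element vector checked here corresponds to the 20 x 20 ordered amino-acid pairs; a sketch of how such dipeptide frequencies could be computed (this is an assumed re-implementation, not the FeatureExtractor code itself):

from itertools import product

AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"   # 20 standard residues -> 400 ordered pairs

def dipeptide_frequencies(sequence):
    # Count every overlapping residue pair and normalise so the frequencies sum to 1.
    pairs = ["".join(p) for p in product(AMINO_ACIDS, repeat=2)]
    counts = dict.fromkeys(pairs, 0)
    for a, b in zip(sequence, sequence[1:]):
        if a in AMINO_ACIDS and b in AMINO_ACIDS:
            counts[a + b] += 1
    total = float(sum(counts.values())) or 1.0
    return [counts[p] / total for p in pairs]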
Example #45
 def __init__(self, movie_dict=None, act_set=None, slot_set=None, db=None, corpus=None,
         train=True, _reload=False, n_hid=100, batch=128, ment=0., inputtype='full', upd=10, 
         sl='e2e', rl='e2e', pol_start=600, lr=0.005, N=1, tr=2.0, ts=0.5, max_req=2, frac=0.5, 
         name=None):
     self.movie_dict = movie_dict
     self.act_set = act_set
     self.slot_set = slot_set
     self.database = db
     self.max_turn = dialog_config.MAX_TURN
     self.training = train
     self.feat_extractor = FeatureExtractor(corpus,self.database.path,N=N)
     out_size = len(dialog_config.inform_slots)+1
     in_size = len(self.feat_extractor.grams) + len(dialog_config.inform_slots)
     slot_sizes = [self.movie_dict.lengths[s] for s in dialog_config.inform_slots]
     self._init_model(in_size, out_size, slot_sizes, self.database, \
             n_hid=n_hid, learning_rate_sl=lr, batch_size=batch, ment=ment, inputtype=inputtype, \
             sl=sl, rl=rl)
     self._name = name
     if _reload: self.load_model(dialog_config.MODEL_PATH+self._name)
     if train: self.save_model(dialog_config.MODEL_PATH+self._name)
     self._init_experience_pool(batch)
     self.episode_count = 0
     self.recent_rewards = deque([], 1000)
     self.recent_successes = deque([], 1000)
     self.recent_turns = deque([], 1000)
     self.recent_loss = deque([], 10)
     self.discount = 0.99
     self.num_updates = 0
     self.pol_start = pol_start
     self.tr = tr
     self.ts = ts
     self.max_req = max_req
     self.frac = frac
     self.upd = upd
Example #46
 def setUp(self):
     '''Sets up the test by constructing feature vectors to get tested'''       
     self.record1 = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
         IUPAC.protein),
         id="YP_025292.1", name="HokC",
         description="toxic membrane protein, small")        
     self.seq1 = self.record1.seq
     self.feature_extractor = FeatureExtractor()  
     self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)
Example #47
	def __init__(self):
		self.actuNames = ActuatorNames()
		self.sensorNames = SensorNames()
		self.bdm = BDWrapper()
		self.expLogColl = CollectionWrapper('experience_log')
		#self.zonelist = self.csv2list('metadata/partialzonelist.csv')
		self.zonelist = self.csv2list('metadata/zonelist.csv')
		self.feater = FeatureExtractor()
		self.clust = Clusterer()
Example #48
 def __init__(self, numGestures, minDescriptorsPerFrame, numWords, descType, numPredictions, parent):
     self.numGestures = numGestures
     self.numWords = numWords
     self.minDescriptorsPerFrame = minDescriptorsPerFrame
     self.parent = parent
     self.classifier = None
     self.windowName = "Testing preview"
     self.handWindowName = "Cropped hand"
     self.binaryWindowName = "Binary frames"
     self.predictionList = [-1] * numPredictions
     self.handTracker = HandTracker(kernelSize=7, thresholdAngle=0.4, defectDistFromHull=30, parent=self)
     self.featureExtractor = FeatureExtractor(type=descType, parent=self)
     self.numSideFrames = 10
     self.prevFrameList = np.zeros((self.numSideFrames,self.parent.imHeight/self.numSideFrames,self.parent.imWidth/self.numSideFrames,3), "uint8")
     self.numPrevFrames = 0
     self.predictionScoreThreshold = 0.2
     self.learningRate = 0.01
     self.numReinforce = 1
Example #49
    def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
        self.input_width = 400
        self.output_width = output_width
        self.training_frac = training_frac
        self.validation_frac = validation_frac
        self.debug = debug
        # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/"      # Where the files live.
        self.names = [  # Names of all of the files.
            "baseplate_3370",
            "collar_1385",
            "htj_2258_nofg",
            "major_tail_1512",
            "mcp_3589",
            "minor_capsid_1500_nofg",
            "minor_tail_2033",
            "portal_2141",
            "tail_fiber_3007",
            "tail_sheath_2350",
        ]

        self.feature_extractor = FeatureExtractor()
class RelationshipPostClassifier:
    """
    Main class for classification and prediction
    """

    def __init__(self, classifier_type="tree"):
        self.data = None
        self.classifier_type = classifier_type
        self.classifier = None
        self.featureExtractor = None
        self.predictions = None

    def read_csv_data(self, csv_file, maxrows=None):
        """
        Read in data from given csv_file into self.data (pandas dataframe)
        maxrows limits number of read rows.
        :param csv_file:
        :param maxrows:
        :return: None
        """
        sys.stderr.write("Reading in data from " + csv_file + "\n")

        if maxrows:
            self.data = pan.read_csv(csv_file, nrows=maxrows, encoding='utf-8')
        else:
            self.data = pan.read_csv(csv_file, encoding='utf-8')

    def train_model(self, model_out_file):
        """
        Extract the features from self.data and train the classifier. Output pickled model to model_out_file
        :param model_out_file:
        :return: None
        """
        if self.data is None:
            raise Exception("Trying to train model without any data.")

        sys.stderr.write("Extracting features from data.\n")

        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()

        labels = np.array([0 if lab == "Romantic" else 1 for lab in self.data["is_romantic"]])

        sys.stderr.write("Training classifier.\n")

        self.classifier = LogisticRegression() if self.classifier_type == "logit" else DecisionTreeClassifier()
        self.classifier.fit(feature_matrix, labels)

        sys.stderr.write("Saving classifier.\n")

        with open(model_out_file, "w") as f:
            pickle.dump(self.classifier, f)

    def predict_model(self, model_file=None, output_file=None, output_probability_file=None):
        """
        Predict classes on self.data and output to output_file
        :param model_file: Model file to read model in from. Otherwise looks for self.classifier
        :param output_file: File to save predictions in
        :param output_probability_file: File to save predicted probabilities in
        :return: predicted classes (array)
        """
        if not self.classifier:
            if not model_file:
                raise Exception("No model to predict with.")
            else:
                with open(model_file, "rb") as f:
                    self.classifier = pickle.load(f)

        if self.data is None:
            raise Exception("Trying to predict using model with no data loaded.")

        self.featureExtractor = FeatureExtractor(self.data)
        feature_matrix = self.featureExtractor.extract_full_feature_matrix()

        self.predictions = self.classifier.predict(feature_matrix)

        if output_file is not None:
            np.savetxt(output_file, self.predictions, delimiter=",", fmt="%d")

        if output_probability_file is not None:
            pred_probs = self.classifier.predict_proba(feature_matrix)
            np.savetxt(output_probability_file, pred_probs, delimiter=",", fmt="%.3f")

        return self.predictions
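A hypothetical end-to-end use of the class above (file names are placeholders):

clf = RelationshipPostClassifier(classifier_type="logit")
clf.read_csv_data("relationship_posts.csv", maxrows=10000)
clf.train_model("romantic_model.pkl")
predictions = clf.predict_model(output_file="predictions.csv",
                                output_probability_file="probabilities.csv")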
Example #51
def runOnSplit(penalties, constants, split):
	"Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split' 
	fe = FeatureExtractor(split)
	featurized = fe.featurizeFiles('../data')
	classNames = featurized[0]
	trainMatrix, trainLabels = featurized[1:3]
	devMatrix, devLabels = featurized[3:5]
	trainFiles, devFiles = featurized[5:]


	classCounts = Counter()
	for l in devLabels:
		classCounts[l] += 1

	for penalty in penalties:
		for C in constants:
			print "\nPenalty, regularization: ", str(penalty), str(C)

			abstractModel = LogisticRegression()
			model = abstractModel.scikit(penalty, C)
			model_params = (penalty, C)
			model.fit(trainMatrix, trainLabels)

			errors, rankedExamples = Counter(), []

			score = model.score(devMatrix, devLabels)
			predicted_labels = model.predict(devMatrix)

			probs = model.predict_proba(devMatrix)

			for j,pred in enumerate(predicted_labels):
				if not pred == devLabels[j]:
					errors[devLabels[j]] += 1

			for i, p in enumerate(probs):
				rankedExamples.append((p, devFiles[i], predicted_labels[i] == devLabels[i]))		

			results = ''
			for i, c in enumerate(classNames):
				missRate = str(float(errors[i]) / classCounts[i])
				results += '\t' + c + ' error: ' + missRate + '\n'

			results += '\tScore: ' + str(score)
			fileName = 'results/scores/LRsplit'
			for param in model_params:
				fileName += '_' + str(param)
			fileName += '.txt'
			with open(fileName, 'w') as f:
				f.write(results)
			print results

			print '..ranking examples'
			if len(rankedExamples):
				examples = sorted(rankedExamples, key=lambda e: e[0][0])
				fileName = 'results/rankedExamples/LRsplit_' + str(split*100)
				for param in model_params:
					fileName += '_' + str(param)
				fileName += '.txt'
				with open(fileName,'w') as f:
					for e in examples:
						results = e[1]
						results += '\n\t Probability of class '
						results += classNames[0] + ': '
						results += str(e[0][0])
						results += '\n\t Correct: ' + str(e[2])
						f.write(results)
 def __init__(self, data_type, mode, debug_limit):
     log_csv_path = '{0}/../data/{1}/log_{1}.csv'.format(base_dir, data_type)
     feature_path = '{0}/../data/feature/enrollment_feature_{1}.csv'.format(base_dir, data_type)
     FeatureExtractor.__init__(self, mode, log_csv_path, feature_path, debug_limit)
Example #53
 def __init__(self, sentance_length_range=None):
     self.sentance_length_range = sentance_length_range
     FeatureExtractor.__init__(self)
Example #54
    def __init__(self, mode, data_type, log_csv_path, module_path, feature_path, debug_limit):

        self.module_db = load_modules(module_path)
        FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
Example #55
 def _get_features(self, v="", v_corpus=None, cls2id=None, domain="src"):
     _flist = []
     _labellist_int = []
     _labellist_str = []
     _labelid = cls2id[v]
     if v_corpus:
         for sid, s in enumerate(v_corpus):
             try:
                 fe = FeatureExtractor(s, verb=v)
                 if "chunk" in self.featuretypes:
                     fe.chunk()
                 if "3gram" in self.featuretypes:
                     fe.ngrams(n=3)
                 if "5gram" in self.featuretypes:
                     fe.ngrams(n=5)
                 if "7gram" in self.featuretypes:
                     fe.ngrams(n=7)
                 if "dep" in self.featuretypes:
                     fe.dependency()
                 if "srl" in self.featuretypes:
                     fe.srl()
                 if "ne" in self.featuretypes:
                     fe.ne()
                 if "errorprob" in self.featuretypes:
                     pass
                 if "topic" in self.featuretypes:
                     pass
                 augf = proc_easyadapt(fe.features, domain=domain)
                 _flist.append(augf)
                 _labellist_int.append(_labelid)
                 _labellist_str.append(v)
             except ValueError:
                 logging.debug(pformat("CaseMaker feature extraction: couldn't find the verb"))
             except:
                 print v
                 raise
     else:
         _flist.append(self.nullfeature)
         _labellist_int.append(_labelid)
         _labellist_str.append(v)
     return _flist, _labellist_str, _labellist_int
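proc_easyadapt is not defined in this excerpt; judging by its name it applies the "frustratingly easy" domain-adaptation feature augmentation (Daumé-style). A sketch of that assumed behaviour, treating fe.features as a name-to-value dict:

def proc_easyadapt(features, domain="src"):
    # Duplicate every feature under a shared prefix and under a domain-specific prefix,
    # so source- and target-domain examples share only the "general" copy.
    augmented = {}
    for name, value in features.items():
        augmented["general_" + name] = value
        augmented[domain + "_" + name] = value
    return augmented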
Example #56
class MultiReader(DataLoader):
    def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
        self.input_width = 400
        self.output_width = output_width
        self.training_frac = training_frac
        self.validation_frac = validation_frac
        self.debug = debug
        # self.dir = "/home/jlawson/Dropbox/ProteinFunctionData/"      # Where the files live.
        self.names = [  # Names of all of the files.
            "baseplate_3370",
            "collar_1385",
            "htj_2258_nofg",
            "major_tail_1512",
            "mcp_3589",
            "minor_capsid_1500_nofg",
            "minor_tail_2033",
            "portal_2141",
            "tail_fiber_3007",
            "tail_sheath_2350",
        ]

        self.feature_extractor = FeatureExtractor()

    def load_data(self, source):
        """Load the data from a directory with a collection of source files,
        one file for each kind of protein. 
        
        Returns an array of pairs in the form:
        
        [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]

        :type source:   String
        :param source:  The directory where the source files are located.
        """
        dir = source
        raw_data = list()
        unsupporteds = list()
        for i in range(0, len(self.names)):
            num_in_file = 0
            if self.debug:
                print (dir + self.names[i] + ".faa")
            handle = open(dir + self.names[i] + ".faa", "rU")  # Open a file.
            for record in SeqIO.parse(handle, "fasta"):
                num_in_file += 1
                try:
                    # print "      " + record.id
                    feature_vector = self.feature_extractor.extract_features(record)
                    # Now we have to augment the feature vector with the output
                    # vector. So we:
                    #   1) Make a new array a bit longer than the feature vector,
                    #   2) Copy the feature vector into the first cells of the new array,
                    #   3) Find the appropriate cell in the tail of the new array
                    #      and set that one equal to 1.
                    prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width)
                    for col in range(0, len(feature_vector)):  # This surely could be done more efficiently.
                        prepared_data_record[col] = feature_vector[col]  # Doesn't matter for now.
                    prepared_data_record[
                        len(feature_vector) + i
                    ] = 1  # The class of the protein is taken from the order of the files in the list "names"
                    raw_data.append(prepared_data_record)
                except KeyError:
                    if self.debug:
                        print "   Unsupported sequence: " + record.id + "   " + str(record.annotations)
                    unsupporteds.append(record)
                pass
            handle.close()
            if self.debug:
                print "Total in file " + self.names[i] + " = " + str(num_in_file)

        # Now we are done reading all of the data in. In debug mode, print some
        # overall summary information.
        if self.debug:
            print "Supported Sequences = " + str(len(raw_data))
            print "Unsupported Sequences = " + str(len(unsupporteds))

        num_examples = len(raw_data)

        # But the labeled data we have is not randomly ordered. It is sorted
        # by class. We need to shuffle it up or we will only train on the first
        # classes.
        if self.debug:
            print "Shuffling data to randomize for training"
        shuffle = self.rand_perm(num_examples)

        data = numpy.ndarray((num_examples, self.input_width + self.output_width), float)
        for n in range(0, num_examples):
            for w in range(0, self.input_width + self.output_width):
                s = raw_data[shuffle[n]][w]
                data[n, w] = float(s)
        if self.debug:
            print "Finished shuffling data"
            print "Processing data to cull outliers"
        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print "Data shape = ", data.shape, "   num_examples=", num_examples
        inputs = numpy.array(data)[:, 0 : self.input_width]
        outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width]
        if self.debug:
            print "Finished culling outliers"
            print inputs.shape
            print outputs_full.shape
        outputs = numpy.ndarray((num_examples,), int)
        for n in range(0, num_examples):
            found_class = False
            for w in range(0, self.output_width):
                if outputs_full[n, w] > 0.5:
                    outputs[n] = w
                    found_class = True
                    break
        num_training_cases = self.num_training(num_examples)
        num_validation_cases = self.num_validation(num_examples)
        num_test_cases = self.num_test(num_examples)

        print num_training_cases, " ", num_validation_cases, " ", num_test_cases
        training_set = (inputs[0:num_training_cases, :], outputs[0:num_training_cases])
        validation_set = (
            inputs[num_training_cases : num_training_cases + num_validation_cases, :],
            outputs[num_training_cases : num_training_cases + num_validation_cases],
        )
        test_set = (
            inputs[num_training_cases + num_validation_cases :, :],
            outputs[num_training_cases + num_validation_cases :],
        )
        training_set_x, training_set_y = theanoutil.shared_dataset(training_set)
        validation_set_x, validation_set_y = theanoutil.shared_dataset(validation_set)
        test_set_x, test_set_y = theanoutil.shared_dataset(test_set)

        if self.debug:
            print "TYPE of test_set_x =", type(test_set_x)
            print "TYPE of test_set=", type(test_set), "  SIZE of test_set=", len(test_set)
            print "TYPE of test_set[0]=", type(test_set[0]), "  SHAPE of test_set[0]=", test_set[0].shape
            print "TYPE of test_set[1]=", type(test_set[1]), "  SHAPE of test_set[1]=", test_set[1].shape
            print "VALUE of training_set[0,0,0]=", training_set[0][0, 0]
            print "VALUE of training_set[1,0]=", training_set[1][0], "   test_set[1,0]=", test_set[1][0]

        rval = [(training_set_x, training_set_y), (validation_set_x, validation_set_y), (test_set_x, test_set_y)]
        return rval

    # Everything from here down should be turned into a base class.

    def num_training(self, num_examples):
        return int(num_examples * (self.training_frac / 100.0))

    def num_validation(self, num_examples):
        return int(num_examples * (self.validation_frac / 100.0))

    def num_test(self, num_examples):
        return num_examples - (self.num_training(num_examples) + self.num_validation(num_examples))

    def rand_perm(self, length):
        # In debug mode, we want to have a repeatable random number seed so
        # that we can have a repeatable shuffling.
        if self.debug:
            seed(1)
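        # Build the identity permutation, then swap every position with a
        # uniformly random one. This is a simple shuffle (not exactly
        # Fisher-Yates, so very slightly biased), which is adequate here.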
        shuffle = numpy.ndarray((length,), int)
        for n in range(0, length):
            shuffle[n] = n
        for n in range(0, length):
            swap_cell = randint(0, length - 1)
            temp = shuffle[swap_cell]
            shuffle[swap_cell] = shuffle[n]
            shuffle[n] = temp
        return shuffle

    def cull(self, data):
        # Make a list of all row numbers that need to get culled from the data.
        cull_list = []
        for n in range(0, len(data)):
            if self.prune(data[n]):
                cull_list.append(n)
        cull_list.append(len(data))  # A sentinel at the end of the cull list.

        # Make a new array that doesn't have the culled items in it.
        # The 1+ is for the sentinel.
        new_data = numpy.ndarray((1 + len(data) - len(cull_list), self.input_width + self.output_width), float)
        next_cull_index = 0
        next_data_index = 0
        for n in range(0, len(data)):
            if n == cull_list[next_cull_index]:
                next_cull_index += 1
            else:
                new_data[next_data_index] = data[n]
                next_data_index += 1
        print "Number culled = ", len(cull_list) - 1
        return new_data

    def prune(self, example):
        sum = 0.0
        for n in range(0, self.input_width):
            if example[n] < 0.0:
                return True
            if example[n] > 1.0:
                return True
            sum += example[n]
        if sum > 1.01:
            return True
        if sum < 0.99:
            return True
        return False

    def preprocess(self, data):
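        # Standardize each example row-wise: convert every input column to a
        # z-score using that row's own mean and population standard deviation.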
        n = self.input_width
        for r in range(0, len(data)):
            sum_x = 0.0
            sum_x2 = 0.0
            for c in range(0, n):
                sum_x += data[r, c]
                sum_x2 += data[r, c] * data[r, c]
            mu = sum_x / n
            std = math.sqrt((sum_x2 - (sum_x * sum_x) / n) / n)  # Population std
            if std == 0.0:
                std = 1.0  # Avoid division by zero for a constant row
            for c in range(0, n):
                z = (data[r, c] - mu) / std
                # squashed_z = sigma(z)
                data[r, c] = z
            if r % 1000 == 0:
                print "Preprocessed row ", r
        return data
예제 #57
0
class Tester(object):
    def __init__(self, numGestures, minDescriptorsPerFrame, numWords, descType, numPredictions, parent):
        self.numGestures = numGestures
        self.numWords = numWords
        self.minDescriptorsPerFrame = minDescriptorsPerFrame
        self.parent = parent
        self.classifier = None
        self.windowName = "Testing preview"
        self.handWindowName = "Cropped hand"
        self.binaryWindowName = "Binary frames"
        self.predictionList = [-1]*numPredictions
        self.handTracker = HandTracker(kernelSize=7, thresholdAngle=0.4, defectDistFromHull=30, parent=self)
        self.featureExtractor = FeatureExtractor(type=descType, parent=self)
        self.numSideFrames = 10
        self.prevFrameList = np.zeros((self.numSideFrames,self.parent.imHeight/self.numSideFrames,self.parent.imWidth/self.numSideFrames,3), "uint8")
        self.numPrevFrames = 0
        self.predictionScoreThreshold = 0.2
        self.learningRate = 0.01
        self.numReinforce = 1

    def initialize(self, clf):
        self.classifier = clf
        self.numWords = self.classifier.voc.shape[0]
        self.prevStates = np.zeros((self.numSideFrames, self.numWords), "float32")
        self.prevLabels = [0]*self.numSideFrames
        self.prevScores = [0]*self.numSideFrames

    def test_on_video(self):
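        # Phase 1: show the colour-sampling windows on the live feed until the
        # user presses space (Esc exits), then build the hand colour profile.
        # Phase 2: track the hand, quantize its descriptors into a bag-of-words
        # histogram, and display the prediction, keeping a strip of recent
        # frames that can be clicked to reinforce the classifier.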
        vc = self.parent.vc
        while(vc.isOpened()):
            ret,im = vc.read()
            im = cv2.flip(im, 1)
            imhsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            self.handTracker.colorProfiler.draw_color_windows(im, imhsv)
            cv2.imshow(self.windowName, im)
            k = cv2.waitKey(1)
            if k == 32: # space
                break
            elif k == 27:
                sys.exit(0)

        self.handTracker.colorProfiler.run()
        binaryIm = self.handTracker.get_binary_image(imhsv)
        cnt,hull,centroid,defects = self.handTracker.initialize_contour(binaryIm)
        cv2.namedWindow(self.binaryWindowName)
        cv2.namedWindow(self.handWindowName)
        cv2.namedWindow(self.windowName)
        cv2.setMouseCallback(self.windowName, self.reinforce)

        while(vc.isOpened()):
            ret,im = vc.read()
            im = cv2.flip(im, 1)
            imhsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            binaryIm = self.handTracker.get_binary_image(imhsv)
            cnt,hull,centroid,defects = self.handTracker.get_contour(binaryIm)
            imCopy = 1*im
            testData = None
            prediction = -1
            score = -1
            update = False
            if cnt is not None:
                numDefects = defects.shape[0]
                cropImage,cropPoints = self.handTracker.get_cropped_image_from_cnt(im, cnt, 0.05)
                cropImageGray = self.handTracker.get_cropped_image_from_points(imgray, cropPoints)
                #cv2.fillPoly(binaryIm, cnt, 255)
                #cropImageBinary = self.handTracker.get_cropped_image_from_points(binaryIm, cropPoints)
                #cropImageGray = self.apply_binary_mask(cropImageGray, cropImageBinary, 5)
                #kp,des = self.featureExtractor.get_keypoints_and_descriptors(cropImageGray)
                kp = self.featureExtractor.get_keypoints(cropImageGray)
                cropCnt = self.handTracker.get_cropped_contour(cnt, cropPoints)
                kp = self.featureExtractor.get_keypoints_in_contour(kp, cropCnt)
                kp,des = self.featureExtractor.compute_descriptors(cropImageGray, kp)
                if des is not None and des.shape[0] > 0:
                    self.featureExtractor.draw_keypoints(cropImage, kp)
                if des is not None and des.shape[0] >= self.minDescriptorsPerFrame and self.is_hand(defects):
                    words, distance = vq(des, self.classifier.voc)
                    testData = np.zeros(self.numWords, "float32")
                    for w in words:
                        testData[w] += 1
                    normTestData = np.linalg.norm(testData, ord=2) * np.ones(self.numWords)
                    testData = np.divide(testData, normTestData)
                    prediction,score = self.predict(testData)
                    sortedScores = np.sort(score)
                    #if max(score) > self.predictionScoreThreshold:
                    if sortedScores[-1]-sortedScores[-2] >= self.predictionScoreThreshold:
                        self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(0,255,0))
                    else:
                        self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(255,0,0))
                        prediction = -1
                    update = True
                else:
                    self.handTracker.draw_on_image(imCopy, cnt=False, hullColor=(0,0,255))
                    prediction = -1
                cv2.imshow(self.handWindowName,cropImage)
            else:
                prediction = -1
            #self.insert_to_prediction_list(prediction)
            #prediction,predictionCount = self.most_common(self.predictionList)
            #if prediction>=0:
            writtenVal = '-'
            if prediction > 0:
                #if self.classifier.medianDefects is not None and numDefects>=self.classifier.medianDefects[prediction-1]-1 and numDefects<=self.classifier.medianDefects[prediction-1]+1:
                #    #print prediction
                #    writtenVal = str(prediction)
                #    update = True
                #elif self.classifier.medianDefects is None:
                    #print prediction
                writtenVal = str(prediction)
            self.write_on_image(imCopy, writtenVal)
            cv2.imshow(self.binaryWindowName, binaryIm)
            imCopy = self.add_prev_frames_to_image(imCopy, testData, prediction, score, update)
            cv2.imshow(self.windowName,imCopy)
            k = cv2.waitKey(1)
            if k == 27: # Esc
                break

    def test_on_descriptors(self, desList):
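        # Offline variant: for each frame's descriptors, build an L2-normalised
        # bag-of-words histogram against the vocabulary and keep the prediction
        # only when the gap between the two highest scores exceeds the
        # threshold; otherwise the frame is labelled -1.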
        testLabels = []
        for i,des in enumerate(desList): 
            if des is not None and des.shape[0] >= self.minDescriptorsPerFrame:
                words, distance = vq(des, self.classifier.voc)
                testData = np.zeros(self.numWords, "float32")
                for w in words:
                    testData[w] += 1
                normTestData = np.linalg.norm(testData, ord=2) * np.ones(self.numWords)
                testData = np.divide(testData, normTestData)
                prediction,score = self.predict(testData)
                sortedScores = np.sort(score)
                #if max(score) > self.predictionScoreThreshold:
                if sortedScores[-1]-sortedScores[-2] >= self.predictionScoreThreshold:
                    pass
                else:
                    prediction = -1
            else:
                prediction = -1
            testLabels.append(prediction)
        return testLabels

    def predict(self, testData):
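        # Return the predicted label and the per-class decision scores for a
        # single histogram.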
        prediction = self.classifier.predict(testData.reshape(1,-1))
        score = self.classifier.decision_function(testData.reshape(1,-1))
        return prediction[0], score[0]

    def insert_to_prediction_list(self, prediction):
        self.predictionList.append(prediction)
        self.predictionList = self.predictionList[1:]

    def most_common(self, lst):
        for i in range(1,len(lst)-1):
            if lst[i] != lst[i-1] and lst[i] != lst[i+1]:
                lst[i] = -1
        e = max(set(lst), key=lst.count)
        return e,lst.count(e)

    def is_hand(self, defects):
        # Treat the contour as a hand only if it shows at most 5 convexity defects.
        return defects.shape[0] <= 5

    def write_on_image(self, image, text):
        cv2.putText(image, text, (self.parent.imWidth/20,self.parent.imHeight/4), cv2.FONT_HERSHEY_SIMPLEX, 5, (0,0,255), 5)

    def get_prev_frames_image(self):
        image = self.prevFrameList[0]
        for i in range(1,len(self.prevFrameList)):
            image = np.append(image, self.prevFrameList[i], axis=0)
        return image

    def apply_binary_mask(self, image, mask, kernelSize):
        kernel = np.ones((kernelSize,kernelSize),np.uint8)
        dilatedMask = cv2.dilate(mask,kernel,iterations=1)
        maskedImage = cv2.bitwise_and(image, image, mask=dilatedMask)
        return maskedImage

    def add_prev_frames_to_image(self, image, testData, testLabel, testScore, update=False):
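        # Append the strip of previous-frame thumbnails to the right of the
        # current frame; when update is set, push the current thumbnail along
        # with its histogram, label, and scores into the rolling buffers used
        # by reinforce().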
        shrinkIm = cv2.resize(image, None, fx=float(1)/self.numSideFrames, fy=float(1)/self.numSideFrames)
        prevFramesIm = self.get_prev_frames_image()
        image = np.append(image, prevFramesIm, axis=1)
        if update:
            if self.numPrevFrames < self.numSideFrames:
                self.prevFrameList[self.numPrevFrames] = shrinkIm
                self.prevStates[self.numPrevFrames] = testData
                self.prevLabels[self.numPrevFrames] = testLabel
                self.prevScores[self.numPrevFrames] = testScore
                self.numPrevFrames += 1
            else:
                self.prevFrameList = np.append(self.prevFrameList, np.array([shrinkIm]), axis=0)
                self.prevFrameList = self.prevFrameList[1:]
                self.prevStates = np.append(self.prevStates, np.array([testData]), axis=0)
                self.prevStates = self.prevStates[1:]
                self.prevLabels.append(testLabel)
                self.prevLabels = self.prevLabels[1:]
                self.prevScores.append(testScore)
                self.prevScores = self.prevScores[1:]
        return image

    def reinforce(self, event, x, y, flags, param):
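        # Mouse callback on the preview window. Clicking a thumbnail in the
        # side strip triggers an online update of the classifier (only when it
        # is a LinearSVC): left-click marks the stored prediction as wrong,
        # right-click confirms it as right. The thumbnail is recoloured as
        # visual feedback.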
        if event == cv2.EVENT_LBUTTONDOWN:
            if x > self.parent.imWidth:
                prevFrameID = int(np.floor(y*self.numSideFrames/self.parent.imHeight))
                self.prevFrameList[prevFrameID] = cv2.cvtColor(self.prevFrameList[prevFrameID], cv2.COLOR_BGR2HSV)
                if isinstance(self.classifier, svm.LinearSVC):
                    self.perceptron_update(prevFrameID, False)
        elif event == cv2.EVENT_RBUTTONDOWN:
            if x > self.parent.imWidth:
                prevFrameID = int(np.floor(y*self.numSideFrames/self.parent.imHeight))
                self.prevFrameList[prevFrameID] = cv2.cvtColor(self.prevFrameList[prevFrameID], cv2.COLOR_BGR2YCR_CB)
                if isinstance(self.classifier, svm.LinearSVC):
                    self.perceptron_update(prevFrameID, True)

    def perceptron_update(self, prevFrameID, flag):
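        # Perceptron-style correction applied directly to the LinearSVC
        # coefficients: for a wrong prediction, subtract a scaled copy of the
        # stored histogram from that class's weights (or, if nothing was
        # predicted, wait for a digit key naming the right class and add to
        # its weights); for a confirmed prediction, add to that class's weights.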
        weights = self.classifier.coef_
        if not flag:
            wrongData = self.prevStates[prevFrameID]
            #normData = np.linalg.norm(wrongData, ord=2) * np.ones(self.numWords)
            #wrongData = np.divide(wrongData, normData)
            wrongLabel = self.prevLabels[prevFrameID]
            wrongScores = self.prevScores[prevFrameID]
            wrongScore = max(wrongScores)
            if wrongLabel > 0:
                wrongWeights = weights[wrongLabel-1]
                newWeights = np.subtract(wrongWeights, (self.learningRate/self.numReinforce)*wrongData)
                weights[wrongLabel-1] = newWeights
            else:
                k = cv2.waitKey(-1)
                rightLabel = k - 48
                if rightLabel > 0 and rightLabel <= weights.shape[0]:
                    wrongWeights = weights[rightLabel-1]
                    newWeights = np.add(wrongWeights, (self.learningRate/self.numReinforce)*wrongData)
                    weights[rightLabel-1] = newWeights
        else:
            rightData = self.prevStates[prevFrameID]
            #normData = np.linalg.norm(rightData, ord=2) * np.ones(self.numWords)
            #rightData = np.divide(rightData, normData)
            rightLabel = self.prevLabels[prevFrameID]
            rightScores = self.prevScores[prevFrameID]
            rightScore = max(rightScores)
            if rightLabel > 0:
                rightWeights = weights[rightLabel-1]
                newWeights = np.add(rightWeights, (self.learningRate/self.numReinforce)*rightData)
                weights[rightLabel-1] = newWeights
        #self.numReinforce += 1
        self.classifier.coef_ = weights
 def __init__(self, mode, data_type, log_csv_path, feature_path, debug_limit):
     FeatureExtractor.__init__(self, mode, data_type, log_csv_path, feature_path, debug_limit)
예제 #59
0
 def __init__(self, k, similarity_function):
     self.vocabulary = None
     self.k = k
     self.sim_function = similarity_function
     FeatureExtractor.__init__(self)
     self.featuresVec = None  # initial value assumed; not specified in this snippet
예제 #60
0
class Analyzer:
	bdm = None
	expLogColl = None
	#timeGran = timedelta(minutes=5)
	timeGran = timedelta(minutes=2)
	actuNames = None
	sensorNames = None
	zonelist = None
	feater = None
	clust = None
	
	def __init__(self):
		self.actuNames = ActuatorNames()
		self.sensorNames = SensorNames()
		self.bdm = BDWrapper()
		self.expLogColl = CollectionWrapper('experience_log')
		#self.zonelist = self.csv2list('metadata/partialzonelist.csv')
		self.zonelist = self.csv2list('metadata/zonelist.csv')
		self.feater = FeatureExtractor()
		self.clust = Clusterer()
	
	def csv2list(self, filename):
		outputList = list()
		with open(filename, 'r') as fp:
			reader = csv.reader(fp, delimiter=',')
			for row in reader:
				outputList.append(row[0])
		return outputList

	def get_actuator_uuid(self, zone=None, actuType=None):
		context = dict()
		if zone is not None:
			context['room']=zone
		if actuType is not None:
			context['template']=actuType
		uuids = self.bdm.get_sensor_uuids(context)
		if len(uuids)>1:
			raise QRError('Many uuids are found', context)
		elif len(uuids)==0:
			raise QRError('No uuid is found', context)
		else:
			return uuids[0]

	def normalize_data_avg(self, rawData, beginTime, endTime):
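		# Resample the raw series onto a fixed grid of width timeGran by
		# linearly interpolating between the nearest samples on either side of
		# each grid point, falling back to the single available neighbour near
		# the edges.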
		procData = pd.Series({beginTime:float(rawData[0])})
		tp = beginTime
		while tp<=endTime:
			tp = tp+self.timeGran
			leftSeries = rawData[:tp]
			if len(leftSeries)>0:
				idx = len(leftSeries)-1
				leftVal = leftSeries[idx]
				leftIdx = leftSeries.index[idx]
			else:
				leftVal = None
			rightSeries = rawData[tp:]
			if len(rightSeries)>0:
				rightVal = rightSeries[0]
				rightIdx = rightSeries.index[0]
			else:
				rightVal = None
			if rightVal is None and leftVal is not None:
				newVal = leftVal
			elif rightVal is not None and leftVal is None:
				newVal = rightVal
			elif tp==leftIdx:
				newVal = leftVal
			elif tp==rightIdx:
				newVal = rightVal
			elif rightVal is not None and leftVal is not None:
				leftDist = (tp - leftIdx).total_seconds()
				rightDist = (rightIdx - tp).total_seconds()
				newVal = (leftVal*rightDist+rightVal*leftDist)/(rightDist+leftDist)
			else:
				print "ERROR: no data found in raw data"
				newVal = None
			newData = pd.Series({tp:newVal})
			procData = procData.append(newData)
		return procData

	def normalize_data_nextval_deprecated(self, rawData, beginTime, endTime):
		procData = pd.Series({beginTime:float(rawData[0])})
		tp = beginTime
		while tp<=endTime:
			tp = tp+self.timeGran
			leftSeries = rawData[:tp]
			if len(leftSeries)>0:
				idx = len(leftSeries)-1
				leftVal = leftSeries[idx]
				leftIdx = leftSeries.index[idx]
			else:
				leftVal = None
			rightSeries = rawData[tp:]
			if len(rightSeries)>0:
				rightVal = rightSeries[0]
				rightIdx = rightSeries.index[0]
			else:
				rightVal = None

			if rightVal is not None:
				newVal = rightVal
			else:
				newVal = leftVal

			newData = pd.Series({tp:newVal})
			procData = procData.append(newData)
		return procData

	def normalize_data(self, rawData, beginTime, endTime, normType):
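		# Clip the series to [beginTime, endTime], make sure both endpoints
		# exist in the index, then resample at 2-minute granularity:
		# 'nextval' forward-fills, 'avg' averages each bin.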
		rawData = rawData[beginTime:endTime]
		if beginTime not in rawData.index:
			rawData[beginTime] = rawData.head(1)[0]
			rawData = rawData.sort_index()
		if endTime not in rawData.index:
			rawData[endTime] = rawData.tail(1)[0]
			rawData = rawData.sort_index()
		if normType=='nextval':
			procData = rawData.resample('2Min', fill_method='pad')
		elif normType=='avg':
			procData = rawData.resample('2Min', how='mean')
		else:
			procData = None

		return procData
		

	def receive_a_sensor(self, zone, actuType, beginTime, endTime, normType):
		print zone, actuType
		uuid = self.get_actuator_uuid(zone, actuType)
		rawData = self.bdm.get_sensor_ts(uuid, 'PresentValue', beginTime, endTime)
		if actuType!=self.actuNames.damperCommand:
			rawData = self.remove_negativeone(rawData)
		procData = self.normalize_data(rawData, beginTime, endTime, normType)
		return procData

	def receive_entire_sensors_notstore(self, beginTime, endTime, normType, exceptZoneList=[]):
		#TODO: Should be parallelized here
		dataDict = dict()
		for zone in self.zonelist:
			if zone not in exceptZoneList:
				dataDict[zone] = self.receive_zone_sensors(zone, beginTime, endTime, normType)
		return dataDict
	
	def receive_entire_sensors(self, beginTime, endTime, filename, normType, exceptZoneList=[]):
#		filename='data/'+beginTime.isoformat()[0:-7].replace(':','_') + '.pkl'
		dataDict = self.receive_entire_sensors_notstore(beginTime, endTime, normType, exceptZoneList=exceptZoneList)
		with open(filename, 'wb') as fp:
			pickle.dump(dataDict, fp)
#			json.dump(dataDict,fp)

	def clustering(self, inputData, dataDict):
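		# Build a per-zone feature vector from FFT, min/max, and DTW features
		# (frequency features are computed but not currently included) and
		# cluster the zones with k-means.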
		fftFeat = self.feater.get_fft_features(inputData, dataDict)
		minmaxFeat = self.feater.get_minmax_features(dataDict)
		dtwFeat = self.feater.get_dtw_features(inputData, dataDict)
		freqFeat = self.feater.get_freq_features(inputData, dataDict)
		featDict = dict()
		for zone in self.zonelist:
			featList = list()
			featList.append(fftFeat[zone])
			featList.append(minmaxFeat[zone])
			featList.append(dtwFeat[zone])
			#featList.append(freqFeat[zone])
			featDict[zone] = featList
		print featDict['RM-4132']
		return self.clust.cluster_kmeans(featDict)
	
	def remove_negativeone(self, data):
		# Replace -1 readings (treated as invalid) with the preceding sample.
		if -1 in data.values:
			indices = np.where(data.values==-1)[0]
			for idx in indices:
				data[idx] = data[idx-1]
		return data

	def receive_zone_sensors(self, zone, beginTime, endTime, normType):
		zoneDict = dict()
		for actuType in self.actuNames.nameList+self.sensorNames.nameList:
			try:
				uuid = self.get_actuator_uuid(zone, actuType)
			except QRError:
				continue
#			if actuType == self.actuNames.commonSetpoint:
#				wcad = self.receive_a_sensor(zone, 'Warm Cool Adjust', beginTime, endTime, normType)
#				data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
#				data = data + wcad
#				pass
			data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
			zoneDict[actuType] = data
		return zoneDict


	def store_zone_sensors(self, zone, beginTime, endTime, normType, filename):
		data = self.receive_zone_sensors(zone, beginTime, endTime, normType)
#		with open(filename, 'wb') as fp:
#			w = csv.DictWriter(fp, data.keys())
#			w.writeheader()
#			w.writerow(data)
		for key, val in data.iteritems():
			val.to_csv(filename, header=key, mode='a')

	def store_minmax_dict(self):
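		# Build per-zone, per-point-type min/max dictionaries, using fixed
		# ranges for command/status points and observed data ranges for the
		# rest, then pickle them under metadata/.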
		minDict = defaultdict(dict)
		maxDict = defaultdict(dict)
		beginTime = datetime(2015,2,1)
		endTime = datetime(2015,9,1)
		shortBeginTime = datetime(2015,8,1)
		shortEndTime = datetime(2015,8,2)

		for zone in self.zonelist:
			for pointType in self.actuNames.nameList+self.sensorNames.nameList:
				try:
					if pointType=='Occupied Command':
						minDict[zone][pointType] = 1
						maxDict[zone][pointType] = 3
					elif pointType=='Cooling Command' or pointType=='Heating Command':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Occupied Clg Min' or pointType=='Occupied Htg Flow' or pointType=='Cooling Max Flow':
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						minDict[zone][pointType] = min(data)
						maxDict[zone][pointType] = max(data)
					elif pointType=='Temp Occ Sts':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 1
					elif pointType=='Reheat Valve Command':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Actual Supply Flow' or pointType=='Actual Sup Flow SP':
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						maxFlow = data[0]
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = maxFlow
					elif pointType=='Damper Position':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Damper Command':
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						meanData = np.mean(data)
						stdData = np.std(data)
						minDict[zone][pointType] = meanData-2*stdData
						maxDict[zone][pointType] = meanData+2*stdData
					else:
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', beginTime, endTime)
						minDict[zone][pointType] = min(data)
						maxDict[zone][pointType] = max(data)

				except:
					print "Failed to compute min/max for", zone, pointType
		with open('metadata/mindict.pkl', 'wb') as fp:
			pickle.dump(minDict, fp)
		with open('metadata/maxdict.pkl', 'wb') as fp:
			pickle.dump(maxDict, fp)