Example #1
    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset. Stores the parameters for the optimal classifier.
        """
        
        self.grid_params = {
            'vect__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3), (1, 4)],
            'vect__use_idf': (True, False),
            'vect__smooth_idf': (True, False),
            'vect__sublinear_tf': (True, False),
            'vect__max_df': (0.5,),
        }
        self.vect = TfidfVectorizer()

        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        # Build a Pipeline with TfidfVectorizer and classifier
        pipeline_classifier = Pipeline([
            ('vect', self.vect),
            ('clf', self.classifier),
        ])
        
        # Perform grid search
        print "Performing grid search with classifier of instance", self.classifier.__class__.__name__
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1, verbose=1)

        self.grid.fit([t.text for t in self.train_tweets], self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "        
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score, file_postfix=file_postfix)
        return self.grid
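The utils.store_model helper itself is not shown in this snippet. A minimal sketch of a function matching this call site, assuming a pickle-based file layout; the name and signature come from the call above, but the body is an illustrative guess, not the repository's code:

import pickle

def store_model(classifier_name, best_parameters, best_score, file_postfix=""):
    """Persist the best grid-search parameters and score for one classifier."""
    filename = "%s%s.pkl" % (classifier_name, file_postfix)
    with open(filename, "wb") as f:
        pickle.dump({"params": best_parameters, "score": best_score}, f)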
Example #2
def main(arg):
    # Program Parameters
    dataset = arg['dataset']
    save_dir = arg['save_dir']
    load_model = arg['load']
    model_filename = arg['model_file']

    # Topology model
    conv_type = arg['conv_type']
    filter_shape = arg['filter_shape']
    kernel_size = arg['kernel_size']
    kernel_pool_size = arg['kernel_pool_size']

    # Data
    [train, valid, _, num_class,
     image_shape] = utils.load_normalize_data(dataset)

    # Save/load
    saveto = os.path.join(
        save_dir, model_filename) if model_filename is not None else None
    loadfrom = saveto if load_model else None

    # Topology
    model = Model(image_shape, filter_shape, num_class, conv_type, kernel_size,
                  kernel_pool_size)

    with tf.Session() as sess:
        saver = tf.train.Saver()
        save = utils.store_model(saver, sess, saveto)

        # Load the variables of the model if wanted
        if load_model:
            print "Loading model..."
            utils.restore_model(saver, sess, loadfrom)
        else:
            sess.run(tf.global_variables_initializer())

        training(sess, model, arg, train, valid, save)
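In this example utils.store_model does not persist anything immediately: it returns a callable (save) that the training loop can invoke to checkpoint. A minimal sketch of that pattern using the standard tf.train.Saver API; the function bodies are assumptions for illustration, not the project's code:

def store_model(saver, sess, saveto):
    """Return a zero-argument callable that checkpoints the session."""
    def save():
        if saveto is not None:
            saver.save(sess, saveto)
    return save

def restore_model(saver, sess, loadfrom):
    """Restore previously saved variables into the session."""
    saver.restore(sess, loadfrom)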
Example #3
    pgm_acled_soc,
    pgm_acled_sochist,
    pgm_acled_socnat,
    pgm_acled_wcm,
    pgm_canon_histonly,
    pgm_canon_nocm,
    pgm_canon_wcm,
    pgm_nsonly_wcm,
    pgm_osonly_wcm,
    pgm_pronly_wcm,
    pgm_sbonly_wcm,
    pgm_acled_cm,
    pgm_acled_meancm,
    pgm_acled_meancmhist,
    pgm_acled_protest,
    pgm_acled_meanprotest,
]

runtypes = ["eval", "fcast"]
periods = ["calib", "test"]
times = t.times_nested

for model_root in models_root:
    for model_root_var in model_root:
        models_root_var_times = utils.demux_times(model_root_var, runtypes,
                                                  periods, times)
        for model_root_var_times in models_root_var_times:
            utils.store_model(model_root_var_times, "./output/models/")
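Here utils.store_model takes a model object and an output directory. A minimal sketch, assuming each object exposes a name attribute used for the filename; both that attribute and the pickle format are illustrative assumptions:

import os
import pickle

def store_model(model, output_dir):
    """Pickle a model object into output_dir under its own name."""
    path = os.path.join(output_dir, getattr(model, "name", "model") + ".pkl")
    with open(path, "wb") as f:
        pickle.dump(model, f)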
Example #4
        'Training mode selection. Choices: mnist, synthetic_timeseries, cell_timeseries. (default: mnist)'
    )
    args = parser.parse_args()
    model_filepath = "model-{}.pth".format(args.train_mode)
    root_path = "results/{}".format(args.train_mode)
    os.makedirs(root_path, exist_ok=True)
    is_cuda = not args.no_cuda
    device = torch.device("cuda" if is_cuda else "cpu")
    model = VAE(dropout=args.dropout,
                input_dim=input_dims[args.train_mode]).to(device)
    try:
        model = load_model(model_filepath, model)
        logger.info("Loading model from {}".format(model_filepath))
    except Exception:
        logger.info("Creating VAE model from scratch")
        model = VAE(dropout=args.dropout,
                    input_dim=input_dims[args.train_mode]).to(device)
    if args.train_mode == 'mnist':
        train_mnist(model, device, args.epochs, root_path)
    elif args.train_mode == "synthetic_timeseries":
        model.decoder.sigmoid = False  # disable sigmoid from the final decoder layer
        train_synthetic_timeseries(model, device, args.epochs, root_path)
    elif args.train_mode == "cell_timeseries":
        model.decoder.sigmoid = False  # disable sigmoid from the final decoder layer
        train_cell_timeseries(model, device, args.epochs, root_path)
    model.to(torch.device("cpu"))
    store_model(model_filepath, model)
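This example stores a trained PyTorch model by file path. A minimal sketch of the store_model / load_model pair called above, assuming the standard state_dict save/load pattern (the project's actual serialization may differ):

import torch

def store_model(model_filepath, model):
    """Save the model's parameters to disk."""
    torch.save(model.state_dict(), model_filepath)

def load_model(model_filepath, model):
    """Load saved parameters into an existing model; raises if the file is missing."""
    model.load_state_dict(torch.load(model_filepath, map_location="cpu"))
    return model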
Example #5
    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """
        Performs training with the given model using the given feature set
        """
        # Establish document text feature vectors
        print "Vectorizing"

        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)

        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            
            with codecs.open("feature_set.txt", "w", "utf8") as f:
                for d in dict_vector:
                    f.write(str(d))
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            with codecs.open("feature_set_tdidf.txt", "w", "utf8") as f:
                for d in tfidf_dict:
                    f.write(str(d))
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
        # Cross-validation
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        # Build a Pipeline around the classifier (the features were vectorized above)
        pipeline_classifier = Pipeline([
            ('clf', self.classifier),
        ])
        
        # Perform grid search
        print "Performing grid search with classifier of instance", self.classifier.__class__.__name__
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1, verbose=1)

        self.grid.fit(combined_vector, self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score)
        return self.grid