def which_load(training_paths, validation_paths):
    training_results = ['result', 'result1', 'result2', 'result3', 'result10', 
                        'result11', 'result12', 'result13']
    validation_results = ['result4', 'result5', 'result6', 'result7', 'result14', 
                        'result15', 'result16', 'result17']
    for result, pickle_path in zip(training_results, training_paths):
        load(result, pickle_path)
    for result, pickle_path in zip(validation_results, validation_paths):
        load(result, pickle_path)
    '''load('result', 'resized_training_set/circle.pkl')
    load('result1', 'resized_training_set/triangle.pkl')
    load('result2', 'resized_training_set/rectangle.pkl')
    load('result3', 'resized_training_set/square.pkl')'''

    #merge dataset
    train_data = (circle_dataset + triangle_dataset + rectangle_dataset + 
                 square_dataset + circle_dataset1 + triangle_dataset1 + 
                 rectangle_dataset1 + square_dataset1)
    validation_data = (valid_circle_dataset + valid_triangle_dataset + 
                       valid_rectangle_dataset + valid_square_dataset + 
                       valid_circle_dataset1 + valid_triangle_dataset1 + 
                       valid_rectangle_dataset1 + valid_square_dataset1)

    pickle_file = 'data_shapes.pkl'
    try:
        with open(pickle_file, 'wb') as f:
            save = {'train_data': train_data,
                    'validation_data': validation_data,
                    }
            joblib.dump(save, f, compress=True)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise
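
A minimal follow-up sketch (assumed, not part of the original function): the dictionary dumped above can be read back with joblib.load, provided the 'data_shapes.pkl' file written above exists.

import joblib

# Load the dict that which_load() wrote; joblib detects the compression itself.
shapes = joblib.load('data_shapes.pkl')
train_data = shapes['train_data']
validation_data = shapes['validation_data']
print(len(train_data), len(validation_data))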
Example #2
File: RBR.py Project: gcetinkaya/RBR
  def dump_schema(self, f):
    """ Dumps current schema to file w joblib
    """
    if not self.feature_schema:
      raise ValueError("schema is not present")

    joblib.dump(self.feature_schema, f)
    def _split_and_dump(self, X, y, valid_X, valid_y):
        if not hasattr(self, '_dm'):
            raise ValueError("It should be called after the dumpmanager _dm is set")

        if self.resampling == 'cv':
            pass
        elif self.resampling == 'holdout':
            if not self._has_valid_data:
                data_size = y.shape[0]
                if data_size >= 100000:
                    valid_ratio = 0.3
                elif 15000 <= data_size < 100000:
                    valid_ratio = 0.2
                else:
                    valid_ratio = 0.15
                valid_size = int(data_size * valid_ratio)
                X, valid_X = X[valid_size:], X[:valid_size]
                y, valid_y = y[valid_size:], y[:valid_size]
        else:
            raise NotImplementedError()

        pkl = {"resampling": self.resampling,
               "X": X, "y": y,
               "valid_X": valid_X, "valid_y": valid_y}

        datafile = os.path.join(self._dm.dir, "data.pkl")
        joblib.dump(pkl, datafile, protocol=-1)

        self._datafile = datafile
        return datafile
Example #4
File: lstm.py Project: Spepsi/LSTM
 def save_params(self):
     """
     Save the params to a pickle file
     To save the current state of the learning
     :return:
     """
     joblib.dump([i.get_value() for i in self.params],'data/network.pkl')
Example #5
File: main.py Project: DryRun/seizures
def TrainRandomForestVariance(p_subject, p_save):
	print "Welcome to TrainRandomForestVariance(" + p_subject + ", " + str(p_save) + ")"
	training_data_raw = pd.read_pickle(input_data_paths[p_subject])
	training_data = training_data_raw[["variance" in x or "classification" in x for x in training_data_raw.index]]

	# Ictal vs interictal
	forest_seizure = RandomForestClassifier(n_estimators = 500, n_jobs = 1, max_features="sqrt", max_depth=None, min_samples_split=1)
	y_seizure = [1 * (x > 0) for x in training_data.T["classification"]]
	forest_seizure.fit(training_data[:-1].T, y_seizure)

	# IctalA vs IctalB
	forest_early = RandomForestClassifier(n_estimators = 500, n_jobs = 1, max_features="sqrt", max_depth=None, min_samples_split=1)
	y_early = [1 * (x == 2) for x in training_data.T["classification"]]
	forest_early.fit(training_data[:-1].T, y_early)

	# Save models
	if p_save:
		saved_files = joblib.dump(forest_seizure, "RFV_" + p_subject + "_seizure.pkl")
		for saved_file in saved_files:
			os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models")
		saved_files = joblib.dump(forest_early, "RFV_" + p_subject + "_early.pkl")
		for saved_file in saved_files:
			os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models")

	return {"seizure":forest_seizure, "early":forest_early}
def token_matrix(outdir, data_generator, map_func):

    # transform token data into matrix
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), lowercase=False)
    X = vectorizer.fit_transform(data_generator())

    # extract indices
    train_df = pd.read_csv("data/train_v2.csv")
    test_df = pd.read_csv("data/sampleSubmission_v2.csv")
    train_idx = train_df["file"].apply(map_func).values
    test_idx = test_df["file"].apply(map_func).values

    # prepare X_train & X_test
    X_train, X_test = X[train_idx], X[test_idx]

    # create directory if it does not exist
    if not os.path.isdir(outdir):
        try:
            os.makedirs(outdir)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

    # save matrices
    with open(os.path.join(outdir, "X_train.np"), "w") as fhandle:
        save_sparse_csr(fhandle, X_train)

    with open(os.path.join(outdir, "X_test.np"), "w") as fhandle:
        save_sparse_csr(fhandle, X_test)

    joblib.dump(vectorizer.vocabulary_, os.path.join(outdir, "vocabulary.pkl"))
Example #7
    def fit(self, dpacks, targets, nonfixed_pairs=None, cache=None):
        """
        Extract whatever models or other information from the multipack
        that is necessary to make the parser operational

        Parameters
        ----------
        mpack : MultiPack
        """
        cache_file = (cache.get('attach') if cache is not None
                      else None)
        # load cached classifier, if it exists
        if cache_file is not None and fp.exists(cache_file):
            # print('\tload {}'.format(cache_file))
            self._learner_attach = joblib.load(cache_file)
            return self

        dpacks, targets = self.dzip(for_attachment, dpacks, targets)
        self._learner_attach.fit(dpacks, targets,
                                 nonfixed_pairs=nonfixed_pairs)
        # save classifier, if necessary
        if cache_file is not None:
            # print('\tsave {}'.format(cache_file))
            joblib.dump(self._learner_attach, cache_file)
        return self
Example #8
 def save_prediction(self, model_name, predictions, type_n):
     self._check_type_n(type_n)
     if on_cloud:
         joblib.dump(predictions, model_name + "_prediction_" + type_n, compress=5)
         cloud.bucket.put(model_name + "_prediction_" + type_n, prefix="prediction")
     else:
         joblib.dump(predictions, path_join(self.prediction_dir, model_name + "_prediction_" + type_n), compress=5)
Example #9
def write_test_pickle(to_pickle, args):
    kwargs = {}
    compress = args.compress
    method = args.method
    joblib_version = get_joblib_version()
    py_version = '{0[0]}{0[1]}'.format(sys.version_info)
    numpy_version = ''.join(np.__version__.split('.')[:2])

    # The game here is to generate the right filename according to the options.
    body = '_compressed' if (compress and method == 'zlib') else ''
    if compress:
        if method == 'zlib':
            kwargs['compress'] = True
            extension = '.gz'
        else:
            kwargs['compress'] = (method, 3)
            extension = '.pkl.{0}'.format(method)
        if args.cache_size:
            kwargs['cache_size'] = 0
            body += '_cache_size'
    else:
        extension = '.pkl'

    pickle_filename = 'joblib_{0}{1}_pickle_py{2}_np{3}{4}'.format(
        joblib_version, body, py_version, numpy_version, extension)

    try:
        joblib.dump(to_pickle, pickle_filename, **kwargs)
    except Exception as e:
        # With old Python versions (<= 3.3), we can arrive here when
        # dumping a compressed pickle with LzmaFile.
        print("Error: cannot generate file '{0}' with arguments '{1}'. "
              "Error was: {2}".format(pickle_filename, kwargs, e))
    else:
        print("File '{0}' generated successfuly.".format(pickle_filename))
def BOWtransform(corpus,mode,idx):

    data_matrix=[]
    print('Transform data...')

    if mode == 'train':
        bow_transformer = BOWTransformer()
        data_matrix = bow_transformer.fit_transform(corpus)

        #save transform model
        jl.dump(bow_transformer,'{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR,idx))

    elif mode == 'test':
        bow_transformer = jl.load('{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR,idx))
        data_matrix = bow_transformer.transform(corpus)

    else:
        print("Unexpected mode in BOWtransform",file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print ("The shape of dt matrix is {}\n".format(data_matrix.shape))

    if sp.sparse.isspmatrix_csr(data_matrix):
        data_matrix = data_matrix.toarray().tolist()
    else: #pass through dimension reduction pipe
        data_matrix = data_matrix.tolist()

    return data_matrix
def dimReduction(corpus,mode,idx):
    
    print("Dimension reduction...")
    if sp.sparse.isspmatrix_csr(corpus):
        corpus = corpus.toarray()
    data_matrix = []
    if mode == 'train':
        dim_reduc_pipe = marcos.DIMREDUC_PIPE
        dim_reduc_pipe.set_params(pca__n_components=1000)

        # bow_transformer = BOWTransformer()
        data_matrix = dim_reduc_pipe.fit_transform(corpus)

        #save transform model
        jl.dump(dim_reduc_pipe,'{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR,idx))

    elif mode == 'test':
        dim_reduc_pipe = jl.load('{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR,idx))
        data_matrix = dim_reduc_pipe.transform(corpus)

    else:
        print("Unexpected mode in BOWtransform",file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print ("The shape of dt matrix is {} (after dimension reduction)\n".format(data_matrix.shape))

    return data_matrix.tolist()
Example #12
 def train(self,descri,names):
     def distance_hist(point1, point2):
         return cv2.compareHist(np.array(point1,np.float32), np.array(point2,np.float32), cv2.cv.CV_COMP_BHATTACHARYYA)
     # unique, counts = np.unique(names, return_counts=True)
     # print dict(zip(unique, counts))
     # R = zip(descri,names)
     # sorted_by_second = sorted(R, key=lambda tup: tup[1])
     # descri = np.array(sorted_by_second)[:,0]
     # descri = np.array([D for D in descri])
     # # D = distance_hist(descri[0],descri[0])
     # Y = pdist(descri,'euclidean')
     # Y = squareform(Y)
     # Y = (Y/Y.max())*255
     # # np.save("Matrix_NN_DL.npy",Y)
     # Size_block = 2
     # Matri = np.zeros((Y.shape[0]*Size_block, Y.shape[1]*Size_block), np.float32)
     # for i in xrange(Y.shape[0]):
     #     for j in xrange(Y.shape[1]):
     #         Value = Y[i][j]
     #         Matri[i * Size_block:(i + 1) * Size_block, j * Size_block:(j + 1) * Size_block] = Value
     # plt.imsave("Matriz_Distancias.jpg",Matri,cmap='hot')
     # plt.show()
     self.clf = NearestNeighbors(3)
     self.clf.fit(descri)
     self.names = names
     self.clases = np.unique(self.names)
     joblib.dump((self.clf, self.leaf_size, self.metric,self.names,self.clases), self.path, compress=3)
     return self.names
def run_gender():
    '''
    CAUTION!!
    Currently this script is set to run for age data distribution.
    '''
    c = IndexedContext()

    index_file = os.path.join(DATA,
        'libsvm_files/gender/paper/train.index')

    input_file = os.path.join(DATA,
        'annotation/gender/paper/gender_train.csv')
    output_file = os.path.join(DATA,
        'libsvm_files/gender/paper/train.libsvm')
    c.processFile(input_file, output_file)
    joblib.dump(c.getIndexer(), index_file)

    #indexer = joblib.load(index_file)
    #c.setIndexer(indexer)

    c.freeze()

    input_file = os.path.join(DATA,
        'annotation/gender/paper/gender_test.csv')
    output_file = os.path.join(DATA,
        'libsvm_files/gender/paper/test.libsvm')
    c.processFile(input_file, output_file)
  def verify_suff_stats(self, Dchunk, SS, lap):
    ''' Run-time checks to make sure the suff stats
        have expected values
    '''
    if self.savedir is not None:
      SSfile = os.path.join(self.savedir, 'SSdump-Lap%03d.dat' % (lap))
      if self.isLastBatch(lap):
        joblib.dump(SS, SSfile)
    if hasattr(Dchunk, 'nDocTotal') and Dchunk.nDocTotal < 4000:
      if self.hasMove('birth') and self.do_birth_at_lap(lap):
        if self.algParams['birth']['earlyLap'] > 0:
          pass
        elif lap < np.ceil(lap):
          assert SS.nDoc - Dchunk.nDocTotal > -0.001
        else:
          if abs(SS.nDoc - Dchunk.nDocTotal) > 0.01:
            print "WARNING @ lap %.2f | SS.nDoc=%d, nDocTotal=%d" % (lap, SS.nDoc, Dchunk.nDocTotal)
          assert abs(SS.nDoc - Dchunk.nDocTotal) < 0.01
      elif lap >= 1.0:
        assert abs(SS.nDoc - Dchunk.nDocTotal) < 0.01

    if hasattr(SS, 'N'):
      if not np.all(SS.N >= -1e-9):
        raise ValueError('N should be >= 0!')
      SS.N[SS.N < 0] = 0
def fit_with_params(params, X, firings, window_size, i):
    X = transform_data(X.as_matrix(), window_size)
    pid = os.getpid()
    print "fitting {}th iteration. PID: {}".format(i, pid)
    if params['e_n'] > params['e_w']:
        params['e_w'], params['e_n'] = params['e_n'], params['e_w']

    spk_aggr_func = params['spk_aggr_func']
    nrn_aggr_func = params['nrn_aggr_func']
    dist_metric = params['dist_metric']
    mgng_params = dict(params)
    del mgng_params['spk_aggr_func']
    del mgng_params['nrn_aggr_func']
    del mgng_params['dist_metric']
    try:
        estimator = mgng.MGNG(**mgng_params)
        estimator.fit(X)
        winner_units = estimator.transform(X)
        score = mgng.scorer(winner_units, window_size,
                            firings[firings.fire_idx <
                                    (len(winner_units) - window_size)],
                            spk_aggr_func, nrn_aggr_func, dist_metric)
        ret_val = score + (params, pid)
        pprint.pprint(ret_val)
        dump(winner_units, 'winner_units_{}.pickle'.format(pid), compress=3)
    except Exception as e:
        pprint.pprint(e)
        ret_val = (-np.infty, -np.infty, np.infty, params, pid)

    print "{}th iteration finished. PID: {}".format(i, pid)
    with open('hyperparam_opt_{}.log'.format(pid), 'ab') as fp:
        fp.write('{}\n'.format(pprint.pformat(ret_val)))
    return ret_val
Example #16
def train(corpus_file, out_file, mode, dim_size, window, min_count,
          negative, epoch, pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')

        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)
    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
Example #17
File: agent.py Project: vyraun/deer
    def dumpNetwork(self, fname, nEpoch=-1):
        """ Dump the network
        
        Parameters
        -----------
        fname : string
            Name of the file where the network will be dumped
        nEpoch : int
            Epoch number (Optional)
        """
        try:
            os.mkdir("nnets")
        except Exception:
            pass
        basename = "nnets/" + fname

        for f in os.listdir("nnets/"):
            if fname in f:
                os.remove("nnets/" + f)

        all_params = self._network.getAllParams()

        if (nEpoch>=0):
            joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
        else:
            joblib.dump(all_params, basename, compress=True)
    def transform(self, X, stride_size=1, save_to_file=None, memmap=False, force_rerun=False):
        """
        Expects X to be in the shape of (n, x, y, chan)
        """
        if not hasattr(self, 'centroids_'):
            raise RuntimeError("Model has not been fitted")

        if save_to_file is not None and os.path.exists(save_to_file) and not force_rerun:
            logger.info("File already exists, loading from {}".format(save_to_file))
            if memmap:
                res = joblib.load(save_to_file, mmap_mode='r+')
            else:
                res = joblib.load(save_to_file)
        else:
            all_rows = range(X.shape[0])
            chunked_rows = list(chunks(all_rows, self.n_jobs))
            logger.info("Transforming in {} jobs, chunk sizes: {}".format(self.n_jobs, [len(x) for x in chunked_rows]))
            res = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(chunked_extract_features)(i, X, self.rf_size, self.centroids_, self.mean_, self.p_, True, stride_size, self.pool_method) for i in chunked_rows
            )
            res = np.vstack(res)
            if save_to_file is not None:
                logger.info("Saving results to file {}".format(save_to_file))
                joblib.dump(res, save_to_file)
                if memmap:
                    res = joblib.load(save_to_file, mmap_mode='r+')

        return res
Example #19
    def save_model(self, model_dir):
        """
        Save the model to `model_dir`

        Parameters
        ----------
        model_dir: str, location where model is saved
        """
        if os.path.isdir(model_dir):
            raise Exception('Folder already exists')
        else:
            os.mkdir(model_dir)

        # We clone the instance but do not clone the leaves since we will save them separately
        new_hkmnn_model = HKMNearestNeighbor(self.branching_factor,
                                             self.max_depth,
                                             self.leaf_size,
                                             self.batch_size,
                                             self.verbose)
        new_hkmnn_model.root = self._recursive_save(self.root,
                                                    0,
                                                    [0] * self.max_depth,
                                                    model_dir)
        # save skeleton
        file_name = os.path.join(model_dir, 'skeleton.pickle')
        joblib.dump(new_hkmnn_model, file_name, protocol=2)
Example #20
def get_corpora(lang, num_train=500000, num_test=10000, distributed=False):
        full_corpus = corpora.get_corpus(lang, word_boundaries=True)

        # A list of (phoneme, precedes_boundary) tuples.
        phones_and_boundaries = extract_boundaries(full_corpus)

        # Divide into train and test.
        train, test = corpora.train_test_split(phones_and_boundaries, 
                                               num_train, num_test, mode='end')

        # Separate phones from boundary markers.
        train_phones, _ = map(list, zip(*train))
        test_phones, test_bounds = map(list, zip(*test))
        joblib.dump(test_bounds, lang + '_bounds.pkl')
        return

        # Construct targets and encode phonemes.
        train_in, train_out = prepare(train_phones, distributed)
        test_in, test_out = prepare(test_phones, distributed)
        
        # Remove the trailing bound to match test_out.
        del test_bounds[-1]
        assert len(train_in) == len(train_out)
        assert len(test_in) == len(test_out) == len(test_bounds)

        return (train_in, train_out), (test_in, test_out), test_bounds
    def save_database(self, file_path):
        """Saves the current data to disk

        Keyword Arguments:
        file_path (str) -- Path where you wish to save the file, use an extension like .db"""

        dump(self, file_path, True)
 def save_state(self, state_dict, itr=None):
     """
     Saves the state of an experiment.
     To be clear: this is about saving *state*, not logging diagnostics.
     All diagnostic logging is separate from this function. This function
     will save whatever is in ``state_dict``---usually just a copy of the
     environment---and the most recent parameters for the model you
     previously set up saving for with ``setup_tf_saver``.
     Call with any frequency you prefer. If you only want to maintain a
     single state and overwrite it at each call with the most recent
     version, leave ``itr=None``. If you want to keep all of the states you
     save, provide unique (increasing) values for 'itr'.
     Args:
         state_dict (dict): Dictionary containing essential elements to
             describe the current state of training.
         itr: An int, or None. Current iteration of training.
     """
     if proc_id()==0:
         fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr
         try:
             joblib.dump(state_dict, osp.join(self.output_dir, fname))
         except:
             self.log('Warning: could not pickle state_dict.', color='red')
         if hasattr(self, 'tf_saver_elements'):
             self._tf_simple_save(itr)
Example #23
File: cb.py Project: PSUPing/PyExaminer
def get_mult_runs_data(design_doc, view_names, x_npy_file, y_npy_file):
    cb = Couchbase.connect(bucket=bucket_name,  host=host_name)

    x = [[0, 0, 0, 0, 0, 0, 0, 0]]

    for view_name in view_names:
        rows = cb.query(design_doc, view_name)

        count = 0
        for row in rows:
            x.append([row.value[1]['thread_alloc_count'],
                      row.value[1]['proc_count'],
                      row.value[1]['thread_alloc_size'],

                      row.value[2]['mem_free'],
                      row.value[2]['native_allocated_heap'],
                      row.value[2]['native_free_heap'],
                      row.value[2]['mem_total'],
                      row.value[2]['native_heap'],

                      row.value[3]['global_class_init'],
                      row.value[3]['classes_loaded'],
                      row.value[3]['total_methods_invoc'],

                      row.value[4]['total_tx'],
                      row.value[4]['total_rx']])

            count = count + 1

            print view_name + ' count: ' + `count`

    x.remove([0, 0, 0, 0, 0, 0, 0, 0])

    joblib.dump(x, x_npy_file)
def process_single_traj(fn, topology, stride, outdir, featurizers):
    traj = None

    def load():
        with timing('loading %s' % fn):
            t = md.load(fn, stride=stride, top=topology)
        print('Number of frames: %d' % t.n_frames)
        return t

    for f in featurizers:
        featurizer = f['featurizer']
        outfile = construct_outfile(fn, f['suffix'], outdir)
        if os.path.exists(outfile):
            print('Skipping %s. File exists' % outfile, file=sys.stderr)
            continue

        if traj is None:
            traj = load()

        with timing('featurizing (%s)' % featurizer.__class__.__name__):
            X = featurizer.partial_transform(traj)
        with timing('dumping to %s' % outfile):
            dump(X, outfile, compress=0)

    if traj is None:
        print(' == Completely skipped: %s ==' % fn, file=sys.stderr)
Example #25
File: batch.py Project: jcmgray/xyzpy
    def save_info(self, combos):
        """Save information about the sowed cases.
        """
        # If saving Harvester or Runner, strip out function information so
        #   as just to use pickle.
        if self.harvester is not None:
            harvester_copy = copy.deepcopy(self.harvester)
            harvester_copy.runner.fn = None
            hrvstr_pkl = pickle.dumps(harvester_copy)
            runner_pkl = None
        elif self.runner is not None:
            hrvstr_pkl = None
            runner_copy = copy.deepcopy(self.runner)
            runner_copy.fn = None
            runner_pkl = pickle.dumps(runner_copy)
        else:
            hrvstr_pkl = None
            runner_pkl = None

        joblib.dump({
            'combos': combos,
            'batchsize': self.batchsize,
            'num_batches': self.num_batches,
            '_batch_remainder': self._batch_remainder,
            'harvester': hrvstr_pkl,
            'runner': runner_pkl,
        }, os.path.join(self.location, INFO_NM))
def create_stacked_features(input_mel_file_name, output_examples_file_name):
    # Load audio.
    mel = joblib.load(input_mel_file_name)
    examples, labels, parameters = mel_to_example(mel, 'dummy_label')
    # Using compress=1 to make sure it is stored as one file.
    joblib.dump((examples, parameters), output_examples_file_name,
                compress=1)
Example #27
    def __init__(self,*args,**kwargs):
        super(score_locality_hash, self).__init__(*args,**kwargs)

        self.f_params = os.path.join(
            kwargs["output_data_directory"],
            "locality_hash_params.pkl")

        params = self.load_params(**kwargs)

        # Build the hash function lookup
        dim = self.M.syn0.shape[1]
        n_bits = int(kwargs['locality_n_bits'])
        alpha = float(kwargs['locality_alpha'])

        R = RBP_hasher(dim,n_bits,alpha)

        # We assume that all locality hashes will be the same, save these params to disk
        
        for key in ['dim', 'projection_count']:
            if key not in params: continue
            print "Checking if locality_hash({}) {}=={}".format(key, R.params[key], params[key])
            if R.params[key] != params[key]:
                msg = "\nLocality-hash config value of {} does not match from {} to {}.\nDelete {} to continue."
                raise ValueError(msg.format(key, R.params[key], params[key], self.f_params))

        if 'normals' in params:
            print "Loading locality hash from {}".format(self.f_params)
            R.load(params)
        else:
            joblib.dump(R.params, self.f_params)

        self.RBP_hash = R        
        self.WORD_HASH = {}
        for w,v in zip(self.M.index2word, self.M.syn0):
            self.WORD_HASH[w] = self.RBP_hash(v)
Example #28
File: ML.py Project: RSPB/StormPetrels
def train(path):
    name = os.path.splitext(os.path.basename(path))[0]
    print('Processing: ', name)
    features = pd.read_csv(path, index_col=None)
    selected_features_names = [name for name, desc in selected_features]
    features = features[selected_features_names]
    split_idx = 1200
    features = features.drop(['sound.files'], axis=1)
    noise_only_df, df = features.iloc[:split_idx], features.iloc[split_idx:]
    y = df.pop('petrel')
    X = df.values
    y_noise = noise_only_df.pop('petrel')
    X_noise = noise_only_df.values
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    hyperparams = {
        'n_estimators': [100, 300, 500, 1000],
        'learning_rate': [0.1],
        'gamma': [0.0, 0.5],
        'max_depth': [2, 3, 4],
        'min_child_weight': [1, 2],
        'subsample': [1.0, 0.8],
        'reg_alpha': [0.0, 0.1],
        'reg_lambda': [1, 2, 3]
    }
    #
    # hyperparams = {
    #     'n_estimators': [100],
    #     'learning_rate': [0.1],
    #     'gamma': [0.0],
    #     'max_depth': [2],
    #     'min_child_weight': [1],
    #     'subsample': [1.0],
    #     'reg_alpha': [0.0],
    #     'reg_lambda': [1]
    # }

    clf = model_selection.GridSearchCV(estimator=xg.XGBClassifier(objective='binary:logistic', n_jobs=-1),
                                       param_grid=hyperparams,
                                       cv=4)
    fit_params = clf.fit(X_train, y_train)
    estimator = fit_params.best_estimator_
    joblib.dump(estimator, name + '_model.pkl')

    test_pred = estimator.predict(X_test)
    metrics = calculate_metrics(test_pred, y_test)

    noise_pred = estimator.predict(X_noise)
    noise_detection_accuracy = accuracy_score(y_noise, noise_pred)

    experiment = Experiment(api_key="4PdGdUZmGf6P8QsMa5F2zB4Ui",
                            project_name="storm petrels",
                            workspace="tracewsl")
    experiment.set_name(name)
    experiment.log_parameter('name', name)
    experiment.log_multiple_params(fit_params.best_params_)
    experiment.log_multiple_metrics(metrics)
    experiment.log_metric('Noise detection accuracy', noise_detection_accuracy)
    experiment.log_figure('Confusion matrix', get_confusion_matrix_figure(test_pred, y_test))
    experiment.log_figure('Feature importance', get_feature_importance_figure(estimator, list(df.columns.values)))
Example #29
def motionEstTSS(curI, nextI, blockSize, stepSize, shiftSize):
	""" Computes motion vectors using 3-step search method
		Input:
			curI: The image for which we want to find motion vectors
			nextI: The reference image
			blockSize:
		 	stepSize:
			shiftSize:
		Output:
		    velX, velY : the motion vectors for each direction
	"""
	# check if two images have the same size
	if nextI.shape != curI.shape:
		print "Two images do not have the same size"
		return [], []
	
	# filepath for temp generated file used by parallel computation
	folder = tempfile.mkdtemp()
	curI_path = os.path.join(folder, 'curI')
	nextI_path = os.path.join(folder, 'nextI')
	velX_path = os.path.join(folder, 'velX')
	velY_path = os.path.join(folder, 'velY')

	# get pre-defined size
	height, width = curI.shape
	
	block_r = blockSize / 2
	velSize = ((height + 1 - 2 * block_r) / shiftSize, (width + 1 - 2 * block_r) / shiftSize)
	
	# get the number of system cores
	num_cores = multiprocessing.cpu_count()

	"""Pre-allocate a writeable shared memory map as a container for the results
	motion vectors of the parallel computation
	"""
	velX = np.memmap(velX_path, dtype=np.int32, shape=velSize, mode='w+')
	velY = np.memmap(velY_path, dtype=np.int32, shape=velSize, mode='w+')

	# Dump the input images to disk to free the memory
	dump(curI, curI_path)
	dump(nextI, nextI_path)

	"""Release the reference on the original in memory array and replace it
	by a reference to the memmap array so that the garbage collector can
	release the memory before forking. gc.collect() is internally called
	in Parallel just before forking.
	"""
	curI = load(curI_path, mmap_mode='r')
	nextI = load(nextI_path, mmap_mode='r')

	# Fork the worker processes to perform motion vector computation concurrently
	Parallel(n_jobs=num_cores)(delayed(estTSS)(curI, nextI, velX, velY, i, j, block_r, stepSize, shiftSize, height, width) for i in range(velSize[0]) for j in range(velSize[1]))

	# try:
	# 	shutil.rmtree(folder)
	# except:
	# 	print("Failed to delete: " + folder)

	return velX, velY
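
A standalone sketch of the dump/mmap pattern used above (an assumed example, not from the project): an array is dumped once to disk and re-opened as a read-only memory map, so parallel workers share the data instead of copying it.

import os
import tempfile

import numpy as np
from joblib import dump, load

folder = tempfile.mkdtemp()
arr_path = os.path.join(folder, 'arr')
dump(np.arange(12).reshape(3, 4), arr_path)   # write the array to disk
arr = load(arr_path, mmap_mode='r')           # reopen it memory-mapped, read-only
print(arr.sum())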
Example #30
	def save_inverted_index(self, inverted_index, sub_folder, base_path=None):
		if (not os.path.exists(os.path.join(self.cache_path_, sub_folder))):
			os.makedirs(os.path.join(self.cache_path_, sub_folder))

		if (base_path is None):
			base_path = self.cache_path_

		joblib.dump(inverted_index, os.path.join(base_path, sub_folder, 'inverted_index.joblib'), compress=3)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Scaling

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Applying Linear Regression Model

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

## Saving the model

from joblib import dump, load
dump(model, 'Boston.joblib')

## Using the model

from joblib import dump, load
import numpy as np
model = load('Boston.joblib')
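
A hedged completion of the "Using the model" step (assumed, not in the original script): raw inputs must pass through the same StandardScaler `sc` fitted on X_train before prediction; `X[:1]` below is just a sample row from the dataset.

scaled_row = sc.transform(X[:1])   # reuse the scaler fitted during training
print(model.predict(scaled_row))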
mod = mt.generate_symbolic_model(T, V, ttheta, [0, 0, tau1])

# State-space model, partially linearized
mod.calc_coll_part_lin_state_eq(simplify=True)
x_dot = mod.ff + mod.gg * qddot1

# Adapt the state definition and save the state-space model
replacements = {
    'Matrix': 'sp.Matrix',
    'sin': 'sp.sin',
    'cos': 'sp.cos',
    'q1': 'x1',
    'qdot1': 'x2',
    'qddot1': 'u1',
    'p1': 'x3',
    'pdot1': 'x4',
    'p2': 'x5',
    'pdot2': 'x6'
}


def str_replace_all(string, replacements):
    for (key, val) in replacements.items():
        string = string.replace(key, val)
    return string


x_dot = sp.Matrix([x_dot[2], x_dot[5], x_dot[0], x_dot[3], x_dot[1], x_dot[4]])
x_dot_str = str_replace_all(str(x_dot), replacements)
dump({'x_dot_str': x_dot_str}, 'examples/double_pend_cart_pl.str')
# X_train = train_images.reshape(train_images.shape[0], train_images.shape[1]*train_images.shape[2])/255
# Here, since we already know the shape of the samples, the values can be written in directly

X_train = train_images.reshape(60000, 28*28)/255
y_train = train_labels
X_test = test_images.reshape(10000, 28*28)/255
y_test = test_labels

# To speed up training, we only take 10% of the samples for this demo
X_train_lite = X_train[0:5999, :]
y_train_lite = y_train[0:5999]
X_test_lite = X_test[0:999, :]
y_test_lite = y_test[0:999]


# TODO: 3. Train the MLP neural network and output the prediction results
start = time.time()

print('Training the model, please wait...', end='')
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[
                    100, 100], activation='relu', alpha=1e-5, random_state=62)
mlp.fit(X_train_lite, y_train_lite)

# Save the MLP neural network model
ModelPath = os.path.join(os.getcwd(), 'Models', 'Ch08MNIST_lbfgs.pkl')
joblib.dump(mlp, ModelPath)

print('Training finished in {:.2f}s.'.format(time.time() - start))
print('Training set score: {:.4f}, test set score: {:.4f}'.format(
    mlp.score(X_train, y_train), mlp.score(X_test, y_test)))
os.chdir("C:/Users/Rahul/Desktop/edwisor")
import importing

x = importing.trn_term_doc
label_cols = importing.label_cols
train_data=importing.train_data
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

dict1={}
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train_data[j])
    dict1.update({j:[m,r]})
joblib.dump(dict1,'diction.pkl')







Example #35
# hyperparameters
n_quantiles = [10]
output_distribution = ['normal']
penalty = ['l1']
C = np.logspace(-4, 4, 20)

# parameter grid
param_grid = {
    'qt__n_quantiles': n_quantiles,
    'qt__output_distribution': output_distribution,
    'clf__penalty': penalty,
    'clf__solver': ['saga'],
    'clf__C': C,
    'clf__max_iter': [1000]
}

clf_grid = GridSearchCV(pipeline,
                        param_grid=param_grid,
                        cv=gkf,
                        scoring=scoring,
                        refit=False,
                        verbose=2,
                        n_jobs=-1)

search = clf_grid.fit(X, y)

dump(
    search,
    'models/logreg_gridsearch_pipeline_ALL_ach-at-hex_' + args.window_size +
    '_' + args.n_significant + '_' + args.n_classes + '_.joblib')
Example #36
 def save(save_path):
     ps = sess.run(params)
     joblib.dump(ps, save_path)
y_test = data_test["v"].copy().values
X_val = test.drop("v",axis=1).values
y_val = test["v"].copy().values

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
scaler_x.fit(X_train)
X_train = scaler_x.transform(X_train)
X_test = scaler_x.transform(X_test)

scaler_y.fit(y_train.reshape(-1,1))
y_train = scaler_y.transform(y_train.reshape(-1,1))
y_test = scaler_y.transform(y_test.reshape(-1,1))



from ANFIS import EVOLUTIONARY_ANFIS

E_Anfis = EVOLUTIONARY_ANFIS(functions=3,generations=500,offsprings=10,
                             mutationRate=0.2,learningRate=0.2,chance=0.7,ruleComb="simple")

bestParam, bestModel = E_Anfis.fit(X_train,y_train,optimize_test_data=False)

bestParam, bestModel = E_Anfis.fit(X_train,y_train,X_test,y_test,optimize_test_data=True)

import joblib

joblib.dump(bestParam,'bestParam.joblib')
joblib.dump(bestModel,'bestModel.joblib')

Example #38
    train_x, train_y = pkl.load(f)

param_grid = {
    'max_depth': [6, 10, 15, 20],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100]
}

gsearch1 = RandomizedSearchCV(estimator=xgboost.XGBRegressor(),
                              param_distributions=param_grid,
                              verbose=3,
                              scoring='neg_mean_squared_error',
                              cv=3,
                              n_iter=100,
                              random_state=42,
                              n_jobs=-1)
gsearch1.fit(train_x, train_y)
print('best params')
print(gsearch1.best_params_)
print('best score')
print(gsearch1.best_score_)

# save grid search
dump(gsearch1.best_estimator_, 'xgboost.model')
Example #39
    #                     Predicted
    #                 Negative  Positive
    #Actual Negative     TN        FP
    #       Positive     FN        TP
    #print(confusion_matrix(predict_label, result_predict, labels=['useless', 'useful']))

    #print(classification_report(predict_label, result_predict))

    #estimator = clf.estimators_[3]
    #dot_data = tree.export_graphviz(
    #                    estimator,
    #                    class_names=["useful", "useless"],
    #                    feature_names=["num_commits_open","lines_modified_open","files_modified_open","commits_on_files_touched","branch_hotness"],
    #                    filled=True,
    #                    rounded=True,
    #                    out_file=None
    #                )
    #graph = pdp.graph_from_dot_data(dot_data)
    #graph.write_png("bootstrap_tree.png")

# Average number of times each data point was predicted correctly
ave_list = [n / loop_count for n in useful_match]

predict_data['hyouka_1'] = np.array(ave_list)
predict_data['label'] = df_predict['useful']

print(predict_data)

joblib.dump(predict_data, f'scripts/result/{project}.pkl')
Example #40
max(hmm['val_grapheme_root_categorical_accuracy'])
max(hmm['val_vowel_diacritic_categorical_accuracy'])
max(hmm['val_consonant_diacritic_categorical_accuracy'])




train = pd.concat([
    pd.read_parquet('data/train_image_data_0.parquet'),
    pd.read_parquet('data/train_image_data_1.parquet'),
    pd.read_parquet('data/train_image_data_2.parquet'),
    pd.read_parquet('data/train_image_data_3.parquet')
]).set_index('image_id', drop=True)


train.head()
import cv2
import joblib
original_images = {}
for i, row in train.iterrows():
    if i in original_images.keys():
        continue
    image = 255 - row.values
    image = image.reshape(137, 236)
    image = image.astype(np.uint8)
    original_images[i] = image
joblib.dump(original_images, 'data/original_images')

print('hej')
Example #41
def Model():
    finalDataSet = pd.read_csv("finalDataSet.csv")
    finalDataSet.set_index("time", inplace=True)
    # print(df.tail())

    ...
    foreCastColumn = "close"  # creating label

    foreCastOut = int(12)  # prediction for next 12 hrs

    finalDataSet["label"] = finalDataSet[foreCastColumn].shift(-foreCastOut)

    ...
    X = np.array(finalDataSet.drop(["label"], axis=1))
    y = np.array(finalDataSet["label"])

    # normalize data
    X = preprocessing.scale(X)

    XforeCastOut = X[-foreCastOut:]

    X = X[:-foreCastOut]
    y = y[:-foreCastOut]

    ...
    # Split the data into train and test data set
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    ...
    # regression model
    Model = LassoLars(alpha=0.01).fit(X_train, y_train)

    # EN = ElasticNet(alpha = 0.0001, l1_ratio = 0.5, random_state = 0).fit(X_train, y_train)

    ...
    # cross validated accucary on train set
    scores = cross_val_score(Model, X_train, y_train, cv=tscv)

    print("Training Accuracy: %0.2f (+/- %0.2f)" %
          (scores.mean(), scores.std() * 2))
    print("Intercept:", Model.intercept_)
    print("Slope:", Model.coef_[0])

    ...
    # prediction on training
    trainPredict = Model.predict(X_train)
    r_squared = r2_score(y_train, trainPredict)
    mae = np.mean(abs(trainPredict - y_train))
    rmse = np.sqrt(np.mean((trainPredict - y_train)**2))
    rae = np.mean(abs(trainPredict - y_train)) / np.mean(
        abs(y_train - np.mean(y_train)))
    rse = np.mean((trainPredict - y_train)**2) / np.mean(
        (y_train - np.mean(y_train))**2)
    sumOfDf = DataFrame(index=[
        "R-squared",
        "Mean Absolute Error",
        "Root Mean Squared Error",
        "Relative Absolute Error",
        "Relative Squared Error",
    ])
    sumOfDf["Training metrics"] = [r_squared, mae, rmse, rae, rse]

    # prediction of test
    testPredict = Model.predict(X_test)
    r_squared = r2_score(y_test, testPredict)
    mae = np.mean(abs(testPredict - y_test))
    rmse = np.sqrt(np.mean((testPredict - y_test)**2))
    rae = np.mean(abs(testPredict - y_test)) / np.mean(
        abs(y_test - np.mean(y_test)))
    rse = np.mean((testPredict - y_test)**2) / np.mean(
        (y_test - np.mean(y_test))**2)

    sumOfDf["Validation metrics"] = [r_squared, mae, rmse, rae, rse]
    sumOfDf = sumOfDf.round(decimals=3)

    print(sumOfDf)  # accuracy check

    ...
    # Save model to file in the current working directory
    fileName = "LLModel.pkl"
    joblib.dump(Model, fileName)

    # Load from file
    LLModel = joblib.load(fileName)

    # forecast future 12 hrs values
    foreCastFutureValues = DataFrame(LLModel.predict(XforeCastOut))

    ...
    # assigning names to columns
    foreCastFutureValues.rename(columns={0: "Forecast"}, inplace=True)

    newDataframe = finalDataSet.tail(foreCastOut)

    newDataframe.reset_index(inplace=True)

    newDataframe = newDataframe.append(
        DataFrame({
            "time":
            pd.date_range(
                start=newDataframe.time.iloc[-1],
                periods=(len(newDataframe) + 1),
                freq="H",
                closed="right",
            )
        }))

    newDataframe.set_index("time", inplace=True)

    newDataframe = newDataframe.tail(foreCastOut)

    foreCastFutureValues.index = newDataframe.index

    foreCastFutureValues.reset_index(inplace=True)

    return foreCastFutureValues
Example #42
nb = SVC()

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(vect, nb)

# print(pipe.steps)

pipe.fit(X.cutted_text, y)

#from sklearn.model_selection import cross_val_score

#print(cross_val_score(pipe, X.cutted_text, y, cv=20, scoring='accuracy').mean())

y_pred = pipe.predict(X_test.cutted_text)

import joblib

joblib.dump(pipe, "./model.joblib")

import pickle
pickle.dump(pipe, open("./model.pickle", 'wb'))

from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred))
from sklearn.metrics import f1_score

print(f1_score(y_test, y_pred))
Example #43
def save_serialized(clf, filename_with_path):
    """ save model to a file """
    storage_dir = os.path.dirname(filename_with_path)
    if storage_dir != "":
        os.makedirs(storage_dir, exist_ok=True)
    joblib.dump(clf, filename_with_path)
Example #44
    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        # Fix the random seed
        ModelNN().set_tf_random_seed()

        # Create the output directory
        os.makedirs(self.params["out_dir"], exist_ok=True)

        # Set up and scale the data
        validation = va_x is not None
        scaler = self.params["scaler"]  # StandardScaler()
        scaler.fit(tr_x)
        tr_x = scaler.transform(tr_x)
        # One-hot encode the labels
        tr_y = to_categorical(tr_y, num_classes=self.params["nb_classes"])

        # Build the model
        self.build_model((tr_x.shape[1],))

        hist = None
        if validation:
            va_x = scaler.transform(va_x)
            va_y = to_categorical(va_y, num_classes=self.params["nb_classes"])

            cb = []
            cb.append(
                ModelCheckpoint(
                    filepath=os.path.join(
                        self.params["out_dir"], f"best_val_loss_{self.run_fold_name}.h5"
                    ),
                    monitor="val_loss",
                    save_best_only=True,
                    # verbose=1,
                    verbose=0,
                )
            )
            # cb.append(ModelCheckpoint(filepath=os.path.join(self.params["out_dir"], f"best_val_acc_{self.run_fold_name}.h5"),
            #        monitor="val_acc",
            #        save_best_only=True,
            #        verbose=1,
            #        mode="max",
            #    )
            # )
            cb.append(
                EarlyStopping(
                    monitor="val_loss", patience=self.params["patience"], verbose=1
                )
            )
            hist = self.model.fit(
                tr_x,
                tr_y,
                epochs=self.params["nb_epoch"],
                batch_size=self.params["batch_size"],
                # verbose=2,
                verbose=0,
                validation_data=(va_x, va_y),
                callbacks=cb,
            )
        else:
            hist = self.model.fit(
                tr_x,
                tr_y,
                epochs=self.params["nb_epoch"],
                batch_size=self.params["batch_size"],
                # verbose=2,
                verbose=0,
            )

        # Save the scaler
        self.scaler = scaler
        joblib.dump(
            self.scaler,
            os.path.join(self.params["out_dir"], f"{self.run_fold_name}-scaler.pkl"),
        )

        # history plot
        self.plot_hist_acc_loss(hist)

        return hist
Example #45
 def save(self, checkpoint_path: pathlib.Path) -> str:
     file_name = f"{self.name}.pth"
     dump(self.model, str(checkpoint_path / file_name))
     return file_name
Example #46
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

answers_df = pd.read_excel('answers_base.xlsx')
queries_df = pd.read_excel('queries_base.xlsx')
queries_df = queries_df[['Текст вопроса', 'Номер связки\n']].dropna()

queries_train, queries_test = train_test_split(queries_df,
                                               test_size=0.3,
                                               random_state=0)
documents = answers_df['Текст вопросов'].append(queries_train['Текст вопроса'],
                                                ignore_index=True)

documents_prep = documents.apply(
    lambda x: ' '.join(preprocessing.preprocessing(x)))
documents_ner = documents.apply(lambda x: ' '.join(
    preprocessing.preprocessing(preprocessing.preprocess_ner(x))))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents_prep)

vectorizer_ner = TfidfVectorizer()
X_ner = vectorizer_ner.fit_transform(documents_ner)

dump(X, 'text_representations/tfidf.pkl')
dump(vectorizer, 'text_representations/vectorizer.pkl')

dump(X_ner, 'text_representations/tfidf_ner.pkl')
dump(vectorizer_ner, 'text_representations/vectorizer_ner.pkl')
Example #47
    model.fit(x, y)
    print("Fin del entrenamiento")


if __name__ == "__main__":
    print("Iniciando")
    solver = sys.argv[1]
    # client = Client(processes=False, threads_per_worker=4,
    #             n_workers=2, memory_limit='3GB')
    # print(client)
    mlp = neural_network.MLPRegressor(
        hidden_layer_sizes=(16,), 
        solver=solver, 
        verbose=10,
        activation='relu',
        batch_size=32,
        learning_rate_init=0.01,  # works better
        tol=1e-3,
        early_stopping=False,
        epsilon=1e-4,
        n_iter_no_change=3)
    # mlp = load('mlp.joblib')
    initial_time = time.time()
    train(mlp, 'train_data.csv')
    end_time = time.time()
    print("Time training:", end_time - initial_time)
    print("Saving model")
    dt_now= datetime.datetime.now().isoformat()
    dump(mlp, f'mlp.joblib')
    print("Finished")
Example #48
y=placement_coded.status

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8,random_state=1)

# predict if a student is placed or not.
from sklearn.linear_model import LogisticRegression
#from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
#y_pred = logreg.predict(X_test)
#print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))    




# Save your model
import joblib
joblib.dump(logreg, 'model.pkl')
print("Model dumped!")

# Load the model that you just saved
lr = joblib.load('model.pkl')

# Saving the data columns from training
model_columns = list(X_train.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Models columns dumped!")


Example #49
def pre_training_data(is_scaler=True, is_categorical=False, bin_method='bins', is_1hot_categ=True):
    u"""
    return train, label, vali_train, vali_label,X_test,y_test
    type: ndarray
    """
    # only test  image3
    X_train, X_val, y_train, y_val = None, None, None, None
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images2')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images3')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images4')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images6')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images7_high_range')
    # X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images8_high_range')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images9_200_300')
    X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images5')

    # X_test, X_val, y_test_origin, y_val_origin = train_test_split(X_val, y_val_origin, test_size=0.5, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.6, shuffle=True)
    print("train shape", str(X_train.shape))
    print("test shape",str(X_test.shape))
    print("val shape",str(X_val.shape))


    if is_scaler:

        # scaler = preprocessing.MaxAbsScaler() MaxAbsScaler
        scaler = preprocessing.MaxAbsScaler()#StandardScaler
        y_train = scaler.fit_transform(y_train.reshape(-1, 1))
        y_train = y_train.flatten()

        joblib.dump(scaler, 'MaxAbsScaler.pkl')

        # scaler_val = preprocessing.MaxAbsScaler()
        y_val = scaler.transform(y_val.reshape(-1, 1))
        y_val = y_val.flatten()


        y_test = scaler.transform(y_test.reshape(-1, 1))
        y_test = y_test.flatten()

        # joblib.dump(scaler_val, 'MaxAbsScaler_vali.pkl')

    if is_categorical:
        y_test = np.array([processing_y(i, default_method=bin_method) for i in y_test])
        y_train = np.array([processing_y(i, default_method=bin_method) for i in y_train])
        y_val = np.array([processing_y(i, default_method=bin_method) for i in y_val])
        is_categorical = False
        print(y_val)
        print(y_val.shape)
        if len(np.unique(y_train)) >= 2 and is_1hot_categ == True:
            is_categorical = True
            y_train = keras.utils.to_categorical(y_train)
            y_val = keras.utils.to_categorical(y_val)
            y_test = keras.utils.to_categorical(y_test)

    # print(y_test)###
    np.save('X_train.npy', X_train)
    np.save('X_val.npy', X_val)
    np.save('y_train.npy', y_train)
    np.save('y_val.npy', y_val)
    np.save('X_test.npy', X_test)
    np.save('y_test.npy', y_test)

    # print(X_train.shape)###
    # print(X_val.shape)###
    # print(X_test.shape)###
    # print(y_test)###
    # (train,label),(vali_train,vali_label) = CNN_Regression.load_data()
    train = X_train
    label = y_train
    vali_train = X_val
    vali_label = y_val

    return train, label, vali_train, vali_label,X_test,y_test
Example #50


    output, h = model(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

print(' S A V I N G  M O D E L')

joblib.dump(model, 'model.pkl')

print('M O D E L  S A V E D..............')
def build_model(dbname):
    mongodb = dbname
    ranktable = mongodb + 'rank'
    client = pymongo.MongoClient('localhost', 27017)
    dataname = client[mongodb]
    global table2
    len_dict = {}
    table2 = dataname[ranktable]
    global df
    df = pd.DataFrame(data=list(table2.find()))
    df = df.drop(columns=['_id', 'date', 'usernick', 'skuId'])
    df = df.loc[lambda df: df["content"] != "此用户未填写评价内容"]
    df = df.loc[lambda df: df["score"] != 3]
    df['sentiment'] = df['score'].apply(lambda x: 1 if x > 3 else 0)
    df_neg = df.loc[lambda df: df["sentiment"] == 0]
    df_pos = df.loc[lambda df: df["sentiment"] == 1]
    sample_size = min(df_neg.shape[0], df_pos.shape[0])
    if sample_size == df_neg.shape[0]:
        df_pos = df_pos.sample(n=sample_size, random_state=None)
    else:
        df_neg = df_neg.sample(n=sample_size, random_state=None)

    print('df_neg', df_neg.shape)
    print('df_pos', df_pos.shape)
    df = pd.concat([df_pos, df_neg])
    X = df[['content']]
    y = df.sentiment
    X['cutted_comment'] = X.content.apply(chinese_word_cut)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    # Dimensionality reduction of the data
    stop_words_file = "stopwords.txt"
    stopwords = get_custom_stopwords(stop_words_file)
    max_df = 0.8
    min_df = 3
    vect = CountVectorizer(max_df=max_df,
                           min_df=min_df,
                           token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                           stop_words=frozenset(stopwords))
    term_matrix = pd.DataFrame(vect.fit_transform(
        X_train.cutted_comment).toarray(),
                               columns=vect.get_feature_names())
    nb = MultinomialNB()
    pipe = make_pipeline(vect, nb)
    cross_score = cross_val_score(pipe,
                                  X_train.cutted_comment,
                                  y_train,
                                  cv=5,
                                  scoring='accuracy').mean()
    print(f"====训练交叉预测准确率:{cross_score}====")
    # Fit the model
    pipe.fit(X_train.cutted_comment, y_train)
    pipe.predict(X_test.cutted_comment)
    y_pred = pipe.predict(X_test.cutted_comment)
    model_acc = metrics.accuracy_score(y_test, y_pred)
    print(f"====模型预测准确率:{model_acc}====")
    confusion_m = metrics.confusion_matrix(y_test, y_pred)
    print("=======混淆矩阵======")
    print(confusion_m)
    print("====================")
    print("正在保存模型......")
    print("====================")
    model_file_name = dbname + '_trained_model.pkl'
    joblib.dump(pipe, model_file_name)
    print("保存成功")
Example #52
def run_experiment(X, y, alphas, seed, method, data_name, proc_train,
                   proc_unlab, cv, num_c, num_gamma, verbose, n_jobs):
    sensitives = np.unique(X[:, -1])
    X_train, y_train, X_unlab, X_test, y_test = split_data(
        X, y, proc_train, proc_unlab, seed)

    if not isinstance(alphas, dict):
        alphas_dict = {}
        for s in sensitives:
            alphas_dict[s] = alphas
    else:
        alphas_dict = alphas
    SIGNATURE = '{}_{}_'.format(data_name, method)
    scaler = StandardScaler()
    scaler.fit(X_train[:, :-1])

    X_train[:, :-1] = scaler.transform(X_train[:, :-1])
    X_unlab[:, :-1] = scaler.transform(X_unlab[:, :-1])
    X_test[:, :-1] = scaler.transform(X_test[:, :-1])
    n_train, d = X_train.shape
    dual = n_train <= d  # liblinear: solve the primal problem when n_samples > n_features

    methods = {
        "LR": LogisticRegression(solver='liblinear'),
        "L-SVC": CalibratedClassifierCV(LinearSVC(dual=dual)),
        "RF": RandomForestClassifier(),
        "RF+": RandomForestClassifier()
        # "RBF-SVC": SVC(probability=True),
    }

    Cs = np.logspace(-4, 4, num_c)
    gammas = np.logspace(-4, 4, num_gamma)
    pows = np.array([1, 15 / 16, 7 / 8, 3 / 4, 1 / 2, 1 / 4, 1 / 8, 1 / 16, 0])
    ds = np.unique((d**pows).astype('int'))

    if method[-1] == "+":
        randomize = True
        postfix = "+"
        method = method[:-1]
    else:
        randomize = False
        postfix = ""
    # randomize = True if method[-1] == "+" else False

    parameters = {
        "LR": {
            "C": Cs
        },
        "L-SVC": {
            "base_estimator__C": Cs
        },
        "RF": {
            "max_features": ds
        },
        # "RF+" : {"max_features" : ds}
        # "RBF-SVC" : {"C" : Cs, "gamma" : gammas}
    }
    key = method
    BASE_MODEL_SIGNATURE = "{}_{}{}_{}".format(data_name, method, postfix,
                                               seed)
    BASE_MODEL_PATH = 'results/models/{}.pkl'.format(BASE_MODEL_SIGNATURE)
    '''
        Base model does not depend on alpha. Load if it exists, and fit it if not.
    '''
    try:
        clf = joblib.load(BASE_MODEL_PATH)
        print('Model {} loaded'.format(BASE_MODEL_SIGNATURE))
    except OSError:  # covers FileNotFoundError; a bare except would also swallow KeyboardInterrupt
        print('Model {} not found. Fitting ...'.format(BASE_MODEL_SIGNATURE))
        clf = GridSearchCV(methods[key],
                           parameters[key],
                           cv=cv,
                           refit=True,
                           verbose=verbose,
                           n_jobs=n_jobs)
        clf.fit(X_train, y_train)
        joblib.dump(clf, BASE_MODEL_PATH)
    '''
        Transformation step is cheap, we do not save it.
    '''
    transformer = TransformDPAbstantion(clf,
                                        alphas=alphas_dict,
                                        randomize=randomize)
    transformer.fit(X_unlab)
    y_pred = transformer.predict(X_test)
    y_pred_unf = clf.predict(X_test)

    # For test data
    fairness_test = compute_dp(y_test, X_test[:, -1])
    # print_report(fairness_test, 'Test')

    # For base method
    accuracy_base = risk(y_test, y_pred_unf, X_test[:, -1])
    fairness_base = compute_dp(y_pred_unf, X_test[:, -1])
    # print_report(accuracy_base, 'Base')
    # print_report(fairness_base, 'Base')

    # For our method
    accuracy_our = risk(y_test, y_pred, X_test[:, -1])
    fairness_our = compute_dp(y_pred, X_test[:, -1])
    reject_our = classififcation_rate(y_pred, X_test[:, -1])
    # print_report(accuracy_our, 'Our')
    # print_report(fairness_our, 'Our')
    print_report(reject_our, 'Our')

    results = {
        'test': fairness_test,
        'base': {
            **accuracy_base,
            **fairness_base
        },
        'our': {
            **accuracy_our,
            **fairness_our,
            **reject_our
        }
    }

    return results
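
# A minimal sketch of calling run_experiment; every concrete value below (alpha level,
# method name, split fractions, grid sizes) is an assumption for illustration, and X is
# expected to carry the binary sensitive attribute in its last column.
results = run_experiment(
    X, y,
    alphas=0.2,            # one abstention level shared by every sensitive group
    seed=0,
    method="LR",           # or "L-SVC", "RF", "RF+"
    data_name="adult",
    proc_train=0.6, proc_unlab=0.2,
    cv=5, num_c=10, num_gamma=10,
    verbose=0, n_jobs=-1,
)
print(results["base"], results["our"])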
示例#53
0
import sys
sys.path.insert(0, "/usr/local/lib/python2.7/site-packages")
import pandas as pd
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
import numpy as np

df = pd.read_csv('../data/ensembled_data.csv')

X = df[['Convolutional NN', 'Random Forest', 'SVM', 'Dump']]
y = np.ravel(df[['Correct Line']])

model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3,), random_state=1)  # (3,) is a one-element tuple; (3) is just the int 3
model.fit(X, y)

dump(model, '../data/nn_ensembled.joblib')
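
# A minimal sketch of reloading the persisted ensembling model and scoring fresh
# base-model outputs; `load` comes from the joblib import above, while `new_rows`
# (a DataFrame with the same four columns) is assumed.
ensemble = load('../data/nn_ensembled.joblib')
print(ensemble.predict(new_rows[['Convolutional NN', 'Random Forest', 'SVM', 'Dump']]))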
示例#54
0
    f1 = resreg.f1_score(y_test, y_pred, error_threshold=5, 
                     relevance_true=relevance_true, relevance_pred=relevance_pred,
                     relevance_threshold=0.5, k=1e4)
    mse_bins = resreg.bin_performance(y_test, y_pred, bins, metric='MSE')
    
    
    # Store performance results
    r2_store.append(r2)
    mse_store.append(mse)
    mcc_store.append(mcc)
    f1_store.append(f1)
    mse_bins_store.append(mse_bins)

# Performance statistics
r2_mean, r2_std = np.mean(r2_store), np.std(r2_store)
mse_mean, mse_std = np.mean(mse_store), np.std(mse_store)
f1_mean, f1_std = np.mean(f1_store), np.std(f1_store)
mcc_mean, mcc_std = np.mean(mcc_store), np.std(mcc_store)
mse_bins_store = pd.DataFrame(mse_bins_store)
mse_bins_mean, mse_bins_std = np.mean(mse_bins_store, axis=0), np.std(mse_bins_store, axis=0)

# Combine all performance data and write to excel spreadsheet
means = [r2_mean, mse_mean, f1_mean, mcc_mean] + list(mse_bins_mean)
stds = [r2_std, mse_std, f1_std, mcc_std] + list(mse_bins_std)
store = [param] + means + stds


# Save performance results as a binary file (to be read and analyzed later)
joblib.dump(store, f'hpc/joblib_files/{strategy}_{2}.pkl')
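
# A minimal sketch of reading the persisted summary back for later analysis; the file
# name mirrors the dump above ({2} in the f-string is just the literal 2) and
# `strategy` is assumed to be defined.
store = joblib.load(f'hpc/joblib_files/{strategy}_2.pkl')
param, means_and_stds = store[0], store[1:]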

示例#55
0
# selecting a machine-learning model

# estimator = BaggingClassifier(SVC(C=3, kernel = 'rbf', gamma='auto', probability=True, class_weight=class_weights), n_jobs=-1, verbose=1) # very slow
estimator = SVC(C=10,
                kernel='rbf',
                gamma='auto',
                probability=True,
                class_weight=class_weights)  # this is a bit slow
# estimator = RandomForestClassifier(n_estimators=100, class_weight=class_weights, n_jobs=-1)
# estimator = DecisionTreeClassifier(max_features=10, class_weight=class_weights)

# train model and predict test data
estimator.fit(X_train, y_train_int)
y_pred = estimator.predict(X_test)

dump(estimator, "estimator_dump")

# print some metrics
print("#### Evaluation #### \n")

print("confusion matrix:")
print(confusion_matrix(y_test_int, y_pred))
print("classification report: ")
print(classification_report(y_test_int, y_pred))

print("balanced_accuracy_score: ")
print(balanced_accuracy_score(y_test_int, y_pred))

# create ROC curve
# see https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
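
# A minimal ROC sketch for the comment above, assuming a binary problem with the
# positive class encoded as 1; for the multi-class case see the linked sklearn example.
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

y_score = estimator.predict_proba(X_test)[:, 1]   # probability of the positive class
fpr, tpr, _ = roc_curve(y_test_int, y_score)
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {auc(fpr, tpr):.3f})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()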
示例#56
0
    "release_extension", "outs_when_up", "pitch_type_CH", "was_3_2",
    "pitch_type_SI", "pitch_type_FT", "pitch_type_FC", "bat_score",
    "post_bat_score", "pitch_type_CU", "was_3_1", "pitch_type_FS", "was_1_1",
    "was_0_1", "was_2_1", "was_1_2", "was_2_2", "was_0_2"
]
for item in to_pop:
    df.pop(item)

y = df.pop("description").values
y_lookup, y = np.unique(y, return_inverse=True)
X = df.values

print(df.columns)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.3,
                                                    stratify=y,
                                                    random_state=42)

gbm = gb_classifier(subsample=0.7,
                    learning_rate=0.1,
                    max_depth=5,
                    n_estimators=300,
                    verbose=1)
gbm.fit(X_train, y_train)

print(gbm.score(X_test, y_test))
print(gbm.feature_importances_)
dump(gbm, 'reduced_gbm.py.joblib')
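
# A minimal sketch of reloading the persisted classifier and mapping its integer
# predictions back to the original string labels via y_lookup from np.unique above.
from joblib import load
gbm_loaded = load('reduced_gbm.py.joblib')
pred_labels = y_lookup[gbm_loaded.predict(X_test)]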
示例#57
0
# -*- encoding: utf-8 -*-
"""
8.8.5 Model persistence
"""

import joblib
from sklearn.datasets import load_wine
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)

svc = SVC()
svc.fit(X, y)

joblib.dump(svc, r'..\res\svc.m')  # persist the model
svc = joblib.load(r'..\res\svc.m')  # load it back
示例#58
0
# result = model.evals_result()
# print("eval's results :", result)

# r2 = r2_score(y_predict, y_test)
# print("r2 Score : %.2f%%" %(r2 * 100.0))
# print("r2 :", r2)
y_predict = model.predict(x_test)
acc = accuracy_score(y_predict, y_test)
print("acc : ", acc)
###################################################################################

# import pickle  # pickle ships with Python
# pickle.dump(model, open("./model/xgb_save/cancer.pickle.data", "wb"))
# print("SAVED!!!!")
# compare pickle and joblib: both are ways of persisting a model
# from joblib import dump, load
import joblib
joblib.dump(model, "./model/xgb_save/cancer.joblib.data")
print("SAVED!!!!")

# load it back
# model2= pickle.load(open("./model/xgb_save/cancer.pickle.data", "rb"))
model2 = joblib.load("./model/xgb_save/cancer.joblib.data")  # joblib.load takes the path directly

print("LOADED!!!!")
y_predict = model2.predict(x_test)  # use the reloaded model so the round trip is actually verified
acc = accuracy_score(y_predict, y_test)
print("acc : ", acc)
# if the accuracy after saving and reloading matches the one above, the model was persisted and restored correctly
示例#59
0
def ModelFit():
    global best_model

    #construct the hyperparameter grid
    param_dist = {"max_depth": [3, 10, 20, 70, None],
                  "max_features": [2, 10, 41, 80, 'sqrt'],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  #"bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": [100, 300, 500, 800, 1000]}
    pprint(param_dist)

    #define the random forest classifier
    rf = RandomForestClassifier(random_state = 120)

    #search across 1000 randomized combinations in the above grid
    estimator = RandomizedSearchCV(estimator = rf, param_distributions = param_dist, n_iter = 1000, cv = 10, verbose = 10, random_state = 12, scoring = 'roc_auc', n_jobs = -1)

    #fit the model
    grid_result = estimator.fit(X_train, y_train)

    #find and define best estimator based on grid search
    best_model = grid_result.best_estimator_
    print('\nbest_model:\n', best_model)

    #predict y based on test data
    y_pred = grid_result.predict(X_test)

    #accuracy score
    print('accuracy score:', accuracy_score(y_test, y_pred))

    #confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(tn,fp,fn,tp)

    #classification report
    print('\nclassification report:\n',classification_report(y_test, y_pred))

    #AUC and ROC curve
    y_pred_prob = grid_result.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print('auc:', auc)

    false_positive, true_positive, _ = roc_curve(y_test, y_pred_prob)

    font = {'fontname':'Helvetica'}
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive, true_positive, color='black')
    plt.xlabel('False positive rate', **font)
    plt.ylabel('True positive rate', **font)
    plt.savefig('feces_roc.png', dpi=300)
    plt.show()
    
    # Save the model as a pickle in a file 
    joblib.dump(grid_result, 'campy_rf_feces.pkl')
    
    #determine best features
    feature_importances = grid_result.best_estimator_.feature_importances_
    column_names=list(feces)
    del column_names[0]  # -0 is just 0: this removes the first column name so the list matches the feature importances
    importance = pd.DataFrame(feature_importances, index=column_names, columns=["Importance"])
    sort_importance = importance.sort_values(by=['Importance'], ascending = False)
    sort_column_names = sort_importance.index.values.tolist()
    mult = 100/(sort_importance['Importance'].iloc[0])
    sort_imp_mult = sort_importance * mult
    
    top_imp = sort_imp_mult['Importance'].iloc[0:15].tolist()
    top_column_names = sort_column_names[0:15]
    # the computed names are immediately overwritten with manually formatted labels for the plot
    top_column_names =  ['AvgMaxGustSpeed1.6',
                         'AvgAverageHumidity1.7',
                         'AverageHumidityTwoDayBefore',
                         'AvgMaxGustSpeed1.3',
                         'AvgMaxGustSpeed1.5',
                         'AvgMinTemperature1.7',
                         'AvgMaxWindSpeed1.7',
                         'AvgMinHumidity1.4',
                         'AvgMaxHumidity1.3',
                         'AvgPrecipitation1.4',
                         'MaxGustSpeedOneDayBefore',
                         'AvgMaxGustSpeedS1.2',
                         'AvgMaxWindSpeed1.4',
                         'AvgAverageHumidity1.3',
                         'MaxGustSpeedTwoDayBefore']
    
    plt.rcParams.update(plt.rcParamsDefault)
    y_ticks = np.arange(0, len(top_column_names))
    fig, ax = plt.subplots()
    ax.barh(y_ticks, top_imp, color = "dimgray")
    ax.set_yticks(y_ticks)  # set tick positions before attaching labels to them
    ax.set_yticklabels(top_column_names, **font)
    plt.xlabel('Relative Importance', **font)
    fig.tight_layout()
    plt.gca().invert_yaxis()
    plt.savefig('feces_var.png', dpi=300)
    plt.show()
   
    return
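
# A minimal sketch (outside the original function) of reusing the persisted grid-search
# object; `new_samples` with the same feature columns as X_train is assumed.
loaded = joblib.load('campy_rf_feces.pkl')
new_probs = loaded.predict_proba(new_samples)[:, 1]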
示例#60
0
def given_saved_some_step(multiply_by, name, path):
    some_step1 = MultiplyByN(multiply_by=multiply_by)
    some_step1.name = name
    dump(some_step1, path)
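
# A minimal counterpart sketch (not in the source): reload the step persisted above,
# assuming `load` is joblib's load, mirroring the `dump` used here.
def given_loaded_some_step(path):
    some_step2 = load(path)
    return some_step2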