} clf_param_grid = { 'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'], 'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': uniform(0.0001 / 10, 0.0001 * 1.4), 'shuffle': [False, True], 'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0': uniform(0.01, 1), } cfg_param_grid = {'batch_size': [12500, 25000, 50000]} HashSampler = ParameterSampler(hash_param_grid, n_iter=1) clfSampler = ParameterSampler(clf_param_grid, n_iter=50000000) cfgSampler = ParameterSampler(cfg_param_grid, n_iter=1, random_state=rng) for _ in range(15): print(next(cfgSampler.__iter__())) #for p in cfgSampler: #print(p) # + # 2^18 = 262,144 # 2^21 = 2,097,152 ###### CONFIG ##### xml_lst = glob("/mnt/training_defs/math*/*.xml.gz") # THIS CONFIGURATION WORKS BEAUTIFULLY BUT STILL TRYING TO MAKE IT BETTER #cfg = {'batch_size': 25000, # 'hash_vect': {'decode_error':'ignore', # 'n_features': 2 ** 23, # 'alternate_sign': False, # 'ngram_range': (1,3)}, } #hash_param = next(HashSampler.__iter__())
# Hyper-parameter search space for SGDClassifier: one draw from it is unpacked
# into the SGDClassifier constructor inside the loop below.
# NOTE(review): `uniform` is presumably scipy.stats.uniform(loc, scale), which
# would draw alpha from [1e-5, 1.5e-4] and eta0 from [0.01, 1.01] -- confirm.
clf_param_grid = {'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'],
                  'penalty': ['l2', 'l1', 'elasticnet'],
                  'alpha': uniform(0.0001/10, 0.0001*1.4),
                  'shuffle': [False, True],
                  'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
                  'eta0': uniform(0.01, 1),}

# Candidate values for the `samples` argument given to the paragraph stream.
cfg_param_grid = {'batch_size': [12500, 25000, 50000], }

# One sampler per search space, each built with n_iter=1. hash_param_grid is
# defined outside this section. The loop below takes one draw per pass by
# creating a new iterator every time.
HashSampler = ParameterSampler(hash_param_grid, n_iter=1)
clfSampler = ParameterSampler(clf_param_grid, n_iter=1)
cfgSampler = ParameterSampler(cfg_param_grid, n_iter=1)

# NOTE(review): presumably the best accuracy seen so far and a pass counter --
# confirm against the remainder of the loop.
tboy_acc = 0
cnt = 0

# Every pass re-globs the training files and draws a new vectorizer,
# classifier and batch-size configuration.
while True:
    # cfg is defined outside this section; cfg['train_data'] is used as a
    # glob pattern.
    xml_lst = glob(cfg['train_data'])

    # Build the vectorizer from a freshly sampled set of keyword arguments.
    hash_param = next(HashSampler.__iter__())
    vectorizer = HashingVectorizer(**hash_param)

    clf_param = next(clfSampler.__iter__())
    # Here are some classifiers that support the `partial_fit` method
    partial_fit_classifiers = {
        'SGD': SGDClassifier(**clf_param),
    }

    # test data statistics
    test_stats = {'n_test': 0, 'n_test_pos': 0}

    # First we hold out a number of examples to estimate accuracy
    cfg_param = next(cfgSampler.__iter__())
    # NOTE(review): stream_arxiv_paragraphs is defined elsewhere; it is treated
    # here as an iterator whose items unpack into (texts, labels) -- confirm.
    stream = stream_arxiv_paragraphs(xml_lst, samples=cfg_param['batch_size'])
    # The first item of the stream becomes the held-out test set.
    X_test_text, y_test = next(stream)