def cfg():

    description = 'covid_xray'

    source = '/home/lybarger/clinical_extractors/analyses_pulmonary/step005_text_import/covid_xray/corpus.pkl'

    include = None
    exclude = None
    as_stem = True

    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run1/'

    fast_run = True

    output_dir = paths_pulmonary.predict

    if fast_run:
        destination = os.path.join(output_dir, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, description)

    # Scratch directory
    make_and_clear(destination)

    device = 0

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
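# `make_and_clear` is used throughout these scripts but is not defined in this
# section. A minimal sketch of the assumed behavior (reset a scratch directory
# to an empty state); the actual helper may differ:
import os
import shutil

def make_and_clear(path, recursive=False):
    # Remove any existing directory at `path`, then recreate it empty.
    # `recursive` is assumed to allow creating missing parent directories.
    if os.path.exists(path):
        shutil.rmtree(path)
    if recursive:
        os.makedirs(path)
    else:
        os.mkdir(path)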
def cfg():

    description = 'basic'

    source_dir = paths_pulmonary.brat_import
    source = constants_pulmonary.COVID_XRAY
    file = constants.CORPUS_FILE
    source = os.path.join(source_dir, source, file)

    labels = [INFILTRATES, EXTRAPARENCHYMAL]

    doc_map = constants_pulmonary.DOC_MAP

    doc_label_order = [INFILTRATES, EXTRAPARENCHYMAL]
    assertion_label_order = [NONE, PRESENT, UNILATERAL, BILATERAL]

    '''
    Paths
    '''
    destination = os.path.join(paths_pulmonary.stats, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    #mode = CV
    mode = FIT
    #mode = PREDICT
    #mode = SCORE

    #file_doc_scores = ["scores_doc_labels.csv", "scores_doc_labels_summary.csv", "scores_sent_labels_summary.csv"]
    file_doc_scores = "scores_doc_labels.csv"
    file_sent_scores = "scores_sent_labels_summary.csv"

    source_dirs = [os.path.join(paths_pulmonary.modeling, mode)]

    discrete_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step322_pulmonary_discrete/ngrams/'
    if mode == FIT:
        source_dirs.append(discrete_dir)

    metric = F1

    destination = os.path.join(paths_pulmonary.summary, mode)

    # Raw string avoids invalid-escape warnings in the regex
    suffix_pat = r'\+run\d'

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
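# Illustrative only: `suffix_pat` strips per-run suffixes so repeated runs of
# the same configuration aggregate under one name when summarizing scores.
import re

name = re.sub(r'\+run\d', '', 'sent1+concat0+run1')
assert name == 'sent1+concat0'  # 'run1', 'run2', ... collapse to one key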
def text_to_disk(corpus, destination, sub_dir="text"):
    '''
    Save corpus to disk as txt files
    '''
    dir = os.path.join(destination, sub_dir)
    make_and_clear(dir, recursive=True)
    for doc in corpus.docs():
        doc.write_text(dir)
def fit_cv(self, X, y, device=None, path=None, n_splits=3, shuffle=True, seed=1):

    # Shuffle X and y together so pairs stay aligned
    if shuffle:
        z = list(zip(X, y))
        random.Random(seed).shuffle(z)
        X, y = zip(*z)

    if not isinstance(X, list):
        X = list(X)
    if not isinstance(y, list):
        y = list(y)

    kf = KFold(n_splits=n_splits)

    dfs = OrderedDict()
    for j, (train_index, test_index) in enumerate(kf.split(X)):

        # Reinitialize model weights for each fold
        self.reset_parameters()

        X_train = [X[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        X_test = [X[i] for i in test_index]
        y_test = [y[i] for i in test_index]

        dir = os.path.join(path, f'cross_val_{j}')
        make_and_clear(dir)

        self.fit(X_train, y_train, device=device, path=dir)
        y_pred, scores = self.score(X_test, y_test, device=device, path=dir)

        # Accumulate per-fold score dataframes by name
        for name, df in scores.items():
            if name not in dfs:
                dfs[name] = []
            dfs[name].append(df)

    dfs = self.scorer.combine_cv(dfs, path=path)

    return dfs
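# Hypothetical call site for fit_cv (illustrative; `model`, `corpus`, and
# `destination` are assumed from the surrounding scripts, and the label
# accessor shown here is a placeholder):
#
#   X = [doc.text() for doc in corpus.docs()]
#   y = [labels_for(doc) for doc in corpus.docs()]  # placeholder accessor
#   dfs = model.fit_cv(X, y, device=0, path=destination, n_splits=3)
#
# Each fold writes its artifacts under destination/cross_val_{j}, and the
# returned `dfs` holds the per-fold scores combined by the scorer.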
def tokenization_examples(self, path, n, **kwargs):

    make_and_clear(path, recursive=True)

    for doc in self.docs(**kwargs)[:n]:

        # Output file name
        fn = os.path.join(path, '{}_original.{}'.format(doc.id, 'txt'))

        # Directory, including path in id
        dir_ = os.path.dirname(fn)
        if not os.path.exists(dir_):
            os.makedirs(dir_)

        with open(fn, 'w') as f:
            f.write(doc.text())

        fn = os.path.join(path, '{}_tokenized.{}'.format(doc.id, 'txt'))
        with open(fn, 'w') as f:
            f.write('\n====\n====\n'.join(doc.sents()))
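# Hedged usage example, assuming this method lives on the corpus class used
# elsewhere in these scripts:
#
#   corpus.tokenization_examples(path=os.path.join(destination, 'tokenization'), n=5)
#
# This writes {id}_original.txt / {id}_tokenized.txt pairs for the first 5
# documents, for manual inspection of sentence segmentation.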
def cfg():

    use_binary = False
    description = f'binary_{int(use_binary)}'

    source = constants_pulmonary.XRAY_IMAGES

    source_corpus_text = '/home/lybarger/clinical_extractors/analyses_pulmonary/step005_text_import/covid_xray/corpus.pkl'
    source_corpus_anno = '/home/lybarger/clinical_extractors/analyses_pulmonary/step010_brat_import/covid_xray/corpus.pkl'

    if use_binary:
        source_model = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0+bd1/'
        doc_map = constants_pulmonary.DOC_MAP_BINARY
    else:
        source_model = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0/'
        doc_map = constants_pulmonary.DOC_MAP

    source_image = paths_pulmonary.xray_quadrant_interp

    load_predictions = False

    #file = constants.CORPUS_FILE
    #source = os.path.join(source_dir, source, file)

    device = 0

    '''
    Paths
    '''
    destination = os.path.join(paths_pulmonary.image_anno_comp, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def get_predictions(source_model, device, text_dict, target_ids, path, load_predictions=False):

    logging.info("=" * 72)
    logging.info("Predictions")
    logging.info("=" * 72)

    logging.info(f"Document count, all: {len(text_dict)}")

    text = []
    for study_id, accession in target_ids:
        k = (study_id, accession)
        text.append(text_dict[k])
    logging.info(f"Document count, target: {len(text)}")

    dir = os.path.join(path, 'predictions')
    f = os.path.join(dir, PREDICTIONS_FILE)

    if load_predictions:
        # Reuse cached predictions from a previous run
        y = joblib.load(f)
    else:
        model = load_pretrained(ModelXray, source_model)
        y = model.predict(X=text, device=device)
        make_and_clear(dir)
        joblib.dump(y, f)

    labels = [y_[constants.DOC_LABELS] for y_ in y]
    labels = nest_list(labels)

    return labels
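# `nest_list` is not defined in this section. From its use above (a list of
# per-document label dicts in, one dict of label lists out), a plausible
# sketch; the actual helper may differ:
from collections import OrderedDict

def nest_list(dicts):
    # e.g. [{'a': 1}, {'a': 2}] -> {'a': [1, 2]}
    out = OrderedDict()
    for d in dicts:
        for k, v in d.items():
            out.setdefault(k, []).append(v)
    return out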
def cfg():

    # Annotation source
    #source = constants.SYMPTOMS
    #source = constants.SDOH
    source = constants_pulmonary.COVID_XRAY
    #source = constants.SDOH_DEID
    #source = constants.SDOH_PARTIAL

    fast_run = False
    fast_count = 50 if fast_run else None

    source_dir = None
    skip = None
    source_tags = None
    source_original = None
    write_brat = False
    write_text = False
    map_ids = False
    corpus_object = CorpusBrat
    update_lb = False
    id2tags = None
    rm_extra_lb = False
    snap_textbounds = False
    linebreak_bound = True

    if source == constants.SDOH:
        source_dir = paths_deid.sdoh_brat
        source_tags = paths_deid.sdoh_doc_tags
        source_original = paths_deid.sdoh_original
        dir = paths_deid.brat_import
        write_brat = True
        write_text = True
        update_lb = True

    elif source == constants.SDOH_DEID:
        source_dir = paths_deid.sdoh_brat_deid
        dir = paths_deid.brat_import
        corpus_object = CorpusBratDeid

    elif source == constants.SDOH_PARTIAL:
        source_dir = paths.sdoh_brat_partial
        source_tags = paths.sdoh_doc_tags
        source_original = paths.sdoh_original
        dir = paths.brat_import
        write_brat = True
        write_text = True
        fast_run = True

    elif source == constants.SYMPTOMS:
        source_dir = paths_symptoms.symptoms_brat
        source_tags = paths_symptoms.symptoms_doc_tags
        #source_original = paths_symptoms.symptoms_original
        dir = paths_symptoms.brat_import
        corpus_object = CorpusBratSymptoms
        write_brat = False
        write_text = False

    elif source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_xray
        source_tags = paths_pulmonary.pulmonary_doc_tags
        dir = paths_pulmonary.brat_import
        corpus_object = CorpusBratXray
        id2tags = corpus_brat_xray.id2tags
        rm_extra_lb = True
        snap_textbounds = True
        skip = []

    else:
        raise ValueError("invalid source: {}".format(source))

    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(dir, source + '_FAST_RUN')
    else:
        destination = os.path.join(dir, source)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    description = 'bert'
    #description = 'baseline+crf'

    source = constants_pulmonary.COVID_XRAY
    dir = None

    mode = CV
    #mode = FIT
    #mode = PREDICT
    #mode = SCORE

    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/baseline/'

    fast_run = False
    fast_count = 20 if fast_run else None

    source_dir = None
    n_splits = 3

    doc_map = constants_pulmonary.DOC_MAP

    if source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_import
        source = constants_pulmonary.COVID_XRAY
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_pulmonary.modeling
    else:
        raise ValueError("invalid source: {}".format(source))

    side_swap = False
    if side_swap:
        entity_definition = ENTITY_DEFINITION_SWAP
        relation_definition = RELATION_DEFINITION_SWAP
    else:
        entity_definition = ENTITY_DEFINITION
        relation_definition = RELATION_DEFINITION

    sent_definition = SENT_DEFINITION

    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, mode, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, mode, description)

    # Scratch directory
    make_and_clear(destination)

    device = 0

    use_sent_objective = True
    concat_sent_scores = True
    span_embed_dim = 50
    batch_size = 4
    max_sent_count = 35
    keep_ws = False
    linebreak_bound = True
    dropout_sent = 0.0
    dropout_doc = 0.0
    lr = 1e-5
    lr_ratio = 1.0
    pretrained = "emilyalsentzer/Bio_ClinicalBERT"  # 'bert-base-uncased'

    doc_definition = DOC_DEFINITION

    grad_max_norm = 1.0
    loss_reduction = "sum"
    project_sent = True
    project_size = 200
    attention_query_dim = 100
    max_length = 60
    num_workers = 6
    num_epochs = 10

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def write_brat(self, path, **kwargs):
    make_and_clear(path, recursive=True)
    for doc in self.docs(**kwargs):
        doc.write_brat(path)
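# Hedged usage example, assuming the corpus class exposes this method and
# forwards filter kwargs to docs():
#
#   corpus.write_brat(os.path.join(destination, 'brat'))
#
# This resets destination/brat and writes one brat file set per document.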
def cfg():

    description = 'baseline'

    source = constants.SYMPTOMS
    dir = None

    fast_run = True
    source_dir = None

    include_train = [constants.TRAIN]

    if source == constants.SYMPTOMS:
        source_dir = paths_symptoms.brat_import
        if fast_run:
            source += '_FAST_RUN'
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_symptoms.modeling
    else:
        raise ValueError("invalid source: {}".format(source))

    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, description)

    # Scratch directory
    make_and_clear(destination)

    device = 1
    use_rnn = True
    num_workers = 0

    xfmr_dim = 768
    lstm_size = 200
    h_size = lstm_size * 2 if use_rnn else xfmr_dim
    loss_reduction = "sum"

    hyperparams = {}
    hyperparams['use_rnn'] = use_rnn

    rnn = {}
    rnn['input_size'] = xfmr_dim
    rnn['output_size'] = lstm_size
    rnn['type_'] = 'lstm'
    rnn['num_layers'] = 1
    rnn['dropout_output'] = 0.0
    hyperparams['rnn'] = rnn

    relation_extractor = {}
    relation_extractor["entity_definition"] = constants_symptoms.ENTITY_DEFINITION
    relation_extractor["input_dim"] = h_size
    relation_extractor["span_scorer_type"] = "span"
    relation_extractor["span_embed_project"] = True
    relation_extractor["span_embed_dim"] = 100
    relation_extractor["span_embed_dropout"] = 0.0
    relation_extractor["span_scorer_hidden_dim"] = 100
    relation_extractor["span_scorer_dropout"] = 0.0
    relation_extractor["span_class_weights"] = None
    relation_extractor["spans_per_word"] = 2
    relation_extractor["relation_definition"] = constants_symptoms.RELATION_DEFINITION
    relation_extractor["role_hidden_dim"] = 100
    relation_extractor["role_output_dim"] = 2
    relation_extractor["role_dropout"] = 0.0
    relation_extractor["loss_reduction"] = loss_reduction
    hyperparams["relation_extractor"] = relation_extractor

    hyperparams['grad_max_norm'] = 1.0
    hyperparams["loss_reduction"] = loss_reduction

    dataset_params = {}
    dataset_params["pretrained"] = "emilyalsentzer/Bio_ClinicalBERT"
    dataset_params["max_length"] = 30
    dataset_params["max_wp_length"] = 60
    dataset_params["linebreak_bound"] = True
    dataset_params["keep"] = 'mean'
    dataset_params["max_span_width"] = 6
    dataset_params["entity_definition"] = constants_symptoms.ENTITY_DEFINITION
    dataset_params["relation_definition"] = constants_symptoms.RELATION_DEFINITION

    dataloader_params = {}
    dataloader_params['batch_size'] = 100
    dataloader_params['num_workers'] = num_workers

    optimizer_params = {}
    optimizer_params['lr'] = 0.001

    # NOTE: overrides the earlier num_workers = 0; dataloader_params above
    # was built with the earlier value.
    num_workers = 6
    num_epochs = 100

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def main(source, destination, corpus_fn, num_examples, source_params, sampling,
         exclude, annotators, id2filename):

    # Load and create corpus
    logging.info("Corpus loading...")
    if source == constants_pulmonary.COVID_XRAY:
        corpus = xray.load_corpus(**source_params)

        logging.info('-' * 72)
        logging.info('Only day 0')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=(0, 0), rep_range=None)

        logging.info('-' * 72)
        logging.info('Only day in [0-7]')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=(0, 7), rep_range=None)

        logging.info('-' * 72)
        logging.info('All notes')
        logging.info('-' * 72)
        corpus.summary(path=destination, day_range=None, rep_range=None)

    elif source == constants_pulmonary.EMERGE_XRAY:
        corpus = xray.load_emerge_corpus(**source_params)
    else:
        # Format the unrecognized source, not the (undefined) corpus
        raise ValueError("Incorrect corpus:\t{}".format(source))
    logging.info("Corpus loaded")

    # Save examples for review
    example_dir = os.path.join(destination, "Examples")
    make_and_clear(example_dir, recursive=True)
    corpus.write_examples(example_dir, num_examples=num_examples)

    corpus.summary(path=destination)
    corpus.write_ids(destination)

    exclude = [tuple(e) for e in exclude]

    if sampling is not None:
        logging.info('Sampling')

        # Sampling
        brat_dir = os.path.join(destination, "brat")
        make_and_clear(brat_dir, recursive=True)

        sampled = []
        for params in sampling:
            dir_ = os.path.join(brat_dir, 'round{:0>2d}'.format(params['round']))
            os.mkdir(dir_)

            logging.info('')
            logging.info('Round:\t{}'.format(params['round']))

            docs = corpus.random_sample(
                size=params['size'],
                exclude=exclude,
                seed=params['seed'],
                path=dir_,
                brat=True,
                footer=params['footer'],
                annotators=annotators,
                anno_type=params['anno_type'],
                **params['kwargs'])

            # Exclude already-sampled docs from subsequent rounds
            exclude.extend(list(docs.keys()))

            for id in docs:
                if id2filename is not None:
                    id = id2filename(id)
                sampled.append((params['round'], id))

        sample_check(brat_dir, destination)

        fn = os.path.join(destination, 'sampled_documents.csv')
        df = pd.DataFrame(sampled, columns=['round', 'id'])
        df.to_csv(fn)

    # Save corpus
    logging.info("Saving to disk...")
    joblib.dump(corpus, corpus_fn)
    logging.info("Saving complete")

    return True
def cfg():

    description = 'baseline'
    #description = 'baseline+crf'

    source = constants_pulmonary.COVID_XRAY
    dir = None

    #mode = CV
    #mode = FIT
    #mode = PREDICT
    #mode = SCORE
    mode = PROB

    model_dir = '/home/lybarger/clinical_extractors/analyses_pulmonary/step320_pulmonary_modeling/fit/sent1+concat0+run0+bd1/'

    fast_run = False
    fast_count = 20 if fast_run else None

    source_dir = None
    n_splits = 3

    binary_doc_map = False
    if binary_doc_map:
        doc_map = constants_pulmonary.DOC_MAP_BINARY
        doc_definition = DOC_DEFINITION_BINARY
    else:
        doc_map = constants_pulmonary.DOC_MAP
        doc_definition = DOC_DEFINITION

    if source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_import
        source = constants_pulmonary.COVID_XRAY
        source = os.path.join(source_dir, source, constants.CORPUS_FILE)
        output_dir = paths_pulmonary.modeling
    else:
        raise ValueError("invalid source: {}".format(source))

    side_swap = False
    if side_swap:
        entity_definition = ENTITY_DEFINITION_SWAP
        relation_definition = RELATION_DEFINITION_SWAP
    else:
        entity_definition = ENTITY_DEFINITION
        relation_definition = RELATION_DEFINITION

    sent_definition = SENT_DEFINITION

    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, mode, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, mode, description)

    # Scratch directory
    make_and_clear(destination)

    device = 0
    use_rnn = True
    use_doc_classifier = True
    use_span_classifier = False
    use_doc_features = False
    use_sent_objective = True
    concat_sent_scores = True
    projection_dim = 100

    # Concatenating sentence scores only makes sense when the sentence
    # objective is enabled
    if concat_sent_scores:
        assert use_sent_objective

    span_embed_dim = 50
    dropout_rnn = 0.0
    dropout_sent_classifier = 0.0
    dropout_doc_classifier = 0.0
    batch_size = 30
    linebreak_bound = True
    max_sent_count = 35
    keep_ws = True
    num_workers = 0

    xfmr_dim = 768
    rnn_size = 100
    h_size = rnn_size * 2 if use_rnn else xfmr_dim
    loss_reduction = "sum"
    lr = 0.002

    hyperparams = {}
    hyperparams['use_rnn'] = use_rnn
    hyperparams['use_doc_classifier'] = use_doc_classifier
    hyperparams['use_span_classifier'] = use_span_classifier
    hyperparams['use_doc_features'] = use_doc_features

    rnn = {}
    rnn['input_size'] = xfmr_dim
    rnn['output_size'] = rnn_size
    rnn['type_'] = 'lstm'
    rnn['num_layers'] = 1
    rnn['dropout_output'] = dropout_rnn
    rnn['layer_norm'] = True
    hyperparams['rnn'] = rnn

    span_class_weights = OrderedDict()
    w = 100.0
    span_class_weights[REGION] = [1.0, w, w]     # [NONE, PARENCHYMAL, EXTRAPARENCHYMAL]
    span_class_weights[SIDE] = [1.0, w, w]       # [NONE, UNILATERAL, BILATERAL]
    span_class_weights[SIZE] = [1.0, w, w, w]    # [NONE, SMALL, MODERATE, LARGE]
    span_class_weights[NEGATION] = [1.0, w]      # [NONE, NEGATION]

    relation_extractor = {}
    relation_extractor["entity_definition"] = entity_definition
    relation_extractor["input_dim"] = h_size
    relation_extractor["span_scorer_type"] = "span"
    relation_extractor["span_embed_project"] = True
    relation_extractor["span_embed_dim"] = span_embed_dim
    relation_extractor["span_embed_dropout"] = 0
    relation_extractor["span_scorer_hidden_dim"] = 50
    relation_extractor["span_scorer_dropout"] = 0
    relation_extractor["span_class_weights"] = None  # span_class_weights
    relation_extractor["spans_per_word"] = 2
    relation_extractor["relation_definition"] = relation_definition
    relation_extractor["role_hidden_dim"] = 50
    relation_extractor["role_output_dim"] = 2
    relation_extractor["role_dropout"] = 0
    relation_extractor["create_doc_vector"] = use_doc_features
    relation_extractor["doc_attention_dropout"] = 0
    relation_extractor["loss_reduction"] = loss_reduction
    hyperparams["relation_extractor"] = relation_extractor
    doc_classifier = {}
    doc_classifier["doc_definition"] = doc_definition
    doc_classifier["input_dim"] = h_size
    doc_classifier["query_dim"] = 100
    doc_classifier["use_ffnn"] = True
    doc_classifier["dropout_sent_classifier"] = dropout_sent_classifier
    doc_classifier["dropout_doc_classifier"] = dropout_doc_classifier
    doc_classifier["activation"] = 'tanh'
    doc_classifier["loss_reduction"] = loss_reduction
    doc_classifier["use_sent_objective"] = use_sent_objective
    doc_classifier["concat_sent_scores"] = concat_sent_scores
    doc_classifier["sent_definition"] = sent_definition
    doc_classifier["projection_dim"] = projection_dim
    hyperparams['doc_classifier'] = doc_classifier

    hyperparams['grad_max_norm'] = 1.0
    hyperparams["loss_reduction"] = loss_reduction

    dataset_params = {}
    dataset_params["pretrained"] = "emilyalsentzer/Bio_ClinicalBERT"
    dataset_params["max_length"] = 30
    dataset_params["max_wp_length"] = 50
    dataset_params["max_sent_count"] = max_sent_count
    dataset_params["linebreak_bound"] = linebreak_bound
    dataset_params["keep"] = 'mean'
    dataset_params["max_span_width"] = 6
    dataset_params["document_definition"] = doc_definition
    dataset_params["sent_definition"] = sent_definition
    dataset_params["entity_definition"] = entity_definition
    dataset_params["relation_definition"] = relation_definition
    dataset_params["pad_start"] = True
    dataset_params["pad_end"] = True
    dataset_params["keep_ws"] = False

    dataloader_params = {}
    dataloader_params['batch_size'] = batch_size
    dataloader_params['num_workers'] = num_workers

    optimizer_params = {}
    optimizer_params['lr'] = lr

    tokenization_params = {}
    tokenization_params['max_length'] = dataset_params["max_length"]
    tokenization_params['max_sent_count'] = dataset_params["max_sent_count"]
    tokenization_params['linebreak_bound'] = linebreak_bound
    tokenization_params['pad_start'] = dataset_params["pad_start"]
    tokenization_params['pad_end'] = dataset_params["pad_end"]

    num_workers = 6
    num_epochs = 300

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    # Annotation source
    #source = constants.SYMPTOMS
    #source = constants.SDOH
    source = constants_pulmonary.COVID_XRAY

    description = 'all'

    fast_run = False
    source_dir = None

    index_round = 0
    index_annotator = 1
    index_note = 2

    round = None
    annotators = None
    scorer = Scorer
    label_spec = {}

    if source == constants.SDOH:
        pass
    elif source == constants.SDOH_PARTIAL:
        pass
    elif source == constants.SYMPTOMS:
        pass
    elif source == constants_pulmonary.COVID_XRAY:
        source_dir = paths_pulmonary.brat_import
        dir = paths_pulmonary.agreement

        if description == 'round01':
            target_rounds_aggree = ["round01"]
        elif description == 'round04':
            target_rounds_aggree = ["round04"]
        elif description == 'all':
            target_rounds_aggree = ["round01", "round04"]
            target_rounds_dist = ["round02"]

        #annotator_pairs = [('Mark', 'Linzee'),
        #                   ('Mark', 'Matthew'),
        #                   ('Linzee', 'Matthew')]
        annotator_pairs = [('Linzee', 'Matthew')]

        scorer = ScorerXray
        doc_map = constants_pulmonary.DOC_MAP
        label_spec = {'doc_map': doc_map}
    else:
        raise ValueError("invalid source: {}".format(source))

    source_corpus = os.path.join(source_dir, source, constants.CORPUS_FILE)

    '''
    Paths
    '''
    destination = os.path.join(dir, source, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
def cfg():

    #prediction_type = 'oracle'
    run = 0
    prediction_type = 'ngrams'

    exclude_size = False
    exclude = None

    description = f"{prediction_type}+run{run}"

    fast_run = False
    n_splits = 3

    doc_map = constants_pulmonary.DOC_MAP

    source_dir = paths_pulmonary.brat_import
    source = constants_pulmonary.COVID_XRAY
    source = os.path.join(source_dir, source, constants.CORPUS_FILE)
    output_dir = paths_pulmonary.discrete

    labels = [INFILTRATES, EXTRAPARENCHYMAL]

    if prediction_type == 'oracle':
        model_type = 'random_forest'

        hyperparams = OrderedDict()
        hyperparams['n_estimators'] = 200
        hyperparams['max_depth'] = None
        hyperparams['min_samples_split'] = 2
        hyperparams['min_samples_leaf'] = 1
        hyperparams['min_weight_fraction_leaf'] = 0.0
        hyperparams['max_features'] = 'auto'
        hyperparams['max_leaf_nodes'] = None
        hyperparams['random_state'] = None
        hyperparams['ccp_alpha'] = 0.0
        hyperparams['n_jobs'] = 1
        #hyperparams['class_weight'] = {0: 1, 1: 1}

        tuned_parameters = OrderedDict()
        tuned_parameters['max_depth'] = [5, 10, 30] if fast_run else \
            [2, 4, 6, 8, 10, 15, 20, 25, 30, 35]
        tuned_parameters['min_samples_split'] = [2, 4] if fast_run else \
            [2, 3, 4, 6, 8, 10, 12]
        tuned_parameters['n_estimators'] = [100] if fast_run else \
            [50, 100, 200, 500]

        exclude_size = True
        if exclude_size:
            description = 'exclude_size'
            exclude = [SIZE]

    elif prediction_type == 'ngrams':
        model_type = 'svm'

        hyperparams = OrderedDict()
        hyperparams['C'] = 1.0
        #hyperparams['kernel'] = 'rbf'
        #hyperparams['degree'] = 3
        #hyperparams['gamma'] = 'scale'
        #hyperparams['coef0'] = 0.0
        #hyperparams['shrinking'] = True
        #hyperparams['probability'] = True
        #hyperparams['tol'] = 0.001
        #hyperparams['cache_size'] = 200
        #hyperparams['class_weight'] = None
        #hyperparams['verbose'] = False
        #hyperparams['max_iter'] = -1
        #hyperparams['decision_function_shape'] = 'ovr'
        #hyperparams['break_ties'] = False
        #hyperparams['random_state'] = None

        tuned_parameters = OrderedDict()
        tuned_parameters['C'] = [0.0001, 1.0, 1000.0] if fast_run else \
            [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    else:
        raise ValueError(f"invalid prediction type: {prediction_type}")

    '''
    Paths
    '''
    if fast_run:
        destination = os.path.join(output_dir, prediction_type, description + '_FAST_RUN')
    else:
        destination = os.path.join(output_dir, prediction_type, description)

    # Scratch directory
    make_and_clear(destination)

    # Create observers
    file_observ = FileStorageObserver.create(destination)
    cust_observ = CustomObserver(destination)
    ex.observers.append(file_observ)
    ex.observers.append(cust_observ)
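# The hyperparams/tuned_parameters pair suggests a grid search downstream
# (not shown in this section). A minimal sketch, assuming scikit-learn's
# GridSearchCV over the SVM branch; the scoring choice is an assumption:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def grid_search_sketch(X_train, y_train, hyperparams, tuned_parameters, n_splits):
    # Base estimator from `hyperparams`, grid from `tuned_parameters`,
    # cross-validated with `n_splits` folds.
    search = GridSearchCV(
        estimator=SVC(**hyperparams),
        param_grid=dict(tuned_parameters),
        cv=n_splits,
        scoring='f1_micro',  # assumed; the repo's metric constant is F1
    )
    search.fit(X_train, y_train)
    return search.best_estimator_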