def configure_pipeline(experiment_path,
                       drop_nans=True,
                       load_balance_ratio=None,
                       keep_headers=['RECDESC'],
                       label_column='CHECKTHIS',
                       plot_classes=False,
                       drop_classes_less_than=0,
                       drop_classes_more_than=None):
    base_path = path.join(experiment_path, 'data')
    config_path = path.join(base_path, 'config.json')
    d = {
        'drop_nans': drop_nans,
        'load_balance_ratio': load_balance_ratio,
        'drop_classes_less_than': drop_classes_less_than,
        'drop_classes_more_than': drop_classes_more_than,
        'keep_headers': keep_headers,
        'label_column': label_column,
        'plot_classes': plot_classes,
        'base_path': base_path
    }
    check_and_create(base_path)
    with open(config_path, mode='w+') as fp:
        json.dump(d, fp, indent=4)
def fit(self, X=None, y=None):
    """ Combines preprocessing steps into a pipeline object """
    if X is None:
        X, y = pd.read_pickle(
            path.join(self.__config['data_path'], self.__data_name))

    print("Assembling base feature pipeline")

    # Term Frequency!
    num_tf_features = self.__config['num_features']
    fs_model = FeatureSelectionFactory(k=num_tf_features).get_model(
        self.__config['feature_selection_type'])

    count_vectorizer_tuple = ("TF", self.__get_count_vectorizer())
    feature_selection_model_tuple = ("FS",
                                     fs_model if num_tf_features else None)
    nmf_tuple = ('NMF',
                 NMF(n_components=50,
                     random_state=42,
                     alpha=.1,
                     l1_ratio=.5,
                     init='nndsvd') if self.__config['nmf'] else None)
    idf_tuple = ('IDF', TfidfTransformer() if self.__config['idf'] else None)
    wc_tuple = ('WORD_COUNT', None)
    # self.__add_wordcount_features(X_i) if self.__config['word_count'] else None
    pos_tuple = ('POS', None)
    # self.__add_partsofspeech_features(X_i) if self.__config['pos'] else None

    pipeline_routines = [
        count_vectorizer_tuple, idf_tuple, feature_selection_model_tuple,
        nmf_tuple, wc_tuple, pos_tuple
    ]

    # Keep only the steps that are actually enabled (non-None)
    self.__pipeline_steps.extend(
        [x for x in pipeline_routines if x[1] is not None])

    pipe = Pipeline(self.__pipeline_steps)

    # Fit and persist one pipeline per text column
    for X_i in X:
        name = X_i.columns[0]
        X_i = np.array(X_i.values.tolist()).ravel()
        print("Fitting pipeline!")
        pipe.fit(X_i, y)

        print("Saving features pipeline")
        features_pipeline_location = path.join(
            self.__config['pipe_path'],
            self.__config['feature_pipeline_pickle_name'] + '.' + name)
        ut.check_and_create(self.__config['pipe_path'])
        joblib.dump(pipe, features_pipeline_location, compress=3)
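# Usage sketch (not part of the class above): once fit() has run, each fitted
# feature pipeline is saved per column as
# <pipe_path>/<feature_pipeline_pickle_name>.<column_name> and can be reloaded
# with joblib for inference. The experiment path and column name below are
# illustrative assumptions, not values from this repository.
from os import path

import joblib

pipe_path = path.join('experiments/demo', 'features', 'pipe')
feature_pipe = joblib.load(
    path.join(pipe_path, 'feature_pipeline.pickle' + '.' + 'RECDESC'))
X_new = ["unplanned purchase of garden tools", "weekly grocery shop"]
features = feature_pipe.transform(X_new)  # engineered feature matrix for new text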
def configure_pipeline(experiment_path,
                       classifier_name,
                       validation_path='',
                       threshold=0.5):
    base_path = path.join(experiment_path, 'deploy')
    config_path = path.join(base_path, 'config.json')
    ml_path = path.join(experiment_path, 'ml')
    d = {
        'base_path': base_path,
        'ml_path': ml_path,
        'validation_path': validation_path,
        'classifier_name': classifier_name,
        'threshold': threshold
    }
    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
def __init__(self, experiment_path, data_name, classifier=None):
    base_path = path.join(experiment_path, 'ml')
    config_path = path.join(base_path, 'config.json')
    ut.check_and_create(base_path)
    with open(config_path, 'r') as fp:
        self.__config = json.load(fp)

    self.__classifier = classifier
    self.__name = classifier.__class__.__name__
    ratio = self.__config['train_test_ratio']
    self.__pickle_path = path.join(self.__config['base_path'], self.__name)

    # Load the engineered (X, y) pair and split it for training/evaluation
    X, y = pd.read_pickle(
        path.join(self.__config['features_path'], '_xy_' + data_name))
    self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(
        X, y, test_size=1.0 - ratio, random_state=42, shuffle=True)

    self.__accuracy = self.__config['accuracy']
    self.__threshold = self.__config['threshold']
    self.__classes = []
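# Usage sketch (illustrative): the constructor above expects the 'ml'
# config.json written by the ml-stage configure_pipeline() further down, plus a
# pickled (X, y) pair saved as '_xy_' + data_name under features_path. The
# wrapper class name (MLModel), experiment path and data name are assumptions.
from sklearn.linear_model import LogisticRegression

configure_pipeline('experiments/demo', multi=True, train_test_ratio=0.75,
                   threshold=0.5, accuracy=0.9)
model = MLModel('experiments/demo', 'survey_sample.pkl',
                classifier=LogisticRegression(max_iter=1000))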
def fit(self, X=None, y=None):
    """ Combines preprocessing steps into a pipeline object """
    if X is None:
        data_path = path.join(self.__config['data_path'],
                              self.__data_file_name)
        with bz2.BZ2File(data_path, 'rb') as pickle_file:
            X, y = pickle.load(pickle_file)

    # Assemble the text-cleaning steps that are enabled in the config
    pipeline_steps = []
    if self.__config['spell']:
        pipeline_steps.append(('spell', SpellCheckDoc()))
    if self.__config['split_words']:
        pipeline_steps.append(('split', SplitWords()))
    if self.__config['stop_words']:
        pipeline_steps.append(('stop', StopWords()))
    if self.__config['lemmatize']:
        pipeline_steps.append(('lemmatize', Lemmatizer()))
    if self.__config['stemm']:
        pipeline_steps.append(('stemm', Stemmer()))
    self.__pipeline_steps.extend(pipeline_steps)
    pipe = Pipeline(self.__pipeline_steps)

    # Fit and persist one pipeline per configured text column
    for header in self.__config['text_headers']:
        print("Fitting pipeline for " + header)
        X_i = X[header].astype(str)
        pipe.fit(X_i, y)

        print("Saving text pipeline")
        text_pipeline_location = path.join(
            self.__config['pipe_path'],
            self.__config['text_pipeline_pickle_name'] + '.' + X_i.name)
        check_and_create(self.__config['pipe_path'])
        joblib.dump(pipe, text_pipeline_location, compress=3)
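# Usage sketch (illustrative): driving the text-stage fit once its config.json
# exists (see the text-stage configure_pipeline at the end of this file). The
# wrapper class name (TextPipeline) and its constructor arguments are
# assumptions; fit() then writes one 'text_pipeline.pickle.<header>' file per
# configured header under the stage's pipe_path.
text_stage = TextPipeline('experiments/demo', 'survey_sample.pkl.bz2')
text_stage.fit()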
def configure_pipeline(experiment_path,
                       multi=True,
                       train_test_ratio=0.75,
                       threshold=0.5,
                       accuracy=0.9):
    base_path = path.join(experiment_path, 'ml')
    features_path = path.join(experiment_path, 'features')
    config_path = path.join(base_path, 'config.json')
    d = {
        'multi': multi,
        'base_path': base_path,
        'features_path': features_path,
        'train_test_ratio': train_test_ratio,
        'threshold': threshold,
        'accuracy': accuracy
    }
    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
def configure_pipeline(experiment_path,
                       feature_set=['frequency_matrix'],
                       num_features=0,
                       idf=True,
                       feature_selection_type='Logistic',
                       min_df=3,
                       min_ngram=1,
                       max_ngram=3):
    base_path = path.join(experiment_path, 'features')
    config_path = path.join(base_path, 'config.json')
    pipe_path = path.join(base_path, 'pipe')
    data_path = path.join(experiment_path, 'text')
    d = {
        'feature_set': feature_set,
        'num_features': num_features,
        'max_df': 0.3,
        'idf': idf,
        'feature_selection_type': feature_selection_type,
        'min_df': min_df,
        'min_ngram': min_ngram,
        'max_ngram': max_ngram,
        'data_path': data_path,
        'pipe_path': pipe_path,
        'base_path': base_path,
        'feature_pipeline_pickle_name': 'feature_pipeline.pickle',
        'frequency_matrix': 'frequency_matrix' in feature_set,
        'embeddings': 'embeddings' in feature_set,
        'word_count': 'word_count' in feature_set,
        'pos': 'pos' in feature_set,
        'nmf': 'nmf' in feature_set,
        'tf_idf_filename': None
    }
    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
def __init__(self, experiment_path, data_name):
    base_path = path.join(experiment_path, 'features')
    config_path = path.join(base_path, 'config.json')
    ut.check_and_create(base_path)
    with open(config_path, 'r') as fp:
        self.__config = json.load(fp)

    self.__TFIDF_FILENAME = 'tfidf.pickle'
    self.__pipeline_steps = []
    # to be changed
    # if 'embeddings' in feature_set:
    #     print("Adding embeddings!")
    #     self.__add_embeddings_features_mean(self.__text, 'fasttext_mean_300d')
    #     self.__add_embeddings_features_mean(self.__text, 'glove_mean_300d')
    #     self.__add_embeddings_features_mean(self.__text, 'fasttext_mean_300d', idf_dict=self.__idf_dict)
    #     self.__add_embeddings_features_mean(self.__text, 'glove_mean_300d', idf_dict=self.__idf_dict)
    self.__pipe = None
    self.__n_features = 0
    self.__data_name = data_name
def configure_pipeline(experiment_path,
                       data_path,
                       spell=True,
                       split_words=True,
                       text_headers=['RECDESC', 'EXPDESC'],
                       stop_words=True,
                       lemmatize=False,
                       stemm=False):
    base_path = path.join(experiment_path, 'text')
    config_path = path.join(base_path, 'config.json')
    pipe_path = path.join(base_path, 'pipe')
    lang_path = path.join(base_path, 'lang')
    d = {
        'spell': spell,
        'split_words': split_words,
        'lemmatize': lemmatize,
        'stemm': stemm,
        'pipe_path': pipe_path,
        'data_path': data_path,
        'base_path': base_path,
        'lang_path': lang_path,
        'text_pipeline_pickle_name': 'text_pipeline.pickle',
        'text_headers': text_headers,
        'stop_words': stop_words
    }
    check_and_create(base_path)
    check_and_create(lang_path)
    with open(config_path, mode='w+') as fp:
        json.dump(d, fp)
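# End-to-end configuration sketch (illustrative): every stage exposes its own
# configure_pipeline(), so in practice they would be imported under
# module-qualified names. The package/module names below are assumptions;
# only the argument names come from the functions in this file, and the label
# column, data path and classifier name are placeholders.
from pipelines import (data_pipeline, text_pipeline, feature_pipeline,
                       ml_pipeline, deploy_pipeline)

experiment = 'experiments/demo'
data_pipeline.configure_pipeline(experiment, keep_headers=['RECDESC'],
                                 label_column='LABEL')
text_pipeline.configure_pipeline(experiment, data_path='raw/survey.pkl.bz2',
                                 text_headers=['RECDESC', 'EXPDESC'])
feature_pipeline.configure_pipeline(experiment,
                                    feature_set=['frequency_matrix'], idf=True)
ml_pipeline.configure_pipeline(experiment, multi=True, train_test_ratio=0.75)
deploy_pipeline.configure_pipeline(experiment,
                                   classifier_name='LogisticRegression',
                                   threshold=0.5)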