Example #1
from os import path
import json

# `check_and_create(path)` is a project helper assumed to create the
# directory if it does not already exist.


def configure_pipeline(experiment_path,
                       drop_nans=True,
                       load_balance_ratio=None,
                       keep_headers=['RECDESC'],
                       label_column='CHECKTHIS',
                       plot_classes=False,
                       drop_classes_less_than=0,
                       drop_classes_more_than=None):

    base_path = path.join(experiment_path, 'data')
    config_path = path.join(base_path, 'config.json')

    d = {
        'drop_nans': drop_nans,
        'load_balance_ratio': load_balance_ratio,
        'drop_classes_less_than': drop_classes_less_than,
        'drop_classes_more_than': drop_classes_more_than,
        'keep_headers': keep_headers,
        'label_column': label_column,
        'plot_classes': plot_classes,
        'base_path': base_path
    }
    check_and_create(base_path)
    with open(config_path, mode='w+') as fp:
        json.dump(d, fp, indent=4)
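# --- Usage sketch (added illustration, not part of the original example) ---
# Assuming a real `check_and_create` helper, the call below writes
# <experiment>/data/config.json, and the settings round-trip through
# json.load. The experiment path is hypothetical.
configure_pipeline('/tmp/experiment',
                   keep_headers=['RECDESC'],
                   label_column='CHECKTHIS',
                   drop_classes_less_than=5)
with open(path.join('/tmp/experiment', 'data', 'config.json')) as fp:
    loaded = json.load(fp)
assert loaded['drop_classes_less_than'] == 5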
Example #2
    def fit(self, X=None, y=None):
        """
        Combines preprocessing steps into a pipeline object
        """

        if X is None:
            X, y = pd.read_pickle(
                path.join(self.__config['data_path'], self.__data_name))

        print("Assembling base feature pipeline")
        # Term Frequency!

        num_tf_features = self.__config['num_features']
        fs_model = FeatureSelectionFactory(k=num_tf_features).get_model(
            self.__config['feature_selection_type'])

        count_vectorizer_tuple = ("TF", self.__get_count_vectorizer())
        feature_selection_model_tuple = ("FS",
                                         fs_model if num_tf_features else None)
        nmf_tuple = ('NMF',
                     NMF(n_components=50,
                         random_state=42,
                         alpha=.1,
                         l1_ratio=.5,
                         init='nndsvd') if self.__config['nmf'] else None)
        idf_tuple = ('IDF',
                     TfidfTransformer() if self.__config['idf'] else None)
        # Word-count and part-of-speech steps are currently disabled; the
        # commented-out expressions show the intended wiring:
        # self.__add_wordcount_features(X_i) if self.__config['word_count'] else None
        # self.__add_partsofspeech_features(X_i) if self.__config['pos'] else None
        wc_tuple = ('WORD_COUNT', None)
        pos_tuple = ('POS', None)

        pipeline_routines = [
            count_vectorizer_tuple, idf_tuple, feature_selection_model_tuple,
            nmf_tuple, wc_tuple, pos_tuple
        ]
        self.__pipeline_steps.extend(
            [x for x in pipeline_routines if x[1] is not None])

        pipe = Pipeline(self.__pipeline_steps)

        # X is expected to be an iterable of single-column DataFrames; each
        # column is flattened to a 1-D array of strings before fitting.
        for X_i in X:
            name = X_i.columns[0]
            X_i = np.array(X_i.values.tolist()).ravel()
            print("Fitting pipeline!")
            pipe.fit(X_i, y)

            print("Saving features pipeline")
            features_pipeline_location = path.join(
                self.__config['pipe_path'],
                self.__config['feature_pipeline_pickle_name'] + '.' + name)
            ut.check_and_create(self.__config['pipe_path'])
            joblib.dump(pipe, features_pipeline_location, compress=3)
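# --- Usage sketch (added illustration, not part of the original example) ---
# `fit` saves one fitted pipeline per input column, e.g.
# <pipe_path>/feature_pipeline.pickle.RECDESC. A saved pipeline can be
# reloaded with joblib and applied to new text; the path is hypothetical.
import joblib

pipe = joblib.load('/tmp/experiment/features/pipe/feature_pipeline.pickle.RECDESC')
features = pipe.transform(['replacement kitchen worktop'])
print(features.shape)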
Example #3
def configure_pipeline(experiment_path,
                       classifier_name,
                       validation_path='',
                       threshold=0.5):

    base_path = path.join(experiment_path, 'deploy')
    config_path = path.join(base_path, 'config.json')
    ml_path = path.join(experiment_path, 'ml')
    d = {
        'base_path': base_path,
        'ml_path': ml_path,
        'validation_path': validation_path,
        'classifier_name': classifier_name,
        'threshold': threshold
    }

    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
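# --- Usage sketch (added illustration, not part of the original example) ---
# The deploy config's `threshold` is the kind of value usually compared
# against predicted class probabilities; a hedged example of that pattern:
with open(path.join('/tmp/experiment', 'deploy', 'config.json')) as fp:
    deploy_config = json.load(fp)

probabilities = [0.91, 0.42, 0.77]  # e.g. clf.predict_proba(X)[:, 1]
accepted = [p >= deploy_config['threshold'] for p in probabilities]
print(accepted)  # [True, False, True] with the default threshold of 0.5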
Example #4
    def __init__(self, experiment_path, data_name, classifier=None):

        base_path = path.join(experiment_path, 'ml')
        config_path = path.join(base_path, 'config.json')

        ut.check_and_create(base_path)
        with open(config_path, 'r') as fp:
            self.__config = json.load(fp)

        self.__classifier = classifier
        self.__name = classifier.__class__.__name__
        ratio = self.__config['train_test_ratio']
        self.__pickle_path = path.join(self.__config['base_path'], self.__name)
        X, y = pd.read_pickle(
            path.join(self.__config['features_path'], '_xy_' + data_name))

        self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(
            X, y, test_size=1.0 - ratio, random_state=42, shuffle=True)
        self.__accuracy = self.__config['accuracy']
        self.__threshold = self.__config['threshold']
        self.__classes = []
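# --- Note on the split above (added illustration) ---
# `test_size=1.0 - ratio` makes `train_test_ratio` the TRAIN fraction: with
# the default ratio of 0.75 and 1000 rows, 750 rows land in the train split
# and 250 in the test split. A standalone check:
from sklearn.model_selection import train_test_split

rows = list(range(1000))
train, test = train_test_split(rows, test_size=1.0 - 0.75,
                               random_state=42, shuffle=True)
print(len(train), len(test))  # 750 250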
Example #5
    def fit(self, X=None, y=None):
        """
        Combines preprocessing steps into a pipeline object
        """

        if X is None:
            data_path = path.join(self.__config['data_path'],
                                  self.__data_file_name)
            with bz2.BZ2File(data_path, 'rb') as pickle_file:
                X, y = pickle.load(pickle_file)

        pipeline_steps = []

        if self.__config['spell']:
            pipeline_steps.append(('spell', SpellCheckDoc()))
        if self.__config['split_words']:
            pipeline_steps.append(('split', SplitWords()))
        if self.__config['stop_words']:
            pipeline_steps.append(('stop', StopWords()))
        if self.__config['lemmatize']:
            pipeline_steps.append(('lemmatize', Lemmatizer()))
        if self.__config['stemm']:
            pipeline_steps.append(('stemm', Stemmer()))

        self.__pipeline_steps.extend(pipeline_steps)

        pipe = Pipeline(self.__pipeline_steps)

        for header in self.__config['text_headers']:
            print("Fitting pipeline for " + header)
            X_i = X[header].astype(str)
            pipe.fit(X_i, y)

            print("Saving text pipeline")
            text_pipeline_location = path.join(
                self.__config['pipe_path'],
                self.__config['text_pipeline_pickle_name'] + '.' + X_i.name)
            check_and_create(self.__config['pipe_path'])
            joblib.dump(pipe, text_pipeline_location, compress=3)
Example #6
def configure_pipeline(
    experiment_path,
    multi=True,
    train_test_ratio=0.75,
    threshold=0.5,
    accuracy=0.9,
):

    base_path = path.join(experiment_path, 'ml')
    features_path = path.join(experiment_path, 'features')
    config_path = path.join(base_path, 'config.json')
    d = {
        'multi': multi,
        'base_path': base_path,
        'features_path': features_path,
        'train_test_ratio': train_test_ratio,
        'threshold': threshold,
        'accuracy': accuracy
    }
    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
Example #7
def configure_pipeline(experiment_path,
                       feature_set=['frequency_matrix'],
                       num_features=0,
                       idf=True,
                       feature_selection_type='Logistic',
                       min_df=3,
                       min_ngram=1,
                       max_ngram=3):
    base_path = path.join(experiment_path, 'features')
    config_path = path.join(base_path, 'config.json')
    pipe_path = path.join(base_path, 'pipe')
    data_path = path.join(experiment_path, 'text')
    d = {
        'feature_set': feature_set,
        'num_features': num_features,
        'max_df': 0.3,
        'idf': idf,
        'feature_selection_type': feature_selection_type,
        'min_df': min_df,
        'min_ngram': min_ngram,
        'max_ngram': max_ngram,
        'data_path': data_path,
        'pipe_path': pipe_path,
        'base_path': base_path,
        'feature_pipeline_pickle_name': 'feature_pipeline.pickle',
        'frequency_matrix': 'frequency_matrix' in feature_set,
        'embeddings': 'embeddings' in feature_set,
        'word_count': 'word_count' in feature_set,
        'pos': 'pos' in feature_set,
        'nmf': 'nmf' in feature_set,
        'tf_idf_filename': None
    }

    ut.check_and_create(base_path)
    with open(config_path, 'w') as fp:
        json.dump(d, fp)
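# --- Illustration (added, not part of the original example) ---
# The min_df / max_df / ngram settings stored above match the standard
# scikit-learn CountVectorizer arguments; `__get_count_vectorizer` is not
# shown in these examples, so this construction is only a plausible reading:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1,            # the config default is 3
                             max_df=1.0,          # the config stores 0.3
                             ngram_range=(1, 3))  # (min_ngram, max_ngram)
docs = ['kitchen worktop repair', 'replacement kitchen worktop']
print(vectorizer.fit_transform(docs).shape)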
Example #8
    def __init__(self, experiment_path, data_name):
        base_path = path.join(experiment_path, 'features')
        config_path = path.join(base_path, 'config.json')

        ut.check_and_create(base_path)
        with open(config_path, 'r') as fp:
            self.__config = json.load(fp)

        self.__TFIDF_FILENAME = 'tfidf.pickle'

        self.__pipeline_steps = []

        # to be changed
        # if 'embeddings' in feature_set:
        #     print("Adding embeddings!")
        #     self.__add_embeddings_features_mean(self.__text, 'fasttext_mean_300d')
        #     self.__add_embeddings_features_mean(self.__text, 'glove_mean_300d')
        #     self.__add_embeddings_features_mean(self.__text, 'fasttext_mean_300d', idf_dict=self.__idf_dict)
        #     self.__add_embeddings_features_mean(self.__text, 'glove_mean_300d', idf_dict=self.__idf_dict)

        self.__pipe = None
        self.__n_features = 0
        self.__data_name = data_name
Example #9
def configure_pipeline(experiment_path,
                       data_path,
                       spell=True,
                       split_words=True,
                       text_headers=['RECDESC', 'EXPDESC'],
                       stop_words=True,
                       lemmatize=False,
                       stemm=False):
    base_path = path.join(experiment_path, 'text')
    config_path = path.join(base_path, 'config.json')
    pipe_path = path.join(base_path, 'pipe')
    lang_path = path.join(base_path, 'lang')
    d = {
        'spell': spell,
        'split_words': split_words,
        'lemmatize': lemmatize,
        'stemm': stemm,
        'pipe_path': pipe_path,
        'data_path': data_path,
        'base_path': base_path,
        'lang_path': lang_path,
        'text_pipeline_pickle_name': 'text_pipeline.pickle',
        'text_headers': text_headers,
        'stop_words': stop_words
    }
    check_and_create(base_path)
    check_and_create(lang_path)
    with open(config_path, mode='w+') as fp:
        json.dump(d, fp)
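# --- End-to-end sketch (added illustration, not part of the original) ---
# The configure_pipeline variants above each set up one stage (data, text,
# features, ml, deploy) under a shared experiment directory. They all share
# the same function name, so a driver would import each stage's module; the
# module names and argument values below are assumptions, not from the source.
from pipeline import data, text, features, ml, deploy  # hypothetical modules

experiment_path = '/tmp/experiment'           # hypothetical
raw_data_path = '/tmp/experiment/raw.pickle'  # hypothetical

data.configure_pipeline(experiment_path)
text.configure_pipeline(experiment_path, raw_data_path)
features.configure_pipeline(experiment_path, num_features=500)
ml.configure_pipeline(experiment_path, train_test_ratio=0.75)
deploy.configure_pipeline(experiment_path,
                          classifier_name='RandomForestClassifier')  # illustrative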