Example #1
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])
    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(
        pipe.get_params(deep=True),
        dict(
            svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
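
The svc__a and svc__C names above use the pipeline's double-underscore convention: set_params routes <step name>__<parameter> to the parameter of the named step. A minimal standalone sketch of that routing, using stock scikit-learn estimators rather than the test doubles above:

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest

pipe = Pipeline([("anova", SelectKBest()), ("svc", SVC())])
# "svc__C" targets the C parameter of the step named "svc";
# "anova__k" targets the k parameter of the step named "anova".
pipe.set_params(svc__C=10, anova__k=5)
assert pipe.named_steps["svc"].C == 10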
Example #2
def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)
    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)
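
The set_params(transf="passthrough") call works because Pipeline treats the string "passthrough" assigned to a step name as "skip this step". A minimal sketch with ordinary scikit-learn estimators (not the Transf/FitParamT test doubles):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X, y = np.array([[0.0], [1.0], [2.0], [3.0]]), np.array([0, 0, 1, 1])
pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])
pipe.set_params(scale="passthrough")  # the scaler step is now skipped
pipe.fit(X, y)                        # X reaches the classifier unscaled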
Example #3
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([("cls", LinearRegression())])
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake="nope")

    # nested model check
    with raises(ValueError, match="Invalid parameter"):
        pipe.set_params(fake__estimator="nope")
Example #4
def train_imbalance(
    descr_series: Series,
    classes_codes: Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> tuple:
    """Trains models using handled setting and saves them as .sav objects.

    Parameters:
    ----------
    instance:
        Instance of User model.
    descr_series:
        description series.
    classes_codes:
        series with classes' codes.
    TFIDF_:
        vectorizer.
    IMB_:
        SMOTE instance.
    FS_:
        ranking terms method.
    req_percentage:
        percentage to be taken from the ranked list.
    CLF_:
        classifier.
    model_name:
        models name.

    Returns:
    ----------
        Trained model in byte representation associated to its model name.

    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_),
                          ("fs", transformer), ("clf", CLF_)])

    best_params = get_best_params(clf_model, descr_series, classes_codes)
    print(f"{model_name}:{best_params}")

    clf_model.set_params(
        fs__percentile=req_percentage,
        clf__C=best_params["clf__C"],
        clf__gamma=best_params["clf__gamma"],
    ).fit(descr_series, classes_codes)

    return {model_name: clf_model}, {model_name: best_params}
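
A hypothetical call, only to show how the arguments line up; the two Series, the get_best_params helper, and the surrounding imports belong to the project this snippet came from and are assumed here:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# descr_series / classes_codes: text and label Series prepared elsewhere
models, params = train_imbalance(descr_series, classes_codes,
                                 TFIDF_=TfidfVectorizer(), IMB_=SMOTE(),
                                 FS_=chi2, req_percentage=50,
                                 CLF_=SVC(), model_name="priority")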
Example #5
def get_text_transformer(method: str, pca: bool, ngram_range):
    from imblearn.pipeline import Pipeline

    from sklearn.decomposition import TruncatedSVD
    steps = [('vect', CountVectorizer(tokenizer=TextTokenizer().preprocess)),
             ('tfidf', TfidfTransformer())]

    if method != "none":
        steps.append(("os", get_balancing_step(method)))

    if pca:
        steps.append(('pca', TruncatedSVD()))

    pipeline = Pipeline(steps)
    pipeline.set_params(vect__ngram_range=ngram_range, tfidf__use_idf=True)
    return pipeline
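
A hypothetical use of the factory above (TextTokenizer and get_balancing_step come from the surrounding project; with method="none" and pca=False the result is a plain CountVectorizer -> TfidfTransformer pipeline):

text_pipeline = get_text_transformer(method="none", pca=False, ngram_range=(1, 2))
# features = text_pipeline.fit_transform(corpus)  # corpus: an iterable of raw documents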
Example #6
def train_imbalance(
    descr_series: pd.Series,
    classes_codes: pd.Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> dict:
    """ Trains models using handled setting and saves them as .sav objects.

    Parameters:
    ----------
    instance:
        Instance of User model;
    descr_series:
        description series;
    classes_codes:
        series with classes' codes;
    TFIDF_:
        vectorizer;
    IMB_:
        SMOTE instance;
    FS_:
        ranking terms method;
    req_percentage:
        percentage to be taken from the ranked list;
    CLF_:
        classifier;
    model_name:
        models name.

    Returns:
    ----------
        Trained model in byte representation associated to its model name.

    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_),
                          ("fs", transformer), ("clf", CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(
        descr_series, classes_codes)

    return {model_name: clf_model}
Example #7
def training_imbalance(descr_series, classes_codes, TFIDF_, IMB_, FS_,
                       req_percentage, CLF_, model_path):
    """ Trains models using handled setting and saves them as .sav objects.

        Parameters:
            descr_series(Series): description series;
            classes_codes(Series): series with classes' codes;
            TFIDF_: vectorizer;
            IMB_: SMOTE method;
            FS_: ranking terms method;
            req_percentage(int): percentage to be taken from the ranked list;
            CLF_: classifier;
            model_path(str): the path to the model.

    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_),
                          ('fs', transformer), ('clf', CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(
        descr_series, classes_codes)
    dump(clf_model, open(model_path + '.sav', 'wb'))
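
The fitted pipeline is persisted with dump, so it can be restored later and applied to new descriptions; a minimal sketch, assuming dump here is pickle.dump (the import is not shown in the snippet):

import pickle

with open(model_path + '.sav', 'rb') as f:
    clf_model = pickle.load(f)
# The pipeline re-applies tfidf and feature selection internally;
# new_descr_series: a Series of unseen descriptions.
predictions = clf_model.predict(new_descr_series)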
Example #8
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([('mock', transf1)])
    assert pipeline.named_steps['mock'] is transf1

    # Directly setting attr
    pipeline.steps = [('mock2', transf2)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is transf2
    assert [('mock2', transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[('mock', transf1)])
    assert [('mock', transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [('mock', transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[('junk', ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
Example #9
def resampling(X, Y, r):
    # print(sorted(Counter(Y).items()))
    tomek = TomekLinks()
    X_resampled, y_resampled = tomek.fit_resample(X, Y)
    #print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled


# pipeline
pipeline = Pipeline([
    ('und', RandomUnderSampler()),
    #('power', preprocessing.PowerTransformer()),
    ('standardize', preprocessing.StandardScaler()),
    ('normalizer', preprocessing.Normalizer()),
    ('lda', LinearDiscriminantAnalysis()),
    #('logistic', sk.linear_model.SGDClassifier(loss="hinge", eta0=1, learning_rate="constant", penalty='l2'))
    ('svm', LinearSVC(verbose=0, max_iter=3000, class_weight='balanced')),
])

com_values = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]
for c in com_values:
    pipeline.set_params(svm__C=c, und__random_state=42).fit(X_train, Y_train)
    # clf = CalibratedClassifierCV(base_estimator=pipeline, cv=10).fit(X,Y)
    y_p = pipeline.decision_function(X_dev)
    y_pred = pipeline.predict(X_dev)
    print("With:", c)
    print("Confusion matrix:\n", sk.metrics.confusion_matrix(Y_dev, y_pred))
    one = sk.metrics.recall_score(Y_dev, y_pred, pos_label=0)
    two = sk.metrics.recall_score(Y_dev, y_pred, pos_label=1)
    print("UAR:", (one + two) / 2, "\n")
Example #10
class Model:
    def training_model(self,
                       model_path,
                       data_path,
                       colums,
                       resolution,
                       sw=text.ENGLISH_STOP_WORDS,
                       log=0):
        try:
            self.creater = ConfigCreator()
            smt = SMOTE(ratio='minority',
                        random_state=0,
                        kind='borderline1',
                        n_jobs=4)
            svm_imb = SVC(gamma=2,
                          C=1,
                          probability=True,
                          class_weight='balanced')
            tfidf_imb = StemmedTfidfVectorizer(norm='l2',
                                               sublinear_tf=True,
                                               stop_words=sw,
                                               analyzer='word',
                                               max_df=0.5,
                                               max_features=500)
            anova = feature_selection.f_classif
            chi2 = feature_selection.chi2
            if not os.path.exists(model_path):
                os.makedirs(model_path)
            self.data = pandas.read_pickle(data_path)
            binary_logger = BynaryTrainLogger()
            multiple_logger = MultipleTrainLogger()
            filter = Filter()
            self.creater.create_config(
                'single_mod.ini', section_name='single_mod.ini'.split('.')[0])

            # models training
            self.creater.update_setting('single_mod.ini',
                                        'single_mod.ini'.split('.')[0],
                                        'binary_col_class',
                                        ','.join(['0', '1']))

            # areas
            # to exclude columns with quite low percent of 1
            clear_columns = [
                column for column in colums
                if self.data[column][self.data[column] == 1].size /
                self.data[column].size > 0.005
            ]
            self.creater.update_setting('single_mod.ini',
                                        'single_mod.ini'.split('.')[0],
                                        'columns', ','.join(clear_columns))
            for col in clear_columns:
                Model.training_imbalance_kf(self, self.data['Description_tr'],
                                            self.data[col], tfidf_imb, smt,
                                            chi2, 50, svm_imb,
                                            secure_filename(col), model_path)
                if log == 1:
                    binary_logger.log('{}.log'.format(session.sid),
                                      self.data['Description_tr'],
                                      self.data[col], col, model_path)

            # priority
            # to exclude rows with quite low percent of any categorical value
            self.valid_set = [
                el for el in self.data['Priority'].unique().tolist()
                if self.data['Priority'][self.data['Priority'] == el].size /
                self.data['Priority'].size > 0.005
            ]
            self.data_after_filter = self.data[self.data['Priority'].isin(
                self.valid_set)]
            if self.data.shape[0] > self.data_after_filter.shape[0]:
                self.data_after_filter = filter.reindex_data(
                    self.data_after_filter)
            self.data_after_filter['Priority_ord'] = self.data_after_filter[
                'Priority'].astype("category")
            self.data_after_filter['Priority_ord_codes'] = pandas.Categorical(
                self.data_after_filter['Priority_ord']).codes
            # train
            Model.training_imbalance_kf(
                self, self.data_after_filter['Description_tr'],
                self.data_after_filter['Priority_ord_codes'], tfidf_imb, smt,
                chi2, 50, svm_imb, 'priority', model_path)
            self.creater.update_setting(
                'single_mod.ini', 'single_mod.ini'.split('.')[0],
                'prior_col_class',
                ','.join(self.data_after_filter['Priority'].unique().tolist()))
            # log
            if log == 1:
                # print('priority')
                # start = time.clock()
                multiple_logger.log(
                    '{}.log'.format(session.sid),
                    self.data_after_filter['Description_tr'],
                    self.data_after_filter['Priority_ord_codes'], 'priority',
                    model_path)
                # print(time.clock()-start)

            bins = 4
            self.ldis = [i for i in range(1, bins + 1)]

            # ttr
            # to exclude rows with quite low percent of any categorical value
            self.data['temp_ttr_class'] = pandas.qcut(self.data['ttr_tr'],
                                                      bins,
                                                      labels=self.ldis,
                                                      duplicates='drop')
            self.valid_set = [
                el for el in self.data['temp_ttr_class'].unique().tolist()
                if self.data['temp_ttr_class'][self.data['temp_ttr_class'] ==
                                               el].size /
                self.data['temp_ttr_class'].size > 0.005
            ]
            self.data_after_filter = self.data[
                self.data['temp_ttr_class'].isin(self.valid_set)]
            if self.data.shape[0] > self.data_after_filter.shape[0]:
                self.data_after_filter = filter.reindex_data(
                    self.data_after_filter)
            # train
            Model.training_imbalance_kf(
                self, self.data_after_filter['Description_tr'],
                self.data_after_filter['temp_ttr_class'], tfidf_imb, smt, chi2,
                50, svm_imb, 'ttr', model_path)
            self.ttr_col_classTemp = pandas.qcut(
                self.data_after_filter['ttr_tr'], 4,
                duplicates='drop').unique()
            self.creater.update_setting(
                'single_mod.ini', 'single_mod.ini'.split('.')[0],
                'ttr_col_class', ','.join([
                    str(Model.ifZero(self, self.ttr_col_classTemp[el].left)) +
                    '-' +
                    str(Model.ifZero(self, self.ttr_col_classTemp[el].right))
                    for el in range(3)
                ] + [
                    '>' + str(
                        Model.ifZero(
                            self, self.ttr_col_classTemp[range(3)[-1]].right))
                ]))
            # log
            if log == 1:
                multiple_logger.log('{}.log'.format(session.sid),
                                    self.data_after_filter['Description_tr'],
                                    self.data_after_filter['temp_ttr_class'],
                                    'ttr', model_path)

            # resolution
            # resolution may have values which can't be correctly processed by the system, like: "Won't Fix"
            # therefore we have to convert them to model name via secure_filename() function
            self.bin_data = pandas.get_dummies(self.data,
                                               prefix=list(resolution.keys()),
                                               columns=list(resolution.keys()))
            # to exclude columns with quite low percent of 1
            clear_columns = {}
            for key in resolution:
                if isinstance(resolution[key], list):
                    resolutions = []
                    for rez in resolution[key]:
                        if self.bin_data[key + '_' + rez][self.bin_data[
                                key + '_' + rez] == 1].size / self.bin_data[
                                    key + '_' + rez].size > 0.005:
                            resolutions.append(rez)
                    if len(resolutions) == 1:
                        clear_columns[key] = resolutions[0]
                    if len(resolutions) > 1:
                        clear_columns[key] = resolutions
                else:
                    if self.bin_data[key + '_' + resolution[key]][
                            self.bin_data[key + '_' + resolution[key]] ==
                            1].size / self.bin_data[
                                key + '_' + resolution[key]].size > 0.005:
                        clear_columns[key] = resolution[key]

            for key in clear_columns:
                if isinstance(clear_columns[key], list):
                    for res in clear_columns[key]:
                        Model.training_imbalance_kf(
                            self, self.bin_data['Description_tr'],
                            self.bin_data[key + '_' + res],
                            tfidf_imb, smt, chi2, 50, svm_imb,
                            secure_filename(res), model_path)
                        self.creater.update_setting(
                            'single_mod.ini', 'single_mod.ini'.split('.')[0],
                            res + '_col_class', ','.join(['not ' + res, res]))
                        if log == 1:
                            binary_logger.log('{}.log'.format(session.sid),
                                              self.bin_data['Description_tr'],
                                              self.bin_data[key + '_' + res],
                                              secure_filename(res), model_path)
                else:
                    Model.training_imbalance_kf(
                        self, self.bin_data['Description_tr'],
                        self.bin_data[key + '_' + clear_columns[key]],
                        tfidf_imb, smt, chi2, 50, svm_imb,
                        secure_filename(clear_columns[key]), model_path)
                    self.creater.update_setting(
                        'single_mod.ini', 'single_mod.ini'.split('.')[0],
                        clear_columns[key] + '_col_class', ','.join(
                            ['not ' + clear_columns[key], clear_columns[key]]))
                    if log == 1:
                        binary_logger.log(
                            '{}.log'.format(session.sid),
                            self.bin_data['Description_tr'],
                            self.bin_data[key + '_' + clear_columns[key]],
                            secure_filename(clear_columns[key]), model_path)

            self.data.to_pickle(data_path)
        except FileNotFoundError:
            raise

    def training_imbalance_kf(self, X_, Y_, TFIDF_, IMB_, FS_, pers_, CLF_,
                              name_, model_path):
        self.transform = feature_selection.SelectPercentile(FS_)
        self.clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_),
                                   ('fs', self.transform), ('clf', CLF_)])
        kf = KFold(n_splits=10)
        kf.get_n_splits(X_)
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_,Y_,train_size=.8, stratify=Y_)
        # NOTE: only the split from the last KFold iteration is kept
        for train_index, test_index in kf.split(X_):
            self.X_train, self.X_test = X_[train_index], X_[test_index]
            self.y_train, self.y_test = Y_[train_index], Y_[test_index]
        self.clf_model.set_params(fs__percentile=pers_).fit(
            self.X_train, self.y_train)
        pickle.dump(self.clf_model, open(model_path + name_ + '.sav', 'wb'))
        #y_pred = clf_model.predict(X_test)
        #print(classification_report(y_test, y_pred))

    def ifZero(self, val):
        if val < 0:
            return 0
        else:
            return val

    def proc_text(self, text, col_class, name_model, model_path):
        self.test_pro = text
        try:
            with open('regularExpression.csv') as csv_data:
                for i in [
                        re.compile(el1) for el in csv.reader(
                            csv_data, delimiter=',', quotechar='"')
                        for el1 in el if el1
                ]:
                    self.test_pro = re.sub(i, ' ', self.test_pro)
            self.proba = {}
            sys.path.append("..")
            self.load_model_test = pickle.load(
                open(model_path + name_model + '.sav', 'rb'))
            self.proba_ = list(
                numpy.array(numpy.around(
                    self.load_model_test.predict_proba([self.test_pro])[0], 3),
                            dtype=float).flatten())
            self.proba_dic = dict(zip(col_class, self.proba_))
            return self.proba_dic
        except Exception:
            raise

    def create_top_terms_file(self, frame, resolution):
        checker = Checker()

        chi2 = feature_selection.chi2
        SW = text.ENGLISH_STOP_WORDS

        config_reader = SettingProvider('single_mod.ini')

        resol_all = []
        for el in checker.get_resolutions(resolution):
            resol_all += config_reader.get_setting(
                section='single_mod',
                setting="{el}_col_class".format(el=el),
                evaluate=False).split(',')

        resol = [el for el in resol_all if 'not' not in el]

        prio = config_reader.get_setting(section='single_mod',
                                         setting='prior_col_class',
                                         evaluate=False).split(',')

        areas = config_reader.get_setting(section='single_mod',
                                          setting='columns',
                                          evaluate=False).split(',')

        all_terms = prio + resol + areas
        all_mass = []
        bin_data = pandas.get_dummies(
            frame,
            prefix=list(resolution.keys()) + ['Priority'],
            columns=list(resolution.keys()) + ['Priority'])
        with open('top_terms.csv', 'w', newline='\n') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(all_terms)
            for el in all_terms:
                if el in prio:
                    prior = self.top_terms(bin_data, 'Priority_' + el, chi2,
                                           SW)
                    all_mass.append(prior)

                if el in resol:
                    key = None
                    for key1 in resolution:
                        if isinstance(resolution[key1], list):
                            if el in resolution[key1]:
                                key = key1
                        else:
                            if el == resolution[key1]:
                                key = key1
                    resol_terms = self.top_terms(bin_data, key + '_' + el,
                                                 chi2, SW)
                    all_mass.append(resol_terms)

                if el in areas:
                    area = self.top_terms(bin_data, el, chi2, SW)
                    all_mass.append(area)

            rows = zip_longest(*all_mass)
            for row in rows:
                csvwriter.writerow(row)

    def top_terms(self, data, field, func, SW):
        tfidf = StemmedTfidfVectorizer(norm='l2',
                                       sublinear_tf=True,
                                       min_df=1,
                                       stop_words=SW,
                                       analyzer='word',
                                       max_features=1000)
        # bidata = pandas.get_dummies(data, prefix=field+'_', columns=field)
        multithreaded = Multithreaded()
        clear_data = ClearData()
        parall_data = multithreaded.parallelize(data['Description_tr'],
                                                clear_data.clean_descr)
        tfs = tfidf.fit_transform(parall_data)
        y = data[field]
        selector = SelectKBest(score_func=func, k='all')
        selector.fit_transform(tfs, y)
        X_new = dict(zip(tfidf.get_feature_names(), selector.scores_))
        temp_dict = sorted(X_new.items(), key=lambda x: x[1], reverse=True)
        rez = []
        mean = []
        for el in temp_dict[:]:
            if el[1] > 1:
                rez.append(el)
                mean.append(el[1])
        import numpy
        return [el[0] for el in rez if el[1] > numpy.mean(mean)]
Example #11
def get_feature_pipeline(params: SingleBaseParams):
    from imblearn.pipeline import Pipeline
    from sklearn.decomposition import TruncatedSVD

    feature_name = params.get_feature_name()

    if feature_name == publication_year_key:
        from features.text_processor import PublicationYearTransformer
        steps = [('t', PublicationYearTransformer())]
    else:
        steps = [
            ('vect', CountVectorizer(tokenizer=TextTokenizer().preprocess)),
            ('tfidf', TfidfTransformer()),
        ]
    if params.method != "none":
        steps.append(("os", get_balancing_step(params.method)))

    if params.pca:
        steps.append(('pca', TruncatedSVD()))

    from classifier.models import get
    parameters = {}
    if params.classifier.startswith('SVC'):
        parameters['probability'] = True
    if params.get_balanced():
        parameters['class_weight'] = 'balanced'
    steps.append(('clf', get(params.classifier, params=parameters)))

    pipeline = Pipeline(steps)
    if feature_name == publication_year_key:
        pass
    elif feature_name in [publication_type_key, mesh_headings_key]:
        pipeline.set_params(vect__ngram_range=(1, 1))
        pipeline.set_params(tfidf__use_idf=True)
    elif feature_name in [
            title_key, title_most_replaced_key,
            title_disease_or_syndrome_replaced_key, journal_title_key
    ]:
        pipeline.set_params(vect__ngram_range=(1, 2))
        pipeline.set_params(tfidf__use_idf=True)
    else:
        pipeline.set_params(vect__ngram_range=(1, 4))
        pipeline.set_params(tfidf__use_idf=True)

    return pipeline
Example #12
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects without fit
    # method
    error_regex = ("Last step of Pipeline should implement fit or be the "
                   "string 'passthrough'")
    with raises(TypeError, match=error_regex):
        Pipeline([("clf", NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    expected = dict(svc__a=None,
                    svc__b=None,
                    svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma="scale")
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = "implement fit and transform or fit_resample"
    with raises(TypeError, match=error_regex):
        Pipeline([("t", NoTrans()), ("svc", clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"]

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2
Example #13
    def optimize(self, samp, clf, tag):
        """
        Workflow for a single experiment run.
        """
        # Print the header info
        os.system('cls||clear')
        self.tag = tag
        tqdm.write("""{div}\n    Stack Expert Model Optimization
            {indent}--Seed: {seed}
            {indent}--Sampling Method: {samp}
            {indent}--Classification Method: {clf}
            {indent}--Tag: {tag}\n{div}
            """.format(
            **{
                'div': '*' * 50,
                'seed': self.seed,
                'indent': '\b' * 6,
                'tag': self.tag,
                'samp': type(samp).__name__,
                'clf': type(clf).__name__
            }))
        # Load the data
        data, target, ratio = load_data(self.tag)
        tqdm.write(
            "[INFO] Data loads complete. Expert ratio:{:.2f}%\t{:s}".format(
                100 * ratio, cur()))
        # Set the random seed
        self.seed = int(time())
        self.validator.random_state = self.seed
        self._check(type(clf).__name__)
        samp.set_params(**{"random_state": self.seed})
        if 'random_state' in clf.get_params().keys():
            clf.set_params(**{"random_state": self.seed})
        # Build a pipeline of the oversampler and the classifier
        pipeline = Pipeline([(type(samp).__name__, samp),
                             (type(clf).__name__, clf)])
        tqdm.write(
            "[INFO] Model load completed. Start grid search...\t{:s}".format(
                cur()))
        # Run the grid search
        for ind, (key, value) in enumerate(
                tqdm(self._get_grid(pipeline, ratio).items())):
            tqdm.write('-' * 15 + 'Epoch {:d}'.format(ind) + '-' * 15)
            # Set the default parameters
            pipeline.set_params(**self._get_params(pipeline, ratio))
            # Create the grid-search object
            grid_opti = GridSearchCV(estimator=pipeline,
                                     param_grid={key: value},
                                     cv=self.validator,
                                     **self.params['GridSearchCV'])
            tqdm.write("[EP{:d}] Search Parameter: {:}\t{:s}".format(
                ind, key, cur()))
            tqdm.write("[EP{:d}] Search Grid: {:}\t{:s}".format(
                ind,
                str(value) + " Fitting...", cur()))
            # Fit the model
            grid_opti.fit(data.to_numpy(), target)
            # Report the best parameters and the corresponding metrics
            df_res = pd.DataFrame(grid_opti.cv_results_)
            df_res = df_res.loc[df_res['mean_test_{:s}'.format(
                self.scoring)].idxmax()]
            tqdm.write(
                "[EP{:d}] Fit complete. Current Score: {:}\t{:s}".format(
                    ind, df_res['mean_test_{:s}'.format(self.scoring)], cur()))
            tqdm.write("\r[EP{:d}] Best: {:}\t{:s}".format(
                ind, df_res['params'], cur()))
            if '{}__sampling_strategy'.format(
                    type(samp).__name__) in df_res['params']:
                df_res['params']['{}__sampling_strategy'.format(
                    type(samp).__name__)] /= ratio
            # Update the parameters
            self._set_params(df_res['params'])
            # Store the experiment results
            self._rec(
                type(clf).__name__,
                type(samp).__name__, df_res.filter(regex=r'^mean_test',
                                                   axis=0))
        # Grid search finished; one experiment is complete
        tqdm.write("{:s}\n[INFO] Grid search complete.{:}\t{:s}\n".format(
            '=' * 50, "", cur()))
        del data, target
Example #14
    def train_models(self, X_train, y_train, tuning='hyperopt'):
        """Hyperparameter tuning.
        Iterate over each model and find best parameter combination using 'tuning' method and cross validation
        Also finally fits a voting classifier with all the optimized models
        """

        valid_tuning = ['random', 'grid', 'hyperopt']
        if tuning not in valid_tuning:
            raise ValueError(
                'train_models: tuning must be one of {}'.format(valid_tuning))

        start = datetime.now()

        self.best_estimators = {}
        self.best_f1_scores = {}
        self.scores_all = []
        for model, estimator in self.models_selector(tuning):
            self.best = 0
            model_pipeline = Pipeline([('imputer', self.imputer),
                                       ('scaling', self.scaler),
                                       ('rus', self.rus),
                                       (model, estimator['est'])])

            if tuning == 'random':
                search = RandomizedSearchCV(
                    model_pipeline,
                    param_distributions=estimator['params'],
                    scoring='f1',
                    n_iter=100,
                    cv=10,
                    verbose=False,
                    n_jobs=-1,
                    iid=False)
                self.models_fit(model, search, X_train, y_train, tuning)

            elif tuning == 'grid':
                search = GridSearchCV(model_pipeline,
                                      param_grid=estimator['params'],
                                      scoring='f1',
                                      cv=10,
                                      verbose=True,
                                      n_jobs=-1)
                self.models_fit(model, search, X_train, y_train, tuning)

            elif tuning == 'hyperopt':
                hyperopt_objective = partial(self.raw_hyperopt_objective,
                                             model_pipeline, X_train, y_train)

                trials = Trials()
                best_params = fmin(fn=hyperopt_objective,
                                   space=estimator['params'],
                                   algo=tpe.suggest,
                                   max_evals=20,
                                   trials=trials)
                best_params_actual = space_eval(estimator['params'],
                                                trials.argmin)

                model_pipeline.set_params(**best_params_actual)
                self.models_fit(model, model_pipeline, X_train, y_train,
                                tuning, trials)

        # Training a Voting classifier with all above estimators
        self.vot_model = VotingClassifier(estimators=list(
            self.best_estimators.items()),
                                          voting='hard')
        scores = cross_val_score(self.vot_model,
                                 X_train,
                                 y_train,
                                 scoring='f1',
                                 cv=10)
        self.vot_model.fit(X_train, y_train)
        self.best_estimators['voting'] = self.vot_model
        print('Voting Classifier F1 score mean: {:.4f}, stddev: {:.4f}'.format(
            scores.mean(), scores.std()))

        end = datetime.now()
        print('Train time: {}'.format(end - start))
Example #15
def manual_partial_model_selection(main_path, dataset_type, dataset_sub_type,
                                   sampling, sampling_timing, fs_step_name,
                                   classifier_step_name, chromosome):
    print("##### Experiment Info #####")
    print("Chromosome:", chromosome)
    print("Dataset type:", dataset_type)
    print("Dataset subtype:", dataset_sub_type)
    print("Sampling:", sampling)
    print("Sampling timing:", sampling_timing)
    print("Filter FS:", fs_step_name)
    print("Classifier:", classifier_step_name)
    print()

    print("Loading variable names...")
    print()
    with open(
            main_path + dataset_type + '/' + dataset_sub_type + '/' +
            chromosome + '/train_train.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            variable_names = np.array(list(row))
            break

    variable_names = variable_names[1:]

    print("Variable names size:", len(variable_names))
    print()

    sampling_seeds = [123, 456, 789]

    print("Loading training data...")
    print()

    train_data = load_dataset(main_path + dataset_type + '/' +
                              dataset_sub_type + '/' + chromosome +
                              '/train_train.csv')

    X_train = train_data[:, 1:]
    print("X_train shape:", X_train.shape)
    print()

    y_train = train_data[:, 0]
    print("y_train shape:", y_train.shape)
    print()

    experiment_results = dict()

    print("Creating pipeline...")
    print()
    pipe = Pipeline([("imputer", Imputer(missing_values=-1)),
                     ("variance", VarianceThreshold()),
                     ("scaler", StandardScaler())])

    if fs_step_name == "anova":
        filter = SelectPercentile(f_classif, percentile=2)

    if sampling_timing == "sampling_before_fs":
        if sampling == "down_sample":
            pipe.steps.append(
                (sampling, RandomUnderSampler(random_state=sampling_seeds[0])))
        elif sampling == "up_sample":
            pipe.steps.append(
                (sampling, RandomOverSampler(random_state=sampling_seeds[1])))
        elif sampling == "smote_sample":
            pipe.steps.append(
                (sampling, SMOTE(n_jobs=-1, random_state=sampling_seeds[2])))

        pipe.steps.append((fs_step_name, filter))
    elif sampling_timing == "sampling_after_fs":
        pipe.steps.append((fs_step_name, filter))

        if sampling == "down_sample":
            pipe.steps.append(
                (sampling, RandomUnderSampler(random_state=sampling_seeds[0])))
        elif sampling == "up_sample":
            pipe.steps.append(
                (sampling, RandomOverSampler(random_state=sampling_seeds[1])))
        elif sampling == "smote_sample":
            pipe.steps.append(
                (sampling, SMOTE(n_jobs=-1, random_state=sampling_seeds[2])))

    classifier = SVC(kernel='linear',
                     random_state=123456,
                     probability=True,
                     class_weight='balanced')

    pipe.steps.append((classifier_step_name, classifier))

    print("Performing manual gridsearch...")
    print()

    C_OPTIONS = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    cv = StratifiedKFold(n_splits=5, random_state=123456)
    f1_cv = []
    mean_test_score = []
    std_test_score = []

    for C in C_OPTIONS:
        pipe.set_params(linear_svm__C=C)

        for train_indexes, validation_indexes in cv.split(X_train, y_train):
            pipe.fit(X_train[train_indexes, :], y_train[train_indexes])

            y_pred = pipe.predict(X_train[validation_indexes, :])

            f1 = f1_score(y_train[validation_indexes],
                          y_pred,
                          average='weighted')
            f1_cv.append(f1)

        mean_test_score.append(np.mean(f1_cv))
        std_test_score.append(np.std(f1_cv))

    cv_results = dict()
    cv_results['mean_test_score'] = mean_test_score
    cv_results['std_test_score'] = std_test_score
    cv_results['params'] = C_OPTIONS

    experiment_results['cv_results'] = cv_results

    print("Manual gridsearch results:")
    print()
    print(cv_results['mean_test_score'])
    print()
    print(cv_results['std_test_score'])
    print()

    print("Best parameters set found on development set:")
    print()
    print(C_OPTIONS[np.argmax(mean_test_score)])
    print()
Example #16
params_space = {
    'undersampler__n_neighbors':
    quniform_int('n_neighbors', 2, 10, 1),
    'xgb__max_depth':
    quniform_int('max_depth', 10, 30, 1),
    'xgb__min_child_weight':
    hp.quniform('min_child_weight', 1, 20, 1),
    'xgb__subsample':
    hp.uniform('subsample', 0.8, 1),
    'xgb__n_estimators':
    quniform_int('n_estimators', 1000, 10000, 50),
    'xgb__learning_rate':
    hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
    'xgb__gamma':
    hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
    'xgb__colsample_bytree':
    hp.quniform('colsample_bytree', 0.5, 1, 0.05)
}

# set_params takes keyword arguments, so the dict must be unpacked with **
model.set_params(**{
    'xgb__colsample_bytree': 0.55,
    'xgb__n_estimators': 1000,
    'xgb__subsample': 0.81758885827,
    'xgb__min_child_weight': 2.0,
    'xgb__learning_rate': 0.0091861014503,
    'xgb__gamma': 1.19618674618,
    'undersampler__n_neighbors': 7,
    'xgb__max_depth': 21
})
Example #17
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises_regex(
        TypeError, 'Last step of Pipeline should implement fit. '
        '.*NoFit.*', Pipeline, [('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None,
                    svc__b=None,
                    svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(TypeError, 'implement fit and transform or sample',
                        Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
Example #18
filts = [
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=25, random_state=random_state)),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=10, random_state=random_state)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=random_state)),
    ('LogisticRegression', LogisticRegression(solver='lbfgs', random_state=random_state, multi_class='auto', max_iter=750)),
    ('MLPClassifier', MLPClassifier(random_state=random_state, max_iter=2000))
]


# set up model
znorm = StandardScaler()
ownmethod = MBKMeansFilter_reversed()
rfc = RandomForestClassifier(n_estimators=500, random_state=random_state)

clf = Pipeline([('ZNorm', znorm), ('OwnMethod2', ownmethod), ('RFC', rfc)])
clf.set_params(**params)

## model
clf.fit(X, y, **{'OwnMethod2__filters': filts})
pickle.dump(clf, open(MODELS_PATH+'near_final_clf_.pkl','wb'))

## model2
znorm = StandardScaler()
rfc = RandomForestClassifier(n_estimators=250, random_state=random_state, n_jobs=-1)
clf = Pipeline([('ZNorm', znorm), ('RFC', rfc)])
clf.fit(X, y)
pickle.dump(clf, open(MODELS_PATH+'final_RFC.pkl','wb'))

# ---------------------------------------------------------------------------- #
# Cross Spatial Validation
# ---------------------------------------------------------------------------- #