Пример #1
0
def weka_arff_ttest(model='nc'):
    """Prepare data and classifiers and pass them to ``classification.weka_ttest``.

    The configuration is loaded through ``utility.load_config`` and its
    ``'db'`` section is used to open a ``DBConnector``.

    Args:
        model: ``'nc'`` selects the news-category setup (training set from
            ``db.find_trainingset``, 9 classes, labels taken from the
            ``'tag'`` column).  Any other value selects the two-class
            likability setup (data from ``db.find_likabilityset``, labels
            computed with ``classification.labelize_likability``).

    Returns:
        None.
    """
    # importing config
    print("\nimporting config file...")
    config = utility.load_config()

    print("\npreparing the components...\n")
    db = DBConnector(**config['db'])

    # Compare by value: an identity check (`is`) against a string literal
    # only succeeds when both strings happen to be the same interned object.
    if model == 'nc':
        n_class = 9
        dataset = shuffle(pd.DataFrame(db.find_trainingset()), random_state=42)
        print("\ntokenizing...\n")
        ds = preprocessing.tokenize_list(dataset)
        labels = dataset['tag'].to_numpy()

        txt_labels = classification.news_categories
    else:
        n_class = 2
        dataset = shuffle(pd.DataFrame(db.find_likabilityset()),
                          random_state=42)

        print("\ntokenizing...\n")
        # The likability setup feeds both the tokenized content and the tag.
        ds = pd.DataFrame()
        ds['content'] = preprocessing.tokenize_list(dataset)
        ds['tag'] = dataset['tag']

        labels = np.asarray([
            classification.labelize_likability(a)[0]
            for _, a in dataset.iterrows()
        ])
        txt_labels = ['LIKE', 'DISLIKE']

    # preparing classifiers: simple ones first, then the ensemble/meta
    # classifiers built on top of them, all in a single list
    print("\n\nt-testing %s ...\n\n" % (model))
    classifiers = classification.init_simple_classifiers(model)
    classifiers = classifiers + classification.init_ensmeta_classifiers(
        classifiers, model)

    classification.weka_ttest(classifiers,
                              ds,
                              labels,
                              n_class=n_class,
                              txt_labels=txt_labels,
                              model=model)
Пример #2
0
    def build_feed(self, parsed_feed):
        """Store ``parsed_feed`` in ``self.feed``, keeping only entries predicted as 'LIKE'.

        The entries are tokenized, tagged with ``self.nws_clf`` and scored
        with ``self.lik_prd``.  The resulting frame carries the predicted tag
        in a ``'predicted_tag'`` column; the temporary ``'likability'``
        column is dropped before the frame is stored.

        Args:
            parsed_feed: feed entries; must be accepted both by
                ``pd.DataFrame`` and by ``pp.tokenize_list``.
        """
        self.feed = pd.DataFrame(parsed_feed)

        # Input frame for the likability predictor: tokenized text + news tag.
        model_input = pd.DataFrame()
        model_input['content'] = pp.tokenize_list(parsed_feed)
        model_input['tag'] = self.nws_clf.predict(model_input['content'])

        self.feed['likability'] = self.lik_prd.predict(model_input)
        self.feed['predicted_tag'] = model_input['tag']

        # Keep the liked rows only; the helper column is no longer needed.
        liked = self.feed['likability'] == 'LIKE'
        self.feed = self.feed[liked].drop(columns='likability')
Пример #3
0
def deploy_news_classifier(dataset, dir_path='home/miner/model'):
    """Build the bagged-SVC news pipeline, cross-validate it and dump it to disk.

    Args:
        dataset: labelled data with a ``'tag'`` column; it is shuffled with a
            fixed seed and tokenized with ``pp.tokenize_list``.
        dir_path: directory that receives ``nws_clf.pkl``.

    Returns:
        The pipeline returned by ``build_nc_model``.

    NOTE(review): the function never calls ``fit`` itself; whether the dumped
    pipeline is trained depends on ``cross_validate_fullscores`` - confirm.
    """
    # Bagging over an SVC is the configuration chosen for deployment.
    base_svc = classifier['svc'](C=0.51, random_state=42)
    bagging = BaggingClassifier(base_estimator=base_svc,
                                n_estimators=100,
                                random_state=42)
    model = build_nc_model(('clf', bagging))

    shuffled = shuffle(dataset, random_state=42)
    tokens = pp.tokenize_list(shuffled)
    targets = shuffled['tag'].to_numpy()

    cross_validate_fullscores(model, tokens, targets,
                              n_class=9, txt_labels=news_categories)

    joblib.dump(model, dir_path + '/nws_clf.pkl')
    return model
Пример #4
0
def meta_classify_lc(dataset, show_mat=False, tuning=False, plot=False, load_pretokenized=False):
    """Cross-validate simple and ensemble/meta classifiers for the likability task.

    Args:
        dataset: labelled data with a ``'tag'`` column; rows are passed to
            ``labelize_likability`` to obtain the targets.
        show_mat: forwarded to ``cross_validate`` as ``show_mat``.
        tuning: accepted but not used by this function.
        plot: when true, ``plot_learning_curve`` is called after each
            cross-validation.
        load_pretokenized: accepted but not used; the dataset is always
            tokenized from scratch.

    Returns:
        None.
    """
    dataset = shuffle(dataset, random_state=42)

    # inputs: tokenized content plus the news tag
    ds = pd.DataFrame()
    ds['content'] = pp.tokenize_list(dataset)
    ds['tag'] = dataset['tag']

    # targets
    labels = np.asarray([labelize_likability(row)[0] for _, row in dataset.iterrows()])

    def score(pipeline, name):
        # One evaluation step shared by both classifier families.
        cross_validate(pipeline, ds, labels, 2, show_mat=show_mat,
                       txt_labels=['LIKE', 'DISLIKE'], random_state=42)
        if plot:
            plot_learning_curve(pipeline, name, ds, labels, random_state=42)

    rule = '\n---------------------------\n'
    classifiers = init_simple_classifiers('lc')

    print("\n\nSimple Classifiers:\n")
    for entry in classifiers:
        print(rule)
        pipeline = build_lc_model(entry)
        print('\nCrossValidation with 10 folds for ' + entry[0] + '\n')
        score(pipeline, entry[0])
        print(rule)

    print("\n\nEnsembles and Meta-Classifiers:\n")
    # global NumPy seed is fixed before the ensemble/meta classifiers are created
    np.random.seed(42)

    for entry in init_ensmeta_classifiers(classifiers, 'lc'):
        print("CV 10 folds - " + entry[0])
        # pipeline vectorizer-classifier
        pipeline = build_lc_model(entry)
        score(pipeline, entry[0])
Пример #5
0
def deploy_likability_predictor(dataset, dir_path='home/miner/model'):
    """Build the logistic-regression likability pipeline, cross-validate it and dump it.

    Args:
        dataset: labelled data with a ``'tag'`` column; rows are passed to
            ``labelize_likability`` to obtain the targets.
        dir_path: directory that receives ``lik_prd.pkl``.

    Returns:
        The pipeline returned by ``build_lc_model``.

    NOTE(review): the function never calls ``fit`` itself; whether the dumped
    pipeline is trained depends on ``cross_validate_fullscores`` - confirm.
    """
    # preparing the inputs: tokenized content plus the news tag
    ds = pd.DataFrame()
    dataset = shuffle(dataset, random_state=42)
    ds['content'] = pp.tokenize_list(dataset)
    ds['tag'] = dataset['tag']

    # preparing the targets
    labels = np.asarray([labelize_likability(a)[0] for _, a in dataset.iterrows()])

    # building the model...
    clf = classifier['log_reg'](solver='lbfgs', multi_class='auto', random_state=42)
    model = build_lc_model(('clf', clf))

    # Label names match the other likability call sites ('LIKE', 'DISLIKE').
    cross_validate_fullscores(model, ds, labels, n_class=2, txt_labels=['LIKE', 'DISLIKE'])
    joblib.dump(model, dir_path + '/lik_prd.pkl')
    return model
Пример #6
0
def meta_classify_nc(dataset, show_mat=False, tuning=False, plot=False, load_pretokenized=False):
    """Evaluate simple and ensemble/meta classifiers for the news-category task.

    Three passes are run in order: the simple classifiers, the same simple
    classifiers with chi2 feature selection (40th percentile), and the
    ensemble/meta classifiers.

    Args:
        dataset: labelled data with a ``'tag'`` column used as target.
        show_mat: forwarded to ``cross_validate`` as ``show_mat``.
        tuning: when true, the two simple-classifier passes run a
            ``GridSearchCV`` instead of cross-validation.  The ensemble/meta
            pass is always cross-validated.
        plot: when true, ``plot_learning_curve`` is called after each
            cross-validation.
        load_pretokenized: accepted but not used; the dataset is always
            tokenized from scratch.

    Returns:
        None.

    NOTE(review): ``iid`` was removed from ``GridSearchCV`` in scikit-learn
    0.24, so the tuning path presumably targets an older release - confirm
    the pinned version.
    """
    dataset = shuffle(dataset, random_state=42)
    ds = pp.tokenize_list(dataset)

    # targets
    labels = dataset['tag'].to_numpy()
    n_class = len(news_categories)

    classifiers = init_simple_classifiers('nc')

    # Grid-search spaces, keyed by classifier name.
    params = {
        'rf': {
            'clf__n_estimators': [2000],
            'clf__bootstrap': [True],
            'clf__max_depth': [None],
            'clf__max_features': ['sqrt'],
            'clf__min_samples_leaf': [2, 4],
            'clf__min_samples_split': [5, 10],
        },
        'mnb': {},
        'svc': {
            # Vectorizer options (ngram_range, min_df, norm, max_df,
            # max_features) are currently disabled; best max_features
            # noted so far: 14500.
            'clf__C': np.arange(0.001, 1, 0.001),  # best C: 0.51
        },
        'ada': {
            'clf__n_estimators': [2000],
            'clf__algorithm': ['SAMME'],
        },
        'lr': {},
        'dt': {},
    }

    def evaluate(pipeline, name):
        # Cross-validate and optionally draw the learning curve.
        cross_validate(pipeline, ds, labels, n_class, show_mat=show_mat,
                       txt_labels=news_categories, random_state=42)
        if plot:
            plot_learning_curve(pipeline, name, ds, labels, random_state=42)

    def tune(pipeline, name):
        # Accuracy-scored grid search over this classifier's parameter grid.
        print("\nTuning {} Hyper-Parameters with GridSearchCV: \n".format(name))
        grid = params[name]
        folds = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)
        search = GridSearchCV(pipeline, grid, iid=True, scoring='accuracy',
                              cv=folds, verbose=1, n_jobs=-1)
        search.fit(ds, labels)
        print(search.best_score_, search.best_params_)

    rule = '\n---------------------------\n'

    # Both simple-classifier passes share one loop; only the banner and the
    # way the pipeline is built differ.
    simple_passes = (
        ("\n\nSimple Classifiers:\n",
         lambda entry: build_nc_model(entry)),
        ("\n\nSimple Classifiers WITH Independent Features Selection (chi2-40Percentile):\n",
         lambda entry: build_nc_model(
             entry, feature_selection=SelectPercentile(chi2, percentile=40))),
    )

    for banner, make_model in simple_passes:
        print(banner)
        for entry in classifiers:
            print(rule)
            pipeline = make_model(entry)

            if tuning:
                tune(pipeline, entry[0])
            else:
                print('\n CrossValidation with 10 folds for ' + entry[0] + '\n')
                evaluate(pipeline, entry[0])

            print(rule)

    print("\n\nEnsembles and Meta-Classifiers:\n")

    # global NumPy seed is fixed before the ensemble/meta classifiers are created
    np.random.seed(42)

    for entry in init_ensmeta_classifiers(classifiers, 'nc'):
        print("\nCV 10 folds - " + entry[0])
        # pipeline vectorizer-classifier
        pipeline = build_nc_model(entry)
        evaluate(pipeline, entry[0])