Example #1
def partial_dependence(df, y):
    '''
    INPUT: X = features
           y = binary target variable with imbalanced classes
    OUTPUT: X = features oversampled to have balanced target classes
            y = target variable oversampled to have balanced classes

    Discovers the minority class and oversamples it until each class makes up
    50% of the data, then fits a GradientBoostingClassifier and plots partial
    dependence for its six most important features.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
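
# Note: plot_partial_dependence above comes from the long-removed
# sklearn.ensemble.partial_dependence module. On scikit-learn >= 1.0 the same
# plot can be produced via sklearn.inspection (a hedged sketch, reusing a
# fitted model and the held-out frame built above):
def plot_top_partial_dependence(model, X_test, feats, names):
    from sklearn.inspection import PartialDependenceDisplay
    # Plot partial dependence of the selected features on the fitted model.
    return PartialDependenceDisplay.from_estimator(
        model, X_test, feats, feature_names=names,
        n_jobs=3, grid_resolution=50)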
Example #2
    def make_features(self):
        features = Pipeline([
            ('count', self.build_vectorizer()),
            ('tfidf', TfidfTransformer())
        ])

        doc_vecs = features.fit_transform(self.docs)
        rp_vecs = features.fit_transform(self.rps)

        return (doc_vecs, rp_vecs)
Example #3
def create_store_transforms(rl):
    trnsfrm = Pipeline([
        ('vbk', ValueByKey('wrd_list')),
        ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english')),
    ])
    with open('transforms/just_txt.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('vbk', ValueByKey('wrd_list')),
        ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english', tokenizer=brad_tokenizer_test)),
    ])
    with open('transforms/just_txt_chunks.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cuisinetype', Pipeline([
                    ('vbk', ValueByKey('type_2')),
                    ('labels', preprocessing.LabelBinarizer()),
                ])),
                # ('price_lev', Pipeline([
                #     ('vbk', ValueByKey('price_level')),
                #     ('labels2', preprocessing.LabelBinarizer()),
                # ])),
                #
                # ('rating_lev', Pipeline([
                #     ('vbk', ValueByKey('rating_level')),
                #     ('labels3', preprocessing.LabelBinarizer()),
                # ])),
                ('nlp', Pipeline([
                    ('vbk', ValueByKey('wrd_list')),
                    ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english'))
                ]))
            ]
        ))
    ])
    with open('transforms/txt_cat.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cuisinetype', Pipeline([
                    ('vbk', ValueByKey('type_2')),
                    ('labels', preprocessing.LabelBinarizer()),
                ])),
                ('nlp', Pipeline([
                    ('vbk', ValueByKey('wrd_list')),
                    ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english', tokenizer=brad_tokenizer_test))
                ]))
            ]
        ))
    ])
    with open('transforms/txt_cat_chunks.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle)
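
# ValueByKey is not a scikit-learn class; judging by its use above it is a
# project-specific transformer that pulls one field out of every record so the
# downstream vectorizer sees plain values. A minimal sketch of such a helper
# (an assumption, not the original implementation):
from sklearn.base import BaseEstimator, TransformerMixin

class ValueByKey(BaseEstimator, TransformerMixin):
    """Extract the value stored under `key` from each record."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Assumes records are dicts; for a DataFrame this would be X[self.key].
        return [record[self.key] for record in X]

# Caveat worth noting: LabelBinarizer.fit_transform only accepts y, so recent
# scikit-learn versions reject it inside a Pipeline/FeatureUnion as used above;
# OneHotEncoder (or a thin wrapper) is the usual replacement.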
Example #4
def transformCorpus(tdocuments, tentities):
  X1 = None
  #treat the tasks as documents and calculate the tfIdf vector
  '''
  hasher = HashingVectorizer(stop_words='english', non_negative=True,
                             norm=None, binary=False)
  vectorizer = Pipeline((
      ('hasher', hasher),
      ('tf_idf', TfidfTransformer())
  ))
  '''
  '''
  lsa = TruncatedSVD(1000)
  X = lsa.fit_transform(vectorizer.fit_transform(tdocuments))
  X1 = Normalizer(copy=False).fit_transform(X)
  '''
  #X1 = vectorizer.fit_transform(tdocuments)
  #print("n_samples: %d, n_features: %d" % X1.shape)
  #print()
  vec = Pipeline((('dictText', DictVectorizer()),
                  ('tfIdf', TfidfTransformer())))
  X2 = vec.fit_transform(tentities)
  lsa = TruncatedSVD(1000)
  X = lsa.fit_transform(X2)
  X1 = Normalizer(copy=False).fit_transform(X)
  #X2 = Normalizer(copy=False).fit_transform(X)
  print('n_samples: %d, n_features: %d' % X.shape)
  print()

  return X1, X2
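
# DictVectorizer expects one mapping of feature names to values per sample, so
# tentities above is presumably a list of dicts (e.g. entity counts per task).
# A purely illustrative shape (too small to satisfy TruncatedSVD(1000), which
# needs well over 1000 distinct feature names):
example_entities = [
    {'PERSON': 2, 'ORG': 1},
    {'DATE': 3, 'PERSON': 1},
]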
Example #5
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps, "m2": mult2, "m3": None, "last": mult5, "m2__mult": 2, "last__mult": 5},
    )

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba", "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError, "'NoneType' object has no attribute 'predict'", getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
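
# The Mult helper used above is not shown; in scikit-learn's own test suite it
# is a toy estimator that multiplies its input by a constant in every method.
# A sketch along those lines (an approximation, not the exact test fixture):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Mult(BaseEstimator, TransformerMixin):
    def __init__(self, mult=1):
        self.mult = mult

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult

    def inverse_transform(self, X):
        return np.asarray(X) / self.mult

    def predict(self, X):
        # Predict the same way, so fit(X).predict(X) returns the scaled values.
        return (np.asarray(X) * self.mult).ravel()

    predict_proba = predict_log_proba = decision_function = predict

    def score(self, X, y=None):
        return np.sum(X)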
Example #6
class MultinomialNB(Step):

    def __init__(self, percentile_threshold, bins):
        assert bins > 0
        bin_size = 1 / bins
        self.bins = np.arange(bin_size, 1, bin_size)
        self.lower = percentile_threshold
        self.upper = 100 - percentile_threshold
        scaler = MinMaxScaler()
        discretizer = FunctionTransformer(Discretizer(self.bins))
        self.pipeline = Pipeline(
            [('scaler', scaler), ('discretizer', discretizer)])

    def fit(self, vectors):
        self.lower_clip = np.percentile(vectors, self.lower, axis=0)
        self.upper_clip = np.percentile(vectors, self.upper, axis=0)
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        vectors = self.pipeline.fit_transform(vectors)
        n_docs = vectors.shape[0]
        self.distribution = np.array(
            [np.bincount(v, minlength=len(self.bins)) / n_docs
             for v in vectors.T])

    def transform(self, vectors):
        assert self.distribution is not None
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        probabilities = []
        n_dim = vectors.shape[1]
        vectors = self.pipeline.transform(vectors)
        for bins in vectors:
            pr = np.product(self.distribution[np.arange(n_dim), bins])
            probabilities.append(pr)
        return -np.log(np.maximum(1e-10, np.array(probabilities)))
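
# Discretizer is not defined in this snippet; from its use inside
# FunctionTransformer and the later np.bincount call it is presumably a
# callable that maps each min-max-scaled value to an integer bin index.
# A plausible sketch of that missing helper (an assumption):
import numpy as np

class Discretizer:
    def __init__(self, bins):
        self.bins = bins

    def __call__(self, X):
        # np.digitize returns the index of the bin each value falls into.
        return np.digitize(X, self.bins)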
Example #7
def main():
  with open('recipes.json') as f:
    recipes = json.load(f)
    for recipe in recipes:
      ingredients.append(recipe['ingredients'])
      items.append(recipe['name'])

  pca = Pipeline([
    ('vect', DictVectorizer(sparse=False)),
    ('pca', PCA(n_components=2))
  ])
  X_pca = pca.fit_transform(ingredients)
  labels = Pipeline([
    ('vect', DictVectorizer(sparse=False)),
    ('pca', PCA(n_components=2)),
    ('agglom', AgglomerativeWrapper(AgglomerativeClustering(n_clusters=6, linkage='ward')))
  ])

  labels.fit(ingredients)
  clusters = labels.named_steps['agglom'].labels_
  print(clusters)
  plt.figure()
  for row, item in enumerate(items):
    plt.scatter(X_pca[row, 0], X_pca[row, 1], s=100, c='rgbcyk'[clusters[row]])
    plt.annotate("{}".format(item),
                 xy=(X_pca[row, 0], X_pca[row, 1]),
                 textcoords='offset points',
                 xytext=(10,10),
                 size=10,
                 arrowprops=dict(arrowstyle="->",
                                 facecolor='white'))
  plt.show()
Example #8
def load_data_template(argv):
    # Train set
    data = np.load("data/train.npz")
    y_train = data["y_train"]
    X_train = data["X_train"]

    fu = FeatureUnion([
        #('spec', FlattenTransformer(scale=1.0)),
        ('st1', StatsTransformer(axis=1)),
        #('st0', StatsTransformer(axis=0))
    ])

    tf = Pipeline(steps=[('specg', SpectrogramTransformer(NFFT=256, clip=500,
                                                          noverlap=0.5,
                                                          dtype=np.float32,
                                                          log=False, flatten=False)),
                         ('tm', TemplateMatcher(raw=True)),
                         #('flatten', FlattenTransformer()),
                         ('fu', fu),
                     ])

    X_train = tf.fit_transform(X_train, y_train)


    # Test set
    data = np.load("data/test.npz")
    y_test = None
    X_test = data['X_test']
    X_test = tf.transform(X_test)

    return X_train, X_test, y_train, y_test
Example #9
class BallTreeRecommender(object):
    """
    Given input terms, provide k recipe recommendations
    """
    def __init__(self, k=3, **kwargs):
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk,
        if they exist.
        """
        if os.path.exists(self.trans_path):
            self.transformer = joblib.load(open(self.trans_path, 'rb'))
            self.tree = joblib.load(open(self.tree_path, 'rb'))
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!
        """
        joblib.dump(self.transformer, open(self.trans_path, 'wb'))
        joblib.dump(self.tree, open(self.tree_path, 'wb'))

    def fit_transform(self, documents):
        # Transformer will be False if pipeline hasn't been fit yet,
        # Trigger fit_transform and save the transformer and lexicon.
        if self.transformer == False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ])
                 )
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given input list of ingredient terms,
        return the k closest matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        vect_doc = self.transformer.named_steps['transform'].fit_transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]
Example #10
    def sgdfeature(self,data):

        newdata = pd.DataFrame()

        preproc = Pipeline([('fh',FeatureHasher( n_features=2**20,input_type='string'))])
        ##for SGDClassifier
        newdata['app_id_specs'] = data['app_id'].values+data['app_domain'].values+data['app_category'].values
        newdata['app_dom_specs'] = data['app_domain'].values+data['app_category'].values
        newdata['site_id_specs'] = data['site_id'].values+data['site_domain'].values+data['site_category'].values
        newdata['site_dom_specs'] = data['site_domain'].values+data['site_category'].values
        # data['device'] = data['device_model'].values+(data['device_type'].values.astype(str))+(data['device_conn_type'].values.astype(str))
        newdata['type'] = data['device_type'].values +data['device_conn_type'].values
        newdata['domain'] = data['app_domain'].values +data['site_domain'].values
        newdata['category'] = data['app_category'].values+data['site_category'].values
        newdata['pos_cat'] =  data['banner_pos'].values.astype(str)+data['app_category'].values+data['site_category'].values
        newdata['pos_dom'] =  data['banner_pos'].values.astype(str)+data['app_domain'].values+data['site_domain'].values
        # data['pos_id'] =  data['banner_pos'].values.astype(str)+data['app_id'].values+data['site_id'].values

        newdata['hour'] = data['hour'].map(lambda x: datetime.strptime(x.astype(str),"%y%m%d%H"))
        newdata['dayoftheweek'] = newdata['hour'].map(lambda x: x.weekday())
        newdata['day'] = newdata['hour'].map(lambda x:  x.day)
        newdata['hour'] = newdata['hour'].map(lambda x:  x.hour)
        newdata = newdata.drop('hour',axis=1)
        newdata = newdata.astype(str)
        del data
        X_dict = np.asarray(newdata)

        self.X_train = preproc.fit_transform(X_dict)

        return self.X_train
Example #11
def main(opt):
    with codecs.open(opt.vocab, encoding='utf-8') as f:
        vocab = load_vocab(f)
    id2word = build_id2word(vocab)
    _, docs_train, _ = load_all_data(opt.train_jsons)
    lda = Pipeline([
        ('bow', BagOfWords(vocab=vocab)),
        ('lda', Lda(id2word=id2word, num_topics=opt.num_topics))])
    lda_vec_train = lda.fit_transform(docs_train)

    sent_set = set()
    tmp_path = opt.lda_vec_path + '.tmp'
    with codecs.open(tmp_path, encoding='utf-8', mode='w') as f:
        dump_lda_vec(docs_train, lda_vec_train, sent_set, f)

    if opt.test_jsons:
        _, docs_test, _ = load_all_data(opt.test_jsons)
        lda_vec_test = lda.transform(docs_test)
        with codecs.open(tmp_path, encoding='utf-8', mode='a') as f:
            dump_lda_vec(docs_test, lda_vec_test, sent_set, f)

    with codecs.open(tmp_path, encoding='utf-8') as fin, \
            codecs.open(opt.lda_vec_path, encoding='utf-8', mode='w') as fout:
        fout.write('{} {}\n'.format(len(sent_set), opt.num_topics))
        for line in fin:
            fout.write(line)

    os.remove(tmp_path)
Example #12
def XY9():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['FinelineNumber']
    keep_cols = ['Weekday', 'Returns']
    mul_col = None
    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()

    print("starting grouping")
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)
    print("done grouping")
    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns), ('grouper', grouper)]

    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)
    print("done with pipeline, now calculating")
    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example #13
class Vectorizer():
    def __init__(self, hash=False, min_df=0.015, max_df=0.9):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        if hash:
            args = [
                ('vectorizer', HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer())),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('feature_reducer', TruncatedSVD(n_components=400)),
                ('normalizer', Normalizer(copy=False))
            ]
        else:
            args = [
                ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=min_df, max_df=max_df)),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('normalizer', Normalizer(copy=False))
            ]

        self.pipeline = Pipeline(args)

    def vectorize(self, docs, train=False):
        if train:
            return self.pipeline.fit_transform(docs)
        else:
            return self.pipeline.transform(docs)

    @property
    def vocabulary(self):
        return self.pipeline.named_steps['vectorizer'].get_feature_names()
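
# A hedged usage sketch: the Tokenizer class and the document lists below are
# placeholders from the surrounding project, not defined here.
# vec = Vectorizer(hash=False, min_df=0.015, max_df=0.9)
# train_vecs = vec.vectorize(train_docs, train=True)   # fits the pipeline
# test_vecs = vec.vectorize(test_docs)                 # reuses the fitted vocabulary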
Example #14
 def test_sklearn_pipeline(self):
     df = pd.DataFrame.from_dict([{"a":"something","b":1},{"a":"something2"}])
     t = bt.Exclude_features_transform(excluded=["b"])
     transformers = [("exclude_transform",t)]
     p = Pipeline(transformers)
     df2 = p.fit_transform(df)
     self.assertEquals(len(df2.columns),1)
Example #15
def makePlots(Z):
	imp = Imputer()
	scal = StandardScaler()
	vart = VarianceThreshold()
	
	pipe = Pipeline([("imputer", imp), ("var threshold", vart), ("scaler", scal)])
	
	# Require Z
	X1 = pipe.fit_transform(Z)
	pca = PCA(n_components=2)
	x2d = pca.fit_transform(X1.T)
	
	labels = {}
	centers = []
	
	for n in [2, 3, 5, 10]:
		agglo = FeatureAgglomeration(n_clusters=n).fit(X1)
		labels['ag%d'%n] = agglo.labels_
		plot(x2d, agglo.labels_, "Feature Agglomeration")
		
		km = KMeans(n_clusters=n).fit(X1.T)
		labels['km%d'%n] = km.labels_
		plot(x2d, km.labels_, "K-Means")
		centers = km.cluster_centers_
	
	dbs = DBSCAN(eps = 100 ,min_samples=10).fit(X1.T)
	labels['DBSCAN'] = dbs.labels_
	plot(x2d, dbs.labels_, "DBSCAN")
		
	return labels, centers
Example #16
    def train(self,sample):

        tTfidf = ptfidf.Tfidf_transform(input_feature="review",output_feature="tfidf",target_feature="sentiment",min_df=10,max_df=0.7,select_features=False,topn_features=50000,stop_words="english",ngram_range=[1,2])


        tFilter2 = bt.Include_features_transform(included=["tfidf","sentiment"])

        svmTransform = bt.Svmlight_transform(output_feature="svmfeatures",excluded=["sentiment"],zero_based=False)

        classifier_xg = xg.XGBoostClassifier(target="sentiment",svmlight_feature="svmfeatures",silent=1,max_depth=5,n_estimators=200,objective='binary:logistic',scale_pos_weight=0.2)

        cv = cf.Seldon_KFold(classifier_xg,metric='auc',save_folds_folder="./folds")
    
        transformers = [("tTfidf",tTfidf),("tFilter2",tFilter2),("svmTransform",svmTransform),("cv",cv)]

        p = Pipeline(transformers)

        pw = sutl.Pipeline_wrapper()
        df = pw.create_dataframe_from_files([self.data_folder],df_format="csv")
        if sample < 1.0:
            logger.info("sampling dataset to size %s ",sample)
            df = df.sample(frac=sample,random_state=1)
        
        logger.info("Data frame shape %d , %d",df.shape[0],df.shape[1])

        df2 = p.fit_transform(df)
        pw.save_pipeline(p,self.model_folder)
        logger.info("cross validation scores %s",cv.get_scores())

        return p
Example #17
def test_l2density_basic():
    dim = 3
    bags = [np.random.randn(np.random.randint(30, 100), dim)
            for _ in xrange(50)]
    pipe = Pipeline([
        ('scale', BagMinMaxScaler([0, 1])),
        ('density', L2DensityTransformer(15)),
    ])
    l2ed = pipe.fit_transform(bags)

    assert np.all(np.isfinite(l2ed))
    # ||x - y||^2 = <x, x> - 2 <x, y> + <y, y>
    K = l2ed.dot(l2ed.T)
    row_norms_sq = np.diagonal(K)
    l2_dist_sq = row_norms_sq[:, None] - 2 * K + row_norms_sq[None, :]
    assert np.min(row_norms_sq) > 0
    assert np.min(l2_dist_sq) >= 0

    assert_raises(ValueError, lambda: L2DensityTransformer(10, basis='foo'))

    t = L2DensityTransformer(10)
    assert_raises(AttributeError, lambda: t.transform(bags))
    t.fit(dim)
    t.transform(BagMinMaxScaler([0, 1]).fit_transform(bags))
    assert_raises(ValueError, lambda: t.transform([b[:, :2] for b in bags]))
    assert_raises(ValueError, lambda: t.transform(bags))
    t.basis = 'haha snuck my way in'
    assert_raises(ValueError, lambda: t.transform(bags))
Example #18
def XY1():
    X, y, X_test, X_test_index = load_xy()

    ####### VARIABLES
    dummy_cols = ['Weekday', 'DepartmentDescription']
    keep_cols = ['ScanCount', 'Returns']
    funcs = [np.sum, np.count_nonzero]

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    gdd = ft.GDummyAndKeepTransform(dummy_cols, keep_cols,
                                    funcs)  # Doesn't work!

    transform_steps = [("imputer", ft.NGNAImputer())] + \
                      list(ft.wrapStep(("add_returns", add_returns))) + \
                      list(ft.wrapStep(('grouper', gdd))) + \
                      [("dfta", dfta)]
    transform_pipe = Pipeline(steps=transform_steps)

    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(transform_pipe), "NA")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example #19
def XY7():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday']
    mul_col = 'ScanCount'
    dfta = ft.DataFrameToArray()

    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)

    transform_steps = [("imputer", ft.NGNAImputer())] + \
                      list(ft.wrapStep(('grouper', grouper)))

    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(transform_pipe), "NA")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example #20
def XY8():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday', 'Returns']
    mul_col = 'ScanCount'
    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()

    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)

    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns), ('grouper', grouper)]

    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example #21
class MultinomialDEP(Step):

    def __init__(self, percentile_threshold, bins):
        self.lower = percentile_threshold
        self.upper = 100 - percentile_threshold
        scaler = MinMaxScaler()
        discretizer = FunctionTransformer(Discretizer(bins))
        self.pipeline = Pipeline(
            [('scaler', scaler), ('discretizer', discretizer)])

    def fit(self, vectors):
        self.lower_clip = np.percentile(vectors, self.lower, axis=0)
        self.upper_clip = np.percentile(vectors, self.upper, axis=0)
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        self.transformed_vectors = self.pipeline.fit_transform(vectors)

    def transform(self, vectors):
        assert self.transformed_vectors is not None
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        probabilities = []
        vectors = self.pipeline.transform(vectors)
        docs = self.transformed_vectors.shape[0]
        for x in vectors:
            count = np.count_nonzero(
                (self.transformed_vectors == x).all(axis=1))
            pr = count / docs
            probabilities.append(pr)
        return -np.log(np.maximum(1e-10, np.array(probabilities)))
Example #22
class SklearnTopicModels(object):

    def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics
        To use Latent Semantic Analysis, set estimator to 'LSA',
        To use Non-Negative Matrix Factorization, set estimator to 'NMF',
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_components=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ])


    def fit_transform(self, documents):
        self.model.fit_transform(documents)

        return self.model


    def get_topics(self, n=25):
        """
        n is the number of top terms to show for each topic
        """
        vectorizer = self.model.named_steps['tfidf']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()

        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n - 1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens

        return topics
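
# The CountVectorizer above is configured with tokenizer=identity and
# lowercase=False because TextNormalizer (a project class) already yields
# lists of cleaned tokens. A one-line sketch of that assumed helper:
def identity(words):
    # The normalizer has already tokenized the text, so pass it through as-is.
    return words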
Example #23
def _do_feature_selection(must_be_in_thesaurus, k, handler='Base', vector_source='default', max_feature_len=1,
                          delete_kid=False):
    """
    Loads a data set, vectorizes it by extracting n-grams (default n=1) using a feature handler (default
    BaseFeatureHandler) and then performs feature selection based on either a vector source or on chi2 scores.
    Returns the encode/decode matrices and the stripped vocabulary of the Vectorizer after feature selection.

    The vector source by default has a unigrams source that covers all unigrams in the training set
    (feature vectors are made up), and does not know about n-grams. Optionally, another vector
    source can be passed in.
    """
    handler_pattern = 'eval.pipeline.feature_handlers.{}FeatureHandler'
    raw_data, data_ids = load_text_data_into_memory(
        training_path='tests/resources/test-tr',
        test_path='tests/resources/test-ev',
    )

    tokenizer = XmlTokenizer()
    x_train, y_train, x_test, y_test = tokenize_data(raw_data, tokenizer, data_ids)

    if vector_source == 'default':
        unigrams_vect = Vectors.from_tsv('tests/resources/thesauri/exp0-0a.txt.events-unfiltered.strings')
        vector_source = unigrams_vect

    if delete_kid:
        # the set of vectors we load from disk covers all unigrams in the training set, which makes it boring
        # let's remove one entry
        del unigrams_vect['kid/N']
        unigrams_vect.matrix = unigrams_vect.matrix[:, :-1]

    if max_feature_len == 1:
        # extract only unigram features
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': []}
        standard_ngram_features = 0
    else:
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': ['AN', 'NN', 'VO', 'SVO']}
        standard_ngram_features = max_feature_len

    feature_extractor = FeatureExtractor(standard_ngram_features=standard_ngram_features).update(**feat_extr_opts)
    pipeline_list = [
        ('vect',
         ThesaurusVectorizer(min_df=1, use_tfidf=False,
                             decode_token_handler=handler_pattern.format(handler))),
        ('fs', VectorBackedSelectKBest(must_be_in_thesaurus=must_be_in_thesaurus, k=k)),
        ('dumper', FeatureVectorsCsvDumper('fs-test'))
    ]
    p = Pipeline(pipeline_list)
    fit_params = {'vect__vector_source': vector_source,
                  'vect__train_time_extractor':feature_extractor,
                  'vect__decode_time_extractor':feature_extractor,
                  'fs__vector_source': vector_source}

    tr_matrix, tr_voc = p.fit_transform(x_train, y_train, **fit_params)
    if 'fs' in p.named_steps:
        p.named_steps['vect'].vocabulary_ = p.named_steps['fs'].vocabulary_
    ev_matrix, ev_voc = p.transform(x_test)
    return tr_matrix.A, strip(tr_voc), ev_matrix.A, strip(ev_voc)
Example #24
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary), set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like))
Example #25
def test_bace_2():
    assignments, ref_macrostate_assignments = _metastable_system()
    pipeline = Pipeline([
        ('msm', MarkovStateModel()),
        ('bace', BACE(n_macrostates=2))
    ])
    macro_assignments = pipeline.fit_transform(assignments)[0]
    assert (np.min(assignments) >= 0)
Example #26
 def test_sklearn_pipeline(self):
     df = pd.DataFrame.from_dict([{"a":"something"},{}])
     t = bt.Binary_transform(input_feature="a",output_feature="abin")
     transformers = [("binary_transform",t)]
     p = Pipeline(transformers)
     df2 = p.fit_transform(df)
     self.assertEquals(df["abin"][0],1)
     self.assertEquals(df["abin"][1],0)
Example #27
def pipeline(housing):
  housing_num = housing.drop("ocean_proximity", axis=1)
  num_pipeline = Pipeline([
      ('imputer', Imputer(strategy="median")),
      ('attribs_adder', CombinedAttributesAdder()),
      ('std_scaler', StandardScaler())
    ])
  housing_num_tr = num_pipeline.fit_transform(housing_num)
  return housing_num_tr
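
# Imputer above is the pre-0.20 class that was removed in scikit-learn 0.22.
# On current versions the same step is spelled with SimpleImputer (a hedged
# drop-in, still relying on the CombinedAttributesAdder used above):
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline_modern = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])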
Example #28
def main():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('name', Pipeline([
                ('extract_columns', ColumnExtractor("Name")),
                ('binarize_names', ExistenceBinarizer()),
                ('one_hot', OneHotEncoder(sparse=False))
            ])),
            ('weekday', Pipeline([
                ('extract_columns', ColumnExtractor('DateTime')),
                ('weekday_extractor', WeekdayExtractor()),
                # ('one_hot', OneHotEncoder(sparse=False))
            ]))
        ]))
    ])
    train_df = pd.read_csv('data/train_updated_colors.csv', sep=',')
    train_features, train_labels = preprocess_data(train_df)
    pipeline.fit_transform(train_features, train_labels)
Example #29
def train(neg=None, pos=None):
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')
    
    tagger = crfseg.create_tagger()
    tok_cn = lambda (x): crfseg.cut_zh(x, tagger)
    
    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
    #    ('svd', TruncatedSVD(32)),
    #    ('normal', Normalizer(copy=False))
        ])
    '''
    hasher = HashingVectorizer(n_features=2**16,
                               tokenizer=tok_cn, non_negative=True,
                               norm=None, binary=False)
    '''

    #clf = SGDClassifier(loss='log', penalty='l2', alpha=0.00001, n_iter=50, fit_intercept=True)
    #clf = MultinomialNB()
    clf = BernoulliNB()
    
    neg_file = codecs.open(neg, 'r', 'utf-8')
    pos_file = codecs.open(pos, 'r', 'utf-8')

    x_train = []
    y_train = []
    
    i = 0
    for line in neg_file:
        x_train.append(line)
        y_train.append(0)
    for line in pos_file:
        x_train.append(line)
        y_train.append(1)
    
    print 'begin transform'
    #x_train = hasher.transform(x_train)
    x_train = pipe.fit_transform(x_train)
    print 'begin fit'
    clf.fit(x_train, y_train)

    print 'begin save'
    tfidf_file = os.path.join(the_file, 'data', 'tfidf.pkl')
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    #_ = joblib.dump(tfidf, tfidf_file, compress=9)
    _ = joblib.dump(clf, clf_file, compress=9)

    print 'begin test'
    x_test = [u'这个东西真心很赞']
    #x_test = hasher.transform(x_test)
    x_test = pipe.transform(x_test)
    print clf.predict(x_test)
Example #30
 def test_sklearn_pipeline_str_numbers(self):
     df = pd.DataFrame.from_dict([{"a":"2"},{"a":"0"}])
     t = bt.BinaryTransform(input_feature="a",output_feature="abin")
     transformers = [("BinaryTransform",t)]
     p = Pipeline(transformers)
     df2 = p.fit_transform(df)
     print df2
     self.assertEquals(df["abin"][0],1)
     self.assertEquals(df["abin"][1],0)
Example #31
        X_numeric = X.select_dtypes(exclude=["object"])
        skewness = X_numeric.apply(lambda x: skew(x))
        skewness_features = skewness[abs(skewness) >= self.skew].index
        X[skewness_features] = np.log1p(X[skewness_features])
        X = pd.get_dummies(X)
        return X


# build pipeline
pipe = Pipeline([
    ('labenc', labelenc()),
    ('skew_dummies', skew_dummies(skew=1)),
])

full2 = full.copy()
data_pipe = pipe.fit_transform(full2)

data_pipe.shape

# + __use robustscaler since maybe there are other outliers.__

scaler = RobustScaler()
n_train = train.shape[0]

X = data_pipe[:n_train]
test_X = data_pipe[n_train:]
y = train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log1p(train.SalePrice)
test_X_scaled = scaler.transform(test_X)
Example #32
                                                  train_size=0.8,
                                                  random_state=0)

num_imputer = SimpleImputer(strategy='most_frequent')
cat_one_hot = OneHotEncoder(handle_unknown='ignore')

cols_preprocessor = ColumnTransformer(
    transformers=[('num', num_imputer, num_cols), ('cat', cat_one_hot,
                                                   cat_cols)])

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model_pipeline = Pipeline(steps=[('col_formatter', cols_preprocessor)])

x_train = x_train.fillna(method='ffill')
x_train_final = model_pipeline.fit_transform(x_train)

x_val_final = model_pipeline[0].transform(x_val)

y_val_final = (np.array(y_val)).reshape(-1, 1)
y_validation = num_imputer.fit_transform(y_val_final)

y_train_final = (np.array(y_train)).reshape(-1, 1)
y_training = num_imputer.transform(y_train_final)

my_model.fit(x_train_final,
             y_training,
             eval_set=[(x_val_final, y_validation)],
             verbose=0)

x_test_final = model_pipeline.fit_transform(x_test)
Example #33

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
import copy

#y1.return0=y1.return0.apply(float)
from sklearn.ensemble import *
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier

gbc = GaussianNB()
ohc = Pipeline([('b', OneHotEncoder()), ('a', Densifier())])

X = g.iloc[:, 1:]
Y = g.iloc[:, 0]

X = ohc.fit_transform(X)
t = 0.9
pr = list((max(x) > t) for x in gbc.predict_proba(x_test))
#gbc.score(x_test,y_test,pr)
[
    gbc.score(x_test, y_test, pr),
    sum(list((max(x) > t) for x in gbc.predict_proba(x_test)))
]
h = g.groupby(by=['letter1', 'letter2', 'letter3', 'letter4', 'letter5']).sum()
h2 = g.groupby(
    by=['letter1', 'letter2', 'letter3', 'letter4', 'letter5']).mean()
h[h.return0 > 2]
Example #34
# STEP 3
oof = np.zeros(len(train))
preds = np.zeros(len(test))

for i in tqdm_notebook(range(512)):

    train2 = train[train['wheezy-copper-turtle-magic'] == i]
    test2 = test[test['wheezy-copper-turtle-magic'] == i]
    idx1 = train2.index
    idx2 = test2.index
    train2.reset_index(drop=True, inplace=True)

    data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
    pipe = Pipeline([('vt', VarianceThreshold(threshold=2)),
                     ('scaler', StandardScaler())])
    data2 = pipe.fit_transform(data[cols])
    train3 = data2[:train2.shape[0]]
    test3 = data2[train2.shape[0]:]

    skf = StratifiedKFold(n_splits=11, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):

        clf = QuadraticDiscriminantAnalysis(0.5)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
        preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

auc = roc_auc_score(train['target'], oof)
print(f'AUC: {auc:.5}')

# STEP 4
Example #35
fig = plt.figure(figsize=(15, 8))
plt.suptitle("Manifold Learning with %i cases, %i variables, %i neighbors"
             % (len(y), np.count_nonzero(y), n_neighbors),
             fontsize=14)

estimators = list()
# estimators.append(('variance_thresholder', VarianceThreshold()))
# estimators.append(('scaler', StandardScaler()))
estimators.append(('ae', AETransform(dim=32)))
tsne = manifold.TSNE(n_components=2, random_state=0, perplexity=100,
                     early_exaggeration=4)
estimators.append(('tsne', tsne))
pipeline = Pipeline(estimators)

X_ = pipeline.fit_transform(X)
X_0 = X_[y == 0]
X_1 = X_[y == 1]

ax = fig.add_subplot(241)
ax.scatter(X_0[:, 0], X_0[:, 1], color='g', alpha=0.5)
ax.scatter(X_1[:, 0], X_1[:, 1], color='r', alpha=0.5)
plt.title("t-SNE")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')


estimators = list()
estimators.append(('variance_thresholder', VarianceThreshold()))
estimators.append(('scaler', StandardScaler()))
Example #36
np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)

X_train, X_val, y_train, y_val = train_test_split(X[:50],
                                                  y[:50].ravel(),
                                                  test_size=0.5,
                                                  random_state=10)

poly_scaler = Pipeline([
    ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
    ("std_scaler", StandardScaler()),
])

X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)

sgd_reg = SGDRegressor(max_iter=1,
                       tol=-np.infty,
                       penalty=None,
                       eta0=0.0005,
                       warm_start=True,
                       learning_rate="constant",
                       random_state=42)

n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
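    # The snippet is cut off here; a plausible continuation of the epoch loop
    # (an assumption, requiring `from sklearn.metrics import mean_squared_error`)
    # would track the per-epoch training and validation error:
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))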
Example #37
class Model(object):
    '''
    Multi label classifier model
    '''
    def __init__(self):
        #self.trainset = pd.read_csv("data/raw/train_set.csv")
        #self.testset = pd.read_csv("data/raw/test_set.csv")
        self.cv = CountVectorizer(ngram_range=(0, 2))
        self.model = LogisticRegression()
        self.build_pipe()

    def build_pipe(self):
        sent_features = Pipeline([('select', Selector(key='Utterance')),
                                  ('extract', SentenceFeatures()),
                                  ('vectorize', DictVectorizer())])

        hapax = Pipeline([('select', Selector(key='Utterance')),
                          ('extract', HapaxLegomera()),
                          ('vectorize', DictVectorizer())])

        CV = Pipeline([('select', Selector(key='Utterance')),
                       ('cv', CountVectorizer(ngram_range=(0, 2)))])

        self.pipe = Pipeline([
            ('union',
             FeatureUnion(
                 transformer_list=[('features',
                                    sent_features), ('hapax',
                                                     hapax), ('Ngrams', CV)]))
        ])

        self.label_pipe = Pipeline([('lt', LabelTransformer()),
                                    ('MLJ', MultiLabelJoiner()),
                                    ('MLB', MyLabelEncoder())])

    def train(self, trainset):
        X = self.pipe.fit_transform(trainset)
        y = self.label_pipe.fit_transform(trainset)
        self.model.fit(X, y)

    def test(self, testset):
        X = self.pipe.transform(testset)
        y = self.label_pipe.transform(testset)
        y_pred = self.model.predict(X)
        #self.print_scores(y, y_pred)
        return y, y_pred

    def distribution(self, which):
        if which == 'test':
            df = self.testset
        elif which == 'train':
            df = self.trainset

        labels = df.filter([
            'Stance category', 'second stance category', 'third', 'fourth',
            'fifth'
        ])
        labels = labels.stack()
        print(labels.value_counts(True))

    def unique_labels(self):
        pass
Example #38
    # z.plot.hist(bins=50, ax=ax_133)
    # plt.xlabel('z')
    # plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results


data_train = reduce_mem_usage(data_train)
# scatter_matrix(data_train, figsize=(20, 16))

num_pipeline = Pipeline([
    # ('Imputer', Imputer("median")),
    ('StandardScaler', StandardScaler()),
])

data_train_std = num_pipeline.fit_transform(data_train)
data_train_std = pd.DataFrame(data_train_std,
                              index=data_train.index,
                              columns=data_train.columns)
# savfig_send()

# data_train.iloc[:, :10].hist(bins=50, figsize=[20, 15])
linear_regression = LinearRegression()
X = data_train_std.iloc[:, :-1]
y = data_train_std['target']
# linear_model = linear_regression.fit(X, y)

# outliers = find_outliers(Ridge(), X, y)

model, cv_score, grid_results = train_model(LinearRegression(), {},
                                            X=X,
Example #39
# TSNE
tsne = TSNE(n_components=2, random_state=seed)
tsne_data = tsne.fit_transform(data_std)

# Data for ploting
data_sets = [pca_data, tsne_data]
names = ["pca_data", "tsne_data"]
colors = ["red", "green", "blue"]

# Pipeline
pipe = Pipeline([
    ("std", StandardScaler()),
    ("pca", PCA(n_components=0.95, random_state=seed)),
    ("tsne", TSNE(n_components=2, random_state=seed)),
])
piped = pipe.fit_transform(test_data_02)
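
# TSNE implements no transform(), so a pipeline that ends in it is
# fit_transform-only; reusing it on new samples fails rather than embedding
# them (hedged illustration):
try:
    pipe.transform(test_data_02)
except (AttributeError, TypeError) as err:
    print("pipeline cannot transform new data:", err)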


def diabetes_classification() -> None:
    elements = [2, 4, 6]
    sum_of_elements = []
    for element in elements:
        pca = PCA(n_components=element, random_state=seed)
        pca.fit(x_standard)

        ratio = pca.explained_variance_ratio_
        print(ratio)

        ratio_sum = sum(ratio)
        sum_of_elements.append(ratio_sum)
Example #40
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

columns = [
    'median_income', 'households', 'population', 'total_bedrooms',
    'total_rooms'
]

pipeline1 = Pipeline([
    ('log', FunctionTransformer(np.log1p, validate=False)),
])
pipeline1.fit_transform(df)

num_attribs = list(df.drop('ocean_proximity', axis=1))
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", pipeline1, columns),
])

df1 = full_pipeline.fit_transform(df)
df2 = pd.DataFrame(df1, columns=columns)
df3 = df.copy()

df3[columns] = df2

df.dtypes
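
# ColumnTransformer drops unlisted columns by default (remainder='drop'),
# which is why df1 holds only the five log-transformed columns. Keeping the
# remaining columns untouched would just need (a hedged variant):
full_pipeline_keep_rest = ColumnTransformer([
    ("num", pipeline1, columns),
], remainder="passthrough")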
Example #41
def enet_path(est, x_train, x_test, y_train, y_test, num_alphas, eps, l1_ratio,
              target_score, n_tail, max_complexity):
    models = []

    trafo = Pipeline(steps=est.steps[:-1])
    final = est._final_estimator
    fit_intercept = final.fit_intercept
    normalize = final.normalize
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        features = trafo.fit_transform(x_train)

    if isinstance(final, RationalFunctionMixin):
        features = est._final_estimator._transform(features, y_train)

    X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit(
        features,
        y_train,
        None,
        True,
        normalize=normalize,
        fit_intercept=fit_intercept,
        copy=True)

    n_samples = X.shape[0]
    alpha_max = np.abs(np.nanmax(X.T @ y) / (n_samples * l1_ratio))

    est.set_params(regression__precompute=precompute,
                   regression__fit_intercept=False,
                   regression__normalize=False,
                   regression__warm_start=True)

    est_ = FFXElasticNet()
    est_.set_params(**final.get_params())

    for alpha in _get_alphas(alpha_max, num_alphas, eps):
        est_.set_params(l1_ratio=l1_ratio, alpha=alpha)

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            est_.fit(X, y, check_input=False)

        model = deepcopy(est)
        model.set_params(regression__fit_intercept=fit_intercept,
                         regression__normalize=normalize,
                         regression__l1_ratio=l1_ratio,
                         regression__alpha=alpha)

        for attr in ["coef_", "intercept_", "n_iter_"]:
            setattr(model._final_estimator, attr, getattr(est_, attr))

        model._final_estimator._set_intercept(X_offset, y_offset, X_scale)
        if isinstance(model._final_estimator, RationalFunctionMixin):
            model._final_estimator._arrange_coef()

        model.train_score_ = model.score(x_train, y_train)
        model.test_score_ = model.score(x_test, y_test)
        model.complexity_ = np.count_nonzero(model._final_estimator.coef_)
        models.append(model)

        if model.train_score_ <= target_score:
            # print("Reached target score")
            break
        elif model.complexity_ >= max_complexity:
            # print("Reached target complexity")
            break
        elif _path_is_saturated(models, n_tail=n_tail):
            # print("Stagnation in train score")
            break
    return models
Example #42
 # Record processing and classification
 if konspekt:
     konspekt = False
     output_file.write(line)
     continue
 # Filter the metadata and the title
 text = filter_text(text)
 title = filter_text(title)
 # Build the data for classification;
 # if classifying from the full text as well, the record's
 # full text must also be assigned to data['title']
 data = {}
 data['meta_data'] = [text]
 data['title'] = [title + ' ' + text]
 # Transform the data
 X = pipeline.fit_transform(data)
 # Classification
 predicted = clf.predict(X)[0]
 predicted_proba = clf.predict_proba(X)[0]
 # Sort and print the classification results
 all_pred = []
 for index, item in enumerate(predicted_proba):
     if item != 0:
         all_pred.append([cat_names[index], item])
 for index, item in enumerate(
         sorted(all_pred, key=lambda x: x[1], reverse=True)):
     if index > 6:
         break
     output_file.write('072 c $a' + item[0] + '$b' + str(item[1]) +
                       '\n')
 output_file.write(line)
Example #43
# Use on numeric columns in the data
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
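

# TextSelector is used below but not shown; by analogy with NumberSelector it
# presumably returns a single column as a 1-D series so TfidfVectorizer can
# consume raw strings. A sketch under that assumption:
class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X[self.key] (not X[[self.key]]) keeps the column one-dimensional.
        return X[self.key]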


desc = Pipeline([('selector', TextSelector(key='desc')),
                 ('tfidf', TfidfVectorizer(stop_words='english'))])

desc.fit_transform(train_features)

value = Pipeline([
    ('selector', NumberSelector(key='value')),
    # ('standard', StandardScaler())
])

value.fit_transform(train_features)

feats = FeatureUnion([('desc', desc), ('value', value)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(train_features)

pipeline = Pipeline([
    ('features', feats),
Example #44
print("finding best classifier")

features = FEATURES_ARRAY2
scores = {}

# for feature in [OUTPUT_FOLDER + 'lbp2' + FORMAT]:  # features:
for feature in features:
    print("""
    ----------------------------------
    getting feature: {}
    ----------------------------------
    """.format(feature))

    X = np.load(feature, allow_pickle=True)
    X = transformer_pipe.fit_transform(X)

    "Resampling"
    # rus = RandomUnderSampler(random_state=RANDOM_STATE)
    # X_res, y_res = rus.fit_resample(X, y)

    # RandomOverSampler(random_state=RANDOM_STATE)
    # ADASYN(random_state=RANDOM_STATE)
    smote = SMOTE(random_state=RANDOM_STATE)
    X_res, y_res = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, stratify=y_res)

    "One model for all"
    model = SVC(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
Example #45
def main():
    """extract features"""
    train = pd.read_csv(settings.train)
    y = train[settings.y]
    coldrop = ['Unnamed: 0','Gene','Variation','Variation_type','Gene_type']
    train = train.drop([settings.y], axis=1)
    train = train.drop(coldrop, axis=1)
    
    test = pd.read_csv(settings.test)
    test = test.drop(coldrop, axis=1)
    pid = test[settings.id_colname]

    feat_p = Pipeline([
        ('union', FeatureUnion(
            n_jobs=-1,
            transformer_list=[
                ('standard', cust_regression_vals()),
                ('p1', Pipeline([('Text', cust_txt_col(settings.text_colname)),
                                 ('tfidf_Text', TfidfVectorizer(ngram_range=(1, 2))),
                                 ('tsvd1', TruncatedSVD(n_components=50, n_iter=25, random_state=12))])),
                ('p2', Pipeline([(settings.gene_colname, cust_txt_col(settings.gene_colname)),
                                 ('count_Gene', CountVectorizer(analyzer=u'char', ngram_range=(1, 8))),
                                 ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
                ('p3', Pipeline([('Variation', cust_txt_col(settings.var_name)),
                                 ('count_Variation', CountVectorizer(analyzer=u'char', ngram_range=(1, 8))),
                                 ('tsvd3', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])),
            ]))
    ])

    train = feat_p.fit_transform(train); print(train.shape)
    test = feat_p.transform(test); print(test.shape)


    """ init and run model"""
    y = y - 1 #fix for zero bound array
    denom = 0
    fold = 20
    for i in range(fold):
        params = {
            'eta': 0.03333,
            'max_depth': 4,
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'num_class': 9,
            'seed': i,
            'silent': True
        }

        x1, x2, y1, y2 = train_test_split(train, y, test_size=0.18, random_state = i)
        watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]

        model = xgb.train(
            params, xgb.DMatrix(x1, y1), 1000,
            watchlist, verbose_eval = 50, early_stopping_rounds=100
        )

        score = metrics.log_loss(
            y2, model.predict(xgb.DMatrix(x2),
            ntree_limit = model.best_ntree_limit),
            labels = list(range(9))
        )
        print(score)

        if denom != 0:
            pred = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit+80)
            preds += pred
        else:
            pred = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit+80)
            preds = pred.copy()
        denom += 1
        submission = pd.DataFrame(pred, columns = ['class'+str(c+1) for c in range(9)])
        submission[settings.id_colname] = pid
        submission.to_csv('submission_xgb_fold_' + str(i) + '.csv', index=False)

    preds /= denom
    submission = pd.DataFrame(preds, columns=['class' + str(c + 1) for c in range(9)])
    submission[settings.id_colname] = pid
    submission.to_csv('submission_xgb.csv', index=False)
Example #46
        for shift_vector in [[0, -1], [0, 1]]:
            for image in X:
                image_copy = image.copy().reshape(28, 28)
                shifted = shift(image_copy, shift_vector, cval=0)
                X_augmented.append(shifted.reshape([-1]))
        return np.array(X_augmented)


# Data prep pipeline. Standardization included
pipeline = Pipeline([
    ("augmenter", DataAugmentation()),
    ("scaler", StandardScaler()),
])

# The attributes train set is run through the pipeline
X_train_prepared = pipeline.fit_transform(X_train.astype(np.float64))
# y_train_prepared accounts for the shifted images in X_train_prepared
y_train_prepared = np.array([label for label in y_train] * 3)
# Cross-validation is used to train and test various Random Forests using Grid Search
param_grid = [
    {
        "n_estimators": [3, 10, 30],
        "max_features": [4, 6, 8, 12],
    },
]
rf_clf = RandomForestClassifier(random_state=42)
search = GridSearchCV(rf_clf,
                      param_grid,
                      cv=5,
                      scoring='neg_mean_squared_error',
                      return_train_score=True,
Example #47
def main():
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')

    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values

    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    X_test = df_test[feature_cols].values

    tsne_data_2d_5p = np.load('data/tsne_2d_5p.npz')
    tsne_data_2d_10p = np.load('data/tsne_2d_10p.npz')
    tsne_data_2d_15p = np.load('data/tsne_2d_15p.npz')
    tsne_data_2d_20p = np.load('data/tsne_2d_20p.npz')
    tsne_data_2d_30p = np.load('data/tsne_2d_30p.npz')
    tsne_data_2d_40p = np.load('data/tsne_2d_40p.npz')
    tsne_data_2d_50p = np.load('data/tsne_2d_50p.npz')
    tsne_data_3d_30p = np.load('data/tsne_3d_30p.npz')

    # concat features
    X_train_concat = {
        'X': X_train,
        'tsne_2d_5p': tsne_data_2d_5p['train'],
        'tsne_2d_10p': tsne_data_2d_10p['train'],
        'tsne_2d_15p': tsne_data_2d_15p['train'],
        'tsne_2d_20p': tsne_data_2d_20p['train'],
        'tsne_2d_30p': tsne_data_2d_30p['train'],
        'tsne_2d_40p': tsne_data_2d_40p['train'],
        'tsne_2d_50p': tsne_data_2d_50p['train'],
        'tsne_3d_30p': tsne_data_3d_30p['train'],
    }
    X_valid_concat = {
        'X': X_valid,
        'tsne_2d_5p': tsne_data_2d_5p['valid'],
        'tsne_2d_10p': tsne_data_2d_10p['valid'],
        'tsne_2d_15p': tsne_data_2d_15p['valid'],
        'tsne_2d_20p': tsne_data_2d_20p['valid'],
        'tsne_2d_30p': tsne_data_2d_30p['valid'],
        'tsne_2d_40p': tsne_data_2d_40p['valid'],
        'tsne_2d_50p': tsne_data_2d_50p['valid'],
        'tsne_3d_30p': tsne_data_3d_30p['valid'],
    }
    X_test_concat = {
        'X': X_test,
        'tsne_2d_5p': tsne_data_2d_5p['test'],
        'tsne_2d_10p': tsne_data_2d_10p['test'],
        'tsne_2d_15p': tsne_data_2d_15p['test'],
        'tsne_2d_20p': tsne_data_2d_20p['test'],
        'tsne_2d_30p': tsne_data_2d_30p['test'],
        'tsne_2d_40p': tsne_data_2d_40p['test'],
        'tsne_2d_50p': tsne_data_2d_50p['test'],
        'tsne_3d_30p': tsne_data_3d_30p['test'],
    }

    pipeline = Pipeline(steps=[
        ('features',
         FeatureUnion(transformer_list=[
             ('X', ItemSelector('X')),
             ('tsne_2d_5p', ItemSelector('tsne_2d_5p')),
             ('tsne_2d_10p', ItemSelector('tsne_2d_10p')),
             ('tsne_2d_15p', ItemSelector('tsne_2d_15p')),
             ('tsne_2d_20p', ItemSelector('tsne_2d_20p')),
             ('tsne_2d_30p', ItemSelector('tsne_2d_30p')),
             ('tsne_2d_40p', ItemSelector('tsne_2d_40p')),
             ('tsne_2d_50p', ItemSelector('tsne_2d_50p')),
             ('tsne_3d_30p', ItemSelector('tsne_3d_30p')),
         ])),
        ('poly', PolynomialFeatures(degree=2)),
        ('scaler', MinMaxScaler()),
    ])

    X_train_concat = pipeline.fit_transform(X_train_concat, y_train)
    X_valid_concat = pipeline.transform(X_valid_concat)
    X_test_concat = pipeline.transform(X_test_concat)

    X_valid_both, y_valid_both = divide_samples_train(X_valid_concat, y_valid)

    classifier = make_pipeline(
        make_union(ItemSelector(key='L'), ItemSelector(key='R')),
        LogisticRegression(penalty='l2', C=1e-2, n_jobs=-1, warm_start=True))

    for i in trange(10):
        X_train_both, y_train_both = divide_samples_train(
            *shuffle(X_train_concat, y_train))

        print('Fitting...')
        start_time = time.time()
        classifier.fit(X_train_both, y_train_both)
        print('Fit: {}s'.format(time.time() - start_time))

        p_valid = classifier.predict_proba(X_valid_both)
        loss = log_loss(y_valid_both, p_valid[:, 1])
        auc = roc_auc_score(y_valid_both, p_valid[:, 1])
        print('Pairwise Loss: {}, AUC: {}'.format(loss, auc))

    p_valids = []
    for i in trange(100):
        X_valid_both = divide_samples_test(X_valid_concat)
        p_valid = classifier.predict_proba(X_valid_both)
        p_valids.append(p_valid)
    p_valid = np.array(p_valids)
    p_valid = np.mean(p_valid, axis=0)

    loss = log_loss(y_valid, p_valid[:, 1])
    auc = roc_auc_score(y_valid, p_valid[:, 1])
    print('Validation Loss: {}, AUC: {}'.format(loss, auc))

    p_tests = []
    for i in trange(100):
        X_test_both = divide_samples_test(X_test_concat)
        p_test = classifier.predict_proba(X_test_both)
        p_tests.append(p_test)
    p_test = np.array(p_tests)
    p_test = np.mean(p_test, axis=0)

    df_pred = pd.DataFrame({
        't_id': df_test['t_id'],
        'probability': p_test[:, 1]
    })
    csv_path = 'predictions/predictions_{}_{}.csv'.format(
        int(time.time()), loss)
    df_pred.to_csv(csv_path, columns=('t_id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
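
# ItemSelector (and divide_samples_train/test) are defined elsewhere in this project and not
# shown in the snippet. A minimal sketch of what ItemSelector is assumed to look like: a
# transformer that pulls one key out of a dict of arrays so FeatureUnion can stack them.
from sklearn.base import BaseEstimator, TransformerMixin


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]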
Example #48
        ]))])
    comb_vectorizer = Pipeline([(
        'features',
        FeatureUnion([
            ('tfidf', tfidf_vectorizer),  #find tfidf value
            ('tp', tp_vectorizer),  #find term presence
            ('sv', sv_vectorizer)
        ]))])

    tfidf_train = tfidf_vectorizer.fit_transform(x_train).todense()
    tfidf_test = tfidf_vectorizer.transform(x_test).todense()

    tp_train = tp_vectorizer.fit_transform(x_train)
    tp_test = tp_vectorizer.transform(x_test)

    sv_train = sv_vectorizer.fit_transform(x_train)
    sv_test = sv_vectorizer.transform(x_test)

    c_train = comb_vectorizer.fit_transform(x_train)
    c_test = comb_vectorizer.transform(x_test)

    path_result = "result/"
    file_res = open(path_result + "output_desc.txt", "w")

    file_res.write("data training: " + str(len(x_train)) + "\n")
    file_res.write("data testing: " + str(len(x_test)) + "\n")
    file_res.write("\n")
    file_res.write("data training pos: " +
                   str(sum(i == "positive" for i in y_train)) + "\n")
    file_res.write("data training neg: " +
                   str(sum(i == "negative" for i in y_train)) + "\n")
Example #49
def dat_prep(nbd_train,nbd_test,k,vect_type,Type_train,Type_test,Chr_train,Chr_test,Label_train,Label_test,scaled_feats_train,scaled_feats_test,dummy_train,dummy_test):
    #Derives the Count Vectorizer or TFIDF scores for a given neighborhood sequence
    """
    Arguments:
        nbd_train = Column containing the neighborhood sequence from the training data
        nbd_test = Column containing the neighborhood sequence from the test data

        k = size of the k-mer
        vect_type = 'CV' for CountVectorizer, otherwise ('tf') TF-IDF vectorization
        Type_train = Numerically encoded substitution type ("A>T" encoded as 1, "G>C" as 2, and so on) from the training data
        Type_test = Numerically encoded substitution type ("A>T" encoded as 1, "G>C" as 2, and so on) from the test data
        Chr_train = Chromosome number from the training data
        Chr_test = Chromosome number from the test data
        Label_train = Binary label (training data), where 1 = Passenger and 2 = Driver
        Label_test = Binary label (test data), where 1 = Passenger and 2 = Driver
        scaled_feats_train = Scaled genomic features (conservation, amino acid, etc.) for the training data
        scaled_feats_test = Scaled genomic features (conservation, amino acid, etc.) for the test data
        dummy_train = One-hot-encoded feature matrix for the training data
        dummy_test = One-hot-encoded feature matrix for the test data


    Returns:
        df_comb_train= The complete dataframe (using training data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
        df_comb_test= The complete dataframe (using test data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
        count_vector_train= Just the TFIDF or Count vect features (training data) also known as the Document-Term matrix
        count_vector_test= Just the TFIDF or Count vect features (test data) also known as the Document-Term matrix
        cols= feature names
        vect= The vocabulary derived from the training data
        sc= The scaling variable derived from the training data


    """
    if(vect_type=="CV"):
        vect=Pipeline([('cv1',CountVectorizer(lowercase=False))])
    else:
        vect = Pipeline([('cv1',CountVectorizer(lowercase=False)), ('tfidf_transformer',TfidfTransformer(smooth_idf=True,use_idf=True))])
        

    count_vector_train=vect.fit_transform(preprocess(nbd_train,k))
    count_vector_test=vect.transform(preprocess(nbd_test,k))
    
    df_train=pd.DataFrame(count_vector_train.todense(),columns=vect['cv1'].get_feature_names())
    df_test=pd.DataFrame(count_vector_test.todense(),columns=vect['cv1'].get_feature_names())

    if(vect_type=="tf"):
        sc=MinMaxScaler()
        # fit_transform() is used here so the scaler learns its min/max parameters from the training data and scales it
        df_train=pd.DataFrame(sc.fit_transform(df_train),columns=df_train.columns)
        # transform() is used here so the test data is scaled with the parameters already learned from the training data
        df_test=pd.DataFrame(sc.transform(df_test),columns=df_test.columns)
        
    df_train['Type']=Type_train;df_test['Type']=Type_test
    df_train['Label']=Label_train;df_test['Label']=Label_test
    df_train['Chr']=Chr_train;df_test['Chr']=Chr_test
    df_comb_train=pd.concat([df_train, scaled_feats_train,dummy_train], axis=1)
    df_comb_test=pd.concat([df_test, scaled_feats_test,dummy_test], axis=1)

    df_comb_train = df_comb_train.loc[:,~df_comb_train.columns.duplicated()]
    df_comb_test = df_comb_test.loc[:,~df_comb_test.columns.duplicated()]
    cols=vect['cv1'].get_feature_names()


    return df_comb_train,df_comb_test,count_vector_train,count_vector_test,cols,vect,sc
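
# A hedged usage sketch (not in the original source): tiny synthetic inputs, named arbitrarily,
# only to show how dat_prep() is assumed to be wired together; it still relies on the
# project's preprocess() helper used inside the function.
nbd_tr = pd.Series(["ACGTACGTAAC", "TTGACGTACCA"]); nbd_te = pd.Series(["GGCATACGTTA"])
type_tr = pd.Series([1, 2]); type_te = pd.Series([1])
chr_tr = pd.Series([1, 7]); chr_te = pd.Series([3])
lab_tr = pd.Series([1, 2]); lab_te = pd.Series([1])
scaled_tr = pd.DataFrame({"cons": [0.1, 0.9]}); scaled_te = pd.DataFrame({"cons": [0.4]})
dummy_tr = pd.DataFrame({"aa_A": [1, 0]}); dummy_te = pd.DataFrame({"aa_A": [0]})

(df_comb_train, df_comb_test, dtm_train, dtm_test,
 feat_names, fitted_vect, fitted_scaler) = dat_prep(
    nbd_tr, nbd_te, k=3, vect_type="tf",
    Type_train=type_tr, Type_test=type_te,
    Chr_train=chr_tr, Chr_test=chr_te,
    Label_train=lab_tr, Label_test=lab_te,
    scaled_feats_train=scaled_tr, scaled_feats_test=scaled_te,
    dummy_train=dummy_tr, dummy_test=dummy_te)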
Example #50
x_prepared = pd.DataFrame(stdsc.fit_transform(x),index=x.index,columns=x.columns)
# Repeat for test set
x_prepared_test = pd.DataFrame(stdsc.fit_transform(x_test),
                               index=x_test.index,columns=x_test.columns)
# Repeat "all" set
x_prepared_all = pd.DataFrame(stdsc.fit_transform(x_all),
                              index=x_all.index,columns=x_all.columns)
#%% Generate "x_poly" sets -----------------------
#    Pipeline uses Poly + Standard Scaling

pipeline = Pipeline([("poly_features",PolynomialFeatures(degree=2, include_bias=True)),('std_scaler',StandardScaler())])
pipeline.fit(x_prepared)
# Retrieve the column names, for book-keeping
poly_cols = pipeline.named_steps["poly_features"].get_feature_names(x_prepared.columns)
# Transform the data and re-frame the results as pandas DataFrame
x_poly=pd.DataFrame(pipeline.fit_transform(x_prepared),
                    index=x_prepared.index,columns=poly_cols)
# Repeat for test set
x_poly_test=pd.DataFrame(pipeline.fit_transform(x_prepared_test),
                         index=x_prepared_test.index,columns=poly_cols)
# repeat for "all" set
x_poly_all=pd.DataFrame(pipeline.fit_transform(x_prepared_all),
                        index=x_prepared_all.index,columns=poly_cols)

#%% Take a look at the transformations before proceeding -----------------

# Need to recombine x and y for easy correlation plotting
df_temp = x_poly.copy()
df_temp["salary"] = y
plt.figure(figsize=(10,3.5))
# Absolute magnitude--
Example #51
		return self
	def transform(self, x):
		output = x.copy()
		if self.columns is not None:
			for col in self.columns:
				output[col] = LabelEncoder().fit_transform(output[col])
		else:
			for colname, col in output.items():
				output[colname] = LabelEncoder().fit_transform(col)
		return output
	def fit_transform(self, x, y=None):
		return self.fit(x, y).transform(x)
encoding_pipeline = Pipeline([
	('encoding',MultiColumnLabelEncoder(columns=['buying','maintain','lug_boot','safety','class']))
])
dataread = encoding_pipeline.fit_transform(dataread)
out = dataread.iloc[:, 6:7]
#out=np.array(out)
inp = dataread.iloc[:, 0:6]
#inp=np.array(inp)
inp.columns.tolist()
inp['person']=inp['person'].replace(['5more'] , 5)
inp['doors']=inp['doors'].replace(['more'], 5)
print(inp)
from sklearn.model_selection import train_test_split
train_inp,test_inp,train_out,test_out=train_test_split(inp,out,train_size=0.66,test_size=0.33)
print(np.shape(train_inp))
print (dataread.head())
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=4,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0, max_features='sqrt',
                               max_leaf_nodes=None, min_impurity_decrease=0.0,
                               bootstrap=True, oob_score=False, n_jobs=1,
                               random_state=None, verbose=0, warm_start=False,
                               class_weight=None)
model.fit(train_inp, train_out.values.ravel())
Example #52
# Plot pairwise plot
sns.set_context('notebook')
sns.set_palette('dark')
sns.set_style('white')
sns.pairplot(data)
plt.show()

# Create a pipeline to pre-process the data and compare with previous result
# The custom NumPy log transformer
log_transformer = FunctionTransformer(np.log1p)
# The pipeline
estimators = [('log1p', log_transformer), ('minmaxscale', MinMaxScaler())]
pipeline = Pipeline(estimators)
# Convert the original data
data_pipe = pipeline.fit_transform(data_orig)
print("check two arrays (pipeline, no pipeline) are equal = ",np.allclose(data_pipe, data))

# Perform PCA with n_components ranging from 1 to 5. Find and plot the explained variance and feature importances
pca_list = list()
feature_weight_list = list()
# Fit a range of PCA models
for n in range(1, 6):
    # Create and fit the model
    PCAmod = PCA(n_components=n)
    PCAmod.fit(data)
    # Store the model and variance
    pca_list.append(pd.Series({'n': n, 'model': PCAmod,
                               'var': PCAmod.explained_variance_ratio_.sum()}))
    # Calculate and store feature importances
    abs_feature_values = np.abs(PCAmod.components_).sum(axis=0)
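    # Assumed continuation (the original snippet is cut off here): keep the per-feature
    # weights, normalised so they sum to one, for later comparison across models.
    feature_weight_list.append(pd.Series(abs_feature_values / abs_feature_values.sum(),
                                         index=data.columns, name='n_components_%d' % n))

# A hedged sketch (not in the original) of gathering the explained-variance results for plotting.
pca_results = pd.concat(pca_list, axis=1).T.set_index('n')
print(pca_results['var'])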
Example #53
    index_cols = features_df.columns[[0, 1, 2, -1]]
    features_df.drop(index_cols, axis=1, inplace=True)
    feature_names = features_df.columns
    X = features_df

    # train-test split and wrap output in data frame to save column names
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=41)

    # preprocess the data by setting NaN values to the mean and standard scaling
    preprocess_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),
        ('std_scaler', StandardScaler()),
    ])

    X_train = pd.DataFrame(preprocess_pipeline.fit_transform(X_train),
                           columns=feature_names)
    X_test = pd.DataFrame(preprocess_pipeline.transform(X_test),
                          columns=feature_names)

    clf = xgb.XGBClassifier(colsample_bytree=0.9,
                            n_estimators=200,
                            learning_rate=0.04,
                            max_depth=8,
                            subsample=0.9,
                            gamma=0.05,
                            objective="multi:softprob",
                            tree_method="gpu_hist")

    clf.fit(X_train, y_train)
    xgb.plot_importance(clf, max_num_features=20)
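
    # A hedged follow-up (not part of the original snippet): evaluate the fitted booster on
    # the held-out split prepared above.
    from sklearn.metrics import accuracy_score
    y_pred = clf.predict(X_test)
    print("held-out accuracy: %.3f" % accuracy_score(y_test, y_pred))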
Example #54
    def use_pipeline_with_fs(self):

        #####################
        #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
                ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
                ("selector", SelectPercentile()),
                ('clf', MultinomialNB()),
        ])


        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1,1), (1,2), (1,3)],
            'vect__use_idf': (True, False),
            'selector__score_func': (chi2, f_classif),
            'selector__percentile': (85, 95, 100),
            'clf__alpha': (0.4, 0.5)
        }

        #################
        # GridSearchCV performs an exhaustive search over the specified parameter values for the estimator,
        # using the cv splitter below; when it is "fitted", every combination of parameter values is evaluated and the best one is retained.
        #################

        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the best parameter set found by the grid search and its cross-validated score
        ###############

        best_parameters = clf_gs.best_params_
        score = clf_gs.best_score_
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        #y_predicted = clf_gs.predict(docs_test)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        score_func = best_parameters['selector__score_func']
        percentile = best_parameters['selector__percentile']
        alpha = best_parameters['clf__alpha']

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print ("Shape of train data is "+str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=score_func, percentile=percentile)

        combined_features = Pipeline([
                                        ("vect", count_vect),
                                        ("tfidf", tfidf_transformer),
                                        ("feat_select", selector)
        ])

        X_features = combined_features.fit_transform(docs_train,y_train)
        X_test_features = combined_features.transform(docs_test)

        print ("Shape of train data after feature selection is "+str(X_features.shape))
        print ("Shape of test data after feature selection is "+str(X_test_features.shape))


        # run classifier on selected features

        clf = MultinomialNB(alpha=alpha).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file,'w')

        for fb in feature_boolean:
            f.write(str(fb)+'\n')

        f.close()


        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


        #################
        # run classifier on test data
        #################


        y_predicted = clf.predict(X_test_features)

        # print the mean accuracy on the given test data and labels

        print ("Classifier score on test data is: %0.2f " % clf.score(X_test_features,y_test))

        # Print and plot the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        # import matplotlib.pyplot as plt
        # plt.matshow(cm)
        # plt.show()

        return clf, count_vect
Example #55
    def train_and_evaluate(self):
        data_dir = self.datasetprop.dir
        filename = self.datasetprop.filename
        self.dataset = self.loadDatasets(data_dir, filename, self.localTrain)
        # train_set, test_set = self.create_train_test_df(self.dataset)
        train_set = self.dataset  # using the full dataset

        # Prepare IDS Dataset   
        dataclean_pipeline = Pipeline([
            ('data_cleaner', CustomDataCleaner()),
        ])

        prepdata_pipeline = Pipeline([
            ('attribs_remover', AttributesRemover()),
            ('standard_scaler', StandardScaler()),
        ])

        ids_label_pipeline = Pipeline([
            ('label_encoder', MyLabelEncoder()),
            ('benign_encoder', BenignLabelEncoder()),
        ])
        
        train_x = train_set.copy()
        train_x = dataclean_pipeline.fit_transform(train_x)
        train_y = train_x["Label"].copy()

        train_x_prepared = prepdata_pipeline.fit_transform(train_x)
        train_y_prepared = ids_label_pipeline.fit_transform(train_y)

        # test_x = test_set.copy()
        # test_x = dataclean_pipeline.transform(test_x)
        # test_y = test_x["Label"].copy()

        # test_x_prepared = prepdata_pipeline.transform(test_x)
        # test_y_prepared = ids_label_pipeline.transform(test_y)

        # PredefinedSplit
        # my_test_fold = []
        # for _ in range(len(cleanset_prepared)):
        #     my_test_fold.append(-1)
        # for _ in range(len(anomalyset_prepared)):
        #     my_test_fold.append(0)
            
        # param_grid = [{'gamma': [0.05,0.1,0.2,0.001,0.02,0.03], 
        #             'kernel': ['rbf',], 
        #             'nu':[0.01,0.05,0.1,0.03,0.3,0.07]
        #             }]
        # estimator = OneClassSVM()

        # grid_search = GridSearchCV(estimator, 
        #                         param_grid, 
        #                         cv=PredefinedSplit(test_fold=my_test_fold),
        #                         scoring='f1_micro'
        #                         )
        # grid_search.fit(np.concatenate((cleanset_prepared,anomalyset_prepared),axis=0), 
        #                 np.concatenate((cleanset_label_prepared,anomalyset_label_prepared),axis=0)
        #             )

        # Print the cv scores.
        # cvres = grid_search.cv_results_
        # for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        #     print(mean_score, params)

        # return grid_search
        estimator = OneClassSVM(gamma=0.2, kernel='rbf', nu=0.07)

        # estimator = OneClassSVM(gamma=0.001, kernel='rbf', nu=0.001) # hyperparams are for test purpose 
        estimator.fit(train_x_prepared)
        return estimator
Example #56
# The first 100 instances are for training; the remaining are for testing

#EARLY STOPPING
from sklearn.base import clone  #Function to copy the model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler  #Function to scale the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

poly_scaler = Pipeline([
    ('poly_features', PolynomialFeatures(
        degree=2,
        include_bias=False)),  #First, performs a polynomial regression
    ('std_scaler', StandardScaler())
])  #Then, scales the values
X_train_poly_scaled = poly_scaler.fit_transform(
    X_train)  #Executes the transformation
X_val_poly_scaled = poly_scaler.transform(X_val)

#SOFTMAX REGRESSION
from sklearn.linear_model import LogisticRegression
softmax_reg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    C=10,
    max_iter=1,
    warm_start=True
)  # only one iteration is performed per call to fit(); warm_start makes each call continue from where the previous one stopped

minimum_val_error = float('inf')
best_epoch = None
best_model = None
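
# A hedged sketch of the early-stopping loop the comments above describe (assumed
# continuation; the original snippet stops here). X_val_poly_scaled, y_train and y_val are
# assumed to exist as in the surrounding snippet. deepcopy is used instead of the imported
# clone() because clone() would drop the fitted coefficients of the best model.
from copy import deepcopy

for epoch in range(1000):
    softmax_reg.fit(X_train_poly_scaled, y_train)        # one more iteration, thanks to warm_start
    y_val_predict = softmax_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:                    # keep the best model seen so far
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = deepcopy(softmax_reg)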
Example #57
        else:

            return np.c_[X, rooms_per_household, population_per_household]
        attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
        df_extra_attribs = attr_adder.transform(housing.values)


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
df_num_tr = num_pipeline.fit_transform(df_num)

from sklearn.compose import ColumnTransformer
num_attribs = list(df_num)
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

df_prepared = full_pipeline.fit_transform(df)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(df_prepared, df_labels)
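
# A hedged follow-up (not in the original snippet): measure the fit on the prepared
# training data with RMSE, the usual next step for this regression workflow.
from sklearn.metrics import mean_squared_error
import numpy as np

df_predictions = lin_reg.predict(df_prepared)
lin_rmse = np.sqrt(mean_squared_error(df_labels, df_predictions))
print("Training RMSE:", lin_rmse)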
Example #58
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='mean')),
           ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numeric_features), ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor)])
X = clf.fit_transform(X)

#splitting the dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
print(X_test)
#linear regression

from sklearn.linear_model import LinearRegression
regressor = LinearRegression(fit_intercept=False)
regressor.fit(X_train, y_train)
Example #59

# In[75]:


#Transformation Pipelines --housing_num is training data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")), 
    ('attribs_adder', CombinedAttributesAdder()), 
    ('std_scaler', StandardScaler()), 
])

housing_num_tr = num_pipeline.fit_transform(housing_num)


# In[84]:


#Full pipeline to transform both numerical and categorical attributes
from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)), 
    ('imputer', SimpleImputer(strategy = "median")), 
    ('attribs_adder', CombinedAttributesAdder()), 
Example #60
# delete everything qc!=0 except the value larger than alarming and max valid
missing = np.where((qc_cpc==2)|(qc_cpc==962)|(qc_cpc==65474))[0]
df['cpc_con'].values[missing]= np.NaN
df['diff_con'].values[missing] = np.NaN
'''
#%%
'''PRE-PROCESS DATA'''

selected_features = df.columns
scaled_features = ['cpc_con', 'diff_con']

pipe = Pipeline([('RowDropper', DataSampleDropper()),
                 ('FeatureSelector', DataFrameSelector(selected_features)),
                 ('Scale', DataScaler(scaled_features))])

processed_data = pipe.fit_transform(df)  # TODO
print(processed_data.isnull().values.any())
# PLot the training data
fig = plt.figure(figsize=(15, 5))
myFmt = DateFormatter("%H:%M:%S")
ax = fig.gca()

ax.xaxis.set_major_formatter(myFmt)

ax.plot(processed_data['cpc_con'][4000:4400],
        '.',
        linewidth=1.0,
        color='grey',
        label='ori_data')
ax.plot(processed_data['diff_con'][4000:4400],
        '.',