Example #1
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word',
                                    ngram_range=(1, 2),
                                    stop_words='english')

        reviews_text = [
            ' '.join(list(chain.from_iterable(review))) for review in reviews
        ]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)

        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()

        trunc = self.truncated.transform(tfidf_matrix)

        return tuple(
            self.kbest.transform(self.normalizer.transform(trunc))[0, :])
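For reference, the same preprocessing chain can be run end to end on a toy corpus. This is a minimal sketch, not the project's code: the documents, labels, and the small n_components/k values are invented for illustration (the class above uses n_components=50 and k=5).

# Sketch of the TfidfVectorizer -> TruncatedSVD -> Normalizer -> SelectKBest chain on toy data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest, f_classif

docs = ["good food great service", "terrible food slow service",
        "great coffee and snacks", "slow service and terrible coffee"]
labels = [1, 0, 1, 0]

vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf = vect.fit_transform(docs)                    # sparse (4, n_terms) matrix

svd = TruncatedSVD(n_components=2)                  # must stay below the number of tf-idf features
reduced = svd.fit_transform(tfidf)

norm = Normalizer().fit(reduced)                    # row-wise unit-norm scaling
kbest = SelectKBest(f_classif, k=2).fit(norm.transform(reduced), labels)

features = kbest.transform(norm.transform(reduced))
print(features.shape)                               # (4, 2)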
Example #2
File: load.py  Project: hyuntai97/lab_code
def dataloader(datadir, dataload):
    data = pd.read_csv(os.path.join(datadir, f'LUADLUSC_float32.tsv'),
                       sep='\t')
    data_copy = data.copy()
    data_copy.set_index('sample', inplace=True)

    # when dataload == 1: log-transform the whole dataset, then scale the whole dataset
    if dataload == 1:
        X = data_copy.iloc[:, :-1]
        y_target = data_copy.iloc[:, -1]

        X_columns = X.columns
        X_index = X.index

        # log transform
        X = X.apply(np.log1p)

        # scaling with sklearn Normalizer (row-wise normalization, not standard scaling despite the original comment)
        scaler = Normalizer()
        scaler.fit(X)
        X = scaler.transform(X)
        X = pd.DataFrame(X, columns=X_columns, index=X_index)

    elif dataload == 2:
        X = data_copy.iloc[:, :-1]
        y_target = data_copy.iloc[:, -1]

    elif dataload == 3:  # reduce the number of features using the c4 computational gene set
        data_gene = pd.read_csv(os.path.join(datadir, f'c4_entrez.gmt.txt'),
                                sep='\t',
                                engine='python',
                                header=None,
                                index_col=0,
                                error_bad_lines=False)  # deprecated/removed in newer pandas; on_bad_lines='skip' is the modern equivalent
        data_gene.drop(1, axis=1, inplace=True)
        data_gene.fillna(0, inplace=True)

        gene_idx = []
        for idx in data_gene.index:
            gene = data_gene.loc[idx].values.astype(int)
            gene_idx.append(gene)
        idx = []
        for i in range(len(gene_idx)):
            for j in gene_idx[i]:
                idx.append(j)
        set_idx = set(idx)
        lst_idx = list(set_idx)

        # build the new feature DataFrame
        # overlap_idx holds the entrez IDs of genes present in both the computational gene set and the original data
        overlap_idx = []
        for col in data_copy.columns[:-1]:
            if int(col.split('|')[0].split(':')[1]) in lst_idx:
                overlap_idx.append(col)
        data_copy2 = data_copy.loc[:, overlap_idx]  # newly constructed DataFrame
        data_copy2['target,LUAD:0,LUSC:1'] = data_copy['target,LUAD:0,LUSC:1']
        # split the reduced DataFrame (the original indexed data_copy here, which discarded the reduction)
        X = data_copy2.iloc[:, :-1]
        y_target = data_copy2.iloc[:, -1]

    return X, y_target
Example #3
File: tfidf.py  Project: EdwardBetts/Yulp
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)

        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()

        trunc = self.truncated.transform(tfidf_matrix)

        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
def preproc(X_train,
            X_test,
            y_train,
            y_test,
            specified_scaler,
            feature_selection,
            feature_limit=21):
    # Scale feature values
    if specified_scaler == "none":
        scaler = Normalizer()
    if specified_scaler == "standard":
        scaler = StandardScaler()  # -- works well when outliers are negligible
    # scaler = MinMaxScaler(feature_range = (0, 10)) # -- works well when outliers are negligible
    # RobustScaler ignores (does not prune) small and large outliers outside the given quantile range and scales the rest of the data
    if specified_scaler == "robust":
        scaler = RobustScaler(quantile_range=(25, 75))
    # QuantileTransformer changes the distribution and even pulls outliers in with the inliers -- good when a uniform output distribution is wanted
    if specified_scaler == "quantile":
        scaler = QuantileTransformer(
            output_distribution='uniform')  # fast and great
    # PowerTransformer finds the optimal scaling factor to stabilize variance through maximum likelihood estimation
    if specified_scaler == "power":
        scaler = PowerTransformer(method='yeo-johnson')  # great
    # Fit on training set only
    scaler.fit(X_train[feature_selection[:feature_limit]])

    # Apply transform to both the training set and the test set and standard naming conventions
    X_train = scaler.transform(X_train[feature_selection[:feature_limit]])
    X_test = scaler.transform(X_test[feature_selection[:feature_limit]])
    return X_train, X_test, y_train, y_test
Example #5
    def _test_normalizer_converter(self, norm):
        warnings.filterwarnings("ignore")
        X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]],
                     dtype=np.float32)

        # Create SKL model for testing
        model = Normalizer(norm=norm)
        model.fit(X)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(model,
                                        initial_types=[
                                            ("float_input",
                                             FloatTensorType_onnx(X.shape))
                                        ])

        # Create ONNX model by calling converter
        onnx_model = convert(onnx_ml_model, "onnx", X)
        # Get the predictions for the ONNX-ML model
        session = ort.InferenceSession(onnx_ml_model.SerializeToString())
        output_names = [
            session.get_outputs()[i].name
            for i in range(len(session.get_outputs()))
        ]
        onnx_ml_pred = [[] for i in range(len(output_names))]
        inputs = {session.get_inputs()[0].name: X}
        onnx_ml_pred = session.run(output_names, inputs)

        # Get the predictions for the ONNX model
        session = ort.InferenceSession(onnx_model.SerializeToString())
        onnx_pred = [[] for i in range(len(output_names))]
        onnx_pred = session.run(output_names, inputs)

        return onnx_ml_pred, onnx_pred
def face_classification():
    data = load('attendance/model/sample_data_face_embeddings.npz')
    trainX, trainy, testX, testy = data['arr_0'], data['arr_1'], data[
        'arr_2'], data['arr_3']
    print("Dataset: train %d,test=%d" % (trainX.shape[0], testX.shape[0]))

    in_encoder = Normalizer(norm='l2')
    in_encoder.fit(trainX)
    trainX = in_encoder.transform(trainX)
    testX = in_encoder.transform(testX)

    out_encoder = LabelEncoder()
    out_encoder.fit(trainy)
    save_pickle('attendance/model/label_encoder.pkl', out_encoder)
    trainy = out_encoder.transform(trainy)
    testy = out_encoder.transform(testy)

    model = SVC(kernel="linear", probability=True)
    model.fit(trainX, trainy)

    yhat_train = model.predict(trainX)
    yhat_test = model.predict(testX)

    #saving the model in pkl file
    save_pickle('attendance/model/svm_model.pkl', model)

    score_train = accuracy_score(trainy, yhat_train)
    score_test = accuracy_score(testy, yhat_test)
    print('Accuracy: train=%.3f, test=%.3f' %
          (score_train * 100, score_test * 100))
    print("Training is done!!!")
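The same L2-normalize-then-linear-SVC pattern can be exercised without the saved embedding file. A minimal sketch with random vectors standing in for the face embeddings; the names, shapes, and data are invented for illustration.

# Sketch of the Normalizer(norm='l2') + LabelEncoder + linear SVC pattern used above.
import numpy as np
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
trainX = rng.normal(size=(20, 128))                 # fake 128-d embeddings
trainy = np.array(['alice', 'bob'] * 10)
testX = rng.normal(size=(6, 128))
testy = np.array(['alice', 'bob'] * 3)

in_encoder = Normalizer(norm='l2').fit(trainX)      # fit() only validates; Normalizer is stateless
trainX, testX = in_encoder.transform(trainX), in_encoder.transform(testX)

out_encoder = LabelEncoder().fit(trainy)
model = SVC(kernel='linear', probability=True).fit(trainX, out_encoder.transform(trainy))
print(accuracy_score(out_encoder.transform(testy), model.predict(testX)))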
Example #7
def normalize_select(normalize, x_train=None, x_val=None):
    x_train_index = x_train.index
    x_val_index = x_val.index

    if normalize == "Normalizer":
        scaler = Normalizer()
        scaler.fit(x_train)
        x_train_new = scaler.transform(x_train)
        x_val_new = scaler.transform(x_val)
        x_train = pd.DataFrame(x_train_new,
                               columns=x_train.columns,
                               index=x_train_index)
        x_val = pd.DataFrame(x_val_new,
                             columns=x_val.columns,
                             index=x_val_index)

    if normalize == "Minmax":
        scaler = MinMaxScaler()
        scaler.fit(x_train)
        x_train_new = scaler.transform(x_train)
        x_val_new = scaler.transform(x_val)
        x_train = pd.DataFrame(x_train_new,
                               columns=x_train.columns,
                               index=x_train_index)
        x_val = pd.DataFrame(x_val_new,
                             columns=x_val.columns,
                             index=x_val_index)

    return x_train, x_val
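A hypothetical usage sketch for normalize_select(); it assumes the function above is in scope together with pandas, numpy, and the sklearn scalers it uses, and the column names and index values are invented.

# Hypothetical usage of normalize_select(); note that column names and row index are preserved.
import pandas as pd

x_train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]}, index=[101, 102, 103])
x_val = pd.DataFrame({"a": [4.0, 5.0], "b": [40.0, 50.0]}, index=[201, 202])

x_train_n, x_val_n = normalize_select("Minmax", x_train=x_train, x_val=x_val)
print(x_train_n.index.tolist())    # [101, 102, 103]
print(x_val_n.columns.tolist())    # ['a', 'b']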
Example #8
def display_plot(csv, t_size, max_neigh):
    gt = pd.read_csv(csv)
    cols = [col for col in gt.columns if col not in ['label']]
    data = gt[cols]
    target = gt['label']

    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=t_size, random_state=0)

    scaler = Normalizer()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)

    training_accuracy = []
    test_accuracy = []
    neighbors_settings = range(1, max_neigh)
    for n_neighbors in neighbors_settings:
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        clf.fit(data_train, target_train)
        training_accuracy.append(clf.score(data_train, target_train))
        test_accuracy.append(clf.score(data_test, target_test))
    plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
    plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("k")
    plt.legend()
Example #9
class NormalizerPrim(primitive):
    def __init__(self, random_state=0):
        super(NormalizerPrim, self).__init__(name='Normalizer')
        self.id = 13
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1 or l2) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of a copy / conversion). Scaling inputs to unit norms is a common operation for text classification or clustering for instance. For instance the dot product of two l2-normalized TF-IDF vectors is the cosine similarity of the vectors and is the base similarity metric for the Vector Space Model commonly used by the Information Retrieval community."
        self.hyperparams_run = {'default': True}
        self.scaler = Normalizer()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_nrmlzr".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
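As the description above says, each row is rescaled independently to unit norm, and the dot product of two L2-normalized rows is their cosine similarity. A small worked example (values chosen by hand):

# Worked example of row-wise L1/L2 normalization.
import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, 4.0],
              [1.0, 0.0]])

print(Normalizer(norm='l2').fit_transform(X))
# [[0.6 0.8]     row [3, 4] has L2 norm 5, so it becomes [3/5, 4/5]
#  [1.  0. ]]
print(Normalizer(norm='l1').fit_transform(X))
# [[0.42857143 0.57142857]     row [3, 4] has L1 norm 7
#  [1.         0.        ]]

Xn = Normalizer(norm='l2').fit_transform(X)
print(float(Xn[0] @ Xn[1]))        # 0.6 == cosine similarity of [3, 4] and [1, 0]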
Example #10
class SimpleNormalizer(Transormer):
    def __init__(self, env):
        super().__init__()
        self._env = env
        self._norm = Normalizer()
        self._norm.fit(self._gen_data(10000))

    def _gen_data(self, cap):
        data = []
        while len(data) < cap:
            data.append(self._env.reset())
            done = False
            while not done:
                action = self._env.action_space.sample()
                s, _, done, _ = self._env.step(action)
                data.append(s)
        return data

    def transform(self, state):
        res = self._norm.transform([state])[0]
        return res

    def save(self, path):
        fp = os.path.join(path, 'norm.pkl')
        with open(fp, 'wb') as f:
            pickle.dump(self._norm, f)

    def load(self, path):
        fp = os.path.join(path, 'norm.pkl')
        with open(fp, 'rb') as f:
            self._norm = pickle.load(f)
Example #11
def data_preprocess(data):
    # your code here
    # example:
    label = LabelEncoder()
    label_count = 0

    for col in data:
        if data[col].dtype == 'object':
            if len(list(data[col].unique())) <= 2:
                # Train on data
                label.fit(data[col])
                # Transform data
                data[col] = label.transform(data[col])
                label_count += 1
    x = pd.get_dummies(data)

    scaler = Normalizer()
    imputer = SimpleImputer(strategy='median')  # the old sklearn Imputer was removed; SimpleImputer (sklearn.impute) is its replacement
    imputer.fit(x)
    x = imputer.transform(x)
    scaler.fit(x)
    x = scaler.transform(x)

    # your code end
    return x
Example #12
class CatEncoder:
    def __init__(self, cat_columns, data, normalize: bool = True):
        self.cat_indexes = [data.columns.get_loc(name) for name in cat_columns]
        self.num_indexes = [
            idx for idx in range(len(data.columns))
            if idx not in self.cat_indexes
        ]
        self.encoder = preprocessing.OneHotEncoder()
        self.encoder.fit(data[cat_columns])
        self.num_columns = list(data.columns[self.num_indexes])
        self.cat_columns = cat_columns
        cat_transformed_names = self.encoder.get_feature_names_out(
            input_features=self.cat_columns)  # get_feature_names() was removed in scikit-learn 1.2
        self._transformed_column_names = self.num_columns + list(
            cat_transformed_names)
        if normalize:
            self.normalizer = Normalizer()
            self.normalizer.fit(data.iloc[:, self.num_indexes])
        else:
            self.normalizer = None

    def __call__(self, x):
        numeric = x[:, self.num_indexes]
        if self.normalizer is not None:
            numeric = self.normalizer.transform(numeric)
        categorical = self.encoder.transform(x[:, self.cat_indexes]).toarray()
        return np.concatenate((numeric, categorical), axis=1)

    @property
    def transformed_features(self):
        return self._transformed_column_names
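A hypothetical usage sketch for CatEncoder; it assumes the class above is in scope (with from sklearn import preprocessing, pandas and numpy imported, and a feature-name call that matches the installed scikit-learn version), and the toy columns and values are invented.

# Hypothetical usage of CatEncoder on a toy frame with two numeric and one categorical column.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "age":    [25, 32, 47, 51],
    "income": [50.0, 61.5, 58.0, 80.2],
    "color":  ["red", "blue", "red", "green"],
})
enc = CatEncoder(cat_columns=["color"], data=df, normalize=True)

batch = df.to_numpy(dtype=object)      # __call__ expects a 2-D array, not a DataFrame
encoded = enc(batch)
print(encoded.shape)                   # (4, 5): 2 normalized numeric columns + 3 one-hot columns
print(enc.transformed_features)        # ['age', 'income', 'color_blue', 'color_green', 'color_red']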
Example #13
def run_grid_search(config,
                    grid_search,
                    feature_vectors,
                    labels,
                    classifier_model='svm',
                    scale=True,
                    normalize=False):
    if (config.has("model")):
        classifier_model = config.get("model")
        scale = config.get("scale")
        normalize = config.get("normalize")

    print(classifier_model)
    print('Scale:', scale)
    print('Normalize:', normalize)

    if (scale):
        scaler = StandardScaler()
        scaler.fit(feature_vectors)
        feature_vectors = scaler.transform(feature_vectors)

    if (normalize):
        normalizer = Normalizer()
        normalizer.fit(feature_vectors)
        feature_vectors = normalizer.transform(feature_vectors)

    if classifier_model == 'svm':
        grid_search.fit(feature_vectors, labels)
Example #14
 def Normalizer(self):
     esti = Normalizer()
     esti.fit(self.feature_data)
     new_data = Normalizer().fit_transform(self.feature_data)
     # print(new_data.shape)
     # print(new_data)
     return new_data
Example #15
def reduce_dim(data, labels, n_components, **kwargs):
    ''' performs dimensionality reduction'''
    if kwargs['method'] == 'pca':

        matrix = data
        #transformer = Normalizer()
        #transformer.fit(matrix)

        pca = PCA(n_components=n_components, svd_solver='full')
        pca.fit(matrix)
        #return pca.fit_transform(matrix)
        #pass
        return pca.transform(matrix)

    if kwargs['method'] == 'lda':
        transformer = Normalizer()

        label = labels
        matrix = data
        transformer.fit(matrix)
        lda = LDA(n_components=n_components)
        lda.fit(transformer.transform(matrix), label)
        return lda.transform(matrix)
    #pass

    if kwargs['method'] == 'ica':

        matrix = data
        ica = ICA(n_components=n_components, random_state=0)
        return ica.fit_transform(matrix)
def generate_attendence(entities, relations, attributes, param):
    # initialize lecture attendence completely randomly
    attend = dict()
    for s in relations["s_c"]:
        for c in relations["s_c"][s]:
            attend[(s, c)] = random.choice([0, 1])
    
    # Gibbs sampling
    for t in progressbar.progressbar(range(param.time)):
        print(f"\n {np.mean(np.fromiter(attend.values(), dtype=float)) * 100}")
        
        for s in relations["s_c"]:
            for c in relations["s_c"][s]:            
                prev_attend = 0.3 * attend[(s, c)]
                phi_skill = 0
                friends_attend = summarize.beta_9 * friends_attendence(s, c, relations, attend)
                diff = attributes["d"][c]
                noise = normal()
                attend[(s, c)] = prev_attend + phi_skill + friends_attend + diff + noise
    
        all_values = np.fromiter(attend.values(), dtype=float).reshape(-1, 1)
        normalizer = Normalizer()
        normalizer.fit(all_values)
        for k in attend:
            v = np.array(attend[k]).reshape(-1, 1)
            attend[k] = float(random.random() < expit(normalizer.transform(v))[0, 0])  # take the scalar so attend keeps plain 0/1 floats
Example #17
def normalize_data(X_train, X_test):

    # remove overlap
    cut = int(X_train.shape[1] / 2)
    longX = X_train[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatX_train = X_train.reshape(
        (X_train.shape[0] * X_train.shape[1], X_train.shape[2]))
    flatX_test = X_test.reshape(
        (X_test.shape[0] * X_test.shape[1], X_test.shape[2]))

    # normalize and fit on training data
    s = Normalizer()

    # fit on training data
    s.fit(longX)

    # apply to training and test data
    longX = s.transform(longX)
    flatX_train = s.transform(flatX_train)
    flatX_test = s.transform(flatX_test)

    # reshape
    flatX_train = flatX_train.reshape((X_train.shape))
    flatX_test = flatX_test.reshape((X_test.shape))

    return flatX_train, flatX_test
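A hypothetical usage sketch for normalize_data(); it assumes the function above is in scope with numpy and Normalizer imported. The random windowed arrays follow the (samples, timesteps, features) layout the function expects.

# Hypothetical usage of normalize_data() on random windowed data.
import numpy as np

rng = np.random.default_rng(0)
X_train = rng.normal(size=(8, 10, 3))      # 8 windows of 10 timesteps with 3 channels
X_test = rng.normal(size=(4, 10, 3))

X_train_n, X_test_n = normalize_data(X_train, X_test)
print(X_train_n.shape, X_test_n.shape)     # (8, 10, 3) (4, 10, 3) -- shapes are preserved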
def prepare_X(X_raw, X_full_raw):
    X_before = np.array([e.ravel() for e in X_raw])
    X_full_before = np.array([e.ravel() for e in X_full_raw])
    n = Normalizer()
    n.fit(X_full_before)
    result = np.array([np.split(e, 100) for e in n.transform(X_before)])
    return result
Example #19
def normalization(os_list, X_test):
    X_test_scaled = []
    for i in range(len(os_list)):
        sc = Normalizer(norm='l2')
        sc.fit(os_list[i][0])
        os_list[i][0] = sc.transform(os_list[i][0])
        X_test_scaled.append(sc.transform(X_test))
    return os_list, X_test_scaled
Example #20
def test_normalizer():
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X)
        np.allclose(X_t, X_t_)
Example #21
def normalize_data(x_train, x_test):
    normalizer = Normalizer()
    # Fit our normalizer to our training data.
    normalizer.fit(x_train)
    # Transform the training data using our fitted normalizer.
    x_train = normalizer.transform(x_train)
    # Transform the testing data using our x_trained fitted normalizer.
    x_test = normalizer.transform(x_test)
    return x_train, x_test
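Worth noting for this and the surrounding examples: Normalizer is stateless. fit() only validates its input and learns nothing, because each row is rescaled by its own norm, so fitting on the training split (as done here) is harmless but, unlike StandardScaler or MinMaxScaler, not actually required. A quick check:

# Normalizer keeps no state from fit(): the result depends only on the row being transformed.
import numpy as np
from sklearn.preprocessing import Normalizer

x_train = np.array([[3.0, 4.0]])
x_test = np.array([[6.0, 8.0]])

print(Normalizer().fit(x_train).transform(x_test))   # [[0.6 0.8]]
print(Normalizer().fit(x_test).transform(x_test))    # [[0.6 0.8]] -- identical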
Example #22
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)

    def transform(self, data):
        return (self.data_normalizer.transform(data) + 1) / 2
Example #23
def test_normalizer_sparse():
    X_sparse = tosparse(X)
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X_sparse)
        np.allclose(X_t, X_t_.todense())
Example #25
def train_model(X_train, y_train):
    normalizer = Normalizer()
    normalizer.fit(X_train)
    normalized_data = normalizer.transform(X_train)

    ada_boost_classifier = AdaBoostClassifier()
    ada_boost_classifier.fit(normalized_data, y_train)

    return ada_boost_classifier, normalizer
Example #26
def pca(x_train, x_test):
    normalizer = Normalizer()
    normalizer.fit(x_train)
    x_train = normalizer.transform(x_train)
    x_test = normalizer.transform(x_test)

    pca = PCA()
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    return x_train, x_test  # the original snippet ended without returning the transformed splits
Example #27
def normalize_data(dataframe):
    """
    :param dataframe: DataFrame
    :return: DataFrame of normalized data
    """
    normal = Normalizer()
    normal.fit(dataframe)
    data = normal.transform(dataframe)
    data = pd.DataFrame(data, columns=list(dataframe))
    return data
 def test_model_normalizer(self):
     model = Normalizer(norm="l2")
     x = numpy.random.randn(10, 1).astype(numpy.int64)
     model.fit(x)
     model_onnx = convert_sklearn(model,
                                  "scikit-learn normalizer",
                                  [("input", Int64TensorType([None, 1]))],
                                  target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     self.assertTrue(len(model_onnx.graph.node) == 1)
def normalize_usecase():
    X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
    normalize = Normalizer(norm="l1")
    normalize.fit(X_train)
    X_L1_normalize = normalize.transform(X_train)
    print(X_L1_normalize)

    normalize = Normalizer(norm="l2")
    normalize.fit(X_train)
    X_L2_normalize = normalize.transform(X_train)
    print(X_L2_normalize)
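For reference, the expected output of normalize_usecase() for the matrix above, computed by hand and rounded (numpy's printed formatting will differ slightly):

# L1-normalized rows (each row divided by its sum of absolute values):
#   [[ 0.25 -0.25  0.5 ]
#    [ 1.    0.    0.  ]
#    [ 0.    0.5  -0.5 ]]
# L2-normalized rows (each row divided by its Euclidean norm):
#   [[ 0.4082 -0.4082  0.8165]
#    [ 1.      0.      0.    ]
#    [ 0.      0.7071 -0.7071]]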
Example #30
class RepresentationNormal():

    def __init__(self, norm=DEFAULT_NORM):
        self.norm = norm
        self.normalizer = Normalizer(norm=self.norm)

    def fit(self, data):
        self.normalizer.fit(data)

    def transform(self, data):
        return self.normalizer.transform(data)
    def test_onnx_normalizer_converter_raises_rt(self):
        warnings.filterwarnings("ignore")
        X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)
        model = Normalizer(norm="l1")
        model.fit(X)

        # generate test input
        onnx_ml_model = convert_sklearn(model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])
        onnx_ml_model.graph.node[0].attribute[0].s = "".encode()

        self.assertRaises(RuntimeError, convert, onnx_ml_model, "onnx", X)
Example #32
class KNN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
Example #34
def test_normalizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Normalizer
    # with sklearn.preprocessing.Normalizer

    normalizerr = NormalizerR()
    normalizerr.fit(np.concatenate(trajs))

    normalizer = Normalizer()
    normalizer.fit(trajs)

    y_ref1 = normalizerr.transform(trajs[0])
    y1 = normalizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #35
File: test_sklearn.py  Project: concord/ml
def test_sklearn_transform():
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer,
                                   istreams=[], ostream="out")
    context = ComputationContext(computation)

    data = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", data, None))

    assert len(context.records) == 1
    assert len(context.records["out"]) == 1

    record = context.records["out"][0]
    assert record.key == "transform"
    assert np.allclose(transformer.transform(X_test), json.loads(record.data))
data_np[:, [5]] = le.transform(np.ravel(data_np[:, [5]])).reshape(n_lin, 1)

# Encode label in the columns 11
le2 = LabelEncoder()
le2.fit(np.ravel(data_np[:, [10]]))
#print le2.classes_
data_np[:, [10]] = le2.transform(np.ravel(data_np[:, [10]])).reshape(n_lin, 1)

# Replace missing values by 0 for the column 16 and 17
data_np = preprocess_replace_NaN(data_np, [15, 16], 'nan')

# plot_NA_ratio_features(data_np, feature_names)

# Normalize the dataset for columns 5, 6, 7, 10, 11, 13, 14, 17 and 25
nor = Normalizer( norm='l1')
nor.fit(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))
# [0, 1, 2, 6, 11, 17, 18, 19, 20, 21, 22, 23]
data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]] = \
	nor.transform(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))

# Replace missing values for the risk_factor using a svm classifier
preprocess_missing_risk_factor(data_np)

# plot_pourcentage_result(data_np, feature_names, [17, 18, 19, 20, 21 ,22, 23])

# plot_NA_ratio_features(data_np, feature_names)


################################################################################

# # Replace all missing values for the column 12, 16 and 17 with the median value
Example #37
for C in np.arange(0.05,2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        
        svc = SVC(C=C,gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score 
            print("C, gamma, score", C, gamma, score)




#normalizer
norm = Normalizer()
norm.fit(X)
T = norm.transform(X)

X_train, X_test, y_train, y_test = train_test_split(T, y, test_size=0.3, random_state=7)

for C in np.arange(0.05,2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        
        svc = SVC(C=C,gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score 
            print("C, gamma, score", C, gamma, score)

#maxabs
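The manual double loop above can also be written as a Pipeline plus GridSearchCV, which keeps the normalization step inside the search. A minimal sketch, not the original author's code; it assumes X and y are the same feature matrix and labels used above, and the parameter grid is only illustrative.

# Sketch: Pipeline + GridSearchCV instead of the manual C/gamma loops.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([("norm", Normalizer()), ("svc", SVC())])
param_grid = {
    "svc__C": np.arange(0.05, 2, 0.05),
    "svc__gamma": np.arange(0.001, 0.1, 0.001),
}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)
print("best params:", search.best_params_, "best CV score:", search.best_score_)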
Example #38
from numpy import genfromtxt  # needed by the CSV load below
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, Normalizer
import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
from grid_search import grid_estimation

# downloading matrix of text features and assigned clusters
all_data = genfromtxt('features_and_clusters.csv', delimiter=',')

data = all_data[:, 0:29]
target = all_data[:, 29]

# normalization and scaling of data
normalizer = Normalizer()
normalizer.fit(data)
data = normalizer.transform(data)
scaler = StandardScaler()
data = scaler.fit_transform(data)

# choosing of training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=0)

#clf = svm.SVC(kernel="rbf", gamma=0.001, C=1000).fit(X_train, y_train)
clf = svm.SVC(kernel="linear", gamma=1.0, C=1).fit(X_train, y_train)

# saving of classifier, scaler and normalizer
joblib.dump(clf, 'classifier_data\\model.pkl')
joblib.dump(scaler, 'classifier_data\\scaler.pkl')
joblib.dump(normalizer, 'classifier_data\\normalizer.pkl')
    # Append new features
    newAct_train = np.zeros((activation_train.shape[0], activation_train.shape[1]+3))
    for i in range(activation_train.shape[0]):
        newAct_train[i] = np.append(activation_train[i], pttImg_sample_train[i][:3])

    newAct_valid = np.zeros((activation_valid.shape[0], activation_valid.shape[1]+3))
    for i in range(activation_valid.shape[0]):
        newAct_valid[i] = np.append(activation_valid[i], valid_pttImg[i][:3])

    newAct_test = np.zeros((activation_test.shape[0], activation_test.shape[1]+3))
    for i in range(activation_test.shape[0]):
        newAct_test[i] = np.append(activation_test[i], test_blogImg[i][:3])
    # Normalize
    normalizer = Normalizer()
    normalizer.fit(newAct_train)
    newAct_train = normalizer.transform(newAct_train)
    newAct_valid = normalizer.transform(newAct_valid)
    newAct_test = normalizer.transform(newAct_test)

    # Final model
    model3 = Sequential()
    model3.add(Dense(2, input_shape=(newAct_train.shape[1],), activation='softmax'))
    adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model3.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    print(model3.summary())
    model3.fit(newAct_train, y_train_sample, epochs=epochs3, batch_size=batch_size)

    # Evaluating by using validation data or testing data
    print("Valid:")
    scores = model3.evaluate(newAct_valid, y_valid_sample, verbose=0)
Example #40
def main():
    paths = ['C:/Data/crowdflower/train.csv', 'C:/Data/crowdflower/test.csv']    
    t = p.read_csv(paths[0])
    t2 = p.read_csv(paths[1])
    
    class LancasterTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.LancasterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

    class PorterTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.PorterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]
    
    class WordnetTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.WordNetLemmatizer()
        def __call__(self, doc):
            return [self.wnl.lemmatize(t) for t in wordpunct_tokenize(doc)]
        
    class SnowballTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.SnowballStemmer("english")
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]      
    
    
    tfidf1 = TfidfVectorizer(max_features=85000, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{3,}',sublinear_tf=1,
        ngram_range=(1, 2),tokenizer = SnowballTokenizer())
    
    tfidf2 = TfidfVectorizer(max_features=600000, strip_accents='unicode',
        analyzer='char',sublinear_tf=1,
        ngram_range=(2, 17))
    
    tfidf3 = CountVectorizer(max_features=5200, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{3,}',
        ngram_range=(1, 3),tokenizer = SnowballTokenizer())
    
    tfidf4 = CountVectorizer(max_features=1800, strip_accents='unicode',  
        analyzer='char',
        ngram_range=(2, 9))
    
    tfidf5 = TfidfVectorizer(max_features=10000, strip_accents='unicode',
        analyzer='char_wb',sublinear_tf=1,
        ngram_range=(2, 9))
    
    tfidf6 = CountVectorizer(max_features=1800, strip_accents='unicode',  
        analyzer='char_wb',
        ngram_range=(2, 9))
    
    tfidf7 = TfidfVectorizer(max_features=85000, strip_accents='unicode',  
        analyzer='word',sublinear_tf=1,
        ngram_range=(1, 2),tokenizer = SnowballTokenizer())
    
    tfidf8 = CountVectorizer(max_features=4900, strip_accents='unicode',  
        analyzer='word',
        ngram_range=(1, 3),tokenizer = SnowballTokenizer())
    
    tfidf9 = CountVectorizer(max_features=5200, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 3),tokenizer = SnowballTokenizer())
    
    tfidf10 = TfidfVectorizer(max_features=85000, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}',sublinear_tf=1,
        ngram_range=(1, 2),tokenizer = SnowballTokenizer())
    
    tfidf11 = CountVectorizer(max_features=5200, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{3,}', binary=True,
        ngram_range=(1, 3),tokenizer = SnowballTokenizer())
    
    tfidf12 = CountVectorizer(max_features=5200, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}', binary=True,
        ngram_range=(1, 3),tokenizer = SnowballTokenizer()
        )
    
    tfidf13 = CountVectorizer(max_features=5200, strip_accents='unicode',  
        analyzer='word', binary=True,
        ngram_range=(1, 3),tokenizer = SnowballTokenizer())
    
    vectorizers = [tfidf1,tfidf2,tfidf3,tfidf4,tfidf5,tfidf6, tfidf7,tfidf8,tfidf9,tfidf10,tfidf11,tfidf12,tfidf13]
    #vectorizers = [tfidf1,tfidf3,tfidf5,tfidf6]
    #vectorizers = [tfidf1]  
      
    #comment = 'full, SnowballTokenizer no RF'
    use_lsa = 0
    cv_split = 0.2
    n = int(np.round(len(t['tweet'].tolist())))
    train_end = int(np.round(n*(1-cv_split)))
    cv_beginning = int(np.round( n*(1-cv_split
                                     if cv_split > 0 else 0.8)))
    y = np.array(t.iloc[:, 4:])  # pandas removed .ix; .iloc selects columns by position as the original did
    
    train = t['tweet'].tolist()[0:train_end]
    cv_X_original = np.array(t['tweet'].tolist()[cv_beginning:])
    cv_y = np.array(y[cv_beginning:])
        
    if cv_split == 0:
        train = t['tweet'].tolist()
    else:
        y = y[0:int(np.round(len(t['tweet'].tolist())*(1-cv_split)))]   
    
    prediction_grand_all = 0
    predict_cv_grand_all = 0
    list_predictions = []
    list_predictions_test = []
    for tfid in vectorizers:    
        print('fitting vectorizer...')
        tfid.fit(t['tweet'].tolist() + t2['tweet'].tolist())
        print('transforming train set...')
        X = tfid.transform(train)
        print('transforming cv set...')
        cv_X = tfid.transform(cv_X_original)
        print('transforming test set...')
        test = tfid.transform(t2['tweet'])
        
        clf1 = MultiTaskLasso()
        clf2 = AdaBoostRegressor(learning_rate = 1,n_estimators = 10)
        clf3 = RandomForestRegressor(max_depth = 20, n_estimators = 36, max_features = 100, n_jobs = 6)
        clf4 = Ridge()       
       
        clfs = [clf4, clf3]
        lsa_classifier = [0, 1]
        prediction_all = 0
        predict_cv_all = 0
        for clf, use_lsa in zip(clfs,lsa_classifier):            
            if use_lsa == 1:
                lsa = TruncatedSVD(n_components = 100)
                print('fitting lsa...')
                lsa.fit(X, y)
                print('transforming with lsa...')
                X = lsa.transform(X)
                cv_X = lsa.transform(cv_X)
                test = lsa.transform(test)
                print('normalizing...')
                norm = Normalizer()
                norm.fit(X, y)
                X = norm.transform(X, copy= False)
                test = norm.transform(test, copy= False)
                cv_X = norm.transform(cv_X, copy= False)   
                
            else:           
                fac = p.Categorical(t['state'].tolist() + t2['state'].tolist())        
                t_matrix = u.create_t_matrix(fac.codes)  # Categorical.labels was renamed to .codes in modern pandas
                train_feat = t_matrix[0:train_end]
                cv_X_feat = t_matrix[cv_beginning:n]
                test_feat = t_matrix[n:]                        
                X = sparse.hstack([X, sparse.csr_matrix(train_feat)])
                cv_X = sparse.hstack([cv_X, sparse.csr_matrix(cv_X_feat)])
                test = sparse.hstack([test, sparse.csr_matrix(test_feat)])     
            
          
            t0 = time.time()
            print('fitting...')
            clf.fit(X, y)
            print('validating...')
            print('Train error: {0}'.format(np.sqrt(np.sum(np.array(np.array(clf.predict(X)) - y)**2) / (X.shape[0] * 24.0))))
            
            prediction = np.array(clf.predict(test))  
            prediction = np.abs(prediction*(prediction > 0))
            prediction[prediction > 1] = 1
            
            predict_cv = np.array(clf.predict(cv_X))  
                    
            predict_cv = np.abs(predict_cv*(predict_cv > 0))
            predict_cv[predict_cv > 1] = 1
            list_predictions.append(predict_cv)
            list_predictions_test.append(prediction)
            print('Cross validation error: {0}'.format(np.sqrt(np.sum(np.array(predict_cv - cv_y)**2) / (cv_X.shape[0] * 24.0))))
            predict_cv_all = predict_cv + predict_cv_all
            prediction_all = prediction + prediction_all
            print('fitted model in {0} seconds'.format(np.round(time.time() - t0, 2)))
        prediction_all /= len(clfs)*1.0
        predict_cv_all /= len(clfs)*1.0
        print('Cross validation error ensemble: {0}'.format(np.sqrt(np.sum(np.array(predict_cv_all - cv_y)**2) / (cv_X.shape[0] * 24.0))))
        prediction_grand_all = prediction_all + prediction_grand_all
        predict_cv_grand_all = predict_cv_all + predict_cv_grand_all
    
    
    prediction_grand_all /= len(vectorizers)*1.0
    predict_cv_grand_all /= len(vectorizers)*1.0
    
    print('Cross validation error grand ensemble: {0}'.format(np.sqrt(np.sum(np.array(predict_cv_grand_all - cv_y)**2) / (cv_X.shape[0] * 24.0))))
    #log.info(comment)
    #log.info(np.sqrt(np.sum(np.array(predict_cv_grand_all - cv_y)**2)/ (cv_X.shape[0]*24.0)))
    prediction = np.array(np.hstack([np.matrix(t2['id']).T, prediction_grand_all])) 
    col = '%i,' + '%f,'*23 + '%f'
    np.savetxt('C:/data/crowdflower/sklearn_prediction.csv', prediction,col, delimiter=',')  
    list_predictions.append(cv_y)
    pickle.dump(list_predictions, io.open('C:/data/crowdflower/predicts.txt','wb'))
    pickle.dump(list_predictions_test, io.open('C:/data/crowdflower/predicts_test.txt','wb'))