Example No. 1
def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore
    Returns:
        scores: list of inertia scores for each k value
    '''

    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    verbose=2)
        km.fit(norm_data)
        scores.append(-1*km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
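A minimal usage sketch for explore_k, assuming the imports the example itself relies on (numpy as np, matplotlib.pyplot as plt, seaborn as sns, sklearn's KMeans and Normalizer) are in scope; fake_svd_trans is a hypothetical stand-in for the dense LSI matrix:

rng = np.random.RandomState(0)
fake_svd_trans = rng.rand(200, 50)              # stand-in for the LSI-transformed data
inertia_scores = explore_k(fake_svd_trans, np.arange(2, 11))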
Example No. 2
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)

        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()

        trunc = self.truncated.transform(tfidf_matrix)

        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as
    LSA.

    arguments:
    data -- Dataset, if use_tf_idf is True the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD
                    a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
                  preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.to_numpy()
    else:
        d = data.df.to_numpy()
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()

    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum()*100
    return e, norm.fit_transform(X)  # normalize the SVD-reduced matrix X
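A small usage sketch for preprocess, assuming TruncatedSVD and Normalizer are imported as above; the SimpleNamespace object is a hypothetical stand-in for the project's Dataset class:

import types
import numpy as np
import pandas as pd

fake_dataset = types.SimpleNamespace(tf_idf=pd.DataFrame(np.random.rand(300, 400)))
explained, X_lsa = preprocess(fake_dataset, n_components=100)
print(explained, X_lsa.shape)   # explained-variance percentage and (300, 100)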
Example No. 4
    def __init__(self, 
                 YTrain_file,
                 XTrain_file,
                 XTest_file,
                 output_path,
                 normalise,
                 C,
                 class_weight,
                 ):
        """
        Arguments:
      
        """
        self.YTrain = joblib.load(YTrain_file)
        XTrain = joblib.load(XTrain_file)
        self.XTrain = XTrain.reshape(np.size(XTrain, axis=0), -1)
       
        XTest = joblib.load(XTest_file)   
        self.XTest = XTest.reshape(np.size(XTest, axis=0), -1)

        self.output_path = output_path
    
        if normalise:
            normalizer = Normalizer(copy=False)
            normalizer.transform(self.XTrain)
            normalizer.transform(self.XTest)

        self.C = C
        if class_weight == 'none':
            class_weight = None
        self.class_weight = class_weight
Example No. 5
 def getPcaFeatures(self, images, components, image_size):
     imageDataset = self.getImagesAsDataset(images, image_size)
     norm = Normalizer()
     imageDataset = norm.fit_transform(imageDataset)
     pca = PCA(n_components=components)
     imageDataset = pca.fit_transform(imageDataset)
     return pca, norm, imageDataset
Example No. 6
def kfold(agetext,k,model,nfeatures,check=False,k2 = None,max_df=0.9,min_df=3):
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        X = agetext["text"]
        X = X.tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer = tokenize,token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',lowercase=False,max_features=nfeatures,max_df = max_df,min_df = min_df,use_idf=True,ngram_range=(1,2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t","").replace("\n","") for doc in docs]
        traindocs = docs2[:7999]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:7999]
        testl = label[8000:9500]
        if(check):
            lsa = TruncatedSVD(k2, algorithm = 'arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X,tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred),2))
    print str(out)
    print np.mean(out)
Example No. 7
def kfold(agetext,k,model,k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print counter
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm = 'arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X,y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(y_test, pred),5))
    print str(out)
    print np.mean(out)
Example No. 8
def normalize_test():
    # Normalizer expects a 2-D array of shape (n_samples, n_features)
    X = [[1, 2, 3, 4, 5, 2, 6, 8]]
    from sklearn.preprocessing import Normalizer
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)

    print(X2)
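For reference, a self-contained sketch showing that the default (l2) Normalizer simply divides each row by its Euclidean norm:

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 6.0, 8.0]])
manual = X / np.linalg.norm(X, axis=1, keepdims=True)
assert np.allclose(Normalizer().fit_transform(X), manual)
print(manual)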
Example No. 9
    def _normalize(self, X, y, X_t):
        from sklearn.preprocessing import Normalizer
        NORM = Normalizer()

        X = NORM.fit_transform(X, y)
        X_t = NORM.transform(X_t)

        return X, X_t
Example No. 10
def readAndPreProcess():
	print("\n\n********** CS-412 HW5 Mini Project **********")
	print("************ Submitted by Sankul ************\n\n")
	print("Reading data, please ensure that the dataset is in same folder.")
	resp = pd.read_csv('responses.csv')
	print("Data reading complete!")
	print("Some stats reagarding data:")
	resp.describe()
	
	print("\nStarting pre-processing.....")
	
	print("\nFinding missing values:")
	print("Missing values found, removing them")
	emptyVals = resp.isnull().sum().sort_values(ascending=False)
	emptyPlot = emptyVals.plot(kind='barh', figsize = (20,35))
	plt.show()
	print("Empty values removed")
	
	print("\nChecking for NaN and infinite values in target column (Empathy):")
	if len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]):
		print("Number of infinite or NaN values in Empathy column: ", len(resp['Empathy']) - len(resp[np.isfinite(resp['Empathy'])]))
		print("Removing them")
		resp = resp[np.isfinite(resp['Empathy'])]
		print("Infinite and NaN values removed")
		
	print("\nChecking for categorical features:")
	if resp.select_dtypes(include=[object]).shape[1] > 0:
		print("Categorical features found. Removing them...")
		resp = resp.select_dtypes(exclude=[object])	
		print("Categorical features removed")
		
	print("\nReplacing NaN values with the mean value:")
	resp=resp.fillna(resp.mean()) 
	resp.isnull().sum()
	print("Values replaced")
	
	print("\nSeperating labels from data:")
	Y = resp['Empathy'].values
	X = resp.drop('Empathy',axis=1)
	print("Labels seperated")
	
	print("\nScaling, standardizing and normalizing the data:")
	scaler = MinMaxScaler(feature_range=(0, 1))
	rescaledX = scaler.fit_transform(X)
	
	scaler = StandardScaler().fit(rescaledX)
	standardizedX = scaler.transform(rescaledX)
	
	normalizer = Normalizer().fit(standardizedX)
	normalizedX = normalizer.transform(standardizedX)
	print("Scaling, standardizing and normalizing completed")
	
	print("\nFinal data looks like:")
	print(normalizedX.shape)
	print("Values inside look like:")
	print(normalizedX[0])
	
	return normalizedX,Y
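The chained MinMaxScaler, StandardScaler and Normalizer steps above can also be expressed as a single sklearn Pipeline; a brief sketch on synthetic data:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

prep = Pipeline([
    ("rescale", MinMaxScaler(feature_range=(0, 1))),
    ("standardize", StandardScaler()),
    ("normalize", Normalizer()),
])
X_demo = np.random.rand(50, 5)
print(prep.fit_transform(X_demo)[0])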
Example No. 11
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    '''
    Performs k-means clustering on svd transformed data and plots it

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means
        n_words: number of top terms to print per cluster
    Returns:
        km: the fitted KMean object
    '''

    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5,
                verbose=2)
    km.fit(norm_data)

    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]

    terms = tfidf.get_feature_names()
    terms = prettify(terms)
    terms = np.array(terms)
    fig = plt.figure(figsize=(10, 8))
    for i in range(10):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print(' {:s}'.format(terms[ind]))
        print('\n')

        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i+1)
        ax.set_title('Cluster {:d}'.format(i+1))

        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)

        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # sorted_component = np.sort(component)
        colors = sns.color_palette('Purples', 9).as_hex()
        colors = np.repeat(colors[-1], n_words)

        cb.set_ticks(np.linspace(mn, mx, n_words+2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])
    plt.tight_layout()
    return km
Example No. 12
 def __init__(self, img_dir):
     self._imgdir = img_dir
     self._extractors = self.__get_extractors()
     self._normalizer = Normalizer()
     self._face_normalizer = Normalizer()
     self._estimator = NearestNeighbors(n_neighbors=3)
     self._face_estimator = NearestNeighbors(n_neighbors=3)
     self._imgnames = []
     self._face_imgnames = []
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)

    def transform(self, data):
        return (self.data_normalizer.transform(data) + 1) / 2
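A quick usage sketch: since the l2-normalized rows have entries in [-1, 1], the (x + 1) / 2 shift above maps them into [0, 1] (assumes numpy and the Normalizer import used by the class):

import numpy as np

sn = ScikitNormalizer()
data = np.array([[3.0, -4.0], [1.0, 1.0]])
sn.fit(data)
print(sn.transform(data))   # e.g. [[0.8, 0.1], [0.85, 0.85]]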
Example No. 14
    def test_ver2_syntetic_dataset(self):

        self.ex = experiment.Experiment()
        self.ex.cf_matrix = load_sparse_data('syntetic_cf.dat')
        n = Normalizer(norm='l2', copy=True)
        self.ex.cf_matrix = n.transform(self.ex.cf_matrix) #normalized.
        self.ex.cb_prox = experiment.Experiment.load_data(PKL + 'cb_prox.pkl')
        self.ex.cf_prox = self.ex.cf_matrix * self.ex.cf_matrix.T
        self.ex.test_corr_sparsity(draw=True, interval=100)
Example No. 15
    def reduce_dimension(self, n_components=2):
        """ Return PCA transform of self.data, with n_components. """

        reducer = PCA(n_components=n_components)

        X = self.data.values

        norm = Normalizer()
        Xnorm = norm.fit_transform(X)

        return reducer.fit_transform(Xnorm)
Example No. 16
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                          test_size=test_size, train_size=1-test_size)

        train, test = list(cv)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
Example No. 17
 def normalize(self, msi, norm="l1"):
     original_shape = msi.get_image().shape
     collapsed_image = collapse_image(msi.get_image())
     # temporarily save the mask, since the sklearn Normalizer drops it
     is_masked_array = isinstance(msi.get_image(), np.ma.MaskedArray)
     if is_masked_array:
         mask = msi.get_image().mask
     normalizer = Normalizer(norm=norm)
     normalized_image = normalizer.transform(collapsed_image)
     if is_masked_array:
         normalized_image = np.ma.MaskedArray(normalized_image, mask=mask)
     msi.set_image(np.reshape(normalized_image, original_shape))
Example No. 18
class KNN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
Example No. 19
def test_pipeline():
    norm = Normalizer(norm='l1')
    norm_id = norm.what().id()
    assert norm_id == "Normalizer(norm='l1')"
    kmeans = KMeans(n_clusters=12)
    kmeans_id = kmeans.what().id()
    print(kmeans_id)
    assert kmeans_id == \
        "KMeans(algorithm='auto',init='k-means++',max_iter=300,n_clusters=12,n_init=10,random_state=None,tol=0.0001)"
    # noinspection PyTypeChecker
    pipeline_id = Pipeline((('norm', norm), ('kmeans', kmeans))).what().id()
    assert pipeline_id == "Pipeline(steps=(('norm',%s),('kmeans',%s)))" % (norm_id, kmeans_id)
def get_tf_idf_M(M, tf = ["bin", "raw", "log", "dnorm"], idf = ["c", "smooth", "max", "prob"], norm_samps=False):
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M) #just the frequency of the word in a text
#    #TODO: check if dnorm is implemented OK
#    elif tf == "dnorm":
#        tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1))))
    if idf == "c":
        idf_v = []
        for i in range(M.shape[1]): #get the number of texts that contain a word words[i]
            idf_v.append(np.count_nonzero(M[:,i])) #count the non zero values in columns of matrix M
        idf_v = np.array(idf_v)
        idf_v = np.log(N/idf_v)
    tf_idf_M = tf_M*idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
#    np.savetxt("tf_idf_M_" + str(N) + ".txt", tf_idf_M , fmt="%s")
    return tf_idf_M
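A minimal usage sketch with a tiny hypothetical term-count matrix M (rows are documents, columns are vocabulary terms; every column needs at least one non-zero count so the idf log stays defined), assuming numpy and Normalizer are imported as the function requires:

M = np.array([[1, 0, 2],
              [0, 3, 1]])
tf_idf = get_tf_idf_M(M, tf="raw", idf="c", norm_samps=True)
print(tf_idf)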
Example No. 21
    def load_data(self):
        if not os.path.exists('features_train.txt'):
            self.feature_extraction('train.txt', 'features_train.txt')
        data_train, target_train = load_svmlight_file('features_train.txt')

        if not os.path.exists('features_test.txt'):
            self.feature_extraction('test.txt', 'features_test.txt')
        data_test, target_test = load_svmlight_file('features_test.txt')

        normalizer = Normalizer().fit(data_train)
        data_train = normalizer.transform(data_train)
        data_test = normalizer.transform(data_test)

        return data_train.toarray(), target_train, data_test.toarray(), target_test
Example No. 22
def test_normalizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Normalizer
    # with sklearn.preprocessing.Normalizer

    normalizerr = NormalizerR()
    normalizerr.fit(np.concatenate(trajs))

    normalizer = Normalizer()
    normalizer.fit(trajs)

    y_ref1 = normalizerr.transform(trajs[0])
    y1 = normalizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example No. 23
def lstm_validate(lstm_model, evaluation_dataset, create_confusion_matrix=False, number_of_subframes=0, sample_strategy="random", batch_size=32):
	
	print("evaluate neural network...")
	validation_data = []
	validation_labels = []
	
	accuracy = 0
	n = 0
	idx = 0

	
	for _obj in evaluation_dataset:
		if number_of_subframes > 0:
			validation_data.append(get_buckets(_obj.get_hoj_set(), number_of_subframes, sample_strategy))
		else:
			validation_data.append(_obj.get_hoj_set())
		validation_labels.append(_obj.get_hoj_label()[0])


	# evaluate neural network
	score, acc = lstm_model.evaluate(np.array(validation_data), np.array(validation_labels), batch_size=batch_size, verbose=0)
			
	print("Accuracy:",acc)

	if create_confusion_matrix is True:
		predictions = lstm_model.predict(np.array(validation_data),batch_size = batch_size)
		
		predicted_labels = []
		real_labels = []

		for k in range(len(predictions)):
			predicted_idx = np.argmax(predictions[k])

			label_idx = np.argmax(validation_labels[k])
			
			real_labels.append(label_idx)
			predicted_labels.append(predicted_idx)


		cnf_matrix = confusion_matrix(real_labels, predicted_labels)

		norm = Normalizer()
		cnf_matrix = norm.fit_transform(cnf_matrix)

		return score, acc, cnf_matrix


	return score, acc, None
Example No. 24
 def __init__(self, X_train, y_train, X_val, y_val):
     super().__init__()
     self.normalizer = Normalizer()
     self.normalizer.fit(X_train)
     self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
     self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
     print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))
Example No. 25
	def __init__(self, nor='nor', fold=2):
		self.fold = fold
		dataframe = pandas.read_csv(open('wine.data'))
		array = dataframe.values
		# separate array into input and output components
		self.X = array[:,1:]
		self.Y = array[:,0]
		self.nor = nor
		# normalizer can turn length of vector into 1.
		if self.nor == 'nor':
			scaler = Normalizer().fit(self.X)
		else:
			scaler = MinMaxScaler().fit(self.X)

		self.X = scaler.transform(self.X)
		numpy.set_printoptions(precision=3)
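A short sketch contrasting the two choices: Normalizer rescales each row (sample) to unit length, while MinMaxScaler rescales each column (feature) into [0, 1]:

import numpy
from sklearn.preprocessing import Normalizer, MinMaxScaler

A = numpy.array([[1.0, 3.0], [2.0, 4.0]])
print(Normalizer().fit_transform(A))    # each row has L2 norm 1
print(MinMaxScaler().fit_transform(A))  # each column spans [0, 1]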
Example No. 26
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sp.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sp.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert X_norm is not X
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert X_norm is X
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sp.coo_matrix, sp.csc_matrix, sp.lil_matrix):
        X = init(X_dense)
        X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)

        assert X_norm is not X
        assert isinstance(X_norm, sp.csr_matrix)

        X_norm = toarray(X_norm)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
Example No. 27
def test_sklearn_transform():
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer,
                                   istreams=[], ostream="out")
    context = ComputationContext(computation)

    data = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", data, None))

    assert len(context.records) == 1
    assert len(context.records["out"]) == 1

    record = context.records["out"][0]
    assert record.key == "transform"
    assert np.allclose(transformer.transform(X_test), json.loads(record.data))
Example No. 28
 def __init__(self, dataset, n_words=300, add_global_desc=True,
              color_sift=False):
     self.dataset = dataset
     self.n_words = n_words
     self.add_global_desc = add_global_desc
     self.normalizer = Normalizer(norm='l1')
     self.color_sift = color_sift
     if self.color_sift:
         self.feature_extractor = color_sift_descriptors
     else:
         self.feature_extractor = sift_descriptors
Example No. 29
def perform_classification (corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Perform graph classification using the subgraph2vec embeddings.
    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    '''
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))

    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))


    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel (gensim_model, subgraph_vocab)
    deep_kernel_svm_classify (X_train, X_test, Y_train, Y_test, subgraph_kernel)
Example No. 30
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode', ngram_range=(1,2), stop_words='english',
        sublinear_tf=True, use_idf=True, smooth_idf=True)

    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)

    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)
    
    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
from sklearn.model_selection import train_test_split

from data import load_data, parse_params
from java_bridge import JavaBridge
from common_model import CommonModel

# Create the bridge.
bridge = JavaBridge()

# Read and parse the parameters.
ps = bridge.read()
params = parse_params(ps)

# Load data from file.
train_X, train_Y = load_data(params['train_path'])
test_X, test_Y = load_data(params['test_path'])

# Normalize datasets.
if params['normalize_features']:
    norm = Normalizer().fit(train_X)
    train_X = norm.transform(train_X, copy=False)
    test_X = norm.transform(test_X, copy=False)

# Train and score the model.
model = CommonModel(params)
model.fit(train_X, train_Y, bridge)
model.score(test_X, test_Y, bridge)

# Close the bridge and end program.
bridge.end()
Example No. 32
def model_build():
    data = pd.read_csv('./result_datasets/preprocessed_data.csv')
    # splitting the data into train and test sets
    X = data.drop('Score', axis=1)
    Y = data.Score.values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.33,
                                                        stratify=Y,
                                                        random_state=42)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # # Normalising all the numerical features
    std_scaler = Normalizer()
    min_max = MinMaxScaler()

    # payment_sequential feature
    payment_sequential_train = std_scaler.fit_transform(
        X_train.payment_sequential.values.reshape(-1, 1))
    payment_sequential_test = std_scaler.transform(
        X_test.payment_sequential.values.reshape(-1, 1))

    # payment_installments feature
    payment_installments_train = std_scaler.fit_transform(
        X_train.payment_installments.values.reshape(-1, 1))
    payment_installments_test = std_scaler.transform(
        X_test.payment_installments.values.reshape(-1, 1))

    # Payment value feature
    payment_value_train = std_scaler.fit_transform(
        X_train.payment_value.values.reshape(-1, 1))
    payment_value_test = std_scaler.transform(
        X_test.payment_value.values.reshape(-1, 1))

    # price
    price_train = std_scaler.fit_transform(X_train.price.values.reshape(-1, 1))
    price_test = std_scaler.transform(X_test.price.values.reshape(-1, 1))

    # freight_value
    freight_value_train = std_scaler.fit_transform(
        X_train.freight_value.values.reshape(-1, 1))
    freight_value_test = std_scaler.transform(
        X_test.freight_value.values.reshape(-1, 1))

    # product_name_length
    product_name_length_train = std_scaler.fit_transform(
        X_train.product_name_length.values.reshape(-1, 1))
    product_name_length_test = std_scaler.transform(
        X_test.product_name_length.values.reshape(-1, 1))

    # product_description_length
    product_description_length_train = std_scaler.fit_transform(
        X_train.product_description_length.values.reshape(-1, 1))
    product_description_length_test = std_scaler.transform(
        X_test.product_description_length.values.reshape(-1, 1))

    # product_photos_qty
    product_photos_qty_train = std_scaler.fit_transform(
        X_train.product_photos_qty.values.reshape(-1, 1))
    product_photos_qty_test = std_scaler.transform(
        X_test.product_photos_qty.values.reshape(-1, 1))

    # delivery_days
    delivery_days_train = std_scaler.fit_transform(
        X_train.delivery_days.values.reshape(-1, 1))
    delivery_days_test = std_scaler.transform(
        X_test.delivery_days.values.reshape(-1, 1))

    # estimated_days
    estimated_days_train = std_scaler.fit_transform(
        X_train.estimated_days.values.reshape(-1, 1))
    estimated_days_test = std_scaler.transform(
        X_test.estimated_days.values.reshape(-1, 1))

    # ships_in
    ships_in_train = std_scaler.fit_transform(
        X_train.ships_in.values.reshape(-1, 1))
    ships_in_test = std_scaler.transform(X_test.ships_in.values.reshape(-1, 1))

    # seller_popularity
    seller_popularity_train = min_max.fit_transform(
        X_train.seller_popularity.values.reshape(-1, 1))
    seller_popularity_test = min_max.transform(
        X_test.seller_popularity.values.reshape(-1, 1))

    # # Normalising Categorical features

    # In[169]:

    # initialising the categorical encoders (CountVectorizer for text-like columns, OneHotEncoder for binary ones)

    onehot = CountVectorizer()
    cat = OneHotEncoder()
    # payment_type
    payment_type_train = onehot.fit_transform(X_train.payment_type.values)
    payment_type_test = onehot.transform(X_test.payment_type.values)

    # customer_state
    customer_state_train = onehot.fit_transform(X_train.customer_state.values)
    customer_state_test = onehot.transform(X_test.customer_state.values)

    # seller_state
    seller_state_train = onehot.fit_transform(X_train.seller_state.values)
    seller_state_test = onehot.transform(X_test.seller_state.values)

    # product_category_name
    product_category_name_train = onehot.fit_transform(
        X_train.product_category_name.values)
    product_category_name_test = onehot.transform(
        X_test.product_category_name.values)

    # arrival_time
    arrival_time_train = onehot.fit_transform(X_train.arrival_time.values)
    arrival_time_test = onehot.transform(X_test.arrival_time.values)

    # delivery_impression
    delivery_impression_train = onehot.fit_transform(
        X_train.delivery_impression.values)
    delivery_impression_test = onehot.transform(
        X_test.delivery_impression.values)

    # estimated_del_impression
    estimated_del_impression_train = onehot.fit_transform(
        X_train.estimated_del_impression.values)
    estimated_del_impression_test = onehot.transform(
        X_test.estimated_del_impression.values)

    # ship_impression
    ship_impression_train = onehot.fit_transform(
        X_train.ship_impression.values)
    ship_impression_test = onehot.transform(X_test.ship_impression.values)

    # existing_cust
    existing_cust_train = cat.fit_transform(
        X_train.existing_cust.values.reshape(-1, 1))
    existing_cust_test = cat.transform(
        X_test.existing_cust.values.reshape(-1, 1))

    # **Stacking the data**

    # stacking up all the encoded features
    X_train_vec = hstack(
        (payment_sequential_train, payment_installments_train,
         payment_value_train, price_train, freight_value_train,
         product_name_length_train, product_description_length_train,
         product_photos_qty_train, delivery_days_train, estimated_days_train,
         ships_in_train, payment_type_train, customer_state_train,
         seller_state_train, product_category_name_train, arrival_time_train,
         delivery_impression_train, estimated_del_impression_train,
         ship_impression_train, seller_popularity_train))

    X_test_vec = hstack(
        (payment_sequential_test, payment_installments_test,
         payment_value_test, price_test, freight_value_test,
         product_name_length_test, product_description_length_test,
         product_photos_qty_test, delivery_days_test, estimated_days_test,
         ships_in_test, payment_type_test, customer_state_test,
         seller_state_test, product_category_name_test, arrival_time_test,
         delivery_impression_test, estimated_del_impression_test,
         ship_impression_test, seller_popularity_test))

    print(X_train_vec.shape, X_test_vec.shape)

    # # Naive Bayes

    # # Hyper parameter Tuning

    naive = MultinomialNB(class_prior=[0.5, 0.5])

    param = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}

    # for the bow based model
    NB = GridSearchCV(naive,
                      param,
                      cv=3,
                      refit=False,
                      return_train_score=True,
                      scoring='roc_auc')
    NB.fit(X_train_vec, y_train)

    NB.best_params_

    # # Fitting the Model

    clf = MultinomialNB(alpha=0.0001, class_prior=[0.5, 0.5])
    clf.fit(X_train_vec, y_train)

    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion Matrix
    cf_matrix_train = confusion_matrix(y_train, pred_label_train)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_pred_train[:,
                                                                            1])
    fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train,
             tpr_train,
             color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test,
             tpr_test,
             color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]),
             np.array([0, 1]),
             color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    print('Best AUC for the model is {} '.format(test_auc))

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test),
                annot=True,
                fmt='.2%',
                cmap='Greens')
    plt.show()

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))

    # # Observations
    #
    # 1. Naive Bayes performed reasonably well, with minimal overfitting between train and test performance.
    # 2. Both train and test F1 scores were 0.86, with accuracy around 77%.
    # 3. However, the confusion matrix shows it misclassified many points as false positives.
    # 4. The AUC score on test data was 0.694.

    # # Logistic Regression

    # # Hyper parameter Tuning

    # we use max_iter=1000 because the default was raising a convergence exception while fitting
    Logi = LogisticRegression(max_iter=1000, solver='lbfgs')

    param = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30]}

    # for the bow based model
    LR = GridSearchCV(Logi,
                      param,
                      cv=3,
                      refit=False,
                      return_train_score=True,
                      scoring='roc_auc')
    LR.fit(X_train_vec, y_train)

    LR.best_params_

    # **NOTE**
    #
    # * For performance measurement we will not use accuracy as a metric as the data set is highly imbalanced.
    # * We will use AUC score and f1 score as performance metric.

    # model
    clf = LogisticRegression(C=0.1, max_iter=1000, solver='lbfgs')
    clf.fit(X_train_vec, y_train)

    # In[180]:

    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion Matrix
    cf_matrix_train = confusion_matrix(y_train, pred_label_train)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_pred_train[:,
                                                                            1])
    fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train,
             tpr_train,
             color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test,
             tpr_test,
             color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]),
             np.array([0, 1]),
             color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    print('Best AUC for the model is {} '.format(test_auc))

    # In[181]:

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test),
                annot=True,
                fmt='.2%',
                cmap='Greens')
    plt.show()

    # In[182]:

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    # In[183]:

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))

    # # Observations
    #
    # 1. Logistic regression performs considerably better than Naive Bayes in terms of F1 score, while the AUC score is almost the same.
    # 2. Misclassification of false positives was reduced, which raised the F1 score to 92%.
    # 3. Accuracy was 86% for both train and test, which shows the model doesn't overfit.

    # # Decision Tree

    # # Hyperparameter tuning

    # In[184]:

    # model initialize
    DT = DecisionTreeClassifier(class_weight='balanced')

    # hyper parameters
    param = {
        'max_depth': [1, 5, 10, 15, 20],
        'min_samples_split': [5, 10, 100, 300, 500, 1000]
    }

    # Grid search CV
    DT = GridSearchCV(DT,
                      param,
                      cv=3,
                      refit=False,
                      return_train_score=True,
                      scoring='roc_auc')
    DT.fit(X_train_vec, y_train)

    # In[185]:

    # best params
    DT.best_params_

    # In[186]:

    # model
    clf = DecisionTreeClassifier(class_weight='balanced',
                                 max_depth=20,
                                 min_samples_split=300)
    clf.fit(X_train_vec, y_train)

    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion Matrix
    cf_matrix_train = confusion_matrix(y_train, pred_label_train)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    # taking the probability scores instead of the predicted labels
    # predict_proba returns probability scores; the positive-class scores are in the second column
    fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_pred_train[:,
                                                                            1])
    fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train,
             tpr_train,
             color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test,
             tpr_test,
             color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]),
             np.array([0, 1]),
             color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    print('Best AUC for the model is {} '.format(test_auc))

    # In[187]:

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test),
                annot=True,
                fmt='.2%',
                cmap='Greens')
    plt.show()

    # In[188]:

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    # In[189]:

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))

    # # Observations
    #
    # 1. The Decision Tree does no better in terms of F1 score; its AUC score and accuracy come out to 0.708 and 70%.
    # 2. It misclassified a lot of points as false positives.
    # 3. The model doesn't overfit, but it doesn't perform better either.

    # # Random Forest

    # # Hyperparameter Tuning

    # In[190]:

    # param grid
    # we limit max_depth to 10 so that the model doesn't overfit
    param = {
        'min_samples_split': [5, 10, 30, 50, 100],
        'max_depth': [5, 7, 10]
    }

    # Random forest classifier
    RFclf = RandomForestClassifier(class_weight='balanced')

    # using grid search cv to tune parameters
    RF = GridSearchCV(RFclf,
                      param,
                      cv=5,
                      refit=False,
                      n_jobs=-1,
                      verbose=1,
                      return_train_score=True,
                      scoring='roc_auc')
    RF.fit(X_train_vec, y_train)

    # In[191]:

    RF.best_params_

    # In[192]:

    # model
    clf = RandomForestClassifier(class_weight='balanced',
                                 max_depth=10,
                                 min_samples_split=5)
    clf.fit(X_train_vec, y_train)

    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion Matrix
    cf_matrix_train = confusion_matrix(y_train, pred_label_train)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    # taking the probability scores instead of the predicted labels
    # predict_proba returns probability scores; the positive-class scores are in the second column
    fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_pred_train[:,
                                                                            1])
    fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train,
             tpr_train,
             color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test,
             tpr_test,
             color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]),
             np.array([0, 1]),
             color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    print('Best AUC for the model is {} '.format(test_auc))

    # In[193]:

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test),
                annot=True,
                fmt='.2%',
                cmap='Greens')
    plt.show()

    # In[194]:

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    # In[195]:

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))

    # # Observations
    #
    # 1. Random forest performs better than logistic regression in terms of f1 score and accuracy.
    # 2. It gives an f1 score of 90.13% and doesn't seem to overfit.
    # 3. Misclassification rate is still not that great.
    # 4. The AUC score is 0.718.
    # 5. Accuracy score is 83%.

    # # GBDT

    # # Hyper parameter tuning

    # In[196]:

    # param grid
    # we limit max_depth to 8 so that the model doesn't overfit
    param = {'min_samples_split': [5, 10, 30, 50], 'max_depth': [3, 5, 7, 8]}

    GBDTclf = GradientBoostingClassifier()

    clf = GridSearchCV(GBDTclf,
                       param,
                       cv=5,
                       refit=False,
                       return_train_score=True,
                       scoring='roc_auc')
    clf.fit(X_train_vec, y_train)

    # In[197]:

    # best parameters
    clf.best_params_

    # In[198]:

    import pickle

    # In[199]:

    # Model
    clf = GradientBoostingClassifier(max_depth=8, min_samples_split=5)
    clf.fit(X_train_vec, y_train)

    # save the model to disk
    Pkl_Filename = "final_model.pkl"
    with open(Pkl_Filename, 'wb') as file:
        pickle.dump(clf, file)

    # predicted value of y probabilities
    y_pred_train = clf.predict_proba(X_train_vec)
    y_pred_test = clf.predict_proba(X_test_vec)

    # predicted values of Y labels
    pred_label_train = clf.predict(X_train_vec)
    pred_label_test = clf.predict(X_test_vec)

    # Confusion Matrix
    cf_matrix_train = confusion_matrix(y_train, pred_label_train)
    cf_matrix_test = confusion_matrix(y_test, pred_label_test)

    # taking the probability scores instead of the predicted labels
    # predict_proba returns probability scores; the positive-class scores are in the second column
    fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_pred_train[:,
                                                                            1])
    fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_pred_test[:, 1])

    train_auc = round(auc(fpr_train, tpr_train), 3)
    test_auc = round(auc(fpr_test, tpr_test), 3)

    plt.plot(fpr_train,
             tpr_train,
             color='red',
             label='train-auc = ' + str(train_auc))
    plt.plot(fpr_test,
             tpr_test,
             color='blue',
             label='test-auc = ' + str(test_auc))
    plt.plot(np.array([0, 1]),
             np.array([0, 1]),
             color='black',
             label='random model auc = ' + str(0.5))
    plt.xlabel('False Positive Rate(FPR)')
    plt.ylabel('True Positive Rate(TPR)')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
    print('Best AUC for the model is {} '.format(test_auc))

    # In[200]:

    # plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cf_matrix_test / np.sum(cf_matrix_test),
                annot=True,
                fmt='.2%',
                cmap='Greens')
    plt.show()

    # In[201]:

    # f1 score
    print('Train F1_score for this model is : ',
          round(f1_score(y_train, pred_label_train), 4))
    print('Test F1_score for this model is : ',
          round(f1_score(y_test, pred_label_test), 4))

    # In[202]:

    print('Train Accuracy score for this model : ',
          round(accuracy_score(y_train, pred_label_train), 4))
    print('Test Accuracy score for this model : ',
          round(accuracy_score(y_test, pred_label_test), 4))

    # # Observations
    #
    # 1. The Gradient Boosting classifier yields the best F1 score of 0.9243 and an AUC score of 0.745.
    # 2. Misclassification of false positives and true negatives is reduced to 11%, and the true positive rate is 83%.
    # 3. The accuracy score is 86% for test and 87% for train data.
    # 4. The model overfits slightly compared to the rest of the models.

    # # Observations
    #
    # 1. We created a standard deep neural network model and trained it for 20 epochs; this resulted in an F1 score very similar to our best ML model so far, which is GBDT.
    # 2. Note that this neural network had very little hyper-parameter tuning and still gives very decent performance.
    # 3. However, the AUC score of GBDT is still better than that of the NN model.
    # 4. It is worth noting that NN-based models can be much better than conventional ML models for such problems.

    # # Results

    from prettytable import PrettyTable

    table = PrettyTable()
    table.field_names = ["Model", "F1_score", " AUC_score ", " Accuracy "]

    table.add_row(["Naive Bayes", '0.8575', '0.694', '0.7689'])
    table.add_row(["Logistic Regression", '0.9217', '0.699', '0.8605'])
    table.add_row(["Decision Tree", '0.8031', '0.713', '0.7021'])
    table.add_row([
        "Random Forest",
        '0.9013',
        '0.718',
        '0.8315',
    ])
    table.add_row(["GBDT**(BEST)", '0.9243', '0.745', '0.8651'])
    # table.add_row(["Deep NN",'0.9233','0.710','0.8629'])

    print(table)
    return
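A brief sketch of reloading the pickled GBDT model later for inference, assuming final_model.pkl was written by model_build() and that any new input row is vectorized exactly like X_test_vec above; new_row_vec is hypothetical:

import pickle

with open("final_model.pkl", "rb") as f:
    loaded_clf = pickle.load(f)
# proba = loaded_clf.predict_proba(new_row_vec)   # new_row_vec must match the training feature layout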
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
                     PolynomialFeatures(), RobustScaler(), StandardScaler(),
                     FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
                     SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
                     SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                     RFE(estimator=ExtraTreesClassifier(n_estimators=100))]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for (preprocessor, C, loss, fit_intercept) in itertools.product(
                preprocessor_list,
                [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1., 10., 50., 100.],
#Add this version of X to the list 
X_all.append(['StdSca','All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem])

#MinMax
#Apply transform only for non-categorical data
X_temp = MinMaxScaler().fit_transform(X_train[:,0:size])
X_val_temp = MinMaxScaler().fit_transform(X_val[:,0:size])
#Concatenate non-categorical data and categorical
X_con = numpy.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = numpy.concatenate((X_val_temp,X_val[:,size:]),axis=1)
#Add this version of X to the list 
X_all.append(['MinMax', 'All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem])

#Normalize
#Apply transform only for non-categorical data
X_temp = Normalizer().fit_transform(X_train[:,0:size])
X_val_temp = Normalizer().fit_transform(X_val[:,0:size])
#Concatenate non-categorical data and categorical
X_con = numpy.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = numpy.concatenate((X_val_temp,X_val[:,size:]),axis=1)
#Add this version of X to the list 
X_all.append(['Norm', 'All', X_con,X_val_con,1.0,cols,rem_cols,ranks,i_cols,i_rem])

#Impute
#Imputer is not used as no data is missing

#List of transformations
trans_list = []

for trans,name,X,X_val,v,cols_list,rem_list,rank_list,i_cols_list,i_rem_list in X_all:
    trans_list.append(trans)
Example No. 35
    "syllablesPerWord", "charactersPerWord"
]

###############################################################################
# Size Matters: Word Count as a Measure of Quality on Wikipedia FEATURES
#features_cols = ["wordCount"]
###############################################################################

# Select only the columns corresponding to the features in the list
X = data[features_cols]

# Select qualityClass as the response (y)
y = data.qualityClass

# NORMALIZE DATASET
scaler = Normalizer().fit(X)
X = scaler.transform(X)

# STANDARDIZE DATASET
#X = preprocessing.scale(X)
#y = preprocessing.scale(y)

# FEATURE SELECTION
#from sklearn.feature_selection import VarianceThreshold
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
#X = sel.fit_transform(X)

# 10-fold cross-validation with multilayer perceptron
#mlp = MLPClassifier()
#print cross_val_score(mlp, X, y, cv=10, scoring='accuracy').mean()
Example No. 36
dimension=10
pool_size=20
iteration=2000
loop=1
sigma=0.01# noise
delta=0.1# high probability
alpha=1# regularizer
alpha_2=0.1# edge delete CLUB
epsilon=8 # Ts
beta=0.15# exploration for CLUB, SCLUB and GOB
thres=0.0
state=False # False for artificial dataset, True for real dataset
lambda_list=[4]


item_feature_matrix=Normalizer().fit_transform(np.random.normal(size=(item_num, dimension)))

neighbor_num=3
ws_adj=WS_graph(user_num, neighbor_num, 0.1)
er_adj=ER_graph(user_num, 0.2)
ba_adj=BA_graph(user_num, 3)
random_weights=np.round(np.random.uniform(size=(user_num, user_num)), decimals=2)
random_weights=(random_weights.T+random_weights)/2
ws_adj=ws_adj*random_weights
er_adj=er_adj*random_weights
ba_adj=ba_adj*random_weights



true_adj=rbf_kernel(np.random.normal(size=(user_num, dimension)), gamma=0.25/dimension)
#true_adj=ws_adj
Example No. 37
def prepare_scale_train_valid_test(
    data: Union[pd.DataFrame, pd.Series],
    n_input_days: int,
    n_predict_days: int,
    test_size: float,
    s_end_date: str,
    no_shuffle: bool,
):
    """
    Prepare and scale train, validate and test data.
    Parameters
    ----------
    data: Union[pd.DataFrame, pd.Series]
        Dataframe or series of stock prices
    n_input_days: int
        Number of past days used as model input per sequence
    n_predict_days: int
        Number of future days to predict per sequence
    test_size: float
        Fraction of sequences held out for validation
    s_end_date: str
        Optional end date; rows after this date are dropped before windowing
    no_shuffle: bool
        Passed through as the shuffle flag of train_test_split
    Returns
    -------
    X_train: np.ndarray
        Array of training data.  Shape (# samples, n_inputs, 1)
    X_test: np.ndarray
        Array of validation data.  Shape (total sequences - #samples, n_inputs, 1)
    y_train: np.ndarray
        Array of training outputs.  Shape (#samples, n_days)
    y_test: np.ndarray
        Array of validation outputs.  Shape (total sequences -#samples, n_days)
    X_dates_train: np.ndarray
        Array of dates for X_train
    X_dates_test: np.ndarray
        Array of dates for X_test
    y_dates_train: np.ndarray
        Array of dates for y_train
    y_dates_test: np.ndarray
        Array of dates for y_test
    test_data: np.ndarray
        Array of prices after the specified end date
    dates_test: np.ndarray
        Array of dates after specified end date
    scaler:
        Fitted preprocesser
    """

    # Pre-process data
    if PREPROCESSER == "standardization":
        scaler = StandardScaler()

    elif PREPROCESSER == "minmax":
        scaler = MinMaxScaler()

    elif PREPROCESSER == "normalization":
        scaler = Normalizer()

    elif (PREPROCESSER == "none") or (PREPROCESSER is None):
        scaler = None
    # Test data is used for forecasting.  Takes the last n_input_days data points.
    # These points are not fed into training

    if s_end_date:
        data = data[data.index <= s_end_date]
        if n_input_days + n_predict_days > data.shape[0]:
            print("Cannot train enough input days to predict with loaded dataframe\n")
            return (
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                True,
            )

    test_data = data.iloc[-n_input_days:]
    train_data = data.iloc[:-n_input_days]

    dates = data.index
    dates_test = test_data.index
    if scaler:
        # Fit the scaler on the training slice only, so the held-out test window does not
        # leak into the fitted statistics or into the training sequences built below
        train_data = scaler.fit_transform(train_data.values.reshape(-1, 1))
        test_data = scaler.transform(test_data.values.reshape(-1, 1))
    else:
        train_data = train_data.values.reshape(-1, 1)
        test_data = test_data.values.reshape(-1, 1)

    prices = train_data

    input_dates = []
    input_prices = []
    next_n_day_prices = []
    next_n_day_dates = []

    for idx in range(len(prices) - n_input_days - n_predict_days):
        input_prices.append(prices[idx : idx + n_input_days])
        input_dates.append(dates[idx : idx + n_input_days])
        next_n_day_prices.append(
            prices[idx + n_input_days : idx + n_input_days + n_predict_days]
        )
        next_n_day_dates.append(
            dates[idx + n_input_days : idx + n_input_days + n_predict_days]
        )

    input_dates = np.asarray(input_dates)
    input_prices = np.array(input_prices)
    next_n_day_prices = np.array(next_n_day_prices)
    next_n_day_dates = np.asarray(next_n_day_dates)

    (
        X_train,
        X_valid,
        y_train,
        y_valid,
        X_dates_train,
        X_dates_valid,
        y_dates_train,
        y_dates_valid,
    ) = train_test_split(
        input_prices,
        next_n_day_prices,
        input_dates,
        next_n_day_dates,
        test_size=test_size,
        shuffle=no_shuffle,
    )
    return (
        X_train,
        X_valid,
        y_train,
        y_valid,
        X_dates_train,
        X_dates_valid,
        y_dates_train,
        y_dates_valid,
        test_data,
        dates_test,
        scaler,
        False,
    )
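# A hedged usage sketch of the function above (the toy series, the argument values and the
# assumption that PREPROCESSER has been set elsewhere, e.g. to "minmax", are all illustrative):
#prices = pd.Series(np.random.rand(300),
#                   index=pd.date_range("2020-01-01", periods=300, freq="D"))
#(X_train, X_valid, y_train, y_valid, X_dates_train, X_dates_valid,
# y_dates_train, y_dates_valid, test_data, dates_test, scaler, is_error) = \
#    prepare_scale_train_valid_test(prices, n_input_days=30, n_predict_days=5,
#                                   test_size=0.2, s_end_date="", no_shuffle=False)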
Exemplo n.º 38
0
BASELINE = 'essays'

labels = get_labels(train_partition_name, test_partition_name, BASELINE)
encoded_train_labels, original_training_labels = labels[0]
encoded_test_labels, original_test_labels = labels[1]

#
# Load essays
#

vectorizer = TfidfVectorizer(input="filename",
                             ngram_range=(1, 2),
                             sublinear_tf=True,
                             max_df=0.5,
                             max_features=30000)
transformer = Normalizer()  # Normalize frequencies to unit length

preprocessor = 'tokenized'
training_and_test_data_essays = get_features_from_text(
    train_partition_name,
    test_partition_name,
    baseline=BASELINE,
    preprocessor=preprocessor,
    vectorizer=vectorizer,
    transformer=transformer)

train_matrix_essays = training_and_test_data_essays[0]
test_matrix_essays = training_and_test_data_essays[1]

#-------------------------------------------------------------------------------------
# -----------------NN classifier... on essays--------------------------------------------
Exemplo n.º 39
0
    def apply(self, df):
        # L2-normalize each row, then run spectral clustering with the configured options
        arr = Normalizer().fit_transform(df.values)
        return sklearn.cluster.SpectralClustering(
            **self.options).fit_predict(arr)
Exemplo n.º 40
0
    def apply(self, df):
        # L2-normalize each row, then fit a Gaussian mixture and return its cluster labels
        arr = Normalizer().fit_transform(df.values)
        return sklearn.mixture.GaussianMixture(**self.options).fit_predict(arr)
Exemplo n.º 41
0
from sklearn import metrics
from sklearn.preprocessing import Normalizer
import h5py
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

traindata = pd.read_csv('data/train.csv', header=None)
testdata = pd.read_csv('data/valid.csv', header=None)


X = traindata.iloc[:,1:61]
Y = traindata.iloc[:,0]
C = testdata.iloc[:,0]
T = testdata.iloc[:,1:61]

scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# summarize transformed data
np.set_printoptions(precision=3)
#print(trainX[0:5,:])

scaler = Normalizer().fit(T)
testT = scaler.transform(T)
# summarize transformed data
np.set_printoptions(precision=3)
#print(testT[0:5,:])


y_train = np.array(Y)
y_test = np.array(C)
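# A hedged sketch of how the normalized arrays and the imported callbacks might be wired
# into a Keras model; the layer sizes, the binary (0/1) label assumption and the training
# settings are illustrative, not taken from the original script.
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([
    Dense(32, input_dim=60, activation='relu'),  # 60 input features (columns 1:61 above)
    Dense(1, activation='sigmoid'),              # assumes a binary 0/1 label in column 0
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainX, y_train, epochs=10, batch_size=64,
          validation_data=(testT, y_test),
          callbacks=[EarlyStopping(monitor='val_loss', patience=3)])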
Exemplo n.º 42
0
# Describe the columns we scaled
print(df[['age', 'diabetes', 'high_blood_pressure']].describe())
"""
    Method 2: Normalization
"""

# Create a figure with two subplots for the plots
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(9, 5))

ax1.set_title('Before Normalization')

# Plot three columns to see their behavior
sns.kdeplot(data=copd, ax=ax1)

# Define the normalizer used for the Normalization preprocessing
normal = Normalizer(norm='l2', copy=True)
df1[['age', 'diabetes', 'high_blood_pressure'
     ]] = normal.fit_transform(df1[['age', 'diabetes', 'high_blood_pressure']])

# Save the preprocessed data to a new file
df1.to_csv('preproNorm.csv', sep='\t')

# Plot the scaled data
ax2.set_title('After Normalization')
sns.kdeplot(data=df1[['age', 'diabetes', 'high_blood_pressure']], ax=ax2)

# Show the plot
plt.show()

print(" --- Normalizer ----")
Exemplo n.º 43
0
    print("Train set size: ", data_train.shape[0])

    # build classification model
    y = data_train['label'].values
    X = data_train.drop(['label'], axis=1)
    if not cognates:
        svm = LinearSVC(C=10, fit_intercept=True)
    else:
        svm = svm.SVC(C=10)

    features = [('cst', digit_col())]

    clf = pipeline.Pipeline([('union',
                              FeatureUnion(transformer_list=features,
                                           n_jobs=1)), ('scale', Normalizer()),
                             ('svm', svm)])

    clf.fit(X, y)

    if not predict_source and not predict_target:

        y = data_test['label'].values
        X = data_test.drop(['label'], axis=1)
        y_pred = clf.predict(X)

        if term_length_filter:
            result = pd.concat([
                X,
                pd.DataFrame(y_pred, columns=['prediction']),
                pd.DataFrame(y, columns=['label'])
Exemplo n.º 44
0
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=85)

# Average CV score on the training set was:0.7012173913043479
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=12, subset_list="module23.csv"),
    Normalizer(norm="l2"),
    RandomForestClassifier(bootstrap=True,
                           criterion="entropy",
                           max_features=0.05,
                           min_samples_leaf=10,
                           min_samples_split=14,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 45
0
#splitting the data frame to x and y
target = pd.DataFrame(data['CASE_STATUS'])
data = data.drop(['CASE_STATUS'], axis=1)

# In[29]:

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data,
                                                    target,
                                                    test_size=0.2)

# In[30]:

from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
normalizer.fit(X_train)
train_data = normalizer.transform(X_train)
test_data = normalizer.transform(X_test)

# In[31]:

#Dimensionality reduction : PCA

from sklearn.decomposition import PCA
import time

start_time = time.perf_counter()

pca = PCA(n_components=100)
pca = pca.fit(train_data)
Exemplo n.º 46
0
            Y.append(words[1])
    return X, Y


if __name__ == '__main__':

    # Read all the documents.
    X, Y = read_data('all_sentiment_shuffled.txt')

    # Split into training and test parts.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

    # Set up the preprocessing steps and the classifier.
    myclf=PegasosWithSVC()
    #To run the log-loss variant instead, uncomment the line below and comment out the line above
    #myclf=PegasosWithLogLoss()
    model_pl = make_pipeline(
        TfidfVectorizer(preprocessor = lambda x: x, tokenizer = lambda x: x),
        SelectKBest(k=1000),
        Normalizer(),
        myclf,
    )

    t0 = time.time()
    model_pl.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1 - t0))
    Yguess = model_pl.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))
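    # A hedged extra check (not in the original): a per-class breakdown of the predictions
    #from sklearn.metrics import classification_report
    #print(classification_report(Ytest, Yguess))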
Exemplo n.º 47
0
Arquivo: qml.py Projeto: anko9801/qpp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import qsharp
from ML import QClassifier

stdsc = StandardScaler()
iris = datasets.load_iris()
x1 = iris.data[:, 0].reshape(-1, 1)
x2 = iris.data[:, 1].reshape(-1, 1)
y = iris.target
x1_norm = stdsc.fit_transform(x1)
x2_norm = stdsc.fit_transform(x2)

normalized_set = Normalizer().transform(np.hstack([x1_norm, x2_norm]))

data_list = normalized_set.tolist()
target_list = y.tolist()
data_list_12 = []
target_list_12 = []
for (data, target) in zip(data_list, target_list):
    if target != 2:
        data_list_12.append(data)
        target_list_12.append(target)

X_train, X_test, y_train, y_test = train_test_split(data_list_12,
                                                    target_list_12,
                                                    test_size=0.2)

for (input, desire) in zip(X_test, y_test):
Exemplo n.º 48
0
        x * y - np.dot(np.outer(x, x), Theta_old_vector) -
        alpha * np.dot(A, Theta_old_vector))
    return Theta_vector


user_num = 20
item_num = 100
dimension = 5
alpha = 0.1  # regularizer
beta = 0.1  #regularizer

mu = 0.001  #step size
lambda_ = 0.1  #step size

user_feature = np.random.normal(size=(user_num, dimension))
user_feature = Normalizer().fit_transform(user_feature)
user_feature_vector = user_feature.flatten()
adj = rbf_kernel(user_feature)
lap = csgraph.laplacian(adj, normed=False)

item_feature = np.random.normal(size=(item_num, dimension))
item_feature = Normalizer().fit_transform(item_feature)

Y = np.dot(user_feature, item_feature.T) + np.random.normal(
    size=(user_num, item_num), scale=0.1)

A_true = np.kron(lap, np.identity(dimension))
A = np.identity(user_num * dimension)
Theta_matrix = np.zeros((user_num, dimension))
Theta_vector = Theta_matrix.flatten()
L = np.identity(user_num)
Exemplo n.º 49
0
def function_2(file_name, rows_to_parse):
    data = pd.read_csv(os.path.join('data', 'nycflights', file_name +'.csv'),nrows=int(rows_to_parse)) #User
    data = data.fillna(0)
    print(data)
    
    train, test = train_test_split(data,train_size=0.5, test_size=0.5)
    
    train_x = train.drop(['DepDelay','UniqueCarrier','Origin','Dest'], axis=1)
    train_y = train['DepDelay']
    test_x = test.drop(['DepDelay','UniqueCarrier','Origin','Dest'], axis=1)
    test_y = test['DepDelay']
    
    # Support Vector Machines
    GridSearch.support_vector_machine(train_x,train_y,test_x,test_y)
    
    
        
    import time
    start_time = time.time()
    GridSearch.sklearn_grid_search(train_x, train_y)
    print("--- %s seconds ---" % (time.time() - start_time))
    
    
    
    #____DASK____
    
    
    
    c = dask.distributed.Client()
    client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB')
    print(client)
    
    
    import time
    start_time = time.time()
    GridSearch.dask_grid_search(train_x, train_y)
    print(f"--- {time.time() - start_time}seconds ---")
    
    
    #DASK DELAY
    
    
    
    
    output = []
    #for x in data:
    a = dask.delayed(GridSearch.support_vector_machine)(train_x,train_y,test_x,test_y)
    print(a)
    start_time = time.time()
    a.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    output.append(a)
    b = dask.delayed(GridSearch.sklearn_grid_search)(train_x, train_y)
    print(b)
    output.append(b)
    start_time = time.time()
    b.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    c = dask.delayed(GridSearch.dask_grid_search)(train_x, train_y)
    print(c)
    output.append(c)
    start_time = time.time()
    c.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    
    total = dask.delayed(sum)(output)
    #Visualize the task graph
    total.visualize() 
    
    
    
    #Other Code:    
    clean_dataset(train_x)
    train_x = train_x.values
    train_x
    
    train_y = train_y.values
    train_y
    
    from sklearn.preprocessing import Normalizer
    x = train_x
    transformer = Normalizer().fit(x)
    
    transformer
    
    transformer.transform(x)
    
    train_x = transformer.transform(x)
    
    train_x = train_x.round(decimals=2)
    train_x
    
    train_x, train_y =  make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1, n_samples=1000)
    train_x[:5]
    
    train_y
    
    
    # Scale up: increase N, the number of times we replicate the data.
    N = 2
    X_large = da.concatenate([da.from_array(train_x, chunks=train_x.shape) for _ in range(N)])
    y_large = da.concatenate([da.from_array(train_y, chunks=train_y.shape) for _ in range(N)])
    print(X_large)
    
    clf = ParallelPostFit(LogisticRegressionCV(cv=3), scoring="r2")
    # Fit on the small in-memory data, then predict in parallel on the large dask array
    clf.fit(train_x, train_y)
    y_pred = clf.predict(X_large)
    print(y_pred)
        
        
Exemplo n.º 50
0
# 1: bus
# 0: no bus

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X)
X = imputer.transform(X)

labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
labelencoder_Y_bus = LabelEncoder()
Y_bus = labelencoder_Y_bus.fit_transform(Y_bus)

# ================================================Splitting Training/Test Data==========================================

#training and testing splitting multi class
sc_X = Normalizer()
X = sc_X.fit_transform(X)
# X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=0)
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

#training and testing splitting single class
# X_bus_train, X_bus_test, Y_bus_train, Y_bus_test = train_test_split(X,Y_bus, test_size=0.25, random_state=0)
# X_bus_train = sc_X.fit_transform(X_bus_train)

# ================================================Model Selection=======================================================

#classifier

svm_clf = SVC(kernel='rbf', random_state=0)
svm_clf_bus = SVC(kernel='rbf', random_state=0)
Exemplo n.º 51
0
tpot = TPOTRegressor(generations=10, verbosity=2)
tpot.fit(trX, trY)
print(tpot.score(teX, teY))
# Export the pipeline
tpot.export('pipeline_yield.py')

#================= use pipeline result ==========================
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, FunctionTransformer, Normalizer
from tpot.operators.preprocessors import ZeroCount

exported_pipeline = make_pipeline(ZeroCount(), Binarizer(threshold=0.17),
                                  Normalizer(norm="l1"),
                                  LassoLarsCV(normalize=True))

exported_pipeline.fit(trX, trY)
trY_pred = exported_pipeline.predict(trX)
teY_pred = exported_pipeline.predict(teX)
accuracy = exported_pipeline.score(teX, teY)

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print('MSE train: %.3f, test: %.3f' %
      (mean_squared_error(trY, trY_pred), mean_squared_error(teY, teY_pred)))
print('R^2 train: %.3f, test: %.3f' %
      (r2_score(trY, trY_pred), r2_score(teY, teY_pred)))
"""
Exemplo n.º 52
0
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC


in_encoder = Normalizer()
out_encoder = LabelEncoder()

model = SVC(kernel='linear', probability=True)


def train_model(emdTrainX, trainy):

    emdTrainX_norm = in_encoder.transform(emdTrainX)

    out_encoder.fit(trainy)
    trainy_enc = out_encoder.transform(trainy)

    model.fit(emdTrainX_norm, trainy_enc)


def test(emdTestX, trainy):
    emdTestX_norm = in_encoder.transform(emdTestX)

    yhat_class = model.predict(emdTestX_norm)
    # score_train = accuracy_score(trainy_enc, yhat_train)
    predict_names = out_encoder.inverse_transform(yhat_class)
    # print('Accuracy: train=%.3f' % (score_train*100))
    return predict_names
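# A hedged usage sketch: emdTrainX/emdTestX are assumed to be the precomputed face-embedding
# arrays and trainy the matching labels prepared elsewhere; none of them are defined in this excerpt.
#train_model(emdTrainX, trainy)
#predicted_names = test(emdTestX, trainy)
#print(predicted_names[:5])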
Exemplo n.º 53
0
    def test_random_sparse_data(self):

        n_columns = 8
        n_categories = 20

        import numpy.random as rn

        rn.seed(0)
        categories = rn.randint(50000, size=(n_columns, n_categories))

        for dt in ["int32", "float32", "float64"]:

            _X = np.array(
                [[
                    categories[j, rn.randint(n_categories)]
                    for j in range(n_columns)
                ] for i in range(100)],
                dtype=dt,
            )

            # Test this data on a bunch of possible inputs.
            for sparse in (True, False):
                for categorical_features in [
                        "all",
                    [3],
                    [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8),
                ]:
                    X = _X.copy()

                    # This appears to be the only type now working.
                    assert X.dtype == np.dtype(dt)

                    model = OneHotEncoder(
                        categorical_features=categorical_features,
                        sparse=sparse)
                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [("data", Array(n_columns))],
                                           "out")

                    X_out = model.transform(X)
                    if sparse:
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]

                    result = evaluate_transformer(spec, input_data,
                                                  output_data)

                    assert result["num_errors"] == 0

            # Test normal data inside a pipeline
            for sparse in (True, False):
                for categorical_features in [
                        "all",
                    [3],
                    [4],
                        range(2, 8),
                        range(0, 4),
                        range(0, 8),
                ]:
                    X = _X.copy()

                    model = Pipeline([
                        (
                            "OHE",
                            OneHotEncoder(
                                categorical_features=categorical_features,
                                sparse=sparse,
                            ),
                        ),
                        ("Normalizer", Normalizer()),
                    ])

                    model.fit(X)

                    # Convert the model
                    spec = sklearn.convert(model, [("data", Array(n_columns))],
                                           "out").get_spec()

                    X_out = model.transform(X)
                    if sparse:
                        X_out = X_out.todense()

                    input_data = [{"data": row} for row in X]
                    output_data = [{"out": row} for row in X_out]

                    result = evaluate_transformer(spec, input_data,
                                                  output_data)

                    assert result["num_errors"] == 0
Exemplo n.º 54
0
dummy_cols3 = [
    "dummy_living", "dummy_luminoso", "dummy_terraza", "dummy_laundry",
    "dummy_cochera", "dummy_split", "dummy_piscina", "dummy_spa",
    "dummy_acondicionado", "dummy_subte", "dummy_pozo", "dummy_balcon",
    "dummy_sum", "dummy_vigilancia"
]
#dummy_cols2=[]
#dummy_cols3=[]
distance_cols = [col for col in df_train if col.startswith('dist')]
#distance_cols=[]

cols = dummy_cols + dummy_cols2 + dummy_cols3 + distance_cols + [
    'surface_total_in_m2', 'expenses', 'rooms'
]
#
scaler = Normalizer()
scalercols = cols + ["price_usd_per_m2"]
df_train[scalercols] = scaler.fit_transform(df_train[scalercols])
df_test[scalercols] = scaler.fit_transform(df_test[scalercols])

X_train = df_train[cols]
y_train = df_train["price_usd_per_m2"]
X_test = df_test[cols]
y_test = df_test["price_usd_per_m2"]
#print("X:",X)
#print("y:",y)

v = CarlosLib.PolyDictVectorizer(sparse=False)
#print(X_train.T.to_dict())

#hasher = CarlosLib.PolyFeatureHasher()
Exemplo n.º 55
0
df = df.drop(['Surname'], axis='columns')
# There are no NULL or missing values here, so imputation is skipped
#%% Look for categorical values
# import preprocessing from sklearn
from sklearn import preprocessing
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()
# 2/3. FIT AND TRANSFORM
# apply le.fit_transform to the categorical columns
df["Geography"] = le.fit_transform(df["Geography"])
df["Gender"] = le.fit_transform(df["Gender"])
df.head()

#%% feature scaling
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
df1 = pd.DataFrame(scaler.fit_transform(df), columns=df.columns.values)

#%% Create correlation matrix
corr_matrix = df1.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns with a correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
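# A hedged follow-up step (not shown in the original excerpt): drop the flagged columns
df1 = df1.drop(columns=to_drop)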

# %%
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

###############################################################################
# Do the actual clustering
# Here we select 5,000 samples for training and 10,000 for testing.
# To actually reproduce the results in the original Tensor Sketch paper,
# select 100,000 for training.

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=5_000,
                                                    test_size=10_000,
                                                    random_state=42)

# %%
# Now scale features to the range [0, 1] to match the format of the dataset in
# the LIBSVM webpage, and then normalize to unit length as done in the
# original Tensor Sketch paper [1].

mm = make_pipeline(MinMaxScaler(), Normalizer())
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

# %%
# As a baseline, train a linear SVM on the original features and print the
# accuracy. We also measure and store accuracies and training times to
# plot them later.

results = {}

lsvm = LinearSVC()
start = time.time()
lsvm.fit(X_train, y_train)
lsvm_time = time.time() - start
lsvm_score = 100 * lsvm.score(X_test, y_test)
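# Record the baseline so it can be plotted later, as described above (the dictionary key
# is an assumption, not taken from the original):
results["LSVC"] = {"time": lsvm_time, "score": lsvm_score}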
Exemplo n.º 58
0
def train_classi(model_name, inputs, X_pos, y_pos, X, y, X_neg, y_neg):
    scaler = None
    model_type = inputs['model_type']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']

    if (model_type == "tpot"):
        logging_info("Training model... %s", str(model_type))

        from sklearn.pipeline import make_pipeline

        if (model_name == "tpot_select"):
            clf = tpot_classi(inputs)
        elif (model_name == "SVM"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.preprocessing import StandardScaler
            #from sklearn.svm import LinearSVC
            from sklearn.svm import SVC

            # Pipeline from tpot
            #clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
            # Cross validate with C vals - default is 1
            # LinearSVC does not have a predict_proba function
            clf = make_pipeline(
                StandardScaler(),
                SVC(kernel='linear',
                    probability=True,
                    random_state=0,
                    tol=1e-5))
        elif (model_name == "estimator_SVM"):

            from sklearn.ensemble import GradientBoostingClassifier
            from sklearn.feature_selection import SelectFwe, f_classif
            from sklearn.linear_model import LogisticRegression
            from sklearn.pipeline import make_pipeline, make_union
            #from sklearn.svm import LinearSVC
            from tpot.builtins import StackingEstimator
            from xgboost import XGBClassifier

            # Score on the training set was:0.968003998605
            #clf = make_pipeline(StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=9, max_features=0.05, min_samples_leaf=2, min_samples_split=17, n_estimators=100, subsample=1.0)),SelectFwe(score_func=f_classif, alpha=0.02),StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True, penalty="l2")),StackingEstimator(estimator=XGBClassifier(learning_rate=0.001, max_depth=7, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.65)),LinearSVC(C=1.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.001))

            clf = make_pipeline(
                StackingEstimator(
                    estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                         max_depth=9,
                                                         max_features=0.05,
                                                         min_samples_leaf=2,
                                                         min_samples_split=17,
                                                         n_estimators=100,
                                                         subsample=1.0)),
                SelectFwe(score_func=f_classif, alpha=0.02),
                StackingEstimator(estimator=LogisticRegression(
                    C=1.0, dual=True, penalty="l2")),
                StackingEstimator(estimator=XGBClassifier(learning_rate=0.001,
                                                          max_depth=7,
                                                          min_child_weight=16,
                                                          n_estimators=100,
                                                          nthread=1,
                                                          subsample=0.65)),
                SVC(kernel='linear', probability=True, C=1.0, tol=0.001))
        elif (model_name == "log_reg"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import ExtraTreesClassifier
            from sklearn.linear_model import LogisticRegression
            from tpot.builtins import StackingEstimator, ZeroCount

            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                ZeroCount(),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.6,
                                                   min_samples_leaf=4,
                                                   min_samples_split=6,
                                                   n_estimators=100)),
                LogisticRegression(C=15.0, dual=False, penalty="l2"))

        elif (model_name == "extra_trees"):
            from sklearn.ensemble import ExtraTreesClassifier
            from tpot.builtins import StackingEstimator

            from sklearn.pipeline import make_pipeline, make_union
            from sklearn.preprocessing import Normalizer
            from sklearn.preprocessing import FunctionTransformer
            from copy import copy

            # Score on the training set was:0.948305771055
            clf = make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    make_pipeline(
                        StackingEstimator(estimator=ExtraTreesClassifier(
                            bootstrap=False,
                            criterion="gini",
                            max_features=0.25,
                            min_samples_leaf=8,
                            min_samples_split=11,
                            n_estimators=100)), Normalizer(norm="l1"))),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.75,
                                                   min_samples_leaf=15,
                                                   min_samples_split=18,
                                                   n_estimators=100)),
                ExtraTreesClassifier(bootstrap=True,
                                     criterion="entropy",
                                     max_features=0.85,
                                     min_samples_leaf=5,
                                     min_samples_split=4,
                                     n_estimators=100))

        else:  # Random forest
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.feature_selection import VarianceThreshold
            from sklearn.preprocessing import PolynomialFeatures

            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                VarianceThreshold(threshold=0.05),
                PolynomialFeatures(degree=2,
                                   include_bias=False,
                                   interaction_only=False),
                RandomForestClassifier(bootstrap=False,
                                       criterion="entropy",
                                       max_features=0.35,
                                       min_samples_leaf=1,
                                       min_samples_split=11,
                                       n_estimators=100))

        clf.fit(X, y)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        #Training accuracy

        acc_overall_train = clf.score(X, y)
        acc_pos_train = clf.score(X_pos, y_pos)
        acc_neg_train = clf.score(X_neg, y_neg)

        res_pos = clf.predict(X_pos)
        res = clf.predict(X_neg)

        n_pos = len(X_pos)
        n_neg = len(X_neg)

        acc, acc_neg, Recall, Precision, F1_score = calc_metrics(
            res, res_pos, n_neg, n_pos)
        analyze_sizewise_accuracies(
            X_pos, res_pos, X_neg, res,
            out_comp_nm + '_size_wise_accuracies_train.png')
        train_fit_probs = clf.predict_proba(X)[:, 1]
        train_aps = sklearn_metrics_average_precision_score(y, train_fit_probs)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("Training set average precision score = %.3f" % train_aps,
                  file=fid)

        model = clf

        if hasattr(model, 'decision_function'):
            score = model.decision_function(X_neg)
            np_savetxt(out_comp_nm + '_train_neg_score.out', score)
            score = model.decision_function(X_pos)
            np_savetxt(out_comp_nm + '_train_pos_score.out', score)

    elif (model_type == "NN"):

        # Standardizing the feature matrix
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler().fit(X)

        X = scaler.transform(X)

        # Scaling X_pos and X_neg as well now for testing with them later
        X_pos = scaler.transform(X_pos)
        X_neg = scaler.transform(X_neg)

        import tensorflow as tf
        from tensorflow import keras

        #tf.enable_eager_execution() # Fix ensuing errors

        logging_info("Training model... %s", str(model_type))

        # multi-layer perceptron
        # For most problems, one could probably get decent performance (even without a
        # second optimization step) by setting the hidden layer configuration using just
        # two rules: (i) number of hidden layers equals one; and (ii) the number of
        # neurons in that layer is the mean of the neurons in the input and output layers.
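        # Worked example of that rule with illustrative numbers: 18 input features and
        # 2 output classes give a single hidden layer of (18 + 2) / 2 = 10 neurons.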
        print()
        dims = X.shape
        n_feats = dims[1]
        n_classes = 2
        logging_info("No. of nodes in input layer = %s", str(n_feats))
        logging_info("No. of nodes in output layer (since softmax) = %s",
                     str(n_classes))
        hidden_nodes = int((n_feats + n_classes) / 2)
        logging_info("No. of nodes in the one hidden layer = %s",
                     str(hidden_nodes))
        model = keras.Sequential([
            keras.layers.Dense(n_feats, activation=tf.nn.relu),
            keras.layers.Dense(hidden_nodes, activation=tf.nn.relu),
            keras.layers.Dense(n_classes, activation=tf.nn.softmax)
        ])
        #model = keras.Sequential([keras.layers.Dense(n_feats, activation = tf.nn.relu), keras.layers.Dense(n_classes, activation = tf.nn.softmax)])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        N_epochs = 1000
        model.fit(X, y, epochs=N_epochs, verbose=0)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("No. of epochs = ", N_epochs, file=fid)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        loss_overall, acc_overall_train = model.evaluate(X, y, verbose=0)
        loss_pos, acc_pos_train = model.evaluate(X_pos, y_pos, verbose=0)
        loss_neg, acc_neg_train = model.evaluate(X_neg, y_neg, verbose=0)
    else:
        print("Model type not found")

    logging_info("Finished Evaluating training accuracy.")
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("Accuracy overall train = %.3f" % acc_overall_train, file=fid)
        print("Accuracy positive train = %.3f" % acc_pos_train, file=fid)
        print("Accuracy negative train = %.3f" % acc_neg_train, file=fid)
        print("Train Precision = %.3f" % Precision, file=fid)
        print("Train Recall = %.3f" % Recall, file=fid)
        print("Train F1 score = %.3f" % F1_score, file=fid)
    return model, scaler
Exemplo n.º 59
0
def LSA():
    def connect():
        client = MongoClient()
        return client['myproject']

    def get_documents(query):
        '''
            Retrieves the tokenized timelines of the followers of a 'parent'
            account that we chose to include in the corpus (is_included: True)
        '''
        condition = {'query': query}
        tweets = db.tweets.find_one(condition)['tweet_data']
        documents = [{
            'user_id': tw['user']['id'],
            'tokens': tw['tokens']
        } for tw in tweets]
        return documents

    def display_topics(svd,
                       terms,
                       n_components,
                       n_out=7,
                       n_weight=5,
                       topic=None):
        '''
            This displays a weight measure of each topic (dimension)
            and the 'n_out' first words of these topics.
            n_weight is the number of words used to calculate the weight
            Input:
                svd: the TruncatedSVD model that has been fitted
                terms: the list of words
                n_components: The reduced dimension
                topic: by default prints all topics in the SVD, if topic (int) given
                        prints only the weight and words for that topic
                n_out: Number of words per topic to display
                n_weight: Number of words to average on to calculate the weight
                        of the topic. The smaller it is, the more spread between the
                        topics' relative weights
        '''

        if topic is None:
            for k in range(n_components):
                idx = {i: abs(j) for i, j in enumerate(svd.components_[k])}
                sorted_idx = sorted(idx.items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)
                weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
                print("T%s)" % k)
                for item in sorted_idx[0:n_out - 1]:
                    print(" %0.3f*%s" % (item[1], terms[item[0]]))
                print()
        else:
            m = max(svd.components_[topic])
            idx = {i: abs(j) for i, j in enumerate(svd.components_[topic])}
            sorted_idx = sorted(idx.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
            weight = np.mean([item[1] for item in sorted_idx[0:n_weight]])
            print("* T %s) weight: %0.2f" % (topic, weight))
            for item in sorted_idx[0:n_out - 1]:
                print(" %0.3f*%s" % (item[1], terms[item[0]]))
            print()

    def plot_clusters(svdX, y_pred, centers):
        plt.style.use('fivethirtyeight')
        f, ax1 = plt.subplots(1, 1, figsize=(16, 8), facecolor='white')
        ax1.set_xlabel("")
        ax1.set_ylabel("")
        ax1.set_title("K-Means")
        # Only plots the first 2 dimensions of the svdX matrix
        ax1.scatter(svdX[:, 0], svdX[:, 1], c=y_pred, cmap=plt.cm.Paired, s=45)
        ax1.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="black",
                    alpha=1,
                    s=150)
        ax1.axis('off')
        plt.show()

# -------------------------------------
#  Params
# -------------------------------------

    n_components = 3  # Number of dimensions for TruncatedSVD
    n_clusters = 3

    db = connect()

    # Get the already tokenized version of the timelines
    documents = get_documents('Israel')

    # This is hacky because we re-use previously tokenized documents:
    # we reassemble the tokens before tokenizing them again
    tokenized = [' '.join(doc['tokens']) for doc in documents]

    vectorizer = TfidfVectorizer(max_df=0.9,
                                 min_df=6,
                                 max_features=500,
                                 use_idf=True,
                                 strip_accents='ascii')

    # X contains token frequency for each token
    X = vectorizer.fit_transform(tokenized)

    # SVD decomposition
    svd = TruncatedSVD(n_components, random_state=10)
    svdX = svd.fit_transform(X)

    # Normalization.
    # Note: for 2 dimensions this will cause the points to be on an ellipse.
    # Comment the 2 lines below to produce more meaningful plots

    nlzr = Normalizer(copy=False)
    svdX = nlzr.fit_transform(svdX)

    # Clustering
    km = KMeans(n_clusters=n_clusters,
                init='k-means++',
                max_iter=100,
                n_init=4,
                verbose=False,
                random_state=10)
    km.fit(svdX)

    print(" --------------------- ")
    print("   Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(svdX, km.labels_, sample_size=1000))
    print(" --------------------- ")

    # Array mapping from words integer indices to actual words
    terms = vectorizer.get_feature_names()

    display_topics(svd, terms, n_components)

    # to plot the documents and clusters centers
    # Only relevant for K = 2

    y_pred = km.predict(svdX)
    centers = km.cluster_centers_

    plot_clusters(svdX, y_pred, centers)
Exemplo n.º 60
0
    else:
        os.mkdir(args.d2v_dir + pathname)

    with timed('Running Doc2Vec'):
        model = Doc2Vec(documents,
                        dm=1,
                        sample=args.sample,
                        size=args.size,
                        window=args.window,
                        min_count=args.min_count,
                        workers=args.workers)

    if args.norm:
        with timed('Norming vectors'):
            from sklearn.preprocessing import Normalizer
            nrm = Normalizer('l2')
            normed = nrm.fit_transform(model.docvecs.doctag_syn0)
            words_normed = nrm.fit_transform(model.wv.syn0)

    with timed('Saving data'):
        if args.norm:
            np.save(
                '{0}{1}/user_features_normed_{1}.npy'.format(
                    args.d2v_dir, pathname), normed)
            np.save(
                '{0}{1}/song_features_normed_{1}.npy'.format(
                    args.d2v_dir, pathname), words_normed)
        model.save('{0}{1}/model_{1}'.format(args.d2v_dir, pathname))
        with open('{0}{1}/song_indices_{1}'.format(args.d2v_dir, pathname),
                  'w') as out:
            for song in model.wv.index2word: