Exemplo n.º 1
0
def plt_twodhist_season(x,
                        y,
                        starty,
                        endy,
                        bins=None,
                        cmap=None,
                        range=None,
                        norm=None):
    """Draw one 2D histogram of ``x`` against ``y`` per season on a 2x2 grid.

    For every season label found in ``x`` the samples of ``x`` and ``y``
    belonging to that season are selected, entries where either value is
    NaN are dropped, and the remaining pairs are binned with
    ``Axes.hist2d``. A single colorbar, taken from the last panel drawn,
    is added on the right.

    Parameters:
    -----------
    x           : variable on x-axis; must support ``groupby('time.season')``,
                  ``x['time.season']`` and ``sel(time=...)``
    y           : variable on y-axis; must support ``y['time.season']`` and
                  ``sel(time=...)``
    starty      : string of analysis begin (used in the figure title)
    endy        : string of analysis end (used in the figure title)
    bins        : None or int or [int, int] or array-like or [array, array]
    cmap        : Colormap or str
    range       : array-like shape(2, 2), optional
    norm        : Normalize, optional

    The function returns nothing; it draws on a newly created figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=[10, 10], sharex=True, sharey=True)

    # Keyword arguments that are identical for every panel.
    # cmin=0.5 leaves bins with a count below 0.5 (i.e. empty bins) undrawn.
    hist_kwargs = dict(bins=bins,
                       density=False,
                       cmap=cmap,
                       range=range,
                       cmin=0.5,
                       norm=norm)

    season_labels = x.groupby('time.season').sum('time').season
    for sea, ax in zip(season_labels, axes.flatten()):
        x_vals = np.asarray(x.sel(time=(x['time.season'] == sea)))
        y_vals = np.asarray(y.sel(time=(y['time.season'] == sea)))

        # First drop the pairs whose x value is NaN ...
        x_present = ~np.isnan(x_vals)
        x_vals, y_vals = x_vals[x_present], y_vals[x_present]

        # ... then the pairs whose y value is NaN.
        y_present = ~np.isnan(y_vals)
        x_vals, y_vals = x_vals[y_present], y_vals[y_present]

        # hist2d returns (counts, xedges, yedges, image); only the image
        # is needed later, for the colorbar.
        *_, im = ax.hist2d(x_vals, y_vals, **hist_kwargs)

        ax.set_title(sea.values)
        ax.set_xlabel('Precipitation')
        ax.set_ylabel('Mass Fraction of Cloud Liquid + Ice Water')

    fig.subplots_adjust(top=.85, right=0.8)
    fig.suptitle('2D Histogram ' + starty + '-' + endy, fontweight='bold')

    # Dedicated axes for the colorbar, placed to the right of the panels.
    colorbar_axes = fig.add_axes([1.01, 0.15, 0.025, 0.7])
    colorbar = fig.colorbar(im, cax=colorbar_axes)
    colorbar.ax.set_ylabel('Counts')

    # Applied to the last panel that was drawn in the loop above.
    ax.ticklabel_format(style='sci', axis='both', scilimits=(1, 0))

    plt.tight_layout()
def classify_using_fasttext(pretrained,
                            supervised,
                            train_data_path,
                            text_field_name,
                            target_col,
                            model_path,
                            n_features,
                            test_data_path=None,
                            split_frac=0.10,
                            max_rows=15000):
    """Train a random forest on fastText features and print its confusion matrix.

    Steps: read the CSV data, optionally train and save a fastText model,
    turn the text rows into feature vectors, split them into train/test
    parts, drop rows that contain no usable feature at all, fit a random
    forest and print the confusion matrix on the held-out part.

    Parameters
    ----------
    pretrained : bool
        When falsy, a ``FastTextModel`` is trained on
        ``converter.preprocess_w2v_train()`` and saved to ``model_path``
        before vectorizing.
    supervised
        Forwarded unchanged to ``Vectorize.to_fasttext_vectors``.
    train_data_path
        First argument to ``Read.read_from_csv``.
    text_field_name
        Passed to ``Converter`` (presumably the name of the text column).
    target_col
        Column of the training frame that holds the labels.
    model_path
        Where the fastText model is saved to / loaded from.
    n_features
        Forwarded as ``num_features`` to ``to_fasttext_vectors``.
    test_data_path : optional
        Second argument to ``Read.read_from_csv``.
    split_frac : float
        ``test_size`` given to ``train_test_split``.
    max_rows : int or None
        Only the first ``max_rows`` feature rows and labels are used;
        ``None`` uses all of them. Defaults to the previously hard-coded
        15000.

    Returns
    -------
    None
        The confusion matrix is printed, not returned.
    """
    reader = Read()

    # The paths come from the function's parameters; the former names
    # ``train_path``/``test_path`` were undefined and raised NameError.
    train, test = reader.read_from_csv(train_data_path, test_data_path)

    converter = Converter(train, test, target_col, text_field_name)

    if not pretrained:
        ft_trn_data = converter.preprocess_w2v_train()
        ft_model = FastTextModel()
        ft_model.train(ft_trn_data)
        ft_model.save(model_path)

    words_rows = converter.preprocess_w2v()

    vectorizer = Vectorize()
    train_data_features = vectorizer.to_fasttext_vectors(
        words_rows,
        supervised=supervised,
        model_path=model_path,
        num_features=n_features)

    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features[:max_rows],
        train[target_col][:max_rows].values,
        test_size=split_frac,
        random_state=0)

    # A row is discarded only when *every* feature in it is NaN or inf.
    train_mask = np.all(np.isnan(X_train) | np.isinf(X_train), axis=1)
    test_mask = np.all(np.isnan(X_test) | np.isinf(X_test), axis=1)

    X_train, y_train = X_train[~train_mask], y_train[~train_mask]
    X_test, y_test = X_test[~test_mask], y_test[~test_mask]

    clf_models = ClassificationModels()
    clf_models.create_random_forest()
    clf_models.train(X_train, y_train)

    preds = clf_models.predict(X_test)

    cm = clf_models.confusion_matrix(y_test, preds)
    print(cm)
def classify_using_glove(pretrained,
                         train_data_path,
                         text_field_name,
                         target_col,
                         model_path,
                         n_features,
                         test_data_path=None,
                         split_frac=0.10,
                         max_rows=15000):
    """Train a random forest on GloVe features and print its confusion matrix.

    Steps: read the CSV data, optionally train and save a ``W2VModel``,
    turn the text rows into feature vectors with
    ``Vectorize.to_glove_vectors``, split them into train/test parts, drop
    rows that contain no usable feature at all, fit a random forest and
    print the confusion matrix on the held-out part.

    Parameters
    ----------
    pretrained : bool
        When falsy, a ``W2VModel`` is trained on
        ``converter.preprocess_w2v_train()`` and saved to ``model_path``
        before vectorizing.
    train_data_path
        First argument to ``Read.read_from_csv``.
    text_field_name
        Passed to ``Converter`` (presumably the name of the text column).
    target_col
        Column of the training frame that holds the labels.
    model_path
        Path of the embedding file handed to ``to_glove_vectors``. When
        ``None``, the previously hard-coded GloVe Twitter 25d path is used.
    n_features
        Forwarded to ``to_glove_vectors``.
    test_data_path : optional
        Second argument to ``Read.read_from_csv``.
    split_frac : float
        ``test_size`` given to ``train_test_split``.
    max_rows : int or None
        Only the first ``max_rows`` feature rows and labels are used;
        ``None`` uses all of them. Defaults to the previously hard-coded
        15000.

    Returns
    -------
    None
        The confusion matrix is printed, not returned.
    """
    reader = Read()

    # The paths come from the function's parameters; the former names
    # ``train_path``/``test_path`` were undefined and raised NameError.
    train, test = reader.read_from_csv(train_data_path, test_data_path)

    converter = Converter(train, test, target_col, text_field_name)

    # Respect the caller's model_path. It used to be overwritten
    # unconditionally, which also meant the "not pretrained" branch below
    # would have saved a freshly trained model over the GloVe file.
    if model_path is None:
        model_path = '/home/chandrashekhar/Applied/text-models/models/glove.twitter.27B/glove.twitter.27B.25d.txt'

    if not pretrained:
        w2v_trn_data = converter.preprocess_w2v_train()
        from embeddings.model import W2VModel
        w2v_model = W2VModel()
        w2v_model.train(w2v_trn_data)
        w2v_model.save(model_path)

    words_rows = converter.preprocess_w2v()

    vectorizer = Vectorize()
    train_data_features = vectorizer.to_glove_vectors(
        words_rows, model_path, n_features)

    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features[:max_rows],
        train[target_col][:max_rows].values,
        test_size=split_frac,
        random_state=0)

    # A row is discarded only when *every* feature in it is NaN or inf.
    train_mask = np.all(np.isnan(X_train) | np.isinf(X_train), axis=1)
    test_mask = np.all(np.isnan(X_test) | np.isinf(X_test), axis=1)

    X_train, y_train = X_train[~train_mask], y_train[~train_mask]
    X_test, y_test = X_test[~test_mask], y_test[~test_mask]

    clf_models = ClassificationModels()
    clf_models.create_random_forest()
    clf_models.train(X_train, y_train)

    preds = clf_models.predict(X_test)

    cm = clf_models.confusion_matrix(y_test, preds)
    print(cm)
def calc_regression(ds, ds_result, lat, step, season, model=None):
    """Fit a linear regression for one latitude band and store the result.

    The variables ``iwp_<lat>_<lat+step>`` (predictor) and
    ``sf_<lat>_<lat+step>`` (response) are read from ``ds`` for the given
    ``season`` and flattened. Pairs in which either value is NaN are
    ignored. The slope, intercept and r-value of ``linregress`` are written
    to ``ds_result`` under ``slope_…``, ``intercept_…`` and ``rvalue_…``
    with the same band suffix; if no valid pair remains, NaN is stored
    for all three.

    Parameters
    ----------
    ds : mapping of variables supporting ``.sel(season=...).values``
    ds_result : mapping that receives the three result entries (modified
        in place)
    lat : lower edge of the band, used to build the variable names
    step : band width; ``lat + step`` is the second part of the names
    season : value passed to ``.sel(season=...)``
    model : unused

    Returns
    -------
    The same ``ds_result`` object that was passed in.
    """
    band = "{}_{}".format(lat, lat + step)

    predictor = ds["iwp_" + band].sel(season=season).values.flatten()
    response = ds["sf_" + band].sel(season=season).values.flatten()

    # Keep only the positions where both series hold a number.
    usable = ~(np.isnan(response) | np.isnan(predictor))

    predictor_ok = predictor[usable]
    if predictor_ok.size == 0:
        # Nothing to fit for this band/season.
        slope = intercept = rvalue = np.nan
    else:
        fit = linregress(predictor_ok, response[usable])
        slope, intercept, rvalue = fit.slope, fit.intercept, fit.rvalue

    ds_result["slope_" + band] = slope
    ds_result["intercept_" + band] = intercept
    ds_result["rvalue_" + band] = rvalue

    return ds_result