def plt_twodhist_season(x, y, starty, endy, bins=None, cmap=None, range=None, norm=None):
    """Plot per-season 2D histograms of x vs. y in a 2x2 panel with a shared colorbar.

    Parameters
    ----------
    x : DataArray-like with a 'time' coordinate — variable on the x-axis
        (labelled "Precipitation"); presumably an xarray.DataArray — TODO confirm
    y : DataArray-like with a 'time' coordinate — variable on the y-axis
        (labelled "Mass Fraction of Cloud Liquid + Ice Water")
    starty : str
        First year of the analysis period (used only in the figure title).
    endy : str
        Last year of the analysis period (used only in the figure title).
    bins : None or int or [int, int] or array-like or [array, array], optional
        Passed through to ``Axes.hist2d``.
    cmap : Colormap or str, optional
    range : array-like shape (2, 2), optional
        Passed through to ``Axes.hist2d``. NOTE: the name shadows the builtin
        ``range``; kept for backward compatibility with existing callers.
    norm : matplotlib.colors.Normalize, optional
    """
    f, axs = plt.subplots(2, 2, figsize=[10, 10], sharex=True, sharey=True)
    # One panel per season present in x (seasons taken from the groupby result).
    for sea, ax in zip(x.groupby('time.season').sum('time').season, axs.flatten()):
        _pp = np.asarray(x.sel(time=(x['time.season'] == sea)))
        _cl = np.asarray(y.sel(time=(y['time.season'] == sea)))
        # hist2d cannot handle NaNs: keep only pairs where both values are valid.
        # (Equivalent to the original sequential masking by isnan(_pp) then isnan(_cl).)
        valid = ~np.isnan(_pp) & ~np.isnan(_cl)
        _p = _pp[valid]
        _c = _cl[valid]
        counts, xedges, yedges, im = ax.hist2d(
            _p,  # hist2d requires (N,) arrays
            _c,
            bins=bins,
            density=False,  # raw counts per bin, not a probability density
            cmap=cmap,
            range=range,
            cmin=0.5,  # leave empty bins unfilled (count 0 drawn as blank)
            norm=norm)
        ax.set_title(sea.values)
        ax.set_xlabel('Precipitation')
        ax.set_ylabel('Mass Fraction of Cloud Liquid + Ice Water')
    f.subplots_adjust(top=.85, right=0.8)
    f.suptitle('2D Histogram ' + starty + '-' + endy, fontweight='bold')
    # Shared colorbar for the last panel's mappable, placed right of the grid.
    cbar_ax = f.add_axes([1.01, 0.15, 0.025, 0.7])
    cbar = f.colorbar(im, cax=cbar_ax)
    cbar.ax.set_ylabel('Counts')
    ax.ticklabel_format(style='sci', axis='both', scilimits=(1, 0))
    plt.tight_layout()
def classify_using_fasttext(pretrained, supervised, train_data_path, text_field_name, target_col, model_path, n_features, test_data_path=None, split_frac=0.10):
    """Classify text with a random forest over fastText sentence vectors.

    Reads train/test CSVs, optionally trains a fastText embedding model,
    vectorizes the text, trains a random forest, and prints the confusion
    matrix on a held-out split.

    Parameters
    ----------
    pretrained : bool — if False, train and save a new fastText model first
    supervised : bool — passed through to the vectorizer
    train_data_path : str — path to the training CSV
    text_field_name : str — name of the text column
    target_col : str — name of the label column
    model_path : str — where the fastText model is loaded from / saved to
    n_features : int — embedding dimensionality
    test_data_path : str, optional — path to the test CSV
    split_frac : float — held-out fraction for evaluation
    """
    reader = Read()
    # Bug fix: original referenced undefined names `train_path`/`test_path`;
    # the actual parameters are `train_data_path`/`test_data_path`.
    train, test = reader.read_from_csv(train_data_path, test_data_path)
    converter = Converter(train, test, target_col, text_field_name)
    if not pretrained:
        # Train a fastText embedding model from scratch and persist it.
        ft_trn_data = converter.preprocess_w2v_train()
        ft_model = FastTextModel()
        ft_model.train(ft_trn_data)
        ft_model.save(model_path)
    words_rows = converter.preprocess_w2v()
    vectorizer = Vectorize()
    train_data_features = vectorizer.to_fasttext_vectors(
        words_rows, supervised=supervised, model_path=model_path,
        num_features=n_features)
    # NOTE(review): hard cap of 15000 samples kept from the original — confirm
    # whether this subsampling is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features[:15000], train[target_col][:15000].values,
        test_size=split_frac, random_state=0)
    # Drop rows whose feature vector is ENTIRELY NaN/inf (e.g. empty documents).
    # NOTE(review): np.all keeps rows with only some bad values — confirm that
    # np.any was not intended here.
    train_mask = np.all(np.isnan(X_train) | np.isinf(X_train), axis=1)
    test_mask = np.all(np.isnan(X_test) | np.isinf(X_test), axis=1)
    X_train, y_train = X_train[~train_mask], y_train[~train_mask]
    X_test, y_test = X_test[~test_mask], y_test[~test_mask]
    clf_models = ClassificationModels()
    clf_models.create_random_forest()
    clf_models.train(X_train, y_train)
    preds = clf_models.predict(X_test)
    cm = clf_models.confusion_matrix(y_test, preds)
    print(cm)
def classify_using_glove(pretrained, train_data_path, text_field_name, target_col, model_path, n_features, test_data_path=None, split_frac=0.10):
    """Classify text with a random forest over GloVe/word2vec sentence vectors.

    Reads train/test CSVs, optionally trains a word2vec model, vectorizes the
    text with GloVe vectors, trains a random forest, and prints the confusion
    matrix on a held-out split.

    Parameters
    ----------
    pretrained : bool — if False, train and save a new word2vec model first
    train_data_path : str — path to the training CSV
    text_field_name : str — name of the text column
    target_col : str — name of the label column
    model_path : str — path to the GloVe/word2vec vectors; if falsy, a
        developer-machine default is used (legacy behavior)
    n_features : int — embedding dimensionality
    test_data_path : str, optional — path to the test CSV
    split_frac : float — held-out fraction for evaluation
    """
    reader = Read()
    # Bug fix: original referenced undefined names `train_path`/`test_path`;
    # the actual parameters are `train_data_path`/`test_data_path`.
    train, test = reader.read_from_csv(train_data_path, test_data_path)
    converter = Converter(train, test, target_col, text_field_name)
    # Bug fix: the original unconditionally overwrote `model_path` with a
    # hard-coded local path, defeating the parameter. Keep the old path only
    # as a fallback when no path is supplied.
    if not model_path:
        model_path = '/home/chandrashekhar/Applied/text-models/models/glove.twitter.27B/glove.twitter.27B.25d.txt'
    if not pretrained:
        # Train a word2vec embedding model from scratch and persist it.
        w2v_trn_data = converter.preprocess_w2v_train()
        from embeddings.model import W2VModel
        w2v_model = W2VModel()
        w2v_model.train(w2v_trn_data)
        w2v_model.save(model_path)
    words_rows = converter.preprocess_w2v()
    vectorizer = Vectorize()
    train_data_features = vectorizer.to_glove_vectors(words_rows, model_path, n_features)
    # NOTE(review): hard cap of 15000 samples kept from the original — confirm
    # whether this subsampling is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features[:15000], train[target_col][:15000].values,
        test_size=split_frac, random_state=0)
    # Drop rows whose feature vector is ENTIRELY NaN/inf (e.g. empty documents).
    # NOTE(review): np.all keeps rows with only some bad values — confirm that
    # np.any was not intended here.
    train_mask = np.all(np.isnan(X_train) | np.isinf(X_train), axis=1)
    test_mask = np.all(np.isnan(X_test) | np.isinf(X_test), axis=1)
    X_train, y_train = X_train[~train_mask], y_train[~train_mask]
    X_test, y_test = X_test[~test_mask], y_test[~test_mask]
    clf_models = ClassificationModels()
    clf_models.create_random_forest()
    clf_models.train(X_train, y_train)
    preds = clf_models.predict(X_test)
    cm = clf_models.confusion_matrix(y_test, preds)
    print(cm)
def calc_regression(ds, ds_result, lat, step, season, model=None):
    """Regress snowfall ('sf_*') on ice water path ('iwp_*') for one latitude band and season.

    Writes 'slope_{lat}_{lat+step}', 'intercept_{lat}_{lat+step}' and
    'rvalue_{lat}_{lat+step}' into ds_result; all three are NaN when the band
    has fewer than two valid (non-NaN) data pairs.

    Parameters
    ----------
    ds : mapping of name -> object exposing .sel(season=...) and .values
        (presumably an xarray.Dataset — TODO confirm)
    ds_result : mutable mapping that receives the three regression keys
    lat : int — lower edge of the latitude band
    step : int — band width; the band is [lat, lat + step]
    season : season label passed to .sel(season=...)
    model : unused; kept for backward compatibility with existing callers
        (original referenced it only in commented-out diagnostics)

    Returns
    -------
    ds_result, with the three keys set.
    """
    band = "{}_{}".format(lat, lat + step)
    x = ds["iwp_" + band].sel(season=season).values.flatten()
    y = ds["sf_" + band].sel(season=season).values.flatten()
    mask = ~np.isnan(y) & ~np.isnan(x)
    # Robustness fix: linregress needs at least two points; the original only
    # guarded the zero-point case and raised on exactly one valid pair.
    if mask.sum() < 2:
        slope = intercept = rvalue = np.nan
    else:
        _res = linregress(x[mask], y[mask])
        slope, intercept, rvalue = _res.slope, _res.intercept, _res.rvalue
    ds_result["slope_" + band] = slope
    ds_result["intercept_" + band] = intercept
    ds_result["rvalue_" + band] = rvalue
    return ds_result