def make_model(filename, verbose=False):
    """Fit a RegressionModel on the stored 50000-sample data and dump it.

    The rows are shuffled, the first 75% are used for training and the
    remainder for testing, and the fitted model is written with joblib.

    Args:
        filename: Destination passed to ``joblib.dump``.
        verbose: When true, print the dump location followed by the
            model's score on the training and testing partitions.
    """
    features = load_obj('x-50000-samples-14:00')
    targets = load_obj('y-50000-samples-14:00')

    # Shuffle the row indices, then cut them 75/25 into train/test.
    sample_count = features.shape[0]
    order = np.arange(0, sample_count)
    np.random.shuffle(order)
    cut = int(sample_count * 0.75)
    train_rows, test_rows = order[0:cut], order[cut:]

    x_train, y_train = features[train_rows], targets[train_rows]
    x_test, y_test = features[test_rows], targets[test_rows]

    model = RegressionModel(no_hidden=300)
    model.fit(x_train, y_train)
    joblib.dump(model, filename)

    if not verbose:
        return

    print('Dumped model to disk: {}'.format(filename))
    print('Training score: {}'.format(model.score(x_train, y_train)))
    print('Testing score: {}'.format(model.score(x_test, y_test)))
def plot():
    """Show a scatter matrix of the first 1000 stored samples.

    Columns 0..10 of the x array are labelled 'queue', 'doc1'..'doc10';
    the y values are appended as a final column labelled 'time'.
    """
    column_labels = (
        ['queue']
        + ['doc{}'.format(k) for k in range(1, 11)]
        + ['time']
    )

    features = load_obj('x-50000-samples-14:00')[0:1000, 0:11]
    # Indexing with None adds a trailing axis so y can be concatenated
    # column-wise with the feature block.
    target_column = load_obj('y-50000-samples-14:00')[0:1000, None]

    table = np.concatenate((features, target_column), axis=1)
    frame = pandas.DataFrame(table, columns=column_labels)

    # Pairwise scatter plots between every pair of columns.
    scatter_matrix(frame)
    plt.show()
def process_sequence(obj):
    """Append the transformed mesh area to each tf file of one category.

    For every 'txt' file listed under ``<pth_root_an>/<obj>/ply`` the
    matching ``model_normalized.obj`` mesh is loaded from ``pth_root_sn``,
    shifted and scaled by the transform read from the txt file, and its
    area is appended to that txt file as a new line with six decimals.

    The shared counter ``num_samples_done`` is bumped once per file and
    ``finished_seqs`` once when the whole category is done.
    (Presumably both are multiprocessing values - verify against caller.)

    Args:
        obj: Category directory name used under both dataset roots.
    """
    an_dir = jn(pth_root_an, obj, 'ply')

    for tf_name in ls(an_dir, exts='txt'):
        # The part before the first dot names the ShapeNet model directory.
        stem = tf_name.split('.')[0]

        mesh_path = jn(
            pth_root_sn, obj, stem, 'models', 'model_normalized.obj')
        assert os.path.exists(mesh_path)
        verts, faces = load_obj(mesh_path)

        # Apply the stored translation / scale before measuring the area.
        tf_path = jn(an_dir, tf_name)
        T, s = load_tf(tf_path)
        area = mesh_area((verts - T) / s, faces)

        # The file must hold exactly two lines before the area is added.
        with open(tf_path, 'r') as fobj:
            txt = fobj.read()
        assert len(txt.splitlines()) == 2

        # Only insert a newline when the file does not already end in one.
        separator = '' if txt.endswith('\n') else '\n'
        with open(tf_path, 'a') as fobj:
            fobj.write('{}{:.6f}'.format(separator, area))

        with num_samples_done.get_lock():
            num_samples_done.value += 1

    with finished_seqs.get_lock():
        finished_seqs.value += 1
from matplotlib import image  # routines for displaying orca image
from scipy import ndimage
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# All project modules and pickled dictionaries are resolved relative to the
# analysis directory, so switch to it before importing / loading anything.
os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/")
# The status message previously had a stray "np.arctan" pasted into the
# word "directory"; restored to the intended text.
print("Working directory is ", os.getcwd())

# NOTA BENE: these python files have to be in the same directory as this
# file itself.
import WhaleBoatObj
import whalePlot
import helpers
import globalParameters as gp  # gp stands for Global Parameters

########################################################################################
# Lookup dictionaries loaded through helpers.load_obj.
anonBoatsDict = helpers.load_obj("anonimizer")
boatsDict = helpers.load_obj("boats")
codeCountDict = helpers.load_obj("counts")
activityCodeDict = helpers.load_obj("activityCode")
jascoCodesDict = helpers.load_obj("jascoCodes")
echoSL_Dict = helpers.load_obj("echoSL")

# Usage notes for the dictionaries above:
#
# anonBoatsDict['CSMINF_168']
#   Here NFWF's id 'pow' is anonymized as CSMINF_168, where 'pow' is a
#   Commercial Small Inflatable with JASCO code JRHIB:
#   ('pow_CSMINF', 'Commercial Small Inflatable', 'JRHIB', 'Prince of Whales')
#   Given the anonymized name, the rest of this vehicle's details can be
#   found via boatsDict.
#
# boatsDict['CSMINF']
#   Pull the vessel code off of the numbered code and use it to get the rest
#   of the boat info:
#   ('Commercial Small Inflatable', 'JRHIB')
#   boatType = boatsDict[boatID.split('_')[0]][1]
from helpers import load_obj, save_obj from cross_validation import cross_val folders = ["1 day"] + map(lambda x: str(x) + " days", [7, 14, 30, 90, 180, 365]) classifiers = ["DT", "RF", "LR", "kNN", "FFT-Dist2Heaven"] # classifiers = ["FFT-Dist2Heaven"] target = "timeOpen" cwd = os.getcwd() data_path = os.path.join(cwd, "data", "issue_close_time") details_path = os.path.join(data_path, 'issue_close_time_details_5x10_mdlp_365.pkl') if os.path.exists(details_path): performances = load_obj(details_path) else: performances = {} # for folder in folders: folder = folders[6] if folder not in performances: performances[folder] = collections.defaultdict(dict) folder_path = os.path.join(data_path, folder) for file in os.listdir(folder_path): if file.endswith(".csv"): print file + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" file_path = os.path.join(folder_path, file) df = pd.read_csv(file_path) df[target] = df[target].apply(lambda x: 1 if x else 0) for i, clf in enumerate(classifiers):
img[y, x] = shade #polys.append((np.mean(xyz[:, 2]), xyz)) """ polys = sorted(polys, reverse=True, key=lambda x: x[0]) for _, xyz in polys: rr, cc = polygon(xyz[:, 1], xyz[:, 0]) rr[np.logical_or(rr < 0, rr >= Y)] = 0 cc[np.logical_or(cc < 0, cc >= X)] = 0 img[rr, cc] = random.uniform(0.3, 1.0) """ return img # *** do shit tris = load_obj("objs/cube.obj") #tris = load_obj("objs/teapot.obj") # add ground plane tris.append(([-3, -0.5, -3], [3, -0.5, -3], [3, -0.5, 3])) tris.append(([-3, -0.5, -3], [-3, -0.5, 3], [3, -0.5, 3])) SCALE = 1 / 10.0 LSCALE = 1 / 100.0 origin = -10 * K + I + 2 * J #origin = -500*K + I + J look = K import pygame pygame.init()
##########################
#####      INIT       ####
##########################

### Command line arguments
args = vars(parser.parse_args())
print(args)
results_path, substring, del_substring, out_name = (
    args[key]
    for key in ('results_path', 'substring', 'del_substring', 'out_name')
)

### Get the result files
# Keep regular files whose name contains `substring` but not `del_substring`.
files = [
    join(results_path, f)
    for f in listdir(results_path)
    if isfile(join(results_path, f))
    and substring in f
    and del_substring not in f
]
print(files)

# One output row per result file: name, train accuracy, test accuracy and
# weighted F1, each cell followed by " & ".
with open(out_name, "w") as outfile:
    for filename in files:
        result = load_obj(filename)
        line = [
            filename,
            '%.02f' % result['train_accuracy'],
            '%.02f' % result['test_accuracy'],
        ]
        f1 = f1_score(result['test_true_classes'],
                      result['test_pred_classes'],
                      average='weighted')
        line.append('%.02f' % f1)
        outfile.write("".join(cell + " & " for cell in line))
        outfile.write("\n")
def get_submission(X_train,
                   X_valid,
                   y_train,
                   y_valid,
                   X_test,
                   train_params=None,
                   eval_metric='auc',
                   save=False,
                   load=False,
                   mdl_name='xgb_class'):
    """Train (or load) an XGBClassifier and build prediction frames.

    Args:
        X_train, X_valid, X_test: Feature DataFrames; their ``.values`` are
            fed to the classifier and their indexes label the outputs.
        y_train, y_valid: Label frames/series aligned with the features.
        train_params: Keyword arguments for ``XGBClassifier``. ``None``
            (the default) or an empty dict means library defaults.
        eval_metric: Forwarded to ``classifier.fit``.
        save: When training, persist the fitted model via ``save_obj``.
        load: Load the model with ``load_obj(mdl_name)`` instead of
            training; the reported training time is then zero.
        mdl_name: Name handed to ``load_obj`` / ``save_obj``.

    Returns:
        dict with keys 'model', 'submission', 'submission_train',
        'submission_valid' (single-column 'Next_Premium' DataFrames),
        'valid_loss' (validation AUC) and 'summary' (text report).
    """
    start_time = time.time()
    end_time = start_time  # unchanged when a stored model is loaded

    if load:
        classifier = load_obj(mdl_name)
    else:
        # A None default replaces the former shared mutable `{}` default.
        classifier = XGBClassifier(**(train_params or {}))
        classifier.fit(X_train.values,
                       y_train.values.ravel(),
                       eval_metric=eval_metric)
        end_time = time.time()
        if save:
            save_obj(classifier, mdl_name)
            print('model saved')

    train_pred = classifier.predict(X_train.values)
    valid_pred = classifier.predict(X_valid.values)
    test_pred = classifier.predict(X_test.values)

    # NOTE(review): the ROC curves are built from predict() output; if that
    # is hard class labels rather than scores the AUC is very coarse -
    # confirm whether predict_proba was intended.
    # Despite the *_loss names these hold AUC values (higher is better).
    fpr, tpr, _ = roc_curve(y_train.values, train_pred, pos_label=1)
    train_loss = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(y_valid.values, valid_pred, pos_label=1)
    valid_loss = auc(fpr, tpr)

    feature_importances = classifier.feature_importances_
    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    # Collect the report pieces and join once instead of repeated `+=`.
    report = ['====== XGBClassifier Training Summary ======\n']
    report.extend(
        '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                         feature_importances[idx])
        for idx in sorted_idx)
    report.append('>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60))
    report.append(
        '>>> Final AUC: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
            train_loss, valid_loss))
    summary = ''.join(report)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])
    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])
    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': classifier,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
def get_submission(X_train,
                   y_train,
                   X_valid,
                   y_valid,
                   X_test,
                   params,
                   save=False,
                   load=False,
                   mdl_name='catb'):
    """Train (or load) a CatBoostRegressor and build prediction frames.

    Side effect: missing values in ``X_train``, ``X_valid`` and ``X_test``
    are replaced by -999 *in place*, so the caller's frames are modified.

    Args:
        X_train, X_valid, X_test: Feature DataFrames.
        y_train, y_valid: Targets aligned with the feature frames.
        params: Keyword arguments for ``CatBoostRegressor``.
        save: When training, persist the fitted model via ``save_obj``.
        load: Load the model with ``load_obj(mdl_name)`` instead of
            training; the reported training time is then zero.
        mdl_name: Name handed to ``load_obj`` / ``save_obj``.

    Returns:
        dict with keys 'model', 'submission', 'submission_train',
        'submission_valid' (single-column 'Next_Premium' DataFrames),
        'valid_loss' (validation MAE) and 'summary' (text report).
    """
    # Every column whose dtype is not float is treated as categorical.
    # `np.float` was only an alias of the builtin `float` and has been
    # removed from NumPy, so compare against the builtin directly.
    categorical_features_indices = np.where(X_train.dtypes != float)[0]

    for frame in (X_train, X_valid, X_test):
        frame.fillna(-999, inplace=True)

    # Presumably where save_obj/load_obj keep models - verify against helpers.
    PATH = './saved_models'
    if not os.path.isdir(PATH):
        os.makedirs(PATH)

    start_time = time.time()
    end_time = start_time  # unchanged when a stored model is loaded

    if load:
        regressor = load_obj(mdl_name)
    else:
        regressor = CatBoostRegressor(**params)
        regressor.fit(X_train,
                      y_train,
                      cat_features=categorical_features_indices,
                      eval_set=(X_valid, y_valid),
                      plot=False,
                      early_stopping_rounds=None)
        end_time = time.time()
        if save:
            save_obj(regressor, mdl_name)

    train_pred = regressor.predict(X_train.values)
    valid_pred = regressor.predict(X_valid.values)
    test_pred = regressor.predict(X_test.values)

    train_loss = mean_absolute_error(y_train.values, train_pred)
    valid_loss = mean_absolute_error(y_valid.values, valid_pred)

    feature_importances = np.array(regressor.feature_importances_)
    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    # Collect the report pieces and join once instead of repeated `+=`.
    report = ['====== CatBoost Training Summary ======\n']
    report.extend(
        '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                         feature_importances[idx])
        for idx in sorted_idx)
    report.append('>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60))
    report.append(
        '>>> Final MAE: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
            train_loss, valid_loss))
    summary = ''.join(report)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])
    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])
    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': regressor,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
"@xerces": ["xerces-1.2.csv", "xerces-1.3.csv", "xerces-1.4.csv"] } rank_csv = os.path.join(data_path, 'top_changes.csv') feature_rankings = pd.read_csv(rank_csv, index_col=0) criterias = ["Accuracy", "Dist2Heaven", "LOC_AUC"] # "Gini", "InfoGain"] for percent in [25, 50, 75, 100]: p_opt_stat = [] cnts = [collections.defaultdict(int) for _ in xrange(len(criterias))] print str(percent) + ' percent of features selected' f_cnt = int(percent / 100.0 * 20) all_data_filepath = os.path.join( data_path, "_reduced_" + str(percent) + "_Data_16.pkl") all_data = load_obj(all_data_filepath) if os.path.exists( all_data_filepath) else {} for name, files in data.iteritems(): if name not in all_data: print '\n' + name print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" f_rankings = feature_rankings.loc[feature_rankings["Name"] == name[1:]].values[0] f_selected = [ t for t in list(f_rankings[1:f_cnt]) + ['bug'] if t != "name.1" ] print "selected features are: " + ", ".join(f_selected) paths = [os.path.join(data_path, file_name) for file_name in files] train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True) train_df = train_df[f_selected]
import Learners import os from helpers import load_obj, save_obj logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) print( "================================== Loading models ==================================" ) translator = Translator() word2vecTrainer = Learners.FastTextTrainer() br_model = word2vecTrainer.load_google_model( "/home/mattyws/Downloads/Wikipedia/wiki.pt/wiki.pt") word_freq = load_obj('word_count') print( "================================== Creating pairs ==================================" ) i = 0 not_in_vocab = set() infer_words = set() while i < len(word_freq): word = word_freq[i][0] try: if word not in br_model.wv.vocab: not_in_vocab.add(word) if word in br_model and word not in br_model.wv.vocab: infer_words.add(word) i += 1
os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/") print("Working directory is ",os.getcwd()) import WhaleBoatObj # NOTE BENE these python files have to be in the same directory as this file itself import whalePlot import helpers import globalParameters as gp ################################################################################# allPassbys = helpers.load_obj("tracksModel_2003_2005") # this is list of passbys where each passby is one whale object # and objects for accompanying boats def plotRangeToWhale(Ipassby): whale = allPassbys[Ipassby][0] print(whale) if whale.nBoats == 0: return boats = allPassbys[Ipassby][1] rWhale = [] N100 = N400 = N1000 = N5000 = 0 for boat in boats: print(boat) print("len(rWhale)",len(boat.rWhale))
""" Sept 15, 2020 Unpacked pickled intermediate data files and save as csv for public access """ import os.path from typing import List import helpers os.chdir("/home/val/Documents/NFWF_Files/2020_Analysis/") print("Working directory is ", os.getcwd()) allPassbys = helpers.load_obj("tracksModel_RLs_2003_2005") # each passby of allPassbys is a whale, boat pair whale=passby[ ][0] boats=passby[ ][1] def buildRangeToWhale(Ipassby): whale = allPassbys[Ipassby][0] print(whale) if whale.nBoats == 0: return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 boats = allPassbys[Ipassby][1] rWhale = [] N100 = N400 = N1000 = N5000 = 0 for boat in boats: # print(boat) # print("len(rWhale)", len(boat.rWhale)) for R in boat.rWhale:
b.tauMod.append(0) b.vxMod.append(0) b.vyMod.append(0) b.vMod.append(0) b.aMod.append(0) dx = (b.xMod[i] - w.xMod[i]) dy = (b.yMod[i] - w.yMod[i]) R = np.sqrt(dx**2 + dy**2) theta = 180*math.atan2(dx,dy)/np.pi b.rWhale.append(R) b.bearingWhale.append(theta) ## print("leaving predictBpositions") ###################################### Executable code starts here tracksList = helpers.load_obj("tracksList_2003_2005") for track in tracksList: whale = track[0] print(whale)predictBpositions predictWpositions(whale) boats = track[1] for boat in boats: print("trackIDs", whale.trackID ,boat.trackID, "predicting boat",boat.boatID, "# boat obs",boat.Nobs) predictBpositions(whale, boat) print("check boat xMod len=",len(boat.xMod)) # whalePlot.plotPassby(whale, boats, 2400, False, False)# Don't plot RLs and no DEBUG # whalePlot.plotPassby(whale, boats, 1200, False, False) # whalePlot.plotPassby(whale, boats, 600, False, False) # whalePlot.plotPassby(whale, boats, 300, False, False)
img[y,x] = shade #polys.append((np.mean(xyz[:, 2]), xyz)) """ polys = sorted(polys, reverse=True, key=lambda x: x[0]) for _, xyz in polys: rr, cc = polygon(xyz[:, 1], xyz[:, 0]) rr[np.logical_or(rr < 0, rr >= Y)] = 0 cc[np.logical_or(cc < 0, cc >= X)] = 0 img[rr, cc] = random.uniform(0.3, 1.0) """ return img # *** do shit tris = load_obj("objs/cube.obj") #tris = load_obj("objs/teapot.obj") # add ground plane tris.append(([-3, -0.5, -3], [3, -0.5, -3], [3, -0.5, 3])) tris.append(([-3, -0.5, -3], [-3, -0.5, 3], [3, -0.5, 3])) SCALE = 1/10.0 LSCALE = 1/100.0 origin = -10*K + I + 2*J #origin = -500*K + I + J look = K import pygame pygame.init()
def get_submission(X_train,
                   X_valid,
                   y_train,
                   y_valid,
                   X_test,
                   train_params=None,
                   save=False,
                   load=False,
                   mdl_name='xgb'):
    """Train (or load) an XGBRegressor and build prediction frames.

    Args:
        X_train, X_valid, X_test: Feature DataFrames; their ``.values`` are
            fed to the regressor and their indexes label the outputs.
        y_train, y_valid: Targets aligned with the feature frames.
        train_params: Keyword arguments for ``xgb.XGBRegressor``. ``None``
            (the default) or an empty dict means library defaults.
        save: When training, persist the fitted model via ``save_obj``.
        load: Load the model with ``load_obj(mdl_name)`` instead of
            training; the reported training time is then zero.
        mdl_name: Name handed to ``load_obj`` / ``save_obj``.

    Returns:
        dict with keys 'model', 'submission', 'submission_train',
        'submission_valid' (single-column 'Next_Premium' DataFrames),
        'valid_loss' (validation MAE) and 'summary' (text report).
    """
    # Presumably where save_obj/load_obj keep models - verify against helpers.
    PATH = './saved_model'
    if not os.path.isdir(PATH):
        os.makedirs(PATH)

    start_time = time.time()
    end_time = start_time  # unchanged when a stored model is loaded

    if load:
        regressor = load_obj(mdl_name)
    else:
        # A None default replaces the former shared mutable `{}` default.
        regressor = xgb.XGBRegressor(**(train_params or {}))
        regressor.fit(X_train.values, y_train.values, eval_metric='mae')
        end_time = time.time()
        if save:
            save_obj(regressor, mdl_name)

    train_pred = regressor.predict(X_train.values)
    valid_pred = regressor.predict(X_valid.values)
    test_pred = regressor.predict(X_test.values)

    train_loss = mean_absolute_error(y_train.values, train_pred)
    valid_loss = mean_absolute_error(y_valid.values, valid_pred)

    feature_importances = regressor.feature_importances_
    feature_names = X_train.columns.values
    sorted_idx = np.argsort(feature_importances * -1)  # descending order

    # Collect the report pieces and join once instead of repeated `+=`.
    report = ['====== XGBoost Training Summary ======\n']
    report.extend(
        '[{:<25s}] | {:<10.4f}\n'.format(feature_names[idx],
                                         feature_importances[idx])
        for idx in sorted_idx)
    report.append('>>> training_time={:10.2f}min\n'.format(
        (end_time - start_time) / 60))
    report.append(
        '>>> Final MAE: {:10.4f}(Training), {:10.4f}(Validation)\n'.format(
            train_loss, valid_loss))
    summary = ''.join(report)

    # Generate submission
    submission = pd.DataFrame(data=test_pred,
                              index=X_test.index,
                              columns=['Next_Premium'])
    submission_train = pd.DataFrame(data=train_pred,
                                    index=X_train.index,
                                    columns=['Next_Premium'])
    submission_valid = pd.DataFrame(data=valid_pred,
                                    index=X_valid.index,
                                    columns=['Next_Premium'])

    return {
        'model': regressor,
        'submission': submission,
        'submission_train': submission_train,
        'submission_valid': submission_valid,
        'valid_loss': valid_loss,
        'summary': summary
    }
import os from helpers import load_obj, save_obj logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) print( "================================== Loading models ==================================" ) translator = Translator() word2vecTrainer = Learners.FastTextTrainer() br_model = word2vecTrainer.load_google_model( "/home/mattyws/Downloads/Wikipedia/br/wiki.pt/wiki.pt") en_model = word2vecTrainer.load_google_model( "/home/mattyws/Downloads/Wikipedia/wiki.en/wiki.en") word_freq = load_obj('word_count') if os.path.exists( '/home/mattyws/Downloads/Wikipedia/br/word_pairs_fasttext_inference.pkl' ): print( "================================== Loading pairs ==================================" ) word_pairs = load_obj('word_pairs_fasttext_inference') else: print( "================================== Creating pairs ==================================" ) word_pairs = [] i = 0 while len(word_pairs) < 5000 and i < len(word_freq):
######################################################################################### Program execution starts here ####################################################################################### boatsJdays = [] allBoatLines = loadAllBoats( boatsJdays ) #boatsJdays is a 1-D array with the julian time for each line in boat file logFile = open(parserLogFileName, 'w') if BUILD_DICTs: buildDictionaries(allBoatLines) helpers.save_obj(anonBoatsDict, "anonimizer") helpers.save_obj(boatsDict, "boats") helpers.save_obj(codeCountDict, "counts") helpers.save_obj(activityCodeDict, "activityCode") else: anonBoatsDict = helpers.load_obj("anonimizer") boatsDict = helpers.load_obj("boats") codeCountDict = helpers.load_obj("counts") activityCodeDict = helpers.load_obj("activityCode") passbyLinesLists = ['init'] passbyCnt = 0 whalePassbyList = [] boatsPassbyList = [] tracksList = [] gapList = [] # list of gaps identified between passby (in minutes) lineCnt = 0 while len(passbyLinesLists) > 0: passbyLinesLists = scanForNextTimeGap( gp.maxObsGapMins, gapList