def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='config file', default=None)
    parser.add_argument('-o', '--output_file', dest='output_file', help='output data file', default='data.hdf5')
    parser.add_argument('-r', '--rebound_file', help='Rebound simulation file', default=None)
    parser.add_argument('-t', '--t_end', type=float, dest='t_end', help='Termination time')
    parser.add_argument('-d', '--dt', type=float, dest='dt', help='Integration time step (optional for certain integrators)', default=None)
    parser.add_argument('-s', '--store_dt', type=float, dest='store_dt', help='output time step', default=100)
    parser.add_argument('-i', '--integrator', dest='integrator',
                        help='Name of the integrator [GaussRadau15|WisdomHolman|RungeKutta|AdamsBashForth|LeapFrog|Euler]',
                        default='GaussRadau15')
    args = parser.parse_args()

    abie = ABIE()
    abie.integrator = args.integrator
    if args.output_file is not None:
        abie.output_file = args.output_file
    if args.t_end is not None:
        abie.t_end = args.t_end
    if args.config is not None:
        abie.initialize(DataIO.parse_config_file(args.config))
    elif args.rebound_file is not None:
        # populate the initial conditions from rebound simulation files
        abie.initialize()
        DataIO.ic_populate_from_rebound(args.rebound_file, abie)
    if args.dt is not None:
        abie.h = args.dt
    abie.store_dt = args.store_dt
    abie.integrate()
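# Sketch only: the same run driven programmatically instead of via the CLI above.
# Import paths are assumptions; attribute names and defaults are taken from the
# argparse options in main(), and the config file name is a placeholder.
from ABIE import ABIE          # assumed import path
from data_io import DataIO     # assumed import path

abie = ABIE()
abie.integrator = 'GaussRadau15'   # default of the -i option
abie.output_file = 'data.hdf5'     # default of the -o option
abie.t_end = 100.0                 # termination time, analogous to -t
abie.store_dt = 100                # output interval, analogous to -s
abie.initialize(DataIO.parse_config_file('my_config.conf'))  # placeholder config file
abie.integrate()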
def update_data():
    dataio = DataIO(autoload=False)
    df = dataio.update()
    if df.empty:
        st.sidebar.error("Failed To Update!")
    else:
        st.sidebar.balloons()
        st.sidebar.success("Updated Data, Wrote to disk and Loaded!")
    return df
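# Sketch only: in a Streamlit app, update_data() above would typically be triggered
# from a sidebar control. The button label and the preview call are illustrative.
import streamlit as st

if st.sidebar.button("Update data"):
    df = update_data()
    if not df.empty:
        st.dataframe(df.head())  # show a preview of the refreshed data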
class analysis_dif:
    dataio = DataIO()

    def diff_analysis(self, date, distinct):
        difflist = []
        gaplist = []
        for slice in range(143):
            ts = date + '-' + str(slice + 1)
            diffdata = self.dataio.select_orderDiff_by_ds_distinct(ts, distinct)
            gap = self.dataio.select_gap(ts, distinct)
            gaplist.append(float(gap))
            difflist.append(float(diffdata))
            #print(type(diffdata))

        #plt.plot(difflist,'ro-')
        fig = plt.figure()
        ax1 = fig.add_subplot(311)
        fig = sm.graphics.tsa.plot_acf(gaplist, lags=20, ax=ax1)
        ax2 = fig.add_subplot(312)
        fig = sm.graphics.tsa.plot_pacf(gaplist, lags=20, ax=ax2)
        ax3 = fig.add_subplot(313)
        ax3.plot(difflist, 'ro-')
        title = date + " Distinct:" + str(distinct)
        plt.title(title)

        #arma_11 = sm.tsa.ARMA(difflist,(1,1)).fit()
        #arma_02 = sm.tsa.ARMA(difflist,(0,2)).fit()
        #arma_01 = sm.tsa.ARMA(gaplist,(1,0)).fit()
        arima = sm.tsa.ARIMA(gaplist, (1, 1, 0)).fit()
        fig1 = plt.figure(1)
        fig1 = arima.plot_predict()
        plt.show()
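# Sketch only: sm.tsa.ARIMA(data, (p, d, q)) above is the legacy statsmodels API,
# which has since been removed. On a recent statsmodels, an equivalent fit of the
# same (1, 1, 0) model would look roughly like this; treat it as an assumption
# about the intended model, not part of the original code.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict

result = ARIMA(gaplist, order=(1, 1, 0)).fit()
plot_predict(result)   # replaces the old results.plot_predict() method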
def initialize(self, config=None):
    # Initialize the integrator
    self.__integrators = Integrator.load_integrators()
    if self.__integrator is None:
        print('Use GaussRadau15 as the default integrator...')
        self.integrator = 'GaussRadau15'
        self.integrator.initialize()
        self.integrator.acceleration_method = 'ctypes'
    else:
        self.__integrator.CONST_G = self.CONST_G
        self.__integrator.t_end = self.__t_end
        self.__integrator.h = self.__h
        self.__integrator.t_start = self.__t_start
        self.__integrator.output_file = self.output_file
        self.__integrator.store_dt = self.__store_dt
        self.__integrator.buffer_len = self.__buffer_len

    if config is not None:
        # Gravitational parameter
        self.integrator.CONST_G = np.array(config['physical_params']['G'])

        # Integration parameters
        self.integrator = config['integration']['integrator']
        self.integrator.initialize()
        self.integrator.h = float(config['integration']['h'])
        if 'acc_method' in config['integration']:
            self.integrator.acceleration_method = config['integration']['acc_method']
        else:
            self.integrator.acceleration_method = 'ctypes'

        # Load sequence of object names
        if 'names' in config:
            names = config['names']
        else:
            names = None

        # Initial and final times
        if self.integrator.t_start == 0:
            self.integrator.t_start = float(config['integration']['t0'])
        if self.integrator.t_end == 0:
            self.integrator.t_end = float(config['integration']['tf'])
        self.integrator.active_integrator = config['integration']['integrator']

        DataIO.ic_populate(config['initial_conds'], self, names=names)
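# Hypothetical config layout implied by the keys that initialize() reads above.
# Only the key names come from the code; every value is a placeholder.
config = {
    'physical_params': {'G': 1.0},      # gravitational constant (placeholder value/units)
    'integration': {
        'integrator': 'GaussRadau15',   # one of the supported integrator names
        'h': 0.01,                      # step size
        't0': 0.0,                      # start time
        'tf': 1000.0,                   # end time
        'acc_method': 'ctypes',         # optional; defaults to 'ctypes'
    },
    'names': ['Sun', 'Earth'],          # optional sequence of object names
    'initial_conds': {},                # passed straight to DataIO.ic_populate()
}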
def initialize(self):
    if self.__buf is None:
        self.__buf = DataIO(buf_len=self.buffer_len,
                            output_file_name=self.output_file,
                            close_encounter_output_file_name=self.close_encounter_output_file,
                            collision_output_file_name=self.collision_output_file,
                            CONST_G=self.CONST_G)
    if self.particles.N > 0:
        # initialize the C library
        self.libabie.initialize_code(self.CONST_G,
                                     self.CONST_C,
                                     self.particles.N,
                                     MAX_CE_EVENTS=self.max_close_encounter_events,
                                     MAX_COLLISION_EVENTS=self.max_collision_events,
                                     close_encounter_distance=self.close_encounter_distance)
        self.buf.initialize_buffer(self.particles.N)
op.add_option("--n_features", action="store", type=int, default=2 ** 16, help="n_features when using the hashing vectorizer.") (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) print __doc__ op.print_help() print dio = DataIO("Settings_loc5.json") submission = False n_trees = 10 min_samples_split = 2 if submission: type_n = "train_full" type_v = "valid_full" else: type_n = "train" type_v = "valid" vectorizer = TfidfVectorizer( sublinear_tf=True, max_df=0.5,
def buf(self):
    if self.__buf is None:
        self.__buf = DataIO(buf_len=self.buffer_len,
                            output_file_name=self.output_file,
                            CONST_G=self.CONST_G)
    return self.__buf
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")

submission = False
n_trees = 10
min_samples_split = 2

param = """Normal count vector with max 200. New submission which is repeatable.
and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200,
    norm='l2',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True)
"""

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = CountVectorizer(max_features=200, )
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

columns = ["Category", "ContractTime", "ContractType"]
def tfidf_cloud(n_trees):
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    param = """Normal count vector with max 200. New submission which is repeatable.
and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True)
"""

    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"

    #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                                 #["Title", "FullDescription", "LocationRaw"],
                                 #extra_features)
    #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                            #["Title", "FullDescription", "LocationRaw"],
                                            #extra_valid_features)

    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load("train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
        #valid_salaries = dio.get_salaries(type_v, log=True)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)
    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)
    #TODO: valid salaries were dumped incorrectly

    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True
    print salaries.shape

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")

    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                     verbose=2,
                                     n_jobs=4,  # 2 jobs on submission / 4 on valid test
                                     oob_score=False,
                                     min_samples_split=min_samples_split,
                                     random_state=3465343)

    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.metrics import mean_absolute_error
from itertools import combinations
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge

dio = DataIO("Settings.json")

submission = False
n_trees = 10
min_samples_split = 2

param = """Normal count vector with max 200. New submission which is repeatable.
and nicer
"""

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

#columns = ["Category", "ContractTime", "ContractType"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
from data_io import DataIO

data = DataIO(autoload=False)
df = data.update()
print(df.shape)
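# Sketch only: persist the refreshed frame, assuming update() returns a pandas
# DataFrame (the .shape access above suggests it does). The CSV path is illustrative.
if not df.empty:
    df.to_csv("latest_update.csv", index=False)
    print("Saved {} rows".format(len(df)))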
from data_io import DataIO
import numpy as np
import matplotlib.pyplot as plt

dio = DataIO("Settings.json")

model_names = [
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log",
    "vowpall",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_14split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_rf10_4split_new1_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_rf10_4split_newOKsalPredictValid_log",
    "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_new_faked_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newOKsalPredictValid_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid_log",
    "Ridge_tfidf_05d_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid1_log",
]

valid_salaries = dio.get_salaries("valid", log=False)

ylim = (0, 8000)
xlim = (-50000, 50000)
grid = True


def encode_salaries(salaries, bins):
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    #bin_edges = list(bin_edges)
    #bin_edges.insert(0, 0)
    #bin_edges.append(salaries.max() + 1)
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")

pca = RandomizedPCA(random_state=3465343)

salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)

print map(len, extra_features)
extra_features = map(lambda x: np.reshape(np.array(x), (len(x), 1)), extra_features)

print type(title_corpus)
print title_corpus.shape

title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
from os.path import join as path_join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import joblib
import cloud
import os

tfidf_columns = ["Title", "FullDescription", "LocationRaw"]

dio = DataIO("Settings.json")

vectorizer = TfidfVectorizer(
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True
)
short_id = "tfidf_200f_l1"

type_n = "train"
type_v = "valid"

dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")

columns = ["Category", "ContractTime", "ContractType"]

le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
class analysis: dataio = DataIO() feature = cFeature() wa = wavelet_ana() verify_file_path = './predict_data_in_training.txt' weekend = [2, 3, 9, 17] weekday = [4, 5, 6, 12, 13, 14, 15, 18] Sat = [2, 9] Sun = [3, 17] def time2slice(self, i_time): t_array = datetime.datetime.strptime(i_time, "%Y-%m-%d %H:%M:%S") slice = t_array.hour * 6 + math.floor(t_array.minute / 10) + 1 return slice def slice2time(self, slice): slice = int(slice) hour = math.floor((slice - 1) / 6) min = (slice - 1 - hour * 6) * 10 timenow = "{:02}:{:02}".format(hour, min) return timenow def select_test_day(self, daylist): daytest = [] for i in daylist: day = '{:02d}'.format(i) prefix = '2016-01-' date = prefix + day daytest.append(date) return daytest def weather_main_trend(self, date, hour_interval=1): #print(self.dataio.select_weatherdata_by_dateslice(date)) weatherlist = [] for i in range(1, 144, 6 * hour_interval): dateslice = date + '-' + str(i) weather = self.dataio.select_weatherdata_by_dateslice(dateslice) if date == '2016-01-16': print(weather) if type(weather) != type(None): weatherlist.append(weather) if len(weatherlist) == 0: print("len(weatherlist)==0") exit(1) weatherPD = pd.DataFrame(weatherlist) if date == '2016-01-16': print(weatherPD) #del weatherPD['temp'] #del weatherPD['pm2.5'] timelist = [] for idx in weatherPD.index: slice = idx.split('-')[-1] timetext = self.slice2time(slice) timelist.append(timetext) weatherPD.index = timelist return weatherPD def write_weather_info(self): for day in range(21): prefix = '2016-01-' date = prefix + '{:02d}'.format(day + 1) print(date) pd_weather = self.weather_main_trend(date) filepath = './weather_info' filename = date + ".txt" fw = open(os.path.join(filepath, filename), 'w') pd_weather.to_csv(fw) fw.close() def do_analysis_drawGapTrend(self): weekend = [1, 2, 3, 9, 10, 16, 17] weekday1 = [4, 5, 6, 7, 8] weekday2 = [11, 12, 13, 14, 15] for type in range(3): if type == 0: daytest = self.select_test_day(weekend) ax = plt.subplot(311) ax.set_title("weekend") if type == 1: daytest = self.select_test_day(weekday1) ax = plt.subplot(312) ax.set_title("weekday1") if type == 2: daytest = self.select_test_day(weekday2) ax = plt.subplot(313) ax.set_title("weekday2") for day in daytest: data = self.dataio.select_orderdata_by_district(day, 8) gap = (data['demand'] - data['supply']) gaplen = gap.shape[0] idx = np.array(range(gaplen)) + 1 x_label = [] for i in range(144): x_label.append(ana.slice2time(i + 1)) gap.index = x_label gap.plot(label=day) ax.legend(loc=2) plt.show() def train_kernel_ridge_regression_clf(self, train_daylist, distinct, gamma=1, alpha=1): daytest = self.select_test_day(train_daylist) y_train = [] X_train = [] for day in daytest: for slice in range(144): dateslice = day + '-' + str(slice + 1) #feature,gap = self.generateFeatureLabel(dateslice,distinct) feature, gap = self.feature.generate(dateslice, distinct) if feature != None: if gap != 0: gap = math.log10(float(gap)) else: gap = -0.1 X_train.append(feature) y_train.append(gap) clf = KernelRidge(kernel='polynomial', gamma=gamma, alpha=alpha) #clf = KernelRidge(kernel='polynomial', degree=3,alpha=0.01) clf.fit(X_train, y_train) return clf def train_optimzation_model(self, train_daylist, distinct): daytest = self.select_test_day(train_daylist) y_train = [] X_train = [] for day in daytest: for slice in range(144): dateslice = day + '-' + str(slice + 1) #feature, label = self.generateFeatureLabel(dateslice, distinct) feature, label = self.feature.generate(dateslice, distinct) #print(feature,label) 
#print(feature1,label1) #print("-----------") if feature != None: X_train.append(feature) y_train.append(label) opt = optimization() opt.fit(X_train, y_train) return opt def train_gap_diff_curve(self, day, distinct): if len(day.split('-')) != 3: print( "The input of train_gap_diff_curve_by_distinct_day should be a xx-xx-xx" ) exit(1) difflist = [] for slice in range(144): dateslice = day + '-' + str(slice + 1) diffval = self.dataio.select_orderDiff_by_ds_distinct( dateslice, distinct) if diffval != None: difflist.append(diffval) coeffs = self.wa.get_wavelet_coeffs(difflist) #coeffs = self.wa.coeffs_process(coeffs) curve = self.wa.reconstruction_from_coeffs(coeffs) return np.array(curve) def train_gap_diff_by_distinctlist(self, distinct_list, diffcurveList, count): for distinct in distinct_list: count[0] += 1 print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) + "% completed...") curve_dict = {} weekday = self.select_test_day(self.weekday) curve_sum = np.zeros(144) for day in weekday: curve = self.train_gap_diff_curve(day, distinct + 1) curve_sum += curve curve_dict['weekday'] = curve_sum / len(weekday) sat = self.select_test_day(self.Sat) curve_sum = np.zeros(144) for day in sat: curve = self.train_gap_diff_curve(day, distinct + 1) curve_sum += curve curve_dict['sat'] = curve_sum / len(sat) sun = self.select_test_day(self.Sun) curve_sum = np.zeros(144) for day in sun: curve = self.train_gap_diff_curve(day, distinct + 1) curve_sum += curve curve_dict['sun'] = curve_sum / len(sun) diffcurveList[distinct] = curve_dict def drawing_perform_by_distinct_daylist(self, clf, daylist, distinct): daytest = self.select_test_day(daylist) for i, day in enumerate(daytest): gap_real = [] gap_predict = [] slice_x = [] for slice in range(144): dateslice = day + '-' + str(slice + 1) #feature,gap = self.generateFeatureLabel(dateslice,distinct) feature, gap = self.feature.generate(dateslice, distinct) if feature == None: continue label_predicted = clf.predict([feature]) gap_real.append(gap) gap_predict.append(label_predicted) slice_x.append(slice) plt.plot(slice_x, gap_real, color=get_color(i), label=day) plt.plot(slice_x, gap_predict, color=get_color(i), ls='--', lw=2) plt.legend(loc=2) plt.grid() plt.show() def verifying_in_training_set(self, clf): fr = open(self.verify_file_path, 'r') timeslicelist = [] for line in fr: timeslice = line.split(' ')[0] timeslicelist.append(timeslice) fr.close() #------clf------distinct(0,65)-------type(0:weekday, 1:weekend)----- count = 0 err_rate_sum = 0 for timeslice in timeslicelist: for dis_ind in range(66): #clf[distinct][] distinct = dis_ind + 1 date = timeslice[0:10] isWeekend = isWeekends(date) #feature,gap = self.generateFeatureLabel(timeslice,distinct) feature, gap = self.feature.generate(timeslice, distinct) if feature == None or gap == 0: continue gap_predicted = clf[dis_ind][isWeekend].predict([feature])[0] gap_predicted = int(math.pow(10, gap_predicted)) if gap_predicted < 0: gap_predicted = 0 err_rate = abs((gap - gap_predicted) / gap) err_rate_sum += err_rate count += 1 err_rate_sum /= count return err_rate_sum def verifying_in_training_set_bydiff(self, diffcurve): fr = open(self.verify_file_path, 'r') timeslicelist = [] for line in fr: timeslice = line.split(' ')[0] timeslicelist.append(timeslice) fr.close() # ------clf------distinct(0,65)-------type(0:weekday, 1:weekend)----- count = 0 err_rate_sum = 0 for timeslice in timeslicelist: for dis_ind in range(66): distinct = dis_ind + 1 slice = int(timeslice.split('-')[-1]) date = timeslice[0:10] gap 
= self.dataio.select_gap(timeslice, distinct) if gap == 0: continue ts_before1 = date + '-' + str(slice - 1) ts_before2 = date + '-' + str(slice - 2) ts_before3 = date + '-' + str(slice - 3) gap1 = self.dataio.select_gap(ts_before1, distinct) gap2 = self.dataio.select_gap(ts_before2, distinct) gap3 = self.dataio.select_gap(ts_before3, distinct) diff1 = gap1 - gap2 diff2 = gap2 - gap3 daytype = isWeekends(date) diffval1 = 0 diffval0 = 0 if daytype == 0: curve = diffcurve[dis_ind]['weekday'] diffval0 = curve[slice - 1] diffval1 = curve[slice - 2] if daytype == 1: curve = diffcurve[dis_ind]['sat'] diffval0 = curve[slice - 1] diffval1 = curve[slice - 2] if daytype == 2: curve = diffcurve[dis_ind]['sun'] diffval0 = curve[slice - 1] diffval1 = curve[slice - 2] gapdiff_predict = 2 * diffval1 - diff1 + diffval0 gap_predicted = gap1 + gapdiff_predict if gap_predicted < 0: gap_predicted = 0 err_rate = abs((gap - gap_predicted) / gap) err_rate_sum += err_rate count += 1 err_rate_sum /= count return err_rate_sum def calculate_norm2_error(self, clf, daylist, distinct): err_val = 0 count = 0 daylist = self.select_test_day(daylist) for date in daylist: for slice in range(144): timeslice = date + '-' + str(slice + 1) feature, gap = self.feature.generate(timeslice, distinct) if feature == None or gap == 0: continue if gap != 0: gap_log = math.log10(gap) else: gap_log = 0 gap_predicted = clf.predict([feature])[0] err_val += (gap_log - gap_predicted)**2 count += 1 err_val /= count return err_val def calculate_mape_by_DayDistinct(self, clf, daylist, distinct): err_rate_sum = 0 count = 0 daylist = self.select_test_day(daylist) for date in daylist: for slice in range(144): timeslice = date + '-' + str(slice + 1) #feature, gap = self.generateFeatureLabel(timeslice, distinct) feature, gap = self.feature.generate(timeslice, distinct) if feature == None or gap == 0: continue gap_predicted = clf.predict([feature])[0] #print('Before log:',gap_predicted) gap_predicted = int(math.pow(10, gap_predicted)) if gap_predicted < 0: gap_predicted = 0 isWeekend = isWeekendsText(date) gap_filtered = self.dataio.select_filter_gap( timeslice, distinct, isWeekend) if gap_predicted > 2 * gap_filtered: gap_predicted = 2 * gap_filtered # print('After log:', gap_predicted,gap) err_rate = abs((gap - gap_predicted) / gap) #print(timeslice+"\t{:.2f}\t{}\t{:.0f}".format(err_rate,gap,gap_predicted)) err_rate_sum += err_rate count += 1 err_rate_sum /= count return err_rate_sum # # def generateFeatureLabel(self,dateslice,distinct): # date = dateslice[0:10] # weather = self.dataio.select_weatherdata_by_dateslice(dateslice) # if type(weather) == type(None): # #print("Weather info. 
does not exist in "+dateslice) # return None,None # # # # weather_feature = [0] * 4 # cur_weather = int(weather['weather']) # if cur_weather == 2 or cur_weather == 3 or cur_weather == 4: # weather_feature[0] = 1 # elif cur_weather == 8: # weather_feature[1] = 1 # elif cur_weather == 9: # weather_feature[2] = 1 # else: # weather_feature[3] = 1 # #print(weather_feature) # #weather_feature[int(weather['weather']) - 1] = 1 # # orderdata = self.dataio.select_orderdata_by_district(dateslice,distinct) # gap_real = (orderdata['demand']-orderdata['supply']).values # gap_real = gap_real[0] # timeslice = int(dateslice.split('-')[-1]) # if timeslice <4: # return None,None # traffic_info = self.dataio.select_trafficdata_by_district(dateslice,distinct) # if traffic_info.empty and distinct !=54: # return None,None # # ts_feature = gene_timeslice_feature(timeslice,4) # # # result = isWeekends(date) # if result == 0: # daytype = 'weekday' # if result == 1: # daytype = 'sat' # if result == 2: # daytype = 'sun' # # gap_filtered = self.dataio.select_filter_gap(dateslice,distinct,daytype) # gap_filtered_last = self.dataio.select_filter_gap(get_last_ts(dateslice),distinct,daytype) # traffic_level =[1,1,1,1] # if not traffic_info.empty: # level1 = (traffic_info['level1'].values)[0] # level2 = (traffic_info['level2'].values)[0] # level3 = (traffic_info['level3'].values)[0] # level4 = (traffic_info['level4'].values)[0] # traffic_level[0] = level1 # traffic_level[1] = level2 # traffic_level[2] = level3 # traffic_level[3] = level4 # # #print(traffic_level) # # trafficBeList = [] # GapBeList = [] # for delta in range(3): # datesliceBe = dateslice[0:11]+str(timeslice-delta-1) # orderdataBe = self.dataio.select_orderdata_by_district(datesliceBe, distinct) # gap_real_Be = (orderdataBe['demand'] - orderdataBe['supply']).values # gap_real_Be = gap_real_Be[0] # GapBeList.append(gap_real_Be) # # traffic_info = self.dataio.select_trafficdata_by_district(datesliceBe,distinct) # if not traffic_info.empty: # level1 = (traffic_info['level1'].values)[0] # level2 = (traffic_info['level2'].values)[0] # level3 = (traffic_info['level3'].values)[0] # level4 = (traffic_info['level4'].values)[0] # traffic_temp = level1 + level2 * 2 + level3 * 3 + level4 * 4 # else: # traffic_temp = 1 # trafficBeList.append(traffic_temp) # # # #GapBeListExp2 = [x*x for x in GapBeList] # GapBeListExp2 = math.pow(GapBeList[0],2) # #GapBeListExp2 = math.exp(GapBeList[0]) # feature = [] # # #feature.extend(weather_feature) # feature.extend(GapBeList) # #feature.extend(ts_feature) # feature.append(gap_filtered) # feature.append(gap_filtered_last) # # # diff = abs(gap_filtered - GapBeList[0]) # # diff_exp05 = math.pow(diff,0.5) # # if gap_filtered - GapBeList[0]>0: # # pass # # else: # # diff_exp05 *= -1 # # #feature.append(math.log(gap_filtered-GapBeList[0])) # # #feature.append(math.pow((GapBeList[0] - GapBeList[1]),2)) # #feature.extend(GapBeListExp2) # #feature.append(diff_exp05) # # # #feature.extend(traffic_level) # #feature.extend(trafficBeList) # feature.append(1) # # return feature,gap_real def gene_KRR_clf_bydaylist(self, distinct_list, clflist, count, gamma=1, alpha=1): for distinct in distinct_list: rand = random.random() time.sleep(rand / 10) count[0] += 1 print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) + "% completed...") clf_weekday = self.train_kernel_ridge_regression_clf( weekday, distinct + 1, gamma, alpha) clf_weekend = self.train_kernel_ridge_regression_clf( weekend, distinct + 1, gamma, alpha) clflist[distinct] = 
[clf_weekday, clf_weekend] def train_OPT_clf_bydaylist(self, distinct_list, clflist, count): for distinct in distinct_list: rand = random.random() time.sleep(rand / 10) count[0] += 1 print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) + "% completed...") clf_weekday = self.train_optimzation_model(self.weekday, distinct + 1) clf_weekend = self.train_optimzation_model(self.weekend, distinct + 1) clflist[distinct] = [clf_weekday, clf_weekend]
class cFeature:
    dataio = DataIO()
    datelice = ''
    date = ''
    distinct = 0
    daytype = ''
    back_len = 3

    def generate(self, ds, distinct):
        self.date = ds[0:10]
        self.datelice = ds
        self.distinct = distinct
        self.daytype = isWeekendsText(self.date)
        slice = int(ds.split('-')[-1])
        if slice <= self.back_len:
            return None, None

        #--------------------feature generate----------------------#
        f = []
        #wea_feature = self.weather_feature()
        # if wea_feature != None:
        #     f.extend(wea_feature)
        # else:
        #     return None, None

        gap_feature = self.gap_feature()
        if gap_feature == None:
            return None, None
        f.extend(gap_feature)

        # ts_feature = self.ts_feature()
        # f.extend(ts_feature)

        #f.append(1)
        gap = self.dataio.select_gap(self.datelice, self.distinct)
        return f, gap

    def weather_feature(self):
        weather = self.dataio.select_weatherdata_by_dateslice(self.datelice)
        if type(weather) == type(None):
            return None
        wea_feature = [0] * 4
        cur_weather = int(weather['weather'])
        if cur_weather == 2 or cur_weather == 3 or cur_weather == 4:
            wea_feature[0] = 1
        elif cur_weather == 8:
            wea_feature[1] = 1
        elif cur_weather == 9:
            wea_feature[2] = 1
        else:
            wea_feature[3] = 1
        return wea_feature

    def gap_feature(self):
        gapfeature = []
        ls = get_last_ts(self.datelice)
        gap_b1 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b2 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b3 = self.dataio.select_gap(ls, self.distinct)

        gap_std = np.std(np.array([gap_b1, gap_b2, gap_b3]))
        gapfeature.append(gap_std)

        gap_diff_b1 = gap_b1 - gap_b2
        gap_diff_b2 = gap_b2 - gap_b3
        # if gap_b1 != 0:
        #     gapfeature.append(gap_diff_b1/gap_b1)
        # else:
        #     gapfeature.append(gap_diff_b1)

        gapfeature.append(gap_b1)
        gapfeature.append(gap_diff_b1)
        #gapfeature.append(gap_diff_b1**2)
        gapfeature.append(gap_diff_b2)

        #ls = self.datelice
        # for i in range(self.back_len):
        #     gap_filtered = self.dataio.select_filter_gap(ls,self.distinct,self.daytype)
        #     #print(ls,self.daytype)
        #     gapfeature.append(gap_filtered)
        #     ls = get_last_ts(ls)

        gap_filtered_b2 = self.dataio.select_filter_gap(
            get_last_ts(get_last_ts(self.datelice)), self.distinct, self.daytype)
        gap_filtered_b1 = self.dataio.select_filter_gap(
            get_last_ts(self.datelice), self.distinct, self.daytype)
        gap_filtered_cur = self.dataio.select_filter_gap(
            self.datelice, self.distinct, self.daytype)
        gap_filtered_a1 = self.dataio.select_filter_gap(
            get_next_ts(self.datelice), self.distinct, self.daytype)
        if gap_filtered_a1 == None or gap_filtered_b1 == None or gap_filtered_b2 == None:
            return None

        gap_filter_diff_b2 = gap_filtered_b1 - gap_filtered_b2
        gap_filter_diff_b1 = gap_filtered_cur - gap_filtered_b1
        gap_filter_diff_a1 = gap_filtered_a1 - gap_filtered_cur

        #gapfeature.append(gap_filter_diff_b2)
        gapfeature.append(gap_filter_diff_b1)
        gapfeature.append(gap_filter_diff_a1)
        gapfeature.append(gap_filtered_cur)
        #gapfeature.append(math.pow(gap_filter_diff_b1,3))
        #gapfeature.append(math.pow(gap_filter_diff_b1,2))

        return gapfeature

    def ts_feature(self):
        slice = int(self.datelice.split('-')[-1])
        ts_feature = gene_timeslice_feature(slice, 8)
        return ts_feature

    def traffic_feature(self):
        traffic_info = self.dataio.select_trafficdata_by_district(
            get_last_ts(self.datelice), self.distinct)
        traffic_level = []
        if not traffic_info.empty:
            level1 = (traffic_info['level1'].values)[0]
            level2 = (traffic_info['level2'].values)[0]
            level3 = (traffic_info['level3'].values)[0]
            level4 = (traffic_info['level4'].values)[0]
            traffic_level = [level1, level2, level3, level4]
        else:
            traffic_level = [0, 0, 0, 0]
        return traffic_level
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")

submission = False
n_trees = 10
min_samples_split = 2

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = CountVectorizer(max_features=200, )
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

columns = ["Category", "ContractTime", "ContractType"]

le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

split_name = "Category"
#split_name = "ContractTime"
#split_name = "ContractType"
class neural_network:
    fnn = FeedForwardNetwork()
    inputlen = 0
    outputlen = 7
    dataio = DataIO()
    feature = cFeature()
    dataset = {}

    def network_init(self):
        # determine the input feature length
        tempfeature, gap = self.feature.generate('2016-01-03-100', 1)
        self.inputlen = len(tempfeature)
        # set up the layers: an input layer (named inLayer), hidden layers and an output layer
        inLayer = LinearLayer(self.inputlen, name='inLayer')
        hiddenLayer1 = SigmoidLayer(7, name='hiddenLayer1')
        hiddenLayer2 = SigmoidLayer(7, name='hiddenLayer2')
        outLayer = LinearLayer(7, name='outLayer')

        # add the layers (i.e. the neurons) to the network
        self.fnn.addInputModule(inLayer)
        self.fnn.addModule(hiddenLayer1)
        self.fnn.addModule(hiddenLayer2)
        self.fnn.addOutputModule(outLayer)

        # create the connections between the layers
        in_to_hidden1 = FullConnection(inLayer, hiddenLayer1)
        hidden1_to_hidden2 = FullConnection(hiddenLayer1, hiddenLayer2)
        hidden_to_out = FullConnection(hiddenLayer2, outLayer)

        # add the connections to the network
        self.fnn.addConnection(in_to_hidden1)
        self.fnn.addConnection(hidden1_to_hidden2)
        self.fnn.addConnection(hidden_to_out)

        # make the network usable
        self.fnn.sortModules()

    def gene_training_sample(self):
        self.DS = SupervisedDataSet(self.inputlen, self.outputlen)
        if os.path.exists('nn_dataset.pkl'):
            with open('nn_dataset.pkl', 'rb') as f:
                self.dataset = pickle.load(f)
            for i in range(len(self.dataset['feature'])):
                #print(self.dataset['feature'][i])
                self.DS.addSample(self.dataset['feature'][i], self.dataset['label'][i])
        else:
            backdaylen = 3
            prefix = '2016-01-'
            loop = 0
            featurelist = []
            targetlist = []
            for day in range(2, 22, 1):
                date = prefix + "{:02}".format(day)
                for distinct in range(1, 67):
                    for slice in range(1, 145):
                        if slice < backdaylen:
                            continue
                        ts_cur = date + '-' + str(slice)
                        gap_cur = self.dataio.select_gap(ts_cur, distinct)
                        if gap_cur > 10:
                            continue
                        f_cur, gap = self.feature.generate(ts_cur, distinct)
                        if f_cur == None:
                            continue
                        output = self.gene_output(gap_cur)
                        featurelist.append(f_cur)
                        targetlist.append(output)
                        loop += 1
                        if loop % 1000 == 0:
                            print(loop)
            self.dataset['feature'] = featurelist
            self.dataset['label'] = targetlist
            for i in range(len(featurelist)):
                self.DS.addSample(featurelist[i], targetlist[i])
            print("Building training set is finished. Total amount is {}".format(loop))
            with open('nn_dataset.pkl', 'wb') as f:
                pickle.dump(self.dataset, f)

    def training_nerual_network(self):
        dataTrain, dataTest = self.DS.splitWithProportion(0.7)
        xTrain, yTrain = dataTrain['input'], dataTrain['target']
        xTest, yTest = dataTest['input'], dataTest['target']
        trainer = BackpropTrainer(self.fnn, dataTrain, verbose=True, learningrate=0.03, momentum=0.1)
        trainer.trainUntilConvergence(maxEpochs=20)
        output = self.fnn.activateOnDataset(dataTest)
        count = 0
        countRight = 0
        error = 0
        for i in range(len(output)):
            posReal = yTest[i].argmax()
            posPredict = output[i].argmax()
            #print('o',output[i],posPredict)
            #print('r',yTest[i],posReal)
            error += abs(posReal - posPredict)
            if posReal == posPredict:
                countRight += 1
            count += 1
        error /= count
        print('Correct rate:{:.2f} Average error:{:.2f}'.format(countRight / count, error))

    def gene_output(self, val):
        output = np.zeros(self.outputlen)
        if val == 0 or val == 1:
            output[0] = 1
        if val == 2:
            output[1] = 1
        if val == 3:
            output[2] = 1
        if val == 4 or val == 5:
            output[3] = 1
        if val == 6 or val == 7:
            output[4] = 1
        if val == 8 or val == 9:
            output[5] = 1
        if val > 9:
            output[6] = 1
        return output
class MainForm(wx.Frame):
    # ----------------------------------------------------------------------
    def __init__(self):
        self.data_io = DataIO()
        self.input = None
        self.content_data = None
        self.execute_data, self.prior_1, self.prior_2, self.leave = None, None, None, None
        wx.Frame.__init__(self, None, wx.ID_ANY, "Customer Segmentation", size=(800, 640))
        self.panel = wx.Panel(self, wx.ID_ANY)
        status = self.CreateStatusBar()
        self.sizer = wx.BoxSizer(wx.VERTICAL)

        # UI labels are Vietnamese: "Phân tích" = "Analyze", "Tổng quát" = "Overview",
        # "Nhóm Ưu tiên 1/2" = "Priority group 1/2", "Dự đoán rời mạng" = "Predicted churn",
        # "Toàn bộ các nhóm" = "All groups", "Biểu đồ" = "Chart".
        file_label = wx.StaticText(self.panel, -1, "Input:", (30, 20))
        self.InputPathTextBox = wx.TextCtrl(self.panel, -1, "", size=(200, -1), pos=(100, 21))
        self.browse_btn = wx.Button(self.panel, -1, "Browse", pos=(310, 20))
        self.Bind(wx.EVT_BUTTON, self.onOpenFile, self.browse_btn)

        # output_label = wx.StaticText(self.panel, -1, "Ket qua:", (30, 50))
        # self.OutputPathTextBox = wx.TextCtrl(self.panel, -1, "", size=(200, -1), pos=(100, 47))
        self.execute_btn = wx.Button(self.panel, -1, "Phân tích", pos=(400, 20))
        self.Bind(wx.EVT_BUTTON, self.onExecute, self.execute_btn)
        self.execute_btn.Disable()

        inform_label = wx.StaticText(self.panel, -1, "Tổng quát:", (30, 60))
        self.inform_txt = wx.TextCtrl(self.panel, -1, "", style=wx.TE_MULTILINE | wx.TE_READONLY,
                                      size=(463, 70), pos=(100, 60))

        self.prior1_btn = wx.Button(self.panel, -1, "Nhóm Ưu tiên 1", pos=(98, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior1, self.prior1_btn)
        self.prior1_btn.Disable()

        self.prior2_btn = wx.Button(self.panel, -1, "Nhóm Ưu tiên 2", pos=(210, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior2, self.prior2_btn)
        self.prior2_btn.Disable()

        self.leave_btn = wx.Button(self.panel, -1, "Dự đoán rời mạng", pos=(320, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenLeave, self.leave_btn)
        self.leave_btn.Disable()

        self.total_btn = wx.Button(self.panel, -1, "Toàn bộ các nhóm", pos=(443, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenTotal, self.total_btn)
        self.total_btn.Disable()

        self.chart_btn = wx.Button(self.panel, -1, "Biểu đồ", pos=(580, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenChart, self.chart_btn)
        self.chart_btn.Disable()

        self.save_btn = wx.Button(self.panel, -1, "Export", pos=(680, 140))
        self.Bind(wx.EVT_BUTTON, self.onSaveFile, self.save_btn)
        self.save_btn.Disable()

        self.data_grid = grid.Grid(self.panel, size=(780, 400), pos=(5, 180))
        self.data_grid.CreateGrid(20, 10)
        self.panel.SetSizer(self.sizer)

    def onOpenFile(self, event):
        dlg = wx.FileDialog(
            self, message="Choose a file",
            defaultDir='../data/',
            defaultFile="",
            wildcard="Excel files (*.xlsx, *.xls)|*.xlsx; *.xls",
            style=wx.FD_OPEN | wx.FD_CHANGE_DIR | wx.FD_FILE_MUST_EXIST)
        if dlg.ShowModal() == wx.ID_OK:
            path = dlg.GetPaths()
            # print("You chose the following file:", path)
            self.InputPathTextBox.ChangeValue(path[0])
            self.input = path[0]
            data = self.data_io.import_data(path[0])
            self.content_data = self.fill_grid_data(data)
            self.execute_btn.Enable()
            self.prior1_btn.Disable()
            self.prior2_btn.Disable()
            self.leave_btn.Disable()
            self.total_btn.Disable()
            self.chart_btn.Disable()
            self.save_btn.Disable()
            self.inform_txt.ChangeValue('')
        dlg.Destroy()

    def onOpenPrior1(self, event):
        self.fill_grid_data(self.prior_1)

    def onOpenPrior2(self, event):
        self.fill_grid_data(self.prior_2)

    def onOpenLeave(self, event):
        self.fill_grid_data(self.leave)

    def onOpenTotal(self, event):
        self.fill_grid_data(self.execute_data)

    def onSaveFile(self, event):
        dlg = wx.FileDialog(
            self, message="Save as",
            defaultDir='../data/',
            defaultFile="",
            wildcard="Excel files (*.xlsx, *.xls)|*.xlsx; *.xls",
            style=wx.FD_SAVE)
        if dlg.ShowModal() == wx.ID_OK:
            path = dlg.GetPaths()[0]
            self.data_io.export_data(self.execute_data, path)
            # print("You chose the following file:", self.path)
        dlg.Destroy()

    def onExecute(self, event):
        if not self.content_data:
            msg = "You have an error.\nPlease reopen data file"
            self.show_error_dialog(msg, "ERROR", wx.OK | wx.ICON_EXCLAMATION)
        else:
            executed_data, prior_1, prior_2, leave = classify(self.content_data)
            self.execute_data = executed_data
            self.fill_grid_data(self.execute_data)
            self.prior_1 = prior_1
            self.prior_2 = prior_2
            self.leave = leave
            nums = len(self.content_data) - 1
            # Summary strings are Vietnamese: total subscribers, priority-1/-2 subscribers,
            # and subscribers predicted to leave the network.
            summary = 'Tổng số thuê bao: {}\n'.format(len(self.content_data) - 1)
            rate = 100 * (len(self.prior_1) - 1) / nums
            summary += 'Số thuê bao ưu tiên 1: {0} ({1:.2f}%)\n'.format(len(self.prior_1) - 1, round(rate, 2))
            rate = 100 * (len(self.prior_2) - 1) / nums
            summary += 'Số thuê bao ưu tiên 2: {0} ({1:.2f}%)\n'.format(len(self.prior_2) - 1, round(rate, 2))
            rate = 100 * (len(self.leave) - 1) / nums  # use the churn group here, not prior_1
            summary += 'Số thuê bao dự đoán rời mạng: {0} ({1:.2f}%)'.format(len(self.leave) - 1, round(rate, 2))
            self.inform_txt.ChangeValue(summary)
            self.prior1_btn.Enable()
            self.prior2_btn.Enable()
            self.leave_btn.Enable()
            self.total_btn.Enable()
            self.save_btn.Enable()
            self.chart_btn.Enable()
            # else:
            #     msg = "Something error. Please try again."
            #     self.show_error_dialog(msg, "ERROR", wx.OK | wx.ICON_EXCLAMATION)

    def onOpenChart(self, event):
        self.draw()

    def show_error_dialog(self, msg, title, style):
        dlg = wx.MessageDialog(parent=None, message=msg, caption=title, style=style)
        dlg.ShowModal()
        dlg.Destroy()

    def fill_grid_data(self, data):
        # print(data[11])
        current_nrows, new_nrows = (self.data_grid.GetNumberRows(), len(data))
        if current_nrows < new_nrows:
            self.data_grid.AppendRows(new_nrows - current_nrows)
        elif current_nrows > new_nrows:
            self.data_grid.DeleteRows(new_nrows, current_nrows - new_nrows)
        current_ncols, new_ncols = (self.data_grid.GetNumberCols(), len(data[0]))
        if current_ncols < new_ncols:
            self.data_grid.AppendCols(new_ncols - current_ncols)
        elif current_ncols > new_ncols:
            self.data_grid.DeleteCols(new_ncols, current_ncols - new_ncols)
        for i in range(new_ncols):
            self.data_grid.SetCellValue(0, i, str(data[0][i]))
        for row in range(1, new_nrows):
            for col in range(new_ncols):
                self.data_grid.SetCellValue(row, col, str(data[row][col]))
        return data

    def draw(self):
        labels = ['Nhóm Ưu tiên 1', 'Nhóm Ưu tiên 2', 'Dự đoán rời mạng']
        sizes = [len(self.prior_1) - 1, len(self.prior_2) - 1, len(self.leave) - 1]
        explode = (0, 0, 0.1)
        fig, ax = plt.subplots()
        fig.canvas.set_window_title('Biểu đồ')
        plt.title('Tỉ lệ các nhóm thuê bao')  # "Proportion of subscriber groups"
        ax.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
        ax.axis('equal')
        plt.tight_layout()
        plt.show()
def get_data():
    dataio = DataIO()
    df = dataio.get_data()
    return df
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from pprint import pprint
from time import time
import numpy as np
import joblib

dio = DataIO("Settings.json")

submission = False
n_trees = 10
min_samples_split = 2

param = """Normal count vector with max 200. New submission which is repeatable.
and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200,
    norm='l2',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True)
"""

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
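# Sketch only: one way the GridSearchCV imported above might be used to tune one of
# these classifiers. The parameter grid and the data names (features, labels) are
# illustrative, not part of the original script.
grid = GridSearchCV(
    SGDClassifier(),
    param_grid={"alpha": [1e-5, 1e-4, 1e-3], "penalty": ["l2", "elasticnet"]},
    scoring="f1",
    cv=3,
)
grid.fit(features, labels)
pprint(grid.best_params_)
print(grid.best_score_)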
op.add_option("--n_features", action="store", type=int, default=2**16, help="n_features when using the hashing vectorizer.") (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) print __doc__ op.print_help() print dio = DataIO("Settings_loc5.json") submission = False n_trees = 10 min_samples_split = 2 if submission: type_n = "train_full" type_v = "valid_full" else: type_n = "train" type_v = "valid" vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') #short_id = "tfidf_200f_l1"
from data_io import DataIO
#from os.path import join as path_join
#import joblib
import numpy as np

dio = DataIO("Settings_submission.json")

submission = True

if submission:
    type_n = "train_full"
    type_v = "test_full"
else:
    type_n = "train"
    type_v = "valid"

model_names = [
    "ExtraTree_min_sample2_30trees_200f_noNorm_categoryTimeType_new_log",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new_log",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_tfidfl2_new_log",
    "vowpall_submission",
    "vowpall_loc5"
]

#model_names = [model2, model4]
#model_names = [model1, model6, model4]
#fit_predict(model2)
#fit_predict(model1)
#fit_predict(model3)
#fit_predict(model5)
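# Sketch only: the model_names list above suggests a simple blend over each model's
# stored predictions. Loading via np.load with "<name>_predictions.npy" files is a
# hypothetical convention for illustration, not the project's actual storage format.
all_preds = np.vstack([np.load(name + "_predictions.npy") for name in model_names])
blended = all_preds.mean(axis=0)   # average the per-model prediction vectors
print(blended.shape)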
class DeepGalaxyTraining(object): def __init__(self): self.data_io = DataIO() self.model = None self.x_train = None self.y_train = None self.x_test = None self.y_test = None self.num_classes = 0 self.epochs = 50 self.batch_size = 8 self.use_noise = True self.distributed_training = False self.multi_gpu_training = False self._multi_gpu_model = None self._n_gpus = 1 self.callbacks = [] self.logger = None self.log_level = logging.DEBUG self.input_shape = (512, 512, 3) # (256, 256, 3) self._t_start = 0 self._t_end = 0 def get_flops(self, model): # run_meta = tf.RunMetadata() # commented out since it doesn't work in TF2 run_meta = tf.compat.v1.RunMetadata() # opts = tf.profiler.ProfileOptionBuilder.float_operation() opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation() # We use the Keras session graph in the call to the profiler. flops = tf.compat.v1.profiler.profile( graph=tf.compat.v1.keras.backend.get_session().graph, run_meta=run_meta, cmd='op', options=opts) return flops.total_float_ops # Prints the "flops" of the model. def initialize(self): # init_op = tf.initialize_all_variables() # init_op = tf.global_variables_initializer() # sess = tf.Session() # sess.run(init_op) # Check if GPUs are available # if tf.test.is_gpu_available(): # commented out since this test will cause a new session be created # allow growth # config = tf.compat.v1.ConfigProto() # config.gpu_options.per_process_gpu_memory_fraction = 1 # config.gpu_options.allow_growth = True # dynamically grow the memory used on the GPU # # config.log_device_placement = True # to log device placement (on which device the operation ran) # sess = tf.compat.v1.Session(config=config) # tf.compat.v1.keras.backend.set_session(sess) # set this TensorFlow session as the default session for Keras # Create logger self.logger = logging.getLogger('DeepGalaxyTrain') self.logger.setLevel(self.log_level) self.logger.addHandler(logging.FileHandler('train_log.txt')) if self.distributed_training is True: try: import horovod.tensorflow.keras as hvd # initialize horovod hvd.init() self.callbacks.append( hvd.callbacks.BroadcastGlobalVariablesCallback(0)) self.callbacks.append(hvd.callbacks.MetricAverageCallback()) # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)] if hvd.rank() == 0: self.logger.info('Parallel training enabled.') self.logger.info( 'batch_size = %d, global_batch_size = %d, num_workers = %d\n' % (self.batch_size, self.batch_size * hvd.size(), hvd.size())) # Map an MPI process to a GPU (Important!) print('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank())) self.logger.info('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank())) # Bind a CUDA device to one MPI process (has no effect if GPUs are not used) os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank()) # # Horovod: pin GPU to be used to process local rank (one GPU per process) gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) # if gpus: # tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') except ImportError as identifier: print( 'Error importing horovod. Disabling distributed training.') self.distributed_training = False else: self.logger.info('Parallel training disabled.') self.logger.info('Batch_size = %d' % (self.batch_size)) def load_data(self, data_fn, test_size=0.3, random=True): if not self.distributed_training: self.logger.info( 'Loading the full dataset since distributed training is disabled ...' 
) # X, Y = self.data_io.load_all(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos) X, Y = self.data_io.load_all(data_fn) else: self.logger.info( 'Loading part of the dataset since distributed training is enabled ...' ) X, Y = self.data_io.load_partial(data_fn, hvd.size(), hvd.rank()) self.logger.debug('Shape of X: %s' % str(X.shape)) self.logger.debug('Shape of Y: %s' % str(Y.shape)) # update the input_shape setting according to the loaded data self.input_shape = X.shape[1:] if test_size > 0: x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=test_size, random_state=42) self.x_train = x_train self.x_test = x_test self.y_train = y_train self.y_test = y_test else: self.x_train = X self.y_train = Y self.num_classes = np.unique(Y).shape[0] print("shapes:", self.x_train.shape, self.x_test.shape, self.y_train.shape, self.y_test.shape) self.logger.debug('Number of classes: %d' % self.num_classes) def load_model(self): if not os.path.isfile('efn_b4.h5'): # base_model = efn.EfficientNetB4(weights='imagenet', include_top=False, input_shape=(self.input_shape[0], self.input_shape[1], 3), classes=self.num_classes) base_model = efn.EfficientNetB4(weights=None, include_top=True, input_shape=(self.input_shape[0], self.input_shape[1], 3), classes=self.num_classes) if self.distributed_training is True and hvd.rank == 0: base_model.save('efn_b4.h5') else: base_model = tf.keras.models.load_model('efn_b4.h5', compile=False) print(base_model.summary()) if not self.use_noise: # x = base_model.output # x = tf.keras.layers.GlobalAveragePooling2D()(x) # x = tf.keras.layers.Dropout(0.3)(x) # predictions = tf.keras.layers.Dense(self.num_classes, activation='softmax')(x) # model = tf.keras.models.Model(inputs = base_model.input, outputs = predictions) # model = tf.keras.models.Model(inputs = base_model.input, outputs = base_model.outputs) model = tf.keras.models.Sequential() # model.add(tf.keras.layers.Lambda(lambda x: tf.repeat(x, 3, axis=-1), input_shape=self.input_shape)) # commented out since tf.repeat does not exist before 1.15 model.add( tf.keras.layers.Lambda( lambda x: tf.keras.backend.repeat_elements(x, 3, axis=-1), input_shape=self.input_shape)) model.add(base_model) # model.add(tf.keras.layers.GlobalAveragePooling2D()) # model.add(tf.keras.layers.Dropout(0.3)) # model.add(tf.keras.layers.Dense(self.num_classes, activation='softmax')) else: model = tf.keras.models.Sequential() # model.add(tf.keras.layers.Lambda(lambda x: tf.repeat(x, 3, axis=-1), input_shape=self.input_shape)) # commented out since tf.repeat does not exist before 1.15 model.add( tf.keras.layers.Lambda( lambda x: tf.keras.backend.repeat_elements(x, 3, axis=-1), input_shape=self.input_shape)) model.add( tf.keras.layers.GaussianNoise(0.5, input_shape=self.input_shape)) model.add(base_model) # model.add(tf.keras.layers.GlobalAveragePooling2D(name="gap")) # model.add(tf.keras.layers.Dropout(0.3)) # model.add(tf.keras.layers.Dense(self.num_classes, activation="softmax", name="fc_out")) if self.distributed_training is True: # opt = K.optimizers.SGD(0.001 * hvd.size()) # opt = tf.keras.optimizers.Adam(hvd.size()) opt = tf.keras.optimizers.Adadelta(1.0 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. 
            opt = hvd.DistributedOptimizer(opt)
        else:
            opt = tf.keras.optimizers.Adam()
        if self.multi_gpu_training is True:
            # probe the number of GPUs
            from tensorflow.python.client import device_lib
            local_device_protos = device_lib.list_local_devices()
            gpu_list = [
                x.name for x in local_device_protos if x.device_type == 'GPU'
            ]
            self._n_gpus = len(gpu_list)
            print('Parallelizing the model on %d GPUs...' % self._n_gpus)
            parallel_model = tf.keras.utils.multi_gpu_model(model, gpus=self._n_gpus)
            parallel_model.compile(
                loss=tf.keras.losses.sparse_categorical_crossentropy,
                optimizer=opt,
                metrics=['sparse_categorical_accuracy'])
            self._multi_gpu_model = parallel_model
            self.model = model
            print(parallel_model.summary())
        else:
            model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                          optimizer=opt,
                          metrics=['sparse_categorical_accuracy'])
            self.model = model
            if self.distributed_training is True:
                if hvd.rank() == 0:
                    print(model.summary())
            else:
                print(model.summary())

    def fit(self):
        if self.distributed_training is True:
            try:
                # print('len(train_iter)', len(train_iter))
                # if hvd.rank() == 0:
                #     self.f_usage.write('len(train_iter) = %d, x_train.shape=%s\n' % (len(train_iter), x_train.shape))
                self._t_start = datetime.now()
                self.model.fit(self.x_train,
                               self.y_train,
                               batch_size=self.batch_size,
                               epochs=self.epochs,
                               callbacks=self.callbacks,
                               verbose=1 if hvd.rank() == 0 else 0,
                               validation_data=(self.x_test, self.y_test))
                self._t_end = datetime.now()
                # train_gen = ImageDataGenerator()
                # train_iter = train_gen.flow(self.x_train, self.y_train, batch_size=self.batch_size)
                # test_gen = ImageDataGenerator()
                # test_iter = test_gen.flow(self.x_test, self.y_test, batch_size=self.batch_size)
                # self.model.fit_generator(train_iter,
                #                          # batch_size=batch_size,
                #                          steps_per_epoch=len(train_iter) // hvd.size(),
                #                          epochs=self.epochs,
                #                          callbacks=self.callbacks,
                #                          verbose=1 if hvd.rank() == 0 else 0,
                #                          validation_data=test_gen.flow(self.x_test, self.y_test, self.batch_size),
                #                          validation_steps=len(test_iter) // hvd.size())
            except KeyboardInterrupt:
                print('Terminating due to Ctrl+C...')
            finally:
                print(
                    "On hostname {0} - After training using {1} GB of memory".
format( socket.gethostname(), psutil.Process(os.getpid()).memory_info()[0] / 1024 / 1024 / 1024)) self._t_end = datetime.now() if hvd.rank() == 0: self.logger.info( "On hostname {0} - After training using {1} GB of memory\n" .format( socket.gethostname(), psutil.Process(os.getpid()).memory_info()[0] / 1024 / 1024 / 1024)) self.logger.info('Time is now %s\n' % datetime.now()) # self.f_usage.write('Elapsed time %s\n' % (t_end-t_start)) # print('Elapsed time:', t_end-t_start) else: try: if self.multi_gpu_training is True: self._t_start = datetime.now() self._multi_gpu_model.fit( self.x_train, self.y_train, batch_size=self.batch_size * self._n_gpus, epochs=self.epochs, # callbacks=self.callbacks, verbose=1, validation_data=(self.x_test, self.y_test)) self._t_end = datetime.now() else: self._t_start = datetime.now() self.model.fit( self.x_train, self.y_train, batch_size=self.batch_size, epochs=self.epochs, # callbacks=self.callbacks, verbose=1, validation_data=(self.x_test, self.y_test)) self._t_end = datetime.now() except KeyboardInterrupt: pass finally: self._t_end = datetime.now() print('Elapsed time:', self._t_end - self._t_start) print('Saving model...') print(self.get_flops(self.model)) def save_model(self): if self.distributed_training is True: if hvd.rank() == 0: if self.use_noise is True: self.model.save('model_hvd_bw_%d_B0_with_noise_n_p_%d.h5' % (self.input_shape[0], hvd.size())) else: self.model.save('model_hvd_bw_%d_B0_no_noise_%d_nodes.h5' % (self.input_shape[0], hvd.size())) else: if self.use_noise is True: self.model.save('model_bw_%d_B0_with_noise.h5' % (self.input_shape[0])) else: self.model.save('model_bw_%d_B0_no_noise.h5' % (self.input_shape[0])) def validate(self): y_pred = self.model.predict(self.x_test) print( precision_recall_fscore_support(self.y_test, np.argmax(y_pred, axis=1))) print(confusion_matrix(self.y_test, np.argmax(y_pred, axis=1))) def finalize(self): pass
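

# A minimal usage sketch of the training workflow defined above (not part of the
# original module). The dataset file name 'galaxy_images.hdf5' and the epoch count are
# placeholders; any HDF5 file readable by DataIO.load_all() would do. Distributed and
# multi-GPU training are left at their defaults (disabled).
if __name__ == '__main__':
    trainer = DeepGalaxyTraining()
    trainer.epochs = 10            # illustrative value only
    trainer.batch_size = 8
    trainer.initialize()
    trainer.load_data('galaxy_images.hdf5', test_size=0.3)  # placeholder file name
    trainer.load_model()
    trainer.fit()
    trainer.save_model()
    trainer.validate()
    trainer.finalize()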
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print(list(map(len, extra_features)))
extra_features = [np.reshape(np.array(x), (len(x), 1)) for x in extra_features]
print(type(title_corpus))
print(title_corpus.shape)
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
print(type(title_corpus_pca))
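
# Hypothetical continuation (not in the original script): the otherwise unused imports
# above (ExtraTreesRegressor, cross_val_score) suggest that the PCA-reduced title
# features were meant to be combined with the label-encoded extra features and
# cross-validated against the log-salaries. The estimator settings below are
# illustrative placeholders only.
features = np.hstack([title_corpus_pca] + extra_features)
trees = ExtraTreesRegressor(n_estimators=10,
                            min_samples_split=2,
                            n_jobs=4,
                            random_state=3465343)
scores = cross_val_score(trees, features, salaries, cv=5)
print(scores.mean(), scores.std())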
from data_io import DataIO from sklearn.ensemble import ExtraTreesRegressor from sklearn.cross_validation import cross_val_score from os.path import join as path_join from sklearn.feature_extraction.text import TfidfVectorizer import numpy as np import joblib import cloud import os tfidf_columns = ["Title", "FullDescription", "LocationRaw"] dio = DataIO("Settings.json") vectorizer = TfidfVectorizer(max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True) short_id = "tfidf_200f_l1" type_n = "train" type_v = "valid" dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid") columns = ["Category", "ContractTime", "ContractType"] le_features = dio.get_le_features(columns, "train_full") extra_features = dio.get_features(columns, type_n, le_features) extra_valid_features = dio.get_features(columns, type_v, le_features) features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features) validation_features = dio.join_features( "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features)
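
# Hypothetical next step (not in the original script): fit an ExtraTreesRegressor on the
# joined TF-IDF + categorical features and predict on the validation split. dio.get_salaries()
# is called the same way as in the sibling scripts in this codebase; the estimator settings,
# the output directory "." and the model file name are placeholders.
salaries = dio.get_salaries(type_n, log=True)
model = ExtraTreesRegressor(n_estimators=10,
                            min_samples_split=2,
                            n_jobs=4,
                            random_state=3465343)
model.fit(features, salaries)
predictions = model.predict(validation_features)
joblib.dump(model, path_join(".", short_id + "_extra_trees.pkl"))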
class Integrator(object): def __init__(self): """ The constructor of an abstract integrator """ # =================== CONSTANTS ================== # by default, using the square of the Gaussian gravitational constant self.CONST_G = 0.000295912208232213 # units: (AU^3/day^2) self.CONST_C = 0.0 # speed of light; PN terms will be calculated if CONST_C > 0 # =================== VARIABLES ================== self._t = 0.0 self.t_start = 0.0 self.t_end = 0.0 self.h = 0.01 # time step size self.store_dt = 100 # storage time step self._particles = None self.acceleration_method = 'ctypes' self.output_file = 'data.hdf5' self.collision_output_file = 'collisions.txt' self.close_encounter_output_file = 'close_encounters.txt' self.max_close_encounter_events = 1 self.max_collision_events = 1 self.close_encounter_distance = 0.0 self.__energy_init = 0.0 self.__energy = 0.0 self.__buf = None self.buffer_len = 1024 self.__initialized = False # =============== C Library ============= self.libabie = CLibABIE() @property def t(self): return self._t @property def particles(self): if self._particles is None: self._particles = Particles(self.CONST_G) return self._particles @property def buf(self): if self.__buf is None: self.__buf = DataIO(buf_len=self.buffer_len, output_file_name=self.output_file, CONST_G=self.CONST_G) return self.__buf @staticmethod def load_integrators(): """ Load integrator modules :return: a dict of integrator class objects, mapping the name of the integrator to the class object """ mod_dict = dict() module_candidates = glob.glob( os.path.join(__mpa_dir__, 'integrator_*.py')) sys.path.append(__mpa_dir__) # append the python path if __mpa_dir__ != __user_shell_dir__: # load the integrator module (if any) also from the current user shell directory module_cwd = glob.glob( os.path.join(__user_shell_dir__, 'integrator_*.py')) for m_cwd in module_cwd: module_candidates.append(m_cwd) sys.path.append(__user_shell_dir__) # append the python path for mod_name in module_candidates: mod_name = os.path.basename(mod_name) mod = __import__(mod_name.split('.')[0]) if hasattr(mod, '__integrator__'): # it is a valid ABI module, register it as a module mod_dict[mod.__integrator__] = mod return mod_dict def initialize(self): if self.__buf is None: self.__buf = DataIO( buf_len=self.buffer_len, output_file_name=self.output_file, close_encounter_output_file_name=self. 
                close_encounter_output_file,
                collision_output_file_name=self.collision_output_file,
                CONST_G=self.CONST_G)
        if self.particles.N > 0:
            # initialize the C library
            self.libabie.initialize_code(
                self.CONST_G,
                self.CONST_C,
                self.particles.N,
                MAX_CE_EVENTS=self.max_close_encounter_events,
                MAX_COLLISION_EVENTS=self.max_collision_events,
                close_encounter_distance=self.close_encounter_distance)
            self.buf.initialize_buffer(self.particles.N)

    def stop(self):
        if self.__buf is not None:
            self.__buf.close()

    def calculate_orbital_elements(self, primary=None):
        return self._particles.calculate_orbital_elements(primary)

    def calculate_energy(self):
        # return self._particles.energy
        if self.acceleration_method == 'ctypes':
            return self.libabie.get_total_energy()
        else:
            return self._particles.energy

    def set_additional_forces(self, ext_acc):
        self.libabie.set_additional_forces(ext_acc)

    def integrator_warmup(self):
        pos = self.particles.positions.copy()
        vel = self.particles.velocities.copy()
        self.libabie.set_state(pos, vel, self.particles.masses,
                               self.particles.radii, self.particles.N,
                               self.CONST_G, self.CONST_C)

    def integrate(self, to_time=None):
        """
        Integrate the system to a given time.

        :param to_time: The termination time. If None, it will use the self.t_end value,
                        and the code will be stopped when reaching self.t_end (i.e. if this
                        function is called without an argument, it can only be called once;
                        but if it is called with a to_time argument specified, then it can
                        be called iteratively).
        :return:
        """
        if to_time is not None:
            self.t_end = to_time

        if self.__initialized is False:
            self.initialize()
            self.integrator_warmup()
            self.__initialized = True
        if self.t >= self.t_end:
            return

        # Note: this dt is not the integrator time step h
        dt = min(self.store_dt, self.t_end - self.t)

        ret = 0
        # launch the integration
        while self.t < self.t_end:
            if self.__energy_init == 0:
                self.__energy_init = self.calculate_energy()
            next_t = self.t + dt - ((self.t + dt) % dt)
            if self.acceleration_method == 'numpy':
                ret = self.integrate_numpy(next_t)
            elif self.acceleration_method == 'ctypes':
                ret = self.integrate_ctypes(next_t)
            # the self.t is updated by the subclass
            # energy check
            self.__energy = self.calculate_energy()
            print('t = %f, N = %d, dE/E0 = %g' %
                  (self.t, self.particles.N,
                   np.abs(self.__energy - self.__energy_init) / self.__energy_init))
            if os.path.isfile('STOP'):
                break

        if to_time is None:
            # triggering the termination of the code, save the buffer to the file and close it
            self.stop()
        # if ret > 0:
        #     break
        # if self.t == self.t_end:
        #     self.__energy = self.calculate_energy()
        #     print('t = %f, E/E0 = %g' % (self.t, np.abs(self.__energy - self.__energy_init) / self.__energy_init))
        return ret

    def integrate_numpy(self, to_time):
        """
        Integrate the system to a given time using python/numpy.
        This method must be implemented in the subclasses.

        :param to_time: The termination time. If None, it will use the self.t_end value
        :return:
        """
        raise NotImplementedError('integrate_numpy() method not implemented!')

    def integrate_ctypes(self, to_time):
        """
        Integrate the system to a given time using the ctypes (libabie.so).
        This method must be implemented in the subclasses.

        :param to_time: The termination time.
            If None, it will use the self.t_end value
        :return:
        """
        raise NotImplementedError('integrate_ctypes() method not implemented!')

    def store_state(self):
        if self.buf is None:
            self.initialize()
            self.buf.initialize_buffer(self.particles.N)
        elem = self.particles.calculate_aei()
        self.buf.store_state(self.t,
                             self.particles.positions,
                             self.particles.velocities,
                             self.particles.masses,
                             radii=self.particles.radii,
                             names=self.particles.hashes,
                             ptypes=self.particles.ptypes,
                             a=elem[:, 0],
                             e=elem[:, 1],
                             i=elem[:, 2])

    def store_collisions(self, collision_buffer):
        self.buf.store_collisions(collision_buffer)

    def store_close_encounters(self, ce_buffer):
        self.buf.store_close_encounters(ce_buffer)

    def handle_collisions(self, collision_buffer, actions=None):
        if actions is None:
            actions = ['merge', 'store']
        if 'store' in actions:
            self.store_state()
            self.store_collisions(collision_buffer)
        if 'merge' in actions:
            collision_buffer = collision_buffer.reshape(len(collision_buffer), 4)
            for coll_pair in range(collision_buffer.shape[0]):
                pid1 = int(collision_buffer[coll_pair, 1])
                pid2 = int(collision_buffer[coll_pair, 2])
                self.particles.merge_particles_inelastically(pid1, pid2)
            self.libabie.reset_collision_buffer()
            self.integrator_warmup()
            self.buf.flush()
            self.buf.reset_buffer()
            self.buf.initialize_buffer(self.particles.N)
        if 'halt' in actions:
            print('Simulation terminated due to a collision event.')
            sys.exit(0)

    def handle_close_encounters(self, ce_buffer, actions=None):
        if actions is None:
            actions = ['merge', 'store']
        if 'store' in actions:
            self.store_state()
            self.store_close_encounters(ce_buffer)
        if 'halt' in actions:
            print('Simulation terminated due to a close encounter event.')
            sys.exit(0)
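

# Minimal sketch (not part of the original module) of how a concrete integrator plugs into
# the loader above: load_integrators() globs for files named 'integrator_*.py' and registers
# any module that defines an __integrator__ attribute. The file name 'integrator_myeuler.py'
# and the class name are hypothetical, the base class is assumed to be importable as
# 'from integrator import Integrator', and the stepping loop is only a placeholder showing
# the required interface (advance self._t to to_time, store the state, return a status code).
from integrator import Integrator

__integrator__ = 'MyEuler'


class MyEuler(Integrator):
    def integrate_numpy(self, to_time):
        # advance with the fixed step size h until to_time is reached
        while self._t < to_time:
            h = min(self.h, to_time - self._t)
            # ... evaluate accelerations and update self.particles here ...
            self._t += h
        self.store_state()
        return 0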
from data_io import DataIO
import logging
from sklearn.decomposition import RandomizedPCA
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.ensemble import ExtraTreesRegressor

dio = DataIO("Settings_loc5.json")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

salaries = dio.get_salaries("train", log=True)
#title_corpus_csc = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
#desc_corpus_csc = dio.read_gensim_corpus("train_desc_nltk_filtered.corpus.mtx")
locraw_corpus_csc = dio.read_gensim_corpus(
    "train_locraw_nltk_filtered.corpus.mtx")
#print title_corpus_csc.shape
print(locraw_corpus_csc.shape)

pipeline = Pipeline([
    ('pca', RandomizedPCA(random_state=3465343)),
    ('trees', ExtraTreesRegressor(min_samples_split=2,
                                  n_estimators=10,
                                  n_jobs=4)),
])
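
# Hypothetical continuation (not in the original script): the GridSearchCV, pprint and time
# imports above suggest the pipeline was meant to be tuned over a small parameter grid.
# The grid values are illustrative placeholders, and depending on the scikit-learn version
# the sparse location corpus may need densifying (or TruncatedSVD) before RandomizedPCA.
parameters = {
    'pca__n_components': (50, 100, 200),
    'trees__n_estimators': (10, 30),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)
print("Performing grid search...")
pprint(parameters)
t0 = time()
grid_search.fit(locraw_corpus_csc, salaries)
print("done in %0.3fs" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score_)
pprint(grid_search.best_estimator_.get_params())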