Example #1
File: scrape_tools.py Project: casimp/pyxe
def spec_scrape(folder, save=False):
    """
    Runs through a .spc file (located in folder with associated .edf files)
    and extracts load, position, and slit size information.
    """
    spec_file = sorted([x for x in os.listdir(folder) if x[-4:] == ".spc"])
    error = "Either zero or multiple .spc files have been found."
    assert len(spec_file) == 1, error
    spec_file = spec_file[0]
    scan = spec_file[:-4]

    data_store = []
    with open(os.path.join(folder, spec_file), "r") as f:
        lines = [line.rstrip("\n") for line in f][1:]

    search = r"-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *-?\ *[0-9]+)?"

    for idx, line in enumerate(lines):
        if scan in line:
            x, y = [float(i) for i in re.findall(search, lines[idx + 11])[2:4]]
            slit_x = [float(i) for i in re.findall(search, lines[idx + 12])][7]
            slit_y = [float(i) for i in re.findall(search, lines[idx + 13])][1]
            scan_num = [float(i) for i in re.findall(search, lines[idx])][-1]
            load = [float(i) for i in re.findall(search, lines[idx + 23])][-3]
            data_store.append([int(scan_num), load, x, y, slit_x, slit_y])

    df = pd.DataFrame(
        data_store, columns=("Scan Number", "Load (kN)", "x (mm)", "y (mm)", "slit_x (mm)", "slit_y (mm)")
    )
    if save:
        pd.to_pickle(df, os.path.join(folder, "%s.pkl" % scan))
    return df
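
A minimal read-back sketch (not part of the original project; the folder and scan name below are placeholders): the DataFrame that spec_scrape pickles when save=True can be reloaded with pd.read_pickle.

# Hypothetical read-back of the summary pickled above; folder and scan are placeholders.
import os
import pandas as pd

folder = "/path/to/edf_folder"   # placeholder: folder containing the .spc file
scan = "example_scan"            # placeholder: the .spc file stem

df = pd.read_pickle(os.path.join(folder, "%s.pkl" % scan))
print(df[["Scan Number", "Load (kN)"]].head())
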
def main(data_path, rng):
    allfiles = listdir(data_path)
    # Just in case.
    allfiles = sorted(allfiles)
    
    if rng:
        low, high = rng.split("-")
        low = int(low)
        high = int(high)
        howmany = high - low 
    else:
        l = len(allfiles)
        low, high = 0, l
        howmany = high - low
    
    n = 0
    
    print "Preprocessing features, files in the range %s"%(rng)
    print "Progress:"
    # Read each driver trips
    for i in range(low, high):
        driver = allfiles[i]
        #output = open("data/stats/%s"%(driver), "w")
        trips = read_driver_trips(data_path, driver)
        
        #headers = trips[0].keys()
        #output.write(','.join(headers) + '\n')
        df = pd.DataFrame(trips)
        outputname = driver.split('.')[0]
        pd.to_pickle(df, "data/stats/%s.pkl"%(outputname))

        if n%10 == 0:
            print "%d more to go"%(howmany-n)

        n += 1
Example #3
File: csvreader.py Project: restrepo/gssis
def clean_dictionaries(dictionary=impact_factors, filename="impactfactors.pickle"):
    import pickle

    dictionary = {}
    # with open('impactfactors.pickle', 'wb') as handle:
    #    pickle.dump(dictionary, handle)
    pd.to_pickle(dictionary, filename)
    def get_subjects_list_adults_fct(df_path, df_qc_path, subjects_list):
        '''
        excludes kids and subjects with missing sex or age
        '''
        import pandas as pd
        import numpy as np

        df = pd.read_pickle(df_path)
        df_qc = pd.read_pickle(df_qc_path)
        df = pd.merge(df, df_qc, left_index=True, right_index=True)
        pd.to_pickle(df, 'testdf.pkl')

        df['subject_id'] = df.subject_id_x

        # fixme exclude subjects with mean_FD>.1
        subjects_list_exclude = df[(df.age<18) | (df.mean_FD_Power>.1)].index
        subjects_list_adults = subjects_list

        for exclude_subject in subjects_list_exclude:
            if exclude_subject in subjects_list_adults:
                subjects_list_adults.remove(exclude_subject)

        missing_info = df[(df.age==999) | ((np.logical_or(df.sex=='M', df.sex=='F'))==False)].index
        for missing in missing_info:
            if missing in subjects_list_adults:
                subjects_list_adults.remove(missing)


        # remove subject from subject_list_adults for which no entry exists in df
        for subject in list(subjects_list_adults):  # iterate over a copy; removing while iterating would skip items
            if not(subject in df.index):
                subjects_list_adults.remove(subject)

        return subjects_list_adults
    def save_task_table(self, file_path=None, task_table=None):
        if not file_path:
            file_path = os.path.join(self.data_dir, 'task_table.pkl')
        if not task_table:
            task_table = self.get_task_table()

        pandas.to_pickle(task_table, file_path)
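
For a DataFrame, the module-level call used above is interchangeable with the DataFrame.to_pickle method; a minimal sketch with a placeholder table and path:

# Minimal sketch: pandas.to_pickle(obj, path) and DataFrame.to_pickle(path) write the
# same pickle for a DataFrame. The table and path below are placeholders.
import pandas as pd

task_table = pd.DataFrame({"task": ["a", "b"], "status": [0, 1]})
pd.to_pickle(task_table, "/tmp/task_table.pkl")   # module-level function
task_table.to_pickle("/tmp/task_table.pkl")       # equivalent method call
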
Example #6
def main():

    #load input data for xgboost
    xgbInput = fetch_data.clfInput()
    xgbInput.sessions_ftrEng()
    xgbInput.users_ftrEng()
    xgbInput.one_hot()
    xgbInput.split_data()

    param = {'num_class': 12, 'objective': 'multi:softprob', 'seed': 0}
    param['eta'] = 0.20
    param['max_depth'] = 6
    param['subsample'] = .5
    param['col_sample_bytree'] = .6
    results = {}
    cv_train = pd.DataFrame()
    cv_valid = pd.DataFrame()
    nrounds = 40
    
    for train_indx, valid_indx in cv_bymonth(xgbInput):
        dtrain = xgb.DMatrix(xgbInput.train_X[train_indx], label = xgbInput.train_Y[train_indx],
                    missing = -1)
        dvalid = xgb.DMatrix(xgbInput.train_X[valid_indx], label = xgbInput.train_Y[valid_indx],
                    missing = -1)
        evallist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(param, dtrain, nrounds, evallist, feval = calc_ndcg.evalerror, evals_result = results)
        cv_train = pd.concat([cv_train, pd.Series(results['train']['error'])], axis = 1)
        cv_valid = pd.concat([cv_valid, pd.Series(results['eval']['error'])], axis = 1)
        pd.to_pickle(cv_train, 'cv_results/sessions_e20_25n/tr_err_av.p')
        pd.to_pickle(cv_valid, 'cv_results/sessions_e20_25n/val_err_av.p')

    full_cv(xgbInput.train_X, xgbInput.train_Y, 'fulltr_err_av.p', param, nrounds)    
Example #7
def pickle_trialDataSource():
    '''
    trialDataSource is converted to a dataframe: gazeEventsDF, and then pickled
    return:  void
    '''

    global gazeEventsDF

    trialEventsDF = pd.DataFrame()

    for key, source in trialSourceDict.items():

        if key != "index":
            eventDF = source.to_df()
            eventDF['eventType'] = key
            trialEventsDF = pd.concat([eventDF, trialEventsDF], axis=0)

    if gazeEventsDF is False:
        pd.to_pickle(trialEventsDF, eventPickleLoc)
    else:
        # Remove old records from current trial from gazeEventsDF
        gazeEventsDF = gazeEventsDF[gazeEventsDF['trialNum'] != trialNum]
        # Add new data
        gazeEventsDF = pd.concat([gazeEventsDF, trialEventsDF], axis=0)
        pd.to_pickle(gazeEventsDF, eventPickleLoc)
def test_min_ms_cvar_avgsp(n_stock, win_length, alpha, scenario_cnt=1):
    """
    :param n_stock: range(5, 55)
    :param win_length:  range(50, 250)
    :param alpha: float
    :return:
    """
    t_start_date, t_end_date = date(2005, 1, 3), date(2014, 12, 31)

    symbols = EXP_SYMBOLS[:n_stock]
    # read rois panel
    roi_path = os.path.join( os.path.abspath(os.path.curdir), '..', 'data',
                             'pkl', 'TAIEX_2005_largest50cap_panel.pkl')
    # shape: (n_period, n_stock, {'simple_roi', 'close_price'})
    roi_panel = pd.read_pickle(roi_path)

    # shape: (n_period, n_stock)
    risk_rois = roi_panel.loc[t_start_date:t_end_date,
                                symbols, 'simple_roi'].T
    n_period = len(risk_rois.index)
    n_scenario = 200

    risk_free_roi = np.zeros(n_period, dtype=np.float)
    allocated_risk_wealth = np.zeros(n_stock, dtype=np.float)
    allocated_risk_free_wealth = 1e6
    buy_trans_fee =  0.001425
    sell_trans_fee = 0.004425

    # read scenario
    scenario_name = "{}_{}_m{}_w{}_s{}_{}_{}.pkl".format(
            START_DATE.strftime("%Y%m%d"), END_DATE.strftime("%Y%m%d"),
            len(symbols), win_length, n_scenario, "unbiased", scenario_cnt)
    scenario_path = os.path.join(EXP_SP_PORTFOLIO_DIR, 'scenarios',
                                     scenario_name)
    scenario_panel = pd.read_pickle(scenario_path)

    predict_risk_rois = scenario_panel.loc[t_start_date:t_end_date]
    predict_risk_free_rois = np.zeros((n_period, n_scenario))

    # model
    t0 = time()
    res = min_ms_cvar_avgsp_portfolio(symbols, risk_rois.index,
                                   risk_rois.as_matrix(), risk_free_roi,
                                allocated_risk_wealth,
                          allocated_risk_free_wealth, buy_trans_fee,
                          sell_trans_fee, alpha,
                          predict_risk_rois.as_matrix(),
                          predict_risk_free_rois, n_scenario,
                            solver="cplex", verbose=False)

    print res
    pd.to_pickle(res, os.path.join(TMP_DIR, 'min_ms_cvar_avgsp.pkl'))
    print predict_risk_rois.mean(axis=2)
    print "all_scenarios_min_cvar_avgsp_portfolio: "
    print "(n_period, n_stock, n_scenarios):({}, {}, {}): {:.4f} secs".format(
        n_period, n_stock, 200, time() - t0
    )
Example #9
def conv2pkl(name):
    elist = read_network("network.dat")
    coms = read_community("community.dat")

    elist_path = os.path.join("binary_networks/data", name+"_edge.pkl")
    coms_path = os.path.join("binary_networks/data", name+"_label.pkl")

    pd.to_pickle(elist, elist_path)
    pd.to_pickle(coms, coms_path)
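
pd.to_pickle is not limited to DataFrames: it delegates to pickle, so plain Python containers round-trip as well (the calls above store whatever read_network and read_community return). A minimal sketch with placeholder data:

# Minimal sketch: arbitrary picklable objects round-trip through pd.to_pickle / pd.read_pickle.
import pandas as pd

edges = [(0, 1), (1, 2), (2, 0)]     # placeholder edge list
labels = {0: "a", 1: "a", 2: "b"}    # placeholder community labels

pd.to_pickle(edges, "/tmp/example_edge.pkl")
pd.to_pickle(labels, "/tmp/example_label.pkl")

assert pd.read_pickle("/tmp/example_edge.pkl") == edges
assert pd.read_pickle("/tmp/example_label.pkl") == labels
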
Example #10
 def convert(split_size=max_timestep):
     m = pd.read_csv('har_dataset/train/X_train.txt', header=None).as_matrix()
     x_train = HarData.split_time(m, split_size)
     m = pd.read_csv('har_dataset/train/y_train.txt', header=None).as_matrix()
     y_train = HarData.split_time(tflearn.data_utils.to_categorical(m - 1, HarData.output_size), split_size)
     m = pd.read_csv('har_dataset/test/X_test.txt', header=None).as_matrix()
     x_test = HarData.split_time(m, split_size)
     m = pd.read_csv('har_dataset/test/y_test.txt', header=None).as_matrix()
     y_test = HarData.split_time(tflearn.data_utils.to_categorical(m - 1, HarData.output_size), split_size)
     pd.to_pickle([(x_train, y_train), (x_test, y_test)], 'har_data.pkl')
Example #11
File: Dynamics.py Project: airanmehr/bio
def ComputeGlobaldf(s,nu):
    from popgen.SFselect import metaSVM ;sys.modules['metaSVM']=metaSVM
    svm=pd.read_pickle('/home/arya/sfselect/SVMs/general_SVM_sp.pck')
    dfs={'AF':getGlobaldf(s,'AF',svm,removeFixedSites=True),
         'HAF':getGlobaldf(s,'HAF',svm,removeFixedSites=True),
         'tajimaD':getGlobaldf(s,'tajimaD',svm,removeFixedSites=True),
         'H':getGlobaldf(s,'H',svm,removeFixedSites=True),
         'SFSelect':getGlobaldf(s,'SFSelect',svm,removeFixedSites=True)
        }
    pd.to_pickle(dfs, path+'nu{}.s{}.df'.format(nu,s))
Example #12
def retry_write_pickle(data, file_path, retry_cnt=10):
    for retry in xrange(retry_cnt):
        try:
            pd.to_pickle(data, file_path)
            return  # stop retrying once the write has succeeded
        except (IOError, EOFError) as e:
            if retry == retry_cnt -1:
                raise Exception(e)
            else:
                print ("dispatch:writing retry: {}, {}".format(
                    retry+1, e))
                time.sleep(np.random.rand() * 10)
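
The same retry-with-backoff idea as a self-contained Python 3 sketch (function name, path, and retry count are illustrative, not from the original project):

# Python 3 sketch of the retry idea above: retry pd.to_pickle on transient I/O errors.
import time

import numpy as np
import pandas as pd

def write_pickle_with_retry(data, file_path, retry_cnt=3):
    for attempt in range(retry_cnt):
        try:
            pd.to_pickle(data, file_path)
            return  # success, no further attempts needed
        except (IOError, EOFError) as exc:
            if attempt == retry_cnt - 1:
                raise
            print("write retry {}/{}: {}".format(attempt + 1, retry_cnt, exc))
            time.sleep(np.random.rand() * 10)  # random backoff before the next attempt

write_pickle_with_retry(pd.DataFrame({"a": [1, 2, 3]}), "/tmp/example_retry.pkl")
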
Example #13
def create_graph(g_name,lis,dd,d,pickle=False):
    # build the graph in a separate variable so g_name stays a string for the pickle path
    graph=nx.Graph()
    for node in lis:
        graph.add_node(node)
    graph.add_weighted_edges_from(dd)
    graph.add_edges_from(d)

    if pickle:
        pd.to_pickle(graph,g_name+".pkl")

    return graph
    def test_round_trip_current(self):
        for typ, dv in self.data.items():

            for dt, expected in dv.items():

                with tm.ensure_clean(self.path) as path:

                    pd.to_pickle(expected,path)

                    result = pd.read_pickle(path)
                    self.compare_element(typ, result, expected)
Example #15
def merge():
    APs,MRRs=[],[]
    for i in range(200):
        try:
            res=pd.read_pickle(outpath+'numFB{}i{}.pkl'.format(numFeedbacks,i))
            if res[0]:
                APs.append(res[0])
                MRRs.append(res[1])
        except:
            pass
    pd.to_pickle( {'AP':{'mean':np.mean(APs),'std':np.std(APs)}, 'MRR':{'mean':np.mean(MRRs),'std':np.std(MRRs)}},'/home/arya/PubMed/prefFB{}.pkl'.format(numFeedbacks))
Example #16
def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
  expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)
  train_example_count = int(len(expanded_data.index) * trainpct / 100.0)
  if split_randomly:
    train_indices = np.random.choice(expanded_data.index, size=train_example_count)
  else:
    train_indices = expanded_data.sort("Date").index[:train_example_count]
  train_data = expanded_data.ix[train_indices]
  test_data = expanded_data.drop(train_indices)

  pd.to_pickle(train_data, train)
  pd.to_pickle(test_data, test)
Example #17
File: io.py Project: dr-nate/msmbuilder
def save_meta(meta, meta_fn='meta.pandas.pickl'):
    """Save metadata associated with a project.

    Parameters
    ----------
    meta : pd.DataFrame
        The DataFrame of metadata
    meta_fn : str
        The filename
    """
    backup(meta_fn)
    pd.to_pickle(meta, meta_fn)
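
The filename above has no compression suffix, so the pickle is written uncompressed; with the default compression='infer' (recent pandas versions), pd.to_pickle chooses a codec from the extension. A minimal sketch with a placeholder table and path:

# Minimal sketch: compression='infer' (the default) picks the codec from the file extension,
# and pd.read_pickle infers it the same way when loading. Table and path are placeholders.
import pandas as pd

meta = pd.DataFrame({"traj": ["t0", "t1"], "length": [100, 250]})
pd.to_pickle(meta, "/tmp/meta.pandas.pickl.gz")         # gzip-compressed because of .gz
restored = pd.read_pickle("/tmp/meta.pandas.pickl.gz")  # decompressed transparently
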
def generate_reference(data, file_base):
    """ Takes a results data frame and returns an experiment dictionary with
    the columsn and column types for each experiment (after apply post_processing)
    :data: the data dataframe of a expfactory Result object
    :file_base:
    """
    exp_dic = {}
    for exp_id in numpy.unique(data['experiment_exp_id']):
        exp_dic[exp_id] = {}
        df = extract_experiment(data,exp_id, clean = False)
        col_types = df.dtypes
        exp_dic[exp_id] = col_types
    pandas.to_pickle(exp_dic, file_base + '.pkl')
	def run(self):	

		with open(self.input().path) as f:
		    f = f.readlines()

		array=[]
		for i in range(len(f)):
			array.append(apache2_logrow(f[i]))


		df = pd.DataFrame(array)
		df.columns = ['Host','Log_Name1','Log_Name','Date_Time','Method','Response_Code','Bytes_Sent','URL','User_Agent']
	
		pd.to_pickle(df,self.output().path)  
Example #20
def dump_mlb_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False, datatype='batting'):
  """
  Dump MLB statistical data to a file.
  :param str outfile: name of file to become pickled pandas datafile
  :param str start_date: don't include games from before this date when dumping data
  :param str end_date: don't include games from after this date when dumping data
  :param int max_count: maximum # of rows to dump
  :param bool use_random: whether to select rows at random (if False, choose most recent)
  :return:
  """
  print 'Dump MLB data for', datatype
  print 'loading data...'
  all_bsbr_logs = load_gamelogs(datatype=datatype)
  unindexed_dfs = []
  print 'reindexing data...'
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  for player_id, dataframe in pbar(all_bsbr_logs.items()):
    uidf = dataframe.reset_index()
    # Add player ID as a column to the dataframe for future joining purposes!
    uidf['player_id'] = pandas.Series(data=player_id, index=uidf.index)
    unindexed_dfs.append(uidf)
  all_game_rows = pandas.concat(unindexed_dfs, ignore_index=True)

  # Filter by date
  if start_date is not None:
    all_game_rows = all_game_rows[all_game_rows['Date'] > start_date]
  if end_date is not None:
    all_game_rows = all_game_rows[all_game_rows['Date'] < end_date]

  # Don't use relief pitchers in our dataset
  if datatype == 'pitching':
    print 'restricting to starting pitchers only...'
    all_game_rows = all_game_rows[all_game_rows['player_id'].apply(brefid_is_starting_pitcher)]

  # Sample filtered data
  if max_count and max_count < len(all_game_rows):
    print 'sampling %d rows...' % max_count
    if use_random:
      kept_indices = random.sample(all_game_rows.index, max_count)
      selected = all_game_rows.iloc[kept_indices]
    else:
      all_game_rows.sort("Date")
      selected = all_game_rows.tail(max_count)
  else:
    selected = all_game_rows
  print 'saving...'
  pandas.to_pickle(selected, outfile)
  print 'Done!'
  return selected
    def aggregate(self):
        file_data_frame = self._get_file_data_frame(self.input_data_dir)
        partial_data_frames = self._get_partial_data_frames(file_data_frame)

        observer_id = partial_data_frames.observer_id
        surface = partial_data_frames.surface
        result_series = partial_data_frames.file_series.apply(self._process_partial_file_series)

        result_data_frame = pandas.DataFrame({'observer_id': observer_id, 'surface': surface,
                                              'oscillogram': result_series})

        pandas.to_pickle(result_data_frame, os.path.join(self.aggregated_data_dir, 'aggregated_data.pkl'))

        for path in file_data_frame.path:
            os.remove(path)
Example #22
def read_all_clusterings_cache(drop_centers=True, recompute=False,
                               categoricals=('dset', 'neuropil', 'distance', 'clusterer')):
    cache_file = op.join(CLUSTER_RUNS_CACHE_PATH, 'all#with_centers=%r.pkl' % (not drop_centers))
    if recompute or not op.isfile(cache_file):
        dfs = [read_clusterings_cache(dset, neuropil, drop_centers=drop_centers, categoricals=())
               for dset, neuropil in product(get_all_datasets(), get_all_neuropils())]
        dfs = [df for df in dfs if df is not None]
        df = pd.concat(dfs)
        for categorical in categoricals:
            df[categorical] = df[categorical].astype('category')
        pd.to_pickle(df, cache_file)
    try:
        return pd.read_pickle(cache_file)
    except:  # quick and dirty account for old pandas versions
        return read_all_clusterings_cache(drop_centers=drop_centers, recompute=True,)
def data_loader_1(symbol_list):
    if os.path.exists('rets.pkl'):
        rets = pd.read_pickle('rets.pkl')
    else:
        rets = get_data(symbol_list)
        pd.to_pickle(rets, 'rets.pkl')
    ins, outs = sort_data(rets)
    ins = ins.transpose([0,2,1]).reshape([-1, len(symbol_list) * 100])
    div = int(.8 * ins.shape[0])
    train_ins, train_outs = ins[:div], outs[:div]
    test_ins, test_outs = ins[div:], outs[div:]

    #normalize inputs
    train_ins, test_ins = train_ins/np.std(ins), test_ins/np.std(ins)
    return train_ins, test_ins, train_outs, test_outs
Example #24
def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
  """
  Dump NBA statistical data to a file.
  :param str outfile: name of file to become pickled pandas datafile
  :param str start_date: don't include games from before this date when dumping data
  :param str end_date: don't include games from after this date when dumping data
  :param int max_count: maximum # of rows to dump
  :param bool use_random: whether to select rows at random (if False, choose most recent)
  :return:
  """
  if start_date:
    start_date = parser.parse(start_date)
  else:
    start_date = datetime.datetime(2010, 10, 1)
  if end_date:
    end_date = parser.parse(end_date)
  else:
    end_date = datetime.datetime.today()
  print 'Dump NBA data for %s to %s' % (start_date, end_date)
  print 'loading data...'
  all_game_rows = load_all_game_data()

  # Filter by date
  if start_date is not None:
    all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
  if end_date is not None:
    all_game_rows = all_game_rows[all_game_rows['date'] < end_date]

  # Sample filtered data
  if max_count and max_count < len(all_game_rows):
    print 'sampling %d rows...' % max_count
    if use_random:
      # We seed to 0 when we call this from CLI to make sure that random splits are replicable.
      random.seed(0)
      kept_indices = random.sample(all_game_rows.index, max_count)
      selected = all_game_rows.loc[kept_indices]
    else:
      all_game_rows.sort("date")
      selected = all_game_rows.tail(max_count)
  else:
    selected = all_game_rows
  print 'saving...'
  pandas.to_pickle(selected, outfile)
  print 'Done!'
  return selected
def load_data(path):
    try:
        data_filename = get_filename(path, 'trackedobjects.pickle')
        print 'Found file: ', data_filename
        pd = pandas.read_pickle(data_filename)
    except:
        data_filename = get_filename(path, 'trackedobjects.hdf5')
        print 'Found file: ', data_filename
        data_filename_pickled = data_filename.split('.')[0] + '.pickle'
        try:
            pd = pandas.read_pickle(data_filename_pickled)
        except:
            pd = mta.read_hdf5_file_to_pandas.load_data_as_pandas_dataframe_from_hdf5_file(data_filename)
            pandas.to_pickle(pd, data_filename_pickled)
            
    pd = mta.read_hdf5_file_to_pandas.remove_rows_above_speed_threshold(pd, speed_threshold=2)
            
    return pd
Example #26
File: stablity.py Project: airanmehr/bio
def createData(s):
    T=mkv.Markov.computeTransition(s=s, N=1000, takeLog=True).astype(np.float128).apply(np.exp).apply(lambda x: x/x.sum(),axis=1)
    T2=T.dot(T)
    T4=T2.dot(T2)

    T8=T4.dot(T2)

    T10=T8.dot(T2);T100=T10.dot(T10)



    stable=pd.Series([T,T10,T100],index=[1,10,100]).apply(lambda x: x.applymap(np.log))
    naive=pd.Series([mkv.Markov.computeTransition(s=s, N=1000, takeLog=True),mkv.computePowerSimulations(s=s,n=10,save=False),mkv.computePowerSimulations(s=s,n=10,save=False)],index=[1,10,100])
    data={'naive':naive,'stable':stable}
    if s==0:
        pd.to_pickle(data,utl.outpath+'real/stablity.neutral.pkl')
    else:
        pd.to_pickle(data,utl.outpath+'real/stablity.selection.pkl')
Example #27
def run_best_ms_simulation(n_stock ,verbose=False):
    """
    The best multi-stage strategy.
    """
    t0 = time()
    # read rois panel
    roi_path = os.path.join(SYMBOLS_PKL_DIR,
                            'TAIEX_2005_largest50cap_panel.pkl')
    if not os.path.exists(roi_path):
        raise ValueError("{} roi panel does not exist.".format(roi_path))

    param = "{}_{}_m{}".format(
        START_DATE.strftime("%Y%m%d"), END_DATE.strftime("%Y%m%d"),
        n_stock)

    symbols = EXP_SYMBOLS[:n_stock]
    n_stock = len(symbols)
    # shape: (n_period, n_stock, {'simple_roi', 'close_price'})
    roi_panel = pd.read_pickle(roi_path)

    # shape: (n_period, n_stock)
    exp_risk_rois = roi_panel.loc[START_DATE:END_DATE, symbols,
                    'simple_roi'].T
    n_exp_period = exp_risk_rois.shape[0]
    exp_risk_free_rois = pd.Series(np.zeros(n_exp_period),
                                   index=exp_risk_rois.index)

    allocated_risk_wealth = pd.Series(np.zeros(n_stock), index=symbols)
    initial_wealth = 1e6


    instance = BestMSPortfolio(symbols, exp_risk_rois, exp_risk_free_rois,
                            allocated_risk_wealth, initial_wealth,
                            start_date=START_DATE, end_date=END_DATE)
    reports = instance.run()

    file_name = 'best_ms_{}.pkl'.format(param)

    file_dir = os.path.join(EXP_SP_PORTFOLIO_DIR, 'best_ms')
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    pd.to_pickle(reports, os.path.join(file_dir, file_name))
    print ("best_ms {} OK, {:.3f} secs".format(param, time()-t0))
 def export_files(out):
     to_pickle(out, 'recap_export.pkl')
     out_csv = []
     for court, v in out.items():
         for judge_name, data in v.items():
             for title, years in data.items():
                 row = OrderedDict([
                     ('court', court),
                     ('name', judge_name),
                     ('title', title),
                     ('total count', sum(years.values()))
                 ])
                 for year, count in years.items():
                     row[str(year)] = count
                 out_csv.append(row)
     df = pandas.DataFrame(out_csv)
     df = df[['court', 'name', 'title', 'total count'] + sorted(
         [x for x in df.columns if x.isdigit()])]
     df.to_csv('recap_export.csv', index=False)
Example #29
File: Simulation.py Project: airanmehr/bio
 def forwardSimulation(self,selectionOnRandomSite=False,siteUnderSelection=None,H0=None):
     """
     returns np 3D array T x nSS x R, where T=|{t_1,t_2,..}| (number of time points), nSS is the number of SS, and R is the number of replicates
     """
     if self.initialCarrierFreq==-1:
         selectionOnRandomSite=True
     if H0 is None:
         if self.H0 is None:
             H0=MSMS.Song(F=self.F, L=self.L, Ne=self.Ne, r=self.r, mu=self.mu,uid=self.uidMSMS,msmsFile=self.msmsFile,dir=self.outpathmsms)
         else:
             H0=self.H010
     if self.foldInitialAFs:
         idx=H0.mean(0)>0.5
         H0.iloc[:,idx.values]=1-H0.iloc[:,idx.values]
     
     self.setH0(H0)
     self.positions_msms=self.H0.columns.values.copy(True)
     self.positions=sorted(np.random.choice(self.L,self.H0.shape[1],replace=False))
     self.H0=pd.DataFrame(self.H0.values, columns=self.positions)
     self.X0=self.H0.mean(0).values
     if selectionOnRandomSite:
         self.set_siteUnderSelection(np.random.randint(0,self.H0.shape[1]))
     elif siteUnderSelection is not None:
         self.set_siteUnderSelection(siteUnderSelection)
     else:
         if not self.s:
             self.set_siteUnderSelection(self.X0.argmax())
         else:
             sites=np.sort(np.where(self.X0== self.initialCarrierFreq)[0]);
             if not len(sites):
                 sites=np.sort(np.where(( self.X0 <= self.initialCarrierFreq +0.025) & ( self.X0 >= self.initialCarrierFreq -0.025) ) [0]);
                 if not len(sites):
                     print 'Try again. No site at freq ',self.initialCarrierFreq, self.uid; return
             self.set_siteUnderSelection(sites[np.random.randint(0,len(sites))])
     pop= self.createInitialDiploidPopulation()
     self.X=np.array([self.multiLocSelectionHardSweepOneReplicate(pop.clone()) for _ in range(self.numReplicates)]).swapaxes(0,2).swapaxes(0,1) #makes sure the site under selection does not go to zero
     if self.ignoreInitialNeutralGenerations:    self.X=self.X[self.initialNeutralGenerations:,:,:]
     self.X=np.append(np.tile(self.X0[:,None],(1,self.X.shape[2]))[None,:,:],self.X,axis=0)
     if self.onlyKeep is not None:   self.X=self.X[:,self.X0==self.onlyKeep,:]
     self.sampleDepths()
     if self.save:
         pd.to_pickle(self,self.outpath+self.uid+'.pkl')
Example #30
def runAllForEachS(s):
    outpath=home+'out/vineet/'; 
    if not os.path.exists(outpath): os.makedirs(outpath)
    fname='{}results{}'.format(outpath,float(s))
    experimentIDX=range(numExperiments)
    np.random.shuffle(experimentIDX)
    print 'Running Experiments for s= {}'.format(s)
    sys.stdout=open('{}results.out'.format(outpath),'w')
    sys.stderr=open('{}results.err'.format(outpath),'w')
    for j in experimentIDX:
        print 's={} j={}'.format(s,j),sys.stdout.flush()
        if os.path.exists('{}_{}.pd'.format(fname,j)): continue
        param=getERParam(simulateData=  True,numThreads=1 ,s=s, experimentNumber=j)
        df,param=runFindSForAllMethods(param)
        df.to_pickle('{}_{}.pd'.format(fname,j))
        pd.to_pickle(param,'{}_{}.pd'.format(fname.replace('results','param'),j))
        print s,j,param['initHaps'].shape,sys.stdout.flush()
        if ps.virtual_memory().percent>95:
            print >> sys.stderr,'s={} exited!'.format(s)
            exit()
Example #31
        if(image_count % 30 == 0):
            data = {'image': cate_list,
                    'image2': cate_list2,
                    'url': url_list[0:image_count]}
            frame = DataFrame(data)
            pd.to_pickle(frame, 'C:/Users/user/Desktop/url-image.df')

def http_label_request(image_name):
    return urllib2.urlopen(server_url+image_name)

def init_list(arg_df):
    global image_count
    for each in arg_df.iterrows():
        cate_list.append(each[1]['image'])
        cate_list2.append(each[1]['image2'])

train_df = pd.read_pickle("C:/Users/user/Desktop/url.df")
reserved_df = pd.read_pickle("C:/Users/user/Desktop/url-image.df")
url_list = train_df.url

init_list(reserved_df)

search()

data = {'image': cate_list,
        'image2': cate_list2,
        'url': url_list}
frame = DataFrame(data)
pd.to_pickle(frame, 'C:/Users/user/Desktop/url-image.df')
Example #32
                                  "SVM_OVR: CV Accur",
                                  "SVM_OVR: Train Accur",
                                  "SVM_OVR: Test Accur",
                              ])

for model_type in ['SVM_OVR', 'LR_OVR']:

    print("Running CV for %s" % model_type)

    for cat in range(0, 10):
        # Read the CV accuracies
        CV_accuracies = pd.read_pickle(
            ct.ROOT + "Pickles\\Fitted_Hierarchy_CV\\%s_%s_CV.p" %
            (cat, model_type))
        best_C, best_CV_acc = fc.get_best_C(CV_accuracies)

        print("Best C and best CV accurancy:", best_C, best_CV_acc)

        summ_stats.ix[cat, "%s: Best C" % model_type] = best_C
        summ_stats.ix[cat, "%s: CV Accur" % model_type] = best_CV_acc

        train_acc, test_acc = fit_model_cat(model_type, best_C, cat)

        print("Train accur and test accur are:", train_acc, test_acc)

        summ_stats.ix[cat, "%s: Train Accur" % model_type] = train_acc
        summ_stats.ix[cat, "%s: Test Accur" % model_type] = test_acc

    pd.to_pickle(summ_stats,
                 ct.ROOT + "Pickles\\Fitted_Hierarchy\\HC_Summ_Stats.p")
Example #33
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Return a list of words
    return (text)


clean_q1 = []
clean_q2 = []
for i in tqdm(np.arange(data_all.shape[0])):
    clean_q1.append(preprocessing(data_all[i][0]))
    clean_q2.append(preprocessing(data_all[i][1]))

for i in tqdm(np.arange(data_all.shape[0])):
    clean_q1[i] = text_to_wordlist(clean_q1[i])
    clean_q2[i] = text_to_wordlist(clean_q2[i])

train_clean = pd.DataFrame()
test_clean = pd.DataFrame()
train_clean['question1'] = clean_q1[:train.shape[0]]
train_clean['question2'] = clean_q2[:train.shape[0]]

test_clean['question1'] = clean_q1[train.shape[0]:]
test_clean['question2'] = clean_q2[train.shape[0]:]

pd.to_pickle(train_clean, '../X_v2/train_final_clean.pkl')
pd.to_pickle(test_clean, '../X_v2/test_final_clean.pkl')
Example #34
                  ft_model_path=ft_model_path)


def evaluate(epoch=None):
    trainer, dev_data = prepare_model(training=False,
                                      test_code=False,
                                      load_weights=True,
                                      lr=0.5)
    em, f1 = trainer.evaluate_on_dev(dev_data, BATCH_SIZE // 3,
                                     -1)  # can control huge data
    print('--Dev Extract Match score :%f--------F1 score:%f' % (em, f1))


if __name__ == '__main__':
    action = cfg.get('Action', 'ACTION')
    if action == 'train':
        # training
        prepare_model(training=True,
                      test_code=False,
                      load_weights=False,
                      lr=LEARNING_RATE)
    elif action == 'finetune':
        assert '_ft' in MODEL_NAME, 'finetune action should append the model name with _ft'
        # finetune
        finetune(ft_model_path='./model/squad_' +
                 MODEL_NAME.replace('_ft', '') + '.hdf5')
    elif action == 'predict':
        # predict
        pred = predict(text=False)
        pd.to_pickle(pred, './pred.pkl')
Example #35
def config_create(main_model, sector_name, result_file_name, config_name, data,
                  time_para, pot_in_num, leve_ratio_num, sp_in, ic_num,
                  fit_ratio, n, use_factor_num):
    time_para_dict = dict()
    time_para_dict['time_para_4'] = [
        pd.to_datetime('20140601'),
        pd.to_datetime('20180601'),
        pd.to_datetime('20180901')
    ]

    time_para_dict['time_para_5'] = [
        pd.to_datetime('20140701'),
        pd.to_datetime('20180701'),
        pd.to_datetime('20180901')
    ]

    time_para_dict['time_para_6'] = [
        pd.to_datetime('20140801'),
        pd.to_datetime('20180801'),
        pd.to_datetime('20180901')
    ]

    data_n = data[data['time_para'] == time_para]
    a_n = data_n[(data_n['ic'].abs() > ic_num)
                 & (data_n['pot_in'].abs() > pot_in_num) &
                 (data_n['leve_ratio'].abs() > leve_ratio_num) &
                 (data_n['sp_in'].abs() > sp_in) &
                 (data_n['fit_ratio'].abs() > fit_ratio)]
    a_n['pnl_file_name'] = a_n[['time_para', 'key', 'fun_name'
                                ]].apply(lambda x: '|'.join(x.astype(str)),
                                         axis=1)
    print(a_n['con_out_2'].sum() / len(a_n), len(a_n))
    a_n['buy_sell'] = (a_n['sp_m'] > 0).astype(int).replace(0, -1)
    use_factor_ratio = use_factor_num / len(a_n.index)

    pnl_save_path = '/mnt/mfs/dat_whs/data/mix_factor_pnl/' + result_file_name
    sum_pnl_df = pd.DataFrame()
    for i in a_n.index:
        pnl_file_name = a_n['pnl_file_name'].loc[i]
        print('***************************************************')
        print('now {}\'s is running, key={}'.format(i, pnl_file_name))
        pnl_df = pd.read_pickle(
            os.path.join(pnl_save_path, '{}.csv'.format(pnl_file_name)))
        pnl_df.name = pnl_file_name
        sum_pnl_df = pd.concat([sum_pnl_df, pnl_df], axis=1)
    # _________________________________________________________________________________
    part_sum_pnl_df = sum_pnl_df.loc[:pd.to_datetime('20180601')]
    sharpe_df_after = part_sum_pnl_df.iloc[-100:].apply(bt.AZ_Sharpe_y)
    sharpe_df_after.name = 'sharpe_df_after'
    sharpe_df_before = part_sum_pnl_df.iloc[:-100].apply(bt.AZ_Sharpe_y)
    sharpe_df_before.name = 'sharpe_df_before'
    sharpe_df = part_sum_pnl_df.apply(bt.AZ_Sharpe_y)
    sharpe_df.name = 'sharpe_df'
    # info_df = pd.concat([sharpe_df_before, sharpe_df_after], axis=1)

    # _________________________________________________________________________________
    target_df = (sum_pnl_df > 0).astype(int)
    kmeans = KMeans(n_clusters=n).fit(target_df.T)

    kmeans_result = kmeans.labels_
    columns_list = target_df.columns
    group_df = pd.DataFrame(kmeans_result, index=columns_list)
    file_name_list = a_n['pnl_file_name'].values
    a_n['group_key'] = group_df.loc[file_name_list].values
    target_df = pd.DataFrame()

    for i in range(n):
        part_a_n = a_n[a_n['group_key'] == i].sort_values(by='sp_in')
        part_num = int(len(part_a_n) * use_factor_ratio)

        part_target_df = part_a_n[[
            'fun_name', 'name1', 'name2', 'name3', 'buy_sell'
        ]].iloc[:part_num]
        print(part_num)
        target_df = target_df.append(part_target_df)

    print(len(target_df))
    print(Counter(target_df['name1'].values))
    print(Counter(target_df['name2'].values))
    print(Counter(target_df['name3'].values))

    config_info = dict()
    config_info['factor_info'] = target_df
    config_info['sector_name'] = sector_name
    config_info['result_file_name'] = result_file_name
    config_info['if_weight'] = main_model.if_weight
    config_info['ic_weight'] = main_model.ic_weight
    config_info['hold_time'] = main_model.hold_time
    config_info['if_hedge'] = main_model.if_hedge
    config_info['if_only_long'] = main_model.if_only_long
    pd.to_pickle(config_info,
                 '/mnt/mfs/dat_whs/alpha_data/{}.pkl'.format(config_name))
Example #36
def trial(model_file_name, scenario, number_of_trials, rendering=False, graphs_suffix='', verbose=C_VERBOSE_NONE,
          store_history=False, compute_saliency=False, history_save_path='./output/history_test.pkl'):
    """
    Summary:
        Evaluate the trained DQN for a number of trials (number_of_trials).

    Args:
        model_file_name: string
            The saved trained DQN (Keras DNN h5 file).

        scenario: string
            The OpenAI gym scenario to be loaded by the Emulator.

        number_of_trials: int
            How many trials to execute.

        rendering: boolean
            If True, OpenAI gym environment rendering is enabled.

        graphs_suffix: string
            A suffix added in the graphs file names. To be used in case of multiple trials.

        verbose: int
            Verbose level (0: None, 1: INFO, 2: DEBUG)

        store_history: bool
            Store history data or not.

        compute_saliency: bool
            Computes saliency or not.

        history_save_path: str
            Where to store the history file.

    Raises:
        -

    Returns:
        trials_average_reward: float
            The average reward for the trial-episode (100 episodes)

    notes:
        -
    """

    if verbose > C_VERBOSE_NONE:
        print('\nEvaluate the trained DQN in ', str(number_of_trials), ' trials (episodes).', sep='')
        print('- model_file_name = ', model_file_name, ', scenario = ', scenario, ', number_of_trials = ',
              number_of_trials,
              ', rendering = ', rendering, ', graphs_suffix = ', graphs_suffix, sep='')

    # Create an Emulator object instance (seed fixed to 42)
    emulator = em.Emulator(scenario=scenario, average_reward_episodes=number_of_trials, statistics=True,
                           rendering=rendering, seed=42, verbose=verbose)

    # Create a Deep Neural Network object instance and load the trained model (model_file_name)
    dnn = deepNeuralNetwork.DeepNeuralNetwork(file_name=model_file_name, verbose=verbose)

    # Start measuring Trials time
    start_time = time.time()

    history = {
        'trial': [],
        'state': [],
        'action': [],
        'reward': [],
        'next_state': [],
        'done': [],
        'q_values': []
    }
    if compute_saliency:
        history['saliency'] = []

    # Trials
    # used as baseline for perturbation
    # for each feature, apply a random noise of 0.2 * (max(feature) - min(feature))
    state_min = np.array([-0.354871, -0.10391249, -0.468456, -0.89336216, -0.15218297, -0.4017307, 0, 0])
    state_max = np.array([-0.00462484, 1.4088593, 0.12988918, 0.05392841, 0.5564749, 0.8584606, 1, 1])
    for i in range(number_of_trials):

        current_state = emulator.start()

        while emulator.emulator_started:
            q_values = dnn.predict(current_state)
            action = np.argmax(q_values)

            if compute_saliency:
                # compute saliency
                saliency = np.zeros(NUM_STATE)
                for _ in range(NUM_SALIENCY_TESTS):
                    for j in range(NUM_STATE):
                        # perturb state
                        perturbed_state = np.array(current_state)
                        if j < 6:  # numerical states
                            perturbed_state[j] = SALIENCY_PERTURBATION * np.random.rand() \
                                                 * (state_max[j] - state_min[j]) + state_min[j]
                        else:  # boolean states
                            perturbed_state = current_state.copy()
                            perturbed_state[j] = 1 - perturbed_state[j]
                        q_values_preturbed = dnn.predict(perturbed_state)

                        max_q = np.max(q_values)
                        q_values /= max_q
                        q_values_preturbed /= max_q

                        q_value_dict = {a: q_values[0, a].astype(np.float64) for a in range(4)}
                        q_value_preturbed_dict = {a: q_values_preturbed[0, a].astype(np.float64) for a in range(4)}
                        saliency[j] = sarfa_saliency.computeSaliencyUsingSarfa(action,
                                                                               q_value_dict,
                                                                               q_value_preturbed_dict)[0]
                saliency /= NUM_SALIENCY_TESTS

            # Experience [s, a, r, s']
            experience = emulator.applyAction(action)

            # save data
            if store_history:
                history['trial'].append(i)
                history['state'].append(current_state)
                history['action'].append(action)
                history['reward'].append(experience[2])
                if experience[3] is not None:
                    history['next_state'].append(experience[3])
                    history['done'].append(False)
                else:
                    history['next_state'].append(current_state)
                    history['done'].append(True)
                history['q_values'].append(q_values)
                if compute_saliency:
                    history['saliency'].append(saliency)

            current_state = experience[3]

    if store_history:
        for k in history.keys():
            history[k] = np.array(history[k])
        history_save_dir = os.path.split(history_save_path)[0]
        if not os.path.exists(history_save_dir):
            os.makedirs(history_save_dir)
        pd.to_pickle(history, history_save_path)

    if verbose > C_VERBOSE_NONE:
        print('\nDQN ', str(number_of_trials), ' trials average = ', emulator.execution_statistics.values[-1, 3],
              ', in ',
              executionTimeToString(time.time() - start_time), sep='')

    return emulator.execution_statistics.values[-1, 3]
Example #37
def train(scenario, average_reward_episodes, rendering, hidden_layers, hidden_layers_size, memory_size, minibatch_size,
          optimizer_learning_rate, gamma, epsilon_decay_factor, maximum_episodes, model_file_name,
          converge_criteria=None, graphs_suffix='', seed=None, verbose=C_VERBOSE_NONE, store_history=False,
          history_save_path='./output/history_train.pkl'):
    """
    Summary:
        Trains a DQN model for solving the given OpenAI gym scenario.

    Args:
        scenario: string
            The OpenAI gym scenario to be solved.

        average_reward_episodes: int
            Over how many consecutive episodes the average reward should be calculated.

        rendering: boolean
            If True, OpenAI gym environment rendering is enabled.

        hidden_layers: int
            The number of hidden layers of the Deep Neural Network. Not including the first
            and last layer.

        hidden_layers_size: int
            The size of each hidden layer of the Neural Network.

        memory_size: int
            The size of the replay memory feature which will be used by the DQN.

        minibatch_size: int
            The minibatch size which will be retrieved randomly from the memory in each
            iteration in the DQN.

        optimizer_learning_rate: float
            The Adam optimizer learning rate used in the DNN.

        gamma: float
                The discount factor to be used in the equation (3) of [1].

        epsilon_decay_factor: float
            The decay factor of epsilon parameter, for each iteration step.

        maximum_episodes: int
            The maximum number of episodes to be executed. If DQN converges earlier the training stops.

        model_file_name: string
            The file in which the DQN trained model (DNN Keras) should be saved.

        converge_criteria: int or None
            The DQN convergence criteria (when the average reward is > 200 for converge_criteria
            consecutive episodes, the DQN is assumed to have converged).
            If None, the training continues till the maximum_episodes is reached.

        graphs_suffix: string
            A suffix added in the graphs file names. To be used in case of multiple trains.

        seed: int
            Optional seed to be used with the OpenAI gym environment, for results reproducibility.

        verbose: int
            Verbose level (0: None, 1: INFO, 2: DEBUG)

        store_history: bool
            Store history or not.

        history_save_path: str
            Where to store the history file.

    Raises:
        -

    Returns:
        convergence_episode: int
            The episode in which the DQN converges

        convergence_time: string (time)
            How long the DQN took to converge

        Returns None if converge_criteria is None

    notes:
        -
    """

    if verbose > C_VERBOSE_NONE:
        print('\nDQN Training Starts (scenario = ', scenario, ', average_reward_episodes = ', average_reward_episodes,
              ', rendering = ', rendering,
              ', hidden_layers = ', hidden_layers, ', hidden_layers_size = ', hidden_layers_size, ', memory_size = ',
              memory_size,
              ', minibatch_size = ', minibatch_size, ', optimizer_learning_rate = ', optimizer_learning_rate,
              ', gamma = ', gamma,
              ', epsilon_decay_factor = ', epsilon_decay_factor, ', maximum_episodes = ', maximum_episodes,
              ', model_file_name = ', model_file_name,
              ', converge_criteria = ', converge_criteria, ', graphs_suffix = ', graphs_suffix, ', seed = ', seed, ')',
              sep='')

    # If seed is given the apply it
    if seed is not None:
        applySeed(seed, verbose)

    # Create an Emulator object instance
    emulator = em.Emulator(scenario, average_reward_episodes, statistics=True, rendering=rendering, seed=seed,
                           verbose=verbose)

    # Create a Deep Neural Network object instance (Keras with Tensor Flow backend)
    dnn = deepNeuralNetwork.DeepNeuralNetwork(inputs=emulator.state_size, outputs=emulator.actions_number,
                                              hidden_layers=hidden_layers,
                                              hidden_layers_size=hidden_layers_size,
                                              optimizer_learning_rate=optimizer_learning_rate, seed=seed,
                                              verbose=verbose)

    # Create a DQN object instance (we always start from epsilon = 1.0 and control its decay with the
    # epsilon_decay_factor)
    dqn = deepQNetwork.DeepQNetwork(emulator=emulator, dnn=dnn, states_size=emulator.state_size,
                                    actions_number=emulator.actions_number,
                                    memory_size=memory_size, minibatch_size=minibatch_size, gamma=gamma, epsilon=1.0,
                                    epsilon_decay_factor=epsilon_decay_factor,
                                    seed=seed, verbose=verbose)

    # Start measuring training time
    start_time = time.time()

    history = {
        'trial': [],
        'state': [],
        'action': [],
        'reward': [],
        'next_state': [],
        'done': [],
        'q_values': []
    }

    if converge_criteria is not None:
        # Holds for how many consecutive episodes the average reward has been > 200
        convergence_counter = 0
        episodes_convergence_counter = []  # Holds the convergence_counter for all episodes
        convergence_episode = 0

    # Training starts here
    for i in range(maximum_episodes):
        current_state = emulator.start()

        # See Algorithm 1 in [1]
        while emulator.emulator_started:
            q_values = dnn.predict(current_state)
            action = np.argmax(q_values)

            # Experience [s, a, r, s']
            experience = emulator.applyAction(action)

            # save data
            if store_history:
                history['trial'].append(i)
                history['state'].append(current_state)
                history['action'].append(action)
                history['reward'].append(experience[2])
                if experience[3] is not None:
                    history['next_state'].append(experience[3])
                    history['done'].append(False)
                else:
                    history['next_state'].append(current_state)
                    history['done'].append(True)
                history['q_values'].append(q_values)

            dqn.storeTransition(experience)
            dqn.sampleRandomMinibatch()

            # s = s' at the end of the step, before starting the new step
            current_state = experience[3]

        if converge_criteria is not None:
            # Check whether the convergence counter should be increased or reset
            if emulator.average_reward > 200:
                convergence_counter += 1
            else:
                convergence_counter = 0

            episodes_convergence_counter.append(convergence_counter)

            if verbose > C_VERBOSE_NONE:
                print('Convergence Counter: ', convergence_counter, sep='')

            # The DQN model is assumed to have converged
            if convergence_counter >= converge_criteria:
                convergence_episode = i
                break

    if store_history:
        for k in history.keys():
            history[k] = np.array(history[k])
        history_save_dir = os.path.split(history_save_path)[0]
        if not os.path.exists(history_save_dir):
            os.makedirs(history_save_dir)
        pd.to_pickle(history, history_save_path)

    if converge_criteria is not None:
        convergence_time = time.time() - start_time

    if verbose > C_VERBOSE_NONE and converge_criteria is not None:
        print('\nDQN converged after ', convergence_episode, ' episodes in ', executionTimeToString(convergence_time),
              sep='')
    elif verbose > C_VERBOSE_NONE and converge_criteria is None:
        print('\nDQN trained for ', maximum_episodes, ' episodes in ', executionTimeToString(time.time() - start_time),
              sep='')

    # Create Graphs
    # 1. Steps per Episode
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 1], color='coral',
             linestyle='-')
    plt.grid(b=True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Steps', fontsize=12)
    plt.title('Steps per Episode', fontsize=12)
    plt.savefig('Steps_Per_Episode' + graphs_suffix + '.png')
    plt.clf()

    # 2. Total Reward per Training Episode
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 2], color='coral',
             linestyle='-',
             label='Total Reward')
    plt.plot(emulator.execution_statistics.values[:, 0], emulator.execution_statistics.values[:, 3],
             color='midnightblue', linestyle='--',
             label='Episodes Reward Average')
    plt.grid(b=True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Reward', fontsize=12)
    plt.title('Total Reward per Training Episode', fontsize=12)
    plt.legend(loc='lower right', fontsize=12)
    plt.savefig('Total_Reward_Per_Training_Episode' + graphs_suffix + '.png')
    plt.clf()

    # Save the trained model
    dnn.saveModel(model_file_name)

    if converge_criteria is not None:
        return convergence_episode
Example #38
    def kfold_run(self,
                  X_train, y_train,
                  X_test=None, y_test=None,
                  model_params=None,
                  n_folds=5,
                  stratify=False,
                  index_number=None,
                  flow_augment=False,
                  save_oof=False,
                  ):
        """KFold/StratifiedKFold run.

        # Arguments
            X_train: (numpy array), training set.
            y_train: (numpy array), training set labels.
            X_test: (numpy array), test set.
            y_test: (numpy array), test set labels.
            model_params: (Dict), dictionary of model parameters.
            n_folds: (Int), number of folds used in training.
            stratify: (Boolean), whether the fold split should be stratified according to the label distribution.
            index_number: (Int), index specifying from which bag training or prediction should be started.
            flow_augment: (Boolean), whether to use data augmentation during test and prediction.
            save_oof: (Boolean), whether to automatically save oof predictions.
                Assumes oof/train and oof/test folders in source directory.

        # Returns
            model: (Keras model), trained model for last fold.
            oof_train: (numpy array), array with out-of-fold training set predictions.
            if predict_test additionally:
                oof_test: (numpy array), array with out-of-fold test set predictions.
        """

        if index_number is not None:
            self.i = index_number
            oof_index = 0

        if len(y_train.shape) == 1:
            y_train = y_train.reshape((y_train.shape[0], 1))

        self.oof_train = np.zeros(y_train.shape + (1,))
        print('OOF train predictions shape: {}'.format(self.oof_train.shape))

        if X_test is not None:
            self.oof_test = np.zeros(
                (X_test.shape[0],) + y_train.shape[1:] + (n_folds,))
            print('OOF test predictions shape: {}'.format(self.oof_test.shape))

        if stratify and self.oof_train.shape[-2] != 1:
            print(
                'To use StratifiedKFold please provide categorically encoded labels, not One-Hot encoded. \
                \n Reversing OH encoding now.')
            y_train_split = pd.DataFrame(y_train).idxmax(axis=1).values
            print('Labels after reversed encoding:', y_train_split[:10])
            kf = StratifiedKFold(
                n_splits=n_folds, shuffle=self.shuffle, random_state=self.seed)
        else:
            kf = KFold(
                n_splits=n_folds, shuffle=self.shuffle, random_state=self.seed)
            y_train_split = y_train

        for train_index, test_index in kf.split(X_train, y_train_split):
            print('Training on fold:', self.i, '\n')

            X_tr, X_val = X_train[train_index], X_train[test_index]
            y_tr, y_val = y_train[train_index], y_train[test_index]

            model = self.model_name(model_params)

            if self.save_statistics:
                os.makedirs('{}{}'.format(
                    self.checkpoints_dst, self.run_save_name), exist_ok=True)

            if self.save_model:
                self.callbacks_append_checkpoint('fold')
            if self.save_history:
                self.callbacks_append_logger('fold')

            if self.load_keras_model:
                model = self.load_trained_model('fold')
            else:
                if flow_augment:
                    print('Training with data augmentation.')
                    history = model.fit_generator(
                        self.train_datagen.flow(
                            X_tr, y_tr, batch_size=self.batch_size),
                        steps_per_epoch=X_tr.shape[0] / self.batch_size,
                        epochs=self.number_epochs,
                        validation_data=self.valid_datagen.flow(
                            X_val, y_val, batch_size=self.batch_size,
                            shuffle=False),
                        validation_steps=X_val.shape[0] / self.batch_size,
                        callbacks=self.model_callbacks)
                else:
                    history = model.fit(X_tr, y_tr, verbose=self.verbose,
                                        batch_size=self.batch_size, epochs=self.number_epochs,
                                        validation_data=(X_val, y_val),
                                        callbacks=self.model_callbacks)

            if not self.load_keras_model:
                validation_loss = history.history['val_loss']
                self.loss_history.append(validation_loss)
                self.min_losses.append(np.min(validation_loss))
                if self.output_statistics:
                    self.output_run_statistics('fold')

            print('Predicting on validation data.')
            self.oof_train[test_index, :, 0] = model.predict(
                X_val, batch_size=self.batch_size)
            if self.verbose:
                print('Validation split - standard deviation for original target values: {} \n \
                for predicted target values: {} \n \n'.format(
                    np.std(y_val), np.std(self.oof_train[test_index, :])))

            if self.predict_test and X_test is not None:
                print('Predicting on test data.')
                if flow_augment:
                    self.oof_test[:, :, oof_index] = self.flow_predict_test_augment(
                        X_test, model)
                else:
                    self.oof_test[:, :, oof_index] = model.predict(
                        X_test, batch_size=self.batch_size)
                oof_index += 1

            self.i += 1
            if not self.load_keras_model:
                if self.output_statistics:
                    self.output_run_statistics('fold')

        if self.predict_test and save_oof:
            pd.to_pickle(np.array(self.oof_train), 'oof/train/{}_{:.5f}.pkl'.format(
                self.run_save_name, np.array(self.min_losses).mean(axis=0)))
            pd.to_pickle(np.array(self.oof_test), 'oof/test/{}_{:.5f}.pkl'.format(
                self.run_save_name, np.array(self.min_losses).mean(axis=0)))

        if self.predict_test and X_test is not None:
            return model, np.array(self.oof_train), np.array(self.oof_test)
        return model, np.array(self.oof_train).mean(axis=-1)
    #     'GROUP', 'TYPE', 'CSS', 'NIP']
    cols = [
        'TIMESTAMP_UTC', 'EVENT_SENTIMENT_SCORE', 'EVENT_RELEVANCE', 'CSS',
        'NIP'
    ]
    df = df[cols]

    prices = read_data_from_csv(prices_path)
    prices['Date'] = prices['Date'].apply(
        lambda x: x[:10])  # ****-**-** format

    news = adjust_dates_to_trading_dates(df)
    prices = remove_prices_no_news(prices, news)
    news = remove_non_trading_dates(prices, news)

    assert prices.shape[0] == np.unique(news['Date']).shape[0]
    assert list(np.unique(
        prices['Date'])) == [str(d) for d in np.unique(news['Date'])]

    counts, indices = group_by_date(news)
    assert len(counts) == prices.shape[0]

    news = weight_news_by_time(news, counts, indices)
    print(news.head())

    X = concat_with_prices(prices, news)
    print(X.head())
    pd.to_pickle(X, '../Data/IBM_X_data.pkl', protocol=4)
    y = prices['Close']
    pd.to_pickle(y, '../Data/IBM_close_data.pkl', protocol=4)
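
The protocol argument above is forwarded to pickle; protocol 4 (available since Python 3.4) supports very large objects. A minimal sketch with a placeholder object and path:

# Minimal sketch: pd.to_pickle forwards protocol to pickle (pandas defaults to pickle's
# highest available protocol). The object and path below are placeholders.
import pandas as pd

obj = {"close": [1.0, 2.0, 3.0]}
pd.to_pickle(obj, "/tmp/example_protocol4.pkl", protocol=4)
restored = pd.read_pickle("/tmp/example_protocol4.pkl")
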
Example #40
def save_progress(names, new_patents, pickle_names_path, output_path):
    pd.to_pickle(names, pickle_names_path)
    store_patents(new_patents, output_path)