def get_silhouette(df):
    df = df[(df.AB != ".")].copy()
    df.loc[:, 'AB'] = pd.to_numeric(df.loc[:, 'AB'])
    df.loc[:, 'CN'] = pd.to_numeric(df.loc[:, 'CN'])
    tp = df.iloc[0, :].loc['svtype']
    [mn_CN, mn_AB] = df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB] = df.loc[:, ['CN', 'AB']].std(skipna=True)
    if df.loc[:, 'GT'].unique().size == 1:
        df.loc[:, 'sil_gt_avg'] = 1
        df.loc[:, 'sil_gt'] = 1
        df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df
    # standardize the 2 dims
    if sd_AB > 0.01:
        df.loc[:, 'AB1'] = (df.loc[:, 'AB'] - mn_AB) / sd_AB
    else:
        df.loc[:, 'AB1'] = df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN > 0.01:
        df.loc[:, 'CN1'] = (df.loc[:, 'CN'] - mn_CN) / sd_CN
    else:
        df.loc[:, 'CN1'] = df.loc[:, 'CN']
    gt_code = {'0/0': 1, '0/1': 2, '1/1': 3}
    df.loc[:, 'gtn'] = df.loc[:, 'GT'].map(gt_code)
    dist_2d_sq = spatial.distance.squareform(
        spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg'] = metrics.silhouette_score(
        dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt'] = metrics.silhouette_samples(
        dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
def parse_bammarkduplicates(fn):
    """
    Parse the output from biobambam2's bammarkduplicates and return as pandas
    Series.

    Parameters
    ----------
    fn : str
        Path to the output file to parse.

    Returns
    -------
    metrics : pandas.Series
        Duplicate metrics.

    hist : pandas.Series
        Duplicate histogram.

    """
    with open(fn) as f:
        lines = [x.strip().split('\t') for x in f.readlines()]
    metrics = pd.Series(lines[4], lines[3])
    m = pd.to_numeric(metrics[metrics.index[1:]])
    metrics[m.index] = m.values
    vals = np.array(lines[8:-1])
    hist = pd.Series(vals[:, 1], index=[int(float(x)) for x in vals[:, 0]])
    hist = pd.to_numeric(hist)
    return metrics, hist
def plot_week_data_facet(df, sample_type, metric, hue=None, hide_donor_baseline=False, hide_control_baseline=False, dm=None, save=True): df['week'] = pd.to_numeric(df['week'], errors='coerce') df[metric] = pd.to_numeric(df[metric], errors='coerce') df = df.sort_values(by='week') asd_data = filter_sample_md(df, [('SampleType', sample_type), ('Group', 'autism')]) order = sorted(asd_data['SubjectID'].unique()) grid = sns.FacetGrid(asd_data, col="SubjectID", hue=hue, col_wrap=6, size=1.5, palette=palette, col_order=order) control_y = np.median(control_metric(df, sample_type, metric=metric)) grid.map(plt.plot, "week", metric, marker="o", ms=4) if not hide_control_baseline: grid.map(plt.axhline, y=control_y, ls="--", c=palette['neurotypical']) if not hide_donor_baseline: donor_initial_y = np.median(donor_metric(df, metric=metric, group='donor-initial', sample_type=sample_type)) donor_maintenance_y = np.median(donor_metric(df, metric=metric, group='donor-maintenance', sample_type=sample_type)) grid.map(plt.axhline, y=donor_initial_y, ls="--", c=palette['donor']) grid.map(plt.axhline, y=donor_maintenance_y, ls=":", c=palette['donor']) if dm is not None: inter_nt_dm = inter_neurotypical_distances(df, dm, sample_type=sample_type) median_inter_nt = np.median(inter_nt_dm.condensed_form()) grid.map(plt.axhline, y=median_inter_nt, color=palette['neurotypical'], linestyle='-.', label='between neurotypical distance (median)') grid.set(xticks=[0, 3, 10, 18], xlim=(-0.5, 18.5)) grid.set_axis_labels("", "") grid.fig.tight_layout(w_pad=1) if save: filename = '%s-%s-%s-detail.pdf' % (sample_type, metric.replace(' ', '-'), hue) grid.savefig('engraftment-plots/%s' % filename) return grid
def import_data(self): ''' Reads ICICIDirect csv file row by row and populates one Transaction Fileeach companyDB is with companyName as key and multiple transactions as values ''' # companyDB = defaultdict(list) # f = open(self.csvFileName, 'r') # opens the csv file # try: # reader = csv.reader(f, delimiter=',', quotechar='|') # Creates the reader object # for row in reader: # Iterates the rows of the file in orders # date = datetime.datetime.strptime(row[0], "%d-%b-%y").date() # company = row[1] # action = row[2] # quantity = int(row[3]) # rate = float(row[4]) # brokerage = float(row[6]) # row[5] is TOTAL cost, ignored, as all the rest # transaction = Transaction(date, company, action, quantity, rate, brokerage) # companyDB.setdefault(company, []).append(transaction) # finally: # f.close() # closing # return companyDB if os.path.isfile(self.csvFileName): column_names = ['Date','CompanyName','OrderType','OrderQuantity','OrderPrice','OrderTotal','OrderCommision','OrderId1','OrderId2','OrderRolling','Account','Exchange'] df = pd.read_csv(self.csvFileName, header=None, names = column_names) df = df[['Date','CompanyName','OrderType','OrderQuantity','OrderPrice']] df['Date'] = pd.to_datetime(df['Date'], format="%d-%b-%y").dt.date df['OrderQuantity'] = pd.to_numeric(df['OrderQuantity'].astype(str).str.replace(',',''), errors='coerce') df['OrderPrice'] = pd.to_numeric(df['OrderPrice'].str.replace(",","").astype(str).str.replace(',',''), errors='coerce') df = df.sort_values(['CompanyName','Date'], ascending=[True,True]) list_of_companies = [] for name, subdf in df.groupby('CompanyName'): list_of_records = subdf.to_dict('records') transactions = [Transaction(rec['Date'],rec['CompanyName'],rec['OrderType'],rec['OrderQuantity'],rec['OrderPrice'],0) for rec in list_of_records] list_of_companies.append(Company(name, transactions)) self.list_of_companies = list_of_companies
def clean_pitching(self): """ Does basic cleaning of picthing table """ players_pitching = self.players_pitching.drop('index', axis=1) players_pitching = players_pitching[players_pitching.HR != 'HR'] players_pitching.Tm = players_pitching.Tm.apply( lambda x: filter(lambda y: y in printable, x)) # dropping rows that has 'Teams' in Teams column players_pitching.Tm = players_pitching['Tm'].apply( lambda x: 'toss_away' if 'Teams' in x else x) players_pitching = players_pitching[players_pitching.Tm != 'toss_away'] # dropping W-L perc as it is largely correlated players_pitching = players_pitching.drop('W-L_perc', axis=1) # fill missing Aff with N/A players_pitching.Aff = players_pitching.Aff.fillna('N/A') # dropping rows with missing AgeDif players_pitching = players_pitching[players_pitching.AgeDif.notnull()] # fill the rest of missing values with 0 players_pitching = players_pitching.fillna(0) # dropping players whose age is '--' for id in self.drop_id_lst: players_pitching = players_pitching[players_pitching[ 'player_id'] != id] players_pitching[['Age', 'AgeDif']] = players_pitching[ ['Age', 'AgeDif']].apply(lambda x: pd.to_numeric(x)) players_pitching[players_pitching.columns[8:]] =\ players_pitching[players_pitching.columns[8:]].apply( lambda x: pd.to_numeric(x))
def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
    # see gh-24910
    #
    # Even if we discover that we have to hold float, does not mean
    # we should be lenient on subsequent elements that fail to be integer.
    kwargs = dict(errors=errors) if errors is not None else dict()
    arr = [str(-large_val if signed else large_val)]

    if multiple_elts:
        arr.insert(0, large_val)

    if errors in (None, "raise"):
        index = int(multiple_elts)
        msg = "Integer out of range. at position {index}".format(index=index)

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
    else:
        result = to_numeric(arr, **kwargs)

        if errors == "coerce":
            expected = [float(i) for i in arr]
            exp_dtype = float
        else:
            expected = arr
            exp_dtype = object

        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
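# The test above pins down how to_numeric treats integers that overflow the
# int64/uint64 range: the default errors='raise' refuses them, while
# errors='coerce' falls back to float. A minimal standalone sketch of that
# behaviour (the value below is arbitrary, just large enough to overflow):
import pandas as pd

huge = str(2 ** 65)  # arbitrary, well past the uint64 range

print(pd.to_numeric([huge], errors='coerce'))  # falls back to a float64 array

try:
    pd.to_numeric([huge])  # default errors='raise'
except ValueError as exc:
    print(exc)  # "Integer out of range. at position 0"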
def main(): datasets_path = r'D:\data\jdcl\datasets\2017-11-20_2017-12-23.csv' model_path = r'D:\model\randomforest_2017-11-20_2017-12-23.p' cols = [] # model = pickle.load(open(model_path)) label_name = 'label_class' df = pd.read_csv(datasets_path) for col in df.columns: try: pd.to_numeric(df[col]) cols.append(col) except BaseException: # print(col) # traceback.print_exc() pass date_1 = '2017-12-01' date_2 = '2017-12-10' date_3 = '2017-12-30' TdX = df.loc[df.create_time < date_1, cols].values TsX = df.loc[(df.create_time > date_1) & (df.create_time < date_2), cols].values S = df.loc[(df.create_time > date_2) & (df.create_time < date_3), cols].values label_df = df[[label_name, 'create_time']] label_d_df = label_df.loc[label_df.create_time < date_1, label_name].values label_s_df = label_df.loc[(label_df.create_time > date_1) & (label_df.create_time < date_2), label_name].values classifier = TrAdaBoostClassifier() result = classifier.fit(TdX, label_d_df, TsX, label_s_df, S, base_classifier=DecisionTreeClassifier) print(TdX.shape[0] + TsX.shape[0]) np.save('result.npy', result)
def get_data_ethbtc(): order_book = public_client.get_product_order_book('ETH-BTC', level=3) ask_tbl = pd.DataFrame(data=order_book['asks'], columns=['price', 'volume', 'address']) bid_tbl = pd.DataFrame(data=order_book['bids'], columns=['price', 'volume', 'address']) # building subsetted table for ask data only # sell side (would be Magma) ask_tbl['price'] = pd.to_numeric(ask_tbl['price']) ask_tbl['volume'] = pd.to_numeric(ask_tbl['volume']) first_ask = float(ask_tbl.iloc[1, 0]) perc_above_first_ask = (1.025 * first_ask) ask_tbl = ask_tbl[(ask_tbl['price'] <= perc_above_first_ask)] ask_tbl['color'] = 'red' # building subsetted table for bid data only # buy side (would be Viridis) bid_tbl['price'] = pd.to_numeric(bid_tbl['price']) bid_tbl['volume'] = pd.to_numeric(bid_tbl['volume']) first_bid = float(bid_tbl.iloc[1, 0]) perc_above_first_bid = (0.975 * first_bid) bid_tbl = bid_tbl[(bid_tbl['price'] >= perc_above_first_bid)] bid_tbl['color'] = 'green' # append the buy and sell side tables to create one cohesive view fulltbl = bid_tbl.append(ask_tbl) # limit our view to only orders greater than or equal to 1 ETH in size fulltbl = fulltbl[(fulltbl['volume'] >= 1)] # takes the square root of the volume (to be used later on for the purpose of sizing the orders fulltbl['sqrt'] = np.sqrt(fulltbl['volume']) # takes average of closet bid and ask to determine the market price fulltbl['market_price'] = ((perc_above_first_ask + perc_above_first_bid) / 2) return fulltbl
def _get_Laskar_data(verbose=True):
    longorbit = {}
    sources = {}
    pandas_kwargs = {'delim_whitespace': True,
                     'header': None,
                     'index_col': 0,
                     'names': ['kyear', 'ecc', 'obliquity', 'long_peri'],}
    for time in filenames:
        local_path = os.path.join(os.path.dirname(__file__), "data", filenames[time])
        remote_path = base_url + filenames[time]
        if time == 'future':
            pandas_kwargs['skiprows'] = 1  # first row is kyear=0, redundant
        longorbit[time], path = load_data_source(local_path=local_path,
                                                 remote_source_list=[remote_path],
                                                 open_method=pd.read_csv,
                                                 open_method_kwargs=pandas_kwargs,
                                                 verbose=verbose)
        sources[time] = path
    xlongorbit = {}
    for time in ['past', 'future']:
        # Cannot convert to float until we replace the D notation with E for floating point numbers
        longorbit[time].replace(to_replace='D', value='E', regex=True, inplace=True)
        xlongorbit[time] = xr.Dataset()
        xlongorbit[time]['ecc'] = xr.DataArray(pd.to_numeric(longorbit[time]['ecc']))
        for field in ['obliquity', 'long_peri']:
            xlongorbit[time][field] = xr.DataArray(np.rad2deg(pd.to_numeric(longorbit[time][field])))
    longorbit = xr.concat([xlongorbit['past'], xlongorbit['future']], dim='kyear')
    # add 180 degrees to long_peri (see lambda definition, Berger 1978 Appendix)
    longorbit['long_peri'] += 180.
    longorbit['precession'] = longorbit.ecc * np.sin(np.deg2rad(longorbit.long_peri))
    longorbit.attrs['Description'] = 'The Laskar et al. (2004) orbital data table'
    longorbit.attrs['Citation'] = 'https://doi.org/10.1051/0004-6361:20041335'
    longorbit.attrs['Source'] = [sources[time] for time in sources]
    longorbit.attrs['Note'] = 'Longitude of perihelion is defined to be 0 degrees at Northern Vernal Equinox. This differs by 180 degrees from the source files.'
    return longorbit
def coerce_types(df, field_properties):
    decimal_fields = []
    integer_fields = []
    string_fields = []
    datetime_fields = []

    for fp in field_properties:
        name = fp['name']
        data_type = fp['type']

        if data_type in fields_to_coerce_to_float:
            decimal_fields.append(name)
        elif data_type in fields_to_coerce_to_integer:
            integer_fields.append(name)
        elif data_type in fields_to_coerce_to_string:
            string_fields.append(name)
        elif data_type in fields_to_coerce_to_datetime:
            datetime_fields.append(name)

    # Forcing data types
    for decimal_field in decimal_fields:
        df[decimal_field] = pd.to_numeric(df[decimal_field], errors='coerce')

    for integer_field in integer_fields:
        df[integer_field] = pd.to_numeric(df[integer_field], errors='coerce')

    for datetime_field in datetime_fields:
        df[datetime_field] = pd.to_datetime(
            df[datetime_field],
            errors='coerce',
            infer_datetime_format=True
        )

    return df
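# A hedged usage sketch for coerce_types: the type-group sets it reads
# (fields_to_coerce_to_float, etc.) are defined elsewhere in the module, so the
# stand-in definitions and field names below are assumptions for illustration only.
import pandas as pd

fields_to_coerce_to_float = {'decimal'}
fields_to_coerce_to_integer = {'integer'}
fields_to_coerce_to_string = {'string'}
fields_to_coerce_to_datetime = {'datetime'}

df = pd.DataFrame({
    'price': ['1.5', '2.0', 'n/a'],
    'count': ['3', 'x', '7'],
    'when': ['2020-01-01', 'not a date', '2020-03-01'],
})
field_properties = [
    {'name': 'price', 'type': 'decimal'},
    {'name': 'count', 'type': 'integer'},
    {'name': 'when', 'type': 'datetime'},
]

df = coerce_types(df, field_properties)
print(df.dtypes)  # price and count end up float64 (bad cells -> NaN), when becomes datetime64[ns]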
def test_numeric(self): s = pd.Series([1, -3.14, 7], dtype='O') res = to_numeric(s) expected = pd.Series([1, -3.14, 7]) tm.assert_series_equal(res, expected) s = pd.Series([1, -3.14, 7]) res = to_numeric(s) tm.assert_series_equal(res, expected) # GH 14827 df = pd.DataFrame(dict( a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'], b=[1.0, 2.0, 3.0, 4.0], )) expected = pd.DataFrame(dict( a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0], )) # Test to_numeric over one column df_copy = df.copy() df_copy['a'] = df_copy['a'].apply(to_numeric) tm.assert_frame_equal(df_copy, expected) # Test to_numeric over multiple columns df_copy = df.copy() df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric) tm.assert_frame_equal(df_copy, expected)
def get_data(ticker, start, end): global missing_tickers df = None # Gets stock data from web try: # Creates Google Finance url url = '''http://www.google.com/finance/historical?q=''' + ticker.replace('^', '-') + '''&startdate=''' + start.strftime("%B")[:3] url += '''%20''' + str(start.day) + ''',%20''' + str(start.year) + '''&enddate=''' + end.strftime("%B")[:3] + '''%20''' url += str(end.day) + ''',%20''' + str(end.year) + '''&output=csv''' df = pd.read_csv(url).rename(columns = {'\xef\xbb\xbfDate': 'Date'}) df[open_str] = pd.to_numeric(df[open_str], errors = 'coerce') df[close_str] = pd.to_numeric(df[close_str], errors = 'coerce') except: try: # Creates yahoo finance url url = '''http://ichart.finance.yahoo.com/table.csv?s=''' + ticker + '''&a=''' + str(start.month - 1) url += '''&b=''' + "%02d" % start.day + '''&c=''' + str(start.year) + '''&d=''' + str(end.month - 1) url += '''&e=''' + "%02d" % end.day + '''&f=''' + str(end.year) + '''&g=d&ignore=.csv''' df = pd.read_csv(url).rename(columns = {close_str: 'Non Adjust Close', 'Adj Close': close_str}) except: missing_tickers.append(ticker) if (df is not None) and (len(df) > 0): df[return_str] = df.apply(lambda row: return_rate(row[open_str], row[close_str]), axis = 1) df['ticker'] = ticker df['Date'] = pd.to_datetime(df['Date']) return df
def get_empty_summary_df():
    # create an empty dataframe with just the site_names, xs, ys, and srcs to fill in the summary data
    empty_daily_tots_df = get_data_frame_from_table('sites_list')
    empty_daily_tots_df = empty_daily_tots_df.set_index('site_name')
    empty_daily_tots_df['x'] = pd.to_numeric(empty_daily_tots_df['x'])
    empty_daily_tots_df['y'] = pd.to_numeric(empty_daily_tots_df['y'])
    return empty_daily_tots_df
def get_concentration_functions(composition_table_dict): meta = composition_table_dict['meta'] composition_table = Table.from_dict(composition_table_dict['data']) elements = [col for col in composition_table.columns if col not in meta] x = composition_table["X"].values y = composition_table["Y"].values cats = composition_table["X"].unique() concentration, conc, d, y_c, functions = {}, {}, {}, {}, RecursiveDict() for el in elements: concentration[el] = to_numeric(composition_table[el].values)/100. conc[el], d[el], y_c[el] = {}, {}, {} if meta['X'] == 'category': for i in cats: k = '{:06.2f}'.format(float(i)) y_c[el][k] = to_numeric(y[where(x==i)]) conc[el][k] = to_numeric(concentration[el][where(x==i)]) d[el][k] = interp1d(y_c[el][k], conc[el][k]) functions[el] = lambda a, b, el=el: d[el][a](b) else: functions[el] = interp2d(float(x), float(y), concentration[el]) return functions
def parse_mark_duplicate_metrics(fn):
    """
    Parse the output from Picard's MarkDuplicates and return as pandas Series.

    Parameters
    ----------
    filename : str of filename or file handle
        Filename of the Picard output you want to parse.

    Returns
    -------
    metrics : pandas.Series
        Duplicate metrics.

    hist : pandas.Series
        Duplicate histogram.

    """
    with open(fn) as f:
        lines = [x.strip().split('\t') for x in f.readlines()]
    metrics = pd.Series(lines[7], lines[6])
    m = pd.to_numeric(metrics[metrics.index[1:]])
    metrics[m.index] = m.values
    vals = np.array(lines[11:-1])
    hist = pd.Series(vals[:, 1], index=[int(float(x)) for x in vals[:, 0]])
    hist = pd.to_numeric(hist)
    return metrics, hist
def boxplotArray(data, pGroups=None, thr=None, ax=None):
    if ax is None:
        w, h = (6.4, 4.8)
        dpi = 100
        fig = plt.figure(figsize=(w, h))
        ax = fig.add_axes([70.0/w/dpi, 54.0/h/dpi, 1 - 2*70.0/w/dpi, 1 - 2*54.0/h/dpi])
    if pGroups is None:
        df = pd.DataFrame()
        df["x"] = pd.to_numeric(pd.Series(data[2:]))
        ax.boxplot(df["x"])
    else:
        bdata = []
        for k in range(len(pGroups)):
            df = pd.DataFrame()
            order = pGroups[k][2]
            val = [data[i] for i in order if data[i] != ""]
            df["x"] = pd.to_numeric(pd.Series(val))
            bdata.append(df["x"])
        bp = ax.boxplot(bdata, patch_artist=True)
        ax.set_xticklabels([g[0] for g in pGroups])
        for k in range(len(pGroups)):
            bp['boxes'][k].set(color=pGroups[k][1])
            bp['boxes'][k].set(facecolor=pGroups[k][1], alpha=0.2)
            bp['fliers'][k].set(color=pGroups[k][1])
            bp['medians'][k].set(color=pGroups[k][1])
        for k in range(2 * len(pGroups)):
            # two whiskers/caps per box, so integer-divide to map back to the group
            bp['whiskers'][k].set(color=pGroups[k // 2][1])
            bp['caps'][k].set(color=pGroups[k // 2][1])
    ax.set_ylabel(data[1])
    if thr is not None:
        ax.axhline(y=thr[1], color='r')
        ax.axhline(y=thr[3], color='cyan')
        ax.axhline(y=thr[4], color='cyan')
    return ax
def readThreeColumnTruth(fn, suffix=""):
    df = pd.read_csv(fn, sep=' ', skiprows=1,
                     names=['Name', 'Gene{}'.format(suffix), 'TPM{}'.format(suffix)],
                     engine='c')
    df.set_index("Name", inplace=True)
    # assign the converted column back; the bare to_numeric call had no effect
    df["TPM{}".format(suffix)] = pd.to_numeric(df["TPM{}".format(suffix)], errors='ignore')
    return df
def get_df_from_annovar_csv(df, chunk_ids): df = df.rename(columns={'1000g2015aug_all': 'ThousandGenomeAll'}) df.Chr = df.Chr.replace(to_replace='chrM', value='chrMT') df['Start'] = pandas.to_numeric(df['Start']) df['End'] = pandas.to_numeric(df['End']) df["nci60"] = utilities.to_float(df, "nci60") df["ThousandGenomeAll"] = utilities.to_float(df, "ThousandGenomeAll") df["ESP6500si_ALL"] = utilities.to_float(df, "ESP6500si_ALL") df["tfbsConsSites"] = df["tfbsConsSites"].dropna().apply(utilities.cell_to_dict) utilities.split_string(df, "Func.knownGene") utilities.split_string(df, "ExonicFunc.knownGene") #df["targetScanS"] = df["targetScanS"].dropna().apply(utilities.cell_to_dict) df["genomicSuperDups"] = df["genomicSuperDups"].dropna().apply(utilities.cell_to_dict) df["cytoBand"] = df["cytoBand"].dropna().apply(utilities.split_cytoband) df["cytoBand"] = df["cytoBand"].dropna().apply(utilities.lists_to_dict) df['hgvs_key'] = pandas.Series(chunk_ids) my_sample_id = df["Otherinfo"].dropna().apply(genotype_calling.split_sample_ID) genotype_call = my_sample_id.apply(lambda x: x[-2::]) dict_split = genotype_call.apply(genotype_calling.return_dict) df['Otherinfo'] = dict_split df = df.rename(columns={'Otherinfo': 'Genotype_Call'}) #Clean up dataframe df = utilities.modify_df(df) df_final = df.where((pandas.notnull(df)), None) return df_final
def plotBooleanSelect(obj, pHash=None): info = getHegemonDataset(obj[5]); thr = getHegemonThr(obj[5], obj[0], obj[2]); thash = {} for v in thr: thash[v[0]] = v datax = getHegemonPtr(obj[4], obj[8]) datay = getHegemonPtr(obj[4], obj[9]) thrx = thash[str(obj[0])] thry = thash[str(obj[2])] df = pd.DataFrame() if pHash is None: df["x"] = pd.to_numeric(pd.Series(datax[1][2:])) df["y"] = pd.to_numeric(pd.Series(datay[1][2:])) else: order = [i for i in range(2, len(datax[0])) if datax[0][i] in pHash] val = [datax[1][i] for i in order] df["x"] = pd.to_numeric(pd.Series(val)) val = [datay[1][i] for i in order] df["y"] = pd.to_numeric(pd.Series(val)) ax = df.plot.scatter(x='x', y='y') ax.set_xlabel(obj[6]) ax.set_ylabel(obj[7]) ax.set_title("{0} (n = {1})".format(info[1], info[2])) ax.axhline(y=thry[1], color='r') ax.axhline(y=thry[3], color='cyan') ax.axhline(y=thry[4], color='cyan') ax.axvline(x=thrx[1], color='r') ax.axvline(x=thrx[3], color='cyan') ax.axvline(x=thrx[4], color='cyan') return ax
def get_k_frame(cls, con, code): k_df = tushare.bar(code, con, adj='qfq') k_df_size = k_df.index.size if not k_df_size: return k_df k_df['high'] = pandas.to_numeric(k_df['high']) k_df['low'] = pandas.to_numeric(k_df['low']) k_df['open'] = pandas.to_numeric(k_df['open']) k_df['close'] = pandas.to_numeric(k_df['close']) k_df['date'] = k_df.index k_df['date'] = k_df['date'].apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d")) k_df = k_df.drop(['vol', 'amount'], axis=1) k_df = k_df.reset_index(drop=True) k_df = k_df.sort_values(by='date').reset_index(drop=True) k_df['k_pos'] = k_df.index def _get_per_change(x): if x == 0: _pc = (k_df.at[0, "close"] - k_df.at[0, "open"]) / k_df.at[0, "open"] * 100 else: _pc = (k_df.at[x, "close"] - k_df.at[x-1, "close"]) / k_df.at[x-1, "close"] * 100 return float("%0.2f" % _pc) k_df["per_change"] = k_df.k_pos.apply(_get_per_change) return k_df
def arrange_datas(MODEL_NUMBER): MODEL_PATH = './ahmet_models/model' + str(MODEL_NUMBER) + '.h5' #import h5py # to fix loading model problem #f = h5py.File(MODEL_PATH, 'r+') #del f['optimizer_weights'] #f.close() model = load_model(MODEL_PATH) print("Train is being prepared..") model_new = Model(inputs=model.input, outputs=model.layers[-5].output) train_xgb= model_new.predict((X),batch_size=240, verbose=1) #model.layers print('\n New train shape: '+str(train_xgb.shape)) train_xgb=pd.DataFrame(train_xgb) df['inc_angle'] = pd.to_numeric(df['inc_angle'], errors='coerce') train_xgb['angle'] =df['inc_angle'] train_xgb['angle'] =train_xgb['angle'].fillna(train_xgb['angle'].median()) train_xgb=np.array(train_xgb) print("\nTest is being prepared..") test_xgb= model_new.predict((X_test), batch_size=240,verbose=1) print('\n New test shape: '+str(test_xgb.shape)) test_xgb=np.array(test_xgb) test_xgb=pd.DataFrame(test_xgb) df_test['inc_angle'] = pd.to_numeric(df_test['inc_angle'], errors='coerce') test_xgb['angle'] =df_test['inc_angle'] test_xgb=np.array(test_xgb) return train_xgb,test_xgb
def preprocess_people(data): # TODO refactor this duplication data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) data.date = pd.to_datetime(data.date) # Values in the people df is Booleans and Strings columns = list(data.columns) columns.remove("date") bools = columns[11:] strings = columns[1:11] for col in bools: data[col] = pd.to_numeric(data[col]).astype(int) for col in strings: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) # Add features from date data["year_p"] = data.date.apply(lambda x: x.year) data["month_p"] = data.date.apply(lambda x: x.month) data["day_p"] = data.date.apply(lambda x: x.day) data = data.drop(['date'], axis=1) return data
def validate_split_data( raw_data, split_data, split_obj, float_limit=float(TEST_CONFIG.get('TEST', 'float_limit')) ): """validate data that did not split Args: raw_data (:obj:`pandas.DataFrame`): raw data (A group) split_data (:obj:`pandas.DataFrame`): split data (B group) split_obj (:obj:`split_utils.SplitInfo`): split information float_limit (float): maximum deviation for equality test Raises: AssertionError: asserts expected shapes """ for column in split_data.columns.values: #print(split_data[column]) #print(raw_data[column]) if column == 'date': assert split_data[column].equals(raw_data[column]) elif column == 'index': continue elif column in split_utils.PRICE_KEYS: diff = abs( pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]) * split_obj ) assert diff.max() < float_limit else: diff = abs( pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]) / split_obj ) assert diff.max() < float_limit
def parse_xml(token):
    """Attempt to parse the XML into something useful"""
    root = ET.fromstring(token)
    hml = HMLData()
    hml.station = root.attrib['id']
    hml.stationname = root.attrib.get('name')
    hml.originator = root.attrib.get('originator')
    hml.generationtime = parseUTC(root.attrib['generationtime'])
    for child in root:
        if child.tag not in ['observed', 'forecast']:
            continue
        rows = []
        for datum in child.findall("datum"):
            secondary = datum.find('secondary')
            rows.append(dict(name=child.tag,
                             valid=parseUTC(datum.find('valid').text),
                             primary=datum.find('primary').text,
                             secondary=(secondary.text
                                        if secondary is not None else None)))
        mydict = hml.data[child.tag]
        df = pd.DataFrame(rows)
        df['primary'] = pd.to_numeric(df['primary'], errors='coerce')
        df['secondary'] = pd.to_numeric(df['secondary'], errors='coerce')
        mydict['dataframe'] = df
        mydict['issued'] = parseUTC(child.attrib.get('issued'))
        for attr in ['primaryName', 'secondaryName', 'primaryUnits',
                     'secondaryUnits']:
            mydict[attr] = child.attrib.get(attr)
    return hml
def validate_plain_data(
        raw_data,
        split_data,
        float_limit=float(TEST_CONFIG.get('TEST', 'float_limit'))
):
    """validate data that did not split

    Args:
        raw_data (:obj:`pandas.DataFrame`): raw data (A group)
        split_data (:obj:`pandas.DataFrame`): split data (B group)
        float_limit (float): maximum deviation for equality test

    Returns:
        (None): asserts internally

    """
    for column in split_data.columns.values:
        print(split_data[column])
        print(raw_data[column])
        if column == 'date':
            assert split_data[column].equals(raw_data[column])
        elif column == 'index':
            continue
        else:
            diff = abs(pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]))
            assert diff.max() < float_limit
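# A hedged usage sketch for validate_plain_data: the frames and column names
# below are made up, and float_limit is passed explicitly so the TEST_CONFIG
# default is not needed.
import pandas as pd

raw = pd.DataFrame({
    'date': pd.to_datetime(['2020-01-02', '2020-01-03']),
    'open': ['100.0', '101.5'],
    'close': ['100.5', '102.0'],
})
split = raw.copy()
split['close'] = ['100.5000001', '102.0000001']  # drift well inside the tolerance

validate_plain_data(raw, split, float_limit=1e-4)  # passes; a larger gap trips the assert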
def _readData(filepath): data = pandas.read_csv(filepath) data["Age"] = data["Age"].fillna(data["Age"].median()) data.loc[data["Sex"] == 'male', "Sex"] = 0 data.loc[data["Sex"] == 'female', "Sex"] = 1 data["Embarked"] = data["Embarked"].fillna('S') data.loc[data["Embarked"] == 'S', "Embarked"] = 0 data.loc[data["Embarked"] == 'C', "Embarked"] = 1 data.loc[data["Embarked"] == 'Q', "Embarked"] = 2 data["Fare"] = data["Fare"].fillna(data["Fare"].median()) # new features data["FamilySize"] = data["SibSp"] + data["Parch"] data["NameLength"] = data["Name"].apply(lambda x: len(x)) # _family id # people with same (last name + family size) = family member family_id_mapping = {} def _getFamilyId(row): lastName = row["Name"].split(",")[0] family_id = lastName + str(row["FamilySize"]) if family_id not in family_id_mapping: if len(family_id_mapping) == 0: current_id = 1; else: current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1) family_id_mapping[family_id] = current_id return family_id_mapping[family_id] family_ids = data.apply(_getFamilyId, axis=1) family_ids[data["FamilySize"] < 3] = -1 data["FamilyId"] = family_ids # family id_ #_get family title def _getTitle(name): title = re.search(" ([A-Za-z]+)\.",name) if title: return title.group(1) return "" titles = data["Name"].apply(_getTitle) title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2} ''' j=0 for i in titles: title_mapping[i]=j j=j+1 ''' for k,v in title_mapping.items(): titles[titles == k] = v data["Title"] = titles data["Title"]=data["Title"].apply(lambda x: pandas.to_numeric(x,errors='coerce')) data.loc[data["Title"].apply(math.isnan),"Title"] = -1 # get family title_ data = data.apply(lambda x: pandas.to_numeric(x, errors='ignore')) return data
def get_project_timeline(): d = load_table('SafeWaterProjectMonthlySummary') # Remove invalid dates d = d[d['MonthAndYear'] != '2004-10-28'] d = d[d['MonthAndYear'] != '2018-07-28'] # Adjust for apparent date typo d['MonthAndYear'] = d['MonthAndYear'].where(d['MonthAndYear'] != '2105-06-28', '2015-06-28') d['MonthAndYear'] = d['MonthAndYear'].apply(lambda x: pd.to_datetime(x[:7])) # Rename date column d = d.rename(columns={'MonthAndYear': 'Date'}) # Remove "Additional*" fields d = d[[c for c in d if not c.startswith('Additional')]] # Assessment ID + date conflicts do occur, so choose arbitrarily from between them d = d.groupby(['AssessmentID', 'Date'], group_keys=False).apply(lambda x: x.head(1)) d['AssessmentID'] = pd.to_numeric(d['AssessmentID'], errors='coerce') d = d[d['AssessmentID'].notnull()] d = d[d['Date'].notnull()] d = d.set_index(['Date', 'AssessmentID']) for c in d: d[c] = pd.to_numeric(d[c], errors='coerce') return d
def preprocess_acts(data, train_set=True): # Getting rid of data feature for now data = data.drop(['activity_id'], axis=1) data.date = pd.to_datetime(data.date) if(train_set): data = data.drop(['outcome'], axis=1) ## Split off _ from people_id data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) columns = list(data.columns) columns.remove("date") # Convert strings to ints for col in columns[1:]: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) # Add features from date data["year"] = data.date.apply(lambda x: x.year) data["month"] = data.date.apply(lambda x: x.month) data["day"] = data.date.apply(lambda x: x.day) data = data.drop(["date"], axis = 1) return data
def convert_units_to_floats(path_base, out_path): files = util_path.get_files_by_name_ext(path_base, '.', 'csv') for f in files: #logging.debug("File {}".format(f)) this_dir, basename = os.path.split(f) basename_noext, this_ext = os.path.splitext(basename) logging.info("Processing {}, a {} from {} ".format(basename_noext, this_ext, path_base)) # Load as DF df = pd.read_csv(f, sep=";",encoding='utf-8') logging.debug("Loaded as DF: {}".format(df.shape)) # Converting df["Volume"] = pd.to_numeric(df["Volume"].str.extract(r"(\d+\.\d*)", expand=False)) df["Area"] = pd.to_numeric(df["Area"].str.extract(r"(\d+\.\d*)", expand=False)) if 'Elevation' in df: df["Elevation"] = pd.to_numeric(df["Elevation"].str.extract(r"(\d+\.\d*)", expand=False)) logging.debug("Converted Volume Area Elevation".format()) #print(os.access(out_path, os.R_OK),os.access('foo.txt', os.W_OK)) this_out_path = os.path.join(out_path, basename_noext+'.csv') logging.debug("Writing DF to {}".format(this_out_path)) df.to_csv(this_out_path, sep = ';')
def create_volume_time_graph(tank_type_var, file_path, file_path2 = None, date_from = None, date_to = None): """Creates a tank volumn in % vs time graph""" csv1, files_exist = open_files(file_path) if file_path2 != None: csv2, files_exist = open_files(file_path2) frames = [csv1, csv2] csv = pd.concat(frames) try: time = "Train end time [local_unit_time]" #get the time column csv = csv.sort(columns=time) except: time = "Report date" csv = csv.sort(columns=time) print files_exist if files_exist != True: return print "data analyzed" total_wheels = csv["Total wheels"] try: tank_level = csv["Product %"] except: tank_level = csv["Raw Level"] tank_level_mask = np.isfinite(tank_level) #find times when the control box is changed #counter_change = filtered_time.groupby(filtered_csv["Wheels TA"]).apply(lambda x: np.array(x)) box_change, settings = get_setting_changes(csv, get_time(csv)) #print box_change, settings, len(box_change), len(settings) #print total_wheels fig, ax1 = plt.subplots() ax2 = ax1.twinx() ax1.plot(get_time(csv)[tank_level_mask], tank_level[tank_level_mask], label = "tank level") if file_path2 != None: voltage1 = pd.to_numeric(csv1['Volts']) voltage2 = pd.to_numeric(csv2['Volts']) ax2.plot(get_time(csv1), voltage1, color='Red', label = "voltage1", alpha=0.5) ax2.plot(get_time(csv2), voltage2, color='Darkred', label = "voltage2", alpha=0.5) else: voltage = pd.to_numeric(csv['Volts']) ax2.plot(get_time(csv), voltage, color='Red', label = "voltage", alpha=0.5) [ax1.axvline(x=i, ls='--', lw=2.0) for i in box_change] [ax1.annotate('{0} x {1}'.format(s[0],s[1]), xy=(box_change[i], 15)) for i, s in enumerate(settings)] ax1.legend(loc=0) ax2.legend(loc=0) plt.title("Tank level and voltage over time") ax1.set_ylabel("Tank Level", color='Blue') ax2.set_ylabel("Voltage", color='Red') ax2.set_ylim(0,15) plt.xlabel("Time") plt.show()
def time_from_str(self, errors): to_numeric(self.str, errors=errors)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('data/train.csv', index_col=0)
print(df.shape)
df.head()
df.shape

#df = df.fillna('0')
trainx = df.drop(['Churn', 'customerID'], axis=1)
trainy = df['Churn']
trainx['TotalCharges'] = pd.to_numeric(trainx['TotalCharges'], errors='coerce')
trainx = trainx.fillna('0')
trainx.shape

from scipy import stats
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(trainx, trainy)

from sklearn import metrics

# Combine all relevant outputs into one function
def print_performance(y_true, y_pred):
    display(pd.DataFrame(metrics.confusion_matrix(y_true, y_pred)))
    print(metrics.classification_report(y_true, y_pred))
# Read the JSON file directly from a URL without downloading it locally
#url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
#json_url = urllib.request.urlopen(url)
#datos_json = json.loads(json_url.read())

# Filter the DataFrame for the requested country
datospais_df = datos[datos['countryterritoryCode'] == 'ESP']

# Convert the date from string to datetime so a date interval can be filtered
datospais_df['dateRep'] = pd.to_datetime(datospais_df['dateRep'], format='%d/%m/%Y')
#datospais_df['dateRep'] = pd.to_datetime(datospais_df['dateRep'], dayfirst = True)
print(datospais_df['dateRep'])

# Convert the case counts from string to numeric so the moving averages can be computed
datospais_df['cases'] = pd.to_numeric(datospais_df['cases'])

# Sort the data by date, ascending
DatosOrdenadosPorFecha_df = datospais_df.sort_values(by='dateRep', ascending=True)

# Compute the 14-day moving average
DatosOrdenadosPorFecha_df['moving14'] = DatosOrdenadosPorFecha_df[
    'cases'].transform(lambda x: x.rolling(14, 1).mean())

# Compute the 7-day moving average
DatosOrdenadosPorFecha_df['moving7'] = DatosOrdenadosPorFecha_df[
    'cases'].transform(lambda x: x.rolling(7, 1).mean())

# Set the date interval to use
start_date = "2020-03-01"
end_date = "2020-08-13"
def analysis(d): tar_li = ['AX', 'AY', 'AZ'] ind = [ 'back_cut', 'back_drive', 'back_short', 'back_smash', 'fo_cut', 'fo_drive', 'fo_short', 'fo_smash' ] print(d) tmp_str = d.split('|') mode = tmp_str[0] if mode in po_4: power = 4 elif mode in po_2: power = 2 elif mode in po_1: power = 1 data = tmp_str[1].split('\n') #data.pop(0) index = data.pop(0) tmp_real_data = [] for dat_num in range(len(data)): if data[dat_num] == '': continue tmp_real_data.append(data[dat_num].split(',')) df = pd.DataFrame(tmp_real_data) index_li = index.split(',') df.columns = index_li #now change str to float for y in index_li: df[y] = pd.to_numeric(df[y], downcast='float') tmp_li = [] for i in range(len(df)): tmp = [] #tmp.append((df['AX'][i]/1000)**power) #tmp.append((df['AY'][i]/1000)**power) #tmp.append((df['AZ'][i]/1000)**power) tmp.append((df['AX'][i])) tmp.append((df['AY'][i])) tmp.append((df['AZ'][i])) tmp_li.append(tmp) total_data = [] total_data.append(tmp_li) total_data = np.array(total_data) if mode != 'lstm': nsamples, nx, ny = total_data.shape total_data = total_data.reshape((nsamples, nx * ny)) if mode == 'lstm': with graph.as_default(): set_session(session) res = lstm.predict(total_data) res = np.argmax(res, axis=-1) elif mode == 'svm': res = svm.predict(total_data) print(res) elif mode == 'rf': res = rf.predict(total_data) print(res) elif mode == 'nb': res = nb.predict(total_data) print(res) elif mode == 'xgboost': res = xgboost.predict(total_data) print(res) elif mode == 'knn': res = knn.predict(total_data) print(res) print(ind[res[0]]) return ind[res[0]]
for time_to_death_label in df['Time_To_Death']: ohe_label = [0, 0, 0, 0] # Find out which range the time belongs to by finding index of first truth time_class = [time_to_death_label < cutoff for cutoff in cutoffs].index(True) Enc_labels.append(time_class) ohe_label[time_class] = 1.0 ohe_labels.append(ohe_label) # Add to dataframe df['OHE_Time_To_Death'] = ohe_labels df['Enc_Time_To_Death'] = Enc_labels # Exclude all entries with "Missing" Died stats df = df[~df['Died'].isin(['Missing'])] df['Died'] = pd.to_numeric(df['Died']) df.to_csv('/data/COVID/Labels/KCH_CXR_JPG_latest_dt.csv', index=False) # Mobile vs Non-mobile mobiles = df[df.Examination_Title == 'Chest - Xray (Mobile)'] non_mobiles = df[df.Examination_Title == 'Chest - X ray'] plt.figure(1) plt.title('Mobiles') plt.hist(mobiles.Enc_Time_To_Death) plt.figure(2) plt.title('Non-Mobiles') plt.hist(non_mobiles.Enc_Time_To_Death) plt.show()
return ga g_smth = testGauss(t, V, fs) plt.figure() plt.plot(t[:-1], g_smth) plt.xlabel('time(s)') plt.ylabel('Velocities(Cm/s)') def testButterworth(nyf, t, V, fs): b, a = butter(8, 6 / nyf, btype='low', analog=False) fl = filtfilt(b, a, V) #print (ssqe(fl, X, fs)) return fl test_butter = testButterworth(nyf, t, V, fs) plt.figure() plt.plot(t[:-1], test_butter) plt.title('butterworth lowpass 6Hz') plt.xlabel('time(s)') plt.ylabel('Velocities(cm/s)') plt.figure() plt.plot(pd.to_numeric(t[:-1]), pd.to_numeric(V)) plt.title('Trajectories') plt.xlabel('time(s)') plt.ylabel('Velocities(Cm/s)') plt.show()
def convert_string_col_to_int(df, col):
    converted_df = pd.to_numeric(df[col]).astype('Int64')
    return converted_df
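# A small illustration (made-up column) of why the chained .astype('Int64')
# matters: pd.to_numeric alone falls back to float64 when a missing value is
# present, while the nullable Int64 dtype keeps integers alongside <NA>.
import pandas as pd

df = pd.DataFrame({'count': ['1', '2', None]})
print(convert_string_col_to_int(df, 'count'))        # 1, 2, <NA>
print(convert_string_col_to_int(df, 'count').dtype)  # Int64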
def time_downcast(self, dtype, downcast): to_numeric(self.data, downcast=downcast)
### End : Collect All Data at one place --------------------------------------- ### Start : Clean up the Data ------------------------------------------------- #### Check NaN values present nan_val = df_allmonthsdata[df_allmonthsdata.isna().any(axis=1)] print("Check NaN values present : ",nan_val.head()) #### Drop rows of NAN df_allmonthsdata = df_allmonthsdata.dropna(how="all") invalid_data = df_allmonthsdata[df_allmonthsdata["Order Date"].str[0:2] == 'Or'] print("Invalid values present : ", invalid_data.head()) # Now Select Correct Data which does not have "Or" value in Date df_allmonthsdata = df_allmonthsdata[df_allmonthsdata["Order Date"].str[0:2] != 'Or'] #### Convert Columns to appropriate data type df_allmonthsdata["Quantity Ordered"] = pd.to_numeric(df_allmonthsdata["Quantity Ordered"]) # Make int df_allmonthsdata["Price Each"] = pd.to_numeric(df_allmonthsdata["Price Each"]) # Make int ### End : Clean up the Data ------------------------------------------------- ### Start : find Order id which is present more than once, i.e. more than one product under same order---------------------------- needed_data = df_allmonthsdata[df_allmonthsdata["Order ID"].duplicated(keep=False)] print("Retrieve & Collect only Duplicate : ",needed_data.head()) needed_data["list_of_products"] = needed_data.groupby("Order ID")["Product"].transform(lambda x: ','.join(x)) # Meaning of above line is, make a group of Products separated by comma and store in new Column, on the basis of similar Order IDs print("Check list_of_products Column & All Data : ",needed_data.head()) # Above line shows Same order id more than once which is correct but we want all duplicates lines should be combined together needed_data = needed_data[['Order ID','list_of_products']].drop_duplicates() # Above will create DataFrame of two columns 'Order ID','list_of_products' with combined manner. print("Show only one record, removed duplicates : ",needed_data.head()) sort_data = needed_data.sort_values(by=["list_of_products"])
def time_from_float(self, errors): to_numeric(self.float, errors=errors)
def num_coerce(value):
    if ' ' in value:
        value = value.split(' ')[0]
    elif value == '----':
        value = 0
    return pd.to_numeric(value)
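# A few hypothetical inputs showing what the cleaning rules above do: keep only
# the part before a space, map the '----' placeholder to 0, and pass clean
# numbers straight through to to_numeric.
print(num_coerce('12.5 kg'))  # 12.5
print(num_coerce('----'))     # 0
print(num_coerce('42'))       # 42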
def train(self, data, categorical_columns=None, ordinal_columns=None, update_epsilon=None, verbose=False, mlflow=False): if update_epsilon: self.epsilon = update_epsilon if isinstance(data, pd.DataFrame): for col in data.columns: data[col] = pd.to_numeric(data[col], errors='ignore') self.pd_cols = data.columns self.pd_index = data.pd_index data = data.to_numpy() elif not isinstance(data, np.ndarray): raise ValueError("Data must be a numpy array or pandas dataframe") dataset = TensorDataset( torch.from_numpy(data.astype('float32')).to(self.device)) dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=True) if not hasattr(self, "generator"): self.generator = Generator(self.latent_dim, data.shape[1], binary=self.binary).to(self.device) if not hasattr(self, "discriminator"): self.discriminator = Discriminator(data.shape[1]).to(self.device) self.optimizer_d = optim.Adam(self.discriminator.parameters(), lr=4e-4, betas=(0.5, 0.9)) if hasattr(self, "state_dict"): self.optimizer_d.load_state_dict(self.state_dict) if not hasattr(self, "privacy_engine"): privacy_engine = PrivacyEngine( self.discriminator, batch_size=self.batch_size, sample_size=len(data), alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), noise_multiplier=3.5, max_grad_norm=1.0, clip_per_layer=True).to(self.device) else: privacy_engine = self.privacy_engine privacy_engine.attach(self.optimizer_d) if hasattr(self, "privacy_engine"): epsilon, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent( self.delta) else: epsilon = 0 if not hasattr(self, "optimizer_g"): self.optimizer_g = optim.Adam(self.generator.parameters(), lr=1e-4) criterion = nn.BCELoss() for epoch in range(self.epochs): if self.epsilon < epsilon: break for i, data in enumerate(dataloader): self.discriminator.zero_grad() real_data = data[0].to(self.device) # train with fake data noise = torch.randn(self.batch_size, self.latent_dim, 1, 1, device=self.device) noise = noise.view(-1, self.latent_dim) fake_data = self.generator(noise) label_fake = torch.full((self.batch_size, 1), 0, dtype=torch.float, device=self.device) output = self.discriminator(fake_data.detach()) loss_d_fake = criterion(output, label_fake) loss_d_fake.backward() self.optimizer_d.step() # train with real data label_true = torch.full((self.batch_size, 1), 1, dtype=torch.float, device=self.device) output = self.discriminator(real_data.float()) loss_d_real = criterion(output, label_true) loss_d_real.backward() self.optimizer_d.step() loss_d = loss_d_real + loss_d_fake max_grad_norm = [] for p in self.discriminator.parameters(): param_norm = p.grad.data.norm(2).item() max_grad_norm.append(param_norm) privacy_engine.max_grad_norm = max_grad_norm # train generator self.generator.zero_grad() label_g = torch.full((self.batch_size, 1), 1, dtype=torch.float, device=self.device) output_g = self.discriminator(fake_data) loss_g = criterion(output_g, label_g) loss_g.backward() self.optimizer_g.step() # manually clear gradients for p in self.discriminator.parameters(): if hasattr(p, "grad_sample"): del p.grad_sample # autograd_grad_sample.clear_backprops(discriminator) if self.delta is None: self.delta = 1 / data.shape[0] eps, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent( self.delta) self.alpha = best_alpha if (verbose): print('eps: {:f} \t alpha: {:f} \t G: {:f} \t D: {:f}'.format( eps, best_alpha, loss_g.detach().cpu(), loss_d.detach().cpu())) if (mlflow): import mlflow mlflow.log_metric("loss_g", float(loss_g.detach().cpu()), step=epoch) 
mlflow.log_metric("loss_d", float(loss_d.detach().cpu()), step=epoch) mlflow.log_metric("epsilon", float(eps), step=epoch) if self.epsilon < eps: break privacy_engine.detach() self.state_dict = self.optimizer_d.state_dict() self.privacy_engine = privacy_engine
# TODO: Load up the table, and extract the dataset
# out of it. If you're having issues with this, look
# carefully at the sample code provided in the reading
#
# .. your code here ..
#df = pd.read_html('http://espn.go.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2')
htmlstr = 'http://espn.go.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2'
df = pd.read_html(htmlstr)[0]
columns = df.iloc[1, :]
df.columns = columns
col_num = len(columns)
df_1 = df.dropna(thresh=col_num - 4)  # drop rows that have at least 4 NaNs
# keep only the data rows: drop the repeated 'RK' header rows
df_1 = df_1[df_1.RK != 'RK']
#df_1 = df_1.iloc[:,1:]
df_1.iloc[:, 1] = pd.to_numeric(df_1.iloc[:, 1], errors='coerce')
print(df_1.describe())
#print(df_1.loc[15:16,'GP'])
#df_not = df[(df.PLAYER != 'PP') & (df.TEAM != 'SH')]

# TODO: Rename the columns so that they match the
# column definitions provided to you on the website
#
# .. your code here ..

# TODO: Get rid of any row that has at least 4 NANs in it
#
# .. your code here ..

# TODO: At this point, look through your dataset by printing
column_names = ['id_record', 'id_car', 'status', 'lat', 'lon', 'time'] use_columns = ['id_car', 'status', 'lat', 'lon', 'time'] # data_types = [str,int,str,float,float,str] arr = np.empty(shape=(len(os.listdir('/home/martin/MOBILITY/data/traces')), ), dtype=object) for idx, filename in tqdm( enumerate(os.listdir('/home/martin/MOBILITY/data/traces'))): abs_filename = os.path.join('/home/martin/MOBILITY/data/traces', filename) filename_parts = abs_filename.split(sep='.') file_extension = filename_parts[-2] df = pd.read_csv(abs_filename, header=None, names=column_names, usecols=['id_car', 'status', 'lat', 'lon', 'time'], converters={'status': f}) df['id_car'] = pd.to_numeric(file_extension + df['id_car'].astype(str)) # print(df.head()) # print(df.dtypes) # q = sort_one_by_one(df,'id_car','time') # print(q.head()) filtered = df[df.loc[:, 'status']].loc[:, ['id_car', 'lat', 'lon', 'time']] filtered.sort_values(by='time', ascending=True, inplace=True) filtered.sort_values(by='id_car', kind='mergesort', ascending=True, inplace=True) # print(filtered) # ls.append(filtered) arr[idx] = filtered df = pd.concat(arr, ignore_index=True)
df.describe() # We have a number of demographics for each individual as well as the products they currently own. To make a test set, I will separate the last month from this training data, and create a feature that indicates whether or not a product was newly purchased. First convert the dates. There's `fecha_dato`, the row-identifier date, and `fecha_alta`, the date that the customer joined. # In[ ]: df["fecha_dato"] = pd.to_datetime(df["fecha_dato"], format="%Y-%m-%d") df["fecha_alta"] = pd.to_datetime(df["fecha_alta"], format="%Y-%m-%d") df["fecha_dato"].unique() # I printed the values just to double check the dates were in standard Year-Month-Day format. I expect that customers will be more likely to buy products at certain months of the year (Christmas bonuses?), so let's add a month column. I don't think the month that they joined matters, so just do it for one. # In[ ]: df["month"] = pd.DatetimeIndex(df["fecha_dato"]).month df["age"] = pd.to_numeric(df["age"], errors="coerce") # Are there any columns missing values? # In[ ]: df.isnull().any() # Definitely. Onto data cleaning. # # ## Data Cleaning # # Going down the list, start with `age` # In[ ]:
def base_load_sessions( item_df, csv_file, secondary_csv_file, output_file, shared_output_file, label_encoders, hot_encoders, nrows, ): from train_recommender import RAW_DATA_PATH, DATA_PATH pickle_path = os.path.join(DATA_PATH, output_file) shared_pickle_path = os.path.join(DATA_PATH, shared_output_file) is_test = secondary_csv_file is None if os.path.exists(pickle_path): result = pickle.load(open(pickle_path, "rb")) result.update(pickle.load(open(shared_pickle_path, "rb"))) return result else: data_path = os.path.join(RAW_DATA_PATH, csv_file) print("load csv") raw_df = pd.read_csv(data_path, sep=',', nrows=nrows) if secondary_csv_file is not None: secondary_df = pd.read_csv(os.path.join(RAW_DATA_PATH, secondary_csv_file), sep=',', nrows=nrows) raw_df = pd.concat([raw_df, secondary_df], ignore_index=True) raw_df = split_row(raw_df, column='city', sep=',') # extract search_for poi impression into own column prepare_action_types = [ 'search for poi', 'change of sort order', 'filter selection', 'search for destination', ] raw_df = prepare_reference(raw_df, prepare_action_types) # encode labels hot_encoders, label_encoders = hot_encode_labels( raw_df, columns=[ 'session_id', 'action_type', 'city_0', 'city_1', 'platform', 'device', ], label_encoders=label_encoders, hot_encoders=hot_encoders) print("Remove invalid references...") raw_df['reference'] = pd.to_numeric( raw_df['reference'], errors='coerce').fillna(-1).astype(int) clickout_type = label_encoders['action_type'].transform( ['clickout item'])[0] print("filter references which do not exist") referencing_action_type = label_encoders['action_type'].transform( get_referencing_action_types(is_test)) item_properties = item_df.loc[raw_df['reference']] item_properties.reset_index(inplace=True, drop=True) raw_df = raw_df[~( (item_properties[0].isnull()) & (raw_df['action_type'].isin(referencing_action_type)))] raw_df.reset_index(inplace=True) print("filter session_ids where the last entry is not a clickout") next_session_id = raw_df["session_id"].shift(-1) to_delete = raw_df[(raw_df['session_id'] != next_session_id) & ( raw_df['action_type'] != clickout_type)]['session_id'] raw_df = raw_df[~raw_df['session_id'].isin(to_delete)] raw_df.reset_index(inplace=True, drop=True) print("prepare reference item_ids") item_properties = item_df.loc[raw_df['reference']] item_properties.reset_index(inplace=True, drop=True) item_properties.fillna(0.0, inplace=True) raw_df.drop([ 'index', ], axis=1, inplace=True) print("groupby") grouped = raw_df.groupby(by='session_id') print("shuffle & extract session ids") session_sizes = grouped[['step']].count() session_sizes = session_sizes[session_sizes['step'] > 1] noise = np.random.normal(0, 2, [len(session_sizes), 1]).astype( int) # locally shuffle by sorting by "noised length" noised_session_sizes = session_sizes + noise noised_session_sizes.sort_values(by='step', inplace=True) train_session_ids = np.array(list(noised_session_sizes.index)) print("write to disk") result = { "session": raw_df, "relevant_session_ids": train_session_ids, "item_properties": item_properties, "grouped": grouped, } shared_result = { "hot_encoders": hot_encoders, "label_encoders": label_encoders, } pickle.dump(result, open(pickle_path, "wb"), protocol=4) pickle.dump(shared_result, open(shared_pickle_path, "wb"), protocol=4) result.update(shared_result) return result
elif temp[i-j,0] == 'methylated' and temp[i-j,6] == False: paired.append(temp[i]) paired.append(temp[i-j]) temp[i,6] = True temp[i-j,6] = True ###### # # Convert paired sites into Dataframe paired_all = pd.DataFrame(paired, columns = ['CGstatus','methylation','motif','start','strand','type','paired']) paired_all.drop(['paired'],axis = 1, inplace = True) # make sign variable for downstream analysis paired_all['sign'] = paired_all['strand']+'1' paired_all['sign'] = pd.to_numeric(paired_all['sign']) # save output file paired_all.to_csv(output_path+'/ara_paired_10reads_%s_%s_k%s_%s.txt' %(mask_str, context, k, chromosome), sep = '\t', header = True)
def calculator_road(city_code):
    roads_city = pd.read_csv("全國路名/" + city_code + "_road.csv")
    df = pd.read_csv("concate_csvs/" + city_code + ".csv")
    df = df.dropna(subset=[
        '鄉鎮市區', "土地區段位置建物區段門牌", '交易年月日', '建物移轉總面積平方公尺', '交易筆棟數', '單價元平方公尺'
    ])
    roads = []
    for i in range(len(roads_city)):
        site_id = roads_city["site_id"][i]
        road = roads_city["road"][i]
        roads.append(site_id + road)

    # keep only records from the last 5 years
    from datetime import datetime
    y = str(int('{:%Y}'.format(datetime.today())) - 1911)
    month_day = '{:%m%d}'.format(datetime.today())
    today_date_string = y + month_day
    five_year_ago_date = str(int(today_date_string) - 50000)
    filter_5年內資料 = pd.to_numeric(
        df["交易年月日"]) > (int(today_date_string) - 50000)
    df_for_analysis = df[filter_5年內資料]

    # holds each road's standard deviation of price per square meter
    sd = pd.DataFrame(columns=["road", "每平方公尺標準差", "每平方公尺年成長率"])
    count = 1
    for location in roads:
        fliter_location = df_for_analysis["土地區段位置建物區段門牌"].str.contains(
            location)
        print(location)
        print(count / len(roads))
        count += 1
        # only operate on the DataFrame when a matching road segment is found
        if any(fliter_location):
            df_location = df_for_analysis[fliter_location]
            df_location["單價元平方公尺"] = pd.to_numeric(df_location["單價元平方公尺"])
            df_location["建物移轉總面積平方公尺"] = pd.to_numeric(
                df_location["建物移轉總面積平方公尺"])
            df_location["土地數"] = df_location["交易筆棟數"].str.get(2)
            df_location["土地數"] = pd.to_numeric(df_location["土地數"])
            df_location["平均土地面積of一筆交易"] = df_location[
                "建物移轉總面積平方公尺"] // df_location["土地數"]

            # compute the standard deviation
            sd_of_location = df_location["單價元平方公尺"].std()

            # compute the annual growth rate
            mean_value_of_each_year = []
            for i in range(5):
                fliter_certain_year = (
                    pd.to_numeric(df_location['交易年月日']) >=
                    int(five_year_ago_date) + i * 10000) & (
                        pd.to_numeric(df_location['交易年月日']) <
                        int(five_year_ago_date) + 10000 + i * 10000)
                mean_value_of_each_year.append(
                    df_location[fliter_certain_year].mean()[1])
            gross_rate_of_each_year = []
            for i in range(1, 5):
                gross_rate_of_each_year.append(
                    (mean_value_of_each_year[i] - mean_value_of_each_year[i - 1])
                    / mean_value_of_each_year[i])
            annual_gross_rate = sum(gross_rate_of_each_year) / len(
                gross_rate_of_each_year)

            mean_value_of_location = df_location["單價元平方公尺"].mean()
            mean_area_of_location = df_location["平均土地面積of一筆交易"].mean()
            tempt = pd.DataFrame(
                {
                    "road": location,
                    "每平方公尺標準差": sd_of_location,
                    "每平方公尺年成長率": annual_gross_rate,
                    "meanValue": mean_value_of_location,
                    "mean_area": mean_area_of_location
                },
                index=[1])
            sd = sd.append(tempt, ignore_index=True)
            '''
def get_result():
    result = pd.DataFrame()
    for data in get_datas():
        result = result.append(data)
    return result

result = get_result()
print('Columns with missing values:', result.isnull().any().sum())

result.columns = ['日期', '最高温度', '最低温度', '天气状况', '风向']
result.head(10)

result['日期'] = pd.to_datetime(result['日期'])
result['最高温度'] = pd.to_numeric(result['最高温度'])
result['最低温度'] = pd.to_numeric(result['最低温度'])
result['平均温度'] = (result['最高温度'] + result['最低温度']) / 2
#result.info()

sns.distplot(result['平均温度'])

# In[102]:

sns.countplot(result['天气状况'])

# In[110]:

result['是否降水'] = result['天气状况'].apply(
    lambda x: '未降水' if x in ['晴', '多云', '阴', '雾', '浮尘', '霾', '扬沙'] else '降水')
def usbonds_command(): """US bonds overview [Wall St. Journal]""" # Debug user input if imps.DEBUG: logger.debug("econ-usbonds") # Retrieve data df = wsj_model.us_bonds() # Check for argument if df.empty: raise Exception("No available data found") df["Rate (%)"] = pd.to_numeric(df["Rate (%)"].astype(float)) df["Yld (%)"] = pd.to_numeric(df["Yld (%)"].astype(float)) df["Yld Chg (%)"] = pd.to_numeric(df["Yld Chg (%)"].astype(float)) formats = { "Rate (%)": "{:.2f}%", "Yld (%)": "{:.2f}%", "Yld Chg (%)": "<b>{:.2f}%</b>", } for col, value in formats.items(): df[col] = df[col].map(lambda x: value.format(x)) # pylint: disable=W0640 df = df.fillna("") df.set_index(" ", inplace=True) df = df.set_axis( [ "Rate", "Yld", "Yld Chg", ], axis="columns", ) font_color = ["white"] * 3 + [[ "#e4003a" if boolv else "#00ACFF" for boolv in df["Yld Chg"].str.contains("-") ]] fig = imps.plot_df( df, fig_size=(550, (40 + (40 * len(df.index)))), col_width=[4, 2, 2, 2.1], tbl_header=imps.PLT_TBL_HEADER, tbl_cells=imps.PLT_TBL_CELLS, font=imps.PLT_TBL_FONT, row_fill_color=imps.PLT_TBL_ROW_COLORS, paper_bgcolor="rgba(0, 0, 0, 0)", ) fig.update_traces(cells=(dict( align=["center", "right"], font=dict(color=font_color), ))) fig.update_traces(cells=(dict(align=["center", "right"]))) imagefile = imps.save_image("econ-usbonds.png", fig) return { "title": "Economy: [WSJ] US Bonds", "imagefile": imagefile, }
@author: T
"""

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
mam = pd.read_csv(url, header=None)
mam.columns = ["BI-RADS", "Age", "Shape", "Margin", "Density", "Severity"]
mam.dtypes

mam.loc[:, "BI-RADS"] = pd.to_numeric(mam.loc[:, "BI-RADS"], errors='coerce')
hasnan = np.isnan(mam.loc[:, "BI-RADS"])
print(hasnan)
# impute NaNs with the median; np.nanmedian is needed because np.median of a
# column containing NaN is itself NaN
mam.loc[hasnan, "BI-RADS"] = np.nanmedian(mam.loc[:, "BI-RADS"])
plt.hist(mam.loc[:, "BI-RADS"])

toohigh = mam.loc[:, "BI-RADS"] > 6
mam.loc[toohigh, "BI-RADS"] = 6

import pandas as pd
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
plt.title( "Distribution of the Classes based on Clump Thickness and Epithelial Cell Size" ) plt.show() # ## Data Preprocessing and Selection # In[6]: cancer_data.dtypes # It looks like the __BareNuc__ column includes some values that are not numerical. We can drop those rows: # In[4]: cancer_data = cancer_data[pd.to_numeric(cancer_data['BareNuc'], errors='coerce').notnull()] cancer_data['BareNuc'] = cancer_data['BareNuc'].astype('int') cancer_data.dtypes # In[5]: feature_df = cancer_data[[ 'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc', 'BlandChrom', 'NormNucl', 'Mit' ]] X = np.asarray(feature_df) X[0:5] # In[6]: cancer_data['Class'] = cancer_data['Class'].astype('int')
if not nexTour.empty: nt1 = pd.DataFrame(nexTour['stats'].tolist()) nt1['id'] = nexTour['id'] nt1['gameweek'] = i nt1.index = nt1['gameweek']*1000+nt1['id'] Gameweeks = Gameweeks.append(nt1) print(i) teams = dict(zip(pd.DataFrame(d1['teams'])['id'],pd.DataFrame(d1['teams'])['name'])) players = dict(zip(bigTable['id'],bigTable['full_name'])) teamplayers = dict(zip(bigTable['id'],bigTable['team'])) Gameweeks['team'] = [teamplayers[i] for i in Gameweeks['id']] Gameweeks['threat'] = pd.to_numeric(Gameweeks['threat']) Gameweeks['creativity'] = pd.to_numeric(Gameweeks['creativity']) Gameweeks['team_a'] = [int(Fixtures[(Fixtures['event'] == Gameweeks.iloc[i,20]) & ((Fixtures['team_a'] == Gameweeks.iloc[i,21])| (Fixtures['team_h'] == Gameweeks.iloc[i,21]))]['team_a']) for i in range(len(Gameweeks))] Gameweeks['team_h'] = [int(Fixtures[(Fixtures['event'] == Gameweeks.iloc[i,20]) & ((Fixtures['team_a'] == Gameweeks.iloc[i,21])| (Fixtures['team_h'] == Gameweeks.iloc[i,21]))]['team_h']) for i in range(len(Gameweeks))] Gameweeks['teamAgainst'] = [Gameweeks.at[i,'team_a'] if Gameweeks.at[i,'team'] == Gameweeks.at[i,'team_h'] else Gameweeks.at[i,'team_h'] for i in Gameweeks.index] Gameweeks['side'] = ['home' if Gameweeks.at[i,'team'] == Gameweeks.at[i,'team_h'] else 'away' for i in Gameweeks.index] del Gameweeks['team_a'] del Gameweeks['team_h'] Gameweeks.to_csv(Path('in/fplgameweeks.csv')) Gameweeks
def main(): sf_gdf = gpd.read_file("san-francisco.geojson") sf_gdf['pop2010'] = pd.to_numeric(sf_gdf['pop2010'], downcast='integer') sf_map = folium.Map([37.7556, -122.4399], zoom_start=13) folium.GeoJson('san-francisco.geojson', name='geojson').add_to(sf_map) folium.GeoJson(data='san-francisco.geojson', name='geojson', style_function=style).add_to(sf_map) # Add labels manual_label = {5, 8, 9, 12, 15, 26, 27} for index, row in sf_gdf.iterrows(): if index not in manual_label: folium.CircleMarker(get_centroid(row), radius=POINT_RADIUS, color='black', fill=True, fill_opacity=1).add_to(sf_map) add_label(sf_map, get_centroid(row), row['zip_code']) # 94104 row = sf_gdf.iloc[12] add_label(sf_map, (37.794, -122.363705), row['zip_code'], icon_anchor=(0, 13)) centroid = get_centroid(row) folium.CircleMarker(centroid, radius=POINT_RADIUS, color='black', fill=True, fill_opacity=1).add_to(sf_map) folium.PolyLine(locations=[centroid, (37.794, centroid[1])], color='black', weight=LINE_WEIGHT).add_to(sf_map) folium.PolyLine(locations=[(37.794, centroid[1]), (37.794, -122.363705)], color='black', weight=LINE_WEIGHT).add_to(sf_map) # 94108 row = sf_gdf.iloc[15] add_label(sf_map, (37.797, -122.363705), row['zip_code'], icon_anchor=(0, 25)) centroid = get_centroid(row) folium.CircleMarker(centroid, radius=POINT_RADIUS, color='black', fill=True, fill_opacity=1).add_to(sf_map) folium.PolyLine(locations=[centroid, (37.797, centroid[1])], color='black', weight=LINE_WEIGHT).add_to(sf_map) folium.PolyLine(locations=[(37.797, centroid[1]), (37.797, -122.363705)], color='black', weight=LINE_WEIGHT).add_to(sf_map) sf_map.save('index.html')
'PL': 14901, 'PT': 23408, 'RO': 12301, 'RU': 11162, 'SA': 22865, 'SE': 54608, 'TH': 7274, 'TR': 9370, 'TW': 24827, 'UA': 3592, 'US': 65111, 'VN': 2740, 'ZA': 11300, 'CO': 6500 }) df.gdpCountry = pd.to_numeric(df.gdpCountry, errors='coerce') df['gdpCountry'] = df['gdpCountry'].fillna(11335) df['gdpCountry'] = pd.cut(df.gdpCountry, bins=[0, 29960, 50000, 150000], labels=[0, 1, 2]) #End Yev Gdp device_map = { 'IPhone7': 0, 'IPhone7Plus': 0, 'IPhone8Plus': 0, 'IPhone6S': 0, 'IPhoneSE': 0, 'IPhone8': 0, 'IPhone6SPlus': 0,
def main(task): if 'tune_problem' in task: # Tune Knapsack problem problem_size = 50 weights = [idx for idx in range(1, problem_size + 1)] values = [idx for idx in range(1, problem_size + 1)] max_weight_pct_list = np.arange(0.1, 1, 0.05) knapsack_tuning_fitness = [] knapsack_tuning_time = [] knapsack_tuning_fevals = [] for max_weight_pct in max_weight_pct_list: fitness = mlrose.Knapsack(weights, values, max_weight_pct) problem = mlrose.DiscreteOpt(problem_size, fitness, maximize=True, max_val=2) experiment_name = 'knapsack_tuning_weight_pct_' + str( max_weight_pct) temperature_list = np.arange(1, 50, 1) knapsack = runners.SARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[5000], max_attempts=50, temperature_list=temperature_list) # the two data frames will contain the results knapsack_run_stats, knapsack_run_curves = knapsack.run() knapsack_tuning_fitness.append(knapsack_run_curves.loc[ knapsack_run_curves['Fitness'].idxmax()]['Fitness']) knapsack_tuning_time.append(knapsack_run_curves.loc[ knapsack_run_curves['Time'].idxmax()]['Time']) knapsack_tuning_fevals.append(2 * knapsack_run_curves.loc[ knapsack_run_curves['Iteration'].idxmax()]['Iteration']) plt.rc("font", size=8) plt.rc("axes", titlesize=12) plt.rc("axes", labelsize=10) plt.rc("xtick", labelsize=8) plt.rc("ytick", labelsize=8) plt.rc("legend", fontsize=11) plt.rc("figure", titlesize=11) fig, ax = plt.subplots(1, 3, figsize=(10, 3.5)) fig.suptitle('Knapsack Tuning w/ Simulated Annealing Optimizer', fontsize=14) ax[0].scatter(max_weight_pct_list, knapsack_tuning_fitness, c='r', marker='x', s=10) ax[0].set(xlabel='Max Weight %', ylabel='Max Fitness') ax[1].scatter(max_weight_pct_list, knapsack_tuning_time, c='g', marker='o', s=10) ax[1].set(xlabel='Max Weight %', ylabel='Max Runtime (s)') ax[2].scatter(max_weight_pct_list, knapsack_tuning_fevals, c='b', marker='+') ax[2].set(xlabel='Max Weight %', ylabel='Max Function Evaluations') ax[2].yaxis.tick_right() plt.show() return if 'tuning_plots' in task: # FOUR PEAKS GOOD FOR GENETIC # Tune Algorithms problem_size = 50 # Knapsack weights = [idx for idx in range(1, problem_size + 1)] print(weights) #weights = np.ones(100) values = [idx for idx in range(1, problem_size + 1)] #values = np.arange(1, 101) max_weight_pct = 0.3 knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct) #state = np.array([1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]) #problem = mlrose.DiscreteOpt(problem_size, four_peaks_fitness, maximize=True, max_val=2) #temperature_list = np.arange(0.1, 2, 0.1) best_fitness_list = [] #for size in problem_size_list: problem = mlrose.DiscreteOpt(problem_size, knapsack_fitness, maximize=True, max_val=2) problem_size = 50 rhc_fitness_tuning_list = [] rhc_param_tuning_list = [] rhc_feval_tuning_list = [] time_tuning_list = [] asdf_list = [] fdsa_list = [] experiment_name = 'rhc_knapsack_tuning_size_' + str(problem_size) #restart_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100] restart_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] rhc = runners.RHCRunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[5000], max_attempts=125, restart_list=restart_list) # the two data frames will contain the results rhc_run_stats, rhc_run_curves = rhc.run() for restart in restart_list: this_temp_df = rhc_run_curves.loc[rhc_run_curves['Restarts'] == restart] this_temp_df[ 'Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[ 
this_temp_df['Iteration'].idxmin()]['Iteration'] + 1 rhc_fitness_tuning_list.append( this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness']) rhc_param_tuning_list.append(restart) time_tuning_list.append( this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time']) rhc_feval_tuning_list.append(3 * this_temp_df.loc[ this_temp_df['Iteration'].idxmax()]['Iteration']) asdf_list.append(this_temp_df['Fitness']) fdsa_list.append(this_temp_df['Iteration']) # plt.rc("font", size=8) # plt.rc("axes", titlesize=12) # plt.rc("axes", labelsize=10) # plt.rc("xtick", labelsize=8) # plt.rc("ytick", labelsize=8) # plt.rc("legend", fontsize=8) # plt.rc("figure", titlesize=11) # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) # fig, ax = plt.subplots(1,3,figsize=(12,3.5)) # fig.suptitle('RHC Restarts Tuning, problem_size = ' + str(problem_size)) # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10) # ax[0].set(xlabel='Restarts', ylabel = 'Time') # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10) # ax[1].set(xlabel='Restarts', ylabel = 'Fitness') # ax[2].scatter(param_tuning_list, feval_tuning_feval, c='g', marker='o', s=10) # ax[2].set(xlabel='Restartsc', ylabel = 'Function Evaluations') # ax[2].yaxis.tick_right() # plt.show() # fig, ax = plt.subplots() # ax.scatter(fdsa_list[7], asdf_list[7]) # ax.set(xlabel='Iteration', ylabel = 'Fitness') # plt.show() # problem_size = 50 sa_fitness_tuning_list = [] sa_param_tuning_list = [] time_tuning_list = [] sa_feval_tuning_list = [] asdf_list = [] fdsa_list = [] experiment_name = 'sa_knapsack_tuning_size_' + str(problem_size) temperature_list = np.arange(1, 50, 0.5) sa = runners.SARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[1000], max_attempts=50, temperature_list=temperature_list) #decay_list=mlrose.GeomDecay(init_temp=1.1)) #temperature_list=[1, 10, 50, 100, 250, 500, 1000, 2500, 5000, 10000]) #temperature_list=[1, 10, 50, 100, 250, 500, 1000, 2500, 5000, 10000]) # the two data frames will contain the results df_run_stats, df_run_curves = sa.run() df_run_curves['Temperature'] = pd.to_numeric( df_run_curves['Temperature'].astype(str).astype(float)) for temp in temperature_list: this_temp_df = df_run_curves.loc[df_run_curves['Temperature'] == temp] this_temp_df[ 'Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[ this_temp_df['Iteration'].idxmin()]['Iteration'] + 1 sa_fitness_tuning_list.append( this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness']) sa_param_tuning_list.append(temp) sa_feval_tuning_list.append(2 * this_temp_df.loc[ this_temp_df['Iteration'].idxmax()]['Iteration']) time_tuning_list.append( this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time']) asdf_list.append(this_temp_df['Fitness']) fdsa_list.append(this_temp_df['Iteration']) # plt.rc("font", size=8) # plt.rc("axes", titlesize=12) # plt.rc("axes", labelsize=10) # plt.rc("xtick", labelsize=8) # plt.rc("ytick", labelsize=8) # plt.rc("legend", fontsize=8) # plt.rc("figure", titlesize=11) # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) # fig, ax = plt.subplots(1,3,figsize=(12,3.5)) # fig.suptitle('SA Temperature Tuning, problem_size = ' + str(problem_size)) # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10) # ax[0].set(xlabel='Temperature', ylabel = 'Time') # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10) # ax[1].set(xlabel='Temperature', ylabel = 'Fitness') # 
ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10) # ax[2].set(xlabel='Temperature', ylabel = 'Function Evaluations') # ax[2].yaxis.tick_right() # plt.show() # fig, ax = plt.subplots() # ax.scatter(fdsa_list[17], asdf_list[17]) # ax.set(xlabel='Iteration', ylabel = 'Fitness') # plt.show() ga_fitness_tuning_list = [] ga_param_tuning_list = [] time_tuning_list = [] ga_feval_tuning_list = [] asdf_list = [] fdsa_list = [] experiment_name = 'ga_knapsack_tuning_size_' + str(problem_size) population_sizes_list = 100, mutation_rates_list = np.arange(0.05, 1.0, 0.05) ga = runners.GARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[100], population_sizes=population_sizes_list, mutation_rates=mutation_rates_list, max_attempts=5) # the two data frames will contain the results df_run_stats, df_run_curves = ga.run() # for rate in mutation_rates_list: # this_temp_df = df_run_curves.loc[df_run_curves['Mutation Rate'] == rate] # this_temp_df['Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[this_temp_df['Iteration'].idxmin()]['Iteration'] + 1 # ga_fitness_tuning_list.append(this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness']) # ga_param_tuning_list.append(rate) # feval_tuning_list.append(population_sizes_list[0] * this_temp_df.loc[this_temp_df['Iteration'].idxmax()]['Iteration']) # time_tuning_list.append(this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time']) # asdf_list.append(this_temp_df['Fitness']) # fdsa_list.append(this_temp_df['Iteration']) # print(time_tuning_list) # plt.rc("font", size=8) # plt.rc("axes", titlesize=12) # plt.rc("axes", labelsize=10) # plt.rc("xtick", labelsize=8) # plt.rc("ytick", labelsize=8) # plt.rc("legend", fontsize=8) # plt.rc("figure", titlesize=11) # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) # fig, ax = plt.subplots(1,3,figsize=(12,3.5)) # fig.suptitle('GA Mutation Rate Tuning, problem_size = ' + str(problem_size)) # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10) # ax[0].set(xlabel='Mutation Rate', ylabel = 'Time (s)') # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10) # ax[1].set(xlabel='Mutation Rate', ylabel = 'Fitness') # ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10) # ax[2].set(xlabel='Mutation Rate', ylabel = 'Function Evaluations') # ax[2].yaxis.tick_right() # plt.show() # fig, ax = plt.subplots() # ax.scatter(fdsa_list[17], asdf_list[17]) # ax.set(xlabel='Iteration', ylabel = 'Fitness') # plt.show() # Tune population size ga_population_tuning_fitness = [] ga_population_tuning_time = [] ga_population_tuning_feval = [] population_sizes_list = np.arange(10, 500, 10) for population_size in population_sizes_list: experiment_name = 'ga_knapsack_tuning_population_size_' + str( problem_size) mutation_rates_list = [0.1] ga = runners.GARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[500], population_sizes=[int(population_size)], mutation_rates=mutation_rates_list, max_attempts=10) # the two data frames will contain the results ga_run_stats, ga_run_curves = ga.run() ga_population_tuning_fitness.append(ga_run_curves.loc[ ga_run_curves['Fitness'].idxmax()]['Fitness']) ga_population_tuning_time.append( ga_run_curves.loc[ga_run_curves['Time'].idxmax()]['Time']) ga_population_tuning_feval.append( population_size * ga_run_curves.loc[ ga_run_curves['Iteration'].idxmax()]['Iteration']) # plt.rc("font", 
size=8) # plt.rc("axes", titlesize=12) # plt.rc("axes", labelsize=10) # plt.rc("xtick", labelsize=8) # plt.rc("ytick", labelsize=8) # plt.rc("legend", fontsize=8) # plt.rc("figure", titlesize=11) # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) # fig, ax = plt.subplots(1,3,figsize=(12,3.5)) # fig.suptitle('GA Population Size Tuning, problem_size = ' + str(problem_size)) # ax[0].scatter(population_sizes_list, ga_population_tuning_time, c='r', marker='x', s=10) # ax[0].set(xlabel='Population Size', ylabel = 'Time') # ax[1].scatter(population_sizes_list, ga_population_tuning_fitness, c='g', marker='x', s=10) # ax[1].set(xlabel='Population Size', ylabel = 'Fitness') # ax[2].scatter(param_tuning_list, ga_population_tuning_feval, c='g', marker='o', s=10) # ax[2].set(xlabel='Population Size', ylabel = 'Function Evaluations') # ax[2].yaxis.tick_right() # plt.show() mimic_fitness_tuning_list = [] mimic_param_tuning_list = [] time_tuning_list = [] mimic_feval_tuning_list = [] asdf_list = [] fdsa_list = [] experiment_name = 'mimic_knapsack_tuning_size_' + str(problem_size) population_sizes_list = 100, # keep_percent_list=np.arange(0.05, 1.0, 0.05) # mimic = runners.MIMICRunner(problem=problem, # experiment_name=experiment_name, # output_directory='knapsack', # seed=27, # iteration_list=[100], # population_sizes=population_sizes_list, # keep_percent_list=keep_percent_list, # max_attempts=5) # # the two data frames will contain the results # df_run_stats, df_run_curves = mimic.run() # print(df_run_curves.dtypes) # print(df_run_curves) # #df_run_curves['Temperature'] = pd.to_numeric(df_run_curves['Temperature'].astype(str).astype(float)) # print(df_run_curves) # for percent in keep_percent_list: # this_temp_df = df_run_curves.loc[df_run_curves['Keep Percent'] == percent] # this_temp_df['Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[this_temp_df['Iteration'].idxmin()]['Iteration'] + 1 # mimic_fitness_tuning_list.append(this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness']) # mimic_param_tuning_list.append(percent) # feval_tuning_list.append(population_sizes_list[0] * this_temp_df.loc[this_temp_df['Iteration'].idxmax()]['Iteration']) # time_tuning_list.append(this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time']) # asdf_list.append(this_temp_df['Fitness']) # fdsa_list.append(this_temp_df['Iteration']) # plt.rc("font", size=8) # plt.rc("axes", titlesize=12) # plt.rc("axes", labelsize=10) # plt.rc("xtick", labelsize=8) # plt.rc("ytick", labelsize=8) # plt.rc("legend", fontsize=8) # plt.rc("figure", titlesize=11) # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) # fig, ax = plt.subplots(1,3,figsize=(12,3.5)) # fig.suptitle('MIMIC Keep Percent Tuning, problem_size = ' + str(problem_size)) # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10) # ax[0].set(xlabel='Keep Percent (decimal)', ylabel = 'Time (s)') # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10) # ax[1].set(xlabel='Keep Percent (decimal)', ylabel = 'Fitness') # ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10) # ax[2].set(xlabel='Keep Percent (decimal)', ylabel = 'Function Evaluations') # ax[2].yaxis.tick_right() # plt.show() # fig, ax = plt.subplots() # ax.scatter(fdsa_list[17], asdf_list[17]) # ax.set(xlabel='Iteration', ylabel = 'Fitness') # plt.show() # Tune population size mimic_population_tuning_fitness = [] mimic_population_tuning_time = [] mimic_population_tuning_feval = [] population_sizes_list = 
np.arange(10, 500, 10) for population_size in population_sizes_list: experiment_name = 'mimic_knapsack_tuning_population_size_' + str( problem_size) keep_percent_list = [0.45] mimic = runners.MIMICRunner( problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[100], population_sizes=[int(population_size)], keep_percent_list=keep_percent_list, max_attempts=5, use_fast_mimic=True) # the two data frames will contain the results mimic_run_stats, mimic_run_curves = mimic.run() mimic_population_tuning_fitness.append(mimic_run_curves.loc[ mimic_run_curves['Fitness'].idxmax()]['Fitness']) mimic_population_tuning_time.append(mimic_run_curves.loc[ mimic_run_curves['Time'].idxmax()]['Time']) mimic_population_tuning_feval.append( population_size * mimic_run_curves.loc[ mimic_run_curves['Iteration'].idxmax()]['Iteration']) plt.rc("font", size=8) plt.rc("axes", titlesize=14) plt.rc("axes", labelsize=10) plt.rc("xtick", labelsize=8) plt.rc("ytick", labelsize=8) plt.rc("legend", fontsize=11) plt.rc("figure", titlesize=11) fig, ax = plt.subplots(2, 4, figsize=(12, 7)) fig.suptitle('Knapsack Algorithm Tuning, problem size = ' + str(problem_size)) ax[0, 0].scatter(rhc_param_tuning_list, rhc_fitness_tuning_list, c='r', marker='x', s=10) ax[0, 0].set(xlabel='Restarts', ylabel='Fitness', title='RHC Restarts') ax[0, 1].scatter(sa_param_tuning_list, sa_fitness_tuning_list, c='g', marker='o', s=10) ax[0, 1].set(xlabel='Temperature', title='SA Temperature') ax[0, 2].scatter(population_sizes_list, ga_population_tuning_fitness, c='g', marker='o', s=10) ax[0, 2].set(xlabel='Population Size', title='GA Population Size') ax[0, 2].yaxis.tick_right() ax[0, 3].scatter(population_sizes_list, mimic_population_tuning_fitness, c='g', marker='o', s=10) ax[0, 3].set(xlabel='Population Size', title='MIMIC Population Size') ax[0, 3].yaxis.tick_right() ax[1, 0].scatter(rhc_param_tuning_list, rhc_feval_tuning_list, c='r', marker='x', s=10) ax[1, 0].set(xlabel='Restarts', ylabel='Function Evaluations') ax[1, 1].scatter(sa_param_tuning_list, sa_feval_tuning_list, c='g', marker='o', s=10) ax[1, 1].set(xlabel='Temperature') ax[1, 2].scatter(population_sizes_list, ga_population_tuning_feval, c='g', marker='o', s=10) ax[1, 2].set(xlabel='Population Size') ax[1, 2].yaxis.tick_right() ax[1, 3].scatter(population_sizes_list, mimic_population_tuning_feval, c='g', marker='o', s=10) ax[1, 3].set(xlabel='Population Size') ax[1, 3].yaxis.tick_right() plt.show() if 'complexity_graph' in task: problem_size_list = np.arange(5, 85, 5) sa_time_list = [] sa_fitness_list = [] sa_feval_list = [] rhc_time_list = [] rhc_fitness_list = [] rhc_feval_list = [] ga_time_list = [] ga_fitness_list = [] ga_feval_list = [] mimic_time_list = [] mimic_fitness_list = [] mimic_feval_list = [] for problem_size in problem_size_list: # Knapsack weights = [idx for idx in range(1, problem_size + 1)] print(weights) values = [idx for idx in range(1, problem_size + 1)] max_weight_pct = 0.3 knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct) best_fitness_list = [] problem = mlrose.DiscreteOpt(int(problem_size), knapsack_fitness, maximize=True, max_val=2) # RHC experiment_name = 'rhc_knapsack_complexity_size_' + str( problem_size) restart_list = [100] rhc = runners.RHCRunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[5000], max_attempts=10, restart_list=restart_list) # the two data frames will contain the results rhc_run_stats, rhc_run_curves = rhc.run() 
rhc_time = rhc_run_curves['Time'] rhc_fitness = rhc_run_curves['Fitness'] rhc_iteration = rhc_run_curves['Iteration'] rhc_fitness_list.append(rhc_run_curves.loc[ rhc_run_curves['Fitness'].idxmax()]['Fitness']) rhc_time_list.append( rhc_run_curves.loc[rhc_run_curves['Time'].idxmax()]['Time']) rhc_feval_list.append(3 * rhc_run_curves.loc[ rhc_run_curves['Iteration'].idxmax()]['Iteration']) # SA experiment_name = 'sa_knapsack_complexity_size_' + str( problem_size) temperature_list = [2] sa = runners.SARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[10000], max_attempts=50, temperature_list=temperature_list) # the two data frames will contain the results sa_run_stats, sa_run_curves = sa.run() # print(sa_run_curves.dtypes) # print(sa_run_curves) sa_run_curves['Temperature'] = pd.to_numeric( sa_run_curves['Temperature'].astype(str).astype(float)) # print(df_run_curves) sa_time = sa_run_curves['Time'] sa_fitness = sa_run_curves['Fitness'] sa_iteration = sa_run_curves['Iteration'] sa_fitness_list.append(sa_run_curves.loc[ sa_run_curves['Fitness'].idxmax()]['Fitness']) sa_time_list.append( sa_run_curves.loc[sa_run_curves['Time'].idxmax()]['Time']) sa_feval_list.append(2 * sa_run_curves.loc[ sa_run_curves['Iteration'].idxmax()]['Iteration']) # GA experiment_name = 'ga_knapsack_complexity_size_' + str( problem_size) population_sizes_list = 100, mutation_rates_list = [0.15] ga = runners.GARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[1000], population_sizes=population_sizes_list, mutation_rates=mutation_rates_list, max_attempts=100) # the two data frames will contain the results ga_run_stats, ga_run_curves = ga.run() # print(ga_run_curves.dtypes) # print(ga_run_curves) # print(df_run_curves) ga_time = ga_run_curves['Time'] ga_fitness = ga_run_curves['Fitness'] ga_iteration = ga_run_curves['Iteration'] ga_fitness_list.append(ga_run_curves.loc[ ga_run_curves['Fitness'].idxmax()]['Fitness']) ga_time_list.append( ga_run_curves.loc[ga_run_curves['Time'].idxmax()]['Time']) ga_feval_list.append(population_sizes_list[0] * ga_run_curves.loc[ ga_run_curves['Iteration'].idxmax()]['Iteration']) # MIMC experiment_name = 'mimic_knapsack_complexity_size_' + str( problem_size) population_sizes_list = 200, keep_percent_list = [0.35] mimic = runners.MIMICRunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[150], population_sizes=population_sizes_list, keep_percent_list=keep_percent_list, max_attempts=15, use_fast_mimic=True) # the two data frames will contain the results mimic_run_stats, mimic_run_curves = mimic.run() # print(mimic_run_curves.dtypes) # print(mimic_run_curves) # print(df_run_curves) mimic_time = mimic_run_curves['Time'] mimic_fitness = mimic_run_curves['Fitness'] mimic_iteration = mimic_run_curves['Iteration'] mimic_fitness_list.append(mimic_run_curves.loc[ mimic_run_curves['Fitness'].idxmax()]['Fitness']) mimic_time_list.append(mimic_run_curves.loc[ mimic_run_curves['Time'].idxmax()]['Time']) mimic_feval_list.append( population_sizes_list[0] * mimic_run_curves.loc[ mimic_run_curves['Iteration'].idxmax()]['Iteration']) plt.rc("font", size=8) plt.rc("axes", titlesize=12) plt.rc("axes", labelsize=10) plt.rc("xtick", labelsize=8) plt.rc("ytick", labelsize=8) plt.rc("legend", fontsize=8) plt.rc("figure", titlesize=11) #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) fig, ax = plt.subplots(1, 3, figsize=(12, 
3.5)) fig.suptitle('Knapsack Complexity Analysis', fontsize=14) # ax[0].plot(problem_size_list, sa_fitness_list, 'b-', label='Simulated Annealing', linewidth=1) # ax[0].plot(problem_size_list, ga_fitness_list, 'g:', label='Genetic', linewidth=1) w = 1 ax[0].bar(problem_size_list - w, sa_fitness_list, width=w, color='blue', label='Simulated Annealing') ax[0].bar(problem_size_list, ga_fitness_list, width=w, color='green', label='Genetic') ax[0].bar(problem_size_list - 2 * w, rhc_fitness_list, width=w, color='red', label='Random Hill Climb') ax[0].bar(problem_size_list + w, mimic_fitness_list, width=w, color='orange', label='MIMIC') ax[0].set(xlabel='Knapsack Size', ylabel='Fitness') ax[0].legend() ax[1].plot(problem_size_list, sa_time_list, 'b-', label='Simulated Annealing', linewidth=1) ax[1].plot(problem_size_list, ga_time_list, 'g:', label='Genetic', linewidth=1) ax[1].plot(problem_size_list, rhc_time_list, 'r--', label='Random Hill Climb', linewidth=1) ax[1].plot(problem_size_list, mimic_time_list, '-.', color='orange', label='MIMIC', linewidth=1) ax[1].set(xlabel='Knapsack Size', ylabel='Time (s)') ax[1].legend() ax[2].plot(problem_size_list, sa_feval_list, 'b-', label='Simulated Annealing', linewidth=1) ax[2].plot(problem_size_list, ga_feval_list, 'g:', label='Genetic', linewidth=1) ax[2].plot(problem_size_list, rhc_feval_list, 'r--', label='Random Hill Climb', linewidth=1) ax[2].plot(problem_size_list, mimic_feval_list, '-.', color='orange', label='MIMIC', linewidth=1) ax[2].set(xlabel='Knapsack Size', ylabel='Function Evaluations') ax[2].yaxis.tick_right() plt.show() if 'performance_graph' in task: problem_size = 80 # Knapsack weights = [idx for idx in range(1, problem_size + 1)] print(weights) values = [idx for idx in range(1, problem_size + 1)] max_weight_pct = 0.3 knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct) best_fitness_list = [] problem = mlrose.DiscreteOpt(int(problem_size), knapsack_fitness, maximize=True, max_val=2) # RHC experiment_name = 'rhc_knapsack_performance_size_' + str(problem_size) restart_list = [100] rhc = runners.RHCRunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[5000], max_attempts=10, restart_list=restart_list) # the two data frames will contain the results rhc_run_stats, rhc_run_curves = rhc.run() # print(rhc_run_curves.dtypes) # print(rhc_run_curves) # print(df_run_curves) rhc_time = rhc_run_curves['Time'] rhc_fitness = rhc_run_curves['Fitness'] rhc_iteration = rhc_run_curves['Iteration'] rhc_feval = rhc_run_curves['Iteration'] * 2 # SA experiment_name = 'sa_knapsack_performance_size_' + str(problem_size) temperature_list = [2] sa = runners.SARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[10000], max_attempts=50, temperature_list=temperature_list) # the two data frames will contain the results sa_run_stats, sa_run_curves = sa.run() # print(sa_run_curves.dtypes) # print(sa_run_curves) sa_run_curves['Temperature'] = pd.to_numeric( sa_run_curves['Temperature'].astype(str).astype(float)) # print(df_run_curves) sa_time = sa_run_curves['Time'] sa_fitness = sa_run_curves['Fitness'] sa_iteration = sa_run_curves['Iteration'] sa_feval = sa_run_curves['Iteration'] * 2 # GA experiment_name = 'ga_knapsack_performance_size_' + str(problem_size) population_sizes_list = 100, mutation_rates_list = [0.15] ga = runners.GARunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, 
iteration_list=[1000], population_sizes=population_sizes_list, mutation_rates=mutation_rates_list, max_attempts=100) # the two data frames will contain the results ga_run_stats, ga_run_curves = ga.run() # print(ga_run_curves.dtypes) # print(ga_run_curves) # print(df_run_curves) ga_time = ga_run_curves['Time'] ga_fitness = ga_run_curves['Fitness'] ga_iteration = ga_run_curves['Iteration'] ga_feval = ga_run_curves['Iteration'] * population_sizes_list # MIMC experiment_name = 'mimic_knapsack_performance_size_' + str( problem_size) population_sizes_list = 200, keep_percent_list = [0.5] mimic = runners.MIMICRunner(problem=problem, experiment_name=experiment_name, output_directory='knapsack', seed=27, iteration_list=[150], population_sizes=population_sizes_list, keep_percent_list=keep_percent_list, max_attempts=15, use_fast_mimic=True) # the two data frames will contain the results mimic_run_stats, mimic_run_curves = mimic.run() # print(mimic_run_curves.dtypes) # print(mimic_run_curves) # print(df_run_curves) mimic_time = mimic_run_curves['Time'] mimic_fitness = mimic_run_curves['Fitness'] mimic_iteration = mimic_run_curves['Iteration'] mimic_feval = mimic_run_curves['Iteration'] * population_sizes_list plt.rc("font", size=8) plt.rc("axes", titlesize=12) plt.rc("axes", labelsize=10) plt.rc("xtick", labelsize=8) plt.rc("ytick", labelsize=8) plt.rc("legend", fontsize=8) plt.rc("figure", titlesize=11) #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4)) fig, ax = plt.subplots(1, 3, figsize=(12, 3.5)) fig.suptitle( 'Knapsack Algorithm Performance Analysis, problem size = ' + str(problem_size), fontsize=14) # ax[0].plot(problem_size_list, sa_fitness_list, 'b-', label='Simulated Annealing', linewidth=1) # ax[0].plot(problem_size_list, ga_fitness_list, 'g:', label='Genetic', linewidth=1) w = 1 ax[0].plot(rhc_iteration, rhc_fitness, 'r--', label='Random Hill Climb', linewidth=1) ax[0].plot(sa_iteration, sa_fitness, 'b:', label='Simulated Annealing', linewidth=1) ax[0].plot(ga_iteration, ga_fitness, 'g-', label='Genetic', linewidth=2) ax[0].plot(mimic_iteration, mimic_fitness, '-.', color='orange', label='MIMIC', linewidth=2) ax[0].set(xlabel='Iteration', ylabel='Fitness') ax[0].legend() #ax[0].set_title('Fitness vs. Iteration') ax[1].plot(rhc_time, rhc_fitness, 'r--', label='Random Hill Climb', linewidth=1) ax[1].plot(sa_time, sa_fitness, 'b:', label='Simulated Annealing', linewidth=1) ax[1].plot(ga_time, ga_fitness, 'g-', label='Genetic', linewidth=2) ax[1].plot(mimic_time, mimic_fitness, '-.', color='orange', label='MIMIC', linewidth=2) ax[1].set(xlabel='Time (s)', ylabel='Fitness') ax[1].legend() ax[2].plot(rhc_feval, rhc_fitness, 'r--', label='Random Hill Climb', linewidth=1) ax[2].plot(sa_feval, sa_fitness, 'b:', label='Simulated Annealing', linewidth=1) ax[2].plot(ga_feval, ga_fitness, 'g-', label='Genetic', linewidth=1) ax[2].plot(mimic_feval, mimic_fitness, '-.', color='orange', label='MIMIC', linewidth=1) ax[2].set(xlabel='Function Evaluations') plt.show() return
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 26 18:17:30 2015

@author: ldierker
"""

import pandas
import numpy
import scipy.stats
import seaborn
import statsmodels
import matplotlib.pyplot as plt

data = pandas.read_csv('nesarc.txt', low_memory=False)

"""
setting variables you will be working with to numeric

10/29/15 note that the code is different from what you see in the videos.
A new version of pandas was released that is phasing out
convert_objects(convert_numeric=True). It still works for now, but it is
recommended that the pandas.to_numeric function be used instead.
"""

"""
old code:
data['TAB12MDX'] = data['TAB12MDX'].convert_objects(convert_numeric=True)
data['CHECK321'] = data['CHECK321'].convert_objects(convert_numeric=True)
data['S3AQ3B1'] = data['S3AQ3B1'].convert_objects(convert_numeric=True)
data['S3AQ3C1'] = data['S3AQ3C1'].convert_objects(convert_numeric=True)
data['AGE'] = data['AGE'].convert_objects(convert_numeric=True)
"""

# new code setting variables you will be working with to numeric
data['TAB12MDX'] = pandas.to_numeric(data['TAB12MDX'], errors='coerce')
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')
data['S3AQ3B1'] = pandas.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pandas.to_numeric(data['S3AQ3C1'], errors='coerce')
data['AGE'] = pandas.to_numeric(data['AGE'], errors='coerce')

# subset data to young adults age 18 to 25 who have smoked in the past 12 months
active_users_longer_intervals = []
active_devs_sleeping_intervals_df = []
active_devs_hibernation_intervals_df = []
active_devs_dead_intervals_df = []

n = 0
for index, row in active_users_breaks.iterrows():
    user_id = row['durations'][0]
    last_commit_day = util.getLastCommitDay(commit_table, user_id)
    last_break_length = util.days_between(last_commit_day, project_end)
    last_break_interval = last_commit_day + '/' + project_end
    # spell out the positional arguments: errors='raise', downcast='integer'
    row['durations'] = pandas.to_numeric(row['durations'][1:-2],
                                         errors='raise',
                                         downcast='integer').tolist()
    row['durations'].append(last_break_length)
    row['datelimits'] = row['datelimits'][1:]
    row['datelimits'].append(last_break_interval)

    user_actions = ae.get_user_activities(super_path, g, project_start_dt,
                                          project_end, user_id)

    ### Here the NORMAL execution goes on
    longer_breaks = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_hibernation_periods_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_sleepy_periods_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_dead_periods_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    dead_th = cfg.dead_threshold
    current_sleepy_periods_details = []
    SLIDE_WIN_SIZE = 20
def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in :meth:`estimate`. Returns ------- Experiment A new :class:`.Experiment` with the new condition variables as described in the class documentation. Also adds the following new statistics: - **mean** : Float the mean of the fitted gaussian in each channel for each component. - **sigma** : (Float, Float) the locations the mean +/- one standard deviation in each channel for each component. - **correlation** : Float the correlation coefficient between each pair of channels for each component. - **proportion** : Float the proportion of events in each component of the mixture model. only added if :attr:`num_components` ``> 1``. """ if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if len(self.channels) == 0: raise util.CytoflowOpError('channels', "Must set at least one channel") # make sure name got set! if not self.name: raise util.CytoflowOpError('name', "You have to set the gate's name " "before applying it!") if self.num_components > 1 and self.name in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {0}" .format(self.name)) if self.sigma > 0: for i in range(1, self.num_components + 1): cname = "{}_{}".format(self.name, i) if cname in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {}" .format(cname)) if self.posteriors: for i in range(1, self.num_components + 1): cname = "{}_{}_posterior".format(self.name, i) if cname in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {}" .format(cname)) if not self._gmms: raise util.CytoflowOpError(None, "No components found. Did you forget to " "call estimate()?") for c in self.channels: if c not in self._scale: raise util.CytoflowOpError(None, "Model scale not set. 
Did you forget " "to call estimate()?") for c in self.channels: if c not in experiment.channels: raise util.CytoflowOpError('channels', "Channel {0} not found in the experiment" .format(c)) for b in self.by: if b not in experiment.conditions: raise util.CytoflowOpError('by', "Aggregation metadata {} not found, " "must be one of {}" .format(b, experiment.conditions)) # # if self.num_components == 1 and self.sigma == 0.0: # raise util.CytoflowOpError('sigma', # "if num_components is 1, sigma must be > 0.0") if self.num_components == 1 and self.posteriors: warn("If num_components == 1, all posteriors will be 1", util.CytoflowOpWarning) # raise util.CytoflowOpError('posteriors', # "If num_components == 1, all posteriors will be 1.") if self.num_components > 1: event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object") if self.sigma > 0: event_gate = {i : pd.Series([False] * len(experiment), dtype = "double") for i in range(self.num_components)} if self.posteriors: event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double") for i in range(self.num_components)} if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) # make the statistics components = [x + 1 for x in range(self.num_components)] prop_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components], names = list(self.by) + ["Component"]) prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"), index = prop_idx, dtype = np.dtype(object)).sort_index() mean_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels], names = list(self.by) + ["Component"] + ["Channel"]) mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"), index = mean_idx, dtype = np.dtype(object)).sort_index() sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"), index = mean_idx, dtype = np.dtype(object)).sort_index() interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"), index = mean_idx, dtype = np.dtype(object)).sort_index() corr_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], names = list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"]) corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"), index = corr_idx, dtype = np.dtype(object)).sort_index() for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # which values are missing? x_na = pd.Series([False] * len(x)) for c in self.channels: x_na[np.isnan(x[c]).values] = True x = x.values x_na = x_na.values group_idx = groupby.groups[group] if self.num_components > 1: predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) predicted_str = pd.Series(["(none)"] * len(predicted)) for c in range(0, self.num_components): predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1) predicted_str[predicted == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. 
if self.sigma > 0.0: for c in range(self.num_components): s = np.linalg.pinv(gmm.covariances_[c]) mu = gmm.means_[c] # compute the Mahalanobis distance f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu)) dist = np.apply_along_axis(f, 1, x, mu, s) # come up with a threshold based on sigma. you'll note we # didn't sqrt dist: that's because for a multivariate # Gaussian, the square of the Mahalanobis distance is # chi-square distributed p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2 thresh = scipy.stats.chi2.ppf(p, 1) event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh) if self.posteriors: # import sys;sys.path.append(r'/home/brian/.p2/pool/plugins/org.python.pydev_6.2.0.201711281614/pysrc') # import pydevd;pydevd.settrace() p = gmm.predict_proba(x) for c in range(self.num_components): event_posteriors[c].iloc[group_idx] = p[:, c] for c in range(self.num_components): if len(self.by) == 0: g = [c + 1] elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)): g = tuple(list(group) + [c + 1]) else: g = tuple([group] + [c + 1]) prop_stat.loc[g] = gmm.weights_[c] for cidx1, channel1 in enumerate(self.channels): g2 = tuple(list(g) + [channel1]) mean_stat.loc[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1]) s, corr = util.cov2corr(gmm.covariances_[c]) sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1])) interval_stat.loc[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]), self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1])) for cidx2, channel2 in enumerate(self.channels): g3 = tuple(list(g2) + [channel2]) corr_stat[g3] = corr[cidx1, cidx2] corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True) new_experiment = experiment.clone() if self.num_components > 1: new_experiment.add_condition(self.name, "category", event_assignments) if self.sigma > 0: for c in range(self.num_components): gate_name = "{}_{}".format(self.name, c + 1) new_experiment.add_condition(gate_name, "bool", event_gate[c]) if self.posteriors: for c in range(self.num_components): post_name = "{}_{}_posterior".format(self.name, c + 1) new_experiment.add_condition(post_name, "double", event_posteriors[c]) new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat) new_experiment.statistics[(self.name, "sigma")] = sigma_stat new_experiment.statistics[(self.name, "interval")] = interval_stat if len(corr_stat) > 0: new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat) if self.num_components > 1: new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat) new_experiment.history.append(self.clone_traits(transient = lambda _: True)) return new_experiment
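# A standalone numeric check of the sigma-gate threshold logic used in apply()
# above (not part of the operation itself): in one dimension, the squared
# Mahalanobis distance of a Gaussian is chi-square distributed with 1 degree of
# freedom, so for sigma = 1 the threshold should come out to 1.0, i.e. events
# within one standard deviation pass the gate.
import scipy.stats

sigma = 1.0
p = (scipy.stats.norm.cdf(sigma) - 0.5) * 2   # ~0.6827, the mass within +/- sigma
thresh = scipy.stats.chi2.ppf(p, 1)           # ~1.0 == sigma ** 2
print(p, thresh)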
def load_data():
    data = pd.read_csv("cell_samples.csv")
    data = data[pd.to_numeric(data['BareNuc'], errors='coerce').notnull()]
    data['BareNuc'] = data['BareNuc'].astype('int')
    return data
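# Minimal sketch of the coerce-and-filter pattern used in load_data(), on a
# made-up frame (the column name is reused only for illustration): pd.to_numeric
# with errors='coerce' turns non-numeric entries into NaN, notnull() drops those
# rows, and the survivors can then be cast to int.
import pandas as pd

demo = pd.DataFrame({'BareNuc': ['1', '10', '?', '5']})
demo = demo[pd.to_numeric(demo['BareNuc'], errors='coerce').notnull()]
demo['BareNuc'] = demo['BareNuc'].astype('int')
print(demo)   # the '?' row is gone and the column is integer-typed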