def atr(df, periods=14, high='high', low='low', close='close', include=True, str='{name}({period})', **kwargs):
    def _atr(df, periods, high, low, close, include, str, detail=False):
        study = 'ATR'
        _df = pd.DataFrame()
        ## === talib ====
        # _df['ATR']=pd.Series(talib.ATR(df[high].values,
        #                                df[low].values,
        #                                df[close].values,
        #                                periods),index=df.index)
        ## === /talib ====
        ## === pure python ====
        _df['HmL'] = df[high] - df[low]
        _df['HmC'] = abs(df[high] - df[close].shift(1))
        _df['LmC'] = abs(df[low] - df[close].shift(1))
        _df['TR'] = _df.apply(max, axis=1)
        _df['ATR'] = _df['TR'].rolling(periods).mean()
        ## === /pure python ====
        return rename(df, _df, study, periods, '', include, str, detail)
    periods = make_list(periods)
    __df = pd.concat([_atr(df, periods=y, high=high, low=low, close=close, include=False, str=str)
                      for y in periods], axis=1)
    if include:
        return pd.concat([df, __df], axis=1)
    else:
        return __df
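# Illustrative usage sketch (not from the original module): this assumes the
# `atr` helper above and its dependencies (`rename`, `make_list`) are importable,
# and only shows the call shape on a made-up OHLC frame.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    close = 100 + rng.normal(0, 1, 250).cumsum()
    ohlc = pd.DataFrame({'high': close + rng.random(250),
                         'low': close - rng.random(250),
                         'close': close})
    # Appends 'ATR(14)' and 'ATR(28)' columns next to the original data.
    print(atr(ohlc, periods=[14, 28]).tail())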
def pickle_trialDataSource():
    '''
    trialDataSource is converted to a dataframe: gazeEventsDF, and then pickled
    return: void
    '''
    global gazeEventsDF
    trialEventsDF = pd.DataFrame()
    for key, source in trialSourceDict.items():
        if key != "index":
            eventDF = source.to_df()
            eventDF['eventType'] = key
            trialEventsDF = pd.concat([eventDF, trialEventsDF], axis=0)
    if gazeEventsDF is False:
        pd.to_pickle(trialEventsDF, eventPickleLoc)
    else:
        # Remove old records from current trial from gazeEventsDF
        gazeEventsDF = gazeEventsDF[gazeEventsDF['trialNum'] != trialNum]
        # Add new data
        gazeEventsDF = pd.concat([gazeEventsDF, trialEventsDF], axis=0)
        pd.to_pickle(gazeEventsDF, eventPickleLoc)
def get_payment_method_details(self, *args):
    """
    A banner usually gives several payment options for users. This function
    returns a dataframe showing how many people clicked on each payment method,
    how many successful donations came from each payment method, the percent of
    donations that came from each method, the total raised for each method, and
    the average raised for each method, with outliers removed
    """
    # set up list of banners to process
    if len(args) == 0:
        names = self.names
    else:
        names = args
    ds = []
    # Define the metrics in the order requested by Megan
    column_order = ['name', 'donations', 'clicks', 'conversion_rate',
                    'percent clicked on', 'percent donated on',
                    'total_amount', 'ave_amount_ro']
    # Step through metrics and compute them for each banner
    for name in names:
        clicks = self.data[name]['clicks']['payment_method'].value_counts()
        donations = self.data[name]['donations']['payment_method'].value_counts()
        donations_sum = self.data[name]['donations'].groupby(['payment_method']).apply(lambda x: x.amount.sum())
        ave = self.data[name]['clean_donations'].groupby(['payment_method']).apply(lambda x: x.amount.mean())
        df = pd.concat([donations, clicks, ave, donations_sum], axis=1)
        df.columns = ['donations', 'clicks', 'ave_amount_ro', 'total_amount']
        # metrics computed from above metrics
        df['conversion_rate'] = 100 * df['donations'] / df['clicks']
        df['percent clicked on'] = 100 * df['clicks'] / df['clicks'].sum()
        df['percent donated on'] = 100 * df['donations'] / df['donations'].sum()
        df['name'] = name
        # Put the metrics in the order requested by Megan
        df = df[column_order]
        ds.append(df)
    df = pd.concat(ds)
    df.index = pd.MultiIndex.from_tuples(zip(df['name'], df.index))
    del df['name']
    df = df.sort()
    return df
def cci(df, periods=14, high='high', low='low', close='close', include=True, str='{name}({period})', **kwargs):
    def _cci(df, periods, high, low, close, include, str, detail=False):
        study = 'CCI'
        _df = pd.DataFrame()
        ## === talib ====
        # _df['CCI']=pd.Series(talib.CCI(df[high].values,
        #                                df[low].values,
        #                                df[close].values,
        #                                periods),index=df.index)
        ## === /talib ====
        ## === pure python ====
        _df['tp'] = df[[low, high, close]].mean(axis=1)
        _df['avgTp'] = _df['tp'].rolling(window=periods).mean()
        mad = lambda x: np.fabs(x - x.mean()).mean()
        _df['mad'] = _df['tp'].rolling(window=periods).apply(mad)
        _df['CCI'] = (_df['tp'] - _df['avgTp']) / (0.015 * _df['mad'])
        ## === /pure python ====
        return rename(df, _df, study, periods, '', include, str, detail)
    periods = make_list(periods)
    __df = pd.concat([_cci(df, periods=y, high=high, low=low, close=close, include=False, str=str)
                      for y in periods], axis=1)
    if include:
        return pd.concat([df, __df], axis=1)
    else:
        return __df
def getData(folderList, shapes, trips, stopTimes, calendar, frequencies):
    for folder in folderList:
        print('Adding data from ' + folder + '.')
        # Read the files from the data.
        readShapes = pd.read_csv('../' + folder + '/shapes.txt')[shapeData]
        readTrips = pd.read_csv('../' + folder + '/trips.txt')[routeData]
        readStopTimes = pd.read_csv('../' + folder + '/stop_times.txt')[timeData]
        readCalendar = pd.read_csv('../' + folder + '/calendar.txt')[calendarData]
        # Append it to the existing data.
        shapes = pd.concat([shapes, readShapes])
        trips = pd.concat([trips, readTrips])
        stopTimes = pd.concat([stopTimes, readStopTimes])
        calendar = pd.concat([calendar, readCalendar])
        if os.path.isfile('../' + folder + '/frequencies.txt'):
            readFrequencies = pd.read_csv('../' + folder + '/frequencies.txt')
            frequencies = pd.concat([frequencies, readFrequencies])
        # Calculate the number of missing shapes.
        num_shapes = trips.groupby('route_id').size()
        num_validshapes = trips[trips.shape_id.isin(shapes.shape_id)].groupby('route_id').size()
        num_missingshapes = num_shapes - num_validshapes
        percent_missingshapes = num_missingshapes / num_shapes * 100
        num_missingshapesList = num_missingshapes[num_missingshapes != 0]
        if not num_missingshapesList.empty:
            print('Missing data from ' + folder + ':')
            print(num_missingshapesList)
            print(percent_missingshapes[percent_missingshapes != 0])
        else:
            print('No data missing.\n')
    return shapes, trips, stopTimes, calendar, frequencies
def getDummiesInplace(columnList, train, test=None):
    # Takes in a list of column names and one or two pandas dataframes
    # One-hot encodes all indicated columns inplace
    columns = []
    if test is not None:
        df = pd.concat([train, test], axis=0)
    else:
        df = train

    for columnName in df.columns:
        index = df.columns.get_loc(columnName)
        if columnName in columnList:
            dummies = pd.get_dummies(df.ix[:, index], prefix=columnName, prefix_sep=".")
            columns.append(dummies)
        else:
            columns.append(df.ix[:, index])
    df = pd.concat(columns, axis=1)

    if test is not None:
        train = df[:train.shape[0]]
        test = df[train.shape[0]:]
        return train, test
    else:
        train = df
        return train
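# Illustrative usage sketch for getDummiesInplace (frames and column names are
# made up; note the function above relies on the legacy DataFrame.ix indexer,
# so it targets older pandas releases): encode a categorical column jointly so
# train and test end up with the same dummy columns.
if __name__ == '__main__':
    import pandas as pd

    train_demo = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
    test_demo = pd.DataFrame({'color': ['blue', 'green'], 'size': [4, 5]}, index=[3, 4])
    train_enc, test_enc = getDummiesInplace(['color'], train_demo, test_demo)
    print(train_enc.columns.tolist())  # e.g. ['color.blue', 'color.green', 'color.red', 'size']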
def parse_sub(sub, office, district):
    sub = sub.reset_index(drop=True)
    # Special case these. Needs to be cleaned up and generalized.
    if (office, district) == ('U.S. House', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:-1].reset_index(drop=True),
                         sub.iloc[10:14, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    elif (office, district) == ('State Assembly', '33'):
        sub = pd.concat([sub.iloc[0:4, 0:-1].reset_index(drop=True),
                         sub.iloc[5:9, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    elif (office, district) == ('U.S. House', '24'):
        sub = pd.concat([sub.iloc[0:6, 0:-1].reset_index(drop=True),
                         sub.iloc[7:13, 1:].reset_index(drop=True)],
                        axis=1).dropna(how='all')
    sub.columns = ['county'] + \
        sub.iloc[:, 1:-1].iloc[0].fillna('').tolist() + ['office']
    sub = sub.dropna(axis=1, how='all')
    sub = sub.rename(columns=parse_candidate)
    parties = sub.iloc[:, 1:-1].iloc[1].to_dict()
    sub = sub[sub.county.isin(COUNTIES)]
    sub = pd.melt(sub, id_vars=['county', 'office'],
                  value_vars=sub.columns.tolist()[1:-1],
                  var_name='candidate', value_name='votes')
    sub['party'] = sub.candidate.apply(lambda x: parties[x])
    sub = sub.assign(office=office, district=district)
    return sub[fieldnames]
def test_iloc_non_unique_indexing(self):
    # GH 4017, non-unique indexing (on the axis)
    df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000})
    idx = np.array(lrange(30)) * 99
    expected = df.iloc[idx]

    df3 = pd.concat([df, 2 * df, 3 * df])
    result = df3.iloc[idx]
    tm.assert_frame_equal(result, expected)

    df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000})
    df2 = pd.concat([df2, 2 * df2, 3 * df2])
    sidx = df2.index.to_series()
    expected = df2.iloc[idx[idx <= sidx.max()]]

    new_list = []
    for r, s in expected.iterrows():
        new_list.append(s)
        new_list.append(s * 2)
        new_list.append(s * 3)

    expected = DataFrame(new_list)
    expected = pd.concat([expected,
                          DataFrame(index=idx[idx > sidx.max()])])
    result = df2.loc[idx]
    tm.assert_frame_equal(result, expected, check_index_type=False)
def iterate_weather_files():
    new_dates = pd.date_range(str(constants.start_yr)+'-01-01', str(constants.until_yr)+'-12-31', freq='D')
    # Iterate through all daily weather files
    for fl in glob.iglob(os.path.join(constants.wth_dir, '*.txt')):
        print os.path.basename(fl)
        inp_df, ix_df = read_input_data(fl)
        # Create output climatology file
        frames = [compute_climatology(col_num, inp_df, ix_df, new_dates) for col_num in xrange(3, len(inp_df.columns))]
        result = pd.concat(frames, axis=1)
        # Add year, month and day columns (1st 3 columns)
        result.columns = xrange(3, len(inp_df.columns))
        result[0] = result.index.year
        result[1] = result.index.month
        result[2] = result.index.day
        comb_df = pd.concat([inp_df, result])
        # Output to new weather file
        epic_out = open(constants.out_dir + os.sep + os.path.basename(fl), 'w')
        for index, row in comb_df.iterrows():
            epic_out.write(('%6d%4d%4d'+6*'%6.2f'+'\n') % (int(row[0]), int(row[1]), int(row[2]),
                                                           float(row[3]), float(row[4]), float(row[5]),
                                                           float(row[6]), float(row[7]), float(row[8])))
        epic_out.close()
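# compute_climatology is not shown above; as a rough, hypothetical sketch of
# one way a per-column climatology could be built (long-term mean for each
# calendar day, broadcast onto the requested output dates), assuming a daily
# DatetimeIndex on the input series:
def climatology_sketch(series, new_dates):
    import pandas as pd
    daily_mean = series.groupby([series.index.month, series.index.day]).mean()
    keys = pd.MultiIndex.from_arrays([new_dates.month, new_dates.day])
    return pd.Series(daily_mean.reindex(keys).values, index=new_dates)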
def test_categorical_writing(self):
    original = DataFrame.from_records(
        [
            ["one", "ten", "one", "one", "one", 1],
            ["two", "nine", "two", "two", "two", 2],
            ["three", "eight", "three", "three", "three", 3],
            ["four", "seven", 4, "four", "four", 4],
            ["five", "six", 5, np.nan, "five", 5],
            ["six", "five", 6, np.nan, "six", 6],
            ["seven", "four", 7, np.nan, "seven", 7],
            ["eight", "three", 8, np.nan, "eight", 8],
            ["nine", "two", 9, np.nan, "nine", 9],
            ["ten", "one", "ten", np.nan, "ten", 10]
        ],
        columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
                 'labeled_with_missings', 'float_labelled', 'unlabeled'])
    expected = original.copy()

    # these are all categoricals
    original = pd.concat([original[col].astype('category')
                          for col in original], axis=1)

    expected['incompletely_labeled'] = expected['incompletely_labeled'].apply(str)
    expected['unlabeled'] = expected['unlabeled'].apply(str)
    expected = pd.concat([expected[col].astype('category')
                          for col in expected], axis=1)
    expected.index.name = 'index'

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:  # Silence warnings
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected)
def test_categorical_warnings_and_errors(self):
    # Warning for non-string labels
    # Error for labels too long
    original = pd.DataFrame.from_records(
        [['a' * 10000],
         ['b' * 10000],
         ['c' * 10000],
         ['d' * 10000]],
        columns=['Too_long'])
    original = pd.concat([original[col].astype('category')
                          for col in original], axis=1)

    with tm.ensure_clean() as path:
        tm.assertRaises(ValueError, original.to_stata, path)

        original = pd.DataFrame.from_records(
            [['a'], ['b'], ['c'], ['d'], [1]],
            columns=['Too_long'])
        original = pd.concat([original[col].astype('category')
                              for col in original], axis=1)

        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path)
            tm.assert_equal(len(w), 1)  # should get a warning for mixed content
def preprocess_greyc_nislab(in_file, out_file):
    """ Preprocess the raw GREYC NISLAB dataset """
    df = pd.concat([pd.read_excel(in_file, sheetname=0),
                    pd.read_excel(in_file, sheetname=1),
                    pd.read_excel(in_file, sheetname=2),
                    pd.read_excel(in_file, sheetname=3),
                    pd.read_excel(in_file, sheetname=4)])
    df = df[df['Class'] == 2]
    df['age'] = (df['Age'] < 30).map({True: '<30', False: '>=30'})
    df['gender'] = df['Gender'].map({'F': 'female', 'M': 'male'})
    df['handedness'] = df['Handedness'].map({'L': 'left', 'R': 'right'})
    df['session'] = np.arange(len(df))
    df['password'] = df['Password'].map({
        'leonardo dicaprio': 1,
        'the rolling stones': 2,
        'michael schumacher': 3,
        'red hot chilli peppers': 4,
        'united states of america': 5,
    })

    def preprocess_row(idx_row):
        idx, row = idx_row
        keyname = list(map(lambda x: 'space' if x == ' ' else x, list(row['Password'])))
        v = np.array(row['Keystroke Template Vector'].strip().split()).astype(int) // 10000
        s = len(keyname) - 1
        pp, rr, pr, rp = [v[s * i:s * (i + 1)] for i in range(4)]
        timepress = np.r_[0, pp].cumsum()
        # Offset the first release time by the duration of the first key
        timerelease = np.r_[rp[0] - rr[0], rr].cumsum()
        # There are ~180 rows where timerelease == timepress.
        # Fix these by assuming at least the minimum standard clock resolution
        timerelease[timerelease == timepress] += 16
        sample = pd.DataFrame.from_items([
            ('user', row['User_ID']),
            ('session', row['session']),
            ('password', row['password']),
            ('age', row['age']),
            ('gender', row['gender']),
            ('handedness', row['handedness']),
            ('timepress', timepress),
            ('timerelease', timerelease),
            ('keyname', keyname)
        ])
        return sample

    df = pd.concat(map(preprocess_row, df.iterrows()))
    df = df.set_index(['user', 'session'])[COLS]
    df = remove_repeated_keys(df)
    df.to_csv(out_file)
    return
def order_hist(CreateGroupList, num, f):
    order = pd.read_csv('./B/jdata_user_order.csv', parse_dates=['o_date'])
    sku = pd.read_csv('./B/jdata_sku_basic_info.csv')
    order = pd.merge(order, sku, on='sku_id', how='left')
    target_order = order[(order.cate == 101) | (order.cate == 30)].reset_index(drop=True)
    first_day = datetime.datetime.strptime('2016-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    target_order['o_day_series'] = (target_order['o_date'] - first_day).apply(lambda x: x.days)
    target_order = target_order.sort_values(by=['user_id', 'o_day_series'], ascending=False).reset_index(drop=True)
    alld = []
    for CG in CreateGroupList:
        CreateGroup = CG
        t = target_order[target_order.o_day_series < CreateGroup]
        features = []
        for i in range(num):
            t2 = t[['user_id', f]].groupby(['user_id']).shift(-i)
            t2.columns = t2.columns + '_{}'.format(i)
            features.append(t2.columns[0])
            t = pd.concat([t, t2], axis=1)
        x = t.drop_duplicates(subset=['user_id'])
        x = x[['user_id'] + features]
        x['CreateGroup'] = CreateGroup
        alld.append(x)
    df = pd.concat(alld).reset_index(drop=True)
    # print(np.unique(df.CreateGroup))
    return df
def test_bollinger(self):
    prices = self.load_pandas('test_bollinger.pkl')
    df = pandas.concat([prices, prices.shift()], axis=1)
    df.columns = ['price', 'price_prev']
    df['sigma'] = prices.std()
    df['mu'] = prices.mean()
    cumul = {'current_scaling': 0.}

    def scale(row, cumul=cumul):
        current_scaling = cumul['current_scaling']
        price = row['price']
        mu = row['mu']
        sigma = 0.8 * row['sigma']
        new_position_scaling = get_position_scaling(price, current_scaling, mu, sigma)
        # updating for next step
        cumul['current_scaling'] = new_position_scaling
        result = {
            'position_scaling': new_position_scaling,
            'band_inf': mu + ((new_position_scaling + 1) * sigma),
            'band_mid': mu + (new_position_scaling * sigma),
            'band_sup': mu + ((new_position_scaling - 1) * sigma)
        }
        return pandas.Series(result)

    df = pandas.concat([df, df.apply(scale, axis=1)], axis=1)
    df_diff = df['position_scaling'] - df['position_scaling'].shift().fillna(0.)
    expected = {Timestamp('2013-02-20 00:00:00'): -2.0,
                Timestamp('2012-12-06 00:00:00'): 0.0,
                Timestamp('2012-05-29 00:00:00'): 0.0,
                Timestamp('2012-01-01 00:00:00'): -3.0,
                Timestamp('2012-07-07 00:00:00'): 1.0,
                Timestamp('2012-02-10 00:00:00'): -2.0,
                Timestamp('2013-01-05 00:00:00'): -1.0,
                Timestamp('2012-03-03 00:00:00'): -1.0,
                Timestamp('2013-01-27 00:00:00'): -1.0,
                Timestamp('2012-04-04 00:00:00'): 0.0,
                Timestamp('2012-04-18 00:00:00'): 1.0,
                Timestamp('2013-01-12 00:00:00'): 0.0}
    variations = df_diff[df_diff != 0.].cumsum().to_dict()
    self.assertEqual(expected, variations)
def getAllJunctionSeqs():
    # save junctions : first B1 junctions
    junctionMotifs = []
    junctionSeqs = {}
    flanks = [['G', 'C', 'G', 'C']]
    maxNumSeqs = 12
    for motif in ['_', 'B1', 'B1,B1', 'B1,B1,B1',
                  'M', 'M,M', 'M,M,M',
                  'M,B1', 'M,M,B1', 'M,B1,B1']:
        junctionSeqs[motif] = {}
        for flank in flanks:
            baseNum = len(flank)/2
            junctionMotif = ','.join(flank[:baseNum] + [motif] + flank[baseNum:])
            junctionSeq = Junction(tuple(junctionMotif.split(','))).sequences
            junctionSeq.loc[:, 'n_flank'] = baseNum
            numSeqs = len(junctionSeq)
            # reduce total number of sequences
            if numSeqs > maxNumSeqs:
                index = np.linspace(0, numSeqs - 1, maxNumSeqs).astype(int)
                junctionSeq = junctionSeq.loc[index]
            junctionSeqs[motif][''.join(flank)] = junctionSeq
        junctionSeqs[motif] = pd.concat(junctionSeqs[motif], names=['flank', 'junction_num'])
    return pd.concat(junctionSeqs, names=['junction'])
def station_files_to_df(station_path, preamble='d04_text_station', concat_intv=10):
    """
    Reads all the individual station files in directory at station_path and returns them as a single dataframe.

    :param station_path: (str) Path to directory with station data files.
    :param concat_intv: (int) Aggregation interval is the number of files to open and convert to data frame before
        aggregating. It would be fastest to open everything and concat only once. But this could cause memory problems.
    :param preamble: (str) Text that target file names begin with.
    :return: (pd.DataFrame) Dataframe containing all the data from the individual files with a date column appended.
        WARNING: This method only keeps the totals for each station. The lane-level data are thrown away.
    """
    head = ['Timestamp', 'Station', 'District', 'Fwy', 'Dir', 'Type', 'Length', 'Samples', 'Observed',
            'Total_Flow', 'Avg_Occ', 'Avg_Speed']  # Header for output df
    df = pd.DataFrame(columns=head)
    start_dir = os.getcwd()
    os.chdir(station_path)
    fnames = [n for n in os.listdir('.') if n[0:len(preamble)] == preamble]  # List of all file names to read
    temp_list = [df]
    for name in fnames:
        print 'Adding file: ' + name
        temp = pd.read_csv(name, sep=',', header=None).iloc[:, 0:len(head)]
        temp.columns = head
        temp_list.append(temp)
        #TODO cast the Station column to int
        if len(temp_list) == concat_intv:
            temp_list = [pd.concat(temp_list)]
    os.chdir(start_dir)
    return pd.concat(temp_list)
def get_pnl_stats(df, start_capital, marginrate, freq):
    df['pnl'] = df['pos'].shift(1) * (df['close'] - df['close'].shift(1)).fillna(0.0)
    df['margin'] = pd.concat([df.pos * marginrate[0] * df.close,
                              -df.pos * marginrate[1] * df.close],
                             join='outer', axis=1).max(1)
    if freq == 'm':
        daily_pnl = pd.Series(df['pnl']).resample('1d', how='sum').dropna()
        daily_margin = pd.Series(df['margin']).resample('1d', how='last').dropna()
        daily_cost = pd.Series(df['cost']).resample('1d', how='sum').dropna()
    else:
        daily_pnl = pd.Series(df['pnl'])
        daily_margin = pd.Series(df['margin'])
        daily_cost = pd.Series(df['cost'])
    daily_pnl.name = 'daily_pnl'
    daily_margin.name = 'daily_margin'
    daily_cost.name = 'daily_cost'
    cum_pnl = pd.Series(daily_pnl.cumsum() + daily_cost.cumsum() + start_capital, name='cum_pnl')
    available = cum_pnl - daily_margin
    res = {}
    res['avg_pnl'] = daily_pnl.mean()
    res['std_pnl'] = daily_pnl.std()
    res['tot_pnl'] = daily_pnl.sum()
    res['tot_cost'] = daily_cost.sum()
    res['num_days'] = len(daily_pnl)
    res['sharp_ratio'] = res['avg_pnl'] / res['std_pnl'] * np.sqrt(252.0)
    max_dd, max_dur = max_drawdown(cum_pnl)
    res['max_margin'] = daily_margin.max()
    res['min_avail'] = available.min()
    res['max_drawdown'] = max_dd
    res['max_dd_period'] = max_dur
    if abs(max_dd) > 0:
        res['profit_dd_ratio'] = res['tot_pnl'] / abs(max_dd)
    else:
        res['profit_dd_ratio'] = 0
    ts = pd.concat([cum_pnl, daily_margin, daily_cost], join='outer', axis=1)
    return res, ts
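# max_drawdown above is defined elsewhere in the project; a minimal sketch of a
# drawdown calculation on a cumulative-PnL series (illustrative only, not the
# original implementation) could look like this:
def max_drawdown_sketch(cum_pnl):
    running_max = cum_pnl.cummax()
    drawdown = cum_pnl - running_max
    max_dd = drawdown.min()               # most negative excursion from the peak
    trough = drawdown.idxmin()            # when the worst drawdown bottomed out
    peak = cum_pnl.loc[:trough].idxmax()  # the preceding high-water mark
    duration = trough - peak              # a Timedelta for datetime-indexed series
    return max_dd, duration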
def get_features(self):
    if not self._features is None:
        return self._features
    feats = []
    for d in self._data:
        self._prep_data(d)
        c = self._feature_subset.data_subset(d)
        f = pd.concat(map(d._rm_break_info, c), axis=0, ignore_index=True)
        cols_set = set(list(f.columns))
        h = [i for i in d.row_index_header if i in cols_set]
        f = f.set_index(h)
        d.prep_once_flag = True
        feats.append(f)
    for i, j in izip(feats, feats[1:]):
        assert (i.index == j.index).all()
    features = pd.concat(feats, axis=1, ignore_index=True)
    features = features.fillna(0)
    features = features.astype(float)
    assert self._feature_subset.has_allele()
    self._features = features.T
    return self._features
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
    # See GH16874, GH18914 and #18686 for why this should be a DataFrame
    from pandas.core.dtypes.common import is_sparse
    frames = [self.dense1, self.dense3]
    sparse_frame = [frames[dense_idx],
                    frames[sparse_idx].to_sparse(fill_value=fill_value)]
    dense_frame = [frames[dense_idx], frames[sparse_idx]]

    # This will try both directions sparse + dense and dense + sparse
    for _ in range(2):
        res = pd.concat(sparse_frame, axis=1)
        exp = pd.concat(dense_frame, axis=1)

        cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]
        for col in cols:
            exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")

        for column in frames[dense_idx].columns:
            if dense_idx == sparse_idx:
                tm.assert_frame_equal(res[column], exp[column])
            else:
                tm.assert_series_equal(res[column], exp[column])

        tm.assert_frame_equal(res, exp)

        sparse_frame = sparse_frame[::-1]
        dense_frame = dense_frame[::-1]
def add_field(self, field):
    """Adds a field object to the universe."""
    self._traits_need_update = True
    if isinstance(field, AtomicField):
        if not hasattr(self, 'field'):
            self.field = field
        else:
            new_field_values = self.field.field_values + field.field_values
            newdx = range(len(self.field), len(self.field) + len(field))
            field.index = newdx
            new_field = pd.concat([self.field, field])
            self.field = AtomicField(new_field, field_values=new_field_values)
    elif isinstance(field, list):
        if not hasattr(self, 'field'):
            fields = pd.concat(field)
            fields.index = range(len(fields))
            fields_values = [j for i in field for j in i.field_values]
            self.field = AtomicField(fields, field_values=fields_values)
        else:
            new_field_values = self.field.field_values + [j for i in field for j in i.field_values]
            newdx = range(len(self.field), len(self.field) + sum([len(i.field_values) for i in field]))
            for i, idx in enumerate(newdx):
                field[i].index = [idx]
            new_field = pd.concat([self.field] + field)
            self.field = AtomicField(new_field, field_values=new_field_values)
    else:
        raise TypeError('field must be an instance of exatomic.field.AtomicField or a list of them')
    self._traits_need_update = True
def concat(*universes, name=None, description=None, meta=None):
    """
    Warning: This function is not fully featured or tested yet!
    """
    raise NotImplementedError()
    kwargs = {'name': name, 'description': description, 'meta': meta}
    names = []
    for universe in universes:
        for key, data in universe._data().items():
            name = key[1:] if key.startswith('_') else key
            names.append(name)
            if name in kwargs:
                kwargs[name].append(data)
            else:
                kwargs[name] = [data]
    for name in set(names):
        cls = kwargs[name][0].__class__
        if isinstance(kwargs[name][0], Field):
            data = pd.concat(kwargs[name])
            values = [v for field in kwargs[name] for v in field.field_values]
            kwargs[name] = cls(data, field_values=values)
        else:
            kwargs[name] = cls(pd.concat(kwargs[name]))
    return Universe(**kwargs)
def generate_dataset(pathway):
    pathway_id, pathway_genes = pathway
    POSITIVE_SAMPLES = 100
    NEGATIVE_SAMPLES = 100
    ovarian = pd.read_csv('../data_preparation/ovarian_inbiomap_exp.tsv', index_col=0)
    means = ovarian.mean(axis=0)
    covariances = ovarian.cov()
    variances = ovarian.var()
    print('here')
    new_pathway_means = pd.Series(np.random.normal(0, variances), index=variances.index)[pathway_genes].fillna(0)
    new_means = pd.concat([means, new_pathway_means], axis=1).fillna(0).sum(axis=1).reindex(means.index)
    positives = pd.DataFrame(np.random.multivariate_normal(new_means, covariances, size=POSITIVE_SAMPLES))
    positives.index = [pathway_id + ' positive'] * len(positives)
    negatives = pd.DataFrame(np.random.multivariate_normal(means, covariances, size=NEGATIVE_SAMPLES))
    negatives.index = [pathway_id + ' negative'] * len(negatives)
    dataset = pd.concat([positives, negatives]).sample(frac=1)  # shuffle
    dataset.columns = ovarian.columns
    filename = 'synthetic_' + pathway_id + '_' + str(POSITIVE_SAMPLES) + 'pos_' + str(NEGATIVE_SAMPLES) + 'neg.csv'
    return dataset.to_csv(filename, index=True, header=True)
def build_totals():
    h5_name = "../amounts.h5"
    store = HDFStore(h5_name)
    files = ['logement_tous_regime', 'pfam_tous_regimes',
             'minima_sociaux_tous_regimes', 'IRPP_PPE',
             'cotisations_TousRegimes']
    first = True
    for xlsfile in files:
        xls = ExcelFile(xlsfile + '.xlsx')
        print xls.path_or_buf
        df_a = xls.parse('amounts', na_values=['NA'])
        try:
            df_b = xls.parse('benef', na_values=['NA'])
        except:
            df_b = DataFrame()
        if first:
            amounts_df = df_a
            benef_df = df_b
            first = False
        else:
            amounts_df = concat([amounts_df, df_a])
            benef_df = concat([benef_df, df_b])
    amounts_df, benef_df = amounts_df.set_index("var"), benef_df.set_index("var")
    print amounts_df.to_string()
    print benef_df.to_string()
    store['amounts'] = amounts_df
    store['benef'] = benef_df
    store.close()
def bollinger_band(self, tick, window=20, k=2, nml=False, mi_only=False):
    """
    Return four arrays for Bollinger Band. The first one is the moving average.
    The second one is the upper band. The third one is the lower band. The
    fourth one is the Bollinger value. If mi_only, then return the moving
    average only.
    """
    ldt_timestamps = self.index
    dt_timeofday = dt.timedelta(hours=16)
    days_delta = dt.timedelta(days=(np.ceil(window * 7 / 5) + 5))
    dt_start = ldt_timestamps[0] - days_delta
    dt_end = ldt_timestamps[0] - dt.timedelta(days=1)
    pre_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
    # ldf_data has the data prior to our current interest.
    # This is used to calculate moving average for the first window.
    ldf_data = ut.get_tickdata([tick], pre_timestamps)
    if nml:
        ma_data = pd.concat([ldf_data[tick]['nml_close'], self['nml_close']])
    else:
        ma_data = pd.concat([ldf_data[tick]['close'], self['close']])
    bo = dict()
    bo['mi'] = pd.rolling_mean(ma_data, window=window)[ldt_timestamps]
    if mi_only:
        return bo['mi']
    else:
        sigma = pd.rolling_std(ma_data, window=window)
        bo['up'] = bo['mi'] + k * sigma[ldt_timestamps]
        bo['lo'] = bo['mi'] - k * sigma[ldt_timestamps]
        bo['ba'] = (ma_data[ldt_timestamps] - bo['mi']) / (k * sigma[ldt_timestamps])
        return bo
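# pd.rolling_mean / pd.rolling_std used above were removed in later pandas
# releases; as a hedged sketch, the same band arithmetic on a plain close-price
# Series with the .rolling() API (independent of this class) would be:
def bollinger_sketch(close, window=20, k=2):
    import pandas as pd
    mi = close.rolling(window=window).mean()
    sigma = close.rolling(window=window).std()
    return pd.DataFrame({'mi': mi,                           # moving average
                         'up': mi + k * sigma,               # upper band
                         'lo': mi - k * sigma,               # lower band
                         'ba': (close - mi) / (k * sigma)})  # Bollinger value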
def find_steady_states_transients(metergroup, columns, noise_level,
                                  state_threshold, **load_kwargs):
    """
    Returns
    -------
    steady_states, transients : pd.DataFrame
    """
    steady_states_list = []
    transients_list = []

    for power_df in metergroup.load(columns=columns, **load_kwargs):
        """
        if len(power_df.columns) <= 2:
            # Use whatever is available
            power_dataframe = power_df
        else:
            # Active, reactive and apparent are available
            power_dataframe = power_df[[('power', 'active'), ('power', 'reactive')]]
        """
        power_dataframe = power_df.dropna()
        if power_dataframe.empty:
            continue

        x, y = find_steady_states(
            power_dataframe, noise_level=noise_level,
            state_threshold=state_threshold)
        steady_states_list.append(x)
        transients_list.append(y)
    return [pd.concat(steady_states_list), pd.concat(transients_list)]
def boll(df, periods=20, boll_std=2, column=None, include=True,
         str='{name}({column},{period})', detail=False, **boll_kwargs):
    def _boll(df, periods, column):
        study = 'BOLL'
        df, _df, column = validate(df, column)
        ## === talib ====
        # upper,middle,lower=talib.BBANDS(df[column].values,periods,boll_std,boll_std)
        # _df=pd.DataFrame({'SMA':middle,'UPPER':upper,'LOWER':lower},index=df.index)
        ## === /talib ====
        ## === pure python ====
        _df['SMA'] = df[column].rolling(window=periods).mean()
        _df['UPPER'] = _df['SMA'] + df[column].rolling(window=periods).std() * boll_std
        _df['LOWER'] = _df['SMA'] - df[column].rolling(window=periods).std() * boll_std
        ## === /pure python ====
        return rename(df, _df, study, periods, column, False, str, detail, output=output)
    column = make_list(column)
    periods = make_list(periods)
    output = ['SMA', 'UPPER', 'LOWER']
    __df = pd.concat([_boll(df, column=x, periods=y) for y in periods for x in column], axis=1)
    if include:
        return pd.concat([df, __df], axis=1)
    else:
        return __df
def __get_freq_vdj(self, data_type, prob=False):
    __sep = '__________'
    sample_freqs = []
    sample_names = []
    for sample in self.samples:
        freq = sample.get_summary(data_type, prob=prob)
        freqval = pd.Series(freq.frequency)
        freqval.index = freq.v.replace(np.nan, 'NA') + __sep + \
                        freq.d.replace(np.nan, 'NA') + __sep + \
                        freq.j.replace(np.nan, 'NA')
        sample_freqs.append(freqval)
        sample_names.append(sample.name)
    freq_dataframe = pd.concat(sample_freqs, axis=1)
    freq_dataframe.columns = sample_names
    vdj_v = []
    vdj_d = []
    vdj_j = []
    for vdj_combination in freq_dataframe.index:
        vv, dd, jj = vdj_combination.split(__sep)
        vdj_v.append(vv)
        vdj_d.append(dd)
        vdj_j.append(jj)
    freq_vdjcmbn = pd.concat([pd.Series(vdj_v), pd.Series(vdj_d), pd.Series(vdj_j)],
                             axis=1).replace('NA', np.nan)
    freq_vdjcmbn.columns = ['V', 'D', 'J']
    freq_vdjcmbn.reset_index(drop=True, inplace=True)
    freq_dataframe.reset_index(drop=True, inplace=True)
    freq = pd.concat([freq_vdjcmbn, freq_dataframe], axis=1)
    return freq
def submit_partial_merge(base, folder, all_blended=False):
    root_path = '/home/workspace/checkins'
    folder = "%s/submit/%s" % (root_path, folder)
    stamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    output = "%s/submit/treva_overwrite_%s_all_blended_%s.csv" % (root_path, stamp, all_blended)
    if all_blended:
        tfiles = [f for f in listdir(folder) if 'blend' in f]
    else:
        tfiles = [f for f in listdir(folder) if 'blend' not in f]
    # # remove old batch
    # print("tfiles before removing old batch: %i" % len(tfiles))
    # old_partials = [f for f in listdir(root_path + "/submit/treva_merge")]
    # tfiles = [f for f in tfiles if f not in old_partials]
    # print("tfiles after removing old batch: %i" % len(tfiles))
    # concat and merge
    df_treva = [pd.read_csv("%s/%s" % (folder, f)) for f in tfiles]
    df_treva = pd.concat(df_treva).sort_values(by='row_id')
    df_base = pd.read_csv("%s/data/submits/%s" % (root_path, base))
    df_base = df_base[~df_base.row_id.isin(df_treva.row_id.values)]
    df_overwrite = pd.concat([df_base, df_treva]).sort_values(by='row_id')
    df_overwrite[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
    print("ensure dim:", len(df_treva), len(set(df_treva.row_id.values)), len(set(df_overwrite.row_id.values)))
    print("overwrite output written in %s @ %s" % (output, datetime.now()))
def correl(df, periods=21, columns=None, include=True, str=None, detail=False, how='value', **correl_kwargs):
    """
    how : string
        value
        pct_chg
        diff
    """
    def _correl(df, periods=21, columns=None, include=True, str=None, detail=False, **correl_kwargs):
        study = 'CORREL'
        df, _df, columns = validate(df, columns)
        _df['CORREL'] = df[columns[0]].rolling(window=periods, **correl_kwargs).corr(df[columns[1]])
        str = str if str else 'CORREL({column1},{column2},{period})'.format(column1=columns[0], column2=columns[1], period=periods)
        return rename(df, _df, study, periods, columns, include, str, detail)
    columns = df.columns if not columns else columns
    if len(columns) != 2:
        raise StudyError("2 Columns need to be specified for a correlation study")
    periods = make_list(periods)
    if how == 'pct_chg':
        df = df[columns].pct_change()
    elif how == 'diff':
        df = df[columns].diff()
    __df = pd.concat([_correl(df, columns=columns, periods=y, include=False, str=str) for y in periods], axis=1)
    if include:
        return pd.concat([df, __df], axis=1)
    else:
        return __df
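# Illustrative usage sketch (column names are made up): a 21-period rolling
# correlation of daily percentage changes between two price columns, plus the
# bare pandas equivalent of the core computation used inside `_correl`.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    prices = pd.DataFrame({'asset_a': 100 + rng.normal(0, 1, 300).cumsum(),
                           'asset_b': 100 + rng.normal(0, 1, 300).cumsum()})
    out = correl(prices, periods=21, columns=['asset_a', 'asset_b'], how='pct_chg')
    rolling_corr = prices['asset_a'].pct_change().rolling(21).corr(prices['asset_b'].pct_change())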
def get_peaks(sub_gene_df, top_s, max_dist, feature_name):
    """
    For each gene in gene_info get the peaks within max_dist in top_s.
    This is basically reverse engineering to get the peak info for each gene
    that was found to be associated with a peak. The reason for reverse
    engineering rather than storing this information when searching for the
    genes for each peak is that we want to use precisely the same function to
    search the genes for the real data and for the permutations.

    Input:
    gene_info ... data frame with index ('chrom','start') and columns 'gene_id' and 'end'
    top_s ... series of peak positions with index (chrom, pos) and values peak height
    max_dist ... maximum distance between gene and peak
    """
    gene_info = sub_gene_df

    def get_dist(df, gene_pos):
        """ calculate distance """
        s = pd.Series(df.index.droplevel(0).values - gene_pos.ix[df.index[0][0]],
                      index=df.index.droplevel(0).values)
        return s

    tot_gene_peaks_df = pd.DataFrame()
    if not top_s.index.is_monotonic:
        top_s = top_s.sortlevel([0, 1])
    if not gene_info.index.is_monotonic:
        gene_info = gene_info.sort_index()
    for chrom in gene_info.index.droplevel(1).unique():
        loc_top_s = top_s.ix[chrom]
        start = np.searchsorted(loc_top_s.index.values + max_dist, gene_info.ix[chrom].index.values)
        end = np.searchsorted(loc_top_s.index.values - max_dist, gene_info.ix[chrom]["end"].values)
        x = pd.concat([loc_top_s.iloc[st:ed] for st, ed in zip(start, end)],
                      keys=gene_info.ix[chrom][feature_name].values)
        x.name = "peak_height"
        dist_start = x.groupby(lambda i: i[0]).apply(
            lambda df: get_dist(df, gene_info.ix[chrom].reset_index().set_index(feature_name)["start"]))
        dist_start.name = "dist_start"
        dist_end = x.groupby(lambda i: i[0]).apply(
            lambda df: get_dist(df, gene_info.ix[chrom].set_index(feature_name)["end"]))
        dist_end.name = "dist_end"
        gene_peaks_df = pd.concat([x, dist_start, dist_end], axis=1)
        gene_peaks_df.index = pd.MultiIndex.from_arrays(
            [gene_peaks_df.index.droplevel(1), [chrom] * len(x), gene_peaks_df.index.droplevel(0)])
        tot_gene_peaks_df = pd.concat([tot_gene_peaks_df, gene_peaks_df], axis=0)
    tot_gene_peaks_df.index.names = [feature_name, "chrom", "peak_pos"]
    return tot_gene_peaks_df
def call_opt_ideal_maxbudget(option, wage_max, wage, ser_prov, demand, supply, ser_max, row_i, col_j, provider_list, overhead_work, FTE_time , service_name): ''' core LP to optimize the allocation by wage or priority --- find something that using grid search ''' total_wage = []; total_sutab = []; d=[]; detail_result=[]; v = np.arange(0, 1.01, 0.1); w_weight = None; s= None for i in v: wi_weight = i; si_weight = 1- i; if( option == 'ideal_staffing'): dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time) if( option == 'ideal_staffing_current'): dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, row_i, col_j,\ FTE_time, overhead_work, provider_list) if tt > 0: # calculate statistics dataset.columns = provider_list['provider_abbr'] df = dataset.apply(sum, axis = 0) doctime = overhead_work.loc[0, provider_list['provider_abbr'] ] totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime cortime = overhead_work.loc[1, provider_list['provider_abbr'] ] totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime df = df + totaldoctime + totalcortime df = (((df/FTE_time *10)/5).astype(float).round())/2 total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) ) else: s = 'Excess supply' if(s == None): if( wage_max < min(total_wage) ): s = 'Try higher maximum wage. Available minimum/maximum wage to minimize wage or minimize \ sutability score is:' +round(min(total_wage)).astype(str)+ ' and '+ round(max(total_wage)).astype(str) print(s) else: wage_max = 0 if( wage_max >= min(total_wage) ): #print( 'Narrow the search.. it takes few seconds') mini = min( np.where( np.array(total_wage) < wage_max )[0] ) if mini == 0: w_weight = 0 else: total_wage = []; sv = np.arange(v[mini]-0.1, v[mini]+0.001, 0.01) for i in sv: wi_weight = i; si_weight = 1- i; if( option == 'ideal_staffing'): dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time) if( option == 'ideal_staffing_current'): dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, row_i, \ col_j,FTE_time, overhead_work, provider_list) if tt > 0: # calculate statistics dataset.columns = provider_list['provider_abbr'] df = dataset.apply(sum, axis = 0) doctime = overhead_work.loc[0, provider_list['provider_abbr'] ] totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime cortime = overhead_work.loc[1, provider_list['provider_abbr'] ] totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime df = df + totaldoctime + totalcortime df = (((df/FTE_time *10)/5).astype(float).round())/2 total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) ) else: total_wage.append(0 ) mini = min( np.where( np.array(total_wage) < wage_max )[0] ) w_weight = sv[mini] s_weight = 1-w_weight if( option == 'ideal_staffing'): dataset, tt = call_opt_ideal(w_weight, s_weight, wage, ser_prov, demand, ser_max,row_i, col_j,FTE_time) if( option == 'ideal_staffing_current'): dataset, tt = call_opt_current(w_weight, s_weight, wage, ser_prov, demand, supply, ser_max, row_i, col_j,\ FTE_time, overhead_work, provider_list) # calculate statistics if tt == 0: s = 'Can not find optimal allocation. 
Change input' else: dataset.columns = provider_list['provider_abbr'] detail_result = pd.concat([service_name, dataset], axis = 1) df = dataset.apply(sum, axis = 0) doctime = overhead_work.loc[0, provider_list['provider_abbr'] ] totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime cortime = overhead_work.loc[1, provider_list['provider_abbr'] ] totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime df = df + totaldoctime + totalcortime df = (((df/FTE_time *10)/5).astype(float).round())/2 df.columns = 'FTE' total_wage = np.round( sum(df*supply['provider_mean_wage']), 0) total_sutab = np.round( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)), 2) ind_wage = np.round( df*supply['provider_mean_wage'], 0) ind_sutab = np.round( (dataset * ser_prov).apply(sum, axis = 0)/dataset.apply(sum, axis = 0) ,2) #tmp = pd.concat([service_name, dataset], axis = 1) s = {} s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'ind_wage': ind_wage, 'ind_sutab': ind_sutab, 'FTE': df, 'detail_f2f_mini': detail_result} return s
def assign_power_curve( self, wake_losses_model="wind_farm_efficiency", smoothing=False, block_width=0.5, standard_deviation_method="turbulence_intensity", smoothing_order="wind_farm_power_curves", turbulence_intensity=None, **kwargs, ): r""" Calculates the power curve of a wind farm. The wind farm power curve is calculated by aggregating the power curves of all wind turbines in the wind farm. Depending on the parameters the power curves are smoothed (before or after the aggregation) and/or a wind farm efficiency (power efficiency curve or constant efficiency) is applied after the aggregation. After the calculations the power curve is assigned to the attribute :py:attr:`~power_curve`. Parameters ---------- wake_losses_model : str Defines the method for taking wake losses within the farm into consideration. Options: 'wind_farm_efficiency' or None. Default: 'wind_farm_efficiency'. smoothing : bool If True the power curves will be smoothed before or after the aggregation of power curves depending on `smoothing_order`. Default: False. block_width : float Width between the wind speeds in the sum of the equation in :py:func:`~.power_curves.smooth_power_curve`. Default: 0.5. standard_deviation_method : str Method for calculating the standard deviation for the Gauss distribution. Options: 'turbulence_intensity', 'Staffell_Pfenninger'. Default: 'turbulence_intensity'. smoothing_order : str Defines when the smoothing takes place if `smoothing` is True. Options: 'turbine_power_curves' (to the single turbine power curves), 'wind_farm_power_curves'. Default: 'wind_farm_power_curves'. turbulence_intensity : float Turbulence intensity at hub height of the wind farm for power curve smoothing with 'turbulence_intensity' method. Can be calculated from `roughness_length` instead. Default: None. roughness_length : float (optional) Roughness length. If `standard_deviation_method` is 'turbulence_intensity' and `turbulence_intensity` is not given the turbulence intensity is calculated via the roughness length. 
Returns ------- :class:`~.wind_farm.WindFarm` self """ # Check if all wind turbines have a power curve as attribute for turbine in self.wind_turbine_fleet["wind_turbine"]: if turbine.power_curve is None: raise ValueError( "For an aggregated wind farm power curve " + "each wind turbine needs a power curve " + "but `power_curve` of '{}' is None.".format(turbine)) # Initialize data frame for power curve values df = pd.DataFrame() for ix, row in self.wind_turbine_fleet.iterrows(): # Check if needed parameters are available and/or assign them if smoothing: if (standard_deviation_method == "turbulence_intensity" and turbulence_intensity is None): if ("roughness_length" in kwargs and kwargs["roughness_length"] is not None): # Calculate turbulence intensity and write to kwargs turbulence_intensity = tools.estimate_turbulence_intensity( row["wind_turbine"].hub_height, kwargs["roughness_length"], ) kwargs["turbulence_intensity"] = turbulence_intensity else: raise ValueError( "`roughness_length` must be defined for using " + "'turbulence_intensity' as " + "`standard_deviation_method` if " + "`turbulence_intensity` is not given") # Get original power curve power_curve = pd.DataFrame(row["wind_turbine"].power_curve) # Editions to the power curves before the summation if smoothing and smoothing_order == "turbine_power_curves": power_curve = power_curves.smooth_power_curve( power_curve["wind_speed"], power_curve["value"], standard_deviation_method=standard_deviation_method, block_width=block_width, **kwargs, ) else: # Add value zero to start and end of curve as otherwise # problems can occur during the aggregation if power_curve.iloc[0]["wind_speed"] != 0.0: power_curve = pd.concat( [ pd.DataFrame(data={ "value": [0.0], "wind_speed": [0.0] }), power_curve, ], join="inner", ) if power_curve.iloc[-1]["value"] != 0.0: power_curve = pd.concat( [ power_curve, pd.DataFrame( data={ "wind_speed": [ power_curve["wind_speed"].loc[ power_curve.index[-1]] + 0.5 ], "value": [0.0], }), ], join="inner", ) # Add power curves of all turbine types to data frame # (multiplied by turbine amount) df = pd.concat( [ df, pd.DataFrame( power_curve.set_index(["wind_speed"]) * row["number_of_turbines"]), ], axis=1, ) # Aggregate all power curves wind_farm_power_curve = pd.DataFrame( df.interpolate(method="index").sum(axis=1)) wind_farm_power_curve.columns = ["value"] wind_farm_power_curve.reset_index(inplace=True) # Apply power curve smoothing and consideration of wake losses # after the summation if smoothing and smoothing_order == "wind_farm_power_curves": wind_farm_power_curve = power_curves.smooth_power_curve( wind_farm_power_curve["wind_speed"], wind_farm_power_curve["value"], standard_deviation_method=standard_deviation_method, block_width=block_width, **kwargs, ) if wake_losses_model == "wind_farm_efficiency": if self.efficiency is not None: wind_farm_power_curve = power_curves.wake_losses_to_power_curve( wind_farm_power_curve["wind_speed"].values, wind_farm_power_curve["value"].values, wind_farm_efficiency=self.efficiency, ) else: msg = ( "If you use `wake_losses_model` '{model}' your WindFarm " "needs an efficiency but `efficiency` is {eff}. \n\n" "Failing farm:\n {farm}") raise ValueError( msg.format(model=wake_losses_model, farm=self, eff=self.efficiency)) self.power_curve = wind_farm_power_curve return self
# repeat the index of each row once for every element of knownForTitles
idx = name_df.index.repeat(name_df['knownForTitles'].str.len())

# split the list values in every row and stack them with the other rows into a dataframe
df1 = pd.DataFrame({x: np.concatenate(name_df[x].values)})

# replace that dataframe's index with the idx we defined above
df1.index = idx

# add each resulting dataframe to the dataframe bucket
df_uni.append(df1)

# combine all the dataframes into one
df_concat = pd.concat(df_uni, axis=1)

# join with the values from the original dataframe
unnested_df = df_concat.join(name_df.drop(['knownForTitles'], 1), how='left')

# select the columns to match the original dataframe
unnested_df = unnested_df[name_df.columns.tolist()]
print(unnested_df)


# # [Grouping primaryName into a list, grouped by knownForTitles](https://academy.dqlab.id/main/projectcode/214/394/1977)

# In[11]:


unnested_drop = unnested_df.drop(['nconst'], axis=1)
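# A hedged alternative for the unnesting step above: on pandas 0.25+ the same
# one-row-per-title shape can be produced with DataFrame.explode, avoiding the
# manual index.repeat / np.concatenate bookkeeping (this assumes
# name_df['knownForTitles'] holds list values, as in the cells above).
exploded_df = name_df.explode('knownForTitles').reset_index(drop=True)
print(exploded_df.head())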
def get_dtypes(cls, dtypes_ids):
    return (
        pandas.concat(cls.materialize(dtypes_ids), axis=1)
        .apply(lambda row: find_common_type_cat(row.values), axis=1)
        .squeeze(axis=0)
    )
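# find_common_type_cat is defined elsewhere in the project; as a rough,
# hypothetical sketch, a helper with that role might prefer a categorical dtype
# when every input dtype is categorical and otherwise fall back to NumPy's
# common-type resolution:
import numpy as np
import pandas
from pandas.api.types import is_categorical_dtype

def _common_dtype_sketch(dtypes):
    dtypes = list(dtypes)
    if all(is_categorical_dtype(t) for t in dtypes):
        # union the category sets (assumes the categories are comparable)
        cats = sorted(set().union(*(t.categories for t in dtypes)))
        return pandas.CategoricalDtype(cats)
    return np.result_type(*dtypes)

# e.g. _common_dtype_sketch([np.dtype('int64'), np.dtype('float32')]) -> float64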
print(df_m_narrow.dtypes)


# In[55]:


df_m_narrow['Date'] = pd.to_datetime(df_m_narrow['Date'])
print(df_m_narrow.dtypes)


# In[201]:


df_m_narrow_dates = df_m_narrow.set_index('Date')
df_m_narrow_dates.head()


# In[193]:


combos = [df_mtd, df_m_narrow_dates]  # listing the data sets
combined = pd.concat(combos)  # combining the datasets
combined


# In[ ]:
print(ParentLevel.head(5))

Lunch = pd.get_dummies(df["lunch"], drop_first=True)
print(Lunch.head(5))

TestPreperation = pd.get_dummies(df["test preparation course"], drop_first=True)
print(TestPreperation.head(5))

RaceEthnicity = pd.get_dummies(df['race/ethnicity'], drop_first=True)
print(RaceEthnicity.head(5))

print(df.head(2))

X = pd.concat([Gender, RaceEthnicity, ParentLevel, TestPreperation, Lunch], axis=1)
print(X.head(1))

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import neighbors

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.1)
model = neighbors.KNeighborsRegressor()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
mean_squared_error(predictions, Y_test)

scores = df.loc[:, ["math score", "reading score", "writing score"]]
def plot_supply_demand(current_demand, current_supply):
    p = pd.concat([current_demand, current_supply], axis=1)
    p.columns = ['current_needs', 'current_supply']
    p.plot(kind='bar')  # supply demand plot
    plt.title('Needs vs. Supply')
    plt.show()
def input_create_future(geo, year,current_year, sut_target, sdoh_score, pop_chronic_trend, pop_chronic_prev, chron_care_freq, geo_area, service_characteristics, pop_acute_need, population, provider_supply, pop_prev_need , provider_list , encounter_detail, overhead_work): yeardiff = int(year) - int(current_year) # every provider should follow the order of provider_list['provider_abbr'] population = population.loc[ population['pop_geo_area'] == geo, : ] population = population[ ['pop_sex','pop_age', year] ] #total_pop = population[year].sum() # preventive demand prev_ser = encounter_detail.loc[ encounter_detail['encounter_category'] == 'Preventive',:] prev_df = pd.merge(prev_ser, service_characteristics, how='left', \ left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc']) p_demand = [] for i in range(len(prev_df)): # demand = rate_per_encounter * freq * n of population * time tmpid = prev_df.loc[i,'encounter_type'] tmp = pop_prev_need[ ['pop_min_age','pop_max_age','pop_sex',tmpid] ] freq = tmp[ tmpid ].astype(float) s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex'] s = s.loc[ tmp[tmpid] > 0 ]; e = e.loc[ tmp[tmpid] > 0 ]; g = g.loc[ tmp[tmpid] > 0 ]# total population need service freq = freq.loc[ tmp[tmpid] > 0 ]; t_demand = 0 for j in range( sum( tmp[tmpid] > 0 )): # won's have BOTH if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum() if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum() t_demand = t_demand + r*freq.iloc[j] p_demand.append(t_demand) # frequency * population f2f1 = prev_df['max_f2f_time']; f2f0 = prev_df['min_f2f_time'] f2f = (f2f1 + f2f0)/5.0*sdoh_score.values prev_demand = prev_df['rate_per_encounter'] * f2f * p_demand # rate_per_encounter prev_service_name = prev_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']] prev_ser_prv = prev_df[ provider_list['provider_abbr'] ] # acute demand == assume excel file updated acute_ser = encounter_detail.loc[encounter_detail['encounter_category'] == 'Acute',:] acute_df = pd.merge(acute_ser, service_characteristics, how='left', \ left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc']) a_demand = [] for i in range(len(acute_df)): # demand = rate_per_encounter * prev * n of population * time tmpid = acute_df.loc[i,'encounter_type'] tmp = pop_acute_need[ ['pop_min_age','pop_max_age','pop_sex',tmpid] ] prev = tmp[ tmpid ].astype(float) s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex'] s = s.loc[ tmp[tmpid] > 0 ]; e = e.loc[ tmp[tmpid] > 0 ]; g = g.loc[ tmp[tmpid] > 0 ]# total population need service prev = prev.loc[ tmp[tmpid] > 0 ]; t_demand = 0 for j in range( sum( tmp[tmpid] > 0 )): # won's have BOTH if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum() if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum() t_demand = t_demand + r *prev.iloc[j]/1000 # proprtion per 1000 a_demand.append(t_demand) # total demand f2f1 = acute_df['max_f2f_time']; f2f0 = acute_df['min_f2f_time'] f2f = (f2f1 + f2f0)/5.0*sdoh_score.values acute_demand = 
acute_df['rate_per_encounter'] * f2f * a_demand acute_service_name = acute_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']] acute_ser_prv = acute_df[ provider_list['provider_abbr'] ] # chronic demand chro_ser = encounter_detail.loc[encounter_detail['encounter_category'] == 'Chronic',:] chro_df = pd.merge(chro_ser, service_characteristics, how='left', \ left_on=['svc_category','svc_desc'], right_on = ['svc_category','svc_desc']) # service level c_demand = [] for i in range(len(chro_df)): # demand = rate_per_encounter(prev)* freq * prev*n of population * time tmpid = chro_df.loc[i,'encounter_type'] freq = chron_care_freq.loc[ chron_care_freq[ tmpid ] > 0, ['chron_cond_abbr', tmpid]] # disease level t_demand = 0; lf = len(freq) if( lf > 0 ): for m in range(len(freq)): prev_freq = pop_chronic_prev[ freq.iloc[m, 0] ]*freq.iloc[m,1].astype(float) prev_freq = prev_freq.values prev_freq = np.squeeze(prev_freq) tmp1 = pop_chronic_trend[ freq.iloc[m, 0]] tmp = pop_chronic_prev[ ['pop_min_age','pop_max_age','pop_sex' ] ] s = tmp[ 'pop_min_age'].astype(int); e = tmp['pop_max_age'].astype(int); g = tmp['pop_sex'] for j in np.where(prev_freq > 0)[0]: if(g.iloc[j]== 'F'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'F'), year ].sum() if(g.iloc[j] == 'M'): r = population.loc[ (population['pop_age'] >= s.iloc[j]) & \ (population['pop_age'] <= e.iloc[j]) & (population['pop_sex'] == 'M'), year ].sum() t_demand = t_demand + r * prev_freq[j]*(1+ tmp1[j])**(yeardiff)/1000 c_demand.append(t_demand) # population * prev * freq # total demand f2f1 = chro_df['max_f2f_time']; f2f0 = chro_df['min_f2f_time'] f2f = (f2f1 + f2f0)/5.0*sdoh_score.values chronic_demand = chro_df['rate_per_encounter'] * f2f * c_demand chronic_service_name = chro_df[['encounter_category','encounter_type', 'svc_category', 'svc_desc']] chronic_ser_prv = chro_df[ provider_list['provider_abbr'] ] demand = pd.concat( [prev_demand, acute_demand, chronic_demand ] ).reset_index( drop=True ) demand = demand.to_frame('demand') # demand ser_prov = pd.concat( [prev_ser_prv, acute_ser_prv, chronic_ser_prv ] ).reset_index( drop=True ) service_name = pd.concat( [prev_service_name, acute_service_name, chronic_service_name] ).reset_index( drop=True ) supply = provider_supply.loc[ provider_supply['provider_geo_area'] == geo, : ] nprovidernum = supply['provider_num']*(1+supply['provider_growth_trend'])**(yeardiff) nproviderwage = supply['provider_mean_wage']*(1+supply['provider_wage_trend'])**(yeardiff) supply = pd.concat([ supply['provider_abbr'], nprovidernum, nproviderwage], axis = 1) supply.columns = ['provider_abbr','provider_num','provider_mean_wage'] supply.index = supply['provider_abbr'] wage = supply['provider_mean_wage']/sum(supply['provider_mean_wage']) # sutability get optimized by sut_target if( sut_target > 0): for col in provider_list['provider_abbr']: ser_prov[col] = ser_prov[col].replace('^\s*$', np.nan, regex=True).astype(float) v = 2*sut_target - ser_prov.loc[ser_prov[col] > sut_target, col] ser_prov.loc[ ser_prov[col] > sut_target, col ] = v ser_prov[col] = 1- ser_prov[col]/sut_target # need to remove NA ser_prov = ser_prov.fillna(1.1) # when licences not allow service, all will get 1.1, 1-top of the licence 0-super easy supply = supply.fillna(0) overhead_work = overhead_work.fillna(0) wage = wage.fillna(0) k = (demand==0) | (np.isnan(demand)) p = np.where( ~k ) ser_prov = ser_prov.iloc[p[0], :].reset_index( drop=True ) demand = 
demand.iloc[p[0]].reset_index( drop=True ) service_name = service_name.iloc[p[0], :].reset_index(drop=True ) wage = wage.loc[ provider_list['provider_abbr'] ] ser_prov = ser_prov[ provider_list['provider_abbr'] ] supply = supply.loc[ provider_list['provider_abbr'] ] return wage, ser_prov, demand, supply, overhead_work, provider_list, service_name
def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True, objective='', metrics='',debug= False, feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ): lgb_params = params train_df = df[df[target].notnull()] test_df = df[df[target].isnull()] # Divide in training/validation and test data print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape)) del df gc.collect() # Cross validation model if stratified: folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234) else: folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234) # folds = GroupKFold(n_splits=5) # Create arrays and dataframes to store results oof_preds = np.zeros((train_df.shape[0],11)) sub_preds = np.zeros((test_df.shape[0],11)) feature_importance_df = pd.DataFrame() feats = predictors cv_resul = [] ''' perm = [i for i in range(len(train_df))] perm = pd.DataFrame(perm) perm.columns = ['index_'] for n_fold in range(5): train_idx = np.array(perm[train_df['cv'] != n_fold]['index_']) valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_']) ''' for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])): if (USE_KFOLD == False) and (n_fold == 1): break train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx] valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx] train_x = pd.concat([train_x,train_old[feats]]) train_y = pd.concat([train_y,train_old[target]]) train_y_t = train_y.values valid_y_t = valid_y.values print(train_y_t) xgtrain = lgb.Dataset(train_x.values, label = train_y_t, feature_name=predictors, categorical_feature=categorical_features ) xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t, feature_name=predictors, categorical_feature=categorical_features ) clf = lgb.train(lgb_params, xgtrain, valid_sets=[xgvalid],#, xgtrain], valid_names=['valid'],#,'train'], num_boost_round=num_boost_round, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval, # feval=feval ) oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration) sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits gain = clf.feature_importance('gain') fold_importance_df = pd.DataFrame({'feature':clf.feature_name(), 'split':clf.feature_importance('split'), 'gain':100*gain/gain.sum(), 'fold':n_fold, }).sort_values('gain',ascending=False) feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx]) # result = clf.best_score['valid']['macro_f1_score'] print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result)) cv_resul.append(round(result,5)) gc.collect() #score = np.array(cv_resul).mean() score = 'model_2' if USE_KFOLD: #print('Full f1 score %.6f' % score) for i in range(11): train_df["class_" + str(i)] = oof_preds[:,i] test_df["class_" + str(i)] = sub_preds[:,i] train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f') test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f') oof_preds = [np.argmax(x)for x in oof_preds] sub_preds = [np.argmax(x)for x in sub_preds] train_df[target] = oof_preds test_df[target] = sub_preds 
print(test_df[target].mean()) train_df[target] = oof_preds train_df[target] = train_df[target].map(label2current_service) test_df[target] = sub_preds test_df[target] = test_df[target].map(label2current_service) print('all_cv', cv_resul) train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False) test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False) print("test_df mean:") display_importances(feature_importance_df,score)
def resource_allocation(option, sub_option, wage, ser_prov, demand, supply, overhead_work, provider_list, service_name, collapse_group, w_weight, s_weight, wage_max, FTE_time): # dimension n_ser = len(demand) n_provider = len(provider_list) col_j = range(n_provider) row_i = range(n_ser) ser_max = pd.DataFrame(index=range(n_ser),columns=provider_list['provider_abbr']) for i in range( n_ser ):# service for m in provider_list['provider_abbr']: max_val = (ser_prov.loc[i, m] <= 1) * demand.loc[i,'demand'] ser_max.loc[i,m] = max_val #====== optimization total_wage = []; total_sutab = []; detail_result = []; d = pd.DataFrame(index = provider_list['provider_abbr']) if( (option == 'ideal_staffing') | (option == 'ideal_staffing_current') ): if (sub_option == "all_combination" ): co = 0; s = {} for i in np.arange(0, 1.1, 0.1): wi_weight = i; si_weight = 1- i; co = co + 1 if( option == 'ideal_staffing'): dataset, tt = call_opt_ideal(wi_weight, si_weight, wage, ser_prov, demand, ser_max, row_i, col_j,FTE_time) if( option == 'ideal_staffing_current'): dataset, tt = call_opt_current(wi_weight, si_weight, wage, ser_prov, demand, supply, ser_max, \ row_i, col_j,FTE_time, overhead_work, provider_list) # calculate statistics if tt == 0: df = pd.DataFrame(np.nan, index=provider_list['provider_abbr'], columns = [i]) d = pd.concat( [ d, df], axis = 1) total_wage.append( np.nan ) total_sutab.append( np.nan ) else: dataset.columns = provider_list['provider_abbr'] # F2F df = dataset.apply(sum, axis = 0) doctime = overhead_work.loc[0, provider_list['provider_abbr'] ] totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime cortime = overhead_work.loc[1, provider_list['provider_abbr'] ] totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime df = df + totaldoctime + totalcortime df = (((df/FTE_time *10)/5).astype(float).round())/2 d = pd.concat( [d, df], axis = 1) total_wage.append( np.round( sum(df*supply['provider_mean_wage']), 0) ) total_sutab.append( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)) ) dataset['weight'] = wi_weight if(co == 1): detail_result = pd.concat([service_name, dataset], axis = 1) else: tmp = pd.concat([service_name, dataset], axis = 1) detail_result = pd.concat([detail_result, tmp], axis = 0) d.columns = ['w_0.0','w_0.1','w_0.2','w_0.3','w_0.4','w_0.5','w_0.6','w_0.7','w_0.8','w_0.9', 'w_1.0'] s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'FTE': d, 'detail_f2f_mini': detail_result} if( sub_option == "wage_weight" ) : if( option == 'ideal_staffing'): dataset, tt = call_opt_ideal(w_weight, s_weight, wage, ser_prov, demand, ser_max,row_i, col_j,FTE_time) if( option == 'ideal_staffing_current'): dataset, tt = call_opt_current(w_weight, s_weight, wage, ser_prov, demand, supply, ser_max, \ row_i, col_j,FTE_time,overhead_work, provider_list) # calculate statistics if tt == 0: s = 'Can not find optimal allocation. 
Check input' else: dataset.columns = provider_list['provider_abbr'] detail_result = pd.concat([service_name, dataset], axis = 1) df = dataset.apply(sum, axis = 0) doctime = overhead_work.loc[0, provider_list['provider_abbr'] ] totaldoctime = overhead_work.loc[0, 'prop_f2f_tot']*demand.sum()[0]*doctime cortime = overhead_work.loc[1, provider_list['provider_abbr'] ] totalcortime = overhead_work.loc[1, 'prop_f2f_tot']*demand.sum()[0]*cortime df = df + totaldoctime + totalcortime df = (((df/FTE_time *10)/5).astype(float).round())/2 df.columns = 'FTE' total_wage = np.round( sum(df*supply['provider_mean_wage']), 0) total_sutab = np.round( sum((dataset * ser_prov).apply(sum, axis = 0))/sum(dataset.apply(sum, axis = 0)) ,2) # this is the code to get total wage and stability scores of individual provider types # if you think individual information is useful, please use similar code for 'all combination' option ind_wage = np.round( df*supply['provider_mean_wage'], 0) ind_sutab = np.round( (dataset * ser_prov).apply(sum, axis = 0)/dataset.apply(sum, axis = 0) ,2) s = {} s = {'total_wage': total_wage, 'total_sutab': total_sutab, 'ind_wage': ind_wage, 'ind_sutab': ind_sutab, 'FTE': df, 'detail_f2f_mini': detail_result} if(sub_option == "wage_max"): s = call_opt_ideal_maxbudget(option, wage_max, wage, ser_prov, demand, supply, ser_max, row_i,\ col_j, provider_list, overhead_work, FTE_time, service_name ) if( option == 'service_allocation' ): #================== get pattern if(collapse_group == True): k = service_name['encounter_category']; k1 = service_name['svc_category'] k2 = k + k1 p = ser_prov.apply(lambda x: ''.join( ((x <=1 )*1).astype('str') ), axis = 1) k2 = k2 + p df = pd.concat([k, k1, k2], axis = 1); df.columns = ['d_type','category','comb'] k1 = df.groupby(["comb"]).size(); n_mem = len(k1) # create assignment ser_prov_mem = pd.DataFrame(index=range( len(ser_prov) ),columns=['mem']) for i in range( n_mem ): ser_prov_mem.loc[ df['comb'] == k1.keys()[i] ] = i # total Demand demand_mem = pd.DataFrame(index=range(n_mem),columns=['demand']) for k1 in range(n_mem): g = demand.loc[ ser_prov_mem['mem'] == k1 , :].apply(sum, axis = 0) demand_mem.iloc[k1,:] = g ser_max_mem = pd.DataFrame(index=range(n_mem),columns=provider_list['provider_abbr']) for k1 in range(n_mem): max_val = ser_max.loc[ ser_prov_mem['mem'] == k1, : ].apply(sum, axis = 0) ser_max_mem.iloc[k1,:] = max_val dataset, current_demand = \ call_assign_service(demand_mem, ser_max_mem, supply, overhead_work, provider_list, FTE_time) time_allocation = pd.DataFrame(index=range(n_ser),columns=provider_list['provider_abbr']) for k1 in range(n_mem): tmp = ser_prov_mem['mem'] == k1; n = sum(tmp) if( sum(tmp) == 1 ): time_allocation.loc[np.where(tmp)[0][0],: ] = dataset.iloc[k1,:] else: i_demand = demand.loc[ np.where(tmp)[0],'demand']; i_demand = i_demand/sum(i_demand) i = dataset.iloc[k1,:].apply( lambda x: x*i_demand ) for j in range(n): time_allocation.loc[ np.where(tmp)[0][j], :] = i.iloc[:,j] dataset = time_allocation else: # not collapsing dataset, current_demand = \ call_assign_service(demand, ser_max, supply, overhead_work, provider_list, FTE_time) dataset = pd.concat([service_name, dataset], axis = 1) s = {} s = {'FTE': current_demand, 'detail_f2f_mini': dataset} return s
def aggregatelines(network, buses, interlines, line_length_factor=1.0): #make sure all lines have same bus ordering positive_order = interlines.bus0_s < interlines.bus1_s interlines_p = interlines[positive_order] interlines_n = interlines[~positive_order].rename(columns={ "bus0_s": "bus1_s", "bus1_s": "bus0_s" }) interlines_c = pd.concat((interlines_p, interlines_n), sort=False) attrs = network.components["Line"]["attrs"] columns = set( attrs.index[attrs.static & attrs.status.str.startswith('Input')]).difference( ('name', 'bus0', 'bus1')) consense = { attr: _make_consense('Bus', attr) for attr in (columns | {'sub_network'} - { 'r', 'x', 'g', 'b', 'terrain_factor', 's_nom', 's_nom_min', 's_nom_max', 's_nom_extendable', 'length', 'v_ang_min', 'v_ang_max' }) } def aggregatelinegroup(l): # l.name is a tuple of the groupby index (bus0_s, bus1_s) length_s = haversine_pts(buses.loc[l.name[0], ['x', 'y']], buses.loc[l.name[1], ['x', 'y']]) * line_length_factor v_nom_s = buses.loc[list(l.name), 'v_nom'].max() voltage_factor = (np.asarray(network.buses.loc[l.bus0, 'v_nom']) / v_nom_s)**2 length_factor = (length_s / l['length']) data = dict(r=1. / (voltage_factor / (length_factor * l['r'])).sum(), x=1. / (voltage_factor / (length_factor * l['x'])).sum(), g=(voltage_factor * length_factor * l['g']).sum(), b=(voltage_factor * length_factor * l['b']).sum(), terrain_factor=l['terrain_factor'].mean(), s_nom=l['s_nom'].sum(), s_nom_min=l['s_nom_min'].sum(), s_nom_max=l['s_nom_max'].sum(), s_nom_extendable=l['s_nom_extendable'].any(), num_parallel=l['num_parallel'].sum(), capital_cost=(length_factor * _normed(l['s_nom']) * l['capital_cost']).sum(), length=length_s, sub_network=consense['sub_network'](l['sub_network']), v_ang_min=l['v_ang_min'].max(), v_ang_max=l['v_ang_max'].min()) data.update((f, consense[f](l[f])) for f in columns.difference(data)) return pd.Series(data, index=[f for f in l.columns if f in columns]) lines = interlines_c.groupby(['bus0_s', 'bus1_s']).apply(aggregatelinegroup) lines['name'] = [str(i + 1) for i in range(len(lines))] linemap_p = interlines_p.join(lines['name'], on=['bus0_s', 'bus1_s'])['name'] linemap_n = interlines_n.join(lines['name'], on=['bus0_s', 'bus1_s'])['name'] linemap = pd.concat((linemap_p, linemap_n), sort=False) return lines, linemap_p, linemap_n, linemap
import itertools
from itertools import groupby

import numpy as np
import pandas as pd

# Sample inputs: the original snippet did not define lst or gb, so these values are assumptions
lst = [1, 1, 2, 2, 3, 1]
gb = lambda x: x  # grouping key

print([(k, list(g)) for k, g in groupby(sorted(lst), key=gb)])
print([(k, list(g)) for k, g in groupby(lst, key=gb)])
list('1234')  # split a string into its characters
[i for i in itertools.chain(str(1234), 'fefg')]
s = '3a4b5cdd7e'
print([''.join(list(g)) for k, g in groupby(s, key=lambda x: x.isdigit())])

# Build a small DataFrame row by row with pd.concat
df = pd.DataFrame()
index = ['alpha', 'beta', 'gamma', 'delta', 'eta']
for i in range(5):
    a = pd.DataFrame([np.linspace(i, 5 * i, 5)], index=[index[i]])
    df = pd.concat([df, a], axis=0)
df[1]

# MultiIndex experiments
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
tuples[1]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index.values

a = np.array([1, 2])
t = [[a, b] for a, b in tuples]
a, b = tuples[1]
z = [a, b]  # the original referenced an undefined name `y` here
list(a)
import pandas as pd
import glob
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("input_files", help='input_files')
parser.add_argument("output_files", help='output_files')
args = parser.parse_args()

input = args.input_files
output = open(args.output_files, 'w')

filez = glob.glob(input + "*.cnt")
print(filez[1])

# Start from the first column of the first file, then append column 6 of every file
t1 = pd.read_csv(filez[0], header=0, sep='\t')
tout = t1.iloc[:, 0]
for f in filez:
    t1 = pd.read_csv(f, header=0, sep='\t')
    tout = pd.concat([tout, t1.iloc[:, 6]], axis=1)
tout.to_csv(output)
def aggregategenerators(network, busmap, with_time=True, carriers=None, custom_strategies=dict()): if carriers is None: carriers = network.generators.carrier.unique() gens_agg_b = network.generators.carrier.isin(carriers) attrs = network.components["Generator"]["attrs"] generators = (network.generators.loc[gens_agg_b].assign( bus=lambda df: df.bus.map(busmap))) columns = (set( attrs.index[attrs.static & attrs.status.str.startswith('Input')]) | {'weight'}) & set(generators.columns) - {'control'} grouper = [generators.bus, generators.carrier] def normed_or_uniform(x): return x / x.sum() if x.sum(skipna=False) > 0 else pd.Series( 1. / len(x), x.index) weighting = generators.weight.groupby(grouper, axis=0).transform(normed_or_uniform) generators['capital_cost'] *= weighting strategies = { 'p_nom_max': np.min, 'weight': np.sum, 'p_nom': np.sum, 'capital_cost': np.sum } strategies.update(custom_strategies) if strategies['p_nom_max'] is np.min: generators['p_nom_max'] /= weighting strategies.update((attr, _make_consense('Generator', attr)) for attr in columns.difference(strategies)) new_df = generators.groupby(grouper, axis=0).agg(strategies) new_df.index = _flatten_multiindex(new_df.index).rename("name") new_df = pd.concat([ new_df, network.generators.loc[~gens_agg_b].assign( bus=lambda df: df.bus.map(busmap)) ], axis=0, sort=False) new_pnl = dict() if with_time: for attr, df in iteritems(network.generators_t): pnl_gens_agg_b = df.columns.to_series().map(gens_agg_b) df_agg = df.loc[:, pnl_gens_agg_b] if not df_agg.empty: if attr == 'p_max_pu': df_agg = df_agg.multiply(weighting.loc[df_agg.columns], axis=1) pnl_df = df_agg.groupby(grouper, axis=1).sum() pnl_df.columns = _flatten_multiindex( pnl_df.columns).rename("name") new_pnl[attr] = pd.concat([df.loc[:, ~pnl_gens_agg_b], pnl_df], axis=1, sort=False) return new_df, new_pnl
df_close = pd.DataFrame()
for fid in range(1, len(filename)):
    # print(fid, filename[fid])
    '''
    _df = pd.read_csv(path + filename[fid], index_col='date', parse_dates=True)
    # keep='first' keeps the first record for any duplicated date
    _df = _df.reset_index().drop_duplicates(subset='date', keep='first')
    _df['date'] = _df['date'].dt.date
    _df = _df.set_index('date')
    _df.to_csv(drop_duplicate_path + filename[fid])
    '''
    _df = pd.read_csv(drop_duplicate_path + filename[fid], index_col='date', parse_dates=True)
    # print(_df.index.duplicated().sum())
    df_close = pd.concat([df_close,
                          _df.loc[~_df.index.duplicated(), ['close']].rename(
                              columns={'close': filename[fid].split('_')[0]})],
                         join='outer', axis=1, sort=True)

df_close = df_close.fillna(method='bfill')
df_close = np.log(df_close)
# First-order difference of the log prices, iterating backwards so earlier rows
# are not overwritten before they are used (note that i == 0 wraps around to the last row)
for i in list(range(1650))[::-1]:
    df_close.iloc[i] -= df_close.iloc[i - 1]

drop_filenames = os.listdir(drop_duplicate_path)
print(len(drop_filenames))

import pickle
with open('./stock_data/data/relation/ordered_ticker.pkl', 'rb') as f:
    all_stock = pickle.load(f)
df_close = df_close[all_stock]
with open('./stock_data/data/relation/adj_mat.pkl', 'rb') as f:
    all_mat = pickle.load(f)
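The backwards loop computes first differences of the log prices row by row; pandas can express the back-fill and the differencing in vectorised form. A small sketch on made-up data, assuming the same wide layout (one close-price column per ticker); unlike the loop, `.diff()` leaves the first row as NaN instead of wrapping around to the last row.

```python
import numpy as np
import pandas as pd

# Toy wide close-price frame: one column per ticker (values are made up)
df_close = pd.DataFrame(
    {'AAPL': [100.0, 101.0, np.nan, 103.0], 'MSFT': [50.0, np.nan, 52.0, 53.0]},
    index=pd.date_range('2020-01-01', periods=4),
)

log_returns = np.log(df_close.bfill()).diff()
print(log_returns)
```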
def get_options_data(self, month=None, year=None, expiry=None): """ ***Experimental*** Gets call/put data for the stock with the expiration data in the given month and year Parameters ---------- month : number, int, optional(default=None) The month the options expire. This should be either 1 or 2 digits. year : number, int, optional(default=None) The year the options expire. This should be a 4 digit int. expiry : date-like or convertible or list-like object, optional (default=None) The date (or dates) when options expire (defaults to current month) Returns ------- pandas.DataFrame A DataFrame with requested options data. Index: Strike: Option strike, int Expiry: Option expiry, Timestamp Type: Call or Put, string Symbol: Option symbol as reported on Yahoo, string Columns: Last: Last option price, float Chg: Change from prior day, float Bid: Bid price, float Ask: Ask price, float Vol: Volume traded, int64 Open_Int: Open interest, int64 IsNonstandard: True if the the deliverable is not 100 shares, otherwise false Underlying: Ticker of the underlying security, string Underlying_Price: Price of the underlying security, float64 Quote_Time: Time of the quote, Timestamp Notes ----- Note: Format of returned data frame is dependent on Yahoo and may change. When called, this function will add instance variables named calls and puts. See the following example: >>> aapl = Options('aapl', 'yahoo') # Create object >>> aapl.calls # will give an AttributeError >>> aapl.get_options() # Get data and set ivars >>> aapl.calls # Doesn't throw AttributeError Also note that aapl.calls and appl.puts will always be the calls and puts for the next expiry. If the user calls this method with a different expiry, the ivar will be named callsYYMMDD or putsYYMMDD, where YY, MM and DD are, respectively, two digit representations of the year, month and day for the expiry of the options. """ return concat([ f(month, year, expiry) for f in (self.get_put_data, self.get_call_data) ]).sortlevel()
def filter_processing(self, logical_type, filter):
    logger = logging.getLogger('django')
    df = pd.read_csv(self.open_path)
    # Logic for the "AND" case: apply each filter in turn, narrowing df each time
    if logical_type == "&":
        for f in filter:
            if f['field_type'] == 0:
                str_expression = "df['" + f['field_name'] + "']" + f['filter_method'] + f['filter_obj']
                logger.debug("LogDebug<str_expression : " + str_expression + ">")
                df = df[eval(str_expression)]
            elif f['field_type'] == 1 and f['filter_method'] == "contains":
                df = df[df[f['field_name']].str.contains(f['filter_obj'])]
            elif f['field_type'] == 1 and f['filter_method'] == "notContains":
                df = df[~df[f['field_name']].str.contains(f['filter_obj'])]
            elif f['field_type'] == 1 and f['filter_method'] == "notNull":
                df = df[df[f['field_name']].notnull()]
            elif f['field_type'] == 1 and f['filter_method'] == "isNull":
                df = df[df[f['field_name']].isnull()]
        path = self.open_path
        df.to_csv(path, index_label=False, index=0)
        logger.debug("LogDebug<logical_type : AND>")
    # Logic for the "OR" case: collect the rows matched by each filter, then combine them
    elif logical_type == "|":
        df_merger = []
        count = 0
        for f in filter:
            if f['field_type'] == 0:
                str_expression = "df['" + f['field_name'] + "']" + f['filter_method'] + f['filter_obj']
                df_merger.append(df[eval(str_expression)])
                count += 1
            elif f['field_type'] == 1 and f['filter_method'] == "contains":
                df_merger.append(df[df[f['field_name']].str.contains(f['filter_obj'])])
                count += 1
            elif f['field_type'] == 1 and f['filter_method'] == "notContains":
                df_merger.append(df[~df[f['field_name']].str.contains(f['filter_obj'])])
                count += 1
            elif f['field_type'] == 1 and f['filter_method'] == "isNull":
                df_merger.append(df[df[f['field_name']].isnull()])
                count += 1
            elif f['field_type'] == 1 and f['filter_method'] == "notNull":
                df_merger.append(df[df[f['field_name']].notnull()])
                count += 1
        # Accumulate the matches, then remove replicated rows
        i = 0
        dfs = pd.DataFrame(None)
        while i < count:
            dfs = pd.concat([dfs, df_merger[i]], join='outer', axis=0, ignore_index=True)
            i += 1
        dfs = dfs.drop_duplicates()
        path = self.open_path
        # Write the de-duplicated union of all matches back to the file
        dfs.to_csv(path, index_label=False, index=0)
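Concatenating the per-filter matches and dropping duplicates works, but the same OR semantics can be had by OR-ing boolean masks, which avoids `eval` and keeps the original row order. A minimal sketch on toy data (the column names and filter values below are made up):

```python
import pandas as pd

df = pd.DataFrame({'age': [25, 40, 31], 'city': ['Lyon', 'Paris', None]})

# Two example filters combined with OR: age > 30, or city contains "Par"
# na=False treats missing city values as non-matches instead of propagating NaN
mask = (df['age'] > 30) | df['city'].str.contains('Par', na=False)
print(df[mask])
```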
# In[6]:
data_train

# In[7]:
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
df = pd.concat([data_train, dummies_Cabin, dummies_Pclass, dummies_Sex, dummies_Embarked], axis=1)
df.drop(['Name', 'Cabin', 'Pclass', 'Sex', 'Embarked', 'Ticket'], axis=1, inplace=True)
df

# In[10]:
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
# Fit once on Age and Fare, then reuse the fitted scaler for both columns
scaler.fit(df[['Age', 'Fare']])
scaled = scaler.transform(df[['Age', 'Fare']])
df['Age_scaled'] = scaled[:, 0]
df['Fare_scaled'] = scaled[:, 1]
df
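A self-contained version of the same one-hot-plus-scaling step on toy data; the values and the reduced column set below are just stand-ins for the Titanic fields used above.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

toy = pd.DataFrame({'Sex': ['male', 'female', 'male'],
                    'Age': [22.0, 38.0, 26.0],
                    'Fare': [7.25, 71.28, 7.92]})

# One-hot encode the categorical column, then drop the original
toy = pd.concat([toy, pd.get_dummies(toy['Sex'], prefix='Sex')], axis=1).drop(columns='Sex')

# Standardize the numeric columns with a single fitted scaler
scaler = StandardScaler().fit(toy[['Age', 'Fare']])
scaled = scaler.transform(toy[['Age', 'Fare']])
toy['Age_scaled'] = scaled[:, 0]
toy['Fare_scaled'] = scaled[:, 1]
print(toy)
```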
def circos(df = DataFrame([]), label = True, node_color = 'None', column = 1, inter = 25, size = 5, fontsize = 10): df1 = df[['GO', 'Entry', 'Term', 'Short_Term']].merge(df, on = 'Entry', how = 'left').drop_duplicates() df2 = DataFrame(df[['GO', 'Term', 'Short_Term', 'Entry']].drop_duplicates().groupby(['GO','Term', 'Short_Term']).Entry.count()).reset_index() #### >>>>>>>>>>>>>>>> #### A partir de una matriz de datos extrae valores no redundantes matrix = df1.pivot_table(values='Entry',index=['GO_x', 'Term_x', 'Short_Term_x'],aggfunc=len,columns=['GO_y', 'Term_y', 'Short_Term_y']) ### df_mat = [] n = -1 for i in list(matrix.columns.values): n += 1 new = DataFrame(matrix.iloc[n:len(matrix)][i]) nn = -1 for index, row in new.iterrows(): nn += 1 df_mat.append([index, i, new.iloc[nn][i]]) nn = 0 ### df_mat = DataFrame(df_mat, columns = ['go0', 'go1', 'val']).dropna() ### nodos = [] for index, row in df_mat.iterrows(): if row.go0 == row.go1: #print(row.go0, row.go1) continue else: #print(row.go0, row.go1) nodos.append([row.go0, row.go1, row.val]) nodos = DataFrame(nodos) columnas = {0:'GO', 1:'Term', 2:'Short_Term'} nodos = DataFrame([[i[column] for i in nodos[0]], [i[column] for i in nodos[1]], nodos[2]/2]).T #### >>>>>>>>>>>>>>>> # si interacciona con mas uno, eliminar la redundancia, y si no interacciona con ninguno, dejar el nodo # y su valor, este se verá en la red como un nodo aislado aislado = [i for i in matrix.columns if len(matrix[[i]].dropna()) == 1] aislado = [df_mat[df_mat.go0 == i] for i in aislado] if len(aislado) > 0: aislado = pd.concat(aislado) aislado.columns = [0, 1, 2] aislado = DataFrame([[i[column] for i in aislado[0]], [i[column] for i in aislado[1]], aislado[2]/2]).T nodos = pd.concat([nodos, aislado]) else: pass nodos.columns = ['Source','Target','Weight'] edges = nodos order = [] for index, row in nodos.iterrows(): order.append(row[0]) order.append(row[1]) orden3 = DataFrame(order).drop_duplicates(keep = 'first').reset_index(drop = True) orden3.columns = [columnas[column]] nodes = pd.merge(orden3, df2, on = [columnas[column]], how = 'left') def make_graph(nodes, edges): g = nx.Graph() for i,row in nodes.iterrows(): keys = row.index.tolist() values = row.values # The dict contains all attributes g.add_node(row[nodes.columns[0]], **dict(zip(keys,values))) for i,row in edges.iterrows(): keys = row.index.tolist() values = row.values g.add_edge(row['Source'], row['Target'], **dict(zip(keys,values))) return g g = make_graph(nodes, edges) for i,row in nodes.iterrows(): if row['Entry'] >= inter: g.add_node(row[nodes.columns[0]], umbral='up') if row['Entry'] < inter: g.add_node(row[nodes.columns[0]], umbral='down') color_nodo = {'Uniques':nodes.columns[0], 'Umbral':'umbral', 'None':False} c = nxv.CircosPlot(g, node_color= color_nodo[node_color], # nodes.columns[0], node_grouping= color_nodo[node_color], node_labels=label, node_label_layout='rotation', edge_width= 'Weight', #edge_color = 'umbral', figsize=(size,size), fontsize = fontsize) return c.draw() ####################
    for target in targets:
        out.append({'itemSet': D['itemSet'], 'target': target})
    return out


dataPos = pd.DataFrame(
    list(map(lambda x: deleteOne(dataPos.loc[x].to_dict()), dataPos.index)))
dataPos['conversion'] = 1
negativeSampling = 5
dataNeg = list(
    map(lambda x: addOne(data.loc[x].to_dict(), negativeSampling), data.index))
dataNeg = pd.DataFrame(list([item for sublist in dataNeg for item in sublist]))
dataNeg['conversion'] = 0
data = pd.concat([dataPos, dataNeg], ignore_index=True)

setName = 'itemSet'
taskName = 'target'
rewardName = 'conversion'
numItems = nItem
numTasks = numItems
numTraits = 100
lbda = 0.01          # 0.1 -> plateaus at 6
alpha = 0.1          # is 0.1 better?
eps = 0.001          # 0.01 -> NA
betaMomentum = 0.0   # 1; drops to 0 after 150 iterations
numIterFixed = 1800
minibatchSize = 5000  # check 10000
maxIter = 2000        # 250
gradient_cap = 1000.0
        result_i.extend(get_ner(i['原发病灶大小'], 'S'))
        result_i.extend(get_ner(i['转移部位'], 'Z'))
        result_i = [j for j in result_i if j[0] is not None]
        # sort entities by length
        result_i = sorted(result_i, key=lambda x: len(x[0]))
        result.append(result_i)
    return result


if __name__ == '__main__':
    # read the data
    dataone = pd.read_excel('./data/onetrain.xlsx')
    datatwo = pd.read_excel('./data/twotrain.xlsx')
    data = pd.concat((dataone, datatwo), axis=0, ignore_index=True)
    result = get_ners_postion(data)
    result = np.array(result)
    np.save('./data/train.npy', result)
    # # save
    # text = np.array(text)
    # pos = np.array(pos)
    # np.savez('./data/train.npy',text=text,pos=pos)

    # analysis
    # ners = get_ners(data)
    # ners_Y = [j[0] for i in ners for j in i if j[1] == 'Y']
    # ners_Z = [j[0] for i in ners for j in i if j[1] == 'Z']
    # count_y = Counter(ners_Y)
    # count_z = Counter(ners_Z)
dictData = { 'time': times, 'month': months, 'sender': senders, 'recipient': recipients } dfData = pd.DataFrame(dictData) #1 Perform Dataframe group by to get the count by Sender & Recipient. Then concat the DataFrame and sort it dfSender = dfData.groupby('sender').count()[['time']] dfSender.rename(columns={"time": "cntSender"}, inplace=True) dfRecipient = dfData.groupby('recipient').count()[['time']] dfRecipient.rename(columns={"time": "cntRecipient"}, inplace=True) dfMerged = pd.concat([dfSender, dfRecipient], axis=1, sort=True) dfMerged.fillna(value={'cntSender': 0, 'cntRecipient': 0}, inplace=True) dfMerged.sort_values(by=['cntSender', 'cntRecipient'], ascending=False, inplace=True, na_position='last') dfMerged.to_csv(OUTFILE1) top = 5 dfHead = dfMerged.head(top) dfHead.rename(columns={ "cntSender": "hSender", "cntRecipient": "hRecipient" }, inplace=True) lHead = dfHead.index.to_list()
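Since `times`, `senders`, and `recipients` come from earlier parsing, here is a self-contained toy version of the same count-and-concat pattern; the addresses and timestamps below are made up.

```python
import pandas as pd

dfData = pd.DataFrame({
    'time': ['09:00', '09:05', '09:10', '09:12'],
    'sender': ['a@x.com', 'a@x.com', 'b@x.com', 'c@x.com'],
    'recipient': ['b@x.com', 'c@x.com', 'a@x.com', 'a@x.com'],
})

# Count messages per sender and per recipient, then align the two counts side by side
dfSender = dfData.groupby('sender').count()[['time']].rename(columns={'time': 'cntSender'})
dfRecipient = dfData.groupby('recipient').count()[['time']].rename(columns={'time': 'cntRecipient'})

dfMerged = pd.concat([dfSender, dfRecipient], axis=1, sort=True)
dfMerged.fillna(value={'cntSender': 0, 'cntRecipient': 0}, inplace=True)
dfMerged.sort_values(by=['cntSender', 'cntRecipient'], ascending=False, inplace=True)
print(dfMerged)
```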
def roll_up( df, levels: List[str], groupby_vars: List[str], extra_groupby_cols: List[str] = None, var_name: str = 'type', value_name: str = 'value', parent_name: str = 'parent', agg_func: str = 'sum', drop_levels: List[str] = None, ): """ Creates aggregates following a given hierarchy --- ### Parameters *mandatory :* - `levels` (*list of str*): name of the columns composing the hierarchy (from the top to the bottom level). - `groupby_vars` (*list of str*): name of the columns with value to aggregate. - `extra_groupby_cols` (*list of str*) optional: other columns used to group in each level. *optional :* - `var_name` (*str*) : name of the result variable column. By default, `“type”`. - `value_name` (*str*): name of the result value column. By default, `“value”`. - `parent_name` (*str*): name of the result parent column. By default, `"parent"`. - `agg_func` (*str*): name of the aggregation operation. By default, `“sum”`. - `drop_levels` (*list of str*): the names of the levels that you may want to discard from the output. --- ### Example **Input** | Region | City | Population | |:---------:|:--------:|:-----------:| | Idf | Panam| 200 | | Idf | Antony | 50 | | Nord | Lille | 20 | ```cson roll_up: levels: ["Region", "City"] groupby_vars: "Population" ``` **Output** | Region | City | Population | value | type | |:---------:|:--------:|:-----------:|:--------:|:------:| | Idf | Panam| 200 | Panam | City | | Idf | Antony | 50 | Antony | City | | Nord | Lille | 20 | Lille | City | | Idf | Nan | 250 | Idf | Region | | Nord | Nan | 20 | Nord | Region | """ dfs = list() groupby_cols_cpy = list(levels) levels_cpy = list(levels) levels_cpy.reverse() extra_groupby_cols = extra_groupby_cols or [] drop_levels = drop_levels or [] previous_level = None for (idx, top_level) in enumerate(levels_cpy): # Aggregation gb_df = getattr( df.groupby(groupby_cols_cpy + extra_groupby_cols)[groupby_vars], agg_func )().reset_index() # Melt-like columns gb_df[var_name] = top_level gb_df[value_name] = gb_df[top_level] gb_df[parent_name] = gb_df[levels_cpy[idx + 1]] if idx < len(levels_cpy) - 1 else np.NaN dfs.append(gb_df) if previous_level in drop_levels: del dfs[-2] previous_level = top_level # Remove one level each time in the groupby: lowest level column needs # a groupby with every levels, the next level needs every one except # the lowest, etc. until the top level column that needs only itself # inside the groupby. groupby_cols_cpy.pop() return pd.concat(dfs, sort=False).reset_index()
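A quick usage sketch of `roll_up` built from the docstring's own example; the DataFrame below simply reproduces that input table, and the printed columns include the `parent` column the function adds.

```python
import pandas as pd

df = pd.DataFrame({
    'Region': ['Idf', 'Idf', 'Nord'],
    'City': ['Panam', 'Antony', 'Lille'],
    'Population': [200, 50, 20],
})

agg = roll_up(df, levels=['Region', 'City'], groupby_vars=['Population'])
print(agg[['Region', 'City', 'Population', 'value', 'type', 'parent']])
```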
def net_plot(df = DataFrame([]), layout = 'Spring', label = 'none', column = 0, label_size = 5,diam_nodos = 10, espe_edges = 0.1, inter = 10, color_inter_min = 'k',color_inter_max = 'blue', edge_alpha_min = 0.3, edge_alpha_max = 0.3, k_num = 3, color_nodo = 'red', node_alpha = 0.7, backg = 'white', label_color = 'black'): # df1 = df[['GO', 'Entry', 'Term', 'Short_Term']].merge(df, on = 'Entry', how = 'left').drop_duplicates() df2 = DataFrame(df[['GO', 'Term', 'Short_Term', 'Entry']].drop_duplicates().groupby(['GO','Term', 'Short_Term']).Entry.count()).reset_index() #### >>>>>>>>>>>>>>>> #### A partir de una matriz de datos extrae valores no redundantes matrix = df1.pivot_table(values='Entry',index=['GO_x', 'Term_x', 'Short_Term_x'],aggfunc=len,columns=['GO_y', 'Term_y', 'Short_Term_y']) ### df_mat = [] n = -1 for i in list(matrix.columns.values): n += 1 new = DataFrame(matrix.iloc[n:len(matrix)][i]) nn = -1 for index, row in new.iterrows(): nn += 1 df_mat.append([index, i, new.iloc[nn][i]]) nn = 0 ### df_mat = DataFrame(df_mat, columns = ['go0', 'go1', 'val']).dropna() ### nodos = [] for index, row in df_mat.iterrows(): if row.go0 == row.go1: #print(row.go0, row.go1) continue else: #print(row.go0, row.go1) nodos.append([row.go0, row.go1, row.val]) nodos = DataFrame(nodos) columnas = {0:'GO', 1:'Term', 2:'Short_Term'} nodos = DataFrame([[i[column] for i in nodos[0]], [i[column] for i in nodos[1]], nodos[2]]).T #### >>>>>>>>>>>>>>>> # si interacciona con mas uno, eliminar la redundancia, y si no interacciona con ninguno, dejar el nodo # y su valor, este se verá en la red como un nodo aislado aislado = [i for i in matrix.columns if len(matrix[[i]].dropna()) == 1] aislado = [df_mat[df_mat.go0 == i] for i in aislado] if len(aislado) > 0: aislado = pd.concat(aislado) aislado.columns = [0, 1, 2] aislado = DataFrame([[i[column] for i in aislado[0]], [i[column] for i in aislado[1]], aislado[2]]).T nodos = pd.concat([nodos, aislado]) else: pass #################### # https://networkx.github.io/documentation/networkx-2.3/auto_examples/drawing/plot_weighted_graph.html#sphx-glr-auto-examples-drawing-plot-weighted-graph-py G=nx.Graph() for index, row in nodos.iterrows(): G.add_edge(row[0], row[1],weight = row[2]) elarge=[(u,v,d['weight']) for (u,v,d) in G.edges(data=True) if d['weight'] >= inter] esmall=[(u,v,d['weight']) for (u,v,d) in G.edges(data=True) if d['weight'] < inter] ### layouts = {'Circular':nx.circular_layout, 'Random':nx.random_layout, 'Shell':nx.shell_layout, 'Spectral':nx.spectral_layout, 'Spring':nx.spring_layout, 'KK':nx.kamada_kawai_layout} #circular_layout #random_layout #shell_layout #spring_layout #spectral_layout #pos=nx.spring_layout(G, k = k_num) # positions for all nodes if layouts[layout] == nx.spring_layout: pos=layouts[layout](G, k = k_num) else: pos=layouts[layout](G) #pos=layout(G) # nodes #------------------------------------------------------------------------------ # ordenar los valores para representarlos en el tama;o del nodo order = [] for index, row in nodos.iterrows(): order.append(row[0]) order.append(row[1]) orden3 = DataFrame(order).drop_duplicates(keep = 'first').reset_index(drop = True) orden3.columns = [columnas[column]] orden4 = pd.merge(orden3, df2, on = [columnas[column]], how = 'left') #------------------------------------------------------------------------------ # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.drawing.nx_pylab.draw_networkx_edges.html nx.draw_networkx_nodes(G,pos,node_size= np.array(orden4.Entry) * 
diam_nodos, node_color= color_nodo,alpha= node_alpha) # edges nx.draw_networkx_edges(G,pos,edgelist=esmall, width = np.array([i[2] for i in esmall]) * espe_edges, alpha= edge_alpha_min,edge_color= color_inter_min,style='-') nx.draw_networkx_edges(G,pos, edgelist=elarge, width = np.array([i[2] for i in elarge]) * espe_edges, alpha= edge_alpha_max,edge_color= color_inter_max,style= '-') # labels posicion = {} ## posicion de las etiquetas, ligeramente arriba for key, value in pos.items(): posicion[key] = value + 0.05 # arreglo de las posiciones de los nodos en el plano cartesiano arr = np.array([[i for i in value] for key, value in pos.items()]) # labels if label == 'label': nx.draw_networkx_labels(G,posicion,font_size=label_size, font_color=label_color) # ,font_weight='bold' if label == 'label': plt.axis([arr[:,0].min() - 0.3, arr[:,0].max() + 0.3, arr[:,1].min() - 0.3, arr[:,1].max() + 0.3]) #plt.axis('off') #plt.show() # display if label == 'none': plt.axis([arr[:,0].min() - 0.2, arr[:,0].max() + 0.2, arr[:,1].min() - 0.2, arr[:,1].max() + 0.2]) #plt.axis('off') #plt.show() # display plt.gca().set_facecolor(backg) plt.gca().spines['top'].set_visible(False) plt.gca().spines['right'].set_visible(False) plt.gca().spines['left'].set_visible(False) plt.gca().spines['bottom'].set_visible(False) plt.gca().axes.get_xaxis().set_visible(False) plt.gca().axes.get_yaxis().set_visible(False)
filen=filepath+'cv_ovocs_2018_M_Rowlinson.csv' odf = pd.read_csv(filen, index_col=0) odf.index = pd.to_datetime(odf.index,format='%d/%m/%Y %H:%M') cols=list(df) ; ocols = list(odf) for col in cols: try: df[col] = df[col].loc[~(df[col] <= 0. )] except: pass for col in ocols: odf = odf.loc[~(odf[col] <= 0.)] cols=cols+ocols hourly=df.resample('H').mean() ohourly=odf.resample('H').mean() df=pd.concat([hourly,ohourly], axis=1, sort=False) cvao = df[cv_spec]['2016'] #cvao = cvao.resample('H').mean() o3 = np.concatenate(o3,axis=0) mf = pd.DataFrame(o3[:,0,27,31]) mf.index = cvao.index m31 = 24*31 ; m30 = 24*30 ; m29 = 24*29 ; m28 = 24*28 mf_djf = pd.concat([mf[-m31:],mf[:m31+m29]]) mf_mam = mf[m31+m29:m31*3+m30+m29] mf_jja = mf[m31*3+m30+m29:m31*5+m30*2+m29] mf_son = mf[m31*5+m30*2+m29:m31*6+m30*4+m29] MF = [mf_djf, mf_mam, mf_jja, mf_son] cv_djf = pd.concat([cvao[-m31:],cvao[:m31+m29]])
li = [] pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', None) pd.set_option('display.width', None) pd.set_option('display.max_colwidth', -1) for filename in all_files: dfi = pd.read_csv(filename, index_col=None, header=0) dfi = dfi.sort_values('mse') selectedi = dfi.head(1) li.append(dfi) df = pd.concat(li, axis=0, ignore_index=True) if args.abc: dfa = df[df['formulas'].str.contains("_A")] dfb = dfa[dfa['formulas'].str.contains("_B")] df = dfb[dfb['formulas'].str.contains("_C")] df = df.sort_values('mse') selected = df.head(args.n) previousvalue = float("inf") for f in selected[["formulas", "mse"]].values: rmse = math.sqrt(f[1]) if rmse != previousvalue: print(f[0], rmse)
rollnowise = { 'quiz_question': questions, 'option1': optiona, 'option2': optionb, 'option3': optionc, 'option4': optiond, 'correct_option': correctans, 'positive marks': answers, 'negative marks': wrongans, 'response': response, } listing = [correctchoice, wrongchoice, unattempted, totalmrks, totalmarks] legend = { 'Legend': ['correctchoice', 'wrongchoice', 'unattempted', 'marks', 'fullmarks'], 'Total': listing } dataframe = pd.DataFrame(rollnowise) dataframe2 = pd.DataFrame(legend) dataframe3 = pd.concat([dataframe, dataframe2], ignore_index=False, axis=1) filename = 'individual_responses/' + 'q' + str(quizno) + '_' + str( ROLLNO) + ".csv" dataframe3.to_csv(filename) # c.execute('SELECT * FROM project1_marks') # conn.commit() quizfile = {'Roll': [str(ROLLNO)], 'MARKS': [str(totalmrks)]} dataset = pd.DataFrame(quizfile) filename = "quiz_wise_responses/" + 'scores_' + 'q' + quizno + '.csv' dataset.to_csv(filename, mode='a') conn.close()
dow_jones = read_data('data/djia.csv') print("Loaded DJIA", len(dow_jones)) s_p = read_data('data/S&P.csv') print("Loaded S&P", len(s_p)) russell_2000 = read_data('data/Russell2000.csv') print("Loaded Russell", len(russell_2000)) nasdaq = read_data('data/nasdaq.csv') print("Loaded NASDAQ", len(nasdaq)) # combine stock indexes into one dataframe data = pd.concat( [dow_jones['Open'], s_p['Open'], russell_2000['Open'], nasdaq['Open']], axis=1, keys=['dow_jones', 'S&P', 'russell_2000', 'nasdaq']) ''' # compare indexes (data / data.ix[0] * 100).plot(figsize=(12,12)) plt.title("Standarized Indexes 1990-2016") plt.show() ''' # predict next year's price dow_jones['Future'] = dow_jones['Open'].shift(-252) # drop Nan dow_jones = dow_jones.dropna() train = dow_jones.loc[dow_jones.index < '12-31-2015']
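The `keys=` argument names one column per index in the combined frame, and `shift(-252)` aligns each row with the opening price roughly one trading year later. A toy sketch of both steps, with made-up tickers and prices and a 2-row shift standing in for the 252-day horizon:

```python
import pandas as pd

idx = pd.date_range('2015-01-01', periods=5, freq='B')
a = pd.DataFrame({'Open': [10.0, 10.5, 10.7, 10.6, 11.0]}, index=idx)
b = pd.DataFrame({'Open': [20.0, 19.5, 19.8, 20.2, 20.4]}, index=idx)

# One column per index, named via keys=
data = pd.concat([a['Open'], b['Open']], axis=1, keys=['index_a', 'index_b'])

# Future target: the opening price 2 rows ahead (252 trading days in the snippet above)
a['Future'] = a['Open'].shift(-2)
a = a.dropna()
print(data.head())
print(a)
```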
#test_corpus = corpus[N_TRAIN:] # Write the shuffled corpora to file f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w') corpus = "\n".join([" ".join(w) for w in corpus]) f.write(corpus) f.close() #f = open('%s/bigramfree_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w') #corpus = "\n".join([ " ".join(w) for w in test_corpus ]) #f.write(corpus) #f.close() return corpus_stats return None corpus_stats = pd.DataFrame() if True: pool = mp.Pool(processes=6) results = [ pool.apply_async(generate_bigram_corpus, args=(i, )) for i in range(N_CORPORA) ] output = [p.get() for p in results] corpus_stats = pd.concat(output) corpus_stats.to_csv('interim/bigramgen_free_corpus_stats.csv')
def extract(self, start_date, end_date, ticker_list): ''' Extract histroical data. Args: start_date: The date range(start). end_date: The date range(end). ticker_list: `list` of The target tickers. Returns: `pd.DataFrame`. ''' df_list = [None] for i in range(len(ticker_list)): df = pd.read_csv(self.__logs_dir + ticker_list[i] + ".csv") df["ticker"] = ticker_list[i] df_list.append(df) result_df = pd.concat(df_list) #self.__logger.debug("total: " + str(result_df.shape[0])) result_df = result_df[[ "adjusted_close", "close", "high", "low", "open", "volume", "timestamp", "ticker" ]] result_df = result_df.dropna() #self.__logger.debug("After dropping na: " + str(result_df.shape[0])) result_df["date"] = result_df["timestamp"] try: result_df["timestamp"] = result_df.date.apply(self.__get_timestamp) except Exception as e: print(e) print(result_df["date"].drop_duplicates()) raise if start_date is not None: start_timestamp = datetime.strptime(start_date, self.__date_format).timestamp() result_df = result_df[result_df.timestamp >= start_timestamp] if end_date is not None: end_timestamp = datetime.strptime(end_date, self.__date_format).timestamp() result_df = result_df[result_df.timestamp <= end_timestamp] date_df = result_df.sort_values(by=["timestamp"]).drop_duplicates(["date"]) date_df = date_df.reset_index() date_df = pd.concat([ date_df, pd.DataFrame(np.arange(date_df.shape[0]), columns=["date_key"]) ], axis=1) result_df = pd.merge( result_df, date_df[["date", "date_key"]], on="date" ) result_df = result_df.sort_values(by=["timestamp", "ticker"]) return result_df