def test_column_dups2(self): # drop buggy GH 6240 df = DataFrame({'A': np.random.randn(5), 'B': np.random.randn(5), 'C': np.random.randn(5), 'D': ['a', 'b', 'c', 'd', 'e']}) expected = df.take([0, 1, 1], axis=1) df2 = df.take([2, 0, 1, 2, 1], axis=1) result = df2.drop('C', axis=1) assert_frame_equal(result, expected) # dropna df = DataFrame({'A': np.random.randn(5), 'B': np.random.randn(5), 'C': np.random.randn(5), 'D': ['a', 'b', 'c', 'd', 'e']}) df.iloc[2, [0, 1, 2]] = np.nan df.iloc[0, 0] = np.nan df.iloc[1, 1] = np.nan df.iloc[:, 3] = np.nan expected = df.dropna(subset=['A', 'B', 'C'], how='all') expected.columns = ['A', 'A', 'B', 'C'] df.columns = ['A', 'A', 'B', 'C'] result = df.dropna(subset=['A', 'C'], how='all') assert_frame_equal(result, expected)
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
    '''Return the prediction function, for a given site iden, history Charg
    and temperature Temps'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].ix[X.index]
        X['fracday'] = X.index.minute/60.+X.index.hour
        X['lastminutes'] = X[iden].ix[X.index-10*Minute()].values
        X['yesterday'] = X[iden].ix[X.index-Day()].values
        X['yesterdaybis'] = X[iden].ix[X.index-Day()-10*Minute()].values
        X['lastweek'] = X[iden].ix[X.index-Week()].values
        X['lastweekbis'] = X[iden].ix[X.index-Week()-10*Minute()].values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb"))
        else:
            # cache the full (non-test) training set
            pickle.dump(X, open(CACHE_DIR+"X_"+iden+".p", "wb"))
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, 1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha=0.000001, n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return lambda x: clf.predict(scalerX.transform(x))
def checkFile(fn,satdata,beamisr,maxdtsec): """ we need to find matching ISR beam IDs very near the time the satellite passes through the ISR beam. for speed, use Unix epoch time (seconds since Jan 1, 1970) for comparisons Note: the Madrigal HDF5 data is read in as a Numpy structured array Algorithm (not optimized): 1) knowing what satellites will eventually intersect beams, are any of those beamids in this file? 2) knowing what times intersections will occur, do those times exist in this file for those beams? 3) For the beams that meet conditions 1 and 2, compute TEC by numerical integration of NE output: tecisr: 2-D DataFrame, beamid x time """ h5p = '/Data/Table Layout' #rows: satellite. cols: time intersections = satdata.loc[:,:,'intersect'] intersections.dropna(axis=1,how='all',inplace=True) beamlist = beamisr['BEAMID'].values # have to make a copy to sort beamlist.sort() tecisr = DataFrame(index=beamlist, columns=intersections.columns) try: with h5py.File(fn,'r',libver='latest') as f: for t in intersections: #for each time... #mask for matching beam ids (not necessarily matching in time yet...) intmask = np.in1d(f[h5p]['beamid'].astype(int),intersections[t].dropna().astype(int)) if not intmask.any(): #no overlap, no point in evaluating times continue #mask for matching times (not necessarily matching beamids) timemask =np.absolute(f[h5p]['ut1_unix'] - (t.to_pydatetime()-datetime(1970,1,1)).total_seconds()) < maxdtsec #mask for where beamid and times "match" inttimemask = intmask & timemask #retrieve "good" rows of HDF5 that are the correct Beam ID(s) and time(s) intdata = f[h5p][inttimemask] #TODO not tested past this point #TODO account for the case where there are two times and one beam that overlap with the satellite. """ intdata will have numerous rows corresponding to each matching time & beam id each row is a range cell. These rows will be numerically integrated over Ne. """ uniqbeamid = np.unique(intdata['beamid']).astype(int) for b in uniqbeamid: mask = np.isclose(intdata['beamid'],b) #this is one beam's rows, all range bins mask &= np.isfinite(intdata['nel'][mask]) #dropna tecisr.loc[b,t] = comptecisr(10**intdata['nel'][mask], intdata['range'][mask]) except ValueError as e: warn('{} does not seem to have the needed data fields. {}'.format(fn,e)) tecisr.dropna(axis=1,how='all',inplace=True) #only retain times with TEC data (vast majority don't have) return tecisr
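# The comptecisr() helper called above is not included in this snippet. A minimal
# sketch of what a trapezoidal TEC integration over the range gates could look like,
# assuming Ne arrives as linear electron density in m^-3 and the slant ranges in km
# (the signature and units are assumptions, not taken from the source):
import numpy as np

def comptecisr(ne, srng):
    """Hypothetical TEC integration: integrate Ne over slant range.

    ne   : electron density per range gate [m^-3] (assumed unit)
    srng : slant range of each gate [km] (assumed unit)
    returns TEC in TECU (1 TECU = 1e16 electrons / m^2)
    """
    order = np.argsort(srng)                      # range gates may not arrive sorted
    tec = np.trapz(ne[order], srng[order] * 1e3)  # km -> m, then trapezoidal rule
    return tec / 1e16                             # convert to TEC units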
def __read_data_values(self): """ Reads the `Data Values` worksheet in a Time-Series excel file. :return: """ sheet = self.workbook.get_sheet_by_name('Data Values') # type: Worksheet dvs = self.__dv_row_generator(sheet.iter_rows()) headers = next(dvs) df = DataFrame([dv for dv in dvs], columns=headers) df.dropna(how='all', inplace=True) self.tables['DataValues'] = df
def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) df[2][:2] = nan dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0) expected = df.loc[lrange(2, 6)] inp = df.copy() inp.dropna(axis=0, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) # threshold dropped = df.dropna(axis=1, thresh=5) expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, thresh=5, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0, thresh=4) expected = df.loc[lrange(2, 6)] inp = df.copy() inp.dropna(axis=0, thresh=4, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=1, thresh=4) assert_frame_equal(dropped, df) dropped = df.dropna(axis=1, thresh=3) assert_frame_equal(dropped, df) # subset dropped = df.dropna(axis=0, subset=[0, 1, 3]) inp = df.copy() inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) assert_frame_equal(dropped, df) assert_frame_equal(inp, df) # all dropped = df.dropna(axis=1, how='all') assert_frame_equal(dropped, df) df[2] = nan dropped = df.dropna(axis=1, how='all') expected = df.loc[:, [0, 1, 3]] assert_frame_equal(dropped, expected) # bad input pytest.raises(ValueError, df.dropna, axis=3)
def parse_essentials(essentials_file, samples, normalization=None, cutoff=100): data = DataFrame.from_csv(essentials_file, sep="\t", index_col=False) data = data[["Position"] + samples] data["sum"] = data[samples].apply(sum, axis=1) data = data[data["sum"] < cutoff] data = data.groupby("Position").sum() for sample in samples: sample_data = DataFrame(None, index=data.index) if normalization is not None: sample_data["insertions"] = data[sample].apply(normalization) else: sample_data["insertions"] = data[sample] sample_data.dropna(inplace=True) yield sample_data
def pd_02(): string_data=Series(['a','b','c',np.nan,'e',None]) print string_data print string_data.isnull() print string_data.dropna() df=DataFrame(np.random.randn(7,3)) df.ix[:4,1]=np.nan df.ix[:2,2]=np.nan print df print df.dropna() print df.fillna(0) print df.fillna({1:0.5,3:-1}) print df df.fillna(0,inplace=True) print df
def get_flights_from_route(cur, origin, destination): """ Returns a dataframe for all flights matching origin, destination. """ import time ### MySQL query time0 = time.time() cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination)) rows = cur.fetchall() td = time.time() - time0 print 'Database query took %.2f seconds.' % td ### Convert to dataframe df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay']) ### Drop columns without delays (cancellations) df = df.dropna() ### Create some auxiliary columns df['DayOfYear'] = df.apply( lambda x: datetime.datetime(x['Year'],x['Month'],x['DayOfMonth']).timetuple().tm_yday, axis=1) df['Week'] = df['DayOfYear'] / 7 + 1 df['DepHour'] = df['CRSDepTime']/100 ### Drop unused columns df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1) ## df.head() return df
def _to_frame_build_data_frame(self, tfp, hasna, usecols): # build data frame if usecols is None: usecols = ['node', 'kind', 'level', 'msg'] dfinfo = {} dfcols = [] if 'node' in usecols: dfinfo['node'] = tfp.nodes dfcols.append('node') if 'kind' in usecols: dfinfo['kind'] = tfp.kinds dfcols.append('kind') if tfp.get_line_type is not None and 'level' in usecols: dfinfo['level'] = tfp.levels dfcols.append('level') if 'msg' in usecols: dfinfo['msg'] = tfp.msgs dfcols.append('msg') df = DataFrame(dfinfo, index=tfp.dates, columns=dfcols) if hasna: df = df.dropna() df.index.name = 'dtime' # pytable not support unicode for now if 'node' in df.columns: df['node'] = df['node'].astype(str) if 'kind' in df.columns: df['kind'] = df['kind'].astype(str) if 'level' in df.columns: df['level'] = df['level'].astype(str) return df
def proportion_error_per_appliance_df(mains_values, gt_values, predicted_values): gt_proportion = {} pr_proportion = {} proportion_error = {} for app in predicted_values: p_gt = gt_values[app]/mains_values p_pr = predicted_values[app]/mains_values fr = DataFrame(p_gt, columns=['p_gt']) fr['p_gt'] = p_gt fr['p_pr'] = p_pr # fr['01. mains'] = mains_values # fr['02. gt'] = gt_values[app] # fr['03. pr'] = predicted_values[app] fr = fr.dropna() summ_gt = fr['p_gt'].sum() summ_pr = fr['p_pr'].sum() T = len(fr) tru = float(summ_gt)/float(T) dis = float(summ_pr)/float(T) gt_proportion[app] = tru pr_proportion[app] = dis diff = abs(tru - dis) proportion_error[app] = diff return proportion_error, gt_proportion, pr_proportion
def __init__(self, train: pd.DataFrame, test: pd.DataFrame, params: dict, categorical_splits=None): """ :param train: train DF :param test: test DF :param params: dict with the following structure Template for params: params = { 'uuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'uuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'ukuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'ukku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kuuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kukk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kkuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kkuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kkku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}, 'kkkk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes} } u = unknown, k = known, scaler = None for Trees and else something like scale_features from dmc.transformation, ignore_features are the features which should be ignored for the split, :return: """ if categorical_splits is None: categorical_splits = ['articleID', 'customerID', 'voucherID', 'productGroup'] self.processes = 8 self.test = test.copy() test = test.dropna(subset=['rrp']) self.test_size = len(test) self.splits = split(train, test, categorical_splits) self._enrich_splits(params)
def cor_exp_ess(exp, ess):
    cor = DataFrame(np.nan, index=ess.columns, columns=['cor', 'pvalue'])
    for gene in ess.columns:
        if gene in exp.columns:
            cor.loc[gene] = spearmanr(ess[gene], exp[gene])
    return cor.dropna()
def test_dropna_multiple_axes(self): df = DataFrame([[1, np.nan, 2, 3], [4, np.nan, 5, 6], [np.nan, np.nan, np.nan, np.nan], [7, np.nan, 8, 9]]) cp = df.copy() result = df.dropna(how='all', axis=[0, 1]) result2 = df.dropna(how='all', axis=(0, 1)) expected = df.dropna(how='all').dropna(how='all', axis=1) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) assert_frame_equal(df, cp) inp = df.copy() inp.dropna(how='all', axis=(0, 1), inplace=True) assert_frame_equal(inp, expected)
class Dropna(object): params = (['all', 'any'], [0, 1]) param_names = ['how', 'axis'] def setup(self, how, axis): self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed['foo'] = 'bar' def time_dropna(self, how, axis): self.df.dropna(how=how, axis=axis) def time_dropna_axis_mixed_dtypes(self, how, axis): self.df_mixed.dropna(how=how, axis=axis)
def test_load_raw_arrays(self): reindex_reader = ReindexMinuteBarReader( self.trading_calendar, self.bcolz_equity_minute_bar_reader, self.START_DATE, self.END_DATE, ) m_open, m_close = self.trading_calendar.open_and_close_for_session( self.START_DATE) outer_minutes = self.trading_calendar.minutes_in_range(m_open, m_close) result = reindex_reader.load_raw_arrays( OHLCV, m_open, m_close, [1, 2]) opens = DataFrame(data=result[0], index=outer_minutes, columns=[1, 2]) opens_with_price = opens.dropna() self.assertEqual( 1440, len(opens), "The result should have 1440 bars, the number of minutes in a " "trading session on the target calendar." ) self.assertEqual( 390, len(opens_with_price), "The result, after dropping nans, should have 390 bars, the " " number of bars in a trading session in the reader's calendar." ) slicer = outer_minutes.slice_indexer( end=pd.Timestamp('2015-12-01 14:30', tz='UTC')) assert_almost_equal( opens[1][slicer], full(slicer.stop, nan), err_msg="All values before the NYSE market open should be nan.") slicer = outer_minutes.slice_indexer( start=pd.Timestamp('2015-12-01 21:01', tz='UTC')) assert_almost_equal( opens[1][slicer], full(slicer.stop - slicer.start, nan), err_msg="All values after the NYSE market close should be nan.") first_minute_loc = outer_minutes.get_loc(pd.Timestamp( '2015-12-01 14:31', tz='UTC')) # Spot check a value. # The value is the autogenerated value from test fixtures. assert_almost_equal( 10.0, opens[1][first_minute_loc], err_msg="The value for Equity 1, should be 10.0, at NYSE open.")
def PolyEq(x, y, order=1):
    try:
        df = DataFrame({'x': x, 'y': y}, index=x.index)
        df = df.dropna()
        PolyCoeffs = np.polyfit(df['x'], df['y'], order)  ## calculates polynomial coeffs
        PolyEq = np.poly1d(PolyCoeffs)  ## turns the coeffs into an equation
    except Exception:
        print 'No regression equation possible'
        PolyEq = np.poly1d([0])
    return PolyEq
def gatherGenDataData(ert, case, key):
    """ :rtype: pandas.DataFrame """
    key, report_step = key.split("@", 1)
    report_step = int(report_step)
    try:
        data = GenDataCollector.loadGenData(ert, case, key, report_step)
    except ValueError:
        data = DataFrame()
    return data.dropna()  # removes all rows that have a NaN
def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) df['Time'] = [dt1] result = df.dropna(axis=0) expected = DataFrame({'Time': [dt1]}) assert_frame_equal(result, expected) # Ex2 df = DataFrame({'Time': [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) expected = DataFrame([dt1, dt2], columns=['Time'], index=[0, 3]) assert_frame_equal(result, expected)
def agg(self): dframe = DataFrame(index=self.column.index) dframe = self._build_dframe(dframe, self.columns) column_names = [self._name_for_idx(i) for i in xrange(0, 2)] dframe = dframe.dropna(subset=column_names) dframe = DataFrame([dframe.sum().to_dict()]) return self._add_calculated_column(dframe)
def test_na_actions_categorical(self): cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) vals = ["a", "b", np.nan, "d"] df = DataFrame({"cats": cat, "vals": vals}) cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) vals2 = ["a", "b", "b", "d"] df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) vals3 = ["a", "b", np.nan] df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) cat4 = Categorical([1, 2], categories=[1, 2, 3]) vals4 = ["a", "b"] df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) # fillna res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) with pytest.raises(ValueError, match=("fill value must " "be in categories")): df.fillna(value={"cats": 4, "vals": "c"}) res = df.fillna(method='pad') tm.assert_frame_equal(res, df_exp_fill) # dropna res = df.dropna(subset=["cats"]) tm.assert_frame_equal(res, df_exp_drop_cats) res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) # make sure that fillna takes missing values into account c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") tm.assert_frame_equal(res, df_exp)
def combine_spread(file_set, shift, drop_return_data=False): """ Combine the spread of input files, return with mean and standard deviation calculated. """ data = [] values = {} for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'): values[val] = {} # Collect data from all files into dictionaries for i, _file in enumerate(file_set): data.append(Spread().read(_file)) for val in values.keys(): values[val][i] = Series( data=data[i].spread[val]['val'], index=data[i].times ) data[i].times = (np.array(data[i].times) - shift[i]) spread = Spread() spread.spread['num'] = len(file_set) for val in values.keys(): # Shift time as per synchronisation for i in values[val]: values[val][i].index = np.array(values[val][i].index) - shift[i] # Convert to DataFrame df = DataFrame(data=values[val]) # If not a single file, keep only indices with at least two non-NaN if len(file_set) > 1: df = df.dropna() # If return data dropped, fill data here if drop_return_data: for i in df.columns: data[i].spread[val]['val'] = df[i].tolist() # Get times, mean and standard error as lists mean = list(df.mean(axis=1)) std_error = list(df.std(axis=1)) times = list(df.index) # Add to Spread object spread.spread[val]['val'] = mean spread.spread[val]['std'] = std_error spread.spread['times'] = times return spread, data
def createDataset(): data=read_csv('data/data2.csv',parse_dates=['DATE'],index_col='DATE') data.drop('DY', axis=1, inplace=True) data=data.dropna() data['RETURNS']=data['Price'].pct_change() rets=DataFrame(data['RETURNS'])*100 rets['MA10']=fun.sampleMovingAverage(rets,10) rets['MA30']=fun.sampleMovingAverage(rets['RETURNS'],50) rets['VAR10']=fun.movingVariance(rets['RETURNS'],30) rets=rets.dropna() return rets
def Polyfit(x, y, order=1, color='k--',lab='label',Xvals=10,subplot=plot): df=DataFrame({'x':x.values,'y':y.values},index=x.index) df = df.dropna() print df PolyCoeffs = np.polyfit(df['x'], df['y'], order) ## calculates polynomial coeffs PolyEq = np.poly1d(PolyCoeffs) ## turns the coeffs into an equation #print PolyEq PolyXvals = np.linspace(min(x), max(x)+Xvals) ## creates x-values for trendline #print PolyXvals Polyplot = subplot.plot(PolyXvals, PolyEq(PolyXvals),color,label=lab) ## plots the trendline return Polyplot
def get_plate_data(path,c): """ Get plate data, drop empty columns, drop selected columns, rename columns, add normalized columns. """ return thread_first(path, from_file, (str.replace,'\r',''), StringIO, pd.read_csv(delimiter=c['delimiter'], skiprows=c['skiprows']), df.dropna(axis=1,how='all'), (drop_matching_columns,c['dropcols']), df.rename(columns=c['colrename']), (add_normalized_columns,c['normcols']))
def append_2013_gva(dfin, csv_file_path): df = dfin.copy() gva = pd.read_csv(csv_file_path) gvasub = DataFrame(columns=['nuts3id', 'gva2013']) gvasub['nuts3id'], gvasub['gva2013'] = gva['nutsid'], gva['2013'] df_gva = pd.merge( left=df, right=gvasub.dropna(), how='left', left_on='nuts3id', right_on='nuts3id') return df_gva
def detect_objects(self, img, template, thres): #Conver to gray scale img_grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) #Match Template temp_match = cv2.matchTemplate(img_grey, template, 1) #Normalize scores temp_match = temp_match-temp_match.min() temp_match = 1 - temp_match/temp_match.max() #Apply Threshold retval, dst = cv2.threshold(temp_match, thres, 1, 0) dst = dst.astype(np.uint8) #Extract centroid of connected components contours, hierarchy = cv2.findContours(dst, 1, 2) if len(contours) > 0: m_df = DataFrame([cv2.moments(cont) for cont in contours]) m_df['x'] = (m_df['m10']/m_df['m00']) + int(template.shape[0]/2) m_df['y'] = (m_df['m01']/m_df['m00']) + int(template.shape[1]/2) m_df.dropna(subset=['x', 'y'], inplace=True) return m_df else: return DataFrame()
def gettraining(data,XColumns,YColumn,dropNA=True,shuffle=True): #make a copy trainingData = DataFrame(data) #drop a few columns which we won't use for ML models trainingData = trainingData[XColumns+YColumn] if dropNA == True: #drop Na values if necessary trainingData = trainingData.dropna() if shuffle == True: #shuffle data if necessary trainingData = trainingData.reindex(np.random.permutation(trainingData.index)) return(trainingData)
def parse_region(df: pd.DataFrame, min_row: int, max_row: int, cols: list) -> pd.DataFrame: df = df.loc[min_row:max_row, cols] # Region is in either 0,0 or 0,1 of the sliced DataFrame, with the data starting from the 3rd row region = df.iloc[0, 0] if type(region) == float: region = df.iloc[0, 1] df = df.dropna(axis=0, how='all').iloc[2:, 1:] df.columns = ['Name', 'Count'] df['Region'] = region return df
def test_dropEmptyRows(self): N = len(self.frame.index) mat = random.randn(N) mat[:5] = nan frame = DataFrame({'foo': mat}, index=self.frame.index) original = Series(mat, index=self.frame.index, name='foo') expected = original.dropna() inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() smaller_frame = frame.dropna(how='all') # check that original was preserved assert_series_equal(frame['foo'], original) inplace_frame1.dropna(how='all', inplace=True) assert_series_equal(smaller_frame['foo'], expected) assert_series_equal(inplace_frame1['foo'], expected) smaller_frame = frame.dropna(how='all', subset=['foo']) inplace_frame2.dropna(how='all', subset=['foo'], inplace=True) assert_series_equal(smaller_frame['foo'], expected) assert_series_equal(inplace_frame2['foo'], expected)
def test_dropna_multiple_axes(self): df = DataFrame([[1, np.nan, 2, 3], [4, np.nan, 5, 6], [np.nan, np.nan, np.nan, np.nan], [7, np.nan, 8, 9]]) cp = df.copy() # GH20987 with tm.assert_produces_warning(FutureWarning): result = df.dropna(how='all', axis=[0, 1]) with tm.assert_produces_warning(FutureWarning): result2 = df.dropna(how='all', axis=(0, 1)) expected = df.dropna(how='all').dropna(how='all', axis=1) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) assert_frame_equal(df, cp) inp = df.copy() with tm.assert_produces_warning(FutureWarning): inp.dropna(how='all', axis=(0, 1), inplace=True) assert_frame_equal(inp, expected)
def volcano_plot(self, df: pd.DataFrame, p_value: float = 0.05, fc=2, x_colname='logFC', y_colname='-log10p', cutoff_lines=True, top_n=None, top_by='-log10p', show_labels=False, legend=True, **kwargs): # Get rid of NaN data df = df.dropna() # Convert cutoffs to logspace log2_fc = np.log2(fc) log10_pval = -np.log10(p_value) # Split data into above and below cutoff dataframes sig = df[(df[y_colname] >= log10_pval) & (np.abs(df[x_colname]) >= log2_fc)] insig = df[~(df[y_colname] >= log10_pval) | ~(np.abs(df[x_colname]) >= log2_fc)] # Get maximum values for formatting latter max_y = np.ceil(np.max(sig[y_colname])) max_x = np.ceil(np.max(np.abs(sig[x_colname]))) fig, ax = plt.subplots(**kwargs) # Split top data points if requested if top_n: # Find points to highlight sort = set() if isinstance(top_by, list): for col in top_by: sort = sort.union( set(sig.index[np.argsort(np.abs( sig[col]))[::-1]][:top_n].values)) elif isinstance(top_by, str): sort = sort.union( set(sig.index[np.argsort(np.abs( sig[top_by]))[::-1]][:top_n].values)) else: raise ValueError( 'top_by must be a string or list of values found in the DataFrame used for the plot' ) top_sig = sig.loc[sort] sig = sig.drop(sort) ax.plot(top_sig[x_colname], top_sig[y_colname], 'o', c=_colors[0], ms=10, zorder=2, label='Top Genes') if show_labels: fs = mpl.rcParams['legend.fontsize'] for row in top_sig.iterrows(): ax.annotate(row[0], xy=(row[1][x_colname], row[1][y_colname]), fontsize=fs, style='italic') # Make plot ax.plot(sig[x_colname], sig[y_colname], 'o', c=_colors[2], ms=10, zorder=1, label='Diff Exp') ax.plot(insig[x_colname], insig[y_colname], 'o', c=_colors[-1], ms=10, zorder=0, mew=0, label='') # Adjust axes ax.set_xlim([-max_x, max_x]) ax.set_ylim([0, max_y]) # Add cutoff lines if cutoff_lines: color = _colors[1] # P value line ax.plot([-max_x, max_x], [log10_pval, log10_pval], '--', c=color, lw=3, label='Threshold') # log fold change lines ax.plot([-log2_fc, -log2_fc], [0, max_y], '--', c=color, lw=3) ax.plot([log2_fc, log2_fc], [0, max_y], '--', c=color, lw=3) if legend: ax.legend(loc='best', numpoints=1) # Adjust labels ax.tick_params(axis='both', which='major') ax.set_xlabel(r'$log_2(\frac{KO}{WT})$') ax.set_ylabel(r'$-log_{10}$(corrected p-value)') return ax
def dropna(df: pd.DataFrame):
    """Mask values too large to exponentiate and exact zeros, then drop rows with NaN."""
    df = df[df < math.exp(709)]  # values >= exp(709) would overflow a float64 exp()
    df = df[df != 0.0]           # mask exact zeros
    df = df.dropna()             # drop any row that now contains a NaN
    return df
def run_model(model_data: pd.DataFrame, pred_data: pd.DataFrame, hierarchy: pd.DataFrame, gbd_hierarchy: pd.DataFrame, covariate_list: List[str], verbose: bool = True, **kwargs) -> Tuple[Dict, Dict, pd.Series, pd.Series, pd.Series]: model_data['logit_idr'] = logit(model_data['idr']) model_data['logit_idr'] = model_data['logit_idr'].replace( (-np.inf, np.inf), np.nan) model_data['idr_se'] = 1 model_data['logit_idr_se'] = 1 model_data['intercept'] = 1 # lose 0s and 1s model_data = model_data.loc[model_data['logit_idr'].notnull()] covariate_priors = get_covariate_priors(1, 'idr') covariate_priors = { covariate: covariate_priors[covariate] for covariate in covariate_list } covariate_constraints = get_covariate_constraints('idr') covariate_constraints = { covariate: covariate_constraints[covariate] for covariate in covariate_list } covariate_lambdas_sr_r = {covariate: 3. for covariate in covariate_list} covariate_lambdas_admin = {covariate: 100. for covariate in covariate_list} var_args = { 'dep_var': 'logit_idr', 'dep_var_se': 'logit_idr_se', 'fe_vars': [ 'intercept', 'log_infwavg_testing_rate_capacity', ] + covariate_list, 'prior_dict': { 'log_infwavg_testing_rate_capacity': { 'prior_beta_uniform': np.array([1e-6, np.inf]) }, }, 're_vars': [], 'group_var': 'location_id', } global_prior_dict = covariate_priors location_prior_dict = {} pred_replace_dict = { 'log_testing_rate_capacity': 'log_infwavg_testing_rate_capacity', } pred_exclude_vars = [] level_lambdas = { 0: { 'intercept': 3., 'log_infwavg_testing_rate_capacity': 3., **covariate_lambdas_sr_r, }, # G->SR 1: { 'intercept': 3., 'log_infwavg_testing_rate_capacity': 3., **covariate_lambdas_sr_r, }, # SR->R 2: { 'intercept': 100., 'log_infwavg_testing_rate_capacity': 100., **covariate_lambdas_admin, }, # R->A0 3: { 'intercept': 100., 'log_infwavg_testing_rate_capacity': 100., **covariate_lambdas_admin, }, # A0->A1 4: { 'intercept': 100., 'log_infwavg_testing_rate_capacity': 100., **covariate_lambdas_admin, }, # A1->A2 5: { 'intercept': 100., 'log_infwavg_testing_rate_capacity': 100., **covariate_lambdas_admin, }, # A2->A3 } if var_args['group_var'] != 'location_id': raise ValueError( 'NRMSE data assignment assumes `study_id` == `location_id` (`location_id` must be group_var).' ) model_data_cols = [ 'location_id', 'date', var_args['dep_var'], var_args['dep_var_se'] ] + var_args['fe_vars'] model_data = model_data.loc[:, model_data_cols] model_data = model_data.dropna() mr_model_dict, prior_dicts = cascade.run_cascade( model_name='idr', model_data=model_data.copy(), hierarchy=hierarchy.copy(), # run w/ modeling hierarchy var_args=var_args.copy(), global_prior_dict=global_prior_dict.copy(), location_prior_dict=location_prior_dict.copy(), level_lambdas=level_lambdas.copy(), verbose=False, ) adj_gbd_hierarchy = model_inputs.validate_hierarchies( hierarchy.copy(), gbd_hierarchy.copy()) pred_data = pred_data.dropna() pred, pred_fe, pred_location_map = cascade.predict_cascade( pred_data=pred_data.copy(), hierarchy=adj_gbd_hierarchy.copy(), # predict w/ gbd hierarchy mr_model_dict=mr_model_dict.copy(), pred_replace_dict=pred_replace_dict.copy(), pred_exclude_vars=pred_exclude_vars.copy(), var_args=var_args.copy(), verbose=False, ) pred = expit(pred).rename(pred.name.replace('logit_', '')) pred_fe = expit(pred_fe).rename(pred_fe.name.replace('logit_', '')) return mr_model_dict, prior_dicts, pred.dropna(), pred_fe.dropna( ), pred_location_map, level_lambdas
def test_sort_index_nan_multiindex(self): # GH#14784 # incorrect sorting w.r.t. nans tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] mi = MultiIndex.from_tuples(tuples) df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) s = Series(np.arange(4), index=mi) df2 = DataFrame( { "date": pd.DatetimeIndex( [ "20121002", "20121007", "20130130", "20130202", "20130305", "20121002", "20121207", "20130130", "20130202", "20130305", "20130202", "20130305", ] ), "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], "whole_cost": [ 1790, np.nan, 280, 259, np.nan, 623, 90, 312, np.nan, 301, 359, 801, ], "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], } ).set_index(["date", "user_id"]) # sorting frame, default nan position is last result = df.sort_index() expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position last result = df.sort_index(na_position="last") expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position first result = df.sort_index(na_position="first") expected = df.iloc[[1, 2, 3, 0], :] tm.assert_frame_equal(result, expected) # sorting frame with removed rows result = df2.dropna().sort_index() expected = df2.sort_index().dropna() tm.assert_frame_equal(result, expected) # sorting series, default nan position is last result = s.sort_index() expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position last result = s.sort_index(na_position="last") expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position first result = s.sort_index(na_position="first") expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected)
def table_note(self): t = DataFrame(self.get_data('notes')) t = self.table_filter_modules(t, 'parent_type') t = t.dropna(subset=['filename']) # t = t[:10] # for debug return t
def _has_missing_feature(self, features):
    features = [features]
    df = DataFrame(features)
    return len(df.dropna()) == 0
def drop_missing(df: DataFrame, cols: list) -> DataFrame:
    df.dropna(subset=cols, inplace=True)
    return df
def drop_df_nan_rows_according2cols(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    df = df.dropna(subset=cols)
    return df
# pandas: descriptive-statistics functions and NaN handling
from pandas import Series, DataFrame
import numpy as np

df = DataFrame([[1.4, np.nan], [7, -4.5], [np.NaN, np.NaN], [0.5, -1]],
               columns=['one', 'two'])
print(df)
print(df.drop(1), '\n')                  # drop row 1
print(df.dropna(), '\n')                 # drop rows containing NaN
print(df.dropna(how='any'), '\n')        # drop a row if it has any NaN
print(df.dropna(how='all'), '\n')        # drop a row only if every value is NaN
print(df.dropna(subset=['one']), '\n')   # drop rows that have NaN in the given column
print(df.fillna(0), '\n')                # fill with 0; to fill with the mean see sklearn's SimpleImputer

# descriptive-statistics functions
print('**' * 10)
print(df.sum(), '\n')                    # column-wise sum, NaN excluded
print(df.sum(axis=0), '\n')
print(df.sum(axis=1), '\n')              # row-wise sum
print(df.mean(axis=1), '\n')             # row mean
print(df.mean(axis=1, skipna=True), '\n')   # NaN skipped, so every row is computed
print(df.mean(axis=1, skipna=False), '\n')  # a row with NaN yields NaN
print(df.mean(axis=0, skipna=True), '\n')   # computed even when NaN is present (column-wise)
print(df.mean(axis=0, skipna=False), '\n')  # not computed because NaN is present
print(df.max(), '\n')
print(df.max(axis=0), '\n')              # largest value per column
print(df.idxmax(), '\n')
def gather_gen_data_data(self, case, key): """ :rtype: pandas.DataFrame """ key_parts = key.split("@") key = key_parts[0] if len(key_parts) > 1: report_step = int(key_parts[1]) else: report_step = 0 try: data = GenDataCollector.loadGenData(self._enkf_main, case, key, report_step) except (ValueError, KeyError): data = DataFrame() return data.dropna() # removes all rows that has a NaN def gather_custom_kw_data(self, case, key): """ :rtype: pandas.DataFrame """ data = CustomKWCollector.loadAllCustomKWData(self._enkf_main, case, [key]) if key in data: return data[key] else: return data def is_summary_key(self, key): """ :rtype: bool """ return key in self._enkf_main.getKeyManager().summaryKeys() def is_gen_kw_key(self, key):
merge_data = pd.merge(data2, dat3, left_on = "COUNTY", right_on="COUNTY_NAME") #%% data_final = merge_data.groupby(['Year','REGION_NAME'])['PK (FULL DAY)'].sum() #%% data_final = data_final.reset_index() # #%% main_list = [] #%% for county in data_final.REGION_NAME.unique(): local_list = [] albany = data_final[data_final.REGION_NAME == county] series = pd.Series(albany['PK (FULL DAY)'].to_list(), index=albany['Year'].to_list())# create lagged dataset values = DataFrame(series.values) values.dropna(inplace = True) dataframe = concat([values.shift(1), values], axis=1) dataframe.columns = ['t', 't+1'] dataframe.dropna(inplace = True) X = dataframe.values dict1= {} for size in [0.50,0.55, 0.60,0.66,0.70,0.75,0.80]: train_size = int(len(X) * size) train, test = X[1:train_size], X[train_size:] train_X, train_y = train[:,0], train[:,1] test_X, test_y = test[:,0], test[:,1] # persistence model on training set train_pred = [x for x in train_X] # calculate residuals train_resid = [train_y[i]-train_pred[i] for i in range(len(train_pred))] # model the training set residuals
def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) df[2][:2] = np.nan dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0) expected = df.loc[list(range(2, 6))] inp = df.copy() inp.dropna(axis=0, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) # threshold dropped = df.dropna(axis=1, thresh=5) expected = df.loc[:, [0, 1, 3]] inp = df.copy() inp.dropna(axis=1, thresh=5, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=0, thresh=4) expected = df.loc[range(2, 6)] inp = df.copy() inp.dropna(axis=0, thresh=4, inplace=True) assert_frame_equal(dropped, expected) assert_frame_equal(inp, expected) dropped = df.dropna(axis=1, thresh=4) assert_frame_equal(dropped, df) dropped = df.dropna(axis=1, thresh=3) assert_frame_equal(dropped, df) # subset dropped = df.dropna(axis=0, subset=[0, 1, 3]) inp = df.copy() inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) assert_frame_equal(dropped, df) assert_frame_equal(inp, df) # all dropped = df.dropna(axis=1, how="all") assert_frame_equal(dropped, df) df[2] = np.nan dropped = df.dropna(axis=1, how="all") expected = df.loc[:, [0, 1, 3]] assert_frame_equal(dropped, expected) # bad input msg = "No axis named 3 for object type <class 'pandas.core.frame.DataFrame'>" with pytest.raises(ValueError, match=msg): df.dropna(axis=3)
def data_clean(joined: pd.DataFrame) -> pd.DataFrame: """[function currently does basic na forward filling and conversion of variables to useful types. I also drop a bunch of columns that either are entirely null or duplciate columns, the data source seems to be a weirdly processed] Arguments: joined {df} -- [original df from kaggle download https://www.kaggle.com/init27/fastai-v3-rossman-data-clean] Returns: [df] -- [cleaned df] """ joined.loc[:, weather_vars] = joined.loc[:, weather_vars].fillna( method="ffill" ) weather_vars.append("Events") # some of the initial Max_Gust_Speed Data was missing # so I filled with the Max_wind Speed. joined.loc[ joined["Max_Gust_SpeedKm_h"].isna(), "Max_Gust_SpeedKm_h" ] = joined.loc[joined["Max_Gust_SpeedKm_h"].isna(), "Max_Wind_SpeedKm_h"] # change text data into categories, as codes. joined["Events"] = joined["Events"].astype("category").cat.codes + 1 joined["Store"] = joined["Store"] - 1 joined["DayOfWeek"] = joined["DayOfWeek"] - 1 joined["Week"] = joined["Week"] - 1 joined["Assortment"] = joined["Assortment"].astype("category").cat.codes joined["State"] = joined["State"].astype("category").cat.codes joined["WindDirDegrees"] = ( joined["WindDirDegrees"].astype("category").cat.codes ) joined["StoreType"] = joined["StoreType"].astype("category").cat.codes # Drop variables that didn't look useful. joined.drop( [ "Promo2Since", "Year", "Month", "Day", "PromoInterval", "StateName", "file_DE", "State_DE", "Dayofweek_DE", "Day_DE", "Date", "Is_quarter_end", "Is_month_end_DE", "Is_year_start", "week", "file", "Month_DE", "week_DE", "Dayofyear_DE", "CompetitionOpenSince", "Date_DE", "Elapsed_DE", "CompetitionDistance", ], axis=1, inplace=True, ) if "Id" in joined.keys(): joined.drop("Id", axis=1, inplace=True) # check the keys. Make sure that we don't have a miss match # between keys in list and dataframe. a = set(joined.keys()) total_keys = cat_vars.copy() total_keys.extend(cont_vars) b = set(total_keys) c = a.difference(b) assert not c # convert booleans to ints. joined[joined.select_dtypes(include="bool").keys()] = joined.select_dtypes( include="bool" ).astype("int") # change to floats. joined[cont_vars] = joined[cont_vars].astype("float") joined.dropna(0, inplace=True) return joined
def drop_na_records(table: DataFrame, keys: List[str]) -> DataFrame:
    """ Drops all records which have no data outside of the provided keys """
    value_columns = [col for col in table.columns if col not in keys]
    return table.dropna(subset=value_columns, how="all")
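# A quick illustration of the helper above; the rows and columns here are made up
# for demonstration and are not taken from the source data.
import pandas as pd

table = pd.DataFrame({
    "key":  ["US", "US", "DE"],
    "date": ["2020-03-01", "2020-03-02", "2020-03-01"],
    "confirmed": [10.0, None, 5.0],
    "deceased":  [None, None, 1.0],
})
# The middle row has no data outside of the key columns, so it is dropped;
# the other rows keep their partial data.
print(drop_na_records(table, keys=["key", "date"]))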
def describe(title, df: pd.DataFrame) -> dict: """Calculate the statistics for each series in this DataFrame. Args: df: DataFrame. Returns: This function returns a dictionary containing: - table: overall statistics. - variables: descriptions per series. - correlations: correlation matrices. - missing: missing value diagrams. - messages: direct special attention to these patterns in your data. - package: package details. :param title: """ if df is None: raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.") if not isinstance(df, pd.DataFrame): warnings.warn("df is not of type pandas.DataFrame") if df.empty: raise ValueError("df can not be empty") disable_progress_bar = not config["progress_bar"].get(bool) date_start = datetime.utcnow() correlation_names = [ correlation_name for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers",] if config["correlations"][correlation_name]["calculate"].get(bool) ] number_of_tasks = 9 + len(df.columns) + len(correlation_names) with tqdm( total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar ) as pbar: series_description = get_series_descriptions(df, pbar) pbar.set_postfix_str("Get variable types") variables = { column: description["type"] for column, description in series_description.items() } pbar.update() # Transform the series_description in a DataFrame pbar.set_postfix_str("Get variable statistics") variable_stats = pd.DataFrame(series_description) pbar.update() # Get correlations correlations = {} for correlation_name in correlation_names: pbar.set_postfix_str(f"Calculate {correlation_name} correlation") correlations[correlation_name] = calculate_correlation( df, variables, correlation_name ) pbar.update() # Make sure correlations is not None correlations = { key: value for key, value in correlations.items() if value is not None } # Scatter matrix pbar.set_postfix_str("Get scatter matrix") scatter_matrix = get_scatter_matrix(df, variables) pbar.update() # Table statistics pbar.set_postfix_str("Get table statistics") table_stats = get_table_stats(df, variable_stats) pbar.update() # Missing diagrams pbar.set_postfix_str("Get missing diagrams") missing = get_missing_diagrams(df, table_stats) pbar.update() # Sample pbar.set_postfix_str("Take sample") sample = get_sample(df) pbar.update() # Duplicates pbar.set_postfix_str("Locating duplicates") supported_columns = [ key for key, value in series_description.items() if value["type"] != Variable.S_TYPE_UNSUPPORTED ] duplicates = get_duplicates(df, supported_columns) pbar.update() # Clusters pbar.set_postfix_str("Searching for clusters") categoricals = [column_name for column_name, variable_type in variables.items() if variable_type == Variable.TYPE_CAT] df_without_missing = df.dropna() df_ohe = pd.concat([df_without_missing.drop(categoricals, axis=1), pd.get_dummies(df_without_missing[categoricals])], axis=1).reset_index() clusters = { name: pd.concat([df_ohe, pd.DataFrame({"Cluster": eval(clustering).fit(df_ohe).labels_})], axis=1) for name, clustering in config["clusters"]["clusterings"].get() } # Outliers pbar.set_postfix_str("Detecting outliers") outliers = { name: pd.concat([df_ohe, pd.DataFrame({"Outlier": eval(detector).fit_predict(df_ohe)})], axis=1) for name, detector in config["outliers"]["detectors"].get() } # Messages pbar.set_postfix_str("Get messages/warnings") messages = get_messages(table_stats, series_description, correlations) pbar.update() pbar.set_postfix_str("Get reproduction details") package = { 
"pandas_profiling_version": VERSION, "pandas_profiling_config": config.dump(), } pbar.update() pbar.set_postfix_str("Completed") date_end = datetime.utcnow() analysis = { "title": title, "date_start": date_start, "date_end": date_end, "duration": date_end - date_start, } return { # Analysis metadata "analysis": analysis, # Overall dataset description "table": table_stats, # Per variable descriptions "variables": series_description, # Bivariate relations "scatter": scatter_matrix, # Correlation matrices "correlations": correlations, # Missing values "missing": missing, # Warnings "messages": messages, # Package "package": package, # Sample "sample": sample, # Duplicates "duplicates": duplicates, # Clusters "clusters": clusters, # Outliers "outliers": outliers }
def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) df.iloc[:2, 2] = np.nan dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] inp = df.copy() return_value = inp.dropna(axis=1, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) assert return_value is None dropped = df.dropna(axis=0) expected = df.loc[list(range(2, 6))] inp = df.copy() return_value = inp.dropna(axis=0, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) assert return_value is None # threshold dropped = df.dropna(axis=1, thresh=5) expected = df.loc[:, [0, 1, 3]] inp = df.copy() return_value = inp.dropna(axis=1, thresh=5, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) assert return_value is None dropped = df.dropna(axis=0, thresh=4) expected = df.loc[range(2, 6)] inp = df.copy() return_value = inp.dropna(axis=0, thresh=4, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) assert return_value is None dropped = df.dropna(axis=1, thresh=4) tm.assert_frame_equal(dropped, df) dropped = df.dropna(axis=1, thresh=3) tm.assert_frame_equal(dropped, df) # subset dropped = df.dropna(axis=0, subset=[0, 1, 3]) inp = df.copy() return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) tm.assert_frame_equal(dropped, df) tm.assert_frame_equal(inp, df) assert return_value is None # all dropped = df.dropna(axis=1, how="all") tm.assert_frame_equal(dropped, df) df[2] = np.nan dropped = df.dropna(axis=1, how="all") expected = df.loc[:, [0, 1, 3]] tm.assert_frame_equal(dropped, expected) # bad input msg = "No axis named 3 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.dropna(axis=3)
def predict(self, user_score: pd.DataFrame, course_need_pre: list = None):
    # clear user score
    if len(user_score) > 1:
        user_score = user_score[0:1]
    # clear input data
    user_score = user_score.dropna(axis=1)
    valid_course = list(set(user_score.columns) & set(self.data.columns))
    user_score = user_score[valid_course]
    del valid_course
    # course_need_pre == none -> all course
    if course_need_pre is None or len(course_need_pre) < 1:
        course_need_pre = set(self.data.columns)
    else:
        course_need_pre = [c for c in course_need_pre if c in self.data.columns]
    course_need_pre = list(set(course_need_pre) - set(user_score.columns))
    all_course = list(set(list(user_score.columns) + list(course_need_pre)))
    train_data = self.data.loc[:, all_course].dropna(axis=0, thresh=2)
    # end clear data
    # preprocessing: normalize and fill nan
    train_for_sim, filled_matrix = process.fillNan(train_data, type=self.nor_type)
    # compute the similarity
    similarity_df = self.sim(train_for_sim.loc[:, user_score.columns], user_score)
    del train_for_sim
    # normalize
    train_nor, train_avg = process.normalize(train_data, 'row_avg')  # row_avg
    user_score_nor, user_avg = process.normalize(user_score, 'row_avg')
    # predict the scores; initialise pre_score_nor
    pre_score_nor = pd.DataFrame(columns=course_need_pre, index=user_score.index)
    for col in course_need_pre:
        # take only the real (non-NaN) scores
        score_series = train_nor.loc[:, col].dropna()
        # similarity with the students that have a score in score_series
        k_sim = similarity_df.loc[score_series.index, user_score.index[0]]
        if self.k is not None:
            k_sim = k_sim.nlargest(self.k)
            score_series = score_series.loc[k_sim.index.tolist()]
        # compute the predicted score
        pre_score_nor.loc[user_score.index, col] = k_sim.mul(score_series).sum() / k_sim.sum()
    # unnormalize pre_score_nor
    pre_score = process.unnormalize(pre_score_nor, user_avg, 'row_avg')
    # clean up the predicted scores
    return process.formal_score(pre_score)
def preprocess(df: pd.DataFrame) -> pd.DataFrame: """ dtype df: dataframe rtype df: dataframe """ df = df.dropna(subset=[ 'FOB.VALUE', 'TOTAL.TAXES' ]) # Remove 170 rows which does not have FOB, CIF value. df.loc[:, 'Unitprice'] = df['CIF.VALUE'] / df['QUANTITY'] df.loc[:, 'WUnitprice'] = df['CIF.VALUE'] / df['GROSS.WEIGHT'] df.loc[:, 'TaxRatio'] = df['TOTAL.TAXES'] / df['CIF.VALUE'] df.loc[:, 'TaxUnitquantity'] = df['TOTAL.TAXES'] / df['QUANTITY'] df.loc[:, 'FOBCIFRatio'] = df['FOB.VALUE'] / df['CIF.VALUE'] df.loc[:, 'HS6'] = df['TARIFF.CODE'].apply(lambda x: int(x // 10000)) df.loc[:, 'HS4'] = df['HS6'].apply(lambda x: int(x // 100)) df.loc[:, 'HS2'] = df['HS4'].apply(lambda x: int(x // 100)) # Factor some thing df.loc[:, 'HS6.Origin'] = [ str(i) + '&' + j for i, j in zip(df['HS6'], df['ISO3']) ] # # Made a general function "merge_attributes" for supporting any combination # # Generated all possible combinations, But the final AUC is smaller than just adding three combinations active below. # candFeaturesCombine = ['OFFICE','IMPORTER.TIN','ISO3','HS6','DECLARANT.CODE'] # for subset in combinations(candFeaturesCombine, 2): # merge_attributes(df, *subset) # for subset in combinations(candFeaturesCombine, 3): # merge_attributes(df, *subset) merge_attributes(df, 'OFFICE', 'IMPORTER.TIN') merge_attributes(df, 'OFFICE', 'HS6') merge_attributes(df, 'OFFICE', 'ISO3') # Day of Year of SGD.DATE tmp2 = {} for date in set(df['SGD.DATE']): tmp2[date] = dt.strptime(date, '%y-%m-%d') tmp_day = {} tmp_week = {} tmp_month = {} yearStart = dt(tmp2[date].date().year, 1, 1) for item in tmp2: tmp_day[item] = (tmp2[item] - yearStart).days tmp_week[item] = int(tmp_day[item] / 7) tmp_month[item] = int(tmp_day[item] / 30) df.loc[:, 'SGD.DayofYear'] = df['SGD.DATE'].apply(lambda x: tmp_day[x]) df.loc[:, 'SGD.WeekofYear'] = df['SGD.DATE'].apply(lambda x: tmp_week[x]) df.loc[:, 'SGD.MonthofYear'] = df['SGD.DATE'].apply(lambda x: tmp_month[x]) # RECEIPT-SGD time # To-Do: We should consider where there aren't any receipt date. tmp = {} for date in set(df['SGD.DATE']).union(set(df['RECEIPT.DATE'])): tmp[date] = dt.strptime(date, '%y-%m-%d') df.loc[:, 'RECEIPT.DATE-SGD.DATE'] = df['RECEIPT.DATE'].apply( lambda x: tmp[x]) - df['SGD.DATE'].apply(lambda x: tmp[x]) df.loc[:, 'RECEIPT.DATE-SGD.DATE'] = df['RECEIPT.DATE-SGD.DATE'].apply( lambda x: x.days) return df
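# The HS6/HS4/HS2 columns above are just successive integer divisions of the tariff
# code; the 10-digit code below is made up purely to illustrate the chain.
tariff_code = 8703231910            # hypothetical 10-digit TARIFF.CODE
hs6 = int(tariff_code // 10000)     # 870323: drop the last four digits
hs4 = int(hs6 // 100)               # 8703
hs2 = int(hs4 // 100)               # 87
print(hs6, hs4, hs2)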
def pipe_filter_rows(self, df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna(subset=["Daily change in cumulative total"])
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') #target_names = column_mapping.get('target_names') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) #target_names = None if production_data is not None and target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(production_data[target_column]) binaraized_target = pd.DataFrame( binaraizer.transform(production_data[target_column])) binaraized_target.columns = ['target'] fpr, tpr, thrs = metrics.roc_curve( binaraized_target, production_data[prediction_column[0]]) fig = go.Figure() fig.add_trace( go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC', marker=dict( size=6, color=red, ))) fig.update_layout(yaxis_title="True Positive Rate", xaxis_title="False Positive Rate", showlegend=True) fig_json = json.loads(fig.to_json()) self.wi = BaseWidgetInfo( title=self.title, type="big_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1, params={ "data": fig_json['data'], "layout": fig_json['layout'] }, additionalGraphs=[], ) else: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(production_data[target_column]) binaraized_target = pd.DataFrame( binaraizer.transform(production_data[target_column])) binaraized_target.columns = prediction_column #plot support bar graphs = [] for label in prediction_column: fpr, tpr, thrs = metrics.roc_curve( binaraized_target[label], production_data[label]) fig = go.Figure() fig.add_trace( go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC', marker=dict( size=6, color=red, ))) fig.update_layout(yaxis_title="True Positive Rate", xaxis_title="False Positive Rate", showlegend=True) fig_json = json.loads(fig.to_json()) graphs.append({ "id": "tab_" + str(label), "title": str(label), "graph": { "data": fig_json["data"], "layout": fig_json["layout"], } }) self.wi = BaseWidgetInfo( title=self.title, type="tabbed_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=1, params={"graphs": graphs}, additionalGraphs=[], ) else: self.wi = None
def courses_computed( courses: pd.DataFrame, listings: pd.DataFrame, evaluation_statistics: pd.DataFrame, course_professors: pd.DataFrame, ) -> pd.DataFrame: """ Populates computed course rating fields: average_rating: Average course rating over all past instances. average_workload: Average course workload over all past instances. Also populates last-offered course fields: last_offered_course_id: course_id of the most recent previous offering. last_enrollment_course_id: course_id of the most recent previous offering with enrollment statistics. last_enrollment: Number of students in most recent previous offering with enrollment statistics. last_enrollment_season_code: Season of recent previous offering with enrollment statistics. last_enrollment_same_professors: If recent previous offering with enrollment statistics was with same professors. Parameters ---------- Pandas tables post-import: courses listings evaluation_statistics course_professors Returns ------- courses: Table with computed fields. """ listings = listings.copy(deep=True) evaluation_statistics = evaluation_statistics.copy(deep=True) course_professors = course_professors.copy(deep=True) ( course_to_same_course, same_course_to_courses, course_to_same_course_filtered, same_course_to_courses_filtered, ) = resolve_historical_courses(courses, listings) # partition ID of same-codes courses (not used anymore, useful for debugging) courses["shared_code_id"] = courses["course_id"].apply( course_to_same_course.get) # connected courses with the same code (not used anymore, useful for debugging) courses["shared_code_courses"] = courses["shared_code_id"].apply( same_course_to_courses.get) # unique ID for each partition of the same courses courses["same_course_id"] = courses["course_id"].apply( course_to_same_course_filtered.get) # list of course_ids that are the same course per course_id courses["same_courses"] = courses["same_course_id"].apply( same_course_to_courses_filtered.get) # split same-course partition by same-professors course_to_same_prof_course, same_prof_course_to_courses = split_same_professors( course_to_same_course_filtered, course_professors) # unique ID for each partition of the same courses taught by the same set of profs courses["same_course_and_profs_id"] = courses["course_id"].apply( course_to_same_prof_course.get) # list of course_ids that are the same course and taught by same profs per course_id courses["same_courses_and_profs"] = courses[ "same_course_and_profs_id"].apply(same_prof_course_to_courses.get) # map course_id to professor_ids # use frozenset because it is hashable (set is not), needed for groupby course_to_professors = course_professors.groupby( "course_id")[ # type: ignore "professor_id"].apply(frozenset) # get historical offerings with same professors listings["professors"] = listings["course_id"].apply( course_to_professors.get) courses["professors"] = courses["course_id"].apply( course_to_professors.get) print("Computing last offering statistics") # course_id for all evaluated courses evaluated_courses = set( evaluation_statistics.dropna(subset=["enrolled"], axis=0)["course_id"]) # map course_id to season course_to_season = dict(zip(courses["course_id"], courses["season_code"])) # map course_id to number enrolled course_to_enrollment = dict( zip(evaluation_statistics["course_id"], evaluation_statistics["enrolled"])) # get last course offering in general (with or without enrollment) def get_last_offered(course_row): same_courses = course_row["same_courses"] same_courses = [ x for x in same_courses if 
course_to_season[x] < course_row["season_code"] ] if len(same_courses) == 0: return None same_courses = [ x for x in same_courses if x is not course_row["course_id"] ] if len(same_courses) == 0: return None last_offered_course = max(same_courses, key=lambda x: course_to_season[x]) return last_offered_course # helper function for getting enrollment fields of last-offered course def get_last_offered_enrollment(course_row): same_courses = course_row["same_courses"] # keep course only if distinct, has enrollment statistics, and is before current same_courses = [ x for x in same_courses if x in evaluated_courses and course_to_season[x] < course_row["season_code"] ] if len(same_courses) == 0: return [None, None, None, None] same_courses = [ x for x in same_courses if x is not course_row["course_id"] ] if len(same_courses) == 0: return [None, None, None, None] current_professors = course_to_professors.get(course_row["course_id"], set()) # sort courses newest-first same_courses = sorted(same_courses, key=lambda x: course_to_season[x], reverse=True) # get the newest course with the same professors, otherwise just the newest course last_enrollment_course = next( (prev_course for prev_course in same_courses if course_to_professors.get( prev_course, set()) == current_professors), # default to newest course if no previous course has same profs same_courses[0], ) # number of students last taking course last_enrollment = course_to_enrollment[last_enrollment_course] # season for last enrollment last_enrollment_season = course_to_season[last_enrollment_course] # professors for last enrollment last_enrollment_professors = course_to_professors.get( last_enrollment_course, set()) # if last enrollment is with same professors last_enrollment_same_professors = ( last_enrollment_professors == current_professors) return ( last_enrollment_course, last_enrollment, last_enrollment_season, last_enrollment_same_professors, ) tqdm.pandas(desc="Finding last-offered course") courses["last_offered_course_id"] = courses.progress_apply( # type: ignore get_last_offered, axis=1) tqdm.pandas(desc="Finding last-offered enrollment") # getting last-offered enrollment ( courses["last_enrollment_course_id"], courses["last_enrollment"], courses["last_enrollment_season_code"], courses["last_enrollment_same_professors"], ) = zip(*courses.progress_apply(get_last_offered_enrollment, axis=1) # type: ignore ) print("Computing historical ratings for courses") # map courses to ratings course_to_overall = dict( zip(evaluation_statistics["course_id"], evaluation_statistics["avg_rating"])) course_to_workload = dict( zip(evaluation_statistics["course_id"], evaluation_statistics["avg_workload"])) # get ratings courses["average_rating"] = courses["same_courses"].apply( lambda courses: [course_to_overall.get(x) for x in courses]) courses["average_workload"] = courses["same_courses"].apply( lambda courses: [course_to_workload.get(x) for x in courses]) courses["average_rating_same_professors"] = courses[ "same_courses_and_profs"].apply( lambda courses: [course_to_overall.get(x) for x in courses]) courses["average_workload_same_professors"] = courses[ "same_courses_and_profs"].apply( lambda courses: [course_to_workload.get(x) for x in courses]) # calculate the average of an array def average(nums): nums = list(filter(lambda x: x is not None, nums)) nums = list(filter(lambda x: not math.isnan(x), nums)) if not nums: return [None, None] num_obs = len(nums) return (sum(nums) / num_obs, num_obs) # calculate averages over past offerings for average_col, 
num_col in [ ("average_rating", "average_rating_n"), ("average_workload", "average_workload_n"), ("average_rating_same_professors", "average_rating_same_professors_n"), ("average_workload_same_professors", "average_workload_same_professors_n"), ]: courses[average_col], courses[num_col] = zip( *courses[average_col].apply(average)) # remove intermediate columns courses = courses.loc[:, get_table_columns(database.models.Course)] return courses
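# Hedged aside (not part of courses_computed above): the zip(*Series.apply(...)) pattern
# used twice in that function, shown on a tiny made-up frame. A row-wise helper returns a
# tuple, and zip unpacks the resulting Series of tuples into separate columns.
import pandas as pd

def _summarize(nums):
    nums = [x for x in nums if x is not None]
    if not nums:
        return (None, None)
    return (sum(nums) / len(nums), len(nums))

toy = pd.DataFrame({"ratings": [[4.0, 3.5], [], [5.0, 4.5, None]]})
toy["average_rating"], toy["average_rating_n"] = zip(*toy["ratings"].apply(_summarize))
print(toy)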
'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'], 'age': [42, np.nan, 36, 24, 73], 'sex': ['m', np.nan, 'f', 'm', 'f'], 'preTestScore': [4, np.nan, np.nan, 2, 3], 'postTestScore': [25, np.nan, np.nan, 62, 70]}
1. Build a variable df from raw_data with the columns ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'] df = DataFrame(data=raw_data, columns=['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore']) df
2. Number of NaN values per column df.isnull().sum()
3. Using 2 above, the ratio of NaN values per column df.isnull().sum()/len(df)
4. Store only the rows without any NaN in df_no_missing df_no_missing = df.dropna() df_no_missing
5. Store df with the all-NaN rows removed in df_cleaned df_cleaned = df.dropna(how='all') df_cleaned
6. Drop the rows that have 3 or more NaN values (note: thresh=3 keeps rows with at least 3 non-NaN values). df.dropna(thresh=3)
7. Replace NaN with 0 in df df.fillna(0)
8. Compute the mean of the preTestScore column pre_mean = df.preTestScore.mean()
9. Fill the NaN in preTestScore with its mean, leaving the original df unchanged df.preTestScore.fillna(pre_mean) df
10. Fill the NaN in preTestScore with its mean, storing the result in df itself df["preTestScore"].fillna(pre_mean, inplace=True)
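# Hedged recap of items 4-10 above on a small stand-in frame (not the raw_data dict
# from the exercise): drop-based and fill-based NaN handling side by side.
import numpy as np
import pandas as pd

ex = pd.DataFrame({"age": [42, np.nan, 36], "preTestScore": [4, np.nan, 2]})
no_missing = ex.dropna()                 # rows with no NaN at all
cleaned = ex.dropna(how='all')           # drop only all-NaN rows
filled = ex.fillna({"preTestScore": ex["preTestScore"].mean()})  # mean-impute, ex unchanged
print(no_missing, cleaned, filled, sep="\n")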
import pandas as pd import numpy as np from pandas import DataFrame, get_dummies import keras from keras.layers import Dense, Dropout from keras.models import Sequential from keras.utils import to_categorical from keras.callbacks import EarlyStopping from keras.constraints import max_norm from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt f = pd.read_csv('presidents-data-words-january-3-2018.csv') df = DataFrame(f) df = df.dropna(subset=['dagalb','nseg','nsyll','nstress','mean']) early_stop = EarlyStopping(patience=5) X_cols = ['widx','lexstress','nseg','nsyll','nstress','pos','dep','doc.freq','d.inform.3','corpus.freq','c.inform.3','category'] X = df[X_cols] y = np.array(to_categorical(df.dagalb)) cat_cols = ['lexstress','pos','dep','category'] scale_cols = ['widx','nseg','nsyll','nstress','doc.freq','d.inform.3','corpus.freq','c.inform.3'] for c in cat_cols: dum = pd.get_dummies(X[c], columns=[c], prefix=c) X = pd.concat([dum, X], axis=1) del(X[c])
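# Hedged continuation sketch, not taken from the original script: the MinMaxScaler and
# train_test_split imports above suggest scaling the numeric columns in scale_cols and
# splitting the data, roughly like this (X, y and scale_cols as defined above).
scaler = MinMaxScaler()
X[scale_cols] = scaler.fit_transform(X[scale_cols])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)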
rets.head() prices.plot() plt.show() sns.heatmap(rets.corr()) plt.show() rets.corr() data = Series(['one', 'two', np.nan, 'four']) data dframe = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, np.nan, 9], [np.nan, np.nan, np.nan]]) dframe dframe2 = DataFrame([[1, 2, 3, nan], [2, nan, 5, 6], [nan, 7, nan, 9], [1, nan, nan, nan]]) dframe2 dframe2.dropna(thresh=2) dframe2.fillna({0: 'a', 1: 'b', 2: 'c', 3: 'd'}) ser = Series(np.random.randn(6), index=[[1, 1, 1, 2, 2, 2], ['a', 'b', 'c', 'a', 'b', 'c']]) ser ser.index ser[1] ser[2] ser[:, 'a'] dframe = ser.unstack() dframe dframe dframe.T.unstack() dframe2 = DataFrame(np.arange(16).reshape(4, 4), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=[['NY', 'NY', 'LA', 'SF'],
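# Hedged side note on two of the calls above: thresh is the minimum number of non-NaN
# values a row needs to be kept, and the dict passed to fillna maps column labels to
# fill values. A small standalone illustration, separate from the session above:
import numpy as np
import pandas as pd

d = pd.DataFrame([[1, np.nan, np.nan], [1, 2, np.nan], [1, 2, 3]])
print(d.dropna(thresh=2))        # drops only the first row (one non-NaN value)
print(d.fillna({1: 0, 2: -1}))   # column 1 filled with 0, column 2 with -1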
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([object]).columns) - set(utility_columns)) if production_data is not None: if target_column is not None and prediction_column is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = binaraizer.transform( production_data[target_column]) array_prediction = production_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [ prediction_column[x] for x in prediction_ids ] # calculate quality metrics if len(prediction_column) > 2: roc_auc = metrics.roc_auc_score(binaraized_target, array_prediction, average='macro') log_loss = metrics.log_loss(binaraized_target, array_prediction) else: roc_auc = metrics.roc_auc_score( binaraized_target, production_data[prediction_column[0]]) # problem: for the binary case this likely needs the positive-class column, not prediction_column[0] log_loss = metrics.log_loss( binaraized_target, production_data[prediction_column[0]]) # problem: same column-choice issue as above accuracy_score = metrics.accuracy_score( production_data[target_column], prediction_labels) avg_precision = metrics.precision_score( production_data[target_column], prediction_labels, average='macro') avg_recall = metrics.recall_score( production_data[target_column], prediction_labels, average='macro') avg_f1 = metrics.f1_score(production_data[target_column], prediction_labels, average='macro') self.wi = BaseWidgetInfo( title=self.title, type="counter", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "counters": [{ "value": str(round(accuracy_score, 3)), "label": "Accuracy" }, { "value": str(round(avg_precision, 3)), "label": "Precision" }, { "value": str(round(avg_recall, 3)), "label": "Recall" }, { "value": str(round(avg_f1, 3)), "label": "F1" }, { "value": str(round(roc_auc, 3)), "label": "ROC AUC" }, { "value": str(round(log_loss, 3)), "label": "LogLoss" }] }, additionalGraphs=[], ) else: self.wi = None else: self.wi = None
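# Hedged toy illustration of the scoring pattern above (data and class names are
# invented): binarize the target, then pass per-class probabilities to roc_auc_score
# and log_loss; for the binary case the positive-class column is used.
import numpy as np
from sklearn import metrics, preprocessing

y_true = np.array(["cat", "dog", "cat", "dog", "dog"])
proba = np.array([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3], [0.4, 0.6], [0.1, 0.9]])

binarizer = preprocessing.LabelBinarizer().fit(y_true)
y_bin = binarizer.transform(y_true).ravel()    # 'dog' (second sorted class) becomes 1

roc_auc = metrics.roc_auc_score(y_bin, proba[:, 1])   # scores for the positive class
logloss = metrics.log_loss(y_bin, proba)              # full probability matrix
print(roc_auc, logloss)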
def df2ecl( grid_df: pd.DataFrame, keywords: Union[str, List[str]], eclfiles: Optional[EclFiles] = None, dtype: Optional[Type] = None, filename: Optional[str] = None, nocomments: bool = False, ) -> str: """ Write an include file with a grid data keyword, like PERMX, PORO, FIPNUM etc., for the GRID section of the Eclipse deck. Output (returned as string and optionally written to file) will then contain f.ex:: PERMX 3.3 4.1 500.1 8543.0 1223.0 5022.0 411.455 4433.9 / if the grid contains 8 cells (inactive and active). Args: grid_df: Dataframe with the keyword for which we want to export data, and also a column with GLOBAL_INDEX. Without GLOBAL_INDEX, the output will likely be invalid. The grid can contain both active and inactive cells. keywords: The keyword(s) to export, with one value for every cell. eclfiles: If provided, the total cell count for the grid will be requested from this object. If not, it will be *guessed* from the maximum value of GLOBAL_INDEX, which can be under-estimated in the corner-case that the last cells are inactive. dtype: If provided, the columns that are output are converted to int or float. Dataframe columns read from CSV files easily get the wrong type, while Eclipse might require some data to be strictly integer. filename: If provided, the string produced will also be written to this filename. nocomments: Set to True to avoid any comments being written. Defaults to False. """ if isinstance(keywords, str): keywords = [keywords] if isinstance(dtype, str): if dtype.startswith("int"): dtype = int elif dtype.startswith("float"): dtype = float else: raise ValueError(f"Wrong dtype argument {dtype}") # Figure out the total number of cells for which we need to export data: global_size = None active_cells = None if eclfiles is not None: if eclfiles.get_egrid() is not None: global_size = eclfiles.get_egrid().get_global_size() active_cells = eclfiles.get_egrid().getNumActive() if "GLOBAL_INDEX" not in grid_df: logger.warning(("Global index not found in grid dataframe. " "Assumes all cells are active")) # Drop NaN rows for columns to be used (triggered by stacked # dates and no global index, unlikely) # Also copy dataframe to avoid side-effects on incoming data. 
grid_df = grid_df.dropna( axis="rows", subset=[keyword for keyword in keywords if keyword in grid_df]) grid_df["GLOBAL_INDEX"] = grid_df.index if global_size is None: global_size = int(grid_df["GLOBAL_INDEX"].max() + 1) active_cells = len(grid_df[grid_df.index >= 0]) logger.warning("Global grid size estimated to %s", str(global_size)) ecl2df_header = ("Output file printed by " + "ecl2df.grid " + __version__ + "\n" + " at " + str(datetime.datetime.now())) string = "" if not nocomments: string += common.comment_formatter(ecl2df_header) string += "\n" # If we have NaNs in the dataframe, we will be more careful (costs memory) if grid_df.isna().any().any(): grid_df = grid_df.dropna( axis="rows", subset=[keyword for keyword in keywords if keyword in grid_df]) for keyword in keywords: if keyword not in grid_df.columns: raise ValueError(f"Keyword {keyword} not found in grid dataframe") vector = np.zeros(global_size) vector[grid_df["GLOBAL_INDEX"].astype(int).values] = grid_df[keyword] if dtype == int: vector = vector.astype(int) if dtype == float: vector = vector.astype(float) if len(vector) != global_size: logger.warning( ("Mismatch between dumped vector length " "%d from df2ecl and assumed grid size %d"), len(vector), global_size, ) logger.warning("Data will be dumped, but may error in simulator") strvector = " ".join([str(x) for x in vector]) strvector = common.runlength_eclcompress(strvector) string += keyword + "\n" indent = " " * 5 string += "\n".join( textwrap.wrap(strvector, initial_indent=indent, subsequent_indent=indent, width=70)) string += "\n/" if not nocomments: string += (f" -- {keyword}: {active_cells} active cells, " f"{global_size} total cell count\n") string += "\n" if filename is not None: Path(filename).parent.mkdir(parents=True, exist_ok=True) Path(filename).write_text(string, encoding="utf-8") return string
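# Hedged usage sketch for df2ecl above, with a made-up 8-cell dataframe carrying
# GLOBAL_INDEX and one PORO value per cell; the return value is the include-file text.
import pandas as pd

poro_df = pd.DataFrame({
    "GLOBAL_INDEX": range(8),
    "PORO": [0.1, 0.15, 0.2, 0.25, 0.3, 0.1, 0.05, 0.2],
})
print(df2ecl(poro_df, "PORO", nocomments=True))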
def process_merged(self, data: pd.DataFrame) -> DataType: data.dropna(inplace=True) return data
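# Hedged note on process_merged above: dropna(inplace=True) mutates the incoming frame
# and returns None, so the method hands back the same, now-filtered object. A
# non-mutating variant would return a filtered copy instead:
def process_merged_copy(data: pd.DataFrame) -> pd.DataFrame:
    return data.dropna()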
import pandas as pd from pandas import DataFrame #import data from Excel ReadExcel = pd.read_excel( r'C:\Users\asus\Documents\Python-Vaje\ts-lubatruu.xlsx') df = DataFrame(ReadExcel, columns=['Date', 'event_horizon', 'LUBATRUU_AR']) df.dropna(inplace=True) #event_horizon = -7 timeminus7 = df[df.event_horizon == -7] #print (timeminus7) listLUBATRUU_ARminus7 = timeminus7['LUBATRUU_AR'] #print (listLUBATRUU_ARminus7) a = sum(listLUBATRUU_ARminus7) #print (a) b = len(listLUBATRUU_ARminus7) #print (b) c = a / b #print (c) #AAR(-7) #event_horizon = -6 timeminus6 = df[df.event_horizon == -6] #print (timeminus6) listLUBATRUU_ARminus6 = timeminus6['LUBATRUU_AR'] #print (listLUBATRUU_ARminus6) d = sum(listLUBATRUU_ARminus6) #print (d) e = len(listLUBATRUU_ARminus6) #print (e) f = d / e
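# Hedged alternative sketch: the per-horizon averages computed step by step above
# (c for event_horizon -7, f for -6, ...) can also come from a single groupby.
aar = df.groupby("event_horizon")["LUBATRUU_AR"].mean()
print(aar.loc[-7], aar.loc[-6])   # same values as c and f above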
data[i] += noisyCount(sensitivety, epsilon) return data if __name__ == '__main__': data = [[2, 2, 0, 0, 3, 0], [2, 0, 2, 2, 0, 1], [2, 3, 1, 3, 0, 0], [1, 1, 1, 0, 1, 0], [0, 1, 1, 3, 2, 1], [3, 3, 0, 1, 3, 0], [2, 2, 1, 1, 3, 0], [2, 0, 1, 1, 3, 0], [0, 0, 1, 3, 3, 0], [1, 0, 1, 0, 1, 0], [3, 2, 1, 3, 0, 2], [2, 3, 1, 0, 3, 0]] df = DataFrame( data, index=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], columns=['1', '2', '3', '4', '5', '6']) # print(df) df.dropna() x = df[0:1] print(x) w = cal_weight(x) # if __name__ == '__main__': # df = pd.read_csv('/Users/qiaoyanming/Desktop/工作簿2.csv', encoding='gb2312') # # 2. Data preprocessing: drop records containing null values # df.dropna() # print(df) # w = cal_weight(df) # x = [1., 1., 0.] # sensitivety = 1 # epsilon = 1 # data = laplace_mech(x, sensitivety, epsilon)
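# Hedged standalone sketch of the Laplace mechanism used above: each value receives
# noise drawn from Laplace(0, sensitivity / epsilon). numpy's sampler stands in for
# whatever noisyCount does internally in the original.
import numpy as np

def laplace_noise(sensitivity, epsilon):
    return np.random.laplace(loc=0.0, scale=sensitivity / epsilon)

noisy = [v + laplace_noise(1.0, 1.0) for v in [2, 0, 3]]
print(noisy)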
data = data.replace(".", "") data = data.replace("m²", "") data = re.sub(re.compile(r" \D.*"), "", data) data = data.strip() return data def get_firstlayer(data): first_layer = data.split(",")[0] return first_layer.strip() def get_lastlayer(data): last_layer = data.split(",")[-1] return last_layer.strip() wohnung_data_clean = wohnung_data.dropna(axis=0) wohnung_data_clean["price"] = wohnung_data_clean["price"].apply( clean_pricesize) wohnung_data_clean["size"] = wohnung_data_clean["size"].apply(clean_pricesize) wohnung_data_clean["location_first"] = wohnung_data_clean["location"].apply( get_firstlayer) wohnung_data_clean["location_last"] = wohnung_data_clean["location"].apply( get_lastlayer) wohnung_data_clean.to_csv("~/wohnung_data_clean_" + time.strftime("%d-%m-%Y") + ".csv", sep=";", index=False)
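# Hedged illustration on a made-up location string: get_firstlayer and get_lastlayer
# above simply split on commas and trim surrounding whitespace.
print(get_firstlayer("Neuhausen, München, Bayern"))  # -> "Neuhausen"
print(get_lastlayer("Neuhausen, München, Bayern"))   # -> "Bayern"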