def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)
def apply_to_one(dataframe, columns):
    # Apply a log1p transformation to each int64 column in `columns`.
    # (The original also computed the column min/max here but never used
    # them, so that dead code is dropped.)
    for column in columns:
        if types.is_int64_dtype(dataframe[column]):
            dataframe[column] = np.log1p(dataframe[column])
    return dataframe
def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
    """Test the search method with only the query parameter.

    Test whether the result is correct.

    Parameters
    ----------
    mp_wfs : pytest.fixture
        Monkeypatch the call to the remote GetCapabilities request.
    mp_remote_describefeaturetype : pytest.fixture
        Monkeypatch the call to a remote DescribeFeatureType.
    mp_remote_md : pytest.fixture
        Monkeypatch the call to get the remote metadata.
    mp_remote_fc : pytest.fixture
        Monkeypatch the call to get the remote feature catalogue.
    mp_remote_wfs_feature : pytest.fixture
        Monkeypatch the call to get WFS features.
    mp_dov_xml : pytest.fixture
        Monkeypatch the call to get the remote XML data.

    """
    df = self.get_search_object().search(
        query=self.get_valid_query_single())

    assert type(df) is DataFrame
    assert list(df) == self.get_df_default_columns()

    datatype = self.get_type()
    allfields = datatype.get_field_names()
    ownfields = datatype.get_field_names(include_subtypes=False)
    subfields = [f for f in allfields if f not in ownfields]

    assert len(df) >= 1

    for field in list(df):
        if field in ownfields:
            assert len(df[field].unique()) == 1
        elif field in subfields:
            assert len(df[field].unique()) >= 1

    # dtype checks of the resulting df columns
    fields = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))

    for field in list(df):
        datatype = fields[field]['type']
        if datatype == 'string':
            assert (is_object_dtype(df[field]) or
                    df[field].isnull().values.all())  # all NaN/None
        elif datatype == 'float':
            assert is_float_dtype(df[field])
        elif datatype == 'integer':
            assert is_int64_dtype(df[field])
        elif datatype == 'date':
            assert is_object_dtype(df[field])
        elif datatype == 'boolean':
            assert is_bool_dtype(df[field])
def revisar_resultados(df_reto_3):
    # (The original imported pandas under the alias `np`, which was both
    # unused and misleading; only the dtype helpers are needed.)
    import pandas.api.types as pdtypes

    assert pdtypes.is_int64_dtype(df_reto_3['is_potentially_hazardous_asteroid']), \
        'The "is_potentially_hazardous_asteroid" column has not been converted to a numeric type'
    assert len(df_reto_3['is_potentially_hazardous_asteroid'].unique()) == 2, \
        'There was an error mapping boolean values to numbers: the resulting column has more than two distinct values'
    assert df_reto_3['relative_velocity.kilometers_per_minute'].equals(
        df_reto_3['relative_velocity.kilometers_per_hour'] / 60), \
        'The conversion from kilometers per hour to kilometers per minute was not done correctly'

    print('All checks passed successfully!')
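# A minimal sketch (an assumption, not the original notebook code) of the
# transformations `revisar_resultados` expects: the boolean column mapped to
# 0/1 integers, and km/h divided by 60 to get km/min.
df_reto_3['is_potentially_hazardous_asteroid'] = (
    df_reto_3['is_potentially_hazardous_asteroid'].map({False: 0, True: 1})
)
df_reto_3['relative_velocity.kilometers_per_minute'] = (
    df_reto_3['relative_velocity.kilometers_per_hour'] / 60
)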
def test_dataframe_v3(self):
    df = self.v3.to_dataframe("rigs", pagesize=1000, deleteddate="null")

    # Check index is set to API endpoint "primary keys"
    self.assertListEqual(df.index.names, ["CompletionID", "WellID"])

    # Check object dtypes
    self.assertTrue(is_object_dtype(df.API_UWI))
    self.assertTrue(is_object_dtype(df.ActiveStatus))

    # Check datetime64 dtypes
    self.assertTrue(is_datetime64_ns_dtype(df.DeletedDate))
    self.assertTrue(is_datetime64_ns_dtype(df.SpudDate))
    self.assertTrue(is_datetime64_ns_dtype(df.UpdatedDate))

    # Check Int64 dtypes
    self.assertTrue(is_int64_dtype(df.RatedWaterDepth))
    self.assertTrue(is_int64_dtype(df.RatedHP))

    # Check float dtypes
    self.assertTrue(is_float_dtype(df.RigLatitudeWGS84))
    self.assertTrue(is_float_dtype(df.RigLongitudeWGS84))
def test_data_from_zip(self):
    with open('tests/pc6hnr.csv', 'rb') as f:
        res = populate.data_from_zip(f.read())

    # must return a DataFrame
    self.assertTrue(isinstance(res, pd.DataFrame))
    # pc6 must be unique
    self.assertTrue(res['pc6'].is_unique)
    # must contain the following columns
    self.assertListEqual(['pc6', 'buurt', 'wijk', 'gemeente'],
                         list(res.columns))
    # must have the correct data types
    self.assertTrue(ptypes.is_string_dtype(res['pc6']))
    self.assertTrue(
        all(
            ptypes.is_int64_dtype(res[col])
            for col in ['buurt', 'wijk', 'gemeente']))
def create_dtypes(df):
    # Map each pandas dtype to a SQLAlchemy column type.
    # Note: is_string_dtype() also matches object dtypes, so the final
    # is_object_dtype() branch is effectively a fallback that rarely fires.
    dtypes = {}
    for key, value in df.dtypes.items():
        if is_string_dtype(value):
            dtypes.update({str(key): String})
        elif is_int64_dtype(value):
            dtypes.update({str(key): Integer})
        elif is_float_dtype(value):
            dtypes.update({str(key): Float})
        elif is_object_dtype(value):
            dtypes.update({str(key): Float})
    return dtypes
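# Hypothetical usage (the engine URL and sample frame are made up for
# illustration): the returned mapping matches the `dtype` argument of
# DataFrame.to_sql, which expects column-name -> SQLAlchemy type.
import pandas as pd
from sqlalchemy import create_engine

sample = pd.DataFrame({"name": ["a", "b"], "count": [1, 2], "score": [0.5, 1.5]})
engine = create_engine("sqlite:///:memory:")
sample.to_sql("my_table", engine, dtype=create_dtypes(sample), index=False)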
def factorize(self, na_sentinel=-1):
    # type: (int) -> Tuple[np.ndarray, ExtensionArray]
    """Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value to use in the `labels` array to indicate missing values.

    Returns
    -------
    labels : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    See Also
    --------
    pandas.factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    if pa.types.is_dictionary(self.data.type):
        raise NotImplementedError()
    elif self.data.num_chunks == 1:
        # Dictionary-encode the single chunk; the dictionary indices are
        # exactly the factorize labels.
        encoded = self.data.chunk(0).dictionary_encode()
        indices = encoded.indices.to_pandas()
        if indices.dtype.kind == "f":
            # Nulls surface as NaN in a float index array; replace them
            # with the sentinel before casting back to integers.
            indices[np.isnan(indices)] = na_sentinel
            indices = indices.astype(int)
        if not is_int64_dtype(indices):
            indices = indices.astype(np.int64)
        return indices, type(self)(encoded.dictionary)
    else:
        # Multiple chunks: fall back to pandas' factorize on a NumPy view.
        np_array = pa.column("dummy", self.data).to_pandas().values
        return pd.factorize(np_array, na_sentinel=na_sentinel)
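# A minimal illustration (assuming only that pyarrow is installed) of the
# dictionary-encoding step the single-chunk branch relies on: the encoded
# indices become the labels and the dictionary becomes the uniques.
import pyarrow as pa

arr = pa.array(["a", "b", "a", None, "b"])
encoded = arr.dictionary_encode()
print(encoded.indices)     # [0, 1, 0, null, 1] -- null becomes na_sentinel above
print(encoded.dictionary)  # ["a", "b"] -- the unique values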
# 2,569 fraud users and 5,582 normal users; 8,151 users in total.
train_data["label"].value_counts()

# 913 users have an ARPU of 0, and all of them are fraud users.
train_data[train_data["arpu"] == 0]["label"].value_counts()

# After removing those 913 users, 7,238 users remain:
# 1,656 fraud users and 5,582 normal users.
except_arpu_data = train_data[train_data["arpu"] != 0].copy()
except_arpu_data["label"].value_counts()
except_arpu_data[except_arpu_data["flow"] == 0]["label"].value_counts()

# 43 users have an arpu >= 550, and all of them are fraud users.
except_arpu_data[except_arpu_data["arpu"] >= 550]["label"].value_counts()
# 7238 - 43 = 7195 users remain.
except_arpu_data = except_arpu_data[except_arpu_data["arpu"] < 550]

# 78 users called at least 135 distinct users in a single day,
# and all of them are fraud users.
except_arpu_data[except_arpu_data["voc_inner_day_user_cnt_max"] >= 135]["label"].value_counts()
# 7195 - 78 = 7117 users remain.
except_arpu_data = except_arpu_data[except_arpu_data["voc_inner_day_user_cnt_max"] < 135]

# except_arpu_data[except_arpu_data["passive_voc_inner_hour_cnt_max"] > 30]["label"].value_counts()

# For every numeric column, report the label distribution of the rows where
# that column equals 0, but only when all such rows share a single label.
dtypes_all = except_arpu_data.dtypes
for column in except_arpu_data.columns:
    if is_float_dtype(dtypes_all[column]) or is_int64_dtype(dtypes_all[column]):
        temp_statistic = except_arpu_data[except_arpu_data[column] == 0]["label"].value_counts()
        if len(temp_statistic) == 1:
            print("--- " + column)
            print(temp_statistic)
def apply_log1p_transformation(dataframe, columns):
    for column in columns:
        if types.is_int64_dtype(dataframe[column]):
            dataframe[column] = np.log1p(dataframe[column])
    return dataframe
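# A short usage sketch (the sample data is made up for illustration): the
# int64 column is transformed, the string column is passed through untouched.
import numpy as np
import pandas as pd
import pandas.api.types as types

df = pd.DataFrame({"views": [0, 9, 99], "name": ["a", "b", "c"]})
out = apply_log1p_transformation(df, ["views", "name"])
print(out["views"].tolist())  # [0.0, 2.302..., 4.605...]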
def tran_pivot_longer(
    df,
    columns,
    index_to=None,
    names_to=None,
    #names_prefix=None,
    names_sep=None,
    names_pattern=None,
    #names_ptypes=list(),
    #names_transform=list(),
    #names_repair,
    values_to=None,
    #values_drop_na=False,
    #values_ptypes=list(),
    #values_transform=list(),
):
    """Lengthen a dataset

    "Lengthens" data by increasing the number of rows and decreasing the
    number of columns.

    Args:
        df (DataFrame): DataFrame passed through
        columns (str): Label of column(s) to pivot into longer format
        index_to (str): Name for a new column indexing the original
            observations; optional.
        names_to (str): Name to use for the 'variable' column; if None,
            frame.columns.name is used or 'variable'. A ".value" entry
            indicates that component of the name defines the name of the
            column containing the cell values, overriding values_to.
        names_sep (str OR list of int): Delimiter to separate the values of
            the argument(s) from the 'columns' parameter into 2 new columns
            with those values split by that delimiter. A regex expression
            is a valid input for names_sep.
        names_pattern (str): Regular expression with capture groups to
            define targets for names_to.
        values_to (str): Name to use for the 'value' column; overridden if
            ".value" is provided in the names_to argument.

    Notes:
        Only one of names_sep OR names_pattern may be given.

    Returns:
        DataFrame: result of being pivoted into a longer format

    Examples::

        import grama as gr

        ## Simple example
        (
            gr.df_make(
                A=[1, 2, 3],
                B=[4, 5, 6],
                C=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=["A", "B", "C"],
                names_to="variable",
                values_to="value",
            )
        )

        ## Matching columns on patterns
        (
            gr.df_make(
                x1=[1, 2, 3],
                x2=[4, 5, 6],
                x3=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=gr.matches("\\d+"),
                names_to="variable",
                values_to="value",
            )
        )

        ## Separating column names and data on a names_pattern
        (
            gr.df_make(
                E00=[1, 2, 3],
                E45=[4, 5, 6],
                E90=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=gr.matches("\\d+"),
                names_to=[".value", "angle"],
                names_pattern="(E)(\\d+)",
            )
        )

    """
    ########### Pre-Check List #############
    ### Check if tran_select was used
    if isinstance(columns, DataFrame):
        columns = columns.columns.values

    ### Check if a selection helper was used
    if isinstance(columns, Intention):
        columns = pivot_select(df, columns)
        if size(columns) == 0:
            raise ValueError("""Selection helper has found no matches.
            Revise columns input.""")

    ### Check if names_to is a list or str
    names_str = False
    if isinstance(names_to, str):
        names_str = True
        if names_sep is not None:
            raise TypeError("""In order to use names_sep, more than 1 value
            needs to be passed to names_to""")

    ### Check for .value input
    dot_value = False
    if names_str is False:
        for i, v in enumerate(names_to):
            if names_to[i] == ".value":
                dot_value = True
    else:
        if names_to == ".value":
            dot_value = True

    ### Check values_to argument
    if values_to is None:
        values_to = "values"

    if names_pattern and names_sep:
        raise ValueError("""Both names_sep and names_pattern were used;
        only one or the other is required""")

    #######################################

    ########### .value pivot #############
    ### Check if a .value operation needs to occur
    if dot_value is True:
        ### collect unused columns to pivot around
        data_index = collect_indexes(df, columns)

        if names_sep is not None or names_pattern is not None:
            ### Add index and split column to dataset
            longer = df.reset_index().melt(
                id_vars="index",
                var_name="split",
                value_vars=columns,
                value_name=values_to,
            )
            ### DataFrame cleanup
            longer = split_cleanup(
                longer=longer,
                names_to=names_to,
                names_pattern=names_pattern,
                names_sep=names_sep,
                values_to=values_to,
            )
        else:
            ### Add index column and .value column
            longer = df.reset_index().melt(
                id_vars="index",
                var_name=".value",
                value_vars=columns,
                value_name=values_to,
            )

        ### clean up index_to call
        longer = index_to_cleanup(df, longer, data_index)

        ### arrange what indexes_from should be
        if names_str is True:
            indexes = ["index"] + data_index
        else:
            names_to = list(names_to)
            value_loc = names_to.index(".value")
            if value_loc == 0:
                indexes = ["index"] + data_index + names_to[1:]
            else:
                indexes = ["index"] + data_index + names_to[0:value_loc] \
                    + names_to[(value_loc + 1):]

        ### Pivot the .value column wider
        value_longer = tran_pivot_wider(
            longer,
            indexes_from=indexes,
            names_from=".value",
            values_from=values_to,
        )

        if index_to is None:
            ### drop "index" column
            value_longer.drop("index", axis=1, inplace=True)
        else:
            ### rename index column to desired: index_to
            value_longer.rename(columns={'index': index_to}, inplace=True)

        return value_longer

    #########################################

    ########### names_sep pivot #############
    ### Only if names_sep or names_pattern is used
    if names_sep is not None or names_pattern is not None:
        ### collect unused columns to pivot around
        data_index = collect_indexes(df, columns)

        if index_to is None:
            ### initial pivoted DataFrame
            longer = df.reset_index().melt(
                id_vars=data_index,
                var_name="split",
                value_vars=columns,
                value_name=values_to,
            )
            ### DataFrame cleanup
            longer = split_cleanup(
                longer=longer,
                names_to=names_to,
                names_pattern=names_pattern,
                names_sep=names_sep,
                values_to=values_to,
            )
            return longer

        ### Add index column to dataset
        longer = df.reset_index().melt(
            id_vars="index",
            var_name="split",
            value_vars=columns,
            value_name=values_to,
        )
        ### rename index column to desired: index_to
        longer.rename(columns={'index': index_to}, inplace=True)
        longer = index_to_cleanup(df, longer, data_index)

        ### DataFrame cleanup
        longer = split_cleanup(
            longer=longer,
            names_to=names_to,
            names_pattern=names_pattern,
            names_sep=names_sep,
            values_to=values_to,
        )
        return longer

    ######################################

    ########### normal pivot #############
    ### Check if index_to is provided
    if index_to is None:
        ### check to see if all columns are used already
        data_columns = df.columns.values
        data_index = [x for x in data_columns if x not in columns]

        ### check if data_index is empty and if df has a RangeIndex
        if not data_index:
            if is_int64_dtype(df.index.dtype):
                # if so, do not add an extra index column; just pivot
                longer = df.reset_index().melt(
                    id_vars=None,
                    var_name=names_to,
                    value_vars=columns,
                    value_name=values_to,
                )
                return longer

            # if df does not have a RangeIndex, create a new column from the
            # ID column and add a RangeIndex
            longer = df.reset_index().melt(
                id_vars="index",
                var_name=names_to,
                value_vars=columns,
                value_name=values_to,
            )
            return longer

        ### pivot around the leftover columns that serve as the index
        longer = df.reset_index().melt(
            id_vars=data_index,
            var_name=names_to,
            value_vars=columns,
            value_name=values_to,
        )
        return longer

    ### collect unused columns to preserve post-pivot
    data_index = collect_indexes(df, columns)

    ### Add index column to dataset
    longer = df.reset_index().melt(
        id_vars="index",
        var_name=names_to,
        value_vars=columns,
        value_name=values_to,
    )
    ### rename index column to desired: index_to
    longer.rename(columns={'index': index_to}, inplace=True)
    longer = index_to_cleanup(df, longer, data_index)

    return longer
def test_total_deaths_df_deaths_column_type(self):
    self.assertTrue(ptypes.is_int64_dtype(self.total_deaths_df['deaths']))
def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    :param data: DataFrame
    :param label: label column name in the data
    :param n_feats: the number of features used in the analysis
    :param id: name of an identifier column to drop before analysis; optional
    :param task: regression or classification
    :return:
    '''
    columns = data.columns.tolist()
    columns.remove(label)
    if id is not None:
        if data[id].duplicated().sum():
            print('{} is duplicated !!!'.format(id))
        columns.remove(id)
        data.drop(id, axis=1, inplace=True)

    numeric_features = [
        any([ptypes.is_integer_dtype(i),
             ptypes.is_int64_dtype(i),
             ptypes.is_float_dtype(i)])
        for i in data[columns].dtypes
    ]
    numeric_names = [columns[i] for i, v in enumerate(numeric_features) if v]
    category_names = list(set(columns) - set(numeric_names))

    if task == 'classification':
        if len(category_names):
            # data distribution for each class
            new_data = data.dropna(axis=0)
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42
            )
            famd = famd.fit(new_data[columns])
            ax = famd.plot_row_coordinates(
                new_data,
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True
            )
            plt.show()
        else:
            new_data = data.dropna(axis=0)
            pca = PCA(n_components=2, random_state=seed)  # assumes a module-level `seed`
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for the correlation plot
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_feat = new_data[numeric_names]
        new_data_stand = StandardScaler().fit_transform(new_data_feat)
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # Cluster the *features*, so transpose to shape (n_features, n_samples).
        kmean_init.fit(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_),
                             key=lambda x: x[1])
        sorted_feat_name = [i[0] for i in sorted_feat]

    # correlation plot for all features
    sns.heatmap(data[[label] + sorted_feat_name + category_names].corr())
    plt.show()

    # outlier detection, for numeric features only
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing-value pattern plots for all features
    msno.matrix(data[columns[:n_feats]])
    plt.show()
    msno.bar(data[columns[:n_feats]])
    plt.show()

    miss_data = data[columns[:n_feats]].isnull().sum(axis=1)
    miss_data = miss_data.to_frame()
    miss_data.columns = ['number_of_missing_attributes']
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(0, miss_data.shape[0]))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()
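# `mad_based_outlier` is referenced above but not defined in this snippet.
# A common median-absolute-deviation recipe (an assumption, not necessarily
# the original implementation; 0.6745 rescales MAD to a sigma-like unit):
import numpy as np

def mad_based_outlier(points, thresh=3.5):
    points = np.asarray(points, dtype=float)
    median = np.nanmedian(points)
    mad = np.nanmedian(np.abs(points - median))
    if mad == 0:
        # No spread: nothing can be flagged as an outlier.
        return np.zeros(points.shape, dtype=bool)
    modified_z = 0.6745 * (points - median) / mad
    return np.abs(modified_z) > thresh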
def create_table(ctx, infile, table_name, col_spacing, varchar_factor, sql,
                 encoding, separator):
    """ Display SQL table create command from a CSV file. """
    ordered_columns = OrderedDict()

    if infile.endswith(".xls") or infile.endswith(".xlsx"):
        print("Loading Excel file...")
        df = pd.ExcelFile(infile).parse()
    else:
        print("Loading CSV file...")
        df = pd.read_csv(infile, encoding=encoding, sep=separator)

    count = 0
    for column in df.columns:
        #print(df[column].dtype)
        count += 1
        sys.stdout.write(f"{str(count):3} ")

        # The entire column is empty. No rows have values.
        if df[column].isna().all():
            ordered_columns[column] = {'type': None, 'length': None}
            print("{:{col_spacing}}: {}".format("No values", column,
                                                col_spacing=col_spacing))
            continue

        # Handling of numeric fields
        if is_numeric_dtype(df[column]):
            # Find the max value
            maxVal = None
            validVals = [i for i in df[column].dropna()]
            if validVals:
                maxVal = max(validVals)

            if is_float_dtype(df[column]):
                # Pandas stores numerical columns with null values as floats.
                # We need to do some extra work to determine if the column is
                # really an int.
                allIntegers = all(i.is_integer() for i in df[column].dropna())
                if allIntegers:
                    # this is an integer column
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"int, {str(maxVal):{col_spacing-5}}: {column} : ({df[column].dtype})"
                    )
                    #df[df[column].fillna(0) != 0.0][column].astype(int)
                else:
                    # this is a float column
                    ordered_columns[column] = {
                        'type': get_float_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"{df[column].dtype}, {str(maxVal):{col_spacing-5}}: {column}"
                    )
            else:
                # These types were detected as integers while loading the file.
                if is_int64_dtype(df[column]) or is_integer_dtype(df[column]):
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(f"int, {str(maxVal):{col_spacing-5}}: {column}")
                else:
                    unknown = "???"
                    print(f"{unknown:{col_spacing}}: {column}")

        # Handling of strings
        else:
            # Look for values that look like dates in 2018/01/01 or
            # 01/01/2018 form
            patterns = [
                re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]20\d\d$'),
                # re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]\d{1,4}$'),
                re.compile(r'^20\d\d[-/]\d{1,2}[-/]\d{1,2}$')
                # re.compile(r'^\d{1,4}[-/]\d{1,2}[-/]\d{1,2}$')
            ]
            foundDate = False
            for pattern in patterns:
                if df[column].str.contains(pattern, na=False).any():
                    foundDate = True

            foundBool = False
            try:
                maxVal = str(int(df[column].dropna().str.len().max()))
            except:
                # Could be boolean?
                # if all(i.lower() == "false" or i.lower() == "true"
                #        for i in df[column].dropna()):
                if any(type(i) == bool for i in df[column].dropna()):
                    maxVal = 0
                else:
                    maxVal = 0

            if foundDate:
                ordered_columns[column] = {'type': "DATE", 'length': maxVal}
                print(f"Date, {maxVal:{col_spacing-6}}: {column}")
            # elif foundBool:
            #     ordered_columns[column] = {'type': "BOOL", 'length': maxVal}
            #     print(f"Bool, {maxVal:{col_spacing-6}}: {column}")
            else:
                ordered_columns[column] = {
                    'type': f"VARCHAR({int(maxVal)*varchar_factor})",
                    'length': maxVal
                }
                print(f"String, {maxVal:{col_spacing-8}}: {column}")

    print("-------------------------------------")
    print(f"Total columns are: {len(df.columns)}")
    print("-------------------------------------")

    if sql:
        create_table_sql(ordered_columns, table_name)
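# A quick check of the date patterns used above (the sample strings are made
# up for illustration): two-digit years are deliberately not matched.
import re

patterns = [
    re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]20\d\d$'),
    re.compile(r'^20\d\d[-/]\d{1,2}[-/]\d{1,2}$'),
]
for s in ["01/31/2018", "2018-01-31", "31-01-99"]:
    print(s, any(p.match(s) for p in patterns))
# -> 01/31/2018 True, 2018-01-31 True, 31-01-99 False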