def _validateDataFrame(self, Df): #if the df is a standard DataFrame if type(Df) == pd.DataFrame: self._logger.info('Using regular dataframe') if Df.empty: self._logger.error('Empty dataframe') raise EmptyDataError('DataFrame is empty') self.colsAndTypes = {name: Df.dtypes[name] for name in list(Df.columns)} self._isIterable = False #if the df is a large file read in through chunks elif type(Df) == pd.io.parsers.TextFileReader: self._logger.info('Using large dataframe') for chunk in Df: self.colsAndTypes = {name: chunk.dtypes[name] for name in list(chunk.columns)} if chunk.empty: self._logger.error('Empty dataframe') raise EmptyDataError('DataFrame is empty') break self._isIterable = True else: raise TypeError(f'Invalid Df type. Type "{type(Df)}" is not a DataFrame or TextFileReader') return True
def import_labeled_classify_files(f, mark_job):
    """Import a labeled classification CSV and create mark tasks from it.

    Saves the uploaded file, reads it as CSV (expecting ``text`` and
    ``label`` columns), maps each label name to a doc-term id for the
    job's doc type, then bulk-creates one Doc plus one approved MarkTask
    per usable row.

    :param f: uploaded file object (has ``filename`` and ``stream``)
    :param mark_job: job entity providing ``doc_type_id`` and ``mark_job_id``
    :return: list of created MarkTask entities
    :raises EmptyDataError: if the CSV is empty or unreadable
    :raises KeyError: if the ``text`` or ``label`` column is missing
    :raises ValueError: if a row's label is unknown for this doc type
    """
    doc_type_id = mark_job.doc_type_id
    mark_job_id = mark_job.mark_job_id
    # persist the raw upload and register it as a Doc record
    doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
    csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
    try:
        df = pd.read_csv(doc_relative_path, skiprows=0, na_values='')
    except EmptyDataError:
        raise EmptyDataError('上传数据为空,请检查上传数据:{}'.format(f.filename))
    except Exception:
        raise EmptyDataError('上传数据处理异常,请检查上传数据:{}'.format(f.filename))
    if 'text' not in df.columns or 'label' not in df.columns:
        raise KeyError
    doc_terms, _ = DocTermModel.get_doc_term_by_doctype(doc_type_id, offset=0, limit=9999)
    doc_term_name2id_map = {m.doc_term_name: m.doc_term_id for m in doc_terms}
    content_list = []
    task_results = []
    for row_num, row in df.iterrows():
        content = row.get('text')
        label = row.get('label')
        # NOTE(review): the label lookup runs before the empty-row skip
        # below, so a row with a missing/NaN label raises ValueError here
        # instead of being silently skipped — confirm this is intended.
        try:
            label_id = doc_term_name2id_map[label]
        except KeyError as ke:
            # row_num + 2 converts the 0-based frame index to the 1-based
            # file line number, accounting for the header row
            raise ValueError(f"当前项目不存在文件第{row_num + 2}行的label:{ke.args[0]},请检查")
        task_result = [{'prob': 1, 'marked': 1, 'label_id': label_id, 'label_name': label}]
        # only rows with both text and label become tasks; the two lists
        # stay index-aligned because they are appended together
        if content and label:
            content_list.append(content)
            task_results.append(task_result)
    # bulk insert doc: one stored text file per usable row
    unique_name_list = []
    for txt_content in content_list:
        doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
        unique_name_list.append(doc_unique_name)
    doc_list = [
        dict(
            doc_raw_name=csv_doc.doc_raw_name,
            doc_unique_name=unique_name,
        ) for unique_name in unique_name_list
    ]
    doc_entity_list = DocModel().bulk_create(doc_list)
    # bulk insert task: one pre-approved mark task per created doc
    task_list = []
    for i in range(len(doc_list)):
        task_list.append(dict(
            doc_id=doc_entity_list[i].doc_id,
            mark_job_id=mark_job_id,
            mark_task_result=task_results[i] if task_results else {},
            mark_task_status=int(StatusEnum.approved)
        ))
    task_entity_list = MarkTaskModel().bulk_create(task_list)
    return task_entity_list
def read_value_data(path):
    """Read a trip-record CSV and return the cleaned required columns.

    The header row and the data body are parsed separately so that data
    rows carrying *more* fields than the header can be tolerated: the
    extra trailing, unnamed fields are dropped.

    :param path: path to the CSV file
    :return: DataFrame with the five required columns; datetimes and
        numerics coerced (unparseable entries become NaT/NaN)
    :raises EmptyDataError: if the file has no header or no data rows
    :raises ValueError: if data rows have fewer fields than the header
    :raises KeyError: if a required column is absent after renaming
    """
    try:
        header = pd.read_csv(path, nrows=0)
    except EmptyDataError as err:
        raise EmptyDataError('Empty file') from err
    try:
        data_without_header = pd.read_csv(path, skiprows=1, header=None)
    except EmptyDataError as err:
        raise EmptyDataError("There's no data") from err

    data_len = data_without_header.shape[1]
    header_len = header.shape[1]
    if data_len > header_len:
        # drop the trailing columns that have no header name
        data = data_without_header.drop(list(range(header_len, data_len)), axis=1)
    elif data_len < header_len:
        raise ValueError(
            f"Header length is {header_len}, but data length is {data_len}.")
    else:
        data = data_without_header.copy()
    data.columns = header.columns
    # normalize column names to lower case for uniform access below
    data.columns = np.char.lower(np.array(data.columns, dtype=str))

    required_columns = [
        'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'passenger_count',
        'trip_distance', 'total_amount'
    ]
    # coerce types; invalid values become NaT/NaN instead of raising
    data['lpep_pickup_datetime'] = pd.to_datetime(
        data['lpep_pickup_datetime'], errors='coerce')
    data['lpep_dropoff_datetime'] = pd.to_datetime(
        data['lpep_dropoff_datetime'], errors='coerce')
    data['trip_distance'] = pd.to_numeric(
        data['trip_distance'], downcast='float', errors='coerce')
    data['total_amount'] = pd.to_numeric(
        data['total_amount'], downcast='float', errors='coerce')
    data['passenger_count'] = pd.to_numeric(
        data['passenger_count'], downcast='float', errors='coerce')
    return data[required_columns]
def _compile_columns(file_list): from pandas.errors import EmptyDataError # Read one line from each dataframe dfsamples = [] for file in file_list: try: dfsamples.append( pd.read_csv(file, index_col=None, header=0, nrows=1, comment='#')) except EmptyDataError: raise EmptyDataError('Summary file empty: \n{}'.format(file)) # Compare number of columns ncols = [df.shape[1] for df in dfsamples] if not all([n == ncols[0] for n in ncols]): warnings.warn( 'The dataframes to compile do not have the same number of columns.' ) # Compile all columns columns = pd.concat(dfsamples, axis=0, sort=False).columns.to_list() return columns
def read(self, nrows=None):
    """Read up to ``nrows`` rows from the file and return a DataFrame.

    Reads the next chunk starting at the current file position,
    defaulting to ``self.chunksize`` rows (or the whole file when no
    chunksize is set).

    :param nrows: number of rows to read; None uses chunksize/row_count
    :return: a DataFrame of the decoded rows (indexed by ``self.index``
        when set), or None when the file is exhausted
    :raises EmptyDataError: if the file declares no columns
    """
    if (nrows is None) and (self.chunksize is not None):
        nrows = self.chunksize
    elif nrows is None:
        nrows = self.row_count
    if len(self._column_types) == 0:
        self.close()
        raise EmptyDataError("No columns to parse from file")
    # past the last row: signal end-of-data with None
    if self._current_row_in_file_index >= self.row_count:
        return None
    # clamp the request to the rows actually remaining
    m = self.row_count - self._current_row_in_file_index
    if nrows > m:
        nrows = m
    # count columns per storage class; b"d" appears to mean 8-byte numeric
    # and b"s" string, given the buffer shapes below — TODO confirm against
    # the format spec
    nd = self._column_types.count(b"d")
    ns = self._column_types.count(b"s")
    # pre-allocate the chunk buffers the parser fills in place
    self._string_chunk = np.empty((ns, nrows), dtype=object)
    self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
    self._current_row_in_chunk_index = 0
    p = Parser(self)
    p.read(nrows)
    rslt = self._chunk_to_dataframe()
    if self.index is not None:
        rslt = rslt.set_index(self.index)
    return rslt
def filter_data_by_func(df, func, axis=1, op=operator.gt, value=None, quantile=None):
    """Generalized function for filtering a dataset by rows or columns.

    Applies ``func`` along ``axis``, compares the resulting values against
    a threshold with ``op``, and keeps only the rows (axis=1) or columns
    (axis=0) that pass.

    :param df: DataFrame to filter
    :param func: aggregation applied to each row/column
    :param axis: 1 to filter rows, otherwise filter columns
    :param op: binary comparison operator (default: greater-than)
    :param value: explicit threshold; ignored when quantile is given
    :param quantile: if given, the threshold is this quantile of the
        per-row/column aggregates
    :raises EmptyDataError: if nothing survives the filter
    :return: the filtered DataFrame
    """
    # apply function along specified axis (yields a Series of aggregates)
    vals = df.apply(func, axis=axis)
    # if quantile specified, derive the threshold from the aggregates.
    # Bug fix: vals is a Series, which only has axis 0 — passing the
    # original `axis` raised ValueError for axis=1.
    if quantile is not None:
        value = vals.quantile(quantile)
    # apply operator
    mask = op(vals, value)
    if axis == 1:  # rows
        df = df[mask]
    else:  # columns
        df = df[mask.index[mask]]
    # check to make sure data is non-empty after filtering step
    if df.empty:
        raise EmptyDataError("No data remaining after filter applied")
    return df
def transform(self, X: pd.DataFrame = None):
    """Performs a OHE on new data.

    One-hot encodes the ``self.key`` column of ``X`` with the dummy
    columns learned during ``fit`` (``self.columns``); categories unseen
    at transform time get all-zero indicators, and only the fitted
    columns are returned, in fitted order.

    Parameters:
    ------------
    X : pd.DataFrame (default = None)
        pandas dataframe input data (test) that should contain a
        self.key column

    Raises EmptyDataError on empty input, NotFittedError when fit was
    never run, and KeyError when self.key is missing from X.
    """
    if len(X) == 0:
        raise EmptyDataError("Input dataset is empty!")
    if len(self.columns) == 0:
        raise NotFittedError(
            "OHE transformer is not fiited! You need to execute a 'fit' method first!"
        )
    try:
        encoded = pd.get_dummies(X[[self.key]], prefix=self.key)
        seen = [name for name in encoded.columns]
        # back-fill indicator columns the test split never produced
        for fitted_col in self.columns:
            if fitted_col not in seen:
                encoded[fitted_col] = 0
        return encoded[self.columns]
    except KeyError as e:
        raise KeyError(
            "Input column {} does not exist in the input Dataset!".format(self.key)
        )
def df2csv(dataframe, ticker):
    """Save the contents of a dataframe to a csv file in the data/ directory.

    File name will include the ticker (required arg) and current timestamp.
    Falls back to ../data/ when data/ is not found.

    :param dataframe: pandas DataFrame to persist
    :param ticker: ticker symbol embedded in the file name
    :raises EmptyDataError: if dataframe is None
    :raises ValueError: if ticker is empty/None
    :raises FileNotFoundError: if neither location is writable
    """
    if dataframe is None:
        raise EmptyDataError("Dataframe cannot be empty.")
    # `not ticker` already covers None; the old `or ticker is None` was redundant
    if not ticker:
        raise ValueError("[!] Invalid ticker value.")
    datafile = os.path.relpath("data/{}_data_{}.csv".format(ticker, time.time()))
    try:
        dataframe.to_csv(datafile, index=False)
    except FileNotFoundError:
        print("[?] Retrying one directory level up.")
        datafile = os.path.relpath("../data/{}_data_{}.csv".format(ticker, time.time()))
        try:
            dataframe.to_csv(datafile, index=False)
        except FileNotFoundError:
            raise FileNotFoundError("[!] Unable to save dataframe to CSV file.")
    # Bug fix: this used to run in a `finally`, so the success message was
    # printed even when the retry failed and raised.
    print("[+] File saved:\t{}".format(datafile))
def filter_rows_col_not_na(df, col):
    """Returns all rows for which a specific column is not null."""
    filtered = df[df[col].notnull()]
    # guard against the filter wiping out every row
    if filtered.empty:
        raise EmptyDataError("No data remaining after filter applied")
    return filtered
def filter_rows_col_val_not_in(df, col, values):
    """Removes all rows for which a column is one of a specified set of values."""
    keep = ~df[col].isin(values)
    result = df[keep]
    # guard against the filter wiping out every row
    if result.empty:
        raise EmptyDataError("No data remaining after filter applied")
    return result
def check(
    self,
    ingress=None,
    destination_ip=None,
    snapshot_folder=None,
    start_node=None,
    start_interface=None,
):
    """Run a batfish traceroute from ``ingress`` to ``destination_ip``
    and build the traceroute dataview from the resulting traces.

    :param ingress: start location; falls back to ``self.ingress`` when None
    :param destination_ip: destination IPs for the traceroute headers
    :param snapshot_folder: unused in this method
    :param start_node: unused in this method
    :param start_interface: unused in this method
    :return: tuple ``(self.dvt_list, self.new_list)``
    :raises EmptyDataError: re-raised when extracting Flow/Traces fails
    """
    self.destination_ip = destination_ip
    if ingress is not None:
        self.ingress = ingress
    else:
        ingress = self.ingress
    result = self.b_fish.bfq.traceroute(
        startLocation=ingress, headers=self.b_fish.hc(dstIps=destination_ip)
    )
    result = result.answer().frame()
    # separate out Flow and Traces
    try:
        # Todo (flow is unused currently)
        flow = result.iloc[0]["Flow"]
    except EmptyDataError as e:
        # NOTE(review): .iloc on an empty frame raises IndexError, not
        # EmptyDataError, so this handler likely never fires — confirm
        print(e)
        raise EmptyDataError(f"No data in dataframe location: {e}")
    try:
        traces = result.iloc[0]["Traces"]
    except EmptyDataError as e:
        # NOTE(review): same concern as above — confirm intended exception type
        print(e)
        raise EmptyDataError(f"No data in dataframe location: {e}")
    self.dvt = DataviewTraceroute()
    self._generate_dataview(traces)
    self.dvt_dict = self._generate_dict()
    # finally return data
    # NOTE(review): dvt_list/new_list are not assigned in this method —
    # presumably set by _generate_dataview/_generate_dict; verify
    return self.dvt_list, self.new_list
def filter_rows_by_nonzero(df, op=operator.gt, value=None, quantile=None):
    """Filters dataset rows based on the number of non-zero entries per row.

    :param df: DataFrame to filter
    :param op: comparison applied to each row's non-zero count (default: >)
    :param value: explicit count threshold; ignored when quantile is given
    :param quantile: if given, the threshold is this quantile of the
        per-row non-zero counts
    :raises EmptyDataError: if nothing survives the filter
    :return: the filtered DataFrame
    """
    nonzero_counts = (df != 0).sum(axis=1)
    if quantile is not None:
        # Bug fix: the threshold must be a scalar quantile of the per-row
        # non-zero COUNTS. Previously it was df.quantile(quantile, axis=1) —
        # a per-row quantile of the data values, which is not comparable to
        # counts (cf. the analogous fix in filter_data_by_func).
        value = nonzero_counts.quantile(quantile)
    df = df[op(nonzero_counts, value)]
    # check to make sure data is non-empty after filtering step
    if df.empty:
        raise EmptyDataError("No data remaining after filter applied")
    return df
def _validateDataFrame(self, df): """ Validates that the df isn't empty and categorizes it as iterable (TextFileReader) or not iterable (DataFrame) """ #if the df is a standard DataFrame if type(df) == pd.DataFrame: self._logger.info('Using regular dataframe') if df.empty: self._logger.error('Empty dataframe') raise EmptyDataError('DataFrame is empty') self.colsAndTypes = { name: df.dtypes[name] for name in list(df.columns) } self._isIterable = False #if the df is a large file read in through chunks elif type(df) == pd.io.parsers.TextFileReader: self._logger.info('Using large dataframe') for chunk in df: self.colsAndTypes = { name: chunk.dtypes[name] for name in list(chunk.columns) } if chunk.empty: self._logger.error('Empty dataframe') raise EmptyDataError('DataFrame is empty') break self._isIterable = True else: raise TypeError( f'Invalid df type. Type "{type(df)}" is not a DataFrame or TextFileReader' ) return True
def get_qstick(data):
    """Calculate the QStick indicator of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    qstick = TA.QSTICK(data)
    if qstick is not None:
        return qstick
    raise IndicatorException
def get_chandelier(data):
    """Calculate the chandelier exit indicator of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    chandelier = TA.CHANDELIER(data)
    if chandelier is not None:
        return chandelier
    raise IndicatorException
def get_cmo(data):
    """Calculate the Chande momentum oscillator of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    cmo = TA.CMO(data)
    if cmo is not None:
        return cmo
    raise IndicatorException
def get_baspn(data):
    """Calculate the normalized buying and selling pressure of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    baspn = TA.BASPN(data)
    if baspn is not None:
        return baspn
    raise IndicatorException
def get_wobv(data):
    """Calculate the weighted on balance volume of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    wobv = TA.WOBV(data)
    if wobv is not None:
        return wobv
    raise IndicatorException
def get_tsi(data):
    """Calculate the true strength index of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    tsi = TA.TSI(data)
    if tsi is not None:
        return tsi
    raise IndicatorException
def get_emv(data):
    """Calculate the ease of movement of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    emv = TA.EMV(data)
    if emv is not None:
        return emv
    raise IndicatorException
def get_ebbp(data):
    """Calculate the bull power and bear power of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    ebbp = TA.EBBP(data)
    if ebbp is not None:
        return ebbp
    raise IndicatorException
def get_efi(data):
    """Calculate the Elder's force index of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    efi = TA.EFI(data)
    if efi is not None:
        return efi
    raise IndicatorException
def get_vzo(data):
    """Calculate the volume zone oscillator for given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    vzo = TA.VZO(data)
    if vzo is not None:
        return vzo
    raise IndicatorException
def get_kst(data):
    """Calculate the known sure thing oscillator of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    kst = TA.KST(data)
    if kst is not None:
        return kst
    raise IndicatorException
def get_vortex(data):
    """Calculate the vortex of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a concatenated Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    vortex = TA.VORTEX(data)
    if vortex is not None:
        return vortex
    raise IndicatorException
def get_adl(data):
    """Calculate the accumulation/distribution line of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    adl = TA.ADL(data)
    if adl is not None:
        return adl
    raise IndicatorException
def get_cci(data):
    """Calculate the commodity channel index of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    cci = TA.CCI(data)
    if cci is not None:
        return cci
    raise IndicatorException
def get_copp(data):
    """Calculate the Coppock curve of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    copp = TA.COPP(data)
    if copp is not None:
        return copp
    raise IndicatorException
def get_tema(data):
    """Calculate the triple exponential moving average for values of given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    tema = TA.TEMA(data)
    if tema is not None:
        return tema
    raise IndicatorException
def get_williams(data):
    """Calculate the Williams oscillator for given dataframe.

    :param data: a dataframe in OHLC format
    :return: a Pandas series
    :raises EmptyDataError: if no dataframe was supplied
    :raises IndicatorException: if the indicator calculation failed
    """
    if data is None:
        raise EmptyDataError("[!] Invalid data value")
    williams = TA.WILLIAMS(data)
    if williams is not None:
        return williams
    raise IndicatorException