def _validateDataFrame(self, Df):
        #if the df is a standard DataFrame
        if type(Df) == pd.DataFrame:
            self._logger.info('Using regular dataframe')

            if Df.empty:
                self._logger.error('Empty dataframe')
                raise EmptyDataError('DataFrame is empty') 

            self.colsAndTypes = {name: Df.dtypes[name] for name in list(Df.columns)}
            self._isIterable = False


        #if the df is a large file read in through chunks
        elif type(Df) == pd.io.parsers.TextFileReader:
            self._logger.info('Using large dataframe')
            for chunk in Df:
                self.colsAndTypes = {name: chunk.dtypes[name] for name in list(chunk.columns)}
                    
                if chunk.empty:
                    self._logger.error('Empty dataframe')
                    raise EmptyDataError('DataFrame is empty') 

                break
            self._isIterable = True

        else:
            raise TypeError(f'Invalid Df type. Type "{type(Df)}" is not a DataFrame or TextFileReader')

        return True
    def import_labeled_classify_files(f, mark_job):
        """Import a labeled classification CSV upload and create docs + mark tasks.

        Reads the uploaded CSV (expects 'text' and 'label' columns), maps each
        label name to its doc-term id for the job's doc type, stores each text
        row as its own document file, and bulk-creates approved mark tasks.

        :param f: uploaded file object exposing ``filename`` and ``stream``
        :param mark_job: job entity carrying ``doc_type_id`` and ``mark_job_id``
        :return: list of created mark-task entities
        :raises EmptyDataError: if the CSV is empty or fails to parse
        :raises KeyError: if the 'text' or 'label' column is missing
        :raises ValueError: if a row's label is unknown for this doc type
        """
        doc_type_id = mark_job.doc_type_id
        mark_job_id = mark_job.mark_job_id
        # persist the raw upload and register it as a document record
        doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
        csv_doc = DocModel().create(doc_raw_name=f.filename, doc_unique_name=doc_unique_name)
        try:
            df = pd.read_csv(doc_relative_path, skiprows=0, na_values='')
        except EmptyDataError:
            # message: "uploaded data is empty, please check the upload: <filename>"
            raise EmptyDataError('上传数据为空,请检查上传数据:{}'.format(f.filename))
        except Exception:
            # any other parse failure is surfaced as an empty-data error
            # message: "error processing uploaded data, please check: <filename>"
            raise EmptyDataError('上传数据处理异常,请检查上传数据:{}'.format(f.filename))
        if 'text' not in df.columns or 'label' not in df.columns:
            raise KeyError
        # build label-name -> doc-term-id lookup for this doc type
        doc_terms, _ = DocTermModel.get_doc_term_by_doctype(doc_type_id, offset=0, limit=9999)
        doc_term_name2id_map = {m.doc_term_name: m.doc_term_id for m in doc_terms}
        content_list = []
        task_results = []
        for row_num, row in df.iterrows():
            content = row.get('text')
            label = row.get('label')
            try:
                label_id = doc_term_name2id_map[label]
            except KeyError as ke:
                # row_num + 2 converts the 0-based index to the CSV line number
                # (1-based plus the header row)
                # message: "label on line <n> does not exist in this project, please check"
                raise ValueError(f"当前项目不存在文件第{row_num + 2}行的label:{ke.args[0]},请检查")
            task_result = [{'prob': 1, 'marked': 1, 'label_id': label_id, 'label_name': label}]
            # only keep rows where both text and label are non-empty
            if content and label:
                content_list.append(content)
                task_results.append(task_result)

        # bulk insert doc
        unique_name_list = []
        for txt_content in content_list:
            doc_unique_name, _ = upload_fileset.save_file('format.txt', txt_content)
            unique_name_list.append(doc_unique_name)
        doc_list = [
            dict(
                doc_raw_name=csv_doc.doc_raw_name,
                doc_unique_name=unique_name,
            ) for unique_name in unique_name_list
        ]
        doc_entity_list = DocModel().bulk_create(doc_list)

        # bulk insert task
        task_list = []
        for i in range(len(doc_list)):
            task_list.append(dict(
                doc_id=doc_entity_list[i].doc_id,
                mark_job_id=mark_job_id,
                mark_task_result=task_results[i] if task_results else {},
                mark_task_status=int(StatusEnum.approved)
            ))
        task_entity_list = MarkTaskModel().bulk_create(task_list)

        return task_entity_list
예제 #3
0
def read_value_data(path):
    """Read a green-taxi trip CSV and return the required columns, parsed.

    The header and the data body are read separately; if the data rows carry
    more columns than the header, the trailing extra columns are dropped so
    the header can be applied. Column names are lower-cased, timestamps and
    numeric fields are coerced (invalid values become NaT/NaN).

    :param path: path to the CSV file
    :return: DataFrame restricted to the required trip columns
    :raises EmptyDataError: if the file is empty or has a header but no data
    :raises ValueError: if data rows have fewer columns than the header
    """
    try:
        header = pd.read_csv(path, nrows=0)
    except EmptyDataError:
        raise EmptyDataError('Empty file')

    try:
        data_without_header = pd.read_csv(path, skiprows=1, header=None)
    except EmptyDataError:
        raise EmptyDataError("There's no data")

    data_len = data_without_header.shape[1]
    header_len = header.shape[1]

    if data_len > header_len:
        # drop the trailing unnamed columns so data aligns with the header
        extra_columns = list(range(header_len, data_len))
        data = data_without_header.drop(extra_columns, axis=1)
    elif data_len < header_len:
        raise ValueError(
            f"Header length is {header_len}, but data length is {data_len}.")
    else:
        data = data_without_header.copy()

    data.columns = header.columns
    # normalize column names to lower case
    data.columns = np.char.lower(np.array(data.columns, dtype=str))

    required_columns = [
        'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'passenger_count',
        'trip_distance', 'total_amount'
    ]

    # coerce timestamps and numerics; unparseable values become NaT/NaN
    for col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
        data[col] = pd.to_datetime(data[col], errors='coerce')
    for col in ('trip_distance', 'total_amount', 'passenger_count'):
        data[col] = pd.to_numeric(data[col], downcast='float', errors='coerce')

    return data[required_columns]
def _compile_columns(file_list):
    from pandas.errors import EmptyDataError

    # Read one line from each dataframe
    dfsamples = []
    for file in file_list:
        try:
            dfsamples.append(
                pd.read_csv(file,
                            index_col=None,
                            header=0,
                            nrows=1,
                            comment='#'))
        except EmptyDataError:
            raise EmptyDataError('Summary file empty: \n{}'.format(file))

    # Compare number of columns
    ncols = [df.shape[1] for df in dfsamples]
    if not all([n == ncols[0] for n in ncols]):
        warnings.warn(
            'The dataframes to compile do not have the same number of columns.'
        )

    # Compile all columns
    columns = pd.concat(dfsamples, axis=0, sort=False).columns.to_list()

    return columns
예제 #5
0
    def read(self, nrows=None):
        """Read up to *nrows* rows from the file and return them as a DataFrame.

        When *nrows* is None, defaults to the configured chunk size, or to the
        whole file if no chunk size was set. Returns None once every row has
        been consumed.
        """
        # default nrows: one chunk when chunked reading is configured, else all rows
        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        # a file with no parsed column types has nothing to read
        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        # every row already consumed
        if self._current_row_in_file_index >= self.row_count:
            return None

        # clamp nrows to the rows remaining in the file
        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        # pre-allocate buffers per column kind: b"d" numeric, b"s" string
        nd = self._column_types.count(b"d")
        ns = self._column_types.count(b"s")

        self._string_chunk = np.empty((ns, nrows), dtype=object)
        # 8 bytes per numeric value — presumably float64 storage; confirm in Parser
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt
예제 #6
0
파일: filters.py 프로젝트: khughitt/snakes
def filter_data_by_func(df,
                        func,
                        axis=1,
                        op=operator.gt,
                        value=None,
                        quantile=None):
    """Generalized function for filtering a dataset by rows or columns.

    :param df: DataFrame to filter
    :param func: function applied along *axis* to score each row/column
    :param axis: 1 filters rows, any other value filters columns
    :param op: comparison applied as ``op(scores, value)`` to build the mask
    :param value: threshold compared against the scores
    :param quantile: if given, overrides *value* with that quantile of the scores
    :return: the filtered DataFrame
    :raises EmptyDataError: if nothing remains after filtering
    """
    # apply function along specified axis
    vals = df.apply(func, axis=axis)

    # if quantile specified, find associated value
    # bug fix: df.apply returns a Series, and Series.quantile() takes no
    # axis argument — passing one raised TypeError
    if quantile is not None:
        value = vals.quantile(quantile)

    # apply operator
    mask = op(vals, value)

    # apply mask along rows or columns and return filtered result
    if axis == 1:
        # rows
        df = df[mask]
    else:
        # columns
        df = df[mask.index[mask]]

    # check to make sure data is non-empty after filtering step
    if df.empty:
        raise EmptyDataError("No data remaining after filter applied")

    return df
예제 #7
0
 def transform(self, X: pd.DataFrame = None):
     """
     Performs a OHE on new data

     Parameters:
     ------------
     X : pd.DataFrame (default = None)
         pandas dataframe input data (test) that should contain a self.key column

     Raises EmptyDataError when X is None or empty, NotFittedError when the
     transformer was not fitted, and KeyError when self.key is missing from X.
     """
     # bug fix: the default X=None used to crash on len(None) with a TypeError
     if X is None or len(X) == 0:
         raise EmptyDataError("Input dataset is empty!")
     if len(self.columns) == 0:
         # typo fix: "fiited" -> "fitted"
         raise NotFittedError(
             "OHE transformer is not fitted! You need to execute a 'fit' method first!"
         )
     try:
         # one-hot encode the key column, then align to the fitted column set
         X = pd.get_dummies(X[[self.key]], prefix=self.key)
         test_columns = [col for col in X.columns]
         for col_ in self.columns:
             # categories seen during fit but absent here get all-zero dummies
             if col_ not in test_columns:
                 X[col_] = 0
         return X[self.columns]
     except KeyError as err:
         # chain the cause so the original lookup failure stays visible
         raise KeyError(
             "Input column {} does not exist in the input Dataset!".format(
                 self.key)) from err
예제 #8
0
def df2csv(dataframe, ticker):
    """Save the contents of a dataframe to a csv file in the data/ directory.

    File name will include the ticker (required arg) and current timestamp.
    If data/ is missing at the current level, retries one directory level up.

    :param dataframe: pandas DataFrame to persist
    :param ticker: ticker symbol embedded in the file name
    :return: None
    :raises EmptyDataError: if dataframe is None
    :raises ValueError: if ticker is falsy
    :raises FileNotFoundError: if neither location is writable
    """
    if dataframe is None:
        raise EmptyDataError("Dataframe cannot be empty.")
    if not ticker:  # also covers None — the old `or ticker is None` was redundant
        raise ValueError("[!] Invalid ticker value.")

    datafile = os.path.relpath("data/{}_data_{}.csv".format(ticker, time.time()))

    try:
        dataframe.to_csv(datafile, index=False)
    except FileNotFoundError:
        print("[?] Retrying one directory level up.")
        datafile = os.path.relpath("../data/{}_data_{}.csv".format(ticker, time.time()))
        try:
            dataframe.to_csv(datafile, index=False)
        except FileNotFoundError:
            raise FileNotFoundError("[!] Unable to save dataframe to CSV file.")
    # bug fix: this print lived in a `finally`, so "[+] File saved" was emitted
    # even when both write attempts failed; only report success on success
    print("[+] File saved:\t{}".format(datafile))
예제 #9
0
파일: filters.py 프로젝트: khughitt/snakes
def filter_rows_col_not_na(df, col):
    """Keep only the rows whose value in *col* is not null."""
    filtered = df.loc[df[col].notnull()]

    # guard: refuse to return an empty result
    if filtered.empty:
        raise EmptyDataError("No data remaining after filter applied")

    return filtered
예제 #10
0
파일: filters.py 프로젝트: khughitt/snakes
def filter_rows_col_val_not_in(df, col, values):
    """Drop every row whose *col* value appears in *values*."""
    keep_mask = ~df[col].isin(values)
    result = df.loc[keep_mask]

    # guard: refuse to return an empty result
    if result.empty:
        raise EmptyDataError("No data remaining after filter applied")

    return result
예제 #11
0
    def check(
        self,
        ingress=None,
        destination_ip=None,
        snapshot_folder=None,
        start_node=None,
        start_interface=None,
    ):
        """Run a Batfish traceroute from *ingress* toward *destination_ip*.

        Falls back to the previously stored ingress when none is supplied,
        builds a traceroute dataview from the resulting traces, and returns
        the collected data.

        NOTE(review): snapshot_folder, start_node and start_interface are
        accepted but never used in this method — confirm whether callers
        rely on them.
        """
        self.destination_ip = destination_ip

        # remember an explicitly supplied ingress; otherwise reuse the stored one
        if ingress is not None:
            self.ingress = ingress
        else:
            ingress = self.ingress

        result = self.b_fish.bfq.traceroute(
            startLocation=ingress, headers=self.b_fish.hc(dstIps=destination_ip)
        )

        result = result.answer().frame()
        # separate out Flow and Traces
        try:
            # Todo (flow is unused currently)
            flow = result.iloc[0]["Flow"]
        except EmptyDataError as e:
            # NOTE(review): .iloc[0] on an empty frame raises IndexError, not
            # EmptyDataError — this handler may never fire; confirm intent.
            print(e)
            raise EmptyDataError(f"No data in dataframe location: {e}")
        try:
            traces = result.iloc[0]["Traces"]
        except EmptyDataError as e:
            print(e)
            raise EmptyDataError(f"No data in dataframe location: {e}")

        self.dvt = DataviewTraceroute()

        self._generate_dataview(traces)
        self.dvt_dict = self._generate_dict()

        # finally return data
        # NOTE(review): dvt_list / new_list are not assigned in this method —
        # presumably populated by _generate_dataview / _generate_dict; verify.
        return self.dvt_list, self.new_list
예제 #12
0
파일: filters.py 프로젝트: khughitt/snakes
def filter_rows_by_nonzero(df, op=operator.gt, value=None, quantile=None):
    """Filter rows by comparing each row's count of non-zero entries to a threshold."""
    # a quantile, when given, replaces the explicit threshold value
    if quantile is not None:
        value = df.quantile(quantile, axis=1)

    nonzero_counts = (df != 0).sum(axis=1)
    df = df.loc[op(nonzero_counts, value)]

    # guard: refuse to return an empty result
    if df.empty:
        raise EmptyDataError("No data remaining after filter applied")

    return df
예제 #13
0
    def _validateDataFrame(self, df):
        """
        Validates that the df isn't empty and categorizes it as iterable (TextFileReader) or not iterable (DataFrame)
        """
        #if the df is a standard DataFrame
        if type(df) == pd.DataFrame:
            self._logger.info('Using regular dataframe')

            if df.empty:
                self._logger.error('Empty dataframe')
                raise EmptyDataError('DataFrame is empty')

            self.colsAndTypes = {
                name: df.dtypes[name]
                for name in list(df.columns)
            }
            self._isIterable = False

        #if the df is a large file read in through chunks
        elif type(df) == pd.io.parsers.TextFileReader:
            self._logger.info('Using large dataframe')
            for chunk in df:
                self.colsAndTypes = {
                    name: chunk.dtypes[name]
                    for name in list(chunk.columns)
                }

                if chunk.empty:
                    self._logger.error('Empty dataframe')
                    raise EmptyDataError('DataFrame is empty')

                break
            self._isIterable = True

        else:
            raise TypeError(
                f'Invalid df type. Type "{type(df)}" is not a DataFrame or TextFileReader'
            )

        return True
예제 #14
0
    def get_qstick(data):
        """Return the QStick indicator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.QSTICK(data)
        if series is None:
            raise IndicatorException
        return series
예제 #15
0
    def get_chandelier(data):
        """Return the chandelier exit indicator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.CHANDELIER(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #16
0
    def get_cmo(data):
        """Return the Chande momentum oscillator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.CMO(data)
        if series is None:
            raise IndicatorException
        return series
예제 #17
0
    def get_baspn(data):
        """Return the normalized buying and selling pressure computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.BASPN(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #18
0
    def get_wobv(data):
        """Return the weighted on-balance volume computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.WOBV(data)
        if series is None:
            raise IndicatorException
        return series
예제 #19
0
    def get_tsi(data):
        """Return the true strength index computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.TSI(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #20
0
    def get_emv(data):
        """Return the ease of movement computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.EMV(data)
        if series is None:
            raise IndicatorException
        return series
예제 #21
0
    def get_ebbp(data):
        """Return the bull power and bear power computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.EBBP(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #22
0
    def get_efi(data):
        """Return the Elder's force index computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.EFI(data)
        if series is None:
            raise IndicatorException
        return series
예제 #23
0
    def get_vzo(data):
        """Return the volume zone oscillator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.VZO(data)
        if series is None:
            raise IndicatorException
        return series
예제 #24
0
    def get_kst(data):
        """Return the known sure thing oscillator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.KST(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #25
0
    def get_vortex(data):
        """Return the vortex indicator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a concatenated Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        combined = TA.VORTEX(data)
        if combined is None:
            raise IndicatorException
        return combined
예제 #26
0
    def get_adl(data):
        """Return the accumulation/distribution line computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.ADL(data)
        if series is None:
            raise IndicatorException
        return series
예제 #27
0
    def get_cci(data):
        """Return the commodity channel index computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.CCI(data)
        if series is None:
            raise IndicatorException
        return series
예제 #28
0
    def get_copp(data):
        """Return the Coppock curve computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.COPP(data)
        if series is None:
            raise IndicatorException
        return series
예제 #29
0
    def get_tema(data):
        """Return the triple exponential moving average computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.TEMA(data)
        if series is None:
            raise IndicatorException
        return series
예제 #30
0
    def get_williams(data):
        """Return the Williams %R oscillator computed from *data*.

        :param data: a dataframe in OHLC format
        :return: a Pandas series
        :raises EmptyDataError: when *data* is None
        :raises IndicatorException: when the indicator cannot be computed
        """
        if data is None:
            raise EmptyDataError("[!] Invalid data value")
        series = TA.WILLIAMS(data)
        if series is None:
            raise IndicatorException
        return series