Example #1
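All of the examples below assume pandas, pathlib.Path, and sys are imported, together with a project-local validation helper referenced as _ec. A minimal import header might look like the sketch below; the _ec module name and path are assumptions, not part of the original listing.

import sys
from pathlib import Path

import pandas as pd

import error_checks as _ec  # hypothetical project-local validation helper; the actual module may differ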
def create_df_ls(flist=[""], indir=""):
    """
    This function takes a list of string values that are the paths and file names to csv files. The csv files are
        then individually loaded into pandas DataFrames and returned in a list containing a DataFrame for each csv
        in flist.

    Parameters:
    :param flist: A list of string values that contain the names of csv files to be loaded into pandas DataFrames
    :param indir: A string value that is the optional base directory of all of the csv files. If the full path for
        each file is included in flist, then this parameter should be left blank (""). Otherwise, the base directory
        where the csv files reside should be used.
    :return: df_ls: A list of pandas DataFrames loaded from the csvs specified in flist.
    """
    # Check passed parameter values to prevent error in future.
    _ec.check_ls(ls=flist)
    _ec.check_string(values=flist)
    _ec.check_string(values=[indir])

    df_ls = []
    for f in flist:
        file = Path(indir+f)
        if file.is_file():
            df = pd.read_csv(file)
            df_ls.append(df)
            print('file "%s" added to list.' % f)
        else:
            print("File '%s' not found. Not included in DataFrame list." % (indir+f))

    return df_ls
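A minimal usage sketch, assuming the imports above and that the hypothetical files exist under a data/ directory (the file names are illustrative only):

# Load two csv files that share the base directory 'data/' into a list of DataFrames.
frames = create_df_ls(flist=["sales_q1.csv", "sales_q2.csv"], indir="data/")
print(len(frames))  # number of files that were actually found and loaded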
Example #2
def set_indices_ls(df_ls, index_name=''):
    """
    This function takes a list of pandas DataFrames and sets the index to a specified column. The specified
        column should exist in every DataFrame. Otherwise, the results may be inconsistent and some DataFrames
        may not have their index set to that which is specified.

    Parameters:
    :param df_ls: List of pandas DataFrames for which to attempt reindexing with the specified column.
    :param index_name: The user-specified column to reassign as the index of each pandas DataFrame in df_ls.
    :return: 0. DataFrames are modified in place.
    """
    # TODO: test the functionality of this function.
    # Input type checking to prevent errors during index setting.
    _ec.check_ls(ls=df_ls)
    _ec.check_dfs(values=df_ls)
    _ec.check_string(values=[index_name])

    for df_num, df in enumerate(df_ls):
        if index_name in df.columns:
            df.set_index(index_name, inplace=True)
        else:
            print("Target index not found in current DataFrame (#%i in list). Skipping re-indexing"
                  "of current DataFrame." % df_num)

    return 0
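A short usage sketch, assuming both DataFrames contain a hypothetical 'sample_id' column:

import pandas as pd

df_a = pd.DataFrame({"sample_id": [1, 2], "value": [10, 20]})
df_b = pd.DataFrame({"sample_id": [3, 4], "value": [30, 40]})

# Re-index both DataFrames on 'sample_id' in place.
set_indices_ls([df_a, df_b], index_name="sample_id")
print(df_a.index.name)  # 'sample_id'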
Example #3
def df_col_avg_sum(df_ls, df_name_ls=[]):
    """
    This function takes a list of DataFrames, sums the columns, and then averages the sums of all columns to
        calculate one "average sum" value for each DataFrame. Then, a list of averaged column sums from all processed data
        frames is compiled into a new pandas DataFrame and returned. One example usage would be if a primary dataset
        was split into multiple subsets, and a comparison of the average sum of columns is desired.

    Parameters:
    :param df_ls: List of pandas DataFrames
    :param df_name_ls: A string list of DataFrame names (i.e., data set string labels)
    :return: df_out: A compiled DataFrame of original DataFrame names (column 1) and the average of column sums
        (column 2) for each DataFrame
    """
    # Check for equal length of DataFrame list and factor list to prevent error in averaging/summing loop
    _ec.check_ls(ls=df_ls)
    _ec.check_ls(ls=df_name_ls)
    _ec.check_eq_ls_len(list_ls=[df_ls, df_name_ls])
    _ec.check_dfs(values=df_ls)
    _ec.check_string(values=df_name_ls)

    avgsums = []
    for df in df_ls:
        avgsum = df.sum(axis=0).mean()
        avgsums.append(avgsum)
    # Create single DataFrame with average sums of each DataFrame in the first column.
    df_out = pd.DataFrame({"Dataset Name": df_name_ls,
                           "Mean of Column Sums": avgsums})
    return df_out
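A worked sketch with two tiny, made-up subsets:

import pandas as pd

subset_a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})   # column sums 3 and 7, so the mean of column sums is 5.0
subset_b = pd.DataFrame({"x": [5, 6], "y": [7, 8]})   # column sums 11 and 15, so the mean of column sums is 13.0

summary = df_col_avg_sum([subset_a, subset_b], df_name_ls=["subset_a", "subset_b"])
print(summary)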
Example #4
def concat_ls(df_ls1, df_ls2, axis=0, join='inner'):
    """
    This function takes two lists of pandas DataFrames and concatenates them. Options allow user to specify the axis
        along which to concatenate as well as the pandas join method (e.g., 'inner', 'outer'). Where available,
        inherent pandas DataFrame functionality is employed (e.g., transpose, axis selection, join method, etc).
        Parameter choice should be in line with the requirements of the pandas library and associated functions,
        and therefore the same convention is used for parameters axis and join. DataFrames are concatenated pairwise;
        that is, df_ls1[i] is concatenated with df_ls2[i].

    Parameters:
    :param df_ls1: A list of DataFrames on which to concatenate df_ls2
    :param df_ls2: A list of DataFrames to concatenate onto the corresponding DataFrames
    :param axis: The axis along which the DataFrames will be concatenated. axis=1 for column-wise, 0 for row-wise
        (standard pandas DataFrame syntax). Example: if axis=0, DataFrames will be concatenated in the row dimension
        (i.e., stacked vertically; will require same # of columns).  If axis=1, will be concatenated in the
        column dimension (i.e., side-by-side)
    :param join: Allows user to specify the join parameter for pandas.concat(). Must be compatible with choices
        available within the pandas package.
    :return: df_ls_concat: A list of DataFrames where each element is a DataFrame from list 1 concatenated with the
        corresponding DataFrame from list 2
    """
    # Check data types to prevent errors during processing.
    _ec.check_ls(ls=df_ls1)
    _ec.check_ls(ls=df_ls2)
    _ec.check_dfs(values=df_ls1)
    _ec.check_dfs(values=df_ls2)
    _ec.check_eq_ls_len(list_ls=[df_ls1, df_ls2])
    _ec.check_numeric(values=[axis])
    _ec.param_exists_in_set(value=axis, val_set=[0, 1])
    _ec.check_string(values=[join])
    _ec.param_exists_in_set(value=join, val_set=['inner', 'outer'])

    # Initialize return list for concatenated DataFrames
    df_ls_concat = []
    # Check row or column lengths of lists to make sure they're the same.  If not, tell user, but try to proceed.
    if axis == 0:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[1] != df2.shape[1]:
                print('WARNING: You chose concatenation in row dimension (i.e., vertical stacking) with '
                      'parameter axis=0,\n but some DataFrame pairs have different numbers of columns.  Proceeding...')
            else:
                pass
    elif axis == 1:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[0] != df2.shape[0]:
                print('WARNING: You chose to concatenate in column dimension (side-by-side) with axis=1, but '
                      'some DataFrame pairs have different numbers of rows.  Proceeding...')
    else:
        print('ERROR: Parameter axis must be set to 0 or 1')
        sys.exit()

    for df1, df2 in zip(df_ls1, df_ls2):
        df_ls_concat.append(pd.concat([df1, df2], axis=axis, join=join))

    return df_ls_concat
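A minimal sketch of pairwise row-wise stacking with made-up 2x2 frames:

import pandas as pd

top = [pd.DataFrame({"a": [1, 2], "b": [3, 4]})]
bottom = [pd.DataFrame({"a": [5, 6], "b": [7, 8]})]

stacked = concat_ls(top, bottom, axis=0, join='inner')
print(stacked[0].shape)  # (4, 2): each 2x2 pair stacked vertically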
Example #5
def drop_cols_df_ls(df_ls, cols2drop=['none'], inplace=True):
    """
    This function drops the specified columns from each pandas DataFrame in a list, if they exist. By default the
        drop is performed 'in place', so the passed DataFrames are modified in the calling script.

    Parameters:
    :param df_ls: A list of pandas DataFrames from which to drop columns.
    :param cols2drop: A list of column names (strings) to drop from the DataFrames.
    :param inplace: The option whether or not to perform the column drop in place. If True, the original DataFrames
        are modified and 0 is returned. If False, a new list of DataFrames with the specified columns dropped is
        returned, and the original DataFrames are preserved.
    :return: 0 if inplace=True; otherwise df_dropped_ls, a list of DataFrame copies with the specified columns dropped.
    """
    # Check data types to prevent errors in column dropping loop.
    _ec.check_ls(ls=df_ls)
    _ec.check_ls(ls=cols2drop)
    _ec.check_dfs(values=df_ls)
    _ec.check_string(values=cols2drop)

    if not inplace:
        df_dropped_ls = []
    for df_num, df in enumerate(df_ls):
        cols_dropped = 0
        if not inplace:
            df_dropped = df.copy(deep=True)
        for col in cols2drop:
            if col in df.columns:
                if inplace:
                    df.drop(col, axis=1, inplace=inplace)
                    cols_dropped += 1
                else:
                    df_dropped = df_dropped.drop(col, axis=1, inplace=inplace)
                    cols_dropped += 1
            else:
                print('Column %s not present in DataFrame # %i. Proceeding to next column.' % (col, df_num))
        if not inplace:
            df_dropped_ls.append(df_dropped)
        print('Number of columns dropped from DataFrame #%i: %i' % (df_num, cols_dropped))
    if inplace:
        return 0
    else:
        return df_dropped_ls
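A usage sketch of the non-destructive path, with a hypothetical 'notes' column present in only one frame:

import pandas as pd

df_a = pd.DataFrame({"x": [1], "notes": ["drop me"]})
df_b = pd.DataFrame({"x": [2]})

trimmed = drop_cols_df_ls([df_a, df_b], cols2drop=["notes"], inplace=False)
print(trimmed[0].columns.tolist())  # ['x']
print(df_a.columns.tolist())        # ['x', 'notes'] - originals preserved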
Example #6
def drop_cols(df, cols2drop=[""]):
    """
    This function drops a list of columns from a pandas DataFrame.

    Parameters:
    :param df: DataFrame from which to drop columns
    :param cols2drop: A list of column names (strings) to drop from the DataFrame, if present
    :return: 0. The DataFrame is modified in place.
    """
    # Check data types to prevent errors in column dropping loop.
    _ec.check_ls(ls=cols2drop)
    _ec.check_dfs(values=[df])
    _ec.check_string(values=cols2drop)

    cols_dropped = 0
    for col in cols2drop:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)
            cols_dropped += 1
        else:
            pass
    print('Number of columns dropped from DataFrame: %i' % cols_dropped)

    return 0
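A one-line usage sketch with an illustrative 'scratch' column:

import pandas as pd

df = pd.DataFrame({"keep": [1, 2], "scratch": [3, 4]})
drop_cols(df, cols2drop=["scratch"])
print(df.columns.tolist())  # ['keep']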
Example #7
def concat_trans_ls(df_ls1, df_ls2, axis=0, join='inner', pad=True, rep_colnames=True, pad_name=''):
    """
    This function takes two lists of pandas DataFrames and concatenates them, after transposing the second.
        Options allow user to specify the axis along which to concatenate as well as the pandas join method
        (e.g., 'inner', 'outer'). Where available, inherent pandas DataFrame functionality is employed
        (e.g., transpose, axis selection, join method, etc). Parameter choice should be in line with the requirements
        of the pandas library and associated functions, and therefore the same convention is used for parameters
        axis and join. DataFrames are concatenated pairwise; that is, df_ls1[i] is concatenated with df_ls2[i].
        Additional options are available through the parameters rep_colnames, pad, and pad_name.

        Note that when using this function you will likely run into pandas errors if the transposed version of a
        DataFrame from df_ls2 has a different number of columns than the corresponding DataFrame in df_ls1.

    Parameters:
    :param df_ls1: list of DataFrames on which to concatenate the second list, df_ls2
    :param df_ls2: list of DataFrames to transpose and concatenate
    :param axis: axis=1 for columns, 0 for rows (standard pandas DataFrame syntax). Ex: if axis=0, DataFrames will
        be concatenated in the row dimension (i.e., stacked; may require same # of columns). If axis=1, will be
        concatenated in the column dimension (i.e., side-by-side)
    :param join: Join method as used by pandas.concat (e.g., 'inner', 'outer')
    :param pad: Lets the user select whether or not to pad the two datasets with a blank row. A label for this
            blank row can be specified with parameter pad_name.
    :param pad_name: Optional label for the padded row added between the datasets. Leave blank for
            an empty index (nan).
    :param rep_colnames: Option to replicate the column names after the padding. This will add the column names from
        the first DataFrame into the padding between the two concatenated DataFrames.
    :return: df_concat_ls: A list of DataFrames where each element is a DataFrame from list 1 concatenated onto the
        transposed corresponding DataFrame from list 2
    """
    # Check parameter data types, list lengths, and values to prevent errors during processing
    _ec.check_ls(ls=df_ls1)
    _ec.check_ls(ls=df_ls2)
    _ec.check_dfs(values=df_ls1)                    # DataFrame lists
    _ec.check_dfs(values=df_ls2)
    _ec.check_eq_ls_len(list_ls=[df_ls1, df_ls2])
    _ec.check_numeric(values=[axis])                # axis
    _ec.param_exists_in_set(value=axis, val_set=[0, 1])
    _ec.check_bool(values=[pad])
    _ec.check_bool(values=[rep_colnames])
    _ec.check_string(values=[pad_name])

    # Initialize internal function variables and return list
    df_concat_ls = []
    # check row or column lengths of lists to make sure they're the same.  If not, tell user, but try to proceed
    if axis == 0:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[1] != df2.T.shape[1]:
                print('WARNING: You chose concatenation in row dimension (i.e., stacking) with parameter axis=0,\n'
                      'but some DataFrame pairs have different numbers of columns.  Proceeding...')
            else:
                pass
    elif axis == 1:
        for df1, df2 in zip(df_ls1, df_ls2):
            if df1.shape[0] != df2.T.shape[0]:
                print('WARNING: You chose to concatenate in column dimension (side by side) with axis=1, but '
                      'some DataFrame pairs have different numbers of rows.  Proceeding...')
    else:
        print('ERROR: Parameter axis must be set to 0 or 1')
        sys.exit()

    # Proceed with concatenation
    for df1, df2 in zip(df_ls1, df_ls2):
        # Create pad row if selected, and pad b/t the two DataFrames in current pair
        if pad:
            padding = pd.DataFrame(index=['', pad_name], columns=df1.columns)
            if rep_colnames:
                padding.iloc[1] = df1.columns.values
            else:
                pass
            df_concat_ls.append(pd.concat([df1, padding, df2.T], axis=axis, join=join))
        else:
            df_concat_ls.append(pd.concat([df1, df2.T], axis=axis, join=join))
    return df_concat_ls
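A small sketch in which the transposed second frame lines up with the first frame's columns (all names made up):

import pandas as pd

# First list: one wide table with columns 'a' and 'b'.
summary = [pd.DataFrame({"a": [1], "b": [2]})]
# Second list: a tall table whose transpose also has columns 'a' and 'b'.
tall = [pd.DataFrame({"r1": [3, 4]}, index=["a", "b"])]

combined = concat_trans_ls(summary, tall, axis=0, join='inner', pad=True, rep_colnames=True, pad_name='header')
print(combined[0])
# Rows: the summary row, a blank pad row, a 'header' row repeating the column names, then the transposed tall table.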