Exemplo n.º 1
0
def plot_clusters_2D(df, feat1, feat2, labels, title="2-Feature Scatter Plot"):
    """
    Show a 2-D scatter plot of two features with each point colored by its respective cluster label.
    :param df: (pandas DataFrame) DataFrame containing the feature axes from which the 2D plot will be
        constructed.
    :param feat1: (string) Feature column name in df to be used for x-axis on the scatter plot. The value is
        passed to _check.check_string together with feat2 and title.
    :param feat2: (string) Feature column name in df to be used for y-axis on the scatter plot.
    :param labels: (1xn-dimension int array) Array containing the integer cluster labels for each sample in
        df. Labels are matched to the rows of df by position.
    :param title: (string) Title of the scatter plot
    :return: 0 (the plot itself is displayed with plt.show())
    """
    from pygaero import therm

    # Check data types to prevent subsequent errors
    _check.check_dfs(values=[df])
    _check.check_string(values=[feat1, feat2, title])
    _check.check_numeric(values=labels)

    print('Features chosen for plotting: ', feat1, ',', feat2)
    # Flatten to a 1-D ndarray so that `labels == group` is always an element-wise boolean mask, whether the
    # caller passed a list, a 1xn array or a pandas object.
    labels = np.asarray(labels).ravel()
    groups = np.unique(labels)
    # One more color than groups is requested from the colormap helper (same request as before).
    colors = therm.get_cmap(n=len(groups) + 1, cm='hsv')

    for group_no, group in enumerate(groups):
        in_group = labels == group
        # .loc with a boolean row mask and a column label replaces the removed DataFrame.ix indexer.
        x = df.loc[in_group, feat1]
        y = df.loc[in_group, feat2]
        plt.scatter(x, y, c=colors(group_no), marker='o', s=75, edgecolors='k')
        print('plotted group', group)
    plt.title(title, fontsize=20)
    plt.xlabel(feat1, fontsize=18)
    plt.ylabel(feat2, fontsize=18)
    plt.show()

    return 0
Exemplo n.º 2
0
def flow_correction(thermograms, aero_samp_rates=[0.0], base_samp_rate=2.0):
    """
    A function to adjust desorption signals by the Figaero sample flow rate relative to the actual flow rate being
        pulled in by the CIMS. For example, if 4 LPM is used to sample aerosol, but the sample rate into the CIMS is
        2 LPM, then the thermograms need to be adjusted by *2/4. This is because the signal being sampled during
        desorptions is twice as concentrated as what would be sampled in the gas phase by a 2 LPM flow. This is useful
        and important if a direct comparison between gas and aerosol concentrations is to be made and if sensitivities
        obtained from gas-phase calibrations are to be applied to the aerosol signals.

        Note, thermograms are modified in-place, and overwritten. Every value of thermograms[i] is multiplied by
        base_samp_rate / aero_samp_rates[i].
    :param thermograms: (list of pandas DataFrames) Time series of aerosol desorption thermograms obtained from a
        Figaero ToF-CIMS
    :param aero_samp_rates: (list of floats) Aerosol sample rates for the Figaero inlet, one per thermogram.
        Typically this is higher than the base sample rate in order to reduce particle losses. The default of
        [0.0] is only a placeholder: a rate of zero cannot be used and is rejected. The default list is never
        mutated by this function.
    :param base_samp_rate: (float) The base sample rate for the CIMS. Default is a nominal 2.0, which is popularly used.
        It is not suggested that this is changed unless a different sample rate has been verified.
    :return: 0. The corrected data is written back into the DataFrames passed in thermograms.
    :raises ZeroDivisionError: if a sample rate that would be applied to a thermogram is zero. This is checked
        before any thermogram is modified.
    """
    # Check data types to prevent subsequent errors
    _check.check_ls(ls=thermograms)
    _check.check_ls(ls=aero_samp_rates)
    _check.check_dfs(values=thermograms)
    _check.check_numeric(values=aero_samp_rates)
    _check.check_numeric(values=[base_samp_rate])

    # zip() below stops at the shorter list, so make a length mismatch visible instead of silently leaving
    # thermograms uncorrected.
    if len(thermograms) != len(aero_samp_rates):
        print('WARNING: thermograms and aero_samp_rates have different lengths; only the first %i '
              'thermogram(s) will be corrected.' % min(len(thermograms), len(aero_samp_rates)))

    # Validate every rate that will actually be used before touching any data, so that a bad rate cannot
    # leave the list of thermograms partially corrected.
    for _, samp_rate in zip(thermograms, aero_samp_rates):
        if samp_rate == 0:
            raise ZeroDivisionError(
                'aero_samp_rates must be non-zero to compute base_samp_rate / aero_samp_rate')

    for df, samp_rate in zip(thermograms, aero_samp_rates):
        # Scale the whole frame once and write it back in place (positional indexing replaces the removed
        # DataFrame.ix indexer).
        df.iloc[:, :] = df.values * (base_samp_rate / samp_rate)
    return 0
Exemplo n.º 3
0
def plot_tmax(df, ions, tmax_temps, tmax_vals):
    """
    Function to plot the desorption time series for a set of specified ions. The index values of df (df.index.values)
        will be used for the x values during plotting. Tmax values are indicated by a circular red marker.
    :param df: (pandas DataFrame) pandas DataFrame containing the desorption time series
    :param ions: (string) List of string values (or np array) for the ions to plot. Each entry is used as a column
        label of df and as the legend label of its line.
    :param tmax_temps: (float) Corresponding tmax temperatures for the items listed in parameter [ions]. Used as the
        x position of each red marker.
    :param tmax_vals: (float) Corresponding tmax values for the items listed in parameter [ions]. Used as the
        y position of each red marker and to set the upper y-axis limit.
    :return: 0. Plot popped to screen.
    """
    # Check data types to prevent errors during plotting
    _check.check_dfs(values=[df])
    _check.check_string(values=ions)
    _check.check_numeric(values=tmax_temps)
    _check.check_numeric(values=tmax_vals)

    n_series = len(ions)
    cmap = get_cmap(n=n_series, cm='hsv')
    x_vals = df.index.values

    for series_num, (ion, tmax, maxsig) in enumerate(zip(ions, tmax_temps, tmax_vals)):
        # Label-based column selection replaces the removed DataFrame.ix indexer.
        y = df.loc[:, ion].values
        # The line is drawn below (zorder=0) the marker (zorder=1) so the Tmax marker stays visible.
        plt.plot(x_vals, y, linewidth=2, c=cmap(series_num), label=ion, zorder=0)
        plt.scatter(tmax,
                    maxsig,
                    marker='o',
                    s=40,
                    c='r',
                    linewidths=1,
                    zorder=1)

    plt.xlim((min(x_vals) * .9, max(x_vals) * 1.1))
    plt.ylim((0, max(tmax_vals) * 1.1))
    plt.legend(fontsize=8)
    plt.show()

    return 0
Exemplo n.º 4
0
def h_to_c(h, c):
    """
    This function calculates a simple H/C ratio for a list of hydrogen and carbon numbers. len(h) and len(c) must
        be equal.
    :param h: (int/float list) A list of numeric values representing the number of hydrogens for a list of molecules.
        Can be float or int, but float should only be used if the calculation is for a bulk average. Otherwise, a float
        value doesn't make sense for a single molecule.
    :param c: (int/float list) A list of numeric values representing the number of carbons for a list of molecules.
        Can be float or int, but float should only be used if the calculation is for a bulk average. Otherwise, a float
        value doesn't make sense for a single molecule.
    :return: hc_ratios: (float list) A list of values that are the index-to-index ratios of the values in h and c.
        Entries whose carbon number is 0 are np.nan.
    """
    # Check to make sure that input lists are numeric and the same length to prevent errors during processing.
    # Both parameters are checked the same way (nt_flag=True on every call) so that an invalid type for either one
    # is reported by the block below.
    # NOTE(review): the parent_fn_mod_* helpers presumably inspect the call stack, so the reporting code is kept
    # inline in this function rather than moved to a shared helper - confirm before refactoring.
    if (_check.check_ls(ls=h, nt_flag=True) and _check.check_np_array(arr=h, nt_flag=True)) or \
            (_check.check_ls(ls=c, nt_flag=True) and _check.check_np_array(arr=c, nt_flag=True)):
        main_module, main_fn, main_lineno = _check.parent_fn_mod_3step()
        calling_module, calling_fn, calling_lineno = _check.parent_fn_mod_2step(
        )
        print('On line %i in function %s of module %s' %
              (main_lineno, main_fn, main_module))
        print('     Error on line %i in module %s' %
              (calling_lineno, calling_module))
        print('         Invalid input for function %s' % calling_fn)
        sys.exit('ERROR: Either a list or np.array is required')
    _check.check_eq_ls_len(list_ls=[h, c])
    _check.check_numeric(values=h)
    _check.check_numeric(values=c)

    # A molecule without carbon has no defined H/C ratio, so np.nan is stored instead of dividing by zero.
    hc_ratios = [np.nan if nc == 0 else float(nh / nc) for nh, nc in zip(h, c)]

    return hc_ratios
Exemplo n.º 5
0
def osc_nitr(c, h, o, n):
    """
    This function calculates a carbon oxidation state (OSC from Kroll et al., 2011). This is calculated by the formula:
        OSC = 2*(O/C) - (H/C) - 5*(N/C).
    :param c: (int/float list) Numerical list with the number of carbons in the molecules for which OSC will be
        calculated.
    :param h: (int/float list) Numerical list with the number of hydrogens in the molecules for which OSC will be
        calculated.
    :param o: (int/float list) Numerical list with the number of oxygens in the molecules for which OSC will be
        calculated.
    :param n: (int/float list) Numerical list with the number of nitrogens in the molecules for which OSC will be
        calculated.
    :return: ox_states_nitr: (float list) Oxidation states, accounting for nitrogen groups (assumed nitrates).
        Entries whose carbon number is 0 are np.nan.
    """
    # Verify that parameters are lists of numerical values (numpy ndarray or python list) to prevent error in
    # subsequent loops. All four parameters are checked the same way (nt_flag=True on every call) so that an
    # invalid type for any of them is reported by the block below.
    # NOTE(review): the parent_fn_mod_* helpers presumably inspect the call stack, so the reporting code is kept
    # inline in this function rather than moved to a shared helper - confirm before refactoring.
    if (_check.check_ls(ls=c, nt_flag=True) and _check.check_np_array(arr=c, nt_flag=True)) or \
            (_check.check_ls(ls=h, nt_flag=True) and _check.check_np_array(arr=h, nt_flag=True)) or \
            (_check.check_ls(ls=o, nt_flag=True) and _check.check_np_array(arr=o, nt_flag=True)) or \
            (_check.check_ls(ls=n, nt_flag=True) and _check.check_np_array(arr=n, nt_flag=True)):
        main_module, main_fn, main_lineno = _check.parent_fn_mod_3step()
        calling_module, calling_fn, calling_lineno = _check.parent_fn_mod_2step(
        )
        print('On line %i in function %s of module %s' %
              (main_lineno, main_fn, main_module))
        print('     Error on line %i in module %s' %
              (calling_lineno, calling_module))
        print('         Invalid input for function %s' % calling_fn)
        sys.exit('ERROR: Either a list or np.array is required')
    _check.check_eq_ls_len(list_ls=[c, h, o, n])
    _check.check_numeric(values=c)
    _check.check_numeric(values=h)
    _check.check_numeric(values=o)
    _check.check_numeric(values=n)

    ox_states_nitr = []
    for nc, nh, no, nn in zip(c, h, o, n):
        if nc == 0:
            # Every term of the formula divides by the carbon number, so the result is undefined without carbon.
            ox_states_nitr.append(np.nan)
        else:
            ox_states_nitr.append(
                float((2 * (no / nc) - (nh / nc) - 5 * (nn / nc))))

    return ox_states_nitr
Exemplo n.º 6
0
def ele_stats(molec_ls,
              ion_state=-1,
              cluster_group=None,
              clst_group_mw=126.90447,
              xtra_elements=None):
    """
    This function takes a list of string values that are chemical formulas and calculates a set of basic statistics for
        each formula. The statistics are then returned in a pandas DataFrame with the molecule names as the indices
        with each statistic as a column. Statistics include (column name listed):
        1. Basic elemental counts:
            C - # of C atoms in molecule
            H - # of H atoms in molecule
            O - # of O atoms in molecule
            N - # of N atoms in molecule
            Cl - # of Cl atoms in molecule
            F - # of F atoms in molecule
            Br - # of Br atoms in molecule
            [cluster_group] - # of user-defined cluster group in molecule (for cluster-forming ionization mechanisms)
            * More element counts can be added by using a list in the parameter [xtra_elements]
        2. O/C and H/C:
            O/C - oxygen to carbon ratio
            H/C - hydrogen to carbon ratio
        3. Oxidation State (Kroll et al., 2011):
            OSC: 2*(O/C) - (H/C)
            OSC_N: 2*(O/C) - (H/C) - 5*(N/C). Assumes all nitrogen are nitrate groups (see reference for details)
        4. Molecular weights:
            MW: Molecular weight for exact formula in molecule name string (in molec_ls)
            MW_xclust: Molecular weight without the cluster group specified by cluster_group

        How formulas are read:
        * An element symbol is one upper-case letter optionally followed by one lower-case letter (so "Cl" is
          chlorine, not carbon), optionally followed by a count of any number of digits. Repeated symbols are summed.
          Symbols that are not in the element list are ignored.
        * If cluster_group is given, its first occurrence in the formula (with an optional trailing count) is counted
          as the cluster group and removed before the elements are counted, so the element columns, O/C, H/C and the
          oxidation states describe the molecule without the cluster group.
    :param molec_ls: (list of strings) A list of strings which are molecule names for which statistics will be
        calculated.
    :param ion_state: (int or float) The multiplier of the mass of electrons (g/mol) to add/subtract for a charged
        molecule's molecular mass (in mass_electron*(-1)*ion_state). ion_state is equal to the charge on each ion
        (ion_state=0 for neutral)
    :param cluster_group: (string) If ions are in a clustered form (e.g., clustered with I-), the user should specify
        what element (or group) is the cluster group so that it can be counted. Note that if cluster_group is an
        element, it should NOT be repeated in the list, xtra_elements. None or an empty string means no cluster group.
    :param clst_group_mw: (int or float) Molecular weight/mass of the cluster group in g/mol. Important to specify this
        for an accurate molecular weight calculations. If it is an element, then clst_group_mw can be set to 0 and the
        correct elemental molecular weight will be used in MW calculations.
    :param xtra_elements: (list of strings) Extra elements which should be accounted for. That is, elements other
        than C, H, O, N, Cl, F, and Br.
    :return df_ele: A returned pandas DataFrame containing statistics for the molecules in molec_ls
    """

    # Check data types to prevent subsequent errors.
    # NOTE(review): the parent_fn_mod_* helpers presumably inspect the call stack, so the three reporting blocks
    # below are kept inline in this function rather than moved to a shared helper - confirm before refactoring.
    if _check.check_ls(ls=molec_ls, nt_flag=True) and _check.check_np_array(
            arr=molec_ls, nt_flag=True):
        main_module, main_fn, main_lineno = _check.parent_fn_mod_3step()
        calling_module, calling_fn, calling_lineno = _check.parent_fn_mod_2step(
        )
        print('On line %i in function %s of module %s' %
              (main_lineno, main_fn, main_module))
        print('     Error on line %i in module %s' %
              (calling_lineno, calling_module))
        print('         Invalid input for function %s' % calling_fn)
        sys.exit('ERROR: Either a list or np.array is required')
    if (_check.check_ls(ls=xtra_elements, nt_flag=True) and _check.check_np_array(arr=xtra_elements, nt_flag=True)) \
            and xtra_elements is not None:
        main_module, main_fn, main_lineno = _check.parent_fn_mod_3step()
        calling_module, calling_fn, calling_lineno = _check.parent_fn_mod_2step(
        )
        print('On line %i in function %s of module %s' %
              (main_lineno, main_fn, main_module))
        print('     Error on line %i in module %s' %
              (calling_lineno, calling_module))
        print('         Invalid input for function %s' % calling_fn)
        sys.exit('ERROR: Either a list or np.array is required')
    if not cluster_group:
        pass
    elif not _check.check_ls(ls=cluster_group, nt_flag=True):
        # cluster_group must be a single string, not a list.
        main_module, main_fn, main_lineno = _check.parent_fn_mod_3step()
        calling_module, calling_fn, calling_lineno = _check.parent_fn_mod_2step(
        )
        print('On line %i in function %s of %s' %
              (main_lineno, main_fn, main_module))
        print('     Error on line %i in module %s' %
              (calling_lineno, calling_module))
        print('         Invalid input for function %s' % calling_fn)
        sys.exit("Inappropriate type passed to parameter: list.")
    else:
        _check.check_string(values=[cluster_group])
    _check.check_numeric(values=[ion_state])
    _check.check_numeric(values=[clst_group_mw])

    # None and '' both mean "no cluster group" (the validation above already skips both).
    has_cluster = bool(cluster_group)

    # Set column names for returned DataFrame, df_ele.
    columns = [
        "C", "H", "O", "N", "Cl", "F", "Br", "O/C", "H/C", "OSC", "OSC_N",
        "MW", "MW_xclust"
    ]
    if has_cluster:
        print('extending by cluster_group')
        # append, not extend: extend() would add one column per character of a multi-character group name.
        columns.append(cluster_group)
    # Extend columns list by # of elements in xtra_elements.
    if xtra_elements is not None:
        xtra_elements = _check.is_element(eles=xtra_elements,
                                          return_cleaned=True)
        columns.extend(xtra_elements)
    columns = remove_duplicates(values=columns)

    # Define me, the molar mass of electrons to adjust molar mass of ion molecules by their respective charge
    me = 0.00054857990946
    df_ele = pd.DataFrame(index=molec_ls, columns=columns, data=0.0)

    # Elements to count. Each symbol is listed once so that its mass cannot be added twice to the molecular weight.
    elements_ls = ["C", "H", "O", "N", "Cl", "F", "Br"]
    if xtra_elements is not None:
        for ele in xtra_elements:
            if ele not in elements_ls:
                elements_ls.append(ele)

    for molec in molec_ls:
        # Separate the cluster group from the rest of the formula before counting elements.
        if has_cluster:
            clst_count, analyte = _split_cluster_group(molec, cluster_group)
        else:
            clst_count, analyte = 0, molec

        ele_counts = _count_formula_elements(analyte, elements_ls)
        for ele in elements_ls:
            if ele_counts[ele]:
                df_ele.loc[molec, ele] = ele_counts[ele]
        # Written after the element counts so that the cluster count wins if cluster_group shares a column name
        # with an element.
        if has_cluster:
            df_ele.loc[molec, cluster_group] = clst_count

        # Calculate molecular weight/mass now that all basic elements/cluster groups have been counted
        mw_elements = 0.0
        for ele in elements_ls:
            mw_elements += table_of_elements[ele] * ele_counts[ele]
        cluster_mass = 0.0
        if has_cluster:
            if clst_group_mw == 0:
                if cluster_group in table_of_elements:
                    cluster_mass = table_of_elements[cluster_group] * clst_count
                else:
                    print(
                        'Cluster group "%s" not found in periodic table. If it is not an element, please define'
                        'a molecular weight (g/mol) for the group using parameter [clst_group_mw] in '
                        'function ele_stats()' % cluster_group)
            else:
                cluster_mass = clst_group_mw * clst_count
        # Adjust mw by weight of electron (with respect to parameter [ion_state])
        electron_adj = ion_state * (-1) * me
        df_ele.loc[molec, "MW"] = mw_elements + cluster_mass + electron_adj
        # MW_xclust is MW minus the mass contributed by the cluster group.
        # NOTE(review): the electron-mass adjustment is kept in MW_xclust; confirm whether the cluster-free mass
        # should instead be reported for the neutral molecule.
        df_ele.loc[molec, "MW_xclust"] = mw_elements + electron_adj

    # Calc all O/C, H/C and Oxidation states and then assign to df_ele
    n_carbon = df_ele.loc[:, "C"].values
    n_hydrogen = df_ele.loc[:, "H"].values
    n_oxygen = df_ele.loc[:, "O"].values
    n_nitrogen = df_ele.loc[:, "N"].values
    df_ele.loc[:, "O/C"] = o_to_c(o=n_oxygen, c=n_carbon)
    df_ele.loc[:, "H/C"] = h_to_c(h=n_hydrogen, c=n_carbon)
    df_ele.loc[:, "OSC"] = osc(c=n_carbon, h=n_hydrogen, o=n_oxygen)
    df_ele.loc[:, "OSC_N"] = osc_nitr(c=n_carbon,
                                      h=n_hydrogen,
                                      o=n_oxygen,
                                      n=n_nitrogen)

    return df_ele


def _count_formula_elements(formula, elements):
    """
    Count how often each symbol in elements occurs in the chemical formula string formula.
    :param formula: (string) Chemical formula, e.g. "C10H16ClO3".
    :param elements: (list of strings) Element symbols to count.
    :return: (dict) Maps every symbol in elements to its integer count in formula (0 if absent).
    """
    import re

    counts = dict.fromkeys(elements, 0)
    # A token is an element symbol (upper-case letter plus optional lower-case letter) and its optional count.
    for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        if symbol in counts:
            counts[symbol] += int(digits) if digits else 1
    return counts


def _split_cluster_group(formula, cluster_group):
    """
    Find the first occurrence of cluster_group in formula and separate it from the rest of the formula.
    :param formula: (string) Chemical formula that may contain the cluster group.
    :param cluster_group: (string) Non-empty name of the cluster group (element or group of elements).
    :return: (int, string) Number of cluster groups (0 if not present) and the formula with that occurrence
        removed.
    """
    import re

    # The look-ahead stops a group such as "I" from matching the first letter of a two-letter symbol ("In").
    match = re.search(re.escape(cluster_group) + r'(?![a-z])(\d*)', formula)
    if match is None:
        return 0, formula
    digits = match.group(1)
    count = int(digits) if digits else 1
    # A space is left in place of the cluster group so the text on either side cannot merge into a new token.
    return count, formula[:match.start()] + ' ' + formula[match.end():]
Exemplo n.º 7
0
def concat_df_lists(df_ls1, df_ls2, axis=0, join='inner'):
    """
    Concatenate two lists of pandas DataFrames pairwise: element i of the result is
        pd.concat([df_ls1[i], df_ls2[i]], axis=axis, join=join). The parameters axis and join are handed to
        pandas.concat() unchanged and therefore follow the pandas conventions.

        Within the pygaero package this is used to join Tmax and elemental analysis DataFrames in preparation for
        clustering or classification algorithms from scikit-learn.

    Parameters:
    :param df_ls1: A list of DataFrames on which to concatenate df_ls2
    :param df_ls2: A list of DataFrames to concatenate onto the corresponding DataFrames of df_ls1
    :param axis: The axis along which the DataFrames will be concatenated (standard pandas DataFrame syntax).
        axis=0 stacks each pair vertically (row dimension); axis=1 places each pair side-by-side (column
        dimension).
    :param join: The join parameter for pandas.concat(); must be 'inner' or 'outer'.
    :return: A list of DataFrames, one concatenated DataFrame per input pair. A warning is printed for every pair
        whose shapes do not match along the other dimension, but concatenation is still attempted.
    """
    # Check data types to prevent errors during processing.
    _check.check_ls(ls=df_ls1)
    _check.check_ls(ls=df_ls2)
    _check.check_dfs(values=df_ls1)
    _check.check_dfs(values=df_ls2)
    _check.check_eq_ls_len(list_ls=[df_ls1, df_ls2])
    _check.check_numeric(values=[axis])
    _check.param_exists_in_set(value=axis, val_set=[0, 1])
    _check.check_string(values=[join])
    _check.param_exists_in_set(value=join, val_set=['inner', 'outer'])

    # Pick the dimension that should agree between the two frames of a pair, and the message to print when it
    # does not: stacking rows needs equal column counts, stacking columns needs equal row counts.
    if axis == 0:
        compared_dim = 1
        mismatch_warning = (
            'WARNING: You chose concatenation in row dimension (i.e., vertical stacking) with '
            'parameter axis=0,\n but some DataFrame pairs have different numbers of columns.  Proceeding...'
        )
    elif axis == 1:
        compared_dim = 0
        mismatch_warning = (
            'WARNING: You chose to concatenate in column dimension (side-by-side) with axis=1, but'
            'some DataFrame pairs have different number of rows.  Proceeding...'
        )
    else:
        print('ERROR: Parameter axis must be set to 0 or 1')
        sys.exit()

    pairs = list(zip(df_ls1, df_ls2))

    # Tell the user about every mismatching pair, but try to proceed.
    for left, right in pairs:
        if left.shape[compared_dim] != right.shape[compared_dim]:
            print(mismatch_warning)

    return [pd.concat([left, right], axis=axis, join=join) for left, right in pairs]