def test_reorder_data_frame():
    """Tests that the ``reorder_data_frame`` function behaves as expected for
    1 and 2 component entries."""

    data_rows = [
        {
            "N Components": 1,
            "Component 1": "C",
            "Role": "Solvent",
            "Mole Fraction 1": 1.0,
            "Exact Amount": 1,
        },
        {
            "N Components": 2,
            "Component 1": "CC",
            "Role 1": "Solvent",
            "Mole Fraction 1": 0.25,
            "Exact Amount 1": 1,
            "Component 2": "CO",
            "Role 2": "Solute",
            "Mole Fraction 2": 0.75,
            "Exact Amount 2": 2,
        },
        {
            "N Components": 2,
            "Component 1": "CO",
            "Role 1": "Solute",
            "Mole Fraction 1": 0.75,
            "Exact Amount 1": 2,
            "Component 2": "CC",
            "Role 2": "Solvent",
            "Mole Fraction 2": 0.25,
            "Exact Amount 2": 1,
        },
    ]

    data_frame = pandas.DataFrame(data_rows)
    reordered_data_frame = reorder_data_frame(data_frame)

    assert len(reordered_data_frame) == 3

    assert reordered_data_frame.loc[0, "N Components"] == 1

    for index in [1, 2]:

        assert reordered_data_frame.loc[index, "N Components"] == 2

        assert reordered_data_frame.loc[index, "Component 1"] == "CC"
        assert reordered_data_frame.loc[index, "Role 1"] == "Solvent"
        assert reordered_data_frame.loc[index, "Mole Fraction 1"] == 0.25
        assert reordered_data_frame.loc[index, "Exact Amount 1"] == 1

        assert reordered_data_frame.loc[index, "Component 2"] == "CO"
        assert reordered_data_frame.loc[index, "Role 2"] == "Solute"
        assert reordered_data_frame.loc[index, "Mole Fraction 2"] == 0.75
        assert reordered_data_frame.loc[index, "Exact Amount 2"] == 2
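
# A minimal, hypothetical usage sketch (not part of the test suite): it mirrors
# the binary entry above to show that ``reorder_data_frame`` sorts the
# components alphabetically and permutes the per-component columns with them.
# It assumes ``pandas`` and ``reorder_data_frame`` are imported at module level,
# as in the test above.
def _example_reorder_usage():

    frame = pandas.DataFrame(
        [
            {
                "N Components": 2,
                "Component 1": "CO",
                "Role 1": "Solute",
                "Mole Fraction 1": 0.75,
                "Exact Amount 1": 2,
                "Component 2": "CC",
                "Role 2": "Solvent",
                "Mole Fraction 2": 0.25,
                "Exact Amount 2": 1,
            }
        ]
    )

    reordered = reorder_data_frame(frame)

    # "CC" sorts before "CO", so it becomes component 1 and carries its role,
    # mole fraction and exact amount along with it.
    assert reordered.loc[0, "Component 1"] == "CC"
    assert reordered.loc[0, "Mole Fraction 1"] == 0.25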
def _apply(
    cls, data_frame: pandas.DataFrame, schema: SelectDataPointsSchema, n_processes
) -> pandas.DataFrame:

    max_n_substances = data_frame["N Components"].max()
    component_headers = [f"Component {i + 1}" for i in range(max_n_substances)]

    # Re-order the data frame so that the components are alphabetically sorted.
    # This will make it easier to find unique substances.
    ordered_data_frame = reorder_data_frame(data_frame)

    # Find all of the unique substances in the data frame.
    unique_substances = ordered_data_frame[component_headers].drop_duplicates()

    selected_data = []

    # Start to choose the state points for each unique substance.
    for _, unique_substance in unique_substances.iterrows():

        substance_data_frame = ordered_data_frame

        for index, component in enumerate(unique_substance[component_headers]):

            if pandas.isnull(component):
                substance_data_frame = substance_data_frame[
                    substance_data_frame[component_headers[index]].isna()
                ]
            else:
                substance_data_frame = substance_data_frame[
                    substance_data_frame[component_headers[index]] == component
                ]

        for target_state in schema.target_states:

            substance_selected_data = cls._select_substance_data_points(
                substance_data_frame, target_state
            )

            if len(substance_selected_data) == 0:
                continue

            selected_data.append(substance_selected_data)

    selected_data = pandas.concat(selected_data, ignore_index=True, sort=False)
    return selected_data
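
# A self-contained sketch of the substance-masking loop above: for each row of
# ``unique_substances`` the full frame is successively narrowed so that only
# rows whose "Component i" columns all match (or are NaN, for padded columns of
# smaller substances) remain. The column names mirror those used in ``_apply``;
# the frame contents here are illustrative only.
def _example_substance_mask():
    import pandas

    frame = pandas.DataFrame(
        [
            {"N Components": 1, "Component 1": "O", "Component 2": None},
            {"N Components": 2, "Component 1": "CC", "Component 2": "CO"},
        ]
    )
    component_headers = ["Component 1", "Component 2"]

    # Take the first unique substance, i.e. the pure "O" entry.
    unique_substance = frame[component_headers].drop_duplicates().iloc[0]

    substance_frame = frame

    for index, component in enumerate(unique_substance[component_headers]):

        if pandas.isnull(component):
            substance_frame = substance_frame[
                substance_frame[component_headers[index]].isna()
            ]
        else:
            substance_frame = substance_frame[
                substance_frame[component_headers[index]] == component
            ]

    # Only the pure water ("O") row survives the masking.
    assert len(substance_frame) == 1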
def reindex_data_set(
    data_set: "PhysicalPropertyDataSet",
    reference_set: Union[DataSet, DataSetCollection],
):
    """Attempts to change the unique id of estimated data points to match the
    unique id of their corresponding reference data points based upon the state
    at which they were measured.

    **Note**: the data set will be modified in place.

    This method should **only** be used when attempting to convert previous
    results into the new framework, and not in general.

    Parameters
    ----------
    data_set: PhysicalPropertyDataSet
        The data set to re-index.
    reference_set: DataSet or DataSetCollection
        The data set(s) whose ids should be matched.
    """

    import pandas

    from openff.evaluator.datasets.utilities import reorder_data_frame

    if len(data_set) == 0:
        return

    estimated_data_frame = reorder_data_frame(data_set.to_pandas())

    if isinstance(reference_set, DataSet):
        reference_data_frame = reference_set.to_pandas()

    elif isinstance(reference_set, DataSetCollection):

        reference_data_frames = [x.to_pandas() for x in reference_set.data_sets]

        reference_data_frame: pandas.DataFrame = pandas.concat(
            reference_data_frames, ignore_index=True, sort=False
        )

    else:
        raise NotImplementedError

    reference_data_frame = reorder_data_frame(reference_data_frame)

    minimum_n_components = estimated_data_frame["N Components"].min()
    maximum_n_components = estimated_data_frame["N Components"].max()

    id_mappings = []

    property_headers = [x for x in estimated_data_frame if x.find(" Value ") >= 0]

    for n_components in range(minimum_n_components, maximum_n_components + 1):

        for property_header in property_headers:

            estimated_component_data = estimated_data_frame[
                estimated_data_frame["N Components"] == n_components
            ]
            reference_component_data = reference_data_frame[
                reference_data_frame["N Components"] == n_components
            ]

            estimated_component_data = estimated_component_data[
                estimated_component_data[property_header].notna()
            ].copy()
            reference_component_data = reference_component_data[
                reference_component_data[property_header].notna()
            ].copy()

            if (
                len(estimated_component_data) == 0
                or len(reference_component_data) == 0
            ):
                continue

            component_data_frames = [
                estimated_component_data,
                reference_component_data,
            ]

            comparison_columns = {"Temperature (K)", "Pressure (kPa)", "Phase"}

            for component_data in component_data_frames:

                component_data["Temperature (K)"] = component_data[
                    "Temperature (K)"
                ].round(1)
                component_data["Pressure (kPa)"] = (
                    component_data["Pressure (kPa)"].fillna(value=numpy.nan).round(1)
                )

                for index in range(n_components):

                    component_data[f"Mole Fraction {index + 1}"] = (
                        component_data[f"Mole Fraction {index + 1}"]
                        .fillna(value=0.0)
                        .round(2)
                    )
                    component_data[f"Exact Amount {index + 1}"] = component_data[
                        f"Exact Amount {index + 1}"
                    ].fillna(value=0)

                    comparison_columns.update(
                        [
                            f"Component {index + 1}",
                            f"Role {index + 1}",
                            f"Mole Fraction {index + 1}",
                            f"Exact Amount {index + 1}",
                        ]
                    )

            comparison_columns = [*comparison_columns]

            joined_frames = pandas.merge(
                estimated_component_data,
                reference_component_data,
                on=comparison_columns,
                suffixes=("_orig", "_new"),
            )

            joined_frames.drop_duplicates(subset=["Id_orig"], inplace=True)

            if len(joined_frames) != len(estimated_component_data):

                logger.warning(
                    f"{abs(len(joined_frames) - len(estimated_component_data))} "
                    f"properties could not be re-indexed."
                )

            id_mappings.append(joined_frames[["Id_orig", "Id_new"]])

    id_mappings_frame = pandas.concat(id_mappings, ignore_index=True, sort=False)

    id_mappings = {
        x["Id_orig"]: x["Id_new"] for _, x in id_mappings_frame.iterrows()
    }

    for physical_property in data_set:

        if physical_property.id not in id_mappings:
            logger.warning(f"{physical_property.id} could not be re-indexed.")
            continue

        physical_property.id = f"{id_mappings[physical_property.id]}"
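
# A self-contained sketch of the merge trick used above: joining two frames on
# the state columns with ``suffixes=("_orig", "_new")`` renames the clashing
# "Id" columns so that the joined frame directly yields an "Id_orig" ->
# "Id_new" mapping. The column names follow this module's conventions; the
# data is illustrative only.
def _example_id_merge():
    import pandas

    estimated = pandas.DataFrame(
        [{"Id": "old-1", "Temperature (K)": 298.2, "Pressure (kPa)": 101.3}]
    )
    reference = pandas.DataFrame(
        [{"Id": "new-1", "Temperature (K)": 298.2, "Pressure (kPa)": 101.3}]
    )

    joined = pandas.merge(
        estimated,
        reference,
        on=["Temperature (K)", "Pressure (kPa)"],
        suffixes=("_orig", "_new"),
    )

    mapping = {row["Id_orig"]: row["Id_new"] for _, row in joined.iterrows()}
    assert mapping == {"old-1": "new-1"}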
def _apply(
    cls, data_frame: pandas.DataFrame, schema: FilterDuplicatesSchema, n_processes
) -> pandas.DataFrame:

    if len(data_frame) == 0:
        return data_frame

    data_frame = data_frame.copy()
    data_frame = reorder_data_frame(data_frame)

    minimum_n_components = data_frame["N Components"].min()
    maximum_n_components = data_frame["N Components"].max()

    filtered_data = []

    for n_components in range(minimum_n_components, maximum_n_components + 1):

        component_data = data_frame[
            data_frame["N Components"] == n_components
        ].copy()

        component_data["Temperature (K)"] = component_data["Temperature (K)"].round(
            schema.temperature_precision
        )
        component_data["Pressure (kPa)"] = component_data["Pressure (kPa)"].round(
            schema.pressure_precision
        )

        subset_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

        for index in range(n_components):

            component_data[f"Mole Fraction {index + 1}"] = component_data[
                f"Mole Fraction {index + 1}"
            ].round(schema.mole_fraction_precision)

            subset_columns.extend(
                [
                    f"Component {index + 1}",
                    f"Role {index + 1}",
                    f"Mole Fraction {index + 1}",
                    f"Exact Amount {index + 1}",
                ]
            )

        subset_columns = [x for x in subset_columns if x in component_data]
        value_headers = [x for x in component_data if x.find(" Value ") >= 0]

        sorted_filtered_data = []

        for value_header in value_headers:

            uncertainty_header = value_header.replace("Value", "Uncertainty")

            property_data = component_data[component_data[value_header].notna()]

            if uncertainty_header in component_data:
                property_data = property_data.sort_values(uncertainty_header)

            property_data = property_data.drop_duplicates(
                subset=subset_columns, keep="last"
            )

            sorted_filtered_data.append(property_data)

        sorted_filtered_data = pandas.concat(
            sorted_filtered_data, ignore_index=True, sort=False
        )

        filtered_data.append(sorted_filtered_data)

    filtered_data = pandas.concat(filtered_data, ignore_index=True, sort=False)
    return filtered_data
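
# A minimal sketch of the de-duplication strategy above: rows are sorted on the
# uncertainty column and ``drop_duplicates(..., keep="last")`` then retains,
# for each duplicated state, the row that sorts last. The toy columns are
# illustrative stand-ins for the headers used in ``_apply``.
def _example_drop_duplicates():
    import pandas

    data = pandas.DataFrame(
        [
            {"Temperature (K)": 298.2, "Value": 1.0, "Uncertainty": 0.5},
            {"Temperature (K)": 298.2, "Value": 1.1, "Uncertainty": 0.1},
        ]
    )

    data = data.sort_values("Uncertainty")
    data = data.drop_duplicates(subset=["Temperature (K)"], keep="last")

    # After an ascending sort, "last" is the entry with the largest uncertainty.
    assert data.iloc[0]["Uncertainty"] == 0.5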
def _apply(
    cls,
    data_frame: pandas.DataFrame,
    schema: FilterByPropertyTypesSchema,
    n_processes,
) -> pandas.DataFrame:

    property_headers = [
        header for header in data_frame if header.find(" Value ") >= 0
    ]

    # Remove the columns for properties which are not of interest.
    for header in property_headers:

        property_type = header.split(" ")[0]

        if property_type in schema.property_types:
            continue

        data_frame = data_frame.drop(header, axis=1)

        uncertainty_header = header.replace(" Value ", " Uncertainty ")

        if uncertainty_header in data_frame:
            data_frame = data_frame.drop(uncertainty_header, axis=1)

    # Drop any rows which do not contain any values for the property types of
    # interest.
    property_headers = [
        header
        for header in property_headers
        if header.split(" ")[0] in schema.property_types
    ]

    data_frame = data_frame.dropna(subset=property_headers, how="all")

    # Apply a more specific filter which only retains rows that contain values
    # for the specified property types, and which were measured for the
    # specified number of components.
    for property_type, n_components in schema.n_components.items():

        property_header = next(
            iter(x for x in property_headers if x.find(f"{property_type} ") == 0),
            None,
        )

        if property_header is None:
            continue

        data_frame = data_frame[
            data_frame[property_header].isna()
            | data_frame["N Components"].isin(n_components)
        ]

    # Apply the strict filter if requested.
    if schema.strict:

        reordered_data_frame = reorder_data_frame(data_frame)

        # Build a dictionary of which properties should be present, partitioned
        # by the number of components they should have been measured for.
        property_types = defaultdict(list)

        if len(schema.n_components) > 0:

            for property_type, n_components in schema.n_components.items():

                for n_component in n_components:
                    property_types[n_component].append(property_type)

            min_n_components = min(property_types)
            max_n_components = max(property_types)

        else:

            min_n_components = reordered_data_frame["N Components"].min()
            max_n_components = reordered_data_frame["N Components"].max()

            for n_components in range(min_n_components, max_n_components + 1):
                property_types[n_components].extend(schema.property_types)

        substances_with_data = set()
        components_with_data = {}

        # For each N component, find substances which have data points for
        # all of the specified property types.
        for n_components in range(min_n_components, max_n_components + 1):

            component_data = reordered_data_frame[
                reordered_data_frame["N Components"] == n_components
            ]

            if n_components not in property_types or len(component_data) == 0:
                continue

            n_component_headers = [
                header
                for header in property_headers
                if header.split(" ")[0] in property_types[n_components]
                and header in component_data
            ]

            if len(n_component_headers) != len(property_types[n_components]):
                continue

            n_component_substances = set.intersection(
                *[
                    data_frame_to_substances(
                        component_data[component_data[header].notna()]
                    )
                    for header in n_component_headers
                ]
            )
            substances_with_data.update(n_component_substances)

            components_with_data[n_components] = {
                component
                for substance in n_component_substances
                for component in substance
            }

        if len(schema.n_components) > 0:

            components_with_all_data = set.intersection(
                *components_with_data.values()
            )

            # Filter out any smiles which don't appear in all of the N component
            # substances.
            data_frame = FilterBySmiles.apply(
                data_frame,
                FilterBySmilesSchema(smiles_to_include=[*components_with_all_data]),
            )

        # Filter out any substances which (within each N component) don't have
        # all of the specified data types.
        data_frame = FilterBySubstances.apply(
            data_frame,
            FilterBySubstancesSchema(substances_to_include=[*substances_with_data]),
        )

    data_frame = data_frame.dropna(axis=1, how="all")
    return data_frame
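
# A hedged sketch of the strict-mode set logic above: a substance is only kept
# if it appears in the substance set of *every* required property header, which
# is exactly what ``set.intersection`` over the per-header sets computes. The
# substance tuples are illustrative stand-ins for ``data_frame_to_substances``
# output.
def _example_strict_intersection():

    density_substances = {("CC",), ("CO",)}
    enthalpy_substances = {("CC",)}

    substances_with_data = set.intersection(
        density_substances, enthalpy_substances
    )

    # Only "CC" has data for both property types, so only it survives.
    assert substances_with_data == {("CC",)}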