def prob(self, comparison_vectors):
    """Compute the probabilities for each record pair.

    For each pair of records, estimate the probability of being a match.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The dataframe with comparison vectors.

    Returns
    -------
    pandas.Series
        The values returned by ``self.algorithm._expectation`` for the
        encoded comparison vectors, indexed like ``comparison_vectors``.

    """
    logging.info("Classifying - compute probabilities")

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray and works on old and new pandas alike.
    enc_vectors = self.algorithm._transform_vectors(
        comparison_vectors.values)

    return pandas.Series(
        self.algorithm._expectation(enc_vectors),
        index=comparison_vectors.index)
def __init__(self, features=None, n_jobs=1, indexing_type='label', **kwargs):
    """Initialise the comparing class.

    Parameters
    ----------
    features : list, optional
        Features passed to ``self.add``. Default: no features.
    n_jobs : int
        Stored as ``self.n_jobs``. Default 1.
    indexing_type : str
        Stored as ``self.indexing_type``. Default 'label'.
    **kwargs
        Accepted but not used by this method.

    Note
    ----
    Passing a pandas.Index or pandas.MultiIndex as first argument (the
    older Compare API) issues a VisibleDeprecationWarning; the index is
    not registered as a feature.
    """
    logging.info("comparing - initialize {} class".format(
        self.__class__.__name__))

    legacy_call = isinstance(features, (pandas.MultiIndex, pandas.Index))

    # public
    self.n_jobs = n_jobs
    self.indexing_type = indexing_type  # label of position
    self.features = []

    # logging
    self._i = 1
    self._i_max = None
    self._n = []
    self._eta = []
    self._output_log_total = True

    # private
    self._compare_functions = []

    if legacy_call:
        warnings.warn(
            "It seems you are using the older version of the Compare API, "
            "see the documentation about how to update to the new API. "
            "http://recordlinkage.readthedocs.io/"
            "en/latest/ref-compare.html",
            VisibleDeprecationWarning)
    else:
        # Register the features last. Previously ``self.features`` was
        # reset to [] *after* ``self.add(features)``, which silently
        # discarded every feature passed to the constructor.
        self.add([] if features is None else features)
def predict(self, comparison_vectors, return_type='index'):
    """Predict the class of the record pairs.

    Classify a set of record pairs based on their comparison vectors into
    matches, non-matches and possible matches. The classifier has to be
    trained to call this method.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        Dataframe with comparison vectors.
    return_type : 'index' (default), 'series', 'array'
        The format to return the classification result. 'index' returns
        the pandas.MultiIndex of the matches, 'series' a pandas.Series
        with zeros (distinct) and ones (matches), and 'array' a
        numpy.ndarray with zeros and ones.

    Returns
    -------
    object
        Whatever ``self._predict`` returns for the given arguments.

    """
    logging.info("Classifying - predict matches and non-matches")

    # The actual classification is delegated to the private hook.
    outcome = self._predict(comparison_vectors, return_type)
    return outcome
def __init__(self, algorithms=None):
    """Initialise the index class.

    Parameters
    ----------
    algorithms : list, optional
        Indexing algorithms passed to ``self.add``. Default: none.
    """
    logging.info("Index - initialize {} class".format(
        self.__class__.__name__))

    self.algorithms = []

    # Use a fresh list per call: a mutable default argument would be
    # shared between all instances created without arguments.
    self.add([] if algorithms is None else algorithms)
def fit(self, comparison_vectors, match_index=None):
    """Train the classifier.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The comparison vectors (or features) to train the model with.
    match_index : pandas.MultiIndex
        A pandas.MultiIndex object with the true matches.
        The MultiIndex contains only the true matches. Default None,
        in which case ``self._fit`` is called without labels.

    Raises
    ------
    LearningError
        If assigning the labels fails and none of the entries in
        ``match_index`` occurs in ``comparison_vectors.index``.
    ValueError
        If ``match_index`` is neither None nor a pandas index.

    Note
    ----
    A note in case of finding links within a single dataset (for example
    deduplication). Ensure that the training record pairs are from the
    lower triangular part of the dataset/matrix.

    """
    logging.info("Classification - start training {}".format(
        self.__class__.__name__))

    self._initialise_classifier(comparison_vectors)

    # start timing
    start_time = time.time()

    if isinstance(match_index, (pandas.MultiIndex, pandas.Index)):

        # ``Index.intersection`` replaces the ``&`` operator, which no
        # longer performs a set intersection in recent pandas versions.
        known_matches = match_index.intersection(comparison_vectors.index)

        try:
            y = pandas.Series(0, index=comparison_vectors.index)
            y.loc[known_matches] = 1
        except (IndexError, KeyError):
            # ``pandas.IndexError`` does not exist; catch the builtin
            # lookup errors instead.
            # There are no matches, so training is not possible.
            if len(known_matches) == 0:
                raise LearningError(
                    "both matches and non-matches needed in the "
                    "trainingsdata, only non-matches found")
            raise

        self._fit(comparison_vectors.values, y.values)

    elif match_index is None:
        self._fit(comparison_vectors.values)
    else:
        raise ValueError(
            "'match_index' has incorrect type '{}'".format(
                type(match_index)))

    # log timing
    logf_time = "Classification - training computation time: ~{:.2f}s"
    logging.info(logf_time.format(time.time() - start_time))
def prob(self, comparison_vectors, return_type='series'):
    """Compute the probabilities for each record pair.

    For each pair of records, estimate the probability of being a match.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The dataframe with comparison vectors.
    return_type : 'series' or 'array'
        Return a pandas series or numpy array. Default 'series'.

    Returns
    -------
    pandas.Series or numpy.ndarray
        The first column of ``self.classifier.predict_proba`` for each
        record pair.

    Raises
    ------
    ValueError
        If ``return_type`` is not 'series' or 'array'.

    """
    logging.info("Classifying - compute probabilities")

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray.
    probs = self.classifier.predict_proba(comparison_vectors.values)

    # NOTE(review): column 0 is reported as the match probability;
    # confirm that the classifier's first class is the match class.
    if return_type == 'series':
        return pandas.Series(probs[:, 0], index=comparison_vectors.index)
    elif return_type == 'array':
        return probs[:, 0]
    else:
        # 'index' is not a supported format here, so it is no longer
        # offered in the message.
        raise ValueError(
            "return_type {} unknown. Choose 'series' or "
            "'array'".format(return_type))
def predict(self, comparison_vectors):
    """Predict the class of the record pairs.

    Classify a set of record pairs based on their comparison vectors into
    matches, non-matches and possible matches. The classifier has to be
    trained to call this method.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        Dataframe with comparison vectors.

    Returns
    -------
    object
        The prediction as formatted by ``self._return_result``.

    Note
    ----
    The former ``return_type`` argument is not accepted by this method.
    Use the option
    `recordlinkage.set_option('classification.return_type', 'index')`
    instead.

    """
    logging.info("Classification - predict matches and non-matches")

    # make the prediction (``as_matrix()`` was removed in pandas 1.0,
    # ``.values`` returns the same ndarray)
    prediction = self._predict(comparison_vectors.values)
    self._post_predict(prediction)

    # format and return the result
    return self._return_result(prediction, comparison_vectors)
def prob(self, comparison_vectors, return_type=None):
    """Compute the probabilities for each record pair.

    For each pair of records, estimate the probability of being a match.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The dataframe with comparison vectors.
    return_type : str
        Deprecated. Any value other than None only triggers a
        VisibleDeprecationWarning; the result is always a series.

    Returns
    -------
    pandas.Series
        The values returned by ``self._prob_match`` for each record
        pair, indexed like ``comparison_vectors``.

    """
    # deprecation
    if return_type is not None:
        warnings.warn(
            "The argument 'return_type' is removed. "
            "Default value is now 'series'.",
            VisibleDeprecationWarning, stacklevel=2)

    logging.info("Classification - compute probabilities")

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray.
    prob_match = self._prob_match(comparison_vectors.values)

    return pandas.Series(prob_match, index=comparison_vectors.index)
def exact(self, s1, s2, *args, **kwargs):
    """
    exact(s1, s2, agree_value=1, disagree_value=0, missing_value=0, label=None)

    Compare the record pairs exactly.

    Registers the exact similarity measurement between values by handing
    ``_compare_exact`` to ``self._compare_vectorized``. The similarity is
    1 in case of agreement and 0 otherwise.

    Parameters
    ----------
    s1 : str or int
        Field name to compare in left DataFrame.
    s2 : str or int
        Field name to compare in right DataFrame.
    agree_value : float, str, numpy.dtype
        The value when two records are identical. Default 1. If 'values'
        is passed, then the value of the record pair is passed.
    disagree_value : float, str, numpy.dtype
        The value when two records are not identical.
    missing_value : float, str, numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : label
        The label of the column in the resulting dataframe.

    Returns
    -------
    object
        The return value of ``self._compare_vectorized``.

    """
    # logging
    message = ("Comparing - initialize exact algorithm - compare {l_left} "
               "with {l_right}")
    logging.info(message.format(l_left=s1, l_right=s2))

    # All extra positional and keyword arguments are forwarded untouched.
    return self._compare_vectorized(_compare_exact, s1, s2, *args, **kwargs)
def __init__(self):
    """Initialise the classifier wrapper without an underlying model."""
    class_name = self.__class__.__name__
    logging.info("Classification - initialize {} class".format(class_name))

    # The actual classifier. Maybe this is slightly strange because of
    # inheritance.
    self.classifier = None
def learn(self, comparison_vectors, match_index, return_type='index'):
    """Train the classifier.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The comparison vectors.
    match_index : pandas.MultiIndex
        The true matches.
    return_type : 'index' (default), 'series', 'array'
        The format to return the classification result. The argument value
        'index' will return the pandas.MultiIndex of the matches. The
        argument value 'series' will return a pandas.Series with zeros
        (distinct) and ones (matches). The argument value 'array' will
        return a numpy.ndarray with zeros and ones.

    Returns
    -------
    object
        The result of ``self._predict`` on the training vectors.

    Raises
    ------
    ValueError
        If ``match_index`` is not a pandas index.
    LearningError
        If assigning the labels fails and none of the entries in
        ``match_index`` occurs in ``comparison_vectors.index``.

    """
    logging.info("Classifying - start learning {}".format(
        self.__class__.__name__))

    # start timing
    start_time = time.time()

    # Fail with a clear message; previously any other type ended in a
    # NameError because the training labels were never created.
    if not isinstance(match_index, (pandas.MultiIndex, pandas.Index)):
        raise ValueError(
            "'match_index' has incorrect type '{}'".format(
                type(match_index)))

    # The match_index variable is of type MultiIndex
    train_series = pandas.Series(False, index=comparison_vectors.index)

    # ``Index.intersection`` replaces the ``&`` operator, which no longer
    # performs a set intersection in recent pandas versions.
    known_matches = match_index.intersection(comparison_vectors.index)

    try:
        train_series.loc[known_matches] = True
    except (IndexError, KeyError):
        # ``pandas.IndexError`` does not exist; catch the builtin lookup
        # errors instead.
        # There are no matches, so training is not possible.
        if len(known_matches) == 0:
            raise LearningError(
                "both matches and non-matches needed in the "
                "trainingsdata, only non-matches found")
        raise

    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` is equivalent.
    self.classifier.fit(
        comparison_vectors.values, numpy.array(train_series))

    result = self._predict(comparison_vectors, return_type)

    # log timing
    logf_time = "Classifying - learning computation time: ~{:.2f}s"
    logging.info(logf_time.format(time.time() - start_time))

    return result
def _compute(self, pairs, x, x_link=None):
    """Compute all registered features for the given record pairs.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare.
    x : pandas.DataFrame
        The frame indexed by level 0 of ``pairs``; also used for level 1
        when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        The frame indexed by level 1 of ``pairs``.

    Returns
    -------
    object
        The result of ``self._union`` over all computed features.
    """
    # start the timer for the comparing step
    t_start = time.time()

    left_labels = self._get_labels_left(validate=x)
    left_frame = frame_indexing(x[left_labels], pairs, 0)

    # Without a second frame the right-hand side is taken from ``x``.
    right_source = x if x_link is None else x_link
    right_labels = self._get_labels_right(validate=right_source)
    right_frame = frame_indexing(right_source[right_labels], pairs, 1)

    computed = []
    for feat in self.features:
        labels_l = feat.labels_left
        labels_r = feat.labels_right

        series_l = tuple(left_frame[lbl] for lbl in listify(labels_l))
        series_r = tuple(right_frame[lbl] for lbl in listify(labels_r))

        computed.append((feat._compute(series_l, series_r), feat.label))

    features = self._union(computed, pairs)

    # log timing
    n = pairs.shape[0]
    i_max = self._i_max if self._i_max is not None else '?'
    eta = time.time() - t_start
    self._eta.append(eta)
    self._n.append(n)

    # log
    logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
        self._i, i_max, eta, n))

    # log total
    if self._output_log_total:
        pairs_so_far = np.sum(self._n)
        time_so_far = np.sum(self._eta)
        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, time_so_far, pairs_so_far))

    self._i += 1

    return features
def __init__(self, verify_integrity=True):
    """Initialise the indexing base class.

    Parameters
    ----------
    verify_integrity : bool
        Stored as ``self.verify_integrity``. Default True.
    """
    super(BaseIndexator, self).__init__()

    self.verify_integrity = verify_integrity

    # per-call bookkeeping: number of pairs and maximum number of pairs
    self._n, self._n_max = [], []

    class_name = self.__class__.__name__
    logging.info("Indexing - initialize {} class".format(class_name))
def __init__(self, verify_integrity=True, suffixes=('_1', '_2')):
    """Initialise the indexing algorithm base class.

    Parameters
    ----------
    verify_integrity : bool
        Stored as ``self.verify_integrity``. Default True.
    suffixes : tuple
        Stored as ``self.suffixes``. Default ('_1', '_2').
    """
    super(BaseIndexAlgorithm, self).__init__()

    self.verify_integrity = verify_integrity
    self.suffixes = suffixes

    # per-call bookkeeping: number of pairs and maximum number of pairs
    self._n, self._n_max = [], []

    class_name = self.__class__.__name__
    logging.info("Indexing - initialize {} class".format(class_name))
def learn(self, comparison_vectors, init='jaro', return_type='index'):
    """ Train the algorithm.

    Train the Expectation-Maximisation classifier. This method is well-
    known as the ECM-algorithm implementation in the context of record
    linkage.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The dataframe with comparison vectors.
    init : str
        Accepted for compatibility; not used by this method.
        Default 'jaro'.
    return_type : 'index' (default), 'series', 'array'
        The format to return the classification result. The argument value
        'index' will return the pandas.MultiIndex of the matches. The
        argument value 'series' will return a pandas.Series with zeros
        (distinct) and ones (matches). The argument value 'array' will
        return a numpy.ndarray with zeros and ones.

    Returns
    -------
    object
        The prediction on the training vectors as formatted by
        ``self._return_result``.

    """
    logging.info("Classifying - start learning {}".format(
        self.__class__.__name__))

    # start timing
    start_time = time.time()

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray.
    probs = self.algorithm.train(comparison_vectors.values)

    # The threshold is the probability of the n-th most likely pair,
    # where n is the expected number of matches.
    # NOTE(review): when n_matches is 0 the index below equals
    # len(probs) and raises IndexError -- confirm the intended behaviour.
    n_matches = int(self.algorithm.p * len(probs))
    self.p_threshold = numpy.sort(probs)[len(probs) - n_matches]

    prediction = self._decision_rule(probs, self.p_threshold)

    result = self._return_result(
        prediction, return_type, comparison_vectors)

    # log timing
    logf_time = "Classifying - learning computation time: ~{:.2f}s"
    logging.info(logf_time.format(time.time() - start_time))

    return result
def __init__(self, labels_left, labels_right, args=(), kwargs=None):
    """Store the labels and extra arguments of a comparison feature.

    Parameters
    ----------
    labels_left : label or list of labels
        Stored as ``self.labels_left``.
    labels_right : label or list of labels
        Stored as ``self.labels_right``.
    args : tuple
        Stored as ``self.args``. Default ().
    kwargs : dict, optional
        Stored as ``self.kwargs``. Default: a new empty dict.
    """
    self.labels_left = labels_left
    self.labels_right = labels_right
    self.args = args

    # A fresh dict per instance: a ``{}`` default would be one object
    # shared (and mutated) by every instance created without kwargs.
    self.kwargs = {} if kwargs is None else kwargs
    self._f_compare_vectorized = None

    # logging
    logging.info("{} - initialize exact algorithm "
                 "- compare {l_left} with {l_right}".format(
                     self.__class__.__name__,
                     l_left=labels_left,
                     l_right=labels_right))
def __init__(self, algorithms=None):
    """Initialise the index class and its logging bookkeeping.

    Parameters
    ----------
    algorithms : list, optional
        Indexing algorithms passed to ``self.add``. Default: none.
    """
    logging.info("indexing - initialize {} class".format(
        self.__class__.__name__))

    self.algorithms = []

    # Use a fresh list per call: a mutable default argument would be
    # shared between all instances created without arguments.
    self.add([] if algorithms is None else algorithms)

    # logging
    self._i = 1
    self._i_max = None
    self._n = []
    self._n_max = []
    self._eta = []
    self._output_log_total = True
def predict(self, comparison_vectors, return_type='index',
            *args, **kwargs):
    """Predict the class of record pairs.

    Classify a set of record pairs based on their comparison vectors into
    matches, non-matches and possible matches. The classifier has to be
    trained to call this method.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The dataframe with comparison vectors.
    return_type : 'index' (default), 'series', 'array'
        The format to return the classification result. The argument value
        'index' will return the pandas.MultiIndex of the matches. The
        argument value 'series' will return a pandas.Series with zeros
        (distinct) and ones (matches). The argument value 'array' will
        return a numpy.ndarray with zeros and ones.
    *args, **kwargs
        Accepted but not used by this method.

    Returns
    -------
    object
        The prediction as formatted by ``self._return_result``.

    Note
    ----
    Prediction is risky for this unsupervised learning method. Be aware
    that the sample from the population is valid.

    """
    logging.info("Classifying - predict matches and non-matches")

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # returns the same ndarray.
    enc_vectors = self.algorithm._transform_vectors(
        comparison_vectors.values)

    probs = self.algorithm._expectation(enc_vectors)

    # ``self.p_threshold`` is read here; it is not set by this method.
    prediction = self._decision_rule(probs, self.p_threshold)

    return self._return_result(prediction, return_type, comparison_vectors)
def __init__(self, pairs=None, df_a=None, df_b=None, low_memory=False,
             block_size=1000000, njobs=1, indexing_type='label', **kwargs):
    """Initialise the comparing class (supports the deprecated API).

    Parameters
    ----------
    pairs : pandas.MultiIndex, optional
        Deprecated. When a pandas index is passed, the instance is flagged
        as deprecated and a VisibleDeprecationWarning is issued.
    df_a : pandas.DataFrame, optional
        Deprecated. Stored as ``self.df_a``.
    df_b : pandas.DataFrame, optional
        Deprecated. Stored as ``self.df_b``; falls back to ``df_a``.
    low_memory : bool
        Deprecated. Stored as ``self.low_memory``.
    block_size : int
        Deprecated. Stored as ``self.block_size``.
    njobs : int
        Deprecated. Stored as ``self.njobs``.
    indexing_type : str
        Stored as ``self.indexing_type``. Default 'label'.
    **kwargs
        Accepted but not used by this method.
    """
    class_name = self.__class__.__name__
    logging.info("Comparing - initialize {} class".format(class_name))

    # public
    self.indexing_type = indexing_type  # label of position

    # private
    self._compare_functions = []

    # An index as first argument means the old API is being used.
    self.deprecated = isinstance(pairs, (pandas.MultiIndex, pandas.Index))
    if self.deprecated:
        warnings.warn(
            "It seems you are using the older version of the Compare API, "
            "see the documentation about how to update to the new API. "
            "http://recordlinkage.readthedocs.io/"
            "en/latest/ref-compare.html",
            VisibleDeprecationWarning)

    # deprecated
    self.df_a = df_a
    if df_b is None:
        self.df_b = df_a
    else:
        self.df_b = df_b

    self.pairs = pairs

    self.low_memory = low_memory
    self.block_size = block_size
    self.njobs = njobs

    self._df_a_indexed = None
    self._df_b_indexed = None

    self.vectors = pandas.DataFrame(index=pairs)
def date(self, s1, s2, swap_month_day=0.5, swap_months='default',
         *args, **kwargs):
    """
    date(self, s1, s2, swap_month_day=0.5, swap_months='default', missing_value=0, label=None)

    Compute the (partial) similarity between date values.

    Parameters
    ----------
    s1 : str or int
        The name or position of the column in the left DataFrame.
    s2 : str or int
        The name or position of the column in the right DataFrame.
    swap_month_day : float
        The value if the month and day are swapped.
    swap_months : list of tuples
        A list of tuples with common errors caused by the translating of
        months into numbers, i.e. October is month 10. The format of the
        tuples is (month_good, month_bad, value). Default : swap_months =
        [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)]
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : label
        The label of the column in the resulting dataframe.

    Returns
    -------
    object
        The return value of ``self._compare_vectorized``.

    """
    # logging
    message = ("Comparing - initialize date algorithm - compare {l_left} "
               "with {l_right}")
    logging.info(message.format(l_left=s1, l_right=s2))

    # Extra positional arguments follow s1 and s2; the two swap options
    # are always passed by keyword.
    return self._compare_vectorized(
        _dates_internal, s1, s2, *args,
        swap_month_day=swap_month_day,
        swap_months=swap_months,
        **kwargs)
def compare_vectorized(self, comp_func, labels_left, labels_right,
                       *args, **kwargs):
    """Compute the similarity between values with a callable.

    This method initialises the comparing of values with a custom
    function/callable. The function/callable should accept
    numpy.ndarray's.

    Example
    -------

    >>> comp = recordlinkage.Compare()
    >>> comp.compare_vectorized(custom_callable, 'first_name', 'name')
    >>> comp.compare(PAIRS, DATAFRAME1, DATAFRAME2)

    Parameters
    ----------
    comp_func : function
        A comparison function. This function can be a built-in function
        or a user defined comparison function. The function should accept
        numpy.ndarray's as first two arguments.
    labels_left : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    labels_right : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    *args :
        Additional arguments to pass to callable comp_func.
    **kwargs :
        Additional keyword arguments to pass to callable comp_func.
        (keyword 'label' is reserved.)
    label : (list of) label(s)
        The name of the feature and the name of the column. IMPORTANT:
        This argument is a keyword argument.

    Returns
    -------
    object
        The return value of ``self._compare_vectorized``.

    """
    message = ("Comparing - initialize user defined function - "
               "compare {l_left} with {l_right}").format(
                   l_left=labels_left, l_right=labels_right)
    logging.info(message)

    # Thin public wrapper: everything is forwarded to the private method.
    return self._compare_vectorized(
        comp_func, labels_left, labels_right, *args, **kwargs)
def __init__(self, comp_func, labels_left, labels_right, args=(),
             kwargs=None, label=None, name="", description=""):
    """Store a comparison function together with its configuration.

    Parameters
    ----------
    comp_func : callable
        Stored as ``self.comp_func``.
    labels_left : label or list of labels
        Stored as ``self.labels_left``.
    labels_right : label or list of labels
        Stored as ``self.labels_right``.
    args : tuple
        Stored as ``self.args``. Default ().
    kwargs : dict, optional
        Stored as ``self.kwargs``. Default: a new empty dict.
    label : label, optional
        Stored as ``self.label``.
    name : str
        Accepted but not stored by this method.
    description : str
        Stored as ``self.description``.
    """
    self.comp_func = comp_func
    self.labels_left = labels_left
    self.labels_right = labels_right
    self.args = args

    # A fresh dict per instance: a ``{}`` default would be one object
    # shared (and mutated) by every instance created without kwargs.
    self.kwargs = {} if kwargs is None else kwargs
    self.label = label
    self.description = description

    # logging
    logging.info("CompareFeature - initialize exact algorithm - compare "
                 "{l_left} with {l_right}".format(l_left=labels_left,
                                                  l_right=labels_right))
def _compute(self, *args):
    """Run ``self._compute_vectorized`` and log timing and result shape.

    Parameters
    ----------
    *args
        Forwarded unchanged to ``self._compute_vectorized``.

    Returns
    -------
    object
        The result of ``self._compute_vectorized``; its ``shape``
        attribute is logged.
    """
    logging.info("Comparing - start comparing data")

    # time the comparing step
    t_start = time.time()
    result = self._compute_vectorized(*args)
    elapsed = time.time() - t_start

    # log timing
    logging.info("Comparing - computation time: ~{:.2f}s".format(elapsed))

    # log results
    logging.info("Comparing - summary shape={}".format(result.shape))

    return result
def _compute(self, pairs, x, x_link=None):
    """Compute all registered features and collect them in a DataFrame.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare; used as index of the result.
    x : pandas.DataFrame
        The frame indexed by level 0 of ``pairs``; also used for level 1
        when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        The frame indexed by level 1 of ``pairs``.

    Returns
    -------
    pandas.DataFrame
        One column per feature column. Columns are named after the
        feature label(s) when given, otherwise numbered consecutively.
    """
    logging.info("Comparing - start comparing data")

    # start the timer for the comparing step
    start_time = time.time()

    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

    # log timing
    index_time = time.time() - start_time

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # make a label if label is None

    for feat, label in self.features:

        lbl1 = feat.labels_left
        lbl2 = feat.labels_right

        data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
        data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

        c = feat._compute(*(data1 + data2))

        if is_pandas_like(c):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        is_1d = len(c.shape) == 1
        n_cols = 1 if is_1d else c.shape[1]

        for i in range(n_cols):
            label_val = label[i] if label is not None else label_num
            label_num += 1

            # Store every column under its own label. Assigning the whole
            # 2-D array to a single label breaks multi-column features.
            results[label_val] = c if is_1d else c[:, i]

    # log timing
    total_time = time.time() - start_time

    # log timing
    logging.info("Comparing - computation time: ~{:.2f}s (from which "
                 "indexing: ~{:.2f}s)".format(total_time, index_time))

    # log results
    logf_result = "Comparing - summary shape={}"
    logging.info(logf_result.format(results.shape))

    return results
def numeric(self, s1, s2, method='linear', *args, **kwargs):
    """
    numeric(s1, s2, method='linear', offset, scale, origin=0, missing_value=0, label=None)

    Compute the (partial) similarity between numeric values.

    This method initialises the similarity measurement between numeric
    values. The implemented algorithms are: 'step', 'linear', 'exp',
    'gauss' or 'squared'. In case of agreement, the similarity is 1 and in
    case of complete disagreement it is 0. The implementation is similar
    with numeric comparing in ElasticSearch, a full-text search tool. The
    parameters are explained in the image below (source ElasticSearch, The
    Definitive Guide)

    .. image:: /images/elas_1705.png
        :width: 100%
        :target: https://www.elastic.co/guide/en/elasticsearch/guide/current/decay-functions.html
        :alt: Decay functions, like in ElasticSearch

    Parameters
    ----------
    s1 : str or int
        The name or position of the column in the left DataFrame.
    s2 : str or int
        The name or position of the column in the right DataFrame.
    method : str
        The metric used. Options 'step', 'linear' (or 'lin'), 'exp' (or
        'exponential'), 'gauss' (or 'gaussian') or 'squared'. Default
        'linear'.
    offset : float
        The offset. See image above.
    scale : float
        The scale of the numeric comparison method. See the image above.
        This argument is not available for the 'step' algorithm.
    origin : str
        The shift of bias between the values. See image above.
    missing_value : numpy.dtype
        The value if one or both records have a missing value on the
        compared field. Default 0.
    label : label
        The label of the column in the resulting dataframe.

    Raises
    ------
    ValueError
        If ``method`` is not one of the known algorithm names.

    Note
    ----
    Numeric comparing can be an efficient way to compare date/time
    variables. This can be done by comparing the timestamps.

    """
    # Accepted spellings per algorithm, checked in this order.
    known_methods = (
        (('step',), _step_sim),
        (('linear', 'lin'), _linear_sim),
        (('squared',), _squared_sim),
        (('exp', 'exponential'), _exp_sim),
        (('gauss', 'gaussian'), _gauss_sim),
    )

    for aliases, candidate in known_methods:
        if method in aliases:
            num_sim_alg = candidate
            break
    else:
        raise ValueError("The algorithm '{}' is not known.".format(method))

    # logging
    message = ("Comparing - initialize numeric '{method}' algorithm - "
               "compare {l_left} with {l_right}")
    logging.info(message.format(l_left=s1, l_right=s2, method=method))

    return self._compare_vectorized(
        _num_internal, s1, s2, num_sim_alg, *args, **kwargs)
def geo(self, lat1, lng1, lat2, lng2, method='linear', *args, **kwargs):
    """
    geo(lat1, lng1, lat2, lng2, method='linear', offset, scale, origin=0, missing_value=0, label=None)

    Compute the (partial) similarity between WGS84 coordinate values.

    Compare the geometric (haversine) distance between two WGS-
    coordinates. The similarity algorithms are 'step', 'linear', 'exp',
    'gauss' or 'squared'. The similarity functions are the same as in
    :meth:`recordlinkage.comparing.Compare.numeric`

    Parameters
    ----------
    lat1 : str or int
        The name or position of the column in the left DataFrame.
    lng1 : str or int
        The name or position of the column in the left DataFrame.
    lat2 : str or int
        The name or position of the column in the right DataFrame.
    lng2 : str or int
        The name or position of the column in the right DataFrame.
    method : str
        The metric used. Options 'step', 'linear' (or 'lin'), 'exp' (or
        'exponential'), 'gauss' (or 'gaussian') or 'squared'. Default
        'linear'.
    offset : float
        The offset. See Compare.numeric.
    scale : float
        The scale of the numeric comparison method. See Compare.numeric.
        This argument is not available for the 'step' algorithm.
    origin : float
        The shift of bias between the values. See Compare.numeric.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : label
        The label of the column in the resulting dataframe.

    Raises
    ------
    ValueError
        If ``method`` is not one of the known algorithm names.

    """
    # Accepted spellings per algorithm, checked in this order.
    known_methods = (
        (('step',), _step_sim),
        (('linear', 'lin'), _linear_sim),
        (('squared',), _squared_sim),
        (('exp', 'exponential'), _exp_sim),
        (('gauss', 'gaussian'), _gauss_sim),
    )

    for aliases, candidate in known_methods:
        if method in aliases:
            num_sim_alg = candidate
            break
    else:
        raise ValueError("The algorithm '{}' is not known.".format(method))

    # Latitude/longitude are handed over as one pair per side.
    left_coords = (lat1, lng1)
    right_coords = (lat2, lng2)

    # logging
    message = ("Comparing - initialize geographic '{method}' "
               "algorithm - compare {l_left} with {l_right}")
    logging.info(message.format(
        l_left=left_coords, l_right=right_coords, method=method))

    return self._compare_vectorized(
        _geo_internal, left_coords, right_coords, num_sim_alg,
        *args, **kwargs)
def compute(self, pairs, x, x_link=None):
    """Compare the records of each record pair.

    Calling this method starts the comparing of records.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        A pandas MultiIndex with the record pairs to compare. The indices
        in the MultiIndex are indices of the DataFrame(s) to link.
    x : pandas.DataFrame
        The DataFrame to link. If `x_link` is given, the comparing is a
        linking problem. If `x_link` is not given, the problem is one of
        deduplication.
    x_link : pandas.DataFrame, optional
        The second DataFrame.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame with feature vectors, i.e. the result of
        comparing each record pair. Columns are named after the given
        label(s), otherwise numbered consecutively.

    Raises
    ------
    ValueError
        If ``pairs`` is not a pandas.MultiIndex or ``x``/``x_link`` is
        not a pandas.DataFrame.

    """
    if not isinstance(pairs, pandas.MultiIndex):
        raise ValueError(
            "expected pandas.MultiIndex with record pair indices "
            "as first argument")

    if not isinstance(x, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as second argument")

    if x_link is not None and not isinstance(x_link, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as third argument")

    logging.info("Comparing - start comparing data")

    # start the timer for the comparing step
    start_time = time.time()

    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = self._loc2(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = self._loc2(x_link[sublabels_right], pairs, 1)

    # log timing
    index_time = time.time() - start_time

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # make a label if label is None

    for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

        data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
        data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

        c = f(*tuple(data1 + data2 + args), **kwargs)

        if isinstance(c, (pandas.Series, pandas.DataFrame)):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        is_1d = len(c.shape) == 1
        n_cols = 1 if is_1d else c.shape[1]

        for i in range(n_cols):
            label_val = label[i] if label is not None else label_num
            label_num += 1

            # Store every column under its own label. Assigning the whole
            # 2-D array to a single label breaks multi-column results.
            results[label_val] = c if is_1d else c[:, i]

    # log timing
    total_time = time.time() - start_time

    # log timing
    logging.info("Comparing - computation time: ~{:.2f}s (from which "
                 "indexing: ~{:.2f}s)".format(total_time, index_time))

    # log results
    logf_result = "Comparing - summary shape={}"
    logging.info(logf_result.format(results.shape))

    return results
def index(self, x, x_link=None):
    """Make an index of record pairs.

    Use a custom function to make record pairs of one or two dataframes.
    Each function should return a pandas.MultiIndex with record pairs.

    Parameters
    ----------
    x: pandas.DataFrame
        A pandas DataFrame. When `x_link` is None, the algorithm makes
        record pairs within the DataFrame. When `x_link` is not empty,
        the algorithm makes pairs between `x` and `x_link`.
    x_link: pandas.DataFrame, optional
        A second DataFrame to link with the DataFrame x.

    Returns
    -------
    pandas.MultiIndex
        A pandas.MultiIndex with record pairs. Each record pair contains
        the index labels of two records.

    Raises
    ------
    ValueError
        If ``x`` is None.

    """
    # Normalise the input to a tuple of one (dedup) or two (link) frames.
    if x is None:
        raise ValueError("provide at least one dataframe")

    if x_link is not None:  # linking (two arg)
        x = (x, x_link)
    elif isinstance(x, (list, tuple)):  # dedup or linking (single arg)
        x = tuple(x)
    else:  # dedup (single arg)
        x = (x, )

    if self.verify_integrity:
        for frame in x:
            self._verify_integrety(frame)

    # start timing
    t_start = time.time()

    if self._deduplication(x):
        # deduplication
        logging.info("Indexing - start indexing single DataFrame")
        pairs = self._dedup_index(*x)
        names = self._make_index_names(x[0].index.name, x[0].index.name)
    else:
        # linking
        logging.info("Indexing - start indexing two DataFrames")
        pairs = self._link_index(*x)
        names = self._make_index_names(x[0].index.name, x[1].index.name)

    pairs.rename(names, inplace=True)

    # store the number of pairs
    self._n.append(pairs.shape[0])
    self._n_max.append(max_pairs(x))

    # summary: reduction ratio of this call and over all calls so far
    n = len(pairs)
    rr = 1 - self._n[-1] / self._n_max[-1]
    rr_avg = 1 - np.sum(self._n) / np.sum(self._n_max)

    # log timing
    elapsed = time.time() - t_start
    logging.info("Indexing - computation time: ~{:.2f}s".format(elapsed))

    # log results
    summary = ("Indexing - summary n={:d}, "
               "reduction_ratio={:0.5f}, reduction_ratio_mean={:0.5f}")
    logging.info(summary.format(n, rr, rr_avg))

    return pairs
def _compute(self, pairs, x, x_link=None):
    """Compute all registered features for the given record pairs.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare.
    x : pandas.DataFrame
        The frame indexed by level 0 of ``pairs``; also used for level 1
        when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        The frame indexed by level 1 of ``pairs``.

    Returns
    -------
    object
        The result of ``self._union`` over all computed features.
    """

    def _select(frame, labels):
        # None: no data passed to func
        if labels is None:
            return tuple()
        # empty array: empty df with index passed to func
        if labels == []:
            return (frame[[]], )
        # else: subset columns and pass tuple of series
        return tuple([frame[lbl] for lbl in listify(labels)])

    # start the timer for the comparing step
    t_start = time.time()

    left_labels = self._get_labels_left(validate=x)
    left_frame = frame_indexing(x[left_labels], pairs, 0)

    # Without a second frame the right-hand side is taken from ``x``.
    right_source = x if x_link is None else x_link
    right_labels = self._get_labels_right(validate=right_source)
    right_frame = frame_indexing(right_source[right_labels], pairs, 1)

    computed = []
    for feat in self.features:
        data1 = _select(left_frame, feat.labels_left)
        data2 = _select(right_frame, feat.labels_right)

        computed.append((feat._compute(data1, data2), feat.label))

    features = self._union(computed, pairs)

    # log timing
    n = pairs.shape[0]
    i_max = self._i_max if self._i_max is not None else '?'
    eta = time.time() - t_start
    self._eta.append(eta)
    self._n.append(n)

    # log
    logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
        self._i, i_max, eta, n))

    # log total
    if self._output_log_total:
        pairs_so_far = np.sum(self._n)
        time_so_far = np.sum(self._eta)
        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, time_so_far, pairs_so_far))

    self._i += 1

    return features
def index(self, x, x_link=None):
    """Make an index of record pairs.

    The pairs produced by every algorithm in ``self.algorithms`` are
    combined with a set union.

    Parameters
    ----------
    x: pandas.DataFrame
        A pandas DataFrame. When `x_link` is None, the algorithm makes
        record pairs within the DataFrame. When `x_link` is not empty,
        the algorithm makes pairs between `x` and `x_link`.
    x_link: pandas.DataFrame, optional
        A second DataFrame to link with the DataFrame x.

    Returns
    -------
    pandas.MultiIndex
        A pandas.MultiIndex with record pairs. Each record pair contains
        the index labels of two records.

    Raises
    ------
    ValueError
        If no algorithms are registered.

    """
    if not self.algorithms:
        raise ValueError("No algorithms given.")

    # start timing
    t_start = time.time()

    pairs = None
    for algorithm in self.algorithms:
        candidate = algorithm.index(x, x_link)
        pairs = candidate if pairs is None else pairs.union(candidate)

    # maximum number of pairs for linking or deduplication
    n_max = max_pairs(x) if x_link is None else max_pairs((x, x_link))

    # store the number of pairs
    n = pairs.shape[0]
    eta = time.time() - t_start
    rr = 1 - n / n_max
    i_max = self._i_max if self._i_max is not None else '?'

    self._eta.append(eta)
    self._n.append(n)
    self._n_max.append(n_max)

    # log
    call_message = ("indexing [{:d}/{}] - time: {:.2f}s - "
                    "pairs: {:d}/{:d} - rr: {:0.5f}")
    logging.info(call_message.format(self._i, i_max, eta, n, n_max, rr))

    # log total
    if self._output_log_total:

        n_total = np.sum(self._n)
        n_max_total = np.sum(self._n_max)
        rr_avg = 1 - n_total / n_max_total
        eta_total = np.sum(self._eta)

        total_message = ("indexing [{:d}/{}] - time: {:.2f}s - "
                         "pairs_total: {:d}/{:d} - rr_total: {:0.5f}")
        logging.info(total_message.format(
            self._i, i_max, eta_total, n_total, n_max_total, rr_avg))

    self._i += 1

    return pairs