def __init__(self, connector, pruningclf, sbsmodel, classifier, ixname='ix',
             source_suffix='source', target_suffix='target', **kwargs):
    """
    Store the pipeline components and derive the index names used for pairs.

    Args:
        connector (ConnectorMixin): Connector (scorer) used to do the calculation
        pruningclf (Explorer): Classifier used to do the pruning
            (0: no match, 1: potential match, 2: sure match)
        sbsmodel (TransformerMixin): Side-by-side scorer; can be a FeatureUnion, Pipeline...
        classifier (ClassifierMixin): Classifier used to do the prediction
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        **kwargs: accepted but not read by this constructor
    """
    ClassifierMixin.__init__(self)

    # Naming settings, kept verbatim as attributes
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix

    # Source name, target name and the pair of both, as computed by concatixnames
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # Nothing has been fitted at construction time
    self.fitted = False

    # Pipeline components
    self.connector = connector
    self.pruningclf = pruningclf
    self.sbsmodel = sbsmodel
    self.classifier = classifier
def _init_ixnames():
    """
    Build the dictionary of default index-name settings.

    The returned dict has the keys (in this order):
        'ixname': 'ix'
        'ixnamesource': first value returned by concatixnames
        'ixnametarget': second value returned by concatixnames
        'ixnamepairs': third value returned by concatixnames
        'source_suffix': 'source'
        'target_suffix': 'target'

    Returns:
        dict
    """
    settings = {
        'ixname': 'ix',
        'source_suffix': 'source',
        'target_suffix': 'target'
    }
    # The derived names come from the shared helper, called with the defaults above
    name_source, name_target, name_pairs = concatixnames(**settings)
    return {
        'ixname': settings['ixname'],
        'ixnamesource': name_source,
        'ixnametarget': name_target,
        'ixnamepairs': name_pairs,
        'source_suffix': settings['source_suffix'],
        'target_suffix': settings['target_suffix']
    }
def __init__(self, ixname='ix', source_suffix='source', target_suffix='target', **kwargs):
    """
    Set up the index names and the (still empty) cluster bookkeeping attributes.

    Args:
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        **kwargs: accepted but not read by this constructor
    """
    ClassifierMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # Cluster bookkeeping, all unset until populated elsewhere:
    #   clusters    -- the clusters
    #   n_clusters  -- number of unique clusters
    self.clusters = None
    self.n_clusters = None
    #   nomatch     -- clusters where no match has been found
    #   allmatch    -- clusters where all elements are positive matches
    #   mixedmatch  -- clusters holding both positive and negative values (match and non-match)
    #   notfound    -- clusters not found (added to the no-match ones)
    self.nomatch = None
    self.allmatch = None
    self.mixedmatch = None
    self.notfound = None

    self.fitted = False
def __init__(self, transformer, classifier, ixname='ix', source_suffix='source',
             target_suffix='target', **kwargs):
    """
    Store the transformer / classifier pair and derive the index names.

    Args:
        transformer (TransformerMixin): stored as ``self.transformer``
        classifier (ClassifierMixin): stored as ``self.classifier``
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        **kwargs: accepted but not read by this constructor (earlier documentation
            listed ``n_jobs`` and ``pruning_ths``; neither is used here)
    """
    ClassifierMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # Nothing has been fitted at construction time
    self.fitted = False

    self.transformer = transformer
    self.classifier = classifier
def __init__(self, classifier, ixname='ix', source_suffix='source', target_suffix='target',
             **kwargs):
    """
    Wrap a classifier so that it can be trained on partial data.

    The wrapper is meant for the case where X and y do not share the same index
    (because of pruning steps, ...): the classifier is trained (fit) on the
    common index.

    Args:
        classifier (ClassifierMixin): Classifier to use. Should be the output of the pipeline
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        **kwargs: accepted but not read by this constructor
    """
    ClassifierMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # Nothing has been fitted at construction time
    self.fitted = False
    self.classifier = classifier
def __init__(self, dedupe, data=None, ixname='ix', source_suffix='source',
             target_suffix='target', gidname='gid', verbose=False):
    """
    Store the duplicate finder, the naming settings and the starting data.

    Args:
        dedupe (suricate.LrDuplicateFinder): stored as ``self.dedupe``
        data (pd.DataFrame): starting data; when None an empty DataFrame is used
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        gidname (str): name used for the group id, default 'gid'
        verbose (bool): stored as ``self.verbose``
    """
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    self.dedupe = dedupe
    self.verbose = verbose
    self.gidname = gidname

    # Fall back to a fresh, empty frame when the caller supplies no data
    self.data = pd.DataFrame() if data is None else data
def __init__(self, on=None, ixname='ix', source_suffix='source', target_suffix='target',
             scoresuffix='score', **kwargs):
    """
    Store the join column and work out the name of the output score column.

    Args:
        on (str): name of the column on which to do the join
        ixname (str): name of the index, default 'ix'
        source_suffix (str): suffix to be added for the source dataframe, default 'source'
        target_suffix (str): suffix to be added for the target dataframe, default 'target'
        scoresuffix (str): suffix to be attached to the ``on`` column name, default 'score'
        **kwargs: accepted but not read by this constructor
    """
    TransformerMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    self.on = on
    self.scoresuffix = scoresuffix

    # Output column: '<on>_<scoresuffix>', or just the suffix when no column is given
    self.outcol = scoresuffix if on is None else on + '_' + scoresuffix

    # Nothing has been fitted at construction time
    self.fitted = False
def __init__(self, clustermixin=None, n_simple=10, n_hard=10, ixname='ix',
             source_suffix='source', target_suffix='target'):
    """
    Assemble the clustering step, the question generators and the cluster classifier.

    Args:
        clustermixin (ClusterMixin): if None, a KBinsCluster with n_clusters=10 is used
        n_simple (int): number of simple questions per cluster
            (passed to SimpleQuestions as n_questions)
        n_hard (int): number of hard questions per cluster
            (passed to HardQuestions as n_questions)
        ixname (str): default 'ix'
        source_suffix (str): default 'source'
        target_suffix (str): default 'target'
    """
    TransformerMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # Default clustering is only built when the caller did not provide one
    self._clustermixin = KBinsCluster(n_clusters=10) if clustermixin is None else clustermixin

    self._simplequestions = SimpleQuestions(n_questions=n_simple)
    self._hardquestions = HardQuestions(n_questions=n_hard)

    # The cluster classifier shares this object's naming settings
    self._clusterclassifier = ClusterClassifier(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
def __init__(self, ixname='ix', source_suffix='source', target_suffix='target', usecols=None,
             **kwargs):
    """
    Store the naming settings and the optional column selection.

    Args:
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        usecols: stored as ``self.usecols``, default None
        **kwargs: accepted but not read by this constructor
    """
    TransformerMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    self.usecols = usecols
def __init__(self, ixname='ix', source_suffix='source', target_suffix='target'):
    """
    Store the naming settings and initialise the empty data holders.

    Args:
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
    """
    TransformerMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

    # NOTE(review): this stores the pd.Index class itself, not an Index instance --
    # confirm that it is meant as a placeholder only
    self.index = pd.Index
    # Two separate empty frames
    self.dfnum = pd.DataFrame()
    self.dfix = pd.DataFrame()
    self.num = None
def __init__(self, ixname='ix', source_suffix='source', target_suffix='target'):
    """
    Store the naming settings and the index names derived from them.

    Args:
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
    """
    TransformerMixin.__init__(self)

    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix

    # Source name, target name and the pair of both, as computed by concatixnames
    derived_names = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
def calc_existinggid(y_proba, refdata, ixname='ix', source_suffix='source', target_suffix='target',
                     gidname='gid'):
    """
    Give each source record the group id (gid) of its positively matched reference records.

    A pair counts as a positive match when its 'y_proba' is strictly greater than 0.5.
    Source records without any positive match get a gid of None. Source records whose
    positive matches are all absent from ``refdata`` do not appear in the output, because
    the matches are inner-joined with the reference data.

    Args:
        y_proba (pd.DataFrame/pd.Series): either a DataFrame holding the columns
            [ixnamesource, ixnametarget, 'y_proba'], or a Series which, once its index is
            reset, yields those same three columns
        refdata (pd.DataFrame/pd.Series): {ixname: [gidname, cols..]}; a DataFrame must be
            indexed by ``ixname`` and contain the ``gidname`` column. It is not modified.
        ixname (str): base name of the index, default 'ix'
        source_suffix (str): suffix of the source side, default 'source'
        target_suffix (str): suffix of the target side, default 'target'
        gidname (str): name of the group id column, default 'gid'

    Returns:
        pd.Series: {ixname: gidname}

    Raises:
        AssertionError: if a required column is missing from ``y_proba``, or if a
            DataFrame ``refdata`` lacks ``gidname`` or is not indexed by ``ixname``
    """

    def goodgids(r):
        """
        Pick the gid of one source record from the gids of its matched targets.

        Returns the most common gid when it occurs more than once; otherwise the
        first gid of the group (matches are sorted by decreasing probability).

        Args:
            r (pd.Series): {'ixnametarget': 'gid'} for a common ixnamesource

        Returns:
            str
        """
        assert isinstance(r, pd.Series)
        vc = r.value_counts()
        # value_counts() leaves out missing values, so it is empty when every gid of the
        # group is missing; guard the positional access instead of raising IndexError
        if not vc.empty and vc.iloc[0] > 1:
            return vc.index[0]
        return r.iloc[0]

    ixnamesource, ixnametarget, ixnamepairs = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix
    )

    if isinstance(y_proba, pd.Series):
        y_proba = pd.DataFrame(y_proba).reset_index(drop=False)
    for c in ixnamepairs + ['y_proba']:
        assert c in y_proba.columns, '{}'.format(c)

    if isinstance(refdata, pd.Series):
        ref = pd.DataFrame(refdata)
    else:
        assert isinstance(refdata, pd.DataFrame)
        assert gidname in refdata.columns
        assert refdata.index.name == ixname
        ref = refdata[[gidname]]
    # rename_axis returns a new object: assigning ``ref.index.name`` directly could
    # rename the caller's own index when the Index object is shared with ``refdata``
    ref = ref.rename_axis(ixnametarget).reset_index(drop=False)

    # Select positive matches, best probability first
    pos_matches = y_proba.loc[y_proba['y_proba'] > 0.5].copy().sort_values(
        by='y_proba', ascending=False)

    # Select source ixes that are NOT in pos matches
    no_matches_atall = y_proba.loc[
        ~(y_proba[ixnamesource].isin(pos_matches[ixnamesource].values)),
        ixnamesource
    ].unique()
    results = pd.DataFrame(data=no_matches_atall, columns=[ixnamesource])
    results[gidname] = None
    # results: {rangeix: [ixnamesource, gidname]}

    # Merge the two to get the gids of the matched targets
    gids = pd.merge(left=pos_matches, right=ref, left_on=[ixnametarget],
                    right_on=[ixnametarget], how='inner')
    gb = gids.groupby([ixnamesource])
    wg = pd.DataFrame(gb[gidname].apply(goodgids)).reset_index(drop=False)
    for c in [ixnamesource, gidname]:
        assert c in wg.columns

    results = pd.concat([results, wg[[ixnamesource, gidname]]], axis=0, ignore_index=True)
    results = results.rename(columns={ixnamesource: ixname}).set_index(ixname)[gidname]
    return results