Exemplo n.º 1
0
    def __init__(self,
                 connector,
                 pruningclf,
                 sbsmodel,
                 classifier,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target',
                 **kwargs):
        """
        Store the pipeline components and derive the pair index names.

        Args:
            connector (ConnectorMixin): Connector (Scorer) used to do the calculation
            pruningclf (Explorer): Classifier used to do the pruning
                (0: no match, 1: potential match, 2: sure match)
            sbsmodel (TransformerMixin): Side-by-Side scorer; can be a FeatureUnion, a Pipeline...
            classifier (ClassifierMixin): Classifier used to do the prediction
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
            **kwargs: accepted but not read by this constructor
        """
        ClassifierMixin.__init__(self)

        # Index naming: keep the raw inputs, then let concatixnames build the
        # source / target / pair names from them.
        self.ixname = ixname
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        derived_names = concatixnames(ixname=self.ixname,
                                      source_suffix=self.source_suffix,
                                      target_suffix=self.target_suffix)
        self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

        # A new instance always starts unfitted.
        self.fitted = False

        # Pipeline components are stored exactly as received.
        self.connector = connector
        self.pruningclf = pruningclf
        self.sbsmodel = sbsmodel
        self.classifier = classifier
Exemplo n.º 2
0
def _init_ixnames():
    """
    Build the default index-name settings as a dictionary.

    The base values are ixname='ix', source_suffix='source' and
    target_suffix='target'; the three derived names come from concatixnames.
    Expected shape (derived values presumably as below -- confirm against
    concatixnames):
    {
        'ixname': 'ix',
        'ixnamesource': 'ix_source',
        'ixnametarget': 'ix_target',
        'ixnamepairs': ['ix_source', 'ix_target'],
        'source_suffix': 'source',
        'target_suffix': 'target'
    }

    Returns:
        dict
    """
    base = {
        'ixname': 'ix',
        'source_suffix': 'source',
        'target_suffix': 'target',
    }
    source_name, target_name, pair_names = concatixnames(**base)
    # Key order mirrors the order in which the entries were historically added.
    return {
        'ixname': base['ixname'],
        'ixnamesource': source_name,
        'ixnametarget': target_name,
        'ixnamepairs': pair_names,
        'source_suffix': base['source_suffix'],
        'target_suffix': base['target_suffix'],
    }
Exemplo n.º 3
0
    def __init__(self,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target',
                 **kwargs):
        """
        Set up the index names and the (still empty) cluster bookkeeping.

        Args:
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
            **kwargs: accepted but not read by this constructor
        """
        ClassifierMixin.__init__(self)
        self.ixname = ixname
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        derived_names = concatixnames(ixname=self.ixname,
                                      source_suffix=self.source_suffix,
                                      target_suffix=self.target_suffix)
        self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

        # Every cluster-related attribute starts as None:
        #   clusters   -- the clusters
        #   n_clusters -- number of unique clusters
        #   nomatch    -- clusters where no match has been found
        self.clusters = self.n_clusters = self.nomatch = None
        #   allmatch   -- clusters where all elements are positive matches
        #   mixedmatch -- clusters holding both positive and negative values
        #                 (matches and non-matches)
        #   notfound   -- clusters not found (added to the no-match ones)
        self.allmatch = self.mixedmatch = self.notfound = None

        self.fitted = False
Exemplo n.º 4
0
    def __init__(self,
                 transformer,
                 classifier,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target',
                 **kwargs):
        """
        Keep a transformer / classifier pair together with the pair index names.

        Args:
            transformer (TransformerMixin): stored as ``self.transformer``
            classifier (ClassifierMixin): stored as ``self.classifier``
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
            **kwargs: accepted but not read by this constructor
        """
        ClassifierMixin.__init__(self)
        self.ixname, self.source_suffix, self.target_suffix = (
            ixname, source_suffix, target_suffix)
        # concatixnames derives the source / target / pair index names.
        (self.ixnamesource,
         self.ixnametarget,
         self.ixnamepairs) = concatixnames(ixname=self.ixname,
                                           source_suffix=self.source_suffix,
                                           target_suffix=self.target_suffix)
        self.fitted = False
        self.transformer = transformer
        self.classifier = classifier
Exemplo n.º 5
0
 def __init__(self,
              classifier,
              ixname='ix',
              source_suffix='source',
              target_suffix='target',
              **kwargs):
     """
     Wrapper around a classifier that allows it to train on partial data,
     i.e. when X and y do not share the same index (because of pruning
     steps, ...). The classifier is meant to be trained (fit) on the common
     index; this constructor itself only stores the classifier and the
     index names.

     Args:
         classifier (ClassifierMixin): Classifier to use. Should be the output of the pipeline
         ixname (str): name of the index, default 'ix'
         source_suffix (str): suffix for the source side, default 'source'
         target_suffix (str): suffix for the target side, default 'target'
         **kwargs: accepted but not read by this constructor
     """
     ClassifierMixin.__init__(self)
     self.ixname = ixname
     self.source_suffix = source_suffix
     self.target_suffix = target_suffix
     # Source / target / pair index names, as produced by concatixnames.
     derived_names = concatixnames(ixname=self.ixname,
                                   source_suffix=self.source_suffix,
                                   target_suffix=self.target_suffix)
     self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
     self.fitted = False
     self.classifier = classifier
Exemplo n.º 6
0
    def __init__(self,
                 dedupe,
                 data=None,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target',
                 gidname='gid',
                 verbose=False):
        """
        Store the duplicate finder, the reference data and the naming settings.

        Args:
            dedupe (suricate.LrDuplicateFinder): stored as ``self.dedupe``
            data (pd.DataFrame): reference data; when None an empty DataFrame is used
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
            gidname (str): name used for the group id, default 'gid'
            verbose (bool): stored as ``self.verbose``
        """
        self.ixname = ixname
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        derived_names = concatixnames(ixname=self.ixname,
                                      source_suffix=self.source_suffix,
                                      target_suffix=self.target_suffix)
        self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
        self.dedupe = dedupe
        self.verbose = verbose
        self.gidname = gidname
        # Fall back to a fresh, empty frame when no data is supplied.
        self.data = pd.DataFrame() if data is None else data
Exemplo n.º 7
0
 def __init__(self,
              on=None,
              ixname='ix',
              source_suffix='source',
              target_suffix='target',
              scoresuffix='score',
              **kwargs):
     """
     Configure the index names and the name of the output score column.

     Args:
         on (str): name of the column on which to do the join
         ixname (str): name of the index, default 'ix'
         source_suffix (str): suffix for the source side, default 'source'
         target_suffix (str): suffix for the target side, default 'target'
         scoresuffix (str): suffix to be attached to the on column name
         **kwargs: accepted but not read by this constructor
     """
     TransformerMixin.__init__(self)
     self.ixname = ixname
     self.source_suffix = source_suffix
     self.target_suffix = target_suffix
     derived_names = concatixnames(ixname=self.ixname,
                                   source_suffix=self.source_suffix,
                                   target_suffix=self.target_suffix)
     self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
     self.on = on
     self.scoresuffix = scoresuffix
     # Output column: '<on>_<scoresuffix>', or just the suffix when no
     # column was given.
     self.outcol = (self.scoresuffix if self.on is None
                    else self.on + '_' + self.scoresuffix)
     self.fitted = False
Exemplo n.º 8
0
    def __init__(self,
                 clustermixin=None,
                 n_simple=10,
                 n_hard=10,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target'):
        """
        Assemble the clustering, question-generation and cluster-classification helpers.

        Args:
            clustermixin (ClusterMixin): if None, a KBinsCluster with 10 clusters is used
            n_simple (int): number of simple questions per cluster
            n_hard (int): number of hard questions per cluster
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
        """
        TransformerMixin.__init__(self)
        self.ixname = ixname
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        derived_names = concatixnames(ixname=self.ixname,
                                      source_suffix=self.source_suffix,
                                      target_suffix=self.target_suffix)
        self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

        # Default clusterer is only built when the caller did not provide one.
        self._clustermixin = (KBinsCluster(n_clusters=10)
                              if clustermixin is None else clustermixin)
        self._simplequestions = SimpleQuestions(n_questions=n_simple)
        self._hardquestions = HardQuestions(n_questions=n_hard)
        # The cluster classifier shares this object's index naming.
        self._clusterclassifier = ClusterClassifier(
            ixname=self.ixname,
            source_suffix=self.source_suffix,
            target_suffix=self.target_suffix)
Exemplo n.º 9
0
 def __init__(self,
              ixname='ix',
              source_suffix='source',
              target_suffix='target',
              usecols=None,
              **kwargs):
     """
     Store the index names and the optional column selection.

     Args:
         ixname (str): name of the index, default 'ix'
         source_suffix (str): suffix for the source side, default 'source'
         target_suffix (str): suffix for the target side, default 'target'
         usecols: stored as ``self.usecols``, default None
         **kwargs: accepted but not read by this constructor
     """
     TransformerMixin.__init__(self)
     self.ixname, self.source_suffix, self.target_suffix = (
         ixname, source_suffix, target_suffix)
     # Source / target / pair index names, as produced by concatixnames.
     (self.ixnamesource,
      self.ixnametarget,
      self.ixnamepairs) = concatixnames(ixname=self.ixname,
                                        source_suffix=self.source_suffix,
                                        target_suffix=self.target_suffix)
     self.usecols = usecols
Exemplo n.º 10
0
 def __init__(self,
              ixname='ix',
              source_suffix='source',
              target_suffix='target'):
     """
     Store the index names and initialise the working containers.

     Args:
         ixname (str): name of the index, default 'ix'
         source_suffix (str): suffix for the source side, default 'source'
         target_suffix (str): suffix for the target side, default 'target'
     """
     TransformerMixin.__init__(self)
     self.ixname = ixname
     self.source_suffix = source_suffix
     self.target_suffix = target_suffix
     derived_names = concatixnames(ixname=self.ixname,
                                   source_suffix=self.source_suffix,
                                   target_suffix=self.target_suffix)
     self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names

     # NOTE(review): this stores the pd.Index class itself, not an instance
     # -- confirm that this is the intended placeholder.
     self.index = pd.Index
     # Two separate empty frames (not one shared object).
     self.dfnum = pd.DataFrame()
     self.dfix = pd.DataFrame()
     self.num = None
Exemplo n.º 11
0
    def __init__(self,
                 ixname='ix',
                 source_suffix='source',
                 target_suffix='target'):
        """
        Store the index name settings and the names derived from them.

        Args:
            ixname (str): name of the index, default 'ix'
            source_suffix (str): suffix for the source side, default 'source'
            target_suffix (str): suffix for the target side, default 'target'
        """
        TransformerMixin.__init__(self)
        self.ixname = ixname
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        # concatixnames returns the source name, the target name and the pair
        # of names, in that order.
        derived_names = concatixnames(ixname=self.ixname,
                                      source_suffix=self.source_suffix,
                                      target_suffix=self.target_suffix)
        self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
Exemplo n.º 12
0
def calc_existinggid(y_proba,
                     refdata,
                     ixname='ix',
                     source_suffix='source',
                     target_suffix='target',
                     gidname='gid'):
    """
    Look up, for each source index, the gid of the reference records it matches.

    A pair is a positive match when its 'y_proba' is strictly greater than 0.5.
    Source indexes without any positive match get a gid of None. Source indexes
    with positive matches get the gid chosen by ``goodgids`` among the gids of
    their matched targets that are present in ``refdata``.

    Args:
        y_proba (pd.DataFrame/pd.Series): must expose the columns
            [ixnamesource, ixnametarget, 'y_proba']. A Series is converted to a
            frame and its index is reset, so it has to be named 'y_proba' and
            carry the pair names as index levels.
        refdata (pd.DataFrame/pd.Series): either a DataFrame indexed by
            ``ixname`` that contains a ``gidname`` column, or a Series of gids
            indexed by the reference record index.
        ixname (str): name of the index, default 'ix'
        source_suffix (str): suffix for the source side, default 'source'
        target_suffix (str): suffix for the target side, default 'target'
        gidname (str): name of the gid column, default 'gid'

    Returns:
        pd.Series: # results :{ ixname: gidname}
    """
    def goodgids(r):
        """
        Pick the gid for one source index.

        Returns the most common gid when it occurs more than once; otherwise
        the first gid of the group. Positive matches are sorted by descending
        'y_proba' before grouping, so the first gid is expected to belong to
        the best-scoring match.

        Args:
            r (pd.Series): {'ixnametarget':'gid'} for a common ixnamesource

        Returns:
            str
        """
        assert isinstance(r, pd.Series)
        vc = r.value_counts()
        # value_counts() drops missing values: it is empty when every gid of
        # the group is null, in which case there is no "most common" gid.
        if len(vc) > 0 and vc.iloc[0] > 1:
            return vc.index[0]
        return r.iloc[0]

    ixnamesource, ixnametarget, ixnamepairs = concatixnames(
        ixname=ixname,
        source_suffix=source_suffix,
        target_suffix=target_suffix)

    if isinstance(y_proba, pd.Series):
        y_proba = pd.DataFrame(y_proba).reset_index(drop=False)
    for c in ixnamepairs + ['y_proba']:
        assert c in y_proba.columns, '{}'.format(c)

    if isinstance(refdata, pd.Series):
        # A Series only carries the gid values: name the column explicitly so
        # that the merge / groupby below can find it.
        ref = refdata.to_frame(name=gidname)
    else:
        assert isinstance(refdata, pd.DataFrame)
        assert gidname in refdata.columns
        assert refdata.index.name == ixname
        ref = refdata[[gidname]]
    # Expose the reference index as an ``ixnametarget`` column for the merge.
    # rename_axis returns a new object, so the caller's data is left untouched.
    ref = ref.rename_axis(ixnametarget).reset_index(drop=False)

    # Select positive matches
    pos_matches = y_proba.loc[y_proba['y_proba'] > 0.5].copy().sort_values(
        by='y_proba', ascending=False)
    # Select source ixes that are NOT in pos matches
    no_matches_atall = y_proba.loc[
        ~(y_proba[ixnamesource].isin(pos_matches[ixnamesource].values)),
        ixnamesource].unique()
    results = pd.DataFrame(data=no_matches_atall, columns=[ixnamesource])
    results[gidname] = None
    # results :{ rangeix: [ixnamesource, gidname]}

    # merge the two to get the gids
    # NOTE(review): with how='inner', a source whose positive matches are all
    # missing from refdata ends up in neither part of the result -- confirm
    # that dropping it is intended.
    gids = pd.merge(left=pos_matches,
                    right=ref,
                    left_on=[ixnametarget],
                    right_on=[ixnametarget],
                    how='inner')
    if not gids.empty:
        gb = gids.groupby([ixnamesource])
        wg = pd.DataFrame(gb[gidname].apply(goodgids)).reset_index(drop=False)
        for c in [ixnamesource, gidname]:
            assert c in wg.columns, '{}'.format(c)
        results = pd.concat([results, wg[[ixnamesource, gidname]]],
                            axis=0,
                            ignore_index=True)
    results = results.rename(columns={
        ixnamesource: ixname
    }).set_index(ixname)[gidname]
    return results