def _train(settings_file, training_file, data, field_properties, sample_size):
    """Internal method that trains the deduper model if a training file does
    not exist.

        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        data : dict
            The dictionary form of the dataframe that dedupe requires.
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(data) * sample_size)
        deduper.sample(data, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    return deduper
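
# Hypothetical usage sketch (not part of the original example): builds the
# record dictionary that dedupe expects from a pandas DataFrame and calls
# _train. The file names, column names, and field list are assumptions; see
# select_fields for the exact field format this helper expects.
#
#   import pandas as pd
#   df = pd.DataFrame({'name': ['Acme Inc', 'ACME, Inc.'],
#                      'city': ['Boston', 'boston']})
#   data = df.to_dict(orient='index')          # {index: {field: value}}
#   deduper = _train('example_learned_settings', 'example_training.json',
#                    data, field_properties=['name', 'city'], sample_size=0.3)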
def _train(settings_file, training_file, clean_data, messy_data,
           field_properties, sample_size, update_model, n_cores):
    """Internal method that trains the deduper model from scratch or update
        an existing dedupe model.
        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        clean_data : dict
            The dictionary form of the gazette that gazetteer_dedupe requires.
        messy_data : dict
            The dictionary form of the messy data that needs to be
            deduplicated (and canonicalized).
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, allows the user to update an existing model by uploading
            a training file.
        n_cores : int, default None
            Specify the number of cores to use during clustering.
            By default n_cores is None (i.e. multiprocessing uses a pool equal
            to the CPU count).

        Returns
        -------
        dedupe.Gazetteer
            A gazetteer model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, [field_properties])

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticGazetteer(f, num_cores=n_cores)

        else:
            # Create a new gazetteer object and pass our data model to it
            deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

            # Launch active learning
            deduper = _active_learning(clean_data, messy_data, sample_size,
                                       deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(clean_data, messy_data, training_file=f)

        # Launch active learning
        deduper = _active_learning(clean_data, messy_data, sample_size,
                                   deduper, training_file, settings_file)

    return deduper
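
# Hypothetical usage sketch (not part of the original example): the gazetteer
# variant matches messy records against a canonical "gazette". The DataFrames,
# paths, and field list below are assumptions for illustration.
#
#   clean_data = clean_df.to_dict(orient='index')    # canonical gazette records
#   messy_data = messy_df.to_dict(orient='index')    # records to be matched
#   gazetteer = _train('gazetteer_learned_settings', 'gazetteer_training.json',
#                      clean_data, messy_data, field_properties=['name'],
#                      sample_size=0.3, update_model=False, n_cores=None)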
def _train(settings_file, training_file, data, field_properties, sample_size,
           update_model):
    """Internal method that trains the deduper model from scratch or update
        an existing dedupe model.
        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        data : dict
            The dictionary form of the dataframe that dedupe requires.
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, allows the user to update an existing model by uploading
            a training file.

        Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, field_properties)

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f, num_cores=None)

        else:
            # Create a new deduper object and pass our data model to it
            deduper = dedupe.Dedupe(fields, num_cores=None)

            # Launch active learning
            deduper = _active_learning(data, sample_size, deduper,
                                       training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Dedupe(fields, num_cores=None)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data, training_file=f)

        # Launch active learning
        deduper = _active_learning(data, sample_size, deduper, training_file,
                                   settings_file)

    return deduper
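
# Hypothetical usage sketch (not part of the original example): passing
# update_model=True reloads an existing training file via prepare_training()
# and resumes active labeling instead of starting from scratch. Paths and the
# field list are assumptions; `data` is the record dictionary built as in the
# earlier sketch.
#
#   deduper = _train('example_learned_settings', 'example_training.json',
#                    data, field_properties=['name', 'city'],
#                    sample_size=0.3, update_model=True)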
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : list
            A list specifying what fields to use for deduplicating records.
        canonicalize : bool, default False
            Option that provides the canonical record as additional columns.
        config_name : str, default dedupe_dataframe
            The configuration file name. Note that this will be used as
            a prefix to save the settings and training files.
        recall_weight : int, default 1
            Find the threshold that will maximize a weighted average of our
            precision and recall.  When we set the recall weight to 2, we are
            saying we care twice as much about recall as we do precision.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.

    """
    # Import data
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))
    
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(df) * sample_size)
        deduper.sample(data_d, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for i in data_d.values():
        for key in i:
            if i[key] is not None:
                i[key] = str(i[key])
            
    # ## Writing Results
    cluster_membership = {}
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = list(cluster_membership.items())

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id'}, inplace=True)
    
    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    canonical_list = []

    # If canonicalize is a list, only canonicalize the requested fields;
    # if it is simply truthy (e.g. True), canonicalize every field.
    if isinstance(canonicalize, list):
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif canonicalize:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id', inplace=True)
    df = df.join(dfa)            
    df.drop(columns=[1, 'dictionary'], inplace=True)

    return df
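
# Hypothetical usage sketch (not part of the original example): a minimal
# end-to-end call of dedupe_dataframe. The toy records and column names are
# invented; active labeling will prompt for yes/no answers on the console.
#
#   import pandas as pd
#   df = pd.DataFrame({'name': ['Acme Inc', 'ACME, Inc.', 'Widget Co'],
#                      'city': ['Boston', 'boston', 'Denver']})
#   result = dedupe_dataframe(df, ['name', 'city'], canonicalize=True,
#                             recall_weight=1, sample_size=0.5)
#   # result gains 'cluster id' and 'confidence' columns, plus
#   # '<field> - canonical' columns when canonicalize is truthy.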