Example #1
  def main(self) :

    data_d = {}
    # import the specified CSV file

    data_d = csvhelpers.readData(self.input, self.field_names)
    
    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition :
      if self.field_definition[field]['type'] != 'Interaction' :
        if field not in data_d[0]:
          raise parser.error("Could not find field '" + field + "' in input")

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    data_sample = dedupe.dataSample(data_d, self.sample_size)
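    # (dedupe.dataSample draws random pairs of records from data_d; these
    # candidate pairs are what deduper.train works from below, whether the
    # labels come from a saved training file or from active labeling.)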

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.train(data_sample, str(self.training_file))
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      deduper.train(data_sample, labeler.label)

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')

    # ## Blocking

    logging.info('blocking...')
    # Initialize our blocker. We'll learn our blocking rules if we haven't
    # loaded them from a saved settings file.
    blocker = deduper.blockingFunction()

    # Load all the original data into memory and place the records into
    # blocks. Each record can be blocked in many ways, so for larger data,
    # memory will be a limiting factor.

    blocked_data = dedupe.blockData(data_d, blocker)

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
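    # (Roughly speaking, this is like maximizing an F-score with
    # beta = recall_weight, i.e. (1 + w**2) * P * R / (w**2 * P + R);
    # the exact objective is whatever goodThreshold computes internally.)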

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.goodThreshold(blocked_data, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
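    # (The result is roughly a sequence of clusters, each holding the IDs of
    # records believed to be the same entity, e.g. something like
    # [(1, 5, 22), (7, 31), ...]; exact container types vary by dedupe version.)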

    logging.info('clustering...')
    clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    # write out our results
    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        csvhelpers.writeResults(clustered_dupes, self.input, output_file)
    else :
      csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
print "number of known duplicate pairs", len(duplicates_s)

if os.path.exists(settings_file) and 0:  # 'and 0' deliberately disables this branch so training always reruns
    deduper = dedupe.Dedupe(settings_file)
else:
    fields = {"title": {"type": "String"}}

    deduper = dedupe.Dedupe(fields)
    deduper.num_iterations = num_iterations

    print "Using a random sample of training pairs..."

    deduper._initializeTraining()
    deduper.training_pairs = randomTrainingPairs(data_d, duplicates_s, num_training_dupes, num_training_distinct)

    deduper.data_sample = dedupe.dataSample(data_d, 1000000, constrained_matching)

    deduper.training_data = dedupe.training.addTrainingData(
        deduper.training_pairs, deduper.data_model, deduper.training_data
    )

    deduper.alpha = dedupe.crossvalidation.gridSearch(
        deduper.training_data, dedupe.core.trainModel, deduper.data_model, k=10
    )

    deduper.data_model = dedupe.core.trainModel(deduper.training_data, deduper.data_model, deduper.alpha)

    deduper._logLearnedWeights()


print "blocking..."
Example #3
        'city': {
            'type': 'String'
        }
    }

    deduper = dedupe.Dedupe(fields)
    deduper.num_iterations = num_iterations

    print "Using a random sample of training pairs..."

    deduper._initializeTraining()
    deduper.training_pairs = randomTrainingPairs(data_d, duplicates_s,
                                                 num_training_dupes,
                                                 num_training_distinct)

    deduper.data_sample = dedupe.dataSample(data_d, 1000000)

    deduper.training_data = dedupe.training.addTrainingData(
        deduper.training_pairs, deduper.data_model, deduper.training_data)

    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()
Example #4
    return data_d


print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)

else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 150000)

    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field (a sketch of one possible comparator follows the
    # field definition below).
    fields = {
        'Site name': {'type': 'String'},
        'Address': {'type': 'String'},
        'Zip': {'type': 'Custom', 
                'comparator' : sameOrNotComparator, 
                'Has Missing' : True},
        'Phone': {'type': 'String', 'Has Missing' : True},
        }
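    # The 'Custom' type above hands comparison of the 'Zip' field to our own
    # function, and 'Has Missing': True signals that the field can be empty
    # for some records. A minimal sketch of what a comparator like
    # sameOrNotComparator might look like is shown here for reference (an
    # assumption; the real helper is defined elsewhere in this example's
    # source, and the value it returns for a match depends on that definition):
    #
    #     def sameOrNotComparator(field_1, field_2):
    #         if field_1 and field_2:
    #             return 0 if field_1 == field_2 else 1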

    # Create a new deduper object and pass our data model to it.
Example #5
    def main(self):

        data_d = {}
        # import the specified CSV file

        data_d = csvhelpers.readData(self.input, self.field_names)

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if self.field_definition[field]['type'] != 'Interaction':
                if field not in data_d[0]:
                    raise parser.error("Could not find field '" + field +
                                       "' in input")

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        data_sample = dedupe.dataSample(data_d, self.sample_size)

        logging.info('using fields: %s' % self.field_definition.keys())
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file

        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            deduper.train(data_sample, str(self.training_file))
        elif self.skip_training:
            raise parser.error(
                "You need to provide an existing training_file or run this script without --skip_training"
            )

        if not self.skip_training:
            logging.info('starting active labeling...')

            deduper.train(data_sample, labeler.label)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            deduper.writeTraining(self.training_file)
        else:
            logging.info('skipping the training step')

        # ## Blocking

        logging.info('blocking...')
        # Initialize our blocker. We'll learn our blocking rules if we haven't
        # loaded them from a saved settings file.
        blocker = deduper.blockingFunction()

        # Load all the original data into memory and place the records into
        # blocks. Each record can be blocked in many ways, so for larger
        # data, memory will be a limiting factor.

        blocked_data = dedupe.blockData(data_d, blocker)

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass in all the blocked data into
        # this function but a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.goodThreshold(blocked_data,
                                          recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.duplicateClusters(blocked_data, threshold)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        # write out our results
        if self.output_file:
            with open(self.output_file, 'w') as output_file:
                csvhelpers.writeResults(clustered_dupes, self.input,
                                        output_file)
        else:
            csvhelpers.writeResults(clustered_dupes, self.input, sys.stdout)
Example #6
if os.path.exists(settings_file):
    deduper = dedupe.Dedupe(settings_file)
else:
    fields = {'title': {'type': 'String'}}

    deduper = dedupe.Dedupe(fields)
    deduper.num_iterations = num_iterations

    print "Using a random sample of training pairs..."

    deduper._initializeTraining()
    deduper.training_pairs = randomTrainingPairs(data_d, duplicates_s,
                                                 num_training_dupes,
                                                 num_training_distinct)

    deduper.data_sample = dedupe.dataSample(data_d, 1000000,
                                            constrained_matching)

    deduper.training_data = dedupe.training.addTrainingData(
        deduper.training_pairs, deduper.data_model, deduper.training_data)

    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)

    deduper._logLearnedWeights()
Example #7

# Read in the data
input_df = pd.read_csv(input_file)
data_d = readDataFrame(input_df)


# Training
time_start = time.time()
if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)

else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
    # Define the fields dedupe will pay attention to

    fields = {'first': {'type': 'String'},
              'last': {'type': 'String'},
              'mi': {'type': 'String'},
              # 'birth_year': {'type': 'Custom', 'comparator': sameYear}
              }

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)
Example #8
              'cuisine': {'type': 'String'},
              'city' : {'type' : 'String'}
              }

    deduper = dedupe.Dedupe(fields)
    deduper.num_iterations = num_iterations

    print "Using a random sample of training pairs..."

    deduper._initializeTraining()
    deduper.training_pairs = randomTrainingPairs(data_d,
                                                 duplicates_s,
                                                 num_training_dupes,
                                                 num_training_distinct)
    
    deduper.data_sample = dedupe.dataSample(data_d, 1000000)


    deduper.training_data = dedupe.training.addTrainingData(deduper.training_pairs,
                                                            deduper.data_model,
                                                            deduper.training_data)

    deduper.alpha = dedupe.crossvalidation.gridSearch(deduper.training_data,
                                                      dedupe.core.trainModel,
                                                      deduper.data_model,
                                                      k=10)

    deduper.data_model = dedupe.core.trainModel(deduper.training_data,
                                                deduper.data_model,
                                                deduper.alpha)
Example #9
    return data_d


print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)

else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 150000)

    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'Site name': {
            'type': 'String'
        },
        'Address': {
            'type': 'String'
        },
        'Zip': {
            'type': 'Custom',
            'comparator': sameOrNotComparator,