Example #1
def active_training():
    print("MODE: Active training")

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise

    # Active training
    deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)
    deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)

    # Load labeled records from previous training sessions
    if CONFIG.GENERAL.LOAD_TRAINING:
        try:
            with open(CONFIG.PATHS.TRAINING_FILE) as f:
                deduper.readTraining(f)
        except IOError:
            print("No se pudo abrir el fichero de entrenamiento activo -" + CONFIG.PATHS.TRAINING_FILE)

    dedupe.consoleLabel(deduper)

    with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
        deduper.writeTraining(tf)
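Examples #1 and #5 call a read_messy_data helper that is not included in these listings. A minimal sketch of what such a helper might look like, assuming a CSV input and the {record_id: {field: value}} dictionary layout that dedupe expects; the body below is an assumption, not the original implementation:

import csv

def read_messy_data(filename):
    # Hypothetical helper: read a CSV into the dict-of-dicts structure
    # that dedupe's sample()/match() methods expect.
    data_d = {}
    with open(filename, encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            # Empty values become None so dedupe treats them as missing
            data_d[i] = {k: (v.strip() or None) if v else None for k, v in row.items()}
    return data_d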
Example #2
    def dedupe_training(self, deduper) :

        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            with open(self.training_file) as tf:
                deduper.readTraining(tf)
        elif self.skip_training:
            raise self.parser.error(
                "You need to provide an existing training_file or run this script without --skip_training")

        if not self.skip_training:
            logging.info('starting active labeling...')

            dedupe.consoleLabel(deduper)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            if sys.version < '3' :
                with open(self.training_file, 'wb') as tf:
                    deduper.writeTraining(tf)
            else :
                with open(self.training_file, 'w') as tf:
                    deduper.writeTraining(tf)
        else:
            logging.info('skipping the training step')

        deduper.train()

        # After training settings have been established make a cache file for reuse
        logging.info('caching training result set to file %s' % self.settings_file)
        with open(self.settings_file, 'wb') as sf:
            deduper.writeSettings(sf)
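The settings cache written at the end of Example #2 is what lets later runs skip training entirely. A minimal sketch of the reload path (dedupe 1.x API); settings_file, data_d and the threshold value here are stand-ins for whatever the surrounding script actually provides:

import dedupe

# Hypothetical later run: load the cached model instead of retraining.
with open(settings_file, 'rb') as sf:
    deduper = dedupe.StaticDedupe(sf)
clustered_dupes = deduper.match(data_d, threshold=0.5)  # threshold is illustrative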
Example #3
    def train(self):
        # data_d = self._read_data(self.input_file)
        # self.data_d = data_d
        data_1 = self._read_data(self.input_file_base)
        data_2 = self._read_data(self.input_file)
        self.data_1 = data_1
        self.data_2 = data_2

        fields = self.fields
        deduper = dedupe.RecordLink(fields)
        training_file = self.training_file

        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_1, data_2, f)
        else:
            deduper.prepare_training(data_1, data_2)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train(recall=1)

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        self.deduper = deduper
Example #4
def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()

        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)

        linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)

    return linker
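A hedged usage sketch for the linker returned by Example #4 (dedupe 1.x RecordLink API); the threshold value is only illustrative:

# Hypothetical usage of the trained linker.
linker = train_linker(df1, df2)
# RecordLink.match() yields ((record_id_1, record_id_2), confidence) pairs
for (id_1, id_2), confidence in linker.match(df1, df2, threshold=0.5):
    print(id_1, id_2, confidence)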
Example #5
def deduplicate():
    print("MODE: Deduplicate")

    # Check that there will be labeled records available to train a model:
    if not (CONFIG.GENERAL.LOAD_TRAINING or CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING):
        print("ERROR: Active training and loading from file are both disabled")
        return

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise

    if CONFIG.GENERAL.LOAD_SETTINGS:
        try:
            with open(CONFIG.PATHS.SETTINGS_FILE, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)
        except IOError:
            print("Could not open the dedupe settings file - " + CONFIG.PATHS.SETTINGS_FILE)
            raise
    else:
        # Initialize the dedupe object
        deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)

        # Load labeled records from previous training sessions
        if CONFIG.GENERAL.LOAD_TRAINING:
            try:
                with open(CONFIG.PATHS.TRAINING_FILE) as f:
                    deduper.readTraining(f)
            except IOError:
                print("Could not open the active-training file - " + CONFIG.PATHS.TRAINING_FILE)
                raise

        if CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING:
            # Sampling and active training
            deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)
            dedupe.consoleLabel(deduper)

        # Train the predictive model (logistic regression by default)
        deduper.train(index_predicates=CONFIG.DEDUPE.USE_INDEX_PREDICATES)

        # Save the active-training labels, plus the predictive model and predicates
        with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
            deduper.writeTraining(tf)
        with open(CONFIG.PATHS.SETTINGS_FILE, 'wb') as sf:
            deduper.writeSettings(sf)

    try:
        # Compute the threshold for the logistic regression
        threshold = deduper.threshold(data_d, recall_weight=CONFIG.DEDUPE.RECALL_WEIGHT)
        # Group matches into clusters
        clustered_dupes = deduper.match(data_d, threshold)
        # Write the results to file
        write_clusters(clustered_dupes)
    except NameError:
        print("Error - the dedupe object could not be initialized")
        raise
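Example #5 also calls a write_clusters helper that is not part of the listing. A minimal sketch under stated assumptions: clusters arrive as (id_set, scores) tuples from deduper.match, and the output path shown here is a placeholder rather than anything defined in CONFIG:

import csv

def write_clusters(clustered_dupes, output_path='clusters.csv'):
    # Hypothetical helper: one output row per record with its cluster id and confidence.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['cluster_id', 'record_id', 'confidence'])
        for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
            for record_id, score in zip(id_set, scores):
                writer.writerow([cluster_id, record_id, score])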
Example #6
def run_dedupe(project):
    deduper, data = create_deduper(project)
    deduper.train()
    dedupe.consoleLabel(deduper)
    deduper.train()
    # TODO: lower the recall weight?
    threshold = deduper.threshold(data, recall_weight=1)
    clustered_dupes = deduper.match(data, threshold)
    print('# duplicate sets', len(clustered_dupes))
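Regarding the TODO in Example #6: recall_weight tells threshold() how much to value recall relative to precision (a weight of 2 values recall twice as much as precision; weights below 1 favor precision). A quick hedged sketch of comparing weights, reusing the deduper and data objects from the example:

# Hypothetical exploration of recall_weight (dedupe 1.x threshold API):
# lower weights generally push the threshold up, yielding fewer, higher-confidence matches.
for w in (0.5, 1, 2):
    t = deduper.threshold(data, recall_weight=w)
    print('recall_weight=%s -> threshold=%s' % (w, t))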
Example #7
def trainIncoming(name):
    from geocoder.deduper import DatabaseGazetteer
    import simplejson as json
    import dedupe

    engine = create_engine(DB_CONN)

    deduper = DatabaseGazetteer([{
        'field': 'complete_address',
        'type': 'Address'
    }],
                                engine=engine)

    sql_table = checkForTable(engine, name)

    if sql_table is None:
        sys.exit()

    primary_key = sql_table.primary_key.columns.keys()[0]

    messy_table = ''' 
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)

    messy_data = ({'complete_address': r.complete_address} for r in curs)

    deduper.drawSample(messy_data, sample_size=30000)

    if os.path.exists('geocoder/data/training.json'):
        print('reading labeled examples from geocoder/data/training.json')
        with open('geocoder/data/training.json') as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)

    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk
    with open('geocoder/data/training.json', 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open('geocoder/dedupe.settings', 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()
Example #9
    def train(self, data_d):
        """
        Train classifiers using logistic regression over the RVD flaws
        """
        cyan("Duplicates, training the classifiers using dedupe...")

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.fields)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(self.training_file):
            gray("Reading labeled examples from ", self.training_file)
            with open(self.training_file, "rb") as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        gray("Starting active labeling...")
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        gray("Training...")
        deduper.train()

        # When finished, save our training to disk
        gray("Saving results to training file...")
        with open(self.training_file, "w+") as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        gray("Saving weights and predicates to settings file...")
        with open(self.settings_file, "wb+") as sf:
            deduper.writeSettings(sf)

        return deduper
Example #10
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    fields = df.columns
    df, data_d = data_prep(df)

    if os.path.exists(settings_file):
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = select_fields(fields)
        deduper = dedupe.Dedupe(fields)
        sample_num = math.floor(len(data_d) * sample_size)

        logger.info("Extracting data sample of %s records", sample_num)
        deduper.sample(data_d, sample_num)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as f:
                deduper.readTraining(f)

        logger.info("Starting active labeling")

        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    clustered_df = data_cluster(deduper, data_d, threshold)
    results = df.join(clustered_df, how="left")
    results.drop(["dictionary"], axis=1, inplace=True)

    return results
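Example #10 leans on data_prep, select_fields and data_cluster helpers that are not shown here. A rough sketch of what data_prep might do, assuming its job is to normalize a pandas DataFrame and produce the {record_id: {column: value}} mapping dedupe expects; every detail below is an assumption about that helper, not its actual code:

import pandas as pd

def data_prep(df):
    # Hypothetical helper: normalize string columns and convert the frame
    # into the dict-of-dicts layout used by dedupe.
    clean_df = df.copy()
    for col in clean_df.select_dtypes(include='object').columns:
        clean_df[col] = clean_df[col].str.strip().str.lower()
    clean_df = clean_df.where(clean_df != '', None)  # empty strings -> missing
    data_d = clean_df.to_dict(orient='index')
    return clean_df, data_d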
Example #11
 def manual_training(self, *args, **options):
     logger.info('Starting dedupe training.')
     data = {}
     logger.info('Loading training data...')
     fields = [
         {'field': 'First Name', 'type': 'String'},
         {'field': 'Last Name', 'type': 'String'},
         {'field': 'Mobile', 'type': 'String'},
     ]
     with open('temp/report1445218513088.csv', 'rb') as training_file:
         reader = csv.DictReader(training_file)
         identifier = 1
         for row in reader:
             clean = [(k, self.preprocess_column(v)) for (k, v) in row.items()]
             clean = dict(clean)
             # remove empty records
             delete = False
             for field in fields:
                 if not clean[field['field']]:
                     delete = True
                     break
             if not delete:
                 data[identifier] = clean
                 identifier += 1
     logger.info('Loaded training data.')
     deduper = dedupe.Dedupe(fields)
     logger.info('Taking samples...')
     deduper.sample(data, 10000)
     dedupe.consoleLabel(deduper)
     deduper.train()
     with open('training.json', 'wb') as training_file:
         deduper.writeTraining(training_file)
     with open('settings.json', 'wb') as settings_file:
         deduper.writeSettings(settings_file)
     logger.info('Beginning blocking.')
     logger.info('Computing thresholds...')
     threshold = deduper.threshold(data, recall_weight=2)
     logger.info('Clustering...')
     clustered_dupes = deduper.match(data, threshold)
     logger.info('%s duplicate sets.' % len(clustered_dupes))
     with open('results', 'wb') as output_file:
         output_file.write(repr(clustered_dupes))
     logger.info('Finished blocking.')
     logger.info('Finished dedupe training.')
Example #12
    def active_train(self, input_data, training_file):
        data_d = input_data
        fields = self.fields
        deduper = dedupe.Dedupe(fields)
        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        self.deduper = deduper
        return self
Example #13
    def active_train(self, data_1, data_2, training_file):
        fields = self.fields
        deduper = dedupe.RecordLink(fields)

        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_1, data_2, f)
        else:
            deduper.prepare_training(data_1, data_2)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train(recall=1)

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        self.deduper = deduper
        return self
Example #14
def label_and_train(d_dict):
    deduper = dedupe.Dedupe(FIELDS)

    print("Sampling data...")
    deduper.sample(d_dict, 15_000)

    if TRAINING_PATH.exists():
        with TRAINING_PATH.open('rb') as f:
            print(f"Loading trained examples from {TRAINING_PATH.name}.")
            deduper.readTraining(f)

    print("Starting active labeling...")
    dedupe.consoleLabel(deduper)

    deduper.train()

    with TRAINING_PATH.open('w') as tf:
        deduper.writeTraining(tf)

    with SETTINGS_PATH.open('wb') as sf:
        deduper.writeSettings(sf)
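Example #14 assumes module-level FIELDS, TRAINING_PATH and SETTINGS_PATH constants that the listing does not show. A hedged sketch of what they might look like; the field names are purely illustrative, only the dedupe field-definition format and the pathlib usage are taken as given:

from pathlib import Path

# Hypothetical constants assumed by label_and_train()
FIELDS = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]
TRAINING_PATH = Path('training.json')
SETTINGS_PATH = Path('learned_settings')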
Example #15
def train():
    fields = [{
        'field': 'album_name',
        'variable name': 'album',
        'type': 'String'
    }, {
        'field': 'track_name',
        'variable name': 'track',
        'type': 'String'
    }, {
        'field': 'artist_name',
        'variable name': 'artist',
        'type': 'String'
    }, {
        'field': 'track_duration',
        'type': 'Price',
        'has missing': True
    }, {
        'type': 'Interaction',
        'interaction variables': ['artist', 'album']
    }]

    deduper = dedupe.Dedupe(fields)

    deduper.sample(data_d, 150000)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file) as tf:
            deduper.readTraining(tf)

    print 'starting active labeling...'
    dedupe.consoleLabel(deduper)
    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    with open(settings_file, 'w') as sf:
        deduper.writeSettings(sf)
Example #16
 def start_active_labeling(self, training_file, settings):
     if not self.settings_used:
         print 'Active Labeling...'
         dedupe.consoleLabel(self.duper)
         self.duper.train()
         self.save(training_file, settings)
Example #17
    def main(self):

        data_1 = {}
        data_2 = {}
        # import the specified CSV file

        data_1 = csvhelpers.readData(self.input_1, self.field_names_1,
                                     prefix='input_1')
        data_2 = csvhelpers.readData(self.input_2, self.field_names_2,
                                     prefix='input_2')

        # sanity check for provided field names in CSV file
        for field in self.field_names_1:
            if field not in list(data_1.values())[0]:
                raise self.parser.error(
                    "Could not find field '" + field + "' in input")

        for field in self.field_names_2:
            if field not in list(data_2.values())[0]:
                raise self.parser.error(
                    "Could not find field '" + field + "' in input")

        if self.field_names_1 != self.field_names_2:
            for record_id, record in data_2.items():
                remapped_record = {}
                for new_field, old_field in zip(self.field_names_1,
                                                self.field_names_2):
                    remapped_record[new_field] = record[old_field]
                data_2[record_id] = remapped_record

        logging.info('imported %d rows from file 1', len(data_1))
        logging.info('imported %d rows from file 2', len(data_2))

        logging.info('using fields: %s' % [field['field']
                                           for field in self.field_definition])
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.RecordLink(self.field_definition)

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        deduper.sample(data_1, data_2, self.sample_size)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file

        if os.path.exists(self.training_file):
            logging.info('reading labeled examples from %s' %
                         self.training_file)
            with open(self.training_file) as tf:
                deduper.readTraining(tf)
        elif self.skip_training:
            raise self.parser.error(
                "You need to provide an existing training_file or run this script without --skip_training")

        if not self.skip_training:
            logging.info('starting active labeling...')

            dedupe.consoleLabel(deduper)

            # When finished, save our training away to disk
            logging.info('saving training data to %s' % self.training_file)
            if sys.version < '3' :
                with open(self.training_file, 'wb') as tf:
                    deduper.writeTraining(tf)
            else :
                with open(self.training_file, 'w') as tf:
                    deduper.writeTraining(tf)
        else:
            logging.info('skipping the training step')

        deduper.train()

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall. 
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass in all the blocked data into
        # this function but a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(data_1, data_2,
                                      recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(data_1, data_2, threshold)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeLinkedResults
        # write out our results

        if self.output_file:
            if sys.version < '3' :
                with open(self.output_file, 'wb') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
            else :
                with open(self.output_file, 'w') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
        else:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           sys.stdout, self.inner_join)
Example #18
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            gazetteer.readTraining(tf)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.consoleLabel(gazetteer)

    gazetteer.train()

    # When finished, save our training away to disk
    with open(training_file, 'w') as tf:
        gazetteer.writeTraining(tf)

    # Make the canonical set
    gazetteer.index(canonical)
    
    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        gazetteer.writeSettings(sf, index=True)
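After the canonical set has been indexed and the settings saved in Example #18, matching new messy records against it looks roughly like the sketch below (dedupe 1.x Gazetteer API); messy_data and the threshold are placeholders:

# Hypothetical lookup of new messy records against the indexed canonical set.
matches = gazetteer.match(messy_data, threshold=0.5, n_matches=1)
for match_set in matches:                      # one entry per messy record
    for (messy_id, canonical_id), confidence in match_set:
        print(messy_id, canonical_id, confidence)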
Example #19
  def main(self) :

    data_d = {}
    # import the specified CSV file

    data_d = csvhelpers.readData(self.input, self.field_names)
    
    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition :
      if self.field_definition[field]['type'] != 'Interaction' :
        if field not in data_d[0]:
        
          raise parser.error("Could not find field '" + field + "' in input")

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_d, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.readTraining(self.training_file)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      dedupe.consoleLabel(deduper)
      deduper.train()

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')
      deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_d, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeResults
    # write out our results
    if self.destructive:
      write_function = csvhelpers.writeUniqueResults

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, self.input, output_file)
    else :
        write_function(clustered_dupes, self.input, sys.stdout)
Example #20
def gazetteer_dataframes(messy_df,
                         canonical_df,
                         field_properties,
                         recall_weight,
                         n_matches,
                         config_name="gazetteer_dataframes"):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)

    else:
        # Define the fields the linker will pay attention to
        #
        # Notice how we are telling the linker to use a custom field comparator
        # for the 'price' field.

        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.consoleLabel(gazetteer)

        gazetteer.train()

        # When finished, save our training away to disk

        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)
    #Calc threshold
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip(
        'messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df[
        'canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(
        int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)
    results_df_copy = results_df.copy()

    #For both messy_df & canonical_df, add cluster id & confidence score from results_df
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df_copy, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(
        columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df_copy,
                                      on='canonical_df_link',
                                      how='left')

    #Merge messy_df & canonical_df together
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
Example #21
def train(clients):

    if os.path.exists(settings_file):
        with open(settings_file) as sf:
            print 'reading from', settings_file
            return dedupe.StaticDedupe(sf)

    else:
        fields = [
            {
                'field': 'city',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'country',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'zip',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'state',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'address',
                'type': 'String',
                'Has Missing': True
            },
            {
                'field': 'description',
                'type': 'Text',
                'Has Missing': True,
                'corpus':
                []  #map(lambda x: x['description'],clients.values())},
            },
            {
                'field': 'specific_issues',
                'type': 'Text',
                'Has Missing': True,
                'corpus':
                []  #map(lambda x: x['specific_issues'],clients.values())
            },
            {
                'field': 'exact_name',
                'type': 'String'
            },
            {
                'field': 'alis',
                'type': 'Set',
                'corpus': []  #map(lambda x: x['alis'],clients.values())
            },
            {
                'field': 'houseID',
                'type': 'Exact'
            },
            {
                'field': 'senate',
                'type': 'Exact'
            },
        ]
        #for definition in fields[:] :
        #    if definition.field != 'houseID':
        #        fields[k+"-houseID"] = {'type':'Interaction',
        #                                'Interaction Fields': ["houseID", k]}
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a random sample of records.
        deduper.sample(clients, 150000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            with open(training_file) as tf:
                print 'reading labeled examples from ', training_file
                deduper.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'

        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, uncovered_dupes=2)

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'w') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()

        return deduper
Example #22
def get_resolved_df(data_1,data_2,fields):

    # Training

    if os.path.exists(settings_file):
        print ('reading from', settings_file)
        with open(settings_file, 'rb') as sf :
            linker = dedupe.StaticRecordLink(sf)

    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf :
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf :
            linker.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering

    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print (linked_records)

    # ## Writing Results

    df_list = []

    for cluster, score in linked_records:

        #obtain filename + record no clusters
        cluster1 = cluster[0]
        cluster2 = cluster[1]
        match = re.match(r"(\w+)\.csv(\d+)",cluster1)
        filename,idx = match.groups()
        filename +='.csv'
        filename = pr.resource_filename('src.input',filename)
        idx = int(idx)
        print(filename)
        print(idx)

        #dataframe for cluster - 1
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).ix[idx]

        #for cluster 2
        match = re.match(r"(\w+)\.csv(\d+)",cluster2)
        filename,idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input',filename)
        idx = int(idx)
        print(filename)
        print(idx)

        #dataframe for cluster 2
        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).ix[idx]

        #common attribute
        df2['score'] = [score]
        df1['score'] = [score]

        df = pd.merge(df1,df2,on = 'score')
        df_list.append(df)

    results_df = pd.concat(df_list,ignore_index = True)
    return results_df
Example #23
def deDuplication():

    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):

        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()

        if not column:
            column = None
        return column

    # Read in the data from CSV file:
    def readData(filename):

        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)

        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {
                'field': 'DisplayName_Processed',
                'type': 'String'
            },
            {
                'field': 'Email_Processed',
                'type': 'String'
            },
            {
                'field': 'Info',
                'type': 'String'
            },
        ]
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w',
              encoding="utf-8") as f_output, open(input_file,
                                                  encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id][
                    "canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
    return clustered_dupes
Example #24
def dedupe_training(category, recall_weight=1):
    assert (category in ['mangas', 'animes']),"Only mangas or animes needs training"
    data = {}
    for work in Work.objects.filter(category__slug=category[:-1]).values_list('title', 'vo_title', 'id'):
        data[work[2]] = {'title': work[0], 'vo_title': work[1]}
        for field in ['title', 'vo_title']:
            if not data[work[2]][field]:
                data[work[2]][field] = None
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'vo_title', 'type': 'String'},
    ]
    output_file = os.path.join(DEDUPE_DIR, category + '_output.csv')
    settings_file = os.path.join(DEDUPE_DIR, category + '_learned_settings')
    training_file = os.path.join(DEDUPE_DIR, category+'_training.json')

    deduper = Dedupe(fields)
    deduper.sample(data)
    consoleLabel(deduper)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    # recall_weight defines how much you value recall relative to precision
    threshold = deduper.threshold(data, recall_weight)

    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)

    print('# duplicate sets', len(clustered_dupes))

    input_file = os.path.join(DEDUPE_DIR, category + '.csv')
    with open(input_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['id', 'title', 'vo_title'])
        for work_id in data:
            title = data[work_id]['title']
            vo_title = data[work_id]['vo_title']
            writer.writerow([work_id, title, vo_title])
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        canonical_rep = canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id" : cluster_id,
                "canonical representation" : canonical_rep,
                "confidence": score
            }
    singleton_id = cluster_id + 1

    with open(output_file, 'w', newline='', encoding='utf-8') as f_output, open(input_file, newline='', encoding='utf-8') as f_input:
        writer = csv.writer(f_output, delimiter='\t')
        reader = csv.reader(f_input, delimiter='\t')

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
    print('output file written')
Example #26
def start_dedupe(df):
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        fields = [
            {
                'field': 'MESSZEIT',
                'type': 'Exact'
            },
            {
                'field': 'KENNUNG',
                'type': 'Exact'
            },
            {
                'field': 'GEOGR_BREITE',
                'type': 'Custom',
                'comparator': sim_num_abs
            },
            {
                'field': 'GEOGR_LAENGE',
                'type': 'Custom',
                'comparator': sim_num_abs
            },
            {
                'field': 'HORIZONTALE_SICHT',
                'type': 'ShortString',
                'has missing': True
            },
            {
                'field': 'WETTER',
                'type': 'Custom',
                'comparator': sim_ww,
                'has missing': True
            },
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(df, 15000, .5)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(df, recall_weight=1)
    print(f'threshold: {threshold}')

    print('clustering...')
    # clustered_dupes = deduper.match(df, .98)
    clustered_dupes = deduper.match(df, threshold)

    print(f'# duplicate sets {len(clustered_dupes)}')
    return clustered_dupes
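Example #26 passes custom comparators (sim_num_abs, sim_ww) that are defined elsewhere in that project. For a 'Custom' field, dedupe calls the comparator with the two field values and uses the returned number as a distance, so a hypothetical sim_num_abs could look like the sketch below (the real implementation may differ):

def sim_num_abs(field_1, field_2):
    # Hypothetical comparator: absolute numeric difference between the two values.
    try:
        return abs(float(field_1) - float(field_2))
    except (TypeError, ValueError):
        return 0.0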
Example #27
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in data_1.values()[0]:
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in data_2.values()[0]:
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.readTraining(self.training_file)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      dedupe.consoleLabel(deduper)
      deduper.train()

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')
      deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       sys.stdout,
                       self.inner_join)
Example #28
variables = [{
    'field': 'primary_author_name',
    'type': 'String'
}, {
    'field': 'full_book_title',
    'type': 'String'
}]

# Initialise deduplication object
deduper = dedupe.Dedupe(variables)

# Train the model
deduper.sample(data, 150000, .5)  # randomly generates 150,000 pairs of records to compare
dedupe.consoleLabel(deduper)  # call up the command line tool for manually labelling training pairs
deduper.train()  # train the model to work out which pairs are the same

# Write training to disk
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

# Now make our predictions
threshold = deduper.threshold(data, recall_weight=1)  # we care equally about precision and recall

print("clustering ...")
Exemplo n.º 29
0
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf :
            linker.readTraining(tf)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.consoleLabel(linker)

    linker.train()

    # When finished, save our training away to disk
    with open(training_file, 'w') as tf :
        linker.writeTraining(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf :
        linker.writeSettings(sf)


# ## Blocking
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print 'starting active labeling...'

    dedupe.consoleLabel(deduper)

    deduper.train()

    # When finished, save our training away to disk
    with open(training_file, 'w') as tf :
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf :
        deduper.writeSettings(sf)


# ## Blocking
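As the comment above notes, once the settings file exists the training and learning steps can be skipped on later runs. A minimal sketch of that reuse path, assuming the dedupe 1.x StaticDedupe API and the same data_d record dictionary used in these examples:

import dedupe

with open(settings_file, 'rb') as sf:
    deduper = dedupe.StaticDedupe(sf)  # loads the learned weights and blocking predicates

threshold = deduper.threshold(data_d, recall_weight=1)
clustered_dupes = deduper.match(data_d, threshold)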
Exemplo n.º 31
0
def main_dedupe_method(data_d,spot_d,spot_data,chosen_cols3,country_code,settings_file,training_file,fields):
        
     
    print '''############# Welcome to our Matching Algorithm  ################
    =============== Initializing Part 1 - Data Retrieval =================\n'''
    start_time = time.time()
    
 
    print "Getting data from BP companies..."                
    
    # Select all companies from selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)
    
    
    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
	CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
	auth.current_authority_pk as auth_nr,
	auth.s_acct_name as "name",
	auth.s_tax_number as vat,
	ads.address_line1 as address,
	ads.address_line3 as town,
	ads.postcode as postcode
	FROM gdm.authorities auth
	LEFT JOIN gdm.addresses ads
		ON ads.address_id_pk = auth.s_address_id
	WHERE auth.s_acct_name is not null
	ORDER BY auth.company_int_id_number'''    
    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)


    print "Saving all data into database..."

    all_data_d=md.merge_dicts(spot_d,comp_d)

    all_data=[]

    for k,v in all_data_d.iteritems():   
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0,k)
        all_data.append(new_row)


    c2 = con.cursor()


    c2.execute('DROP TABLE IF EXISTS htc.all_data')

    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data
               (%s)''' %(field_string))

    num_cols = len(chosen_cols3)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog,x) for x in all_data)
    values = "("+ ','.join(x for x in chosen_cols3) +")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()


    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
    start_time1 = time.time()


    print "Arranging data to create blocks..."

    u_keys = set()

    for n,m in all_data_d.iteritems():
        for k,v in m.iteritems():
            u_keys.add(v)

    u_keys = list(u_keys)


    block_keys = {}
    count = 0

    for k in u_keys:
        block_keys[k] = count
        count += 1
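    # Each distinct value seen across the records (name, postcode or town) gets its own
    # integer block key; below, a record is added to the block of every value it carries,
    # so two records that share a name, postcode or town end up in a common block.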

    
    print 'Checking blocks...'

    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']],v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] != None:
                ids.append([block_keys[v['postcode']],v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] != None:
                ids.append([block_keys[v['town']],v['town'], k])


    print 'Writing tables...'

    column_names = ['block_key','block_id','record_id']

    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'

    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test
               (%s)''' %(field_string))

    num_cols = len(column_names)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog,x) for x in ids)
    values = "("+ ','.join(x for x in column_names) +")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'

    ## Table with all blocks that have more than one company ##

    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")

    c3.execute("""CREATE TABLE htc.plural_blocks as (
                SELECT b.block_id 
                FROM htc.blocks_recordlink_test b
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id like '%c') bp
                ON bp.block_id = b.block_id
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id not like '%c') comp
                ON comp.block_id = b.block_id
                GROUP BY b.block_id 
                HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'

    ## Table with list of blocks for each company ##

    c3.execute("""CREATE TABLE htc.covered_blocks as (
                SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
                FROM htc.blocks_recordlink_test r
                INNER JOIN htc.plural_blocks bl
                ON r.block_id = bl.block_id
                GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    print "Blocking..."

    ## Get all companies and blocks ##

    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
               FROM htc.blocks_recordlink_test br
               LEFT JOIN 
		        (SELECT * FROM htc.all_data) m
                    ON m.line_nr = br.record_id
                INNER JOIN htc.covered_blocks cb
                    ON br.record_id = cb.record_id
                INNER JOIN htc.plural_blocks pb
                    ON br.block_id = pb.block_id
                ORDER BY br.block_key""")


    rec_ids = c3.fetchall()


    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'
    start_time2 = time.time()

    print "Blocking..."
    ## Arrange data into the needed syntax to use in dedupe ##

    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []


    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                       ]))  
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                         ])) 

            
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
                bp_block = []
                comp_block = []
                if rec['record_id'][-1:] == 'c':
                    comp_block =[tuple([rec['record_id'],
                                        comp_d[rec['record_id']], 
                                        set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                        ])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                   ])]
    
            last_key = rec['block_key']

        count +=1

    blocks.append((bp_block, comp_block))

    blocks = tuple(blocks)


    del rec_ids
    del all_data

    con.close() 



    # Dynamically create fields according to the chosen columns
    # We still need to decide how to tell the model what to do when only the name is available



    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'
    start_time3 = time.time()


    print "=============== Initializing Part 2 - Dedupe ===============\n"

    print 'Entering Dedupe ...'

    #================================ DEDUPING WITH BLOCKS ==============================================

    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        with open(settings_file) as sf :
            linker = dedupe.StaticRecordLink(sf)

    else:
        
        linker = dedupe.RecordLink(fields)
        
        linker.sample(data_d,comp_d)

        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf :
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)
        linker.train()
        with open(training_file, 'w') as tf :
            linker.writeTraining(tf)

        with open(settings_file, 'w') as sf :
            linker.writeSettings(sf)

    print 'Creating blocks...'

    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'

    clusters = linker.matchBlocks(blocks, threshold)
    clust=list(clusters)

    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'
    start_time4 = time.time()  

    print 'Writing results to database...'

    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    c = con.cursor()

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')


    #register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c.mogrify(mog,x[0]+(float(x[1]),)) for x in clust)
    values = "("+ ','.join(x for x in ['spot_id','comp_id', 'score']) +")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))


    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')

    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
               SELECT adt.line_nr as spot_id,br.comp_id as comp_id,br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
               ELSE 'Check Match Score'
               END AS "Result",
               adt.name as name_spot,b.name as name_comp, b.auth_nr as auth_nr_comp
               FROM htc.all_data adt
               LEFT JOIN htc.recordlink_blocks_result br
                   ON br.spot_id = adt.line_nr
                  LEFT JOIN 
                    (SELECT *
                    FROM htc.recordlink_blocks_result br
                    INNER JOIN htc.all_data adt
                    ON adt.line_nr= br.comp_id) b
                ON b.spot_id = adt.line_nr
                WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()



    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ' , time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
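Once the run finishes, the scores land in htc.recordlink_blocks_result. A quick, illustrative way to review the strongest candidate matches (the 0.75 cut-off below is arbitrary, and the connection details mirror the redacted ones above):

import psycopg2 as psy

con = psy.connect(database='msi_bp_%s' % country_code, user='******', host='basil.colours.local', password='******')
c = con.cursor()
c.execute("""SELECT spot_id, comp_id, score
             FROM htc.recordlink_blocks_result
             WHERE score > 0.75
             ORDER BY score DESC""")
for spot_id, comp_id, score in c.fetchall():
    print('%s %s %.3f' % (spot_id, comp_id, score))
con.close()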