def trainIncoming(name):
    """Interactively train the address gazetteer on rows of table ``name``.

    Samples the rows of ``name`` that have a ``complete_address`` but no
    ``address_id``, loads previously labeled examples from
    ``geocoder/data/training.json`` when that file exists, asks for more
    labels on the console, trains the model, and finally writes the labeled
    examples and the learned settings to disk.

    Args:
        name: Name of the table to read addresses from. It is interpolated
            into the SQL text as-is, so it must come from a trusted source.

    Raises:
        SystemExit: If ``checkForTable`` returns ``None`` for ``name``.
    """
    # Function-scope imports: pulled in only when this function runs.
    from geocoder.deduper import DatabaseGazetteer
    import dedupe

    training_file = 'geocoder/data/training.json'
    settings_file = 'geocoder/dedupe.settings'

    engine = create_engine(DB_CONN)
    deduper = DatabaseGazetteer(
        [{'field': 'complete_address', 'type': 'Address'}],
        engine=engine)

    sql_table = checkForTable(engine, name)
    if sql_table is None:
        # NOTE(review): exits with status 0 and prints nothing here;
        # presumably checkForTable reports the missing table -- confirm.
        sys.exit()

    # NOTE(review): assumes the table has at least one primary key column;
    # the [0] lookup fails otherwise -- confirm against callers.
    primary_key = sql_table.primary_key.columns.keys()[0]

    # Identifiers cannot be bound as query parameters, hence the string
    # formatting of the column and table names.
    messy_table = '''
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)
    # Only the address text is handed to the sampler; rows are streamed
    # lazily from the cursor.
    messy_data = ({'complete_address': r.complete_address} for r in curs)
    deduper.drawSample(messy_data, sample_size=30000)

    # Reuse labels from an earlier session when they are available.
    if os.path.exists(training_file):
        print('reading labeled examples from {0}'.format(training_file))
        with open(training_file) as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)
    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk.
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()
# # suburbs.mergeTables()
connection.close()

# Training branch: label and train the address gazetteer from the sample
# stored in geocoder/data/messy_addresses.json.
if args.train:
    # Branch-local imports: pulled in only when --train style runs happen.
    from geocoder.deduper import DatabaseGazetteer
    import simplejson as json
    import dedupe
    from geocoder.app_config import DB_CONN

    training_file = 'geocoder/data/training.json'

    engine = create_engine(DB_CONN)
    deduper = DatabaseGazetteer(
        [{'field': 'complete_address', 'type': 'Address'}],
        engine=engine)

    # Use a context manager so the file handle is closed as soon as the
    # data has been parsed instead of lingering until garbage collection.
    with open('geocoder/data/messy_addresses.json') as messy_file:
        messy_data = json.load(messy_file)
    deduper.drawSample(messy_data, sample_size=30000)

    # Reuse labels from an earlier session when they are available.
    if os.path.exists(training_file):
        print('reading labeled examples from {0}'.format(training_file))
        with open(training_file) as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)
    deduper.train(ppc=0.1, index_predicates=False)
# # suburbs.run(download_url=download_url) # # suburbs.mergeTables() connection.close() if args.train: from geocoder.deduper import DatabaseGazetteer import simplejson as json import dedupe from geocoder.app_config import DB_CONN engine = create_engine(DB_CONN) deduper = DatabaseGazetteer([{'field': 'complete_address', 'type': 'Address'}], engine=engine) messy_data = json.load(open('geocoder/data/messy_addresses.json')) deduper.drawSample(messy_data, sample_size=30000) if os.path.exists('geocoder/data/training.json'): print('reading labeled examples from geocoder/data/training.json') with open('geocoder/data/training.json') as tf : deduper.readTraining(tf) dedupe.consoleLabel(deduper) deduper.train(ppc=0.1, index_predicates=False) # When finished, save our training away to disk with open('geocoder/data/training.json', 'w') as tf :
def trainIncoming(name):
    """Interactively train the address gazetteer on rows of table ``name``.

    Samples the rows of ``name`` that have a ``complete_address`` but no
    ``address_id``, loads previously labeled examples from
    ``geocoder/data/training.json`` when that file exists, asks for more
    labels on the console, trains the model, and finally writes the labeled
    examples and the learned settings to disk.

    Args:
        name: Name of the table to read addresses from. It is interpolated
            into the SQL text as-is, so it must come from a trusted source.

    Raises:
        SystemExit: If ``checkForTable`` returns ``None`` for ``name``.
    """
    # Function-scope imports: pulled in only when this function runs.
    from geocoder.deduper import DatabaseGazetteer
    import dedupe

    training_file = 'geocoder/data/training.json'
    settings_file = 'geocoder/dedupe.settings'

    engine = create_engine(DB_CONN)
    deduper = DatabaseGazetteer(
        [{'field': 'complete_address', 'type': 'Address'}],
        engine=engine)

    sql_table = checkForTable(engine, name)
    if sql_table is None:
        # NOTE(review): exits with status 0 and prints nothing here;
        # presumably checkForTable reports the missing table -- confirm.
        sys.exit()

    # NOTE(review): assumes the table has at least one primary key column;
    # the [0] lookup fails otherwise -- confirm against callers.
    primary_key = sql_table.primary_key.columns.keys()[0]

    # Identifiers cannot be bound as query parameters, hence the string
    # formatting of the column and table names.
    messy_table = '''
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)
    # Only the address text is handed to the sampler; rows are streamed
    # lazily from the cursor.
    messy_data = ({'complete_address': r.complete_address} for r in curs)
    deduper.drawSample(messy_data, sample_size=30000)

    # Reuse labels from an earlier session when they are available.
    if os.path.exists(training_file):
        print('reading labeled examples from {0}'.format(training_file))
        with open(training_file) as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)
    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk.
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()