def __getitem__(self, index):
    # Crop the source image to the configured patch size (RGB)
    orgData = utils.picCut(
        np.array(Image.open(self.filepath[index]).convert('RGB')),
        config.crop_size_id)
    noiData = utils.addGaussianNoise(orgData, 1)
    # Normalize both patches from [0, 255] to [-0.5, 0.5] and move channels first
    return noiData.astype(np.float32).transpose(
        [2, 0, 1]) / 255 - 0.5, orgData.astype(np.float32).transpose(
            [2, 0, 1]) / 255 - 0.5
def netInit(self):
    utils.logMaker('INFO', 'NETWORK FILES INITIALIZING...')
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.generator_srgan = torch.load(config.network_srg_path,
                                      map_location=self.device.type)
    self.generator_idgan = torch.load(config.network_idg_path,
                                      map_location=self.device.type)
def __getitem__(self, index):
    cropSize = utils.sizeRecurrect(config.crop_size_esr, config.up_scale)
    # Crop a high-resolution patch, then downscale it to form the low-resolution input
    hrPic = utils.picCut(
        np.array(Image.open(self.filepath[index]).convert('RGB')), cropSize)
    lrPic = utils.resize(
        hrPic, (cropSize // config.up_scale, cropSize // config.up_scale))
    # Normalize both patches from [0, 255] to [-0.5, 0.5] and move channels first
    return lrPic.astype(np.float32).transpose(
        [2, 0, 1]) / 255 - 0.5, hrPic.astype(np.float32).transpose(
            [2, 0, 1]) / 255 - 0.5
def countyGroupByStatePersistence():
    stateWisePath = './pages/json/statewise/'
    statesUnique = usLiveCounty['state'].unique()
    tmpData = {}
    for state in statesUnique:
        tmpFrame = usLiveCounty[usLiveCounty['state'] == state].fillna(value=0)
        tmpData['countyX'] = tmpFrame['county'].tolist()
        tmpData['casesY'] = tmpFrame['cases'].astype(int).tolist()
        tmpData['deathsY'] = tmpFrame['deaths'].astype(int).tolist()
        # The first two digits of a county FIPS code identify the state
        statefips = str(tmpFrame['fips'].iloc[0])[:2]
        UTILS.toJsonFile(tmpData, '{}{}/'.format(stateWisePath, statefips),
                         'counties-under.json')
def OCR(image_path):
    """Perform OCR on the image at the given path."""
    UTILS.save_to_grayscale(image_path)
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()
    image = types.Image(content=content)
    response = client.document_text_detection(image=image)
    texts = response.text_annotations
    # The first annotation contains the full text detected in the document
    return texts[0].description
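# A minimal sketch of the module-level Vision client assumed by OCR() above. This is
# an assumption: it presumes the pre-2.0 google-cloud-vision package (which still
# exposes a `types` module) and credentials supplied via GOOGLE_APPLICATION_CREDENTIALS.
import io
from google.cloud import vision
from google.cloud.vision import types

client = vision.ImageAnnotatorClient()

if __name__ == '__main__':
    # Example usage: print the full text detected in a sample scan (hypothetical path).
    print(OCR('sample_page.png'))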
def mapDataPersistence():
    mapDataPath = './pages/json/mapdata/'
    tmp = [usLiveState, usLiveCounty]
    for idx in range(len(tmp)):
        df = tmp[idx]
        fipsX, seriesY = getCasesOrDeathsSeries(df, 'fips', 'both')
        if len(fipsX) != len(seriesY['cases']) or len(fipsX) != len(
                seriesY['deaths']):
            raise Exception('Index Must Match')
        tmpCaseData = []
        tmpDeathsData = []
        for i in range(len(fipsX)):
            tmpCaseEntry = {}
            tmpDeathEntry = {}
            tmpCaseEntry['fipsCode'] = fipsX[i]
            tmpDeathEntry['fipsCode'] = fipsX[i]
            tmpCaseEntry['value'] = seriesY['cases'][i][0]
            tmpDeathEntry['value'] = seriesY['deaths'][i][0]
            tmpCaseData.append(tmpCaseEntry)
            tmpDeathsData.append(tmpDeathEntry)
        if idx == 0:
            UTILS.toJsonFile(tmpCaseData, mapDataPath, 'states-cases.json')
            UTILS.toJsonFile(tmpDeathsData, mapDataPath, 'states-deaths.json')
        else:
            UTILS.toJsonFile(tmpCaseData, mapDataPath, 'counties-cases.json')
            UTILS.toJsonFile(tmpDeathsData, mapDataPath,
                             'counties-deaths.json')
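# Hypothetical stand-in for the UTILS.toJsonFile helper used by the persistence
# functions above and below; the real implementation is not shown, so this is only an
# assumed sketch (create the directory if needed, then dump the payload as JSON).
import json
import os


def toJsonFile(data, dirPath, fileName):
    # Create the target directory if it does not already exist
    os.makedirs(dirPath, exist_ok=True)
    # Write the payload as UTF-8 JSON
    with open(os.path.join(dirPath, fileName), 'w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False)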
def jwsim_contracts_irs(contracts, irs, suffix):
    '''
    Takes the contracts and IRS dataframes and returns a dataframe of records
    with matching names where the JW similarity is >= JWSIM_THRESH.
    '''
    # Rename the columns in IRS
    irs = u.rename_cols(irs, irs.columns, suffix)
    # Restrict the contracts df to just those from IL
    contracts = contracts[contracts.CSDS_Contract_ID.str.startswith('IL')]
    # Take the cartesian product between the two; replace np.NaN with ''
    prod = mn.cart_prod(contracts, irs)
    prod = prod.replace(np.NaN, '')
    # Print progress report
    print('Calculating Jaro-Winkler similarity on vendor names')
    # Compute the Jaro-Winkler similarity on the VendorName cols
    col1 = 'VendorName'
    arg = (prod, col1, col1 + suffix)
    jwsim = mn.parallelize(mn.jwsim, arg)
    # Return only the rows where JW similarity >= JWSIM_THRESH
    return jwsim[jwsim.JWSimilarity >= JWSIM_THRESH]
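# Illustrative only: mn.jwsim is not shown here, so this sketch assumes it scores the
# two vendor-name columns row by row with Jaro-Winkler similarity, approximated below
# with the jellyfish package. The column names mirror the call in the function above.
import jellyfish


def jwsim_sketch(df, col1, col2):
    # Attach a JWSimilarity column comparing the two name columns element-wise
    df = df.copy()
    df['JWSimilarity'] = [
        jellyfish.jaro_winkler_similarity(str(a), str(b))
        for a, b in zip(df[col1], df[col2])
    ]
    return df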
def try_fill(df):
    '''
    Fills in missing zip codes and coordinates as best as possible. Copies in
    values from elsewhere in the dataset and from the geocoded HQ addresses.
    Returns a dataframe.
    '''
    # Print progress report
    print('\nFilling in missing zip codes and coordinates as best as possible')
    # Fill in missing zip codes as best as possible
    targetsZ = ['ZipCode']
    keys1Z = ['Address', 'City', 'State']
    keys2Z = ['Name', 'Longitude', 'Latitude']
    df = filler(df, targetsZ, str, keys1Z, keys2Z)
    # Fill in missing longitude and latitude coordinates as best as possible
    targetsL = ['Longitude', 'Latitude']
    keys1L = ['Address', 'City', 'State', 'ZipCode']
    keys2L = ['Name']
    df = filler(df, targetsL, float, keys1L, keys2L)
    # Read in the geocoded HQ addresses and fill in zip codes and coordinates
    # as best as possible
    geo = read_geo()
    subset = ['Address', 'City', 'State']
    df = u.merge_coalesce(df.reset_index(drop=True), geo, subset)
    return df
def import_pb(fname):
    '''
    Reads in the PurpleBinder dataset. Splits each record into multiple based
    on the number of locations contained in the locations field. Splits the
    location column into its component parts (Address1, Address2, City, State,
    & ZipCode) and then converts all the strings to uppercase. Returns a
    dataframe.
    '''
    # Read in the json file
    df = read_pb(fname)
    # Split the locations into multiple rows (one row per location)
    splitR = split_rows(df)
    # Split the location column into its component parts
    splitC = split_cols(splitR)
    # Convert string columns to uppercase
    df_upper = u.upper(splitC)
    # There are serious problems with some of the geocoding in the PB data, so
    # drop the coordinates
    df_upper = df_upper.drop(['Latitude', 'Longitude'], axis=1)
    return df_upper
def merger(dollars_divided, geo):
    '''
    Merges the dollars_divided and geo dataframes, coalescing the values
    across matching columns. Drops unwanted columns. Returns a dataframe.
    '''
    # Define the arguments to merge_coalesce
    keys = ['Address', 'City', 'State', 'ZipCode']
    sfx = '_R'
    how = 'left'
    # Merge dollars_divided and geo together, filling in coordinates
    df = u.merge_coalesce(dollars_divided, geo, keys, sfx, how)
    # Drop these columns
    df = df.drop([
        'ClusterID', 'VendorName_LINK1', 'VendorName_LINK2', 'Name',
        'CSDS_Vendor_ID_LINK2'
    ], axis=1)
    # Drop duplicates based only on this subset
    subset = ['CSDS_Vendor_ID', 'Address', 'City', 'State', 'ZipCode']
    return df.drop_duplicates(subset=subset).reset_index(drop=True)
def read_svc():
    '''
    Reads in the service agency addresses. Calls the COMPARE_ADDRESSES module
    to merge duplicate addresses per agency. Counts the number of service
    addresses per organization. Returns a dataframe.
    '''
    # Print progress report
    print('\nReading in service agencies')
    # Read in the service agencies, converting zip code to string
    df = pd.read_csv(SVC, converters={'ZipCode': str})
    # Append '_SVC' to all columns except CSDS_Svc_ID
    df = u.rename_cols(df, [x for x in df.columns if x != 'CSDS_Svc_ID'],
                       '_SVC')
    # Rename a column to prepare for linking
    df = df.rename(columns={'CSDS_Svc_ID': 'CSDS_Vendor_ID_LINK2'}, index=str)
    # Use the COMPARE_ADDRESSES module to clean up multiple strings for a
    # single address record
    key = 'CSDS_Vendor_ID_LINK2'
    target = 'Address_SVC'
    fixed_addresses = ca.fix_duplicate_addresses(df, key, target)
    # Drop duplicates based on the key and target fields
    fixed_addresses = fixed_addresses.drop_duplicates(subset=[key, target])
    return fixed_addresses
def read_contracts():
    '''
    Reads in the contracts dataset via the MERGE_CONTRACTS module. Returns a
    dataframe.
    '''
    # Initialize an empty list to hold the dataframes
    dfs = []
    # For every (filename, label) tuple:
    for fname_tuple in mc.FNAMES:
        # Read in and process the dataset
        df = mc.process_dataset(fname_tuple)
        # If the label is 'CHI':
        if fname_tuple[-1] == 'CHI':
            # Send the dataframe through the round2 address cleaner
            df = addclean.round2(df)
            # Send the Address1 field through the address cleaner
            df['Address1'] = df['Address1'].apply(addclean.address_cleaner)
        # Add the newly processed dataframe into the list
        dfs.append(df)
    # Concatenate all the dataframes
    merged = pd.concat(dfs)
    # Convert the text columns (except for the URLs) to uppercase
    merged = u.upper(merged)
    # The resulting dataframe should contain 6591 records
    return merged
def import_addresses(dataset):
    '''
    Reads in one of three address datasets (specified with a string). Returns
    a dataframe.
    '''
    print('Reading in {} addresses'.format(dataset.upper()))
    # Read in the COOK address dataset; rename a column
    if dataset == 'cook':
        df = ad.read_cook_addr()
        df = df.rename(columns={'ID': 'VendorName'}, index=str)
    # Read in the IRS dataset; rename a column and standardize names
    elif dataset == 'irs':
        df = ad.read_irs()
        df = df.rename(columns={'OrganizationName': 'VendorName'}, index=str)
        df['VendorName'] = df['VendorName'].apply(stdname)
    # Read in the IL address dataset; standardize names
    elif dataset == 'il':
        df = ad.read_il_addr()
        df['VendorName'] = df['VendorName'].apply(stdname)
    # Convert text fields to uppercase
    df = u.upper(df)
    return df
def preprocess_contracts():
    '''
    Reads in the contract records. Preprocesses them to clean the amounts and
    keep only those over the minimum amount specified in the MIN_DOLLARS
    constant. Imports hand-collected addresses for Cook and IL contracts and
    merges in addresses from IRS990 forms to fill in as many blanks as
    possible. Returns a dataframe.
    '''
    # Read in the contracts and clean the dollar amounts
    contracts = read_contracts()
    contracts = clean_amounts(contracts)
    # Read in the COOK addresses dataset
    cook = import_addresses('cook')
    # Fill in addresses from the COOK dataset, matching on VendorName; then,
    # standardize VendorName
    print('Coalescing COOK address matches')
    merged = u.merge_coalesce(contracts, cook, 'VendorName')
    merged['VendorName'] = merged['VendorName'].apply(stdname)
    # Read in the IRS dataset
    irs = import_addresses('irs')
    # Get a dataframe of JW similarity matches >= JWSIM_THRESH between the
    # merged and irs dataframes
    sfx = '_IRS'
    jwsim = jwsim_contracts_irs(merged, irs, sfx)
    # Print progress report
    print('Coalescing IRS matches')
    # Fill in addresses from the IRS dataset
    coalesced = coalesce_matches(merged, jwsim, sfx)
    # Read in the IL addresses dataset
    il = import_addresses('il')
    # Print progress report
    print('Coalescing IL matches')
    # Fill in addresses from the IL dataset, matching on VendorName
    df = u.merge_coalesce(coalesced, il, 'VendorName')
    return df
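# A minimal, hypothetical driver for the preprocessing pipeline above; the original
# project's entry point is not shown, and the output filename here is made up.
if __name__ == '__main__':
    contracts_df = preprocess_contracts()
    contracts_df.to_csv('contracts_preprocessed.csv', index=False)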
def __init__(self):
    utils.logMaker('INFO', 'APPLICATION LAUNCHED')
    super(Launcher, self).__init__()
    self.setupUi(self)
    self.setFixedSize(self.width(), self.height())
    self.setWindowIcon(QIcon(config.icon_path))
    self.setWindowFlags(Qt.FramelessWindowHint)
    # Wire the UI buttons and the tip timer to their handlers
    self.open.clicked.connect(self.callManage)
    self.denoising.clicked.connect(self.callManage)
    self.save.clicked.connect(self.callManage)
    self.exit.clicked.connect(self.callManage)
    self.timer.timeout.connect(self.tipClose)
    self.thumbnailPath = None
    self.filePath = None
    self.denoised_thumbnailPath = None
    self.denoisedPath = None
    self.denoised_fileName = None
def import_dfss(fname):
    '''
    Reads in the DFSS dataset, converting strings to uppercase. Assigns an ID.
    Returns a dataframe.
    '''
    df = read_dfss(fname)
    df_upper = u.upper(df)
    return df_upper
def main():
    urls = []
    # Read URLs from stdin until a blank line is entered
    while True:
        line = input()
        if line:
            urls.append(line)
        else:
            break
    # Dispatch each URL to its site-specific handler
    for url in urls:
        siteMap[_parse_site(url)](url)
        time.sleep(1)
    UTILS()
def completion(self):
    check = utils.completionCheck()
    if not check[0]:
        self.ok = QPushButton("OK")
        self.ok.setStyleSheet(
            "background-color:rgb(110,200,209);color:white;")
        self.tipBox = QMessageBox()
        self.tipBox.setWindowFlags(Qt.FramelessWindowHint)
        self.tipBox.setText("Files missing")
        self.tipBox.setWindowTitle("Notice")
        self.tipBox.setStyleSheet(
            "background-color:rgb(51,51,51);color:white;")
        self.tipBox.addButton(self.ok, QMessageBox.AcceptRole)
        self.tipBox.setIcon(QMessageBox.NoIcon)
        self.tipBox.show()
        utils.logMaker('ERROR', 'FILES NOT EXIST', check[1])
        utils.logMaker('INFO', 'EXCEPTION CLOSED')
    else:
        self.netInit()
        self.show()
def lineDataPersistence():
    stateWisePath = './pages/json/statewise/'
    overviewPath = './pages/json/overview/'
    timeSplit = [7, 30, 365]
    # Persist the nationwide overview series for each time window
    for timeScale in timeSplit:
        fullDataWithinScale = gainDataWithinGivenDays(usFull, timeScale)
        dateSeries = list(np.array(fullDataWithinScale.index.unique()))
        dateSeries = UTILS.datetime64ToStr(dateSeries)
        tmpData = {}
        tmpData['dayX'] = dateSeries
        tmpData['casesY'] = fullDataWithinScale['cases'].tolist()
        tmpData['deathsY'] = fullDataWithinScale['deaths'].tolist()
        UTILS.toJsonFile(tmpData, overviewPath, '{}.json'.format(timeScale))
    # Persist the per-state series for each time window
    for timeScale in timeSplit:
        stateDataWithinScale = gainDataWithinGivenDays(usState, timeScale)
        dateSeries = list(np.array(stateDataWithinScale.index.unique()))
        dateSeries = UTILS.datetime64ToStr(dateSeries)
        stateX, stateY = getCasesOrDeathsSeries(stateDataWithinScale,
                                                identifiedCol='fips',
                                                casesOrDeaths='both')
        if len(stateX) != len(stateY['cases']) or len(stateX) != len(
                stateY['deaths']):
            raise Exception('Index Must Match')
        for i in range(len(stateX)):
            tmpData = {}
            tmpData['dayX'] = dateSeries
            tmpData['casesY'] = stateY['cases'][i]
            tmpData['deathsY'] = stateY['deaths'][i]
            UTILS.toJsonFile(tmpData,
                             '{}{}/'.format(stateWisePath, stateX[i]),
                             '{}.json'.format(timeScale))
def runNet(self, path):
    fileName = path.split('\\')[-1]
    # Load the image, normalize to [-0.5, 0.5], and add a batch dimension
    data = torch.tensor(
        np.array(Image.open(path).convert('RGB'), dtype=np.float32).transpose(
            [2, 0, 1]) / 255 - 0.5).unsqueeze(dim=0).to(self.device)
    self.saveTip.setText('Network processing...')
    # Denoise first, then upscale the denoised result
    data_denoised = self.generator_idgan(data)
    data_upscale = self.generator_srgan(data_denoised)
    utils.logMaker('INFO', 'OPERATION SUCCESSFUL')
    # Convert back to an 8-bit HWC image and save it to the cache directory
    pic_array = (data_upscale[0].cpu().detach().numpy() + 0.5) * 255
    picDenoised = Image.fromarray(
        pic_array.transpose([1, 2, 0]).astype(np.uint8))
    cachePath = os.path.join(config.cache_dir,
                             'denoised_{}'.format(fileName))
    picDenoised.save(cachePath)
    utils.logMaker('INFO', 'DENOISED FILE SAVED IN CACHE', [cachePath])
    return cachePath
def train(new=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Load existing checkpoints when requested and available, otherwise start fresh
    Generator = torch.load(
        config.network_idg_path,
        map_location=device.type) if new and os.path.exists(
            config.network_idg_path) else Generator_ID().to(device)
    Discriminator = torch.load(
        config.network_idd_path,
        map_location=device.type) if new and os.path.exists(
            config.network_idd_path) else Discriminator_ID().to(device)
    optimizerGen = opt.Adam(Generator.parameters())
    optimizerDis = opt.Adam(Discriminator.parameters())
    spl = Sampling_ID(config.train_dir)
    dataset = DataLoader(dataset=spl,
                         batch_size=config.batch_size_idgan,
                         shuffle=True,
                         num_workers=4)
    epoch = 0
    while True:
        Generator.train()
        Discriminator.train()
        for no, (noi, org) in enumerate(dataset):
            noi, org = noi.to(device), org.to(device)

            ############################ Discriminator ############################
            fake = Generator(noi)
            # Detach the fake batch so the discriminator update does not free the
            # generator's graph before the generator step below
            fakePrediction = Discriminator(fake.detach())
            realPrediction = Discriminator(org)
            lossDis = -torch.mean(
                torch.log(realPrediction) + torch.log(1. - fakePrediction))
            optimizerDis.zero_grad()
            lossDis.backward()
            optimizerDis.step()

            ############################## Generator ###############################
            prediction = Discriminator(fake)
            lossGen = (config.alphaADV * -torch.mean(torch.log(prediction)) +
                       config.alphaPIX * utils.pixelLoss(org, fake) +
                       config.alphaFEA * utils.featureLoss(
                           org, fake, device, config.num_vggLayer_idgan) +
                       config.alphaSMO * utils.smoothLoss(fake))
            optimizerGen.zero_grad()
            lossGen.backward()
            optimizerGen.step()
            print('{}_{}_{}_{}'.format(epoch, no, lossDis, lossGen))
        # Save both networks after every epoch
        torch.save(Discriminator, config.network_idd_path)
        torch.save(Generator, config.network_idg_path)
        epoch += 1
def coalesce_matches(contracts, jwsim, suffix):
    '''
    Pulls in the addresses from IRS records previously deemed to match the IL
    agencies. Returns a dataframe.
    '''
    jwsim = trim_jwsim(jwsim, suffix)
    # Define the key on which to coalesce
    keys = ['CSDS_Contract_ID']
    # Fill in missing values in contracts from matches in jwsim, matching on
    # keys
    df = u.merge_coalesce(contracts, jwsim, keys, suffix)
    return df
def train(new=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Load existing checkpoints when requested and available, otherwise start fresh
    Generator = torch.load(
        config.network_srg_path,
        map_location=device.type) if new and os.path.exists(
            config.network_srg_path) else Generator_ESR().to(device)
    Discriminator = torch.load(
        config.network_srd_path,
        map_location=device.type) if new and os.path.exists(
            config.network_srd_path) else Discriminator_ESR().to(device)
    optimizerGen = opt.Adam(Generator.parameters())
    optimizerDis = opt.Adam(Discriminator.parameters())
    spl = Sampling_ESR(config.train_dir)
    dataset = DataLoader(dataset=spl,
                         batch_size=config.batch_size_srgan,
                         shuffle=True,
                         num_workers=4)
    epoch = 0
    while True:
        Generator.train()
        Discriminator.train()
        for no, (lrPic, hrPic) in enumerate(dataset):
            lrPic, hrPic = lrPic.to(device), hrPic.to(device)

            ############################ Discriminator ############################
            fakeHR = Generator(lrPic)
            # Detach the fake batch so the discriminator update does not free the
            # generator's graph before the generator step below
            fakeHR_Prediction_Dis = Discriminator(fakeHR.detach()).mean()
            realHR_Prediction_Dis = Discriminator(hrPic).mean()
            real_RelativisticLoss = 1 - (realHR_Prediction_Dis -
                                         fakeHR_Prediction_Dis)
            fake_RelativisticLoss_Dis = (fakeHR_Prediction_Dis -
                                         realHR_Prediction_Dis)
            lossDiscriminator = real_RelativisticLoss + fake_RelativisticLoss_Dis
            optimizerDis.zero_grad()
            lossDiscriminator.backward()
            optimizerDis.step()

            ############################## Generator ###############################
            # Reduce the predictions to scalars so the combined loss can be
            # backpropagated directly
            fakeHR_Prediction_Gen = Discriminator(fakeHR).mean()
            realHR_Prediction_Gen = Discriminator(hrPic).mean()
            fake_RelativisticLoss_Gen = 1 - (fakeHR_Prediction_Gen -
                                             realHR_Prediction_Gen)
            lossGenerator = (
                config.alphaADV_ESR * fake_RelativisticLoss_Gen +
                config.alphaPIX_ESR * utils.pixelLoss(hrPic, fakeHR) +
                config.alphaFEA_ESR * utils.featureLoss(
                    hrPic, fakeHR, device, config.num_vggLayer_srgan) +
                config.alphaSMO_ESR * utils.smoothLoss_ESR(fakeHR))
            optimizerGen.zero_grad()
            lossGenerator.backward()
            optimizerGen.step()
            print('{}_{}_{}_{}'.format(epoch, no, lossDiscriminator,
                                       lossGenerator))
        # Save both networks after every epoch
        torch.save(Discriminator, config.network_srd_path)
        torch.save(Generator, config.network_srg_path)
        epoch += 1
def test():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = torch.load(config.network_idg_path, map_location=device.type)
    spl = Sampling_ID(config.test_dir)
    dataset = DataLoader(dataset=spl,
                         batch_size=config.batch_size_srgan,
                         shuffle=True)
    ssim_sum = 0.
    num = 0
    # Accumulate SSIM over the test set and report the average
    for lr, hr in dataset:
        lr, hr = lr.to(device), hr.to(device)
        fake_hr = net(lr)
        ssim_sum += utils.qualityRank(fake_hr, hr)
        num += lr.size()[0]
    print('SSIM FOR IDGAN : {}'.format(ssim_sum / float(num)))
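# A minimal launcher sketch for the training and evaluation loops above. This is an
# assumption; the original project may wire these up differently. Note that train()
# runs indefinitely and checkpoints every epoch, so evaluation is a separate run.
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == 'test':
        test()           # Report the average SSIM of the saved generator
    else:
        train(new=True)  # new=True resumes from an existing checkpoint if one exists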
def import_wchi(fname):
    '''
    Reads in the West Chi dataset. Splits the address field into its component
    parts. Converts strings to uppercase. Returns a dataframe.
    '''
    # Read in the WESTCHI file
    df = read_wc(fname)
    # Split addresses into their component parts
    split = split_addr(df)
    # Convert strings to uppercase
    df_upper = u.upper(split)
    return df_upper
def import_mc(fname, sheetname):
    '''
    Reads in one MapsCorps dataset. Replaces str(np.NaN) with the empty
    string. Converts string values to uppercase. Drops duplicates. Returns a
    dataframe.
    '''
    # Extracts the year from the sheetname
    year = get_year(sheetname)
    # Uses a different function to read in the file based on the year
    if year == 2009:
        df = read_2009(fname, sheetname)
    elif year == 2016:
        df = read_2016(fname, sheetname)
    # Replaces the string 'nan' (str(np.NaN)) with the empty string and
    # converts strings to uppercase
    df = df.replace('nan', '')
    df_upper = u.upper(df)
    return df_upper.drop_duplicates().reset_index(drop=True)
def linker():
    '''
    Reads in the linker file (to link HQ agencies to service agencies). Merges
    a copy of itself on cluster ID, then eliminates records that match on
    vendor ID (to produce only matches that have different vendor IDs).
    Returns a dataframe.
    '''
    # Read in the link dataframe
    link = read_linker()
    # Make two new dataframes by copying the link dataframe and renaming
    # columns
    link1 = link.rename(columns={'VendorName': 'VendorName_LINK1'}, index=str)
    link2 = u.rename_cols(link, ['VendorName', 'CSDS_Vendor_ID'], '_LINK2')
    # Merge the two link dataframes together
    df = link1.merge(link2, how='left')
    # Drop self-matches and reset the index
    df = df[df['CSDS_Vendor_ID'] != df['CSDS_Vendor_ID_LINK2']].reset_index(
        drop=True)
    return df
GPIO.setup(buttonLEDPin, GPIO.OUT, initial=GPIO.LOW)
GPIO.output(buttonLEDPin, GPIO.LOW)

n = 0
while True:  # Run forever
    # Watch for a button press to toggle the clock
    pressed = GPIO.input(buttonPin)
    if pressed == GPIO.HIGH:
        buttonToggleState = not buttonToggleState
        sleep(.1)
    if buttonToggleState:
        GPIO.output(buttonLEDPin, GPIO.HIGH)
    else:
        GPIO.output(buttonLEDPin, GPIO.LOW)

    # Count up to 255
    if buttonToggleState:
        binaryNumbers = UTILS.int2bin(n)
        values = UTILS.getBinaryOnArray(binaryNumbers)
        for index, ledOn in enumerate(values):
            pin = pins[index]
            if ledOn:
                GPIO.output(pin, GPIO.HIGH)  # Turn on
            else:
                GPIO.output(pin, GPIO.LOW)  # Turn off
        n += 1
        if n >= 256:
            n = 0

    # Count by the second
    sleep(1)

GPIO.cleanup()
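# Hypothetical versions of the two UTILS helpers the loop above relies on; the real
# implementations are not shown, so the signatures and bit ordering here are assumed.
def int2bin(n, width=8):
    # Format the integer as a fixed-width binary string, e.g. 5 -> '00000101'
    return format(n, '0{}b'.format(width))


def getBinaryOnArray(binary_string):
    # Convert each character of the binary string into a True/False LED flag
    return [bit == '1' for bit in binary_string]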
import CONFIG
import RTC
import AZURE
from azure.devops.v5_0.work_item_tracking.models import JsonPatchOperation
from azure.devops.v5_1.work_item_tracking.models import Comment
from azure.devops.v5_1.work_item_tracking.models import CommentCreate
from datetime import datetime
import json
import os
import UTILS
import glob
import mmap

# Recreate the defect folder and its items subfolder
FOLDER = CONFIG.DEFECT_FOLDER
UTILS.remove(FOLDER)
os.mkdir(FOLDER)
os.mkdir(os.path.join(FOLDER, 'items'))

# Clients
validate_only = CONFIG.validate_only
bypass_rules = CONFIG.bypass_rules
suppress_notifications = CONFIG.suppress_notifications
rtcclient = RTC.rtcclient
queryclient = rtcclient.query
core_client = AZURE.core_client
wit_client = AZURE.wit_client
wit_5_1_client = AZURE.wit_5_1_client

# Project
def fix_duplicate_addresses(df, key='ClusterID', target='Address_SVC'):
    '''
    Takes in a dataframe. Attempts to fix duplicate addresses (by default, in
    the 'Address_SVC' field) if they have the same key (by default, the
    'ClusterID' field). Returns a dataframe.
    '''
    print('\nFixing duplicate addresses')
    # Sort the target field by length, longest to shortest
    sorter = df[target].str.len().sort_values(ascending=False).index
    df = df.reindex(sorter)
    # Make a mini version of the dataframe with two fields, the key & the
    # target (which has been renamed to indicate it's the original field)
    minimized_df = df[[key, target]].drop_duplicates().dropna()
    minimized_df[target + '_Original'] = minimized_df[target]
    # Make a list of the unique values in the key field
    unique_keys = list(minimized_df[key].unique())
    # Set a flag to FALSE
    new_df_exists = False
    # OVERVIEW: Call the iter_df() function on subsets of the dataframe (one
    # subset per key) to compare and fix the address strings assigned to that
    # key.
    # For each value in the list of unique keys:
    #   Make a mini dataframe that is just the rows corresponding to that key
    #   If there is more than 1 row:
    #     Call iter_df() on the mini df & assign the result to local_df2
    #     If the new_df_exists flag is set to TRUE:
    #       Create new_df by concatenating the existing new_df and local_df2
    #     Else:
    #       Assign the name new_df to local_df2 and set new_df_exists to TRUE
    for uKey in unique_keys:
        local_df = minimized_df[minimized_df[key] == uKey]
        if len(local_df) > 1:
            local_df2 = iter_df(
                local_df.copy().drop_duplicates().reset_index(drop=True),
                target)
            if new_df_exists:
                new_df = pd.concat([new_df, local_df2])
            else:
                new_df = local_df2
                new_df_exists = True
    print('Coalescing fixed addresses')
    # Rename the columns in both dataframes in preparation for calling
    # merge_coalesce()
    new_cols = {target: target + '_COAL', target + '_Original': target}
    new_df = new_df.rename(columns=new_cols, index=str)
    min_cols = {target + '_Original': target + '_COAL'}
    minimized_df = minimized_df.rename(columns=min_cols, index=str)
    # Coalesce with the dfs in this order so that we keep the new values
    merged = u.merge_coalesce(new_df, minimized_df, [key, target], how='right')
    # Merge the new address strings in, drop the original field, and rename
    # the new one
    df = df.merge(merged, how='left').drop(target, axis=1)
    df = df.rename(columns={target + '_COAL': target}, index=str)
    return df