import os
import shutil

import numpy as np
import pandas as pd


def copy_attributes(mapfile, newdir, filenames, key_lstrip=None, key_rstrip=None):
    """Copy each attribute file into a fresh directory, named <id>.txt per the map file."""
    # start from an empty output directory
    if os.path.exists(newdir):
        shutil.rmtree(newdir)
    os.makedirs(newdir)

    metadata = pd.read_csv(mapfile, sep='\t')
    filenames = pd.DataFrame(filenames, columns=['filename'])
    filenames['dataset_key'] = filenames['filename'].apply(
        lambda x: strip_key(x, key_lstrip, key_rstrip))

    # every metadata row should match exactly one file, and vice versa
    joined = pd.merge(metadata, filenames, on='dataset_key', how='inner')
    assert len(joined) == len(metadata) == len(filenames)

    for _, row in joined.iterrows():
        dataset_id = row['id']
        filename = row['filename']
        link_name = os.path.join(newdir, '%s.txt' % dataset_id)
        # copy rather than symlink so the output directory is self-contained
        # os.symlink(filename, link_name)
        shutil.copy(filename, link_name)
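
# Illustrative sketch (not part of the original script): the merge-and-assert pattern
# used in copy_attributes, shown on tiny inline frames. Column names mirror what the
# script expects; the values and the stand-in for strip_key are invented for illustration.
def _demo_merge_check():
    metadata = pd.DataFrame({'dataset_key': ['a', 'b'], 'id': [1, 2]})
    filenames = pd.DataFrame({'filename': ['data/a.cfg', 'data/b.cfg']})
    # stand-in for strip_key (defined elsewhere): take the base name without extension
    filenames['dataset_key'] = filenames['filename'].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0])
    joined = pd.merge(metadata, filenames, on='dataset_key', how='inner')
    # an inner join can only shrink, so equal lengths mean the keys matched one-to-one
    assert len(joined) == len(metadata) == len(filenames)
    return joined
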
def main(edit_file, metadata_in, metadata_out, key_rstrip=None):
    # load the table of edits
    edits = pd.read_csv(edit_file, sep='\t', names=['filename', 'var', 'value'],
                        encoding='UTF8')
    # load metadata
    metadata = pd.read_csv(metadata_in, sep='\t', encoding='UTF8', dtype=str)

    # metadata has the vars as columns. if any of the vars we are editing don't already
    # exist, add them as new columns initialized to missing
    edit_vars = pd.Series(edits['var'].unique())
    new_vars = edit_vars[~edit_vars.isin(metadata.columns)]
    for new_var in new_vars:
        metadata[new_var] = np.nan

    # the cfg file name should match the dataset key in the metadata file
    edits['dataset_key'] = edits['filename'].apply(lambda x: strip_key(x, None, key_rstrip))

    # log all edit files we don't have in the metadata table
    missing_files = edits[~edits['dataset_key'].isin(metadata['dataset_key'])]['dataset_key']
    if len(missing_files) > 0:
        print("metadata edits for these missing files will be skipped:",
              ' '.join(list(missing_files)))

    # apply remaining edits. could probably vectorize this into some kind
    # of join, but it's not much data, and simpler to think about in a loop,
    # at least for me :)
    edits = edits[edits['dataset_key'].isin(metadata['dataset_key'])]
    for _, row in edits.iterrows():
        dataset_key, var, value = row['dataset_key'], row['var'], row['value']
        metadata.loc[metadata['dataset_key'] == dataset_key, var] = value

    # write the new edited metadata file
    metadata.to_csv(metadata_out, sep='\t', header=True, index=True, encoding='UTF8')
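
# Illustrative sketch (not part of the original script): the column-creation and
# .loc-based edit pattern used in main, on an inline frame. The variable names and
# values below are invented for illustration only.
def _demo_apply_edits():
    metadata = pd.DataFrame({'dataset_key': ['a', 'b'], 'depth': ['10', '20']})
    edits = pd.DataFrame({'dataset_key': ['a'], 'var': ['site'], 'value': ['river']})
    # add any edited variables that are not yet columns, initialized to missing
    for new_var in edits['var'].unique():
        if new_var not in metadata.columns:
            metadata[new_var] = np.nan
    # apply each edit to the matching dataset_key row(s)
    for _, row in edits.iterrows():
        metadata.loc[metadata['dataset_key'] == row['dataset_key'], row['var']] = row['value']
    return metadata  # 'a' now has site == 'river'; 'b' keeps NaN for site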