def enumerate_matches(self):
    '''
    List every row id with every other row id in its cluster.

    Reads the clustered CSV (via self.get_file), groups row ids by
    cluster id, then writes one CSV row per ordered pair of distinct
    ids sharing a cluster to a new S3 file named
    'enum_' + self.file_config['filename'].

    Returns None; the enumerated pairs are written to S3 as a side
    effect.
    '''
    header = [self.id_col, self.id_col + '_match']
    # Build {cluster_id: [row ids]} from the source file.
    cluster_dict = {}
    with self.get_file() as f:
        reader = csv.DictReader(f)
        for row in reader:
            cluster_dict.setdefault(row[self.cluster_col], []).append(
                row[self.id_col])
    # Emit every ordered pair (itertools.permutations) within each cluster.
    with StringIO() as outp:
        writer = csv.DictWriter(outp, fieldnames=header)
        writer.writeheader()
        for ids in cluster_dict.values():
            for left, right in permutations(ids, 2):
                writer.writerow({header[0]: left, header[1]: right})
        # Copy the config: the original assigned the dict by reference,
        # so setting 'filename' mutated self.file_config in place and
        # permanently renamed the source file on this instance.
        new_config = dict(self.file_config)
        new_config['filename'] = 'enum_' + self.file_config['filename']
        s3.S3File(new_config).write_file(outp.getvalue())
def post_file(self, s3bucket, file_config, box_folder_id):
    '''
    Upload a file stored in S3 to a Box folder.

    Parameters:
        s3bucket: bucket handle passed to storage.S3File.
        file_config: dict with at least a 'filename' key.
        box_folder_id: id of the destination Box folder.

    Returns a dict with 'status' (200) and 'message' keys on success.
    '''
    file = storage.S3File(s3bucket, file_config)
    file_bytes = BytesIO(file.get_file_bytes())
    # Stream the actual S3 content to Box under its original filename.
    # The previous version left a debug print behind and uploaded a
    # hard-coded local test file instead of the fetched bytes.
    self.client.folder(box_folder_id).upload_stream(
        file_bytes, file_config['filename'])
    return {'status': 200, 'message': 'Success'}
def post_csvs(self):
    '''
    Move converted files to S3.

    Uploads every CSV in self.local_path whose name carries the
    transform prefix to the configured S3 location.
    '''
    for filename in os.listdir(self.local_path):
        # Skip anything that is not a transformed CSV.
        if not (filename.startswith(self.transform_prefix)
                and filename.endswith('.csv')):
            continue
        self.input_config['filename'] = filename
        target = s3.S3File(self.input_config)
        src_path = os.path.join(self.local_path, filename)
        with open(src_path, 'r') as csvfile:
            target.write_file(csvfile.read())
def write_to_disk(self):
    '''
    Download every staged file from S3 into self.local_path.

    Ensures the local directory exists, then writes each staged
    file's raw bytes to disk under its original name.
    '''
    # Create the target directory once, up front; exist_ok also avoids
    # the check-then-create race the original per-iteration test had.
    os.makedirs(self.local_path, exist_ok=True)
    for file in self.input_folder.list_stg_files():
        self.input_config['filename'] = file
        f = s3.S3File(self.input_config)
        tgt_path = os.path.join(self.local_path, file)
        # 'wb' is sufficient: we only write, never read back.
        with open(tgt_path, 'wb') as outfile:
            outfile.write(f.get_file_bytes().read())
def __init__(self, data_source, settings_file, id_col, outp_file, recall_weight=.5):
    '''
    Wire up the input data source, output target, and dedupe settings.

    recall_weight defaults to .5 and is stored as-is for later use.
    '''
    self.data_source = DataSource(data_source, id_col)
    self.outp_file = DataSource(outp_file, id_col)
    self.settings_file = s3.S3File(settings_file)
    self.recall_weight = recall_weight
def get_distinct_from_file(self, file_config):
    '''
    Read in a user-generated set of near misses.

    Expected CSV format:
        FieldA1, FieldB1 ... FieldA2, FieldB2
        Name1,   DOB1    ... Name2,   DOB2

    Each match field's '1' and '2' columns are preprocessed via the
    training data's _preprocess_col and paired with self._make_tuple.

    Returns a list of (record1, record2) tuples.
    '''
    distinct_list = []
    distinct_file = s3.S3File(file_config)
    with distinct_file.get_file() as f:
        reader = csv.DictReader(f)
        for row in reader:
            record1 = {}
            record2 = {}
            # Renamed loop variable: the original reused `f`, shadowing
            # the open file handle inside the `with` block.
            for match_field in self.training_data.match_fields:
                fieldname = match_field['field']
                record1[fieldname] = self.training_data._preprocess_col(
                    row[fieldname + '1'])
                record2[fieldname] = self.training_data._preprocess_col(
                    row[fieldname + '2'])
            distinct_list.append(self._make_tuple(record1, record2))
    return distinct_list
def __init__(self, training_data, settings_file, match_fields, id_col, cluster_col):
    '''
    Initialize with matched training data and a dedupe settings file on S3.
    '''
    self.training_data = MatchedData(
        training_data, id_col, cluster_col, match_fields)
    self.settings_file = s3.S3File(settings_file)