예제 #1
0
 def enumerate_matches(self):
     '''
     List every row id paired with every other row id in its cluster.

     Reads the clustered CSV via ``self.get_file()``, groups row ids by
     ``self.cluster_col``, writes every ordered pair within each cluster
     (both (A, B) and (B, A)) to an in-memory CSV, and uploads the result
     to S3 under ``'enum_' + <original filename>``.
     '''
     header = [self.id_col, self.id_col + '_match']
     # Build {cluster_id: [row ids]} from the source file.
     cluster_dict = {}
     with self.get_file() as f:
         reader = csv.DictReader(f)
         for row in reader:
             cluster_dict.setdefault(row[self.cluster_col], []).append(
                 row[self.id_col])
     # Emit every ordered pair within each cluster via itertools.permutations.
     with StringIO() as outp:
         writer = csv.DictWriter(outp, fieldnames=header)
         writer.writeheader()
         for members in cluster_dict.values():
             for left, right in permutations(members, 2):
                 writer.writerow({header[0]: left, header[1]: right})
         # Copy the config dict: assigning it directly would alias
         # self.file_config, and the next line would then overwrite the
         # caller's 'filename' entry as a side effect.
         new_config = dict(self.file_config)
         new_config['filename'] = 'enum_' + self.file_config['filename']
         s3.S3File(new_config).write_file(outp.getvalue())
예제 #2
0
 def post_file(self, s3bucket, file_config, box_folder_id):
     '''
     Copy a file from S3 into a Box folder.

     Downloads the file described by ``file_config`` from ``s3bucket``,
     then streams its bytes to Box folder ``box_folder_id`` under the
     original filename. Returns a {'status', 'message'} dict.
     '''
     file = storage.S3File(s3bucket, file_config)
     file_bytes = BytesIO(file.get_file_bytes())
     # Upload the S3 contents under the source filename; the previous
     # version left a debug print and uploaded a hard-coded local test
     # file instead of the bytes fetched above.
     self.client.folder(box_folder_id).upload_stream(
         file_bytes, file_config['filename'])
     return {'status': 200, 'message': 'Success'}
예제 #3
0
 def post_csvs(self):
     '''
     Move converted files to S3.

     Scans ``self.local_path`` for CSVs whose names start with
     ``self.transform_prefix`` and writes each one to S3 using
     ``self.input_config`` (with 'filename' set per file).
     '''
     for name in os.listdir(self.local_path):
         # Skip anything that is not a transformed CSV.
         if not (name.startswith(self.transform_prefix)
                 and name.endswith('.csv')):
             continue
         self.input_config['filename'] = name
         local_file = os.path.join(self.local_path, name)
         with open(local_file, 'r') as src:
             contents = src.read()
         s3.S3File(self.input_config).write_file(contents)
예제 #4
0
    def write_to_disk(self):
        '''
        Download every staged file from S3 into ``self.local_path``.

        For each file listed by ``self.input_folder.list_stg_files()``,
        fetches its bytes via an S3File built from ``self.input_config``
        and writes them to ``self.local_path/<file>``.
        '''
        # Create the target directory once, before the loop; exist_ok
        # also removes the check-then-create race the per-iteration
        # os.path.exists() test had.
        os.makedirs(self.local_path, exist_ok=True)
        for file in self.input_folder.list_stg_files():
            self.input_config['filename'] = file
            f = s3.S3File(self.input_config)
            tgt_path = os.path.join(self.local_path, file)
            with open(tgt_path, 'wb+') as outfile:
                outfile.write(f.get_file_bytes().read())
예제 #5
0
 def __init__(self,
              data_source,
              settings_file,
              id_col,
              outp_file,
              recall_weight=.5):
     '''
     Set up deduplication inputs.

     Wraps the input and output locations as DataSource objects keyed on
     ``id_col``, wraps the settings file as an S3File, and stores the
     recall weight (default .5).
     '''
     self.data_source = DataSource(data_source, id_col)
     self.outp_file = DataSource(outp_file, id_col)
     self.settings_file = s3.S3File(settings_file)
     self.recall_weight = recall_weight
예제 #6
0
 def get_distinct_from_file(self, file_config):
     '''
     Read in a user generated set of near misses in format:
     FieldA1, FieldB1 ......FieldA2, FieldB2
     Name1, DOB1.....Name2, DOB2

     Each CSV row holds two records side by side ('<field>1' and
     '<field>2' columns). Both are preprocessed per match field and
     combined with ``self._make_tuple``. Returns the list of tuples.
     '''
     distinct_list = []
     distinct_file = s3.S3File(file_config)
     with distinct_file.get_file() as f:
         reader = csv.DictReader(f)
         for row in reader:
             record1 = {}
             record2 = {}
             # Renamed from 'f' so the loop variable no longer shadows
             # the open file handle bound by the 'with' above.
             for match_field in self.training_data.match_fields:
                 fieldname = match_field['field']
                 record1[fieldname] = self.training_data._preprocess_col(
                     row[fieldname + '1'])
                 record2[fieldname] = self.training_data._preprocess_col(
                     row[fieldname + '2'])
             distinct_list.append(self._make_tuple(record1, record2))
     return distinct_list
예제 #7
0
 def __init__(self, training_data, settings_file, match_fields, id_col,
              cluster_col):
     '''
     Set up training inputs.

     Wraps the labelled training data as a MatchedData object (keyed on
     ``id_col`` / ``cluster_col`` with the given match fields) and the
     settings file as an S3File.
     '''
     self.training_data = MatchedData(training_data, id_col, cluster_col,
                                      match_fields)
     self.settings_file = s3.S3File(settings_file)