def test_load_md5_from_file(self):
    '''test load_md5_from_file'''
    expected = '43247f482b82e38a190c4d3243f97ea8'
    prefix = os.path.join(data_dir, 'load_md5_from_file.')
    # Both fixture flavours (mac- and linux-style md5 files) should
    # parse to the same checksum string.
    for good_suffix in ('good_mac', 'good_linux'):
        self.assertEqual(expected, utils.load_md5_from_file(prefix + good_suffix))
    # Malformed fixtures of either flavour should raise the module's
    # own Error type.
    for bad_suffix in ('bad_mac', 'bad_linux'):
        with self.assertRaises(utils.Error):
            utils.load_md5_from_file(prefix + bad_suffix)
def test_load_md5_from_file(self):
    """test load_md5_from_file"""
    expected = "43247f482b82e38a190c4d3243f97ea8"
    prefix = os.path.join(data_dir, "load_md5_from_file.")
    self.assertEqual(expected, utils.load_md5_from_file(prefix + "good_mac"))
    self.assertEqual(expected, utils.load_md5_from_file(prefix + "good_linux"))
    # Assert the module's own error type, not bare Exception: expecting
    # Exception would let the test pass on unrelated failures, e.g. a
    # FileNotFoundError from a missing fixture file.
    with self.assertRaises(utils.Error):
        utils.load_md5_from_file(prefix + "bad_mac")
    with self.assertRaises(utils.Error):
        utils.load_md5_from_file(prefix + "bad_linux")
def _validate_data(cls, database, data, dropbox_dir):
    """Sanity-check spreadsheet rows before import.

    Input should be data, made by
    spreadsheet_helper.load_data_from_spreadsheet(). Sanity checks that
    it is ok, and returns a list of error messages. If the list has
    length zero, then all is OK.

    Side effect: when a row has no md5 in the spreadsheet but a sidecar
    "<reads_file>.md5" file exists, the md5 is copied into the row dict.
    """
    errors = []
    # Occurrence counters used to detect duplicates within the spreadsheet.
    all_filenames = {}   # reads filename -> number of times seen
    all_replicates = {}  # replicate key tuple -> number of times seen
    # The fields that together identify one sequencing replicate.
    replicate_keys = (
        "subject_id",
        "site_id",
        "lab_id",
        "isolate_number",
        "sequence_replicate_number",
    )
    for data_dict in data:
        # Exact type check: rejects strings and also datetime.datetime
        # (a subclass of datetime.date) — only a plain date passes.
        if type(data_dict["submission_date"]) is not datetime.date:
            errors.append(
                "Date format error: "
                + spreadsheet_helper.row_data_dict_to_string(data_dict))
        # Each row carries a pair of reads files: reads_file_1, reads_file_2.
        for i in [1, 2]:
            read_file_key = "reads_file_" + str(i)
            filename = data_dict[read_file_key]
            md5_key = read_file_key + "_md5"
            if not os.path.exists(os.path.join(dropbox_dir, filename)):
                errors.append("Reads file not found: " + filename)
            all_filenames[filename] = all_filenames.get(filename, 0) + 1
            # The md5 may come from a sidecar "<reads_file>.md5" file
            # and/or from the spreadsheet column (data_dict[md5_key]).
            md5_file = os.path.join(dropbox_dir, filename + ".md5")
            if os.path.exists(md5_file):
                md5sum_from_file = utils.load_md5_from_file(md5_file)
            else:
                md5sum_from_file = None
            if md5sum_from_file is None and data_dict[md5_key] is None:
                # Neither source supplies an md5.
                errors.append("No md5 for reads file " + filename)
            elif (md5sum_from_file is not None
                    and data_dict[md5_key] is not None
                    and md5sum_from_file != data_dict[md5_key]):
                # Both sources present but they disagree.
                errors.append("Mismatch in md5 info for reads file " + filename)
            elif data_dict[
                    md5_key] is None and md5sum_from_file is not None:
                # Only the sidecar file has it: copy into the row data.
                data_dict[md5_key] = md5sum_from_file
        replicate = tuple([data_dict[x] for x in replicate_keys])
        all_replicates[replicate] = all_replicates.get(replicate, 0) + 1
        # Ask the database whether this sample/replicate already exists.
        # (sample_id is not used by the checks below.)
        patient_site_lab_unique, replicates_exist, sample_id = database._get_sample_and_replicate_uniqueness(
            data_dict)
        if not patient_site_lab_unique:
            errors.append(
                "Subject(" + data_dict["subject_id"] + ") + site("
                + data_dict["site_id"] + ") + lab(" + data_dict["lab_id"]
                + ") found more than once in database. Something very wrong!"
            )
        if replicates_exist:
            errors.append("Replicate already found for "
                          + ",".join(replicate_keys) + ": "
                          + ",".join([data_dict[x] for x in replicate_keys]))
    # Duplicate checks within the spreadsheet itself.
    for filename, count in sorted(all_filenames.items()):
        if count > 1:
            errors.append("Reads file " + filename + " found " + str(count)
                          + " times")
    for replicate, count in sorted(all_replicates.items()):
        if count > 1:
            errors.append("Replicate " + ",".join(replicate_keys) + " "
                          + ",".join(replicate) + " found " + str(count)
                          + " times in spreadsheet")
    return errors
def _validate_data(cls, database, data, dropbox_dir): '''Input should be data, made by spreadsheet_helper.load_data_from_spreadsheet(). Sanity checks that it is ok, and returns a list of error messages. If the list has length zero, then all is OK.''' errors = [] all_filenames = {} all_replicates = {} replicate_keys = ('subject_id', 'site_id', 'lab_id', 'isolate_number', 'sequence_replicate_number') for data_dict in data: if type(data_dict['submission_date']) is not datetime.date: errors.append( 'Date format error: ' + spreadsheet_helper.row_data_dict_to_string(data_dict)) for i in [1, 2]: read_file_key = 'reads_file_' + str(i) filename = data_dict[read_file_key] md5_key = read_file_key + '_md5' if not os.path.exists(os.path.join(dropbox_dir, filename)): errors.append('Reads file not found: ' + filename) all_filenames[filename] = all_filenames.get(filename, 0) + 1 md5_file = os.path.join(dropbox_dir, filename + '.md5') if os.path.exists(md5_file): md5sum_from_file = utils.load_md5_from_file(md5_file) else: md5sum_from_file = None if md5sum_from_file is None and data_dict[md5_key] is None: errors.append('No md5 for reads file ' + filename) elif md5sum_from_file is not None and data_dict[ md5_key] is not None and md5sum_from_file != data_dict[ md5_key]: errors.append('Mismatch in md5 info for reads file ' + filename) elif data_dict[ md5_key] is None and md5sum_from_file is not None: data_dict[md5_key] = md5sum_from_file replicate = tuple([data_dict[x] for x in replicate_keys]) all_replicates[replicate] = all_replicates.get(replicate, 0) + 1 patient_site_lab_unique, replicates_exist, sample_id = database._get_sample_and_replicate_uniqueness( data_dict) if not patient_site_lab_unique: errors.append( 'Subject(' + data_dict['subject_id'] + ') + site(' + data_dict['site_id'] + ') + lab(' + data_dict['lab_id'] + ') found more than once in database. Something very wrong!' 
) if replicates_exist: errors.append('Replicate already found for ' + ','.join(replicate_keys) + ': ' + ','.join([data_dict[x] for x in replicate_keys])) for filename, count in sorted(all_filenames.items()): if count > 1: errors.append('Reads file ' + filename + ' found ' + str(count) + ' times') for replicate, count in sorted(all_replicates.items()): if count > 1: errors.append('Replicate ' + ','.join(replicate_keys) + ' ' + ','.join(replicate) + ' found ' + str(count) + ' times in spreadsheet') return errors