def test_default_x_test(self):
    """
    Check x_test from a default parse_metadata call.

    The expected names are the string forms of every odd integer below
    self.length.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
    expected = [str(i) for i in range(self.length) if i % 2 == 1]
    # Unique values are compared together with their counts, so ordering is
    # ignored while missing or duplicated entries are still detected.
    observed_summary = np.unique(x_test, return_counts=True)
    expected_summary = np.unique(expected, return_counts=True)
    matches = bool(np.array_equal(observed_summary, expected_summary))
    self.assertTrue(matches)
def test_ova_x_train(self):
    """
    Check x_train when parse_metadata is called with one_vs_all='a'.

    The expected names are the string forms of every even integer below
    self.length.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        one_vs_all='a')
    correct = [str(i) for i in range(self.length) if i % 2 == 0]
    # Unique values are compared together with their counts, so ordering is
    # ignored while missing or duplicated entries are still detected.
    observed_summary = np.unique(x_train, return_counts=True)
    expected_summary = np.unique(correct, return_counts=True)
    matches = bool(np.array_equal(observed_summary, expected_summary))
    failure_msg = '\n' + str(correct) + '\n' + str(x_train)
    self.assertTrue(matches, msg=failure_msg)
def test_default_y_train(self):
    """
    Check y_train from a default parse_metadata call.

    The expected labels are self.classifications cycled over the even
    integers below self.length.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
    expected = []
    for i in range(self.length):
        if i % 2 == 0:
            expected.append(self.classifications[i % self.classes])
    # Unique values are compared together with their counts, so ordering is
    # ignored while a wrong label distribution is still detected.
    observed_summary = np.unique(y_train, return_counts=True)
    expected_summary = np.unique(expected, return_counts=True)
    matches = bool(np.array_equal(observed_summary, expected_summary))
    self.assertTrue(matches)
def test_blacklist(self):
    """
    Check that no entry of a randomly drawn blacklist appears in x_train or
    x_test when parse_metadata is called with that blacklist.
    """
    sample_size = int(0.1 * self.length)
    bl = np.unique(np.random.randint(self.length, size=(sample_size)))
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        blacklist=bl)
    # NOTE(review): the blacklist holds integers; if x_train/x_test hold
    # string names these membership checks can never fail -- confirm the
    # element type parse_metadata expects for blacklist entries.
    excluded = sum(
        1 for elem in bl if elem not in x_train and elem not in x_test
    )
    self.assertEqual(excluded, len(bl))
def test_removed_x_test(self):
    """
    Check x_test when parse_metadata is called with remove='a'.

    The expected names are the string forms of every odd integer below
    self.length, excluding those that are a multiple of self.classes.
    Skipped when the fixture has two or fewer classes.
    """
    if self.classes <= 2:
        # skipTest raises unittest.SkipTest so the runner reports a skip.
        # Returning unittest.skip(...) only builds a decorator and lets the
        # test pass silently.
        self.skipTest('Too few classes')
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        remove='a')
    correct = [str(x) for x in range(self.length) if (x % 2) == 1]
    correct = [x for x in correct if int(x) % self.classes != 0]
    # Unique values are compared together with their counts, so ordering is
    # ignored while missing or duplicated entries are still detected.
    observed_summary = np.unique(x_test, return_counts=True)
    expected_summary = np.unique(correct, return_counts=True)
    self.assertTrue(bool(np.array_equal(observed_summary, expected_summary)))
def get_roary_from_list(kwargs=None, roary_sheet=constants.ROARY,
                        gene_header='Gene', valid_header='Valid',
                        valid_features_table=constants.ROARY_VALID):
    """
    Get the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by keeping only
    the rows of roary_sheet whose gene_header value appears in the
    valid_header column of valid_features_table.

    Args:
        kwargs (dict):              The arguments to pass to parse_metadata.
        roary_sheet (str):          File containing Roary data.
        gene_header (str):          Header for the column of roary_sheet that
                                    contains the gene names.
        valid_header (str):         Header for the column of
                                    valid_features_table whose values are
                                    matched against the gene names.
        valid_features_table (str): csv table read to obtain the
                                    valid_header column.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), features, file_names,
               LabelEncoder. features is the full valid_header column and
               file_names lists every test genome returned by
               parse_metadata, including any that are missing from
               roary_sheet.
    """
    kwargs = kwargs or {}

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet)
    valid_features = pd.read_csv(valid_features_table)
    # NOTE(review): the values of the valid_header column are used directly
    # as gene names here; confirm that column holds names rather than T/F
    # flags.
    features = list(valid_features[valid_header])
    roary_data = roary_data[roary_data[gene_header].isin(features)]

    # Build the column lookup once; positions come from enumerate so each
    # genome keeps its own label even if a name is repeated.
    available = set(roary_data.columns)

    train_keep = [i for i, name in enumerate(x_train) if name in available]
    x_train = [x_train[i] for i in train_keep]
    y_train = [y_train[i] for i in train_keep]

    test_keep = [i for i, name in enumerate(x_test) if name in available]
    x_test = [x_test[i] for i in test_keep]
    # y_test may be empty, in which case there is nothing to filter.
    if list(y_test):
        y_test = [y_test[i] for i in test_keep]

    # Genomes are columns in the sheet; transpose so each row is a genome.
    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, features, test_files, le)
def test_ova_y_test(self):
    """
    Check y_test when parse_metadata is called with one_vs_all='a'.

    The expected labels are self.classifications cycled over the odd
    integers below self.length, with every label other than 'a' replaced
    by 'Other'.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        one_vs_all='a')
    expected = []
    for i in range(self.length):
        if i % 2 == 1:
            label = self.classifications[i % self.classes]
            expected.append(label if label == 'a' else 'Other')
    # Unique values are compared together with their counts, so ordering is
    # ignored while a wrong label distribution is still detected.
    observed_summary = np.unique(y_test, return_counts=True)
    expected_summary = np.unique(expected, return_counts=True)
    matches = bool(np.array_equal(observed_summary, expected_summary))
    self.assertTrue(matches)
def get_omnilog_data(metadata_kwargs=None, omnilog_sheet=constants.OMNILOG_DATA,
                     validate=True):
    """
    Get the omnilog data contained in omnilog_sheet for the genomes specified
    by metadata_kwargs. Uses utils.parse_metadata. Missing values are filled
    by an Imputer fitted on the training data.

    Args:
        metadata_kwargs (dict): The arguments to pass to parse_metadata. The
                                dict is copied, so the caller's object is
                                not modified.
        omnilog_sheet (str):    File containing omnilog data.
        validate (bool):        If True y_test is created, if False y_test is
                                an empty ndarray.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    # Copy before adding 'validate' so the caller's dict is left untouched.
    metadata_kwargs = dict(metadata_kwargs or {})
    metadata_kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]

    omnilog_data = pd.read_csv(omnilog_sheet, index_col=0)

    # Build the column lookup once; positions come from enumerate so each
    # genome keeps its own label even if a name is repeated.
    available = set(omnilog_data.columns)

    train_keep = [i for i, name in enumerate(x_train) if name in available]
    x_train = [x_train[i] for i in train_keep]
    y_train = [y_train[i] for i in train_keep]

    test_keep = [i for i, name in enumerate(x_test) if name in available]
    x_test = [x_test[i] for i in test_keep]
    # Without validation y_test is empty, so there is nothing to filter.
    if validate:
        y_test = [y_test[i] for i in test_keep]

    feature_names = omnilog_data.index

    # Genomes are columns in the sheet; transpose so each row is a genome.
    x_train = omnilog_data[x_train].T.values
    x_test = omnilog_data[x_test].T.values

    # Fit the imputer on the training data only, then reuse it for the
    # test data.
    imputer = Imputer()
    x_train = imputer.fit_transform(x_train)
    x_test = imputer.transform(x_test)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return output_data, feature_names, test_files, le
def test_default_test(self):
    """
    Check that each name in x_test from a default parse_metadata call is
    paired with the expected label at the same position in y_test.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file)
    odd_indices = [i for i in range(self.length) if i % 2 == 1]
    correct_x = [str(i) for i in odd_indices]
    correct_y = [self.classifications[i % self.classes] for i in odd_indices]
    matched = 0
    seen = 0
    for position, name in enumerate(x_test):
        # Look the name up in the expected list to find its expected label.
        expected_label = correct_y[correct_x.index(name)]
        if y_test[position] == expected_label:
            matched += 1
        seen += 1
    self.assertEqual(matched, seen)
def get_genome_regions(kwargs=None, table=constants.GENOME_REGION_TABLE,
                       sep=None, validate=True):
    """
    Get genome region presence absence data from a binary table output by
    Panseq for the genomes specified by kwargs. Uses utils.parse_metadata

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata. The dict
                            is copied, so the caller's object is not
                            modified.
        table (str):        binary_table.txt output from panseq.
        sep (str or None):  The separator used in table. When None the
                            python parsing engine is used so pandas can
                            detect the separator.
        validate (bool):    If True y_test is created, if False y_test is
                            an empty ndarray.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    # Copy before adding 'validate' so the caller's dict is left untouched.
    kwargs = dict(kwargs or {})
    kwargs['validate'] = validate

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)

    if sep is None:
        data = pd.read_csv(table, sep=sep, engine='python', index_col=0)
    else:
        data = pd.read_csv(table, sep=sep, index_col=0)

    # One row per genome: each genome is a column of the table.
    x_train = np.asarray([data[header].tolist() for header in train_label])
    x_test = np.asarray([data[header].tolist() for header in test_label])

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
def test_ova_test(self):
    """
    Check that, with one_vs_all='a', each name in x_test is paired with the
    expected label at the same position in y_test. Expected labels other
    than 'a' are replaced by 'Other'.
    """
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        one_vs_all='a')
    odd_indices = [i for i in range(self.length) if i % 2 == 1]
    correct_x = [str(i) for i in odd_indices]
    raw_labels = [self.classifications[i % self.classes] for i in odd_indices]
    correct_y = [label if label == 'a' else 'Other' for label in raw_labels]
    matched = 0
    seen = 0
    for position, name in enumerate(x_test):
        # Look the name up in the expected list to find its expected label.
        expected_label = correct_y[correct_x.index(name)]
        if y_test[position] == expected_label:
            matched += 1
        seen += 1
    self.assertEqual(matched, seen)
def get_roary_data(kwargs=None, roary_sheet=constants.ROARY, validate=True):
    """
    Get the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata. The dict
                            is copied, so the caller's object is not
                            modified.
        roary_sheet (str):  File containing Roary data.
        validate (bool):    If True y_test is created, if False y_test is
                            an empty ndarray.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    # Copy before adding 'validate' so the caller's dict is left untouched.
    kwargs = dict(kwargs or {})
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    feature_names = roary_data.index

    # Build the column lookup once; positions come from enumerate so each
    # genome keeps its own label even if a name is repeated.
    available = set(roary_data.columns)

    train_keep = [i for i, name in enumerate(x_train) if name in available]
    x_train = [x_train[i] for i in train_keep]
    y_train = [y_train[i] for i in train_keep]

    test_keep = [i for i, name in enumerate(x_test) if name in available]
    x_test = [x_test[i] for i in test_keep]
    # Without validation y_test is empty, so indexing it by position would
    # fail; only filter it when labels were actually produced.
    if validate:
        y_test = [y_test[i] for i in test_keep]

    # Genomes are columns in the sheet; transpose so each row is a genome.
    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
def test_removed_test(self):
    """
    Check that, with remove='a', each name in x_test is paired with the
    expected label at the same position in y_test.

    Expected names exclude multiples of self.classes and expected labels
    exclude 'a'. Skipped when the fixture has two or fewer classes.
    """
    if self.classes <= 2:
        # skipTest raises unittest.SkipTest so the runner reports a skip.
        # Returning unittest.skip(...) only builds a decorator and lets the
        # test pass silently.
        self.skipTest('Too few classes')
    (x_train, y_train, x_test, y_test) = parse_metadata(metadata=self.file,
                                                        remove='a')
    correct_x = [str(x) for x in range(self.length) if (x % 2) == 1]
    correct_x = [x for x in correct_x if int(x) % self.classes != 0]
    correct_y = [
        self.classifications[x % self.classes]
        for x in range(self.length)
        if (x % 2) == 1
    ]
    correct_y = [x for x in correct_y if x != 'a']
    matched = 0
    seen = 0
    for position, name in enumerate(x_test):
        # Look the name up in the expected list to find its expected label.
        index = correct_x.index(name)
        if y_test[position] == correct_y[index]:
            matched += 1
        seen += 1
    self.assertEqual(matched, seen)
def get_filtered_roary_data(kwargs=None, roary_sheet=constants.ROARY, limit=10,
                            validate=True):
    """
    Get the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by removing
    features whose difference in proportion between classes is less than
    limit, based on the feature selection done by Lupolova et. al.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata. The dict
                            is copied, so the caller's object is not
                            modified.
        roary_sheet (str):  File containing Roary data.
        limit (int):        Value used to determine which features are
                            removed.
        validate (bool):    If True y_test is created, if False y_test is
                            an empty ndarray.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    # Copy before adding 'validate' so the caller's dict is left untouched.
    kwargs = dict(kwargs or {})
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    # Per-class mean of each feature over the training genomes, scaled by
    # 100.
    class_labels = np.unique(y_train)
    classes = []
    for c in class_labels:
        class_members = [
            name for name, label in zip(x_train, y_train) if label == c
        ]
        classes.append(roary_data[class_members].mean(axis=1) * 100)

    proportions = pd.concat(classes, axis=1)
    # Absolute value of the mean difference between adjacent class columns.
    diffs = np.diff(proportions.values, axis=1)
    diffs = np.absolute(diffs.mean(axis=1))
    idx = list(proportions.index)
    col = ['Diff']
    avg_diff = pd.DataFrame(diffs, index=idx, columns=col)
    invalid = list(avg_diff[avg_diff['Diff'] < limit].index)
    roary_data = roary_data.drop(invalid)

    feature_names = roary_data.index

    # Build the column lookup once; positions come from enumerate so each
    # genome keeps its own label even if a name is repeated.
    available = set(roary_data.columns)

    train_keep = [i for i, name in enumerate(x_train) if name in available]
    x_train = [x_train[i] for i in train_keep]
    y_train = [y_train[i] for i in train_keep]

    test_keep = [i for i, name in enumerate(x_test) if name in available]
    x_test = [x_test[i] for i in test_keep]
    # Without validation y_test is empty, so there is nothing to filter.
    if validate:
        y_test = [y_test[i] for i in test_keep]

    # Genomes are columns in the sheet; transpose so each row is a genome.
    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
def get_kmer(metadata_kwargs=None, kmer_kwargs=None, recount=False,
             database=constants.DEFAULT_DB, validate=True,
             complete_count=True):
    """
    Get kmer data for the genomes specified in metadata_kwargs, uses
    kmer_counter or complete_kmer_counter and utils.parse_metadata

    Args:
        metadata_kwargs (dict): The arguments to pass to parse_metadata. The
                                dict is copied, so the caller's object is
                                not modified.
        kmer_kwargs (dict):     Extra keyword arguments forwarded to the
                                counter's count_kmers. Its 'name' and
                                'output_db' entries, when present, are also
                                used to read the counts back; otherwise
                                constants.DEFAULT_NAME and database are
                                used.
        recount (bool):         If True the kmers are recounted with
                                force=True before being read.
        database (str):         Database passed to count_kmers.
        validate (bool):        If True y_test is created, if False y_test
                                is an empty ndarray.
        complete_count (bool):  If True complete_kmer_counter is used,
                                otherwise kmer_counter.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    counter = complete_kmer_counter if complete_count else kmer_counter

    # Copy before adding 'validate' so the caller's dict is left untouched.
    metadata_kwargs = dict(metadata_kwargs or {})
    metadata_kwargs['validate'] = validate

    kmer_kwargs = kmer_kwargs or {}
    name = kmer_kwargs.get('name', constants.DEFAULT_NAME)
    output_db = kmer_kwargs.get('output_db', database)

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]
    all_files = x_train + x_test

    if recount:
        # Merge force=True into a copy so a caller-supplied 'force' key
        # cannot trigger a duplicate keyword argument error.
        recount_kwargs = dict(kmer_kwargs, force=True)
        counter.count_kmers(all_files, database, **recount_kwargs)
    else:
        try:
            # Probe read: only used to find out whether counts already
            # exist.
            counter.get_counts(x_train, output_db, name)
        except KmerCounterError:
            msg = 'Warning: get_counts failed, attempting a recount'
            logging.exception(msg)
            counter.count_kmers(all_files, database, **kmer_kwargs)

    x_train = counter.get_counts(x_train, output_db, name)
    x_test = counter.get_counts(x_test, output_db, name)

    feature_names = counter.get_kmer_names(output_db, name)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
def get_genome_custom_filtered(input_table=constants.GENOME_REGION_TABLE,
                               filter_table=constants.PREDICTIVE_RESULTS,
                               sep=None, col='Ratio', cutoff=0.25,
                               absolute=True, greater=True, kwargs=None):
    """
    Get genome region presence absence data from input_table, keeping only
    the rows selected by comparing the values of col in filter_table against
    cutoff. Uses utils.parse_metadata

    Args:
        input_table (str):  A binary_table output by panseq
        filter_table (str): A csv table to filter input_table by.
        sep (str or None):  The delimiter used in both tables. When None the
                            python parsing engine is used.
        col (str):          Column name for the decision column in
                            filter_table
        cutoff (float):     What the values in col are compared to.
        absolute (bool):    If true the absolute value of the values in col
                            is used.
        greater (bool):     If true values in col must be greater than
                            cutoff, otherwise they must be less than cutoff.
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    kwargs = kwargs or {}

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)[:4]

    if sep is None:
        input_data = pd.read_csv(input_table, sep=sep, engine='python',
                                 index_col=0)
        filter_data = pd.read_csv(filter_table, sep=sep, engine='python',
                                  index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        filter_data = pd.read_csv(filter_table, sep=sep, index_col=0)

    # Choose the scores to compare, then the direction of the comparison.
    scores = abs(filter_data[col]) if absolute else filter_data[col]
    keep_mask = scores > cutoff if greater else scores < cutoff
    data = input_data.loc[filter_data.loc[keep_mask].index]

    # One row per genome: each genome is a column of the filtered table.
    x_train = np.asarray([data[header].tolist() for header in train_label])
    x_test = np.asarray([data[header].tolist() for header in test_label])

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
def get_genome_prefiltered(input_table=constants.GENOME_REGION_TABLE,
                           filter_table=constants.PREDICTIVE_RESULTS,
                           sep=None, count=50, kwargs=None):
    """
    Get genome region presence absence from input_table for the genomes
    specified by kwargs. Does initial feature selection by using only the
    features in the top count rows of filter_table. Uses utils.parse_metadata

    Args:
        input_table (str):  A binary_table output by panseq
        filter_table (str): A table containing all the same rows as
                            input_table, but different columns.
        sep (str or None):  The delimiter used in input_table and
                            filter_table. When None the python parsing
                            engine is used.
        count (int):        How many of the top rows to keep.
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, file_names,
               LabelEncoder
    """
    kwargs = kwargs or {}

    labels = parse_metadata(**kwargs)
    train_label = labels[0]
    y_train = labels[1]
    test_label = labels[2]
    y_test = labels[3]

    if sep is None:
        input_data = pd.read_csv(input_table, sep=sep, engine='python',
                                 index_col=0)
        validation_data = pd.read_csv(filter_table, sep=sep, engine='python',
                                      index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        # Read filter_table here as well; the previous code passed the
        # not-yet-assigned validation_data and failed for any explicit sep.
        validation_data = pd.read_csv(filter_table, sep=sep, index_col=0)

    # Keep only the features listed in the first count rows of filter_table.
    validation_data = validation_data.head(count)
    input_data = input_data.loc[validation_data.index]

    # One row per genome: each genome is a column of the filtered table.
    x_train = np.asarray(
        [input_data[header].tolist() for header in train_label]
    )
    x_test = np.asarray(
        [input_data[header].tolist() for header in test_label]
    )

    feature_names = np.asarray(input_data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
def setUp(self):
    """
    Store the result of parse_metadata called with no arguments and with an
    unpacked empty dict, for use by the tests.
    """
    no_kwargs = {}
    self.default = parse_metadata()
    self.empty = parse_metadata(**no_kwargs)