Example #1
def get_kmer_from_json(train,
                       test,
                       database=constants.DEFAULT_DB,
                       recount=False,
                       k=7,
                       L=13,
                       kwargs=None,
                       validate=True,
                       complete_count=True):
    """
    Gets kmer data for the genomes specified in the json files. Divides genomes
    into train/test sets and classifies them with utils.parse_json.

    Args:
        train (str):        The filepath to the json file containing the
                            training genome information.
        test (str):         The filepath to the json file containing the testing
                            genome information.
        database (str):     lmdb database to store kmer counts.
        recount (bool):     If True the kmers are recounted.
        k (int):            Size of kmer to be counted. Ignored if recount is
                            False.
        L (int):            kmer cutoff value. Ignored if recount is False.
        kwargs (dict):      The arguments to pass to parse_json.
        validate (bool):    If True and the kmers are being recounted, a status
                            bar displaying the kmer count progress is shown.
        complete_count (bool): If True complete_kmer_counter is used to count
                            the kmers, otherwise kmer_counter is used.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    if complete_count:
        counter = complete_kmer_counter
    else:
        counter = kmer_counter
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_json(train, test, **kwargs)

    test_files = [str(x) for x in x_test]

    if recount:
        counter.count_kmers(x_train + x_test,
                            database,
                            k=k,
                            limit=L,
                            force=True)

    x_train = counter.get_counts(x_train, database)
    x_test = counter.get_counts(x_test, database)

    feature_names = counter.get_kmer_names(database)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
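
A minimal usage sketch, assuming hypothetical train.json and test.json files
whose layout matches whatever utils.parse_json expects:

# Hypothetical usage; the file paths and json schema are assumptions.
(data, feature_names, test_files, le) = get_kmer_from_json(
    'train.json',
    'test.json',
    recount=True,  # count 7-mers, applying the cutoff L=13
    k=7,
    L=13)
x_train, y_train, x_test, y_test = data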
Example #2
def get_roary_from_list(kwargs=None,
                        roary_sheet=constants.ROARY,
                        gene_header='Gene',
                        valid_header='Valid',
                        valid_features_table=constants.ROARY_VALID):
    """
    Gets the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by removing
    features that are not labeled as valid in valid_features_table.

    Args:
        kwargs (dict):              The arguments to pass to parse_metadata.
        roary_sheet (str):          File containing Roary data.
        gene_header (str):          Header for the column that contains the
                                    gene names.
        valid_header (str):         Header for the column that contains T/F
                                    values determining if a gene is valid.
        valid_features_table (str): csv table containing a list of valid and
                                    invalid genes.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet)
    valid_features = pd.read_csv(valid_features_table)
    # Keep only gene names flagged as valid (assumes a boolean valid column).
    features = list(
        valid_features.loc[valid_features[valid_header], gene_header])
    roary_data = roary_data[roary_data[gene_header].isin(features)]

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    if list(y_test):
        y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, features, test_files, le)
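
A usage sketch; the kwargs passed through to utils.parse_metadata are
assumptions, since the real keys depend on that helper:

# Hypothetical usage; the 'metadata' key is an assumed parse_metadata argument.
(data, features, test_files, le) = get_roary_from_list(
    kwargs={'metadata': 'metadata.csv'},
    gene_header='Gene',
    valid_header='Valid')
x_train, y_train, x_test, y_test = data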
Example #3
def get_omnilog_data(metadata_kwargs=None,
                     omnilog_sheet=constants.OMNILOG_DATA,
                     validate=True):
    """
    Gets the omnilog data contained in omnilog_sheet for the genomes specified
    by metadata_kwargs. Uses utils.parse_metadata.

    Args:
        metadata_kwargs (dict): The arguments to pass to parse_metadata.
        omnilog_sheet (str): File containing omnilog data.
        validate (bool):     If True y_test is created, if False y_test is an
                             empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder

    """
    metadata_kwargs = metadata_kwargs or {}
    metadata_kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]

    omnilog_data = pd.read_csv(omnilog_sheet, index_col=0)
    valid_cols = [x_train.index(x) for x in x_train if x in list(omnilog_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(omnilog_data)]
    x_test = [x_test[x] for x in valid_cols]
    if validate:
        y_test = [y_test[x] for x in valid_cols]

    feature_names = omnilog_data.index

    x_train = omnilog_data[x_train].T.values
    x_test = omnilog_data[x_test].T.values

    imputer = Imputer()
    x_train = imputer.fit_transform(x_train)
    x_test = imputer.transform(x_test)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return output_data, feature_names, test_files, le
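
A sketch of the validate flag, useful when the test genomes are unlabeled; the
metadata_kwargs key below is an assumption:

# With validate=False, parse_metadata leaves y_test empty.
(data, features, test_files, le) = get_omnilog_data(
    metadata_kwargs={'metadata': 'metadata.csv'},  # hypothetical key
    validate=False)
x_train, y_train, x_test, y_test = data  # y_test is an empty ndarray here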
Example #4
def get_genome_regions(kwargs=None,
                       table=constants.GENOME_REGION_TABLE,
                       sep=None,
                       validate=True):
    """
    Gets genome region presence absence data from a binary table output by
    Panseq for the genomes specified by kwargs. Uses utils.parse_metadata.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        table (str):        binary_table.txt output from Panseq.
        sep (str or None):  The separator used in table.
        validate (bool):    If True y_test is created, if False y_test is
                            an empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)

    x_train = []
    x_test = []
    if sep is None:
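        # With sep=None, the python engine sniffs the delimiter automatically.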
        data = pd.read_csv(table, sep=sep, engine='python', index_col=0)
    else:
        data = pd.read_csv(table, sep=sep, index_col=0)

    for header in train_label:
        x_train.append(data[header].tolist())

    for header in test_label:
        x_test.append(data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
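
A toy sketch of the table orientation this function assumes: regions are rows,
genomes are columns, and each genome's column becomes one row of x_train. The
values below are made up for illustration:

import pandas as pd

# Toy binary table in the Panseq orientation (regions x genomes).
toy = pd.DataFrame({'genomeA': [1, 0, 1], 'genomeB': [0, 0, 1]},
                   index=['region1', 'region2', 'region3'])
print(toy['genomeA'].tolist())  # [1, 0, 1] -> one row of x_train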
Example #5
def get_roary_data(kwargs=None, roary_sheet=constants.ROARY, validate=True):
    """
    Get the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        roary_sheet (str):  File containing Roary data.
        validate (bool):    If True y_test is created, if False y_test is an
                            empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    feature_names = roary_data.index

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    if validate:
        y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
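
A usage sketch; assuming encode_labels returns a fitted sklearn LabelEncoder
(as the docstrings suggest), le can map the integer labels back to the
original class names:

(data, features, test_files, le) = get_roary_data()
x_train, y_train, x_test, y_test = data
print(le.inverse_transform(y_test[:3]))  # recover the original class names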
Example #6
def get_kmer_from_directory(train_dir,
                            test_dir,
                            database=constants.DEFAULT_DB,
                            recount=False,
                            k=7,
                            L=13,
                            validate=True,
                            complete_count=True):
    """
    Organizes fasta files into train/test splits and classifies them based
    on their location in a directory structure rather than a metadata sheet.
    Returns kmer count data.

    With the directory structure below, train_dir set to Data/Train/, and
    test_dir set to Data/Test/, x_train would contain genomes 1-5, with genomes
    1-3 classified as Class1 and genomes 4 and 5 classified as Class2 (or
    whatever their respective directories are named), and x_test would contain
    genomes 6-9. If validate is True, genomes 6 and 7 would be classified as
    Class1 and genomes 8 and 9 as Class2. If validate is False, genomes 6-9 are
    not classified and y_test is empty.

    Data/
      |--Train/
      |     |--Class1/
      |     |   |--genome1.fasta
      |     |   |--genome2.fasta
      |     |   |--genome3.fasta
      |     |
      |     |--Class2/
      |         |--genome4.fasta
      |         |--genome5.fasta
      |
      |--Test/
            |--Class1/
            |   |--genome6.fasta
            |   |--genome7.fasta
            |
            |--Class2/
                |--genome8.fasta
                |--genome9.fasta

    Args:
        train_dir (str):    Filepath to directory containing subdirectories for
                            each possible classification that contain fasta
                            files for training.
        test_dir (str):     Filepath to directory containing subdirectories for
                            each possible classification that contain fasta
                            files for testing.
        database (str):     lmdb database to store kmer counts.
        recount (bool):     If True the kmers are recounted.
        k (int):            Size of kmer to be counted. Ignored if recount is
                            False.
        L (int):            kmer cutoff value. Ignored if recount is False.
        validate (bool):    If True y_test is created, if False y_test is an
                            empty ndarray.
        complete_count (bool): If True complete_kmer_counter is used to count
                            the kmers, otherwise kmer_counter is used.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    if complete_count:
        counter = complete_kmer_counter
    else:
        counter = kmer_counter

    train_directories = [train_dir + x for x in os.listdir(train_dir)]
    test_directories = [test_dir + x for x in os.listdir(test_dir)]

    train_files = []
    train_classes = []
    for d in train_directories:
        files = setup_files(d)
        train_files.append(files)
        train_classes.append(d.replace(train_dir, ''))

    test_files = []
    test_classes = []
    for d in test_directories:
        files = setup_files(d)
        test_files.append(files)
        test_classes.append(d.replace(test_dir, ''))

    if recount:
        all_files = train_files + test_files
        all_files = [x for l in all_files for x in l]
        counter.count_kmers(all_files, database, k=k, limit=L, force=True)

    train_counts = []
    for group in train_files:
        temp = counter.get_counts(group, database)
        temp = np.asarray(temp, dtype='float64')
        train_counts.append(temp)

    test_counts = []
    for group in test_files:
        temp = counter.get_counts(group, database)
        temp = np.asarray(temp, dtype='float64')
        test_counts.append(temp)

    test_files = [x for l in test_files for x in l]

    x_train, y_train = shuffle(train_counts, train_classes)
    x_test, y_test = shuffle(test_counts, test_classes)

    if not validate:
        y_test = np.array([], dtype='float64')

    feature_names = counter.get_kmer_names(database)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
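
A usage sketch for the Data/ layout shown above. The trailing slashes on
train_dir and test_dir matter, because the per-class subdirectory names are
appended by plain string concatenation:

(data, features, test_files, le) = get_kmer_from_directory(
    'Data/Train/',
    'Data/Test/',
    recount=True,
    k=7,
    L=13)
x_train, y_train, x_test, y_test = data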
Example #7
def get_genome_prefiltered(input_table=constants.GENOME_REGION_TABLE,
                           filter_table=constants.PREDICTIVE_RESULTS,
                           sep=None,
                           count=50,
                           kwargs=None):
    """
    Gets genome region presence absence data from input_table for the genomes
    specified by kwargs. Does initial feature selection by using only the
    features in the top count rows of filter_table. Uses utils.parse_metadata.

    Args:
        input_table (str):  A binary_table.txt output by Panseq.
        filter_table (str): A table containing all the same rows as
                            input_table, but different columns.
        sep (str or None):  The delimiter used in input_table and filter_table
        count (int):        How many of the top rows to keep.
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)

    if sep is None:
        input_data = pd.read_csv(input_table,
                                 sep=sep,
                                 engine='python',
                                 index_col=0)
        validation_data = pd.read_csv(filter_table,
                                      sep=sep,
                                      engine='python',
                                      index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        validation_data = pd.read_csv(filter_table, sep=sep, index_col=0)

    validation_data = validation_data.head(count)
    input_data = input_data.loc[validation_data.index]

    x_train = []
    x_test = []

    for header in train_label:
        x_train.append(input_data[header].tolist())

    for header in test_label:
        x_test.append(input_data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(input_data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
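
A usage sketch, keeping only the top 100 rows of a hypothetical results table:

# The filter_table path is an assumption; constants.PREDICTIVE_RESULTS is the
# default.
(data, features, test_label, le) = get_genome_prefiltered(
    filter_table='predictive_results.csv',
    count=100)
x_train, y_train, x_test, y_test = data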
Example #8
def get_genome_custom_filtered(input_table=constants.GENOME_REGION_TABLE,
                               filter_table=constants.PREDICTIVE_RESULTS,
                               sep=None,
                               col='Ratio',
                               cutoff=0.25,
                               absolute=True,
                               greater=True,
                               kwargs=None):
    """
    Gets genome region presence absence data from input_table, but performs
    initial feature selection using the values in col in filter_table. Uses
    utils.parse_metadata

    Args:
        input_table (str):  A binary_table output by panseq
        filter_table (str): A csv table to filter input_table by.
        sep (str or None):  The delimiter used in both tables.
        col (str):          Column name for the decision column in filter_table
        cutoff (float):     The value that entries in col are compared to.
        absolute (bool):    If True the absolute value of entries in col is
                            used.
        greater (bool):     If True, values in col must be greater than cutoff
                            to be kept; if False, they must be less.
        kwargs (dict):      Arguments to be passed to parse_metadata.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}

    (train_label, y_train, test_label, y_test) = parse_metadata(**kwargs)

    if sep is None:
        input_data = pd.read_csv(input_table,
                                 sep=sep,
                                 engine='python',
                                 index_col=0)
        filter_data = pd.read_csv(filter_table,
                                  sep=sep,
                                  engine='python',
                                  index_col=0)
    else:
        input_data = pd.read_csv(input_table, sep=sep, index_col=0)
        filter_data = pd.read_csv(filter_table, sep=sep, index_col=0)

    if absolute and greater:
        data = input_data.loc[filter_data.loc[
            abs(filter_data[col]) > cutoff].index]
    elif absolute and not greater:
        data = input_data.loc[filter_data.loc[
            abs(filter_data[col]) < cutoff].index]
    elif not absolute and greater:
        data = input_data.loc[filter_data.loc[filter_data[col] > cutoff].index]
    elif not absolute and not greater:
        data = input_data.loc[filter_data.loc[filter_data[col] < cutoff].index]

    x_train = []
    x_test = []

    for header in train_label:
        x_train.append(data[header].tolist())

    for header in test_label:
        x_test.append(data[header].tolist())

    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    feature_names = np.asarray(data.index)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_label, le)
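
A toy illustration of the filter logic with made-up Ratio values; with
absolute=True and greater=True, rows whose |Ratio| exceeds the cutoff are kept
regardless of sign:

import pandas as pd

filter_data = pd.DataFrame({'Ratio': [0.5, -0.5, 0.1]},
                           index=['r1', 'r2', 'r3'])
kept = filter_data.loc[abs(filter_data['Ratio']) > 0.25].index
print(list(kept))  # ['r1', 'r2']: both signs pass the absolute cutoff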
Example #9
def get_filtered_roary_data(kwargs=None,
                            roary_sheet=constants.ROARY,
                            limit=10,
                            validate=True):
    """
    Gets the Roary data from roary_sheet for the genomes specified by kwargs,
    uses utils.parse_metadata. Does initial feature selection by removing
    features whose difference in proportion between classes is less than limit,
    based on the feature selection done by Lupolova et al.

    Args:
        kwargs (dict):      The arguments to pass to parse_metadata.
        roary_sheet (str):  File containing Roary data.
        limit (int):        Features whose average between-class difference in
                            presence proportion (percentage points) is below
                            this value are removed.
        validate (bool):    If True y_test is created, if False y_test is an
                            empty ndarray.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    kwargs = kwargs or {}
    kwargs['validate'] = validate

    (x_train, y_train, x_test, y_test) = parse_metadata(**kwargs)

    test_files = [str(x) for x in x_test]

    roary_data = pd.read_csv(roary_sheet, index_col=0)

    class_labels = np.unique(y_train)
    classes = []
    for c in class_labels:
        class_members = [x for x in x_train if y_train[x_train.index(x)] == c]
        classes.append(roary_data[class_members].mean(axis=1) * 100)

    proportions = pd.concat(classes, axis=1)
    diffs = np.diff(proportions.values, axis=1)
    diffs = np.absolute(diffs.mean(axis=1))
    idx = list(proportions.index)
    col = ['Diff']
    avg_diff = pd.DataFrame(diffs, index=idx, columns=col)
    invalid = list(avg_diff[avg_diff['Diff'] < limit].index)
    roary_data = roary_data.drop(invalid)

    feature_names = roary_data.index

    valid_cols = [x_train.index(x) for x in x_train if x in list(roary_data)]
    x_train = [x_train[x] for x in valid_cols]
    y_train = [y_train[x] for x in valid_cols]

    valid_cols = [x_test.index(x) for x in x_test if x in list(roary_data)]
    x_test = [x_test[x] for x in valid_cols]
    if validate:
        y_test = [y_test[x] for x in valid_cols]

    x_train = roary_data[x_train].T.values
    x_test = roary_data[x_test].T.values

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
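
A toy illustration of the proportion-difference filter with made-up numbers: a
gene present in 90% of class A but only 20% of class B differs by 70
percentage points and survives limit=10, while a 50% vs 45% gene is dropped:

import numpy as np
import pandas as pd

proportions = pd.DataFrame({'A': [90.0, 50.0], 'B': [20.0, 45.0]},
                           index=['geneX', 'geneY'])
diffs = np.absolute(np.diff(proportions.values, axis=1).mean(axis=1))
print(diffs)  # [70.  5.] -> geneY falls below limit=10 and is removed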
Example #10
def get_kmer(metadata_kwargs=None,
             kmer_kwargs=None,
             recount=False,
             database=constants.DEFAULT_DB,
             validate=True,
             complete_count=True):
    """
    Gets kmer data for the genomes specified in metadata_kwargs; uses
    kmer_counter (or complete_kmer_counter) and utils.parse_metadata.

    Args:
        metadata_kwargs (dict): The arguments to pass to parse_metadata.
        kmer_kwargs (dict):     The arguments to pass to the kmer counter,
                                e.g. k and limit; may also contain name and
                                output_db.
        recount (bool):         If True the kmers are recounted.
        database (str):         lmdb database to store kmer counts.
        validate (bool):        If True y_test is created, if False y_test is
                                an empty ndarray.
        complete_count (bool):  If True complete_kmer_counter is used to count
                                the kmers, otherwise kmer_counter is used.

    Returns:
        tuple:  (x_train, y_train, x_test, y_test), feature_names, file_names,
                LabelEncoder
    """
    if complete_count:
        counter = complete_kmer_counter
    else:
        counter = kmer_counter

    metadata_kwargs = metadata_kwargs or {}
    metadata_kwargs['validate'] = validate
    kmer_kwargs = kmer_kwargs or {}

    name = kmer_kwargs.get('name', constants.DEFAULT_NAME)
    output_db = kmer_kwargs.get('output_db', database)

    (x_train, y_train, x_test, y_test) = parse_metadata(**metadata_kwargs)

    test_files = [str(x) for x in x_test]
    all_files = x_train + x_test

    if recount:
        counter.count_kmers(all_files, database, **kmer_kwargs, force=True)
    else:
        try:
            # Probe the database; get_counts raises if the counts are missing.
            counter.get_counts(x_train, output_db, name)
        except KmerCounterError:
            msg = 'Warning: get_counts failed, attempting a recount'
            logging.exception(msg)
            counter.count_kmers(all_files, database, **kmer_kwargs)

    x_train = counter.get_counts(x_train, output_db, name)
    x_test = counter.get_counts(x_test, output_db, name)

    feature_names = counter.get_kmer_names(output_db, name)

    y_train, y_test, le = encode_labels(y_train, y_test)

    output_data = (x_train, y_train, x_test, y_test)

    return (output_data, feature_names, test_files, le)
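
A usage sketch; the kmer_kwargs keys k and limit mirror the count_kmers calls
in the earlier examples, while name and output_db are read explicitly above.
The metadata_kwargs key is an assumption:

(data, features, test_files, le) = get_kmer(
    metadata_kwargs={'metadata': 'metadata.csv'},  # hypothetical key
    kmer_kwargs={'k': 7, 'limit': 13},
    recount=True)
x_train, y_train, x_test, y_test = data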