Пример #1
0
def kmer_sample_db(data,
                   kim_file,
                   contains_sample_kmer_file,
                   lock,
                   truths=None):
    """Write ``<sample>\\t<mapped kmer>`` rows for every k-mer/sample pair.

    Each line of `data` is split on tabs: the first field is the k-mer and
    every following field is a comma-separated sample entry whose first
    element is emitted as the sample identifier. The k-mer is translated
    through the mapping unpickled from `kim_file`. All rows belonging to
    one input line are joined with newlines into a single chunk entry.

    Args:
        data: iterable of tab-separated text lines.
        kim_file: path handed to ``load_pickle`` to obtain the k-mer mapping.
        contains_sample_kmer_file: output path handed to the writers.
        lock: lock serialising writes to the output file; it is used as a
            context manager so it is released even if a write fails.
        truths: accepted for interface compatibility; not used here.
    """
    kim = load_pickle(kim_file)
    kmer_sample_chunk = []
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]

        kmer_sample_lines = []
        for sample_ in linelist[1:]:
            sample_id = sample_.split(',')[0]
            # Original author's note: add some perturbation of the sample
            # entry's second element to the end to get different truth
            # values for different num CNVs.
            kmer_sample_lines.append(f'{sample_id}\t{kim[kmer]}')
        kmer_sample_chunk.append('\n'.join(kmer_sample_lines))

        # write every 500k to limit memory usage
        if len(kmer_sample_chunk) >= 500000:
            # `with` guarantees the lock is released if write_list raises;
            # a bare acquire()/release() pair would leave it held.
            with lock:
                write_list(kmer_sample_chunk, contains_sample_kmer_file)
            kmer_sample_chunk = []
    write_files(lock, (kmer_sample_chunk, contains_sample_kmer_file))
Пример #2
0
def main():
    """Score k-mers into the postprocessed outputs, then split by pheno.

    Reads the project directory from ``get_params()``. If either the FASTA
    output (``scored_kmers.fsa``) or the text output (``scored_kmers.txt``)
    is missing, ``process_file`` is run with ``process`` over
    ``KMERPHENO.txt``; an output that already exists is passed as ``None``.
    Finally ``separate_phenos`` is called with the ``separate-phenos`` and
    ``no-consolidate`` parameters.
    """
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    input_file = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed',
                             'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')

    # an output that already exists is replaced by None before the call
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)

        # The keyword names must match the worker's parameters
        # (kim_file / scored_kmers_file). The previous call passed
        # `uim_file` and `scored_unitigs_file`, which are not defined in
        # this function.
        process_file(process,
                     input_file,
                     lock=lock,
                     pim=pim,
                     kim_file=kim_file,
                     fsa_file=fsa_file,
                     scored_kmers_file=scored_kmers_file)
    # NOTE(review): when scored_kmers.txt already existed, the variable is
    # None at this point, so separate_phenos receives None as its first
    # argument -- confirm that separate_phenos is meant to handle that.
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
Пример #3
0
def main():
    """Unpickle ``crossed_target.pkl`` and hand it to
    ``convert_sample_ids_to_pheno_values``."""
    # Steps left disabled by the original author:
    #truths = get_truth_kmer_seqs_that_occur_in_data()
    #usm = read_unitig_sample_map_into_dict()
    #crossed = cross_truth_seqs_with_usm(usm, truths)
    #dump_pickle(crossed, 'crossed_truth.pkl')
    convert_sample_ids_to_pheno_values(load_pickle('crossed_target.pkl'))
Пример #4
0
def process(data, lock, pim, kim_file, fsa_file, scored_kmers_file):
    """Translate integer-coded rows and write them as text and FASTA entries.

    Each line of `data` is split on whitespace. Field 0 (as an int) is
    looked up in the mapping unpickled from `kim_file`, field 1 (as an int)
    is looked up in `pim`, and field 2 is carried through unchanged.

    Two lists are then passed to ``write_files`` under `lock`:
    tab-joined triples paired with `scored_kmers_file`, and
    ``>index\\nkmer`` entries paired with `fsa_file`, where the index is the
    row's position within this call's `data`.
    """
    kim = load_pickle(kim_file)

    split_rows = (raw.split() for raw in data)
    records = [
        (kim[int(fields[0])], pim[int(fields[1])], fields[2])
        for fields in split_rows
    ]

    fasta_entries = []
    for index, record in enumerate(records):
        fasta_entries.append(f'>{index}\n{record[0]}')
    scored_rows = ['\t'.join(record) for record in records]

    write_files(lock, (scored_rows, scored_kmers_file),
                (fasta_entries, fsa_file))
Пример #5
0
def kmer_pheno_db(data,
                  kim_file,
                  value_kmer_pheno_file,
                  truth_kmer_pheno_file,
                  baseline_kmer_pheno_file,
                  lock,
                  truths=None,
                  baseline=None):
    """Write k-mer/pheno rows, plus optional truth and baseline rows.

    Each line of `data` is split on tabs: the first field is the k-mer and
    every following field is a pheno. For each pair a
    ``<mapped kmer>\\t<pheno>`` row is produced, where the k-mer is mapped
    through the object unpickled from `kim_file`.

    Args:
        data: iterable of tab-separated text lines.
        kim_file: path handed to ``load_pickle`` to obtain the k-mer mapping.
        value_kmer_pheno_file: output paired with every produced row.
        truth_kmer_pheno_file: output paired with rows for which
            ``kmer_in_truths(kmer, truths, pheno)`` is truthy.
        baseline_kmer_pheno_file: output paired with baseline rows; a row
            is written as-is when ``kmer_in_truths(kmer, baseline, pheno)``
            is ``True`` and with the score appended as a third column when
            the score lies strictly between 0 and 1.
        lock: lock forwarded to ``write_files``.
        truths: optional truth data; when falsy no truth rows are collected
            and ``None`` is passed as the truth chunk.
        baseline: optional baseline data; when falsy no baseline rows are
            collected and ``None`` is passed as the baseline chunk.
    """
    kim = load_pickle(kim_file)
    kmer_pheno_chunk = []
    truths_chunk = [] if truths else None
    baseline_chunk = [] if baseline else None
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        for pheno in linelist[1:]:
            row = f'{kim[kmer]}\t{pheno}'
            kmer_pheno_chunk.append(row)
            if truths and kmer_in_truths(kmer, truths, pheno):
                truths_chunk.append(row)
            if baseline:
                score = kmer_in_truths(kmer, baseline, pheno)
                if score is True:
                    baseline_chunk.append(row)
                elif score is not False and 0.0 < score < 1.0:
                    baseline_chunk.append(f'{row}\t{score}')
        # flush periodically to limit memory usage
        if len(kmer_pheno_chunk) >= 500000:
            write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                        (truths_chunk, truth_kmer_pheno_file),
                        (baseline_chunk, baseline_kmer_pheno_file))
            kmer_pheno_chunk = []
            if truths_chunk is not None:
                truths_chunk = []
            # The baseline chunk must be cleared as well; otherwise rows
            # that were already flushed are written again by every later
            # flush and by the final write below.
            if baseline_chunk is not None:
                baseline_chunk = []
    write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                (truths_chunk, truth_kmer_pheno_file),
                (baseline_chunk, baseline_kmer_pheno_file))
Пример #6
0
def main():
    """Build the k-mer/sample and k-mer/pheno maps and sample similarities.

    Steps, each skipped when its outputs already exist:
      1. create the sample and pheno int maps;
      2. run ``create_kmer_sample_map`` over the unique k-mers file via
         ``process_file`` and sum the per-chunk results taken from the
         queue;
      3. call ``similar_sample`` to produce the similar/dissimilar sample
         files, either from the freshly summed matrix or, when only the
         similarities TSV exists, with ``None`` in place of the matrix and
         k-mer count;
      4. create the k-mer int map.
    """
    params = get_params()
    project = params['project']

    def prep_path(name):
        return join(project, 'data', 'preprocessed', name)

    def raw_path(name):
        return join(project, 'data', 'raw', name)

    # file paths
    unique_kmers_file = prep_path('unique_kmers.txt')
    phenos_file = raw_path(params['pheno'])
    samples_file = raw_path(params['sample'])
    similarities_tsv = prep_path('sample_similarities.tsv')
    hist_orig_file = prep_path('hist_orig.png')
    hist_sim_scaled_file = prep_path('hist_sim_scaled.png')
    hist_dissim_scaled_file = prep_path('hist_dissim_scaled.png')
    similar_sample_file = prep_path('similarSample_obs.txt')
    dissimilar_sample_file = prep_path('dissimilarSample_obs.txt')
    kmer_sample_file = prep_path('kmer_sample_map.txt')
    kmer_pheno_file = prep_path('kmer_pheno_map.txt')
    sim_file = prep_path('sample_int_map.pkl')
    pim_file = prep_path('pheno_int_map.pkl')
    uim_file = prep_path('kmer_int_map.pkl')

    # trailing positional arguments shared by both similar_sample calls
    similarity_outputs = (similarities_tsv, hist_orig_file,
                          hist_sim_scaled_file, hist_dissim_scaled_file,
                          similar_sample_file, dissimilar_sample_file)

    def sample_files_missing():
        # True when at least one of the two sample similarity files is absent
        return not (file_exists(similar_sample_file)
                    and file_exists(dissimilar_sample_file))

    # create the int maps if needed, then load the sample int map
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)

    # only process the k-mers when some output cannot be obtained otherwise
    needs_processing = (
        not file_exists(kmer_sample_file)
        or not file_exists(kmer_pheno_file)
        or (sample_files_missing() and not file_exists(similarities_tsv))
    )
    if needs_processing:
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # all input sequences, read into a python object
        seqs = parse_input(samples_file)
        # the sample count is taken as half the size of the sample int map
        n_samples = int(len(sim) / 2)
        # frequency bounds used to filter k-mers
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # queue carrying per-chunk results back, and a lock for file writes
        manager = Manager()
        results = manager.Queue()
        lock = manager.Lock()
        # outputs that already exist are passed to the workers as None; the
        # original kmer_sample_file path is kept for the int map step below
        kmer_sample_out = (None if file_exists(kmer_sample_file)
                           else kmer_sample_file)
        kmer_pheno_out = (None if file_exists(kmer_pheno_file)
                          else kmer_pheno_file)

        process_file(create_kmer_sample_map, unique_kmers_file,
                     q=results, lock=lock,
                     raw=seqs, k=params['k'],
                     thresh=params['correlation-thresh'],
                     upper=upper, lower=lower,
                     dfdisp=dfdisp, dfnodisp=dfnodisp,
                     sim=sim, n=n_samples,
                     kmer_sample_file=kmer_sample_out,
                     kmer_pheno_file=kmer_pheno_out)

        # sum the per-chunk k-mer counts and sample matrices
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        while not results.empty():
            chunk_kmers, chunk_matrix = results.get()
            num_kmers += chunk_kmers
            sample_matrix += chunk_matrix

        # build the sample similarity files from the summed matrix
        if sample_files_missing():
            similar_sample(sample_matrix, num_kmers, *similarity_outputs)

    # sample files still missing but the similarities tsv is available
    if sample_files_missing() and file_exists(similarities_tsv):
        similar_sample(None, None, *similarity_outputs)

    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
Пример #7
0
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from utility import load_pickle, dump_pickle
import random

# Module-level paths and lookup tables shared by the functions below.
base = 'ecoli/data/preprocessed/'
# get_truth_kmer_seqs_that_occur_in_data() opens `truths_file`; with this
# assignment commented out, calling that function raised NameError.
truths_file = base + 'truth_unitig_pheno.txt'
unitig_int_map = load_pickle(base + 'unitig_int_map.pkl')
unitig_sample_map_file = base + 'unitig_sample_map.txt'
phenos_df = pd.read_csv('ecoli/data/raw/phenos.tsv', sep='\t', index_col=0)
sim = load_pickle(base + 'sample_int_map.pkl')
pim = load_pickle(base + 'pheno_int_map.pkl')


def get_truth_kmer_seqs_that_occur_in_data():
    """Return ``(unitig_int_map[int(id)], pheno)`` pairs from the truths file.

    Reads the module-level ``truths_file``, skips lines that consist only
    of a newline, and splits the rest on tabs. Every remaining line must
    have exactly three fields; the third is ignored.
    """
    with open(truths_file, 'r') as handle:
        records = [raw.split('\t') for raw in handle.readlines()
                   if raw != '\n']
    return [(unitig_int_map[int(unitig_id)], pheno)
            for unitig_id, pheno, _ in records]


def read_unitig_sample_map_into_dict():
    """Read ``unitig_sample_map_file`` into a dict keyed by first column.

    Every line is split on tabs: the first field becomes the key and the
    list of remaining fields becomes the value. Lines are not stripped, so
    the last field of each line keeps its trailing newline.

    NOTE(review): in the code visible here the dict is only bound to the
    local name ``lines`` and is not returned -- the function may be cut
    off; confirm against the full source.
    """
    with open(unitig_sample_map_file, 'r') as f:
        lines = f.readlines()
    # later lines with the same first field overwrite earlier ones
    lines = {l.split('\t')[0]: l.split('\t')[1:] for l in lines}
Пример #8
0
def main():
    """Create the preprocessed ``*_obs`` / ``*_target`` / ``*_truth`` files.

    Loads the sample and pheno int maps, optionally builds truth and
    baseline dictionaries (when the ``truth`` / ``baseline`` parameters are
    set), and then creates every output file that does not exist yet:
    the sample/pheno and similar-pheno files directly, and the
    contains, k-mer/pheno, truth and baseline files through
    ``process_file``. Outputs that already exist are passed to
    ``kmer_pheno_db`` as ``None``.
    """
    params = get_params()
    project = params['project']

    def prep_path(name):
        return join(project, 'data', 'preprocessed', name)

    def raw_path(name):
        return join(project, 'data', 'raw', name)

    # data paths
    sim_file = prep_path('sample_int_map.pkl')
    pim_file = prep_path('pheno_int_map.pkl')
    kim_file = prep_path('kmer_int_map.pkl')
    kmer_sample_map_file = prep_path('kmer_sample_map.txt')
    kmer_pheno_map_file = prep_path('kmer_pheno_map.txt')
    phenos_file = raw_path(params['pheno'])
    contains_sample_kmer_file = prep_path('contains_obs.txt')
    value_sample_pheno_file = prep_path('samplePheno_obs.txt')
    value_kmer_pheno_file = prep_path('kmerPheno_target.txt')
    similar_pheno_pheno_file = prep_path('similarPheno_obs.txt')

    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)

    # optional truth data
    truths_dict = None
    truth_kmer_pheno_file = None
    truth_name = params.get('truth')
    if truth_name:
        truths_dict = create_truths_dict(raw_path(truth_name), pim)
        truth_kmer_pheno_file = prep_path('kmerPheno_truth.txt')

    # optional baseline data
    baseline_dict = None
    baseline_kmer_pheno_file = None
    baseline_name = params.get('baseline')
    if baseline_name:
        baseline_dict = create_truths_dict(raw_path(baseline_name), pim)
        baseline_kmer_pheno_file = prep_path('baseline_obs.txt')

    # smaller psl input files that can be efficiently done w 1 thread
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)

    # an optional output that was not requested counts as already present
    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = (file_exists(truth_kmer_pheno_file)
                     if truth_name else True)
    baseline_exists = (file_exists(baseline_kmer_pheno_file)
                       if baseline_name else True)

    lock = Manager().Lock()

    if not contains_exists:
        process_file(kmer_sample_db,
                     kmer_sample_map_file,
                     kim_file=kim_file,
                     lock=lock,
                     truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)

    # nothing left to do when every k-mer/pheno output is already present
    if value_exists and truths_exists and baseline_exists:
        return

    process_file(
        kmer_pheno_db,
        kmer_pheno_map_file,
        kim_file=kim_file,
        value_kmer_pheno_file=(None if value_exists
                               else value_kmer_pheno_file),
        truth_kmer_pheno_file=(None if truths_exists
                               else truth_kmer_pheno_file),
        lock=lock,
        truths=truths_dict,
        baseline=baseline_dict,
        baseline_kmer_pheno_file=(None if baseline_exists
                                  else baseline_kmer_pheno_file))