Example #1
from jug import TaskGenerator, bvalue
import gzip

# `fasta_iter` (FASTA parser) and `splitseq` (input splitter) are project
# helpers defined elsewhere; the snippet originally started mid-function, so
# the decorator and signature here are reconstructed from the call site at
# the end of the snippet.
@TaskGenerator
def dedup_fasta(infile):
    import os
    print("start dedup")
    fasta = {}
    # count identical sequences, keeping the ID of the first occurrence
    for ID, seq in fasta_iter(infile):
        if seq in fasta:
            fasta[seq][1] += 1
        else:
            fasta[seq] = [ID, 1]

    outfile1 = infile.replace('.faa.gz', '.raw_number.tsv.gz')
    outfile2 = infile.replace('.faa.gz', '.dedup.faa.gz')
    # compresslevel=1: fastest gzip setting, trading file size for speed
    out1 = gzip.open(outfile1, "wt", compresslevel=1)
    out2 = gzip.open(outfile2, "wt", compresslevel=1)
    print("start sort")
    for seq, (ID, count) in sorted(fasta.items()):
        out1.write(f"{count}\t{seq}\n")
        out2.write(f">{ID}\n{seq}\n")
    out1.close()
    out2.close()
    os.unlink(infile)  # remove the (now redundant) input split
    print("finish dedup and sort")
    return (outfile1, outfile2)


INPUT_FILE = "data/GMSC10.metag_smorfs.faa.gz"

# `splitseq` returns a Task; `bvalue` blocks until it has run, then returns
# the list of split files so the loop can be written at script level
splits = splitseq(INPUT_FILE)

for sp in bvalue(splits):
    dedup_fasta(sp)
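
For completeness, here is a minimal `fasta_iter` consistent with how it is used above. This is a hypothetical sketch; the real project defines its own parser:

import gzip

def fasta_iter(fname):
    # yield (header, sequence) pairs from a gzipped FASTA file
    header, chunks = None, []
    with gzip.open(fname, 'rt') as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                if header is not None:
                    yield header, ''.join(chunks)
                header, chunks = line[1:], []
            else:
                chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)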
Example #2
from os import path

import pandas as pd
from jug import bvalue


def create_ena_file_map(studies_tables, vol_map, MIRROR_BASEDIR):
    def annotate_link(p):
        def drop_hostname(addr):
            while not addr.startswith("vol1"):
                newaddr = addr.split('/', 1)
                if len(newaddr) == 1:
                    raise ValueError("Couldn't find mirror root")
                addr = newaddr[-1]
            return addr

        if pd.isnull(p):
            return (p, p)

        p = drop_hostname(p)
        if p.endswith('_1.fastq.gz'):
            return ("fastq_1", p)
        if p.endswith('_2.fastq.gz'):
            return ("fastq_2", p)
        if p.endswith('.fastq.gz'):
            return ("fastq_single", p)
        raise ValueError("Cannot annotate {}".format(p))

    with open(path.join(MIRROR_BASEDIR, vol_map), 'w') as out:
        out.write(
            "#study_accession\trun_accession\tsample_accession\texperiment_accession\tfastq_1\tfastq_2\tfastq_single\n"
        )
        for study in studies_tables:
            data = bvalue(
                studies_tables[study])  ## bvalue because this is a Tasklet
            if data is None:
                print("We got no file information for", study,
                      ". Incomplete/older jug internal state? Skipping", study)
                continue
            annotations = data["ftp"].map(annotate_link)
            data["filetype"], data["filepath"] = zip(*annotations)

            subdata = data[[
                "study_accession", "run_accession", "sample_accession",
                "experiment_accession"
            ]].drop_duplicates()
            files = data.pivot(values="filepath", columns="filetype")
            grouped = pd.merge(subdata,
                               files,
                               left_index=True,
                               right_index=True)

            def fastq_path(record, column):
                # a missing column (not every study has every file type) or
                # a None entry both map to the empty string; otherwise prefix
                # the mirror base directory
                value = getattr(record, column, None)
                if value is None:
                    return ''
                return path.join(MIRROR_BASEDIR, value)

            for _, record in grouped.iterrows():
                # 7 columns: study_accession, run_accession, sample_accession,
                # experiment_accession, fastq_1, fastq_2, fastq_single
                fastq_1 = fastq_path(record, 'fastq_1')
                fastq_2 = fastq_path(record, 'fastq_2')
                fastq_single = fastq_path(record, 'fastq_single')

                out.write(
                    f"{record.study_accession}\t{record.run_accession}\t{record.sample_accession}\t{record.experiment_accession}\t{fastq_1}\t{fastq_2}\t{fastq_single}\n"
                )
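
For intuition, the nested `annotate_link` maps a full FTP address to a (column, mirror-relative path) pair. With a hypothetical address it behaves like this:

# hypothetical input, for illustration only
annotate_link('ftp.sra.ebi.ac.uk/vol1/fastq/ERR123/ERR123456/ERR123456_1.fastq.gz')
# -> ('fastq_1', 'vol1/fastq/ERR123/ERR123456/ERR123456_1.fastq.gz')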
Example #3
# (snippet truncated: the lines below close a call begun above, inside a
# loop over `dataset_name`)
                 max_lengthscale=1000.0,
                 max_variance=1000.0))
    experiment_storage_path, _, common_run_settings, dataset_custom_settings = get_settings(
        dataset_name)
    baseline_exps[dataset_name] = FullbatchUciExperiment(
        **{
            **common_run_settings,
            **dataset_custom_settings,
            **baseline_custom_settings
        })
    # bvalue blocks here until the baseline task has run, then the 4-tuple
    # result is unpacked into the per-dataset dictionaries
    (
        all_model_parameters[dataset_name],
        full_rmses[dataset_name],
        full_nlpps[dataset_name],
        baseline_lmls[dataset_name],
    ) = jug.bvalue(run_baseline(baseline_exps[dataset_name]))


@jug.TaskGenerator
def run_sparse_init(exp):
    print(exp)
    exp.setup_model()
    exp.init_params()
    print_post_run(exp)
    elbo, upper, rmse, nlpp = compute_model_stats(exp)
    return elbo, upper, rmse, nlpp


# Sparse experiments
init_Z_runs = {}
init_Z_task_results = {}
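
The pattern above hinges on two jug primitives: calling a function decorated with `@jug.TaskGenerator` returns a `jug.Task` rather than a value, and `jug.bvalue` blocks until that task has been executed, then loads its result. A minimal sketch:

import jug

@jug.TaskGenerator
def add(a, b):
    return a + b

t = add(1, 2)          # a jug.Task, not 3
three = jug.bvalue(t)  # waits until the task has run, then loads 3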
Example #4
from jug import barrier, Task, bvalue
import math


def double(x):
    return 2 * x


two = Task(double, 1)  # defines the task; nothing is computed yet
two = bvalue(two)      # barrier + load: waits until the task has run, then
                       # replaces the Task object with its value (2)
four = 2 * two         # ordinary Python arithmetic from here on
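
As the comments note, `bvalue(two)` is shorthand for inserting a barrier and then loading the task's value. Assuming the same `double` task, the equivalent long-hand spelling is:

from jug import Task, barrier, value

two = Task(double, 1)
barrier()         # script execution stops here until all prior tasks have run
two = value(two)  # load the computed result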
Example #5
# (snippet truncated: the lines below close a call to the `cluster` helper
# shown in Example #7, inside a loop that fills `cluster_results` per protein)
        topology=cfg['topology'],
        sasa_sidechain_h5=sc_sasa_h5,
        cluster_distance=cfg['cluster_distance_metric'],
        cluster_radii=cfg['cluster_radii'],
        kmedoids_updates=10)

selected_cluster_results = {}
for protein, cluster_result_list in cluster_results.items():
    if "lag_time" not in CONFIGS[protein]:
        continue

    for radius, cluster_result in zip(cfg['cluster_radii'],
                                      cluster_result_list):
        if CONFIGS[protein]["model_cluster_radius"] == radius:
            selected_cluster_results[protein] = cluster_result
            lag_time = CONFIGS[protein]["lag_time"]

            if cluster_result.assignments.can_load():
                dirname, assigs_file = os.path.split(
                    jug.bvalue(cluster_result.assignments))

                dirname = os.path.join(os.path.split(dirname)[0], 'models')
                msm_filename = assigs_file.replace(
                    '-assignments.h5',
                    # `prior_count` is defined earlier in the (truncated) script
                    '-%02dprior-%slt-msm' % (prior_count, lag_time))

                assignments = cluster_result.assignments
                msm2file(os.path.join(dirname, msm_filename),
                         assignments,
                         lag_time=lag_time)
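
The `can_load()` guard above is the standard way to keep such a script usable before its upstream tasks have finished: a task's result can only be loaded (e.g. with `bvalue`) once some `jug execute` process has computed and stored it. A minimal sketch with a hypothetical task:

import jug

@jug.TaskGenerator
def compute():
    return 42

t = compute()
if t.can_load():  # True once a previous `jug execute` stored the result
    answer = jug.bvalue(t)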
Example #6
import subprocess
from jug import TaskGenerator, bvalue


# The snippet originally started mid-function; the decorator and signature
# here are reconstructed from the call `run_deeparg_hamronize(fa,
# run_deeparg(fa))` further down. `deeparg_software_version` and
# `deeparg_db_version` are module-level constants defined elsewhere.
@TaskGenerator
def run_deeparg_hamronize(deeparg_input, deeparg_output):
    oname = deeparg_output + '.hamronized'
    with open(oname, 'wb') as out:
        subprocess.check_call([
            'conda', 'run', '-n', 'hamronization',
            'hamronize', 'deeparg',
            '--input_file_name', deeparg_input,
            '--analysis_software_version', deeparg_software_version,
            '--reference_database_version', deeparg_db_version,
            deeparg_output + '.mapping.ARG'],
            stdout=out)
    return oname


# `split_seq_file`, `run_rgi`, `run_rgi_hamronize`, `run_abricate`,
# `run_abricate_hamronize`, `run_deeparg`, and `concat_partials` are
# TaskGenerators defined elsewhere in the project
splits_faa = split_seq_file('data/GMGC10.wastewater.95nr.test_10k.faa.gz')
partials = []
for faa in bvalue(splits_faa):
    # chaining: run_rgi(faa) returns a Task that run_rgi_hamronize depends on
    partials.append(run_rgi_hamronize(faa, run_rgi(faa)))
concat_partials(partials, 'outputs/rgi.full.tsv.gz')

splits_fna = split_seq_file('data/GMGC10.wastewater.95nr.test_10k.fna.gz')
for fa in bvalue(splits_fna):
    for db in ['resfinder', 'card', 'argannot', 'ncbi', 'megares']:
        run_abricate_hamronize(run_abricate(fa, db))
    run_deeparg_hamronize(fa, run_deeparg(fa))

@TaskGenerator
def run_hamronize_summarize(reports, combined):
    '''Combine outputs of all the tools'''
    # (function body truncated in the original snippet)
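
The chaining in `run_rgi_hamronize(faa, run_rgi(faa))` above is also worth noting: passing one task as an argument to another TaskGenerator makes jug record a dependency and substitute the computed value at execution time, with no `bvalue` needed. A minimal sketch:

from jug import TaskGenerator

@TaskGenerator
def step1(x):
    return x + 1

@TaskGenerator
def step2(y):
    return y * 10

result = step2(step1(3))  # step2 depends on step1; jug runs them in order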
Example #7
import jug

# `tasks` is a project-local module providing the clustering and MSM
# TaskGenerators used below
import tasks


def cluster(
        tag, trajectories, topology, sasa_sidechain_h5, cluster_radii,
        cluster_algorithm='khybrid', cluster_distance='euclidean',
        kmedoids_updates=5):

    import os
    from collections import namedtuple

    # bvalue: block until the sidechain-SASA feature file has been computed,
    # then load its filename
    sc_sasa_filename = jug.bvalue(sasa_sidechain_h5)

    def make_cluster_name(sc_sasa_filename, radius):
        # The original used str.rstrip('sasas-sidechains.h5'), which strips a
        # *set of characters* rather than a suffix, and referenced `radius`
        # from the enclosing loop; both are fixed here so the helper matches
        # the expression it was meant to replace.
        return sc_sasa_filename \
            .replace('sasas-sidechains.h5', '') \
            .replace('/features/', '/cluster/') + \
            "-".join(['sidechains', cluster_algorithm, radius,
                      cluster_distance] + (
                          ['kmedoids' + str(kmedoids_updates)]
                          if cluster_algorithm == 'khybrid'
                          else []))

    ClusterFiles = namedtuple(
        'ClusterFiles',
        ['assignments', 'distances', 'center_indices',
         'structure_centers', 'feature_centers'])

    cluster_results = []
    for radius in cluster_radii:

        CLUSTER_STEM = make_cluster_name(sc_sasa_filename, radius)

        ASSIGNMENTS_FILE = CLUSTER_STEM + '-assignments.h5'
        DISTANCES_FILE = CLUSTER_STEM + '-distances.h5'
        FEATURE_CENTERS_FILE = CLUSTER_STEM + '-feature-centers.npy'
        CENTER_INDS_FILE = CLUSTER_STEM + '-center-inds.npy'

        assignments, distances, center_features, center_indices = \
            jug.iteratetask(tasks.cluster_features(
                sasa_sidechain_h5,
                ASSIGNMENTS_FILE,
                DISTANCES_FILE,
                FEATURE_CENTERS_FILE,
                CENTER_INDS_FILE,
                radius,
                cluster_distance,  # was a hard-coded 'euclidean', ignoring the parameter
                cluster_algorithm,
                cluster_iterations=kmedoids_updates
            ), n=4)

        ctr_structs = tasks.write_struct_ctrs(
            trajectories, topology, center_indices,
            CLUSTER_STEM + "-structure-centers.h5")

        result = ClusterFiles(
            assignments=assignments,
            distances=distances,
            center_indices=center_indices,
            feature_centers=center_features,
            structure_centers=ctr_structs)
        cluster_results.append(result)

        PLOT_PATH = os.path.join(
            'figures', 'implied',
            os.path.basename(CLUSTER_STEM) + '-implied-timescales.png')

        tasks.implied_timescales(
            assignments=assignments,
            plot_path=PLOT_PATH
        )

    return cluster_results
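
This example also relies on `jug.iteratetask`, which wraps a task returning an n-tuple so it can be unpacked into n independent results before the task has actually run. A minimal sketch:

import jug

@jug.TaskGenerator
def divide(a, b):
    return a // b, a % b

# the 2-tuple result can be unpacked immediately; each element is a Tasklet
# that other tasks (or bvalue) can consume later
quotient, remainder = jug.iteratetask(divide(7, 2), n=2)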