Exemplo n.º 1
0
def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column, nlargest,
                output_file):
    """Write per-base k-mer scores for each chromosome to a bigWig file.

    Parameters:
        genomic_fasta: path to the genome FASTA file (read with dinopy).
        kmer_score_h5: HDF5 file holding k-mer scores.
        motif_id: motif identifier selecting the score table.
        score_column: name of the score column to use.
        nlargest: number of top-scoring k-mers to keep in the lookup.
        output_file: path of the bigWig file to create.
    """
    print('Reading scores')
    kmer_score_lookup = read_score_lookup_array(kmer_score_h5, motif_id,
                                                score_column, nlargest)

    far = dinopy.FastaReader(genomic_fasta)

    # First pass over the FASTA: collect (chromosome, length) pairs for the
    # bigWig header, which must be written before any entries.
    header = []
    for __, chromosome, length, __ in far.entries():
        header.append((chromosome.decode(), length))

    with pyBigWig.open(output_file, 'w') as bw:
        bw.addHeader(header)

        # Second pass: score each chromosome and emit one value per base
        # (span=1, step=1 starting at position 0).
        for sequence, chromosome, _length, _interval in far.entries():
            chromosome = chromosome.decode()
            values = consume_sequence(kmer_score_lookup, sequence)
            bw.addEntries(chromosome, 0, values=values, span=1, step=1)
Exemplo n.º 2
0
def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column, nlargest,
                output_file):
    """Write a BED line for every k-mer window found in the score dictionary.

    Each chromosome is scanned with a sliding window of K_MER_LENGTH; a
    window matches if either the k-mer itself or its reverse complement is
    a key of the score dictionary. Matches are written as
    ``chromosome<TAB>start<TAB>end<TAB>name`` lines.
    """
    kmer_dict = get_kmer_dict(kmer_score_h5, motif_id, score_column, nlargest)

    far = dinopy.FastaReader(genomic_fasta)

    with open(output_file, 'w') as bed_file:
        for sequence, chromosome, length, interval in far.entries():
            chromosome = chromosome.decode()
            sequence = sequence.decode()

            for start, kmer in enumerate(iter_kmers(sequence, k=K_MER_LENGTH)):
                # Fall back to the reverse complement on a miss; skip the
                # window entirely when neither orientation is known.
                if kmer not in kmer_dict:
                    kmer = dinopy.reverse_complement(kmer)
                    if kmer not in kmer_dict:
                        continue
                name = kmer_dict[kmer]

                end = start + K_MER_LENGTH
                bed_file.write(f'{chromosome}\t{start}\t{end}\t{name}\n')
def get_stacks_data(args):
    """Read in stacks VCF file.

    Merges consecutive VCF records that belong to the same chromosome
    (stacks locus) into a single VCFRecord carrying the locus' reference
    sequence, then adds a variant-free VCFRecord for every FASTA locus
    not present in the VCF so all loci can be compared with the ground
    truth.

    Reads from *args*:
        args.stacks_haplo: path to the stacks haplotypes VCF file.
        args.stacks_fa: path to the (indexed) stacks loci FASTA file.

    Returns:
        list of VCFRecord objects, one per locus.
    """
    loc_seqs = dict()
    haplotypes_file = pysam.VariantFile(args.stacks_haplo, 'r')
    indexed_far = dp.FastaReader(args.stacks_fa)

    # State for the merge loop below:
    #   record     -> VCFRecord currently being assembled
    #   last_locus -> chromosome name the current record belongs to
    #   chromosome -> chromosome of the most recently seen variant
    record = None
    last_locus = None
    chromosome = None

    # merge consecutive lines describing SNPs on the same locus.
    for variant_record in haplotypes_file:
        chromosome = variant_record.chrom
        if record is None:
            # First variant overall: start the first record.
            seq = list(indexed_far[chromosome])[0].sequence
            record = VCFRecord(chromosome, seq, [variant_record], False)
            last_locus = variant_record.chrom
        elif variant_record.chrom == last_locus:
            # Same locus as before: accumulate the variant.
            record.data.append(variant_record)
        else:
            # New locus: flush the finished record and start a fresh one.
            loc_seqs[last_locus] = record
            seq = list(indexed_far[chromosome])[0].sequence
            record = VCFRecord(chromosome, seq, [variant_record], False)
            last_locus = variant_record.chrom
    # print("LOC SEQS", loc_seqs)

    # write the last record
    if chromosome is not None:
        loc_seqs[chromosome] = record

    # add all remaining loci without variants to the dictionary
    # so that they can be compared with the ground truth
    far = dp.FastaReader(args.stacks_fa)
    for seq, name, *_ in far.chromosomes():

        # Split off the second part of stacks locus names.
        # In the vcf files, the information is not included
        chromosome = name.decode().split()[0]

        # add a record without variants (variant record) for loci without
        # variants detected by stacks.
        if chromosome not in loc_seqs:
            loc_seqs[chromosome] = VCFRecord(chromosome, seq, [], False)
    # print("LOC SEQS 2", loc_seqs)
    return list(loc_seqs.values())
Exemplo n.º 4
0
def fasta2dazzdb(args: argparse.Namespace):
    """Fix the FASTA/FASTQ header/id's to a DAZZ_DB compatible format such that
    these reads can be imported.

    The input format is taken from ``args.format`` when given; otherwise it
    is guessed from the input file extension (``.fq``/``.fastq`` -> fastq,
    anything else -> fasta). Reading from stdin with no explicit format is
    an error. Optionally writes the old-name -> new-name mapping as JSON to
    ``args.translations``.
    """
    file_format = args.format
    if not file_format and args.input != sys.stdin:
        # os.path.splitext correctly yields an empty extension for names
        # without a dot (a plain rfind('.') would treat the whole filename
        # as the extension in that case).
        file_ext = os.path.splitext(args.input.name)[1].lstrip('.').lower()
        file_format = 'fastq' if file_ext in ('fq', 'fastq') else 'fasta'

    if not file_format:
        logger.error("Could not determine file format. Please specify using "
                     "the -f option.")
        return

    if file_format == 'fastq':
        seq_iter = dinopy.FastqReader(args.input).reads(quality_values=False)
    else:
        seq_iter = dinopy.FastaReader(args.input).reads(read_names=True)

    # DAZZ_DB names are derived from the input file name; stdin has none,
    # so fall back to a user-supplied or random name.
    if args.input == sys.stdin:
        name = args.name if args.name else random_string(10)
    else:
        name = os.path.basename(args.input.name)

    moviename = daligner.generate_moviename_hash(name)
    name_mapping = {}
    seq_iter = daligner.fix_header(seq_iter, moviename, name_mapping)

    logger.info("Converting FASTA/FASTQ entries...")
    with dinopy.FastaWriter(args.output, force_overwrite=True) as fw:
        fw.write_entries(seq_iter)

    if args.translations:
        logger.info("Writing name mappings to file...")
        json.dump(name_mapping, args.translations)

    logger.info("Done.")
Exemplo n.º 5
0
def overlap(args):
    """Find exact pairwise overlaps between FASTA sequences and emit GFA2.

    Writes one ``S`` line per input sequence and one ``E`` line per
    detected overlap of at least ``args.min_length`` to ``args.output``.
    """
    out = args.output
    out.write(gfa.gfa_header())
    overlapper = ExactOverlapper()
    reader = dinopy.FastaReader(args.fasta_input)

    logger.info("Building suffix tree and searching for pairwise overlaps...")
    for record in reader.entries():
        seq_name = record.name.decode('utf-8')
        sequence = record.sequence.decode('utf-8')
        out.write(gfa.gfa_line("S", seq_name, record.length, "*"))
        # Register both orientations so overlaps on either strand are found.
        overlapper.add_sequence(seq_name + "+", sequence)
        overlapper.add_sequence(seq_name + "-",
                                dinopy.reverse_complement(sequence))

    overlaps = overlapper.overlaps(args.min_length)

    logger.info("Writing to GFA2...")

    for aread, bread, astart, aend, bstart, bend in overlaps:
        out.write(gfa.gfa_line(
            "E", "*", aread, bread, astart, aend, bstart, bend, "*"))

    logger.info("Done.")
Exemplo n.º 6
0
            if "errorprob" in res.content.decode():
                res_all.append(res.content.decode())
            c += 1
            if c%1000 == 0:
                print(c)
    except:
        print("Max retries, going to sleep for 100 sec.")
        print("j= " + str(j))
        sleep(100)
        run_requests(j, res_all, entry_list, c, payload, header, url)


if __name__ == '__main__':
    # Target endpoint and request headers for the MESA error-probability API.
    url = 'https://mesa.mosla.de/api/all' #'http://137.248.121.201:5000/api/all'
    header = {'content-type': 'application/json;charset=UTF-8'}
    # Base payload is loaded from a local JSON config and tweaked below.
    with open("mesa.json") as json_file:
        config = json.load(json_file)
    f = dp.FastaReader("mcgr_test.fasta")
    payload = config
    payload['asHTML'] = False
    payload["key"] = ''
    # c counts processed entries, i is the starting index for run_requests,
    # res_all accumulates the raw response bodies.
    c = 0
    res_all = list()
    i = 0
    entry_list = list(f.entries())
    run_requests(i, res_all, entry_list, c, payload, header, url)
    # Dump every collected response on its own line.
    with open("results.txt", "w") as f_:
        for ent in res_all:
            f_.write(ent)
            f_.write("\n")
Exemplo n.º 7
0
import dinopy
import pandas as pd
import tables
import numpy as np

# Gather the unique sequences across all input FASTA files. Accumulating
# into a set directly avoids the repeated list concatenation of the
# original approach.
uniq_seqs = set()
for fasta_path in snakemake.input:
    reader = dinopy.FastaReader(fasta_path)
    uniq_seqs.update(entry.sequence.decode() for entry in reader.entries())

# Create an empty matrix and fill it afterwards; building it any other way
# costs too much memory.
sample_names = [path.split("/")[-1].split(".")[0] for path in snakemake.input]
df = pd.DataFrame(0, index=uniq_seqs, columns=sample_names, dtype=np.uint16)

# Fill the matrix: one column per sample, one row per unique sequence.
for sample_name, fasta_path in zip(sample_names, snakemake.input):
    reader = dinopy.FastaReader(fasta_path)
    for entry in reader.entries():
        seq = entry.sequence.decode()
        # Entry names embed the abundance as ";size=<count>;" — extract it.
        value = np.uint16(entry.name.decode().split("size=")[1].split(";")[0])
        df.at[seq, sample_name] = value

# save to file
df.index.name = "sequences"
df.to_hdf(snakemake.output[1], key='df', mode='w')
df.to_csv(snakemake.output[0])
Exemplo n.º 8
0
import pandas as pd
import logging
import dinopy

# Build an id -> taxonomy table from the FASTA headers and store it as HDF5.
reader = dinopy.FastaReader(snakemake.input[0])
# Each header is "<id> <taxonomy...>"; split only on the first space.
rows = [entry.name.decode().split(" ", 1) for entry in reader.entries()]
df = pd.DataFrame(rows, columns=["id", "taxonomy"]).set_index(keys="id",
                                                              drop=True)

df.to_hdf(snakemake.output[0], key='df', mode='w')
Exemplo n.º 9
0
def sequence_dict(fasta_file):
    """Return a mapping from record name to sequence for *fasta_file*."""
    reader = dinopy.FastaReader(str(fasta_file))
    return dict((entry.name, entry.sequence) for entry in reader.entries())
Exemplo n.º 10
0
 def __init__(self, file_source):
     """Initialize the base class and open a dinopy FastaReader on *file_source*."""
     super().__init__()
     # Reader is kept on the instance for later sequence access.
     self.reader = dinopy.FastaReader(file_source)