Exemplo n.º 1
0
    def test_renamed(self):
        """
        Check if sequences in a FASTA file are properly renamed.

        The input FASTA is renamed, the renamed file is renamed back with
        ``reverse=True`` and the sequence names of the round-tripped file
        are compared with the original ones. Afterwards the ``seq2`` entry
        is removed from the renaming dictionary to check that
        MissingSeqNameError is raised.
        """
        try:
            renamer = bioformats.seqname.FastaSeqRenamer()
            renamer.read_renaming_dict(self.__renaming_dict)
            with open(self.__output, "w") as output_fasta:
                for line in renamer.renamed(self.__fasta):
                    output_fasta.write(line)

            # perform the reverse renaming with its own renamer object
            # (previously it was created but the first renamer was used)
            rev_renamer = bioformats.seqname.FastaSeqRenamer()
            rev_renamer.read_renaming_dict(self.__renaming_dict)
            with open(self.__rev_output, "w") as rev_output_fasta:
                for line in rev_renamer.renamed(self.__output, reverse=True):
                    rev_output_fasta.write(line)

            # compare the original and reverse-renamed FASTA files; whole
            # lists are compared because zip() would silently stop at the
            # shorter one and hide missing or extra sequences
            original_fasta = Fasta(self.__fasta)
            rev_renamed_fasta = Fasta(self.__rev_output)
            self.assertEqual(list(original_fasta.keys()),
                             list(rev_renamed_fasta.keys()))

            # check if the missing sequence exception is raised
            del renamer.renaming_dict["seq2"]
            with self.assertRaises(MissingSeqNameError):
                for _ in renamer.renamed(self.__fasta):
                    pass
        finally:
            # remove the temporary files even if an assertion failed
            for path in (self.__output, self.__rev_output):
                if os.path.exists(path):
                    os.unlink(path)
Exemplo n.º 2
0
def fasta_to_df(fasta: pyfaidx.Fasta) -> pd.DataFrame:
    """Build a Pandas DataFrame with one row per record of a parsed FASTA.

    Every record name is split on ``"_"`` and the six resulting fields fill
    the columns ``gene_name``, ``feature_id``, ``strand``, ``start``,
    ``stop`` and ``seq_hash``. The record sequence and its reverse
    complement are added as two further columns.

    Parameters
    ----------
    fasta : :class:`pyfaidx.Fasta`
        Parsed FASTA whose record names consist of six ``"_"``-separated
        fields.

    Returns
    -------
    :class:`pd.DataFrame`
        Table with the name fields, ``sequence`` and ``reverse_complement``.
    """
    record_names = list(fasta.keys())

    name_fields = [fasta[name].name.split("_") for name in record_names]
    column_names = [
        "gene_name",
        "feature_id",
        "strand",
        "start",
        "stop",
        "seq_hash",
    ]
    df = pd.DataFrame(name_fields, columns=column_names)

    # the split fields are all strings; give each column its working dtype
    column_types = {
        "feature_id": "category",
        "gene_name": "category",
        "strand": "category",
        "start": np.uint32,
        "stop": np.uint32,
        "seq_hash": np.int32,
    }
    df = df.astype(column_types, copy=False)

    forward = [fasta[name][:].seq for name in record_names]
    df["sequence"] = pd.Series(forward)

    backward = [fasta[name][:].reverse.complement.seq for name in record_names]
    df["reverse_complement"] = pd.Series(backward)
    return df
Exemplo n.º 3
0
    def test_renamed(self):
        """
        Check renaming of FASTA sequence names between NCBI naming formats.

        Every pair of (source, target) formats is converted and the
        sequence names of the result are compared with the names in the
        example file of the target format. Error handling for unknown
        formats and for an incorrect accession number dictionary is
        checked, followed by the conversion with sequence versions removed.
        """
        def write_renamed(seq_renamer, source_path):
            # write the renamed lines of source_path to the output file
            with open(self.__output, 'w') as renamed_fasta:
                for renamed_line in seq_renamer.renamed(source_path):
                    renamed_fasta.write(renamed_line)

        def drop_indices(*fasta_paths):
            # remove existing .fai index files of the given FASTA files
            for fasta_path in fasta_paths:
                index_path = fasta_path + '.fai'
                if os.path.isfile(index_path):
                    os.unlink(index_path)

        formats = self.__formats
        for src_fmt, dst_fmt in itertools.product(formats[:-1], formats):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            for acc_num_file in self.__acc_num_files:
                renamer.read_ncbi_acc_num(acc_num_file, src_fmt, dst_fmt)
            # convert sequence IDs
            input_file = os.path.join(self.__test_dir,
                                      'ncbi_' + src_fmt + '.fa')
            write_renamed(renamer, input_file)

            example_file = os.path.join(self.__test_dir,
                                        'ncbi_' + dst_fmt + '.fa')
            drop_indices(self.__output, example_file)

            # compare the obtained file to the example
            output_fasta = Fasta(self.__output)
            example_fasta = Fasta(example_file)
            self.assertEqual(output_fasta.keys(), example_fasta.keys())

        # test for an incorrect format
        # NOTE(review): these two calls pass the file path as the third
        # argument, unlike the (file, from, to) order used above - confirm
        chr_refseq_path = os.path.join(self.__test_dir, 'ncbi_chr_refseq.fa')
        with self.assertRaises(SeqRenameError):
            renamer = bioformats.seqname.NcbiFastaSeqRenamer()
            renamer.read_ncbi_acc_num('unknown', 'chr_refseq',
                                      chr_refseq_path)
        with self.assertRaises(SeqRenameError):
            renamer.read_ncbi_acc_num('chr_refseq', 'unknown',
                                      chr_refseq_path)

        # test for an incorrect NCBI accession number dictionary
        with self.assertRaises(IncorrectDictError):
            renamer.read_ncbi_acc_num(self.__chr_incorrect, 'refseq_full',
                                      'chr_refseq')

        # check if sequence versions are removed
        renamer = bioformats.seqname.NcbiFastaSeqRenamer()
        for acc_num_file in self.__acc_num_files:
            renamer.read_ncbi_acc_num(acc_num_file, 'chr', 'genbank',
                                      remove_seq_version=True)
        input_file = os.path.join(self.__test_dir, 'ncbi_chr.fa')
        example_file = os.path.join(self.__test_dir, 'ncbi_genbank_nover.fa')
        write_renamed(renamer, input_file)
        drop_indices(self.__output, example_file)

        output_fasta = Fasta(self.__output)
        example_fasta = Fasta(example_file)
        self.assertEqual(output_fasta.keys(), example_fasta.keys())
        os.unlink(example_file + '.fai')
Exemplo n.º 4
0
    def test_renamed(self):
        """
        Check if sequences in a FASTA file are properly renamed.

        The input FASTA is renamed, the renamed file is renamed back with
        ``reverse=True`` and the sequence names of the round-tripped file
        are compared with the original ones. Afterwards the ``seq2`` entry
        is removed from the renaming dictionary to check that
        MissingSeqNameError is raised.
        """
        try:
            renamer = bioformats.seqname.FastaSeqRenamer()
            renamer.read_renaming_dict(self.__renaming_dict)
            with open(self.__output, 'w') as output_fasta:
                for line in renamer.renamed(self.__fasta):
                    output_fasta.write(line)

            # perform the reverse renaming with its own renamer object
            # (previously it was created but the first renamer was used)
            rev_renamer = bioformats.seqname.FastaSeqRenamer()
            rev_renamer.read_renaming_dict(self.__renaming_dict)
            with open(self.__rev_output, 'w') as rev_output_fasta:
                for line in rev_renamer.renamed(self.__output, reverse=True):
                    rev_output_fasta.write(line)

            # compare the original and reverse-renamed FASTA files; whole
            # lists are compared because zip() would silently stop at the
            # shorter one and hide missing or extra sequences
            original_fasta = Fasta(self.__fasta)
            rev_renamed_fasta = Fasta(self.__rev_output)
            self.assertEqual(list(original_fasta.keys()),
                             list(rev_renamed_fasta.keys()))

            # check if the missing sequence exception is raised
            del renamer.renaming_dict['seq2']
            with self.assertRaises(MissingSeqNameError):
                for _ in renamer.renamed(self.__fasta):
                    pass
        finally:
            # remove the temporary files even if an assertion failed
            for path in (self.__output, self.__rev_output):
                if os.path.exists(path):
                    os.unlink(path)
Exemplo n.º 5
0
class GenVCF:
    """Pairs a reference FASTA with a VCF file for k-mer based scans."""

    def __init__(self, ref_fasta_path, vcf_path, kmer_size, nprocs):
        """Open both files and pick the sequence names to work on.

        The names used are the VCF sequence names that also occur in the
        reference; when there are none, all reference names are used.
        """
        self.vcf_path = vcf_path
        self.fasta_path = ref_fasta_path
        self.ref = Fasta(ref_fasta_path)
        self.vcf = VCF(vcf_path)
        self.kmer_size = kmer_size
        self.nprocs = nprocs
        self.directory = None
        self.keys = [
            name for name in self.vcf.seqnames if name in self.ref.keys()
        ]
        if not self.keys:
            self.keys = self.ref.keys()
            print('No common keys found. Using reference.')

    def set_destination(self, path):
        """Remember the directory passed on when results are saved."""
        self.directory = path

    def full_string(self):
        """Return the selected reference sequences concatenated in order."""
        return "".join(str(self.ref[chrom]) for chrom in self.keys)

    def get_kmer_frequency(self, klen):
        """Return the k-mer counter of the concatenated reference."""
        return kt.get_kmer_count(self.full_string(), klen)

    def vcf_scan(self):
        """Process the VCF region-wise in a process pool and save results.

        A value of 0 for ``nprocs`` is replaced by the CPU count. Each
        per-region result is a mapping whose 'all', 'singletons' and
        'not_singletons' entries contribute a transition item (index 0)
        and a variant item (index 1); 'mismatches' contributes index 0
        only. Every collected list starts with its label and is handed to
        ``merge_and_save``. Returns None.
        """
        if self.nprocs == 0:
            self.nprocs = mp.cpu_count()
        regions = kt.get_split_vcf_regions(self.vcf_path, self.nprocs)
        tasks = [
            [region, self.vcf_path, self.fasta_path, self.kmer_size]
            for region in regions
        ]
        pool = mp.Pool(self.nprocs)
        region_results = pool.starmap_async(process_region, tasks).get()
        pool.close()

        # labelled accumulators, keyed by the category name used in results
        variants = {
            'all': ['all_vars'],
            'singletons': ['singleton_vars'],
            'not_singletons': ['notsingleton_vars'],
        }
        transitions = {
            'all': ['all_transitions'],
            'singletons': ['singleton_transitions'],
            'not_singletons': ['notsingleton_transitions'],
            'mismatches': ['mismatches'],
        }
        for region_result in region_results:
            for category, payload in region_result.items():
                if category in transitions:
                    transitions[category].append(payload[0])
                if category in variants:
                    variants[category].append(payload[1])

        collected = [
            variants['all'],
            variants['singletons'],
            variants['not_singletons'],
            transitions['all'],
            transitions['singletons'],
            transitions['not_singletons'],
            transitions['mismatches'],
        ]
        destination = kt.prepare_directory(parent=self.directory)
        for labelled in collected:
            merge_and_save(labelled, destination)
        return None
Exemplo n.º 6
0
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Write the sequences of a FASTA file selected by a regex to a new file.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.

    outfa : str
        Filename of output fasta file. Cannot be the same as infa.

    regex : str, optional
        Regular expression searched for in the sequence names.

    v : bool, optional
        If set to True, keep the sequences *not* matching regex instead.

    force : bool, optional
        If set to True, overwrite outfa (and its .fai index) if it exists.

    Returns
    -------
        fasta : Fasta instance
            pyfaidx Fasta instance of newly created file

    Raises
    ------
    ValueError
        If infa equals outfa, if outfa exists and force is False, or if
        no sequence is left after filtering.
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    if os.path.exists(outfa):
        if not force:
            raise ValueError(
                "{} already exists, set force to True to overwrite".format(
                    outfa))
        os.unlink(outfa)
        index_file = outfa + ".fai"
        if os.path.exists(index_file):
            os.unlink(index_file)

    pattern = re.compile(regex)
    source = Fasta(infa, filt_function=pattern.search)
    selected = source.keys()
    if v:
        # invert the selection: read everything, drop the matching names
        matching = selected
        source = Fasta(infa)
        selected = [name for name in source.keys() if name not in matching]

    if len(selected) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as out:
        for name in selected:
            record = source[name]
            out.write(">{}\n".format(record.name))
            out.write("{}\n".format(record[:].seq))

    return Fasta(outfa)
Exemplo n.º 7
0
def filter_fasta(infa, outfa, regex=".*", v=False, force=False):
    """Write the sequences of a FASTA file selected by a regex to a new file.

    Parameters
    ----------
    infa : str
        Filename of input fasta file.

    outfa : str
        Filename of output fasta file. Cannot be the same as infa.

    regex : str, optional
        Regular expression searched for in the sequence names.

    v : bool, optional
        If set to True, keep the sequences *not* matching regex instead.

    force : bool, optional
        If set to True, overwrite outfa (and its .fai index) if it exists.

    Returns
    -------
        fasta : Fasta instance
            pyfaidx Fasta instance of newly created file

    Raises
    ------
    ValueError
        If infa equals outfa, if outfa exists and force is False, or if
        no sequence is left after filtering.
    """
    if infa == outfa:
        raise ValueError("Input and output FASTA are the same file.")

    if os.path.exists(outfa):
        if not force:
            raise ValueError(
                    "{} already exists, set force to True to overwrite".format(outfa))
        os.unlink(outfa)
        index_file = outfa + ".fai"
        if os.path.exists(index_file):
            os.unlink(index_file)

    pattern = re.compile(regex)
    source = Fasta(infa, filt_function=pattern.search)
    selected = source.keys()
    if v:
        # invert the selection: read everything, drop the matching names
        matching = selected
        source = Fasta(infa)
        selected = [name for name in source.keys() if name not in matching]

    if len(selected) == 0:
        raise ValueError("No sequences left after filtering!")

    with open(outfa, "w") as out:
        for name in selected:
            record = source[name]
            out.write(">{}\n".format(record.name))
            out.write("{}\n".format(record[:].seq))

    return Fasta(outfa)
Exemplo n.º 8
0
def get_sequence_fasta(region, reference=None, padding=True):
    """Return the sequence of ``region`` read from the ``reference`` FASTA.

    The chromosome name of the region is adapted to the naming of the
    reference, judged by whether the first reference name contains "chr":
    the prefix is stripped or added when the two disagree.

    With ``padding`` the ``start_w_padding``/``stop_w_padding`` attributes
    of the region bound the slice, otherwise ``start``/``stop`` are used.
    """
    ref = Fasta(reference)
    first_ref_name = list(ref.keys())[0]
    ref_has_prefix = "chr" in first_ref_name
    region_has_prefix = "chr" in region.chr

    if region_has_prefix and not ref_has_prefix:
        chrom = region.chr.split("chr")[1]
    elif ref_has_prefix and not region_has_prefix:
        chrom = "chr" + region.chr
    else:
        chrom = region.chr

    if padding:
        lower, upper = region.start_w_padding, region.stop_w_padding
    else:
        lower, upper = region.start, region.stop
    return ref[chrom][lower:upper].seq
Exemplo n.º 9
0
def write_sequence(args):
    """Write the requested regions of ``args.fasta`` to the chosen output.

    ``args`` is the parsed command line namespace. The attributes read
    here are ``fasta``, ``regex``, ``default_seq``, ``lazy``,
    ``delimiter``, ``no_rebuild``, ``invert_match``, ``size_range``,
    ``split_files``, ``out`` and ``transform``.

    Regions come from ``split_regions(args)``; when it yields none, every
    sequence whose name matches ``args.regex`` is used. With
    ``invert_match`` the sequences named by those regions are excluded and
    all remaining ones are written. Output goes to one file per region
    (``split_files``), to ``args.out``, or to standard output.

    Raises
    ------
    FetchError
        Re-raised with a hint about ``--lazy`` when a region cannot be
        fetched.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()
    if args.invert_match:
        sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
        # reopen without the name filter so that all sequences are visible
        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
        regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
        split_function = ucsc_split

    header = False  # the nucleotide table header is written only once
    try:
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.size_range:
                if start is not None and end is not None:
                    sequence_len = end - start
                else:
                    sequence_len = len(fasta[name])
                if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
                    continue
            if args.split_files:  # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            elif args.out:
                outfile = args.out
            else:
                outfile = sys.stdout
            try:
                if args.transform:
                    if not header and args.transform == 'nucleotide':
                        outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                        header = True
                    outfile.write(transform_sequence(args, fasta, name, start, end))
                else:
                    for line in fetch_sequence(args, fasta, name, start, end):
                        outfile.write(line)
            except FetchError as e:
                raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
            finally:
                # only files opened here are closed; args.out and stdout
                # belong to the caller
                if args.split_files:
                    outfile.close()
    finally:
        # release the FASTA handle even when a fetch failed
        fasta.__exit__()
Exemplo n.º 10
0
def write_sequence(args):
    """Write the requested regions of ``args.fasta`` to the chosen output.

    ``args`` is the parsed command line namespace. The attributes read
    here are ``fasta``, ``regex``, ``default_seq``, ``header_function``,
    ``lazy``, ``delimiter``, ``no_rebuild``, ``invert_match``,
    ``size_range``, ``split_files``, ``out`` and ``transform``.

    Regions come from ``split_regions(args)``; when it yields none, every
    sequence whose name matches ``args.regex`` is used. With
    ``invert_match`` the sequences named by those regions are excluded and
    all remaining ones are written. Output goes to one file per region
    (``split_files``), to ``args.out``, or to standard output.

    Raises
    ------
    FetchError
        Re-raised with a hint about ``--lazy`` when a region cannot be
        fetched.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    # SECURITY NOTE(review): header_function is evaluated as Python code;
    # it must never come from an untrusted source.
    key_function = eval(args.header_function)
    fasta = Fasta(args.fasta, default_seq=args.default_seq, key_function=key_function, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()
    if args.invert_match:
        sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
        # reopen without the name filter so that all sequences are visible
        fasta = Fasta(args.fasta, default_seq=args.default_seq, key_function=key_function, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
        regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
        split_function = ucsc_split

    header = False  # the nucleotide table header is written only once
    try:
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.size_range:
                if start is not None and end is not None:
                    sequence_len = end - start
                else:
                    sequence_len = len(fasta[name])
                if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
                    continue
            if args.split_files:  # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            elif args.out:
                outfile = args.out
            else:
                outfile = sys.stdout
            try:
                if args.transform:
                    if not header and args.transform == 'nucleotide':
                        outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                        header = True
                    outfile.write(transform_sequence(args, fasta, name, start, end))
                else:
                    for line in fetch_sequence(args, fasta, name, start, end):
                        outfile.write(line)
            except FetchError as e:
                raise FetchError(str(e) + " Try setting --lazy.\n")
            finally:
                # only files opened here are closed; args.out and stdout
                # belong to the caller
                if args.split_files:
                    outfile.close()
    finally:
        # release the FASTA handle even when a fetch failed
        fasta.__exit__()
Exemplo n.º 11
0
def prepare_reference_dict(fasta_path,
                           variants,
                           delim='\t',
                           primary_chroms=True,
                           nprocs=6,
                           final_dir='/uufs/chpc.utah.edu/common/home/u0319040/longo_scratch/output/chroms/'):
    """
    Run ``process_chrom`` for every selected chromosome in a process pool.

    :param fasta_path: path to file containing reference sequence
    :param variants: path to bed file containing the variants in the reference sequence
        (currently not read by this function)
    :param delim: indicates how your variant file is separated
        (currently not used by this function)
    :param primary_chroms: True selects the names returned by
        ``get_primary_chroms_grch38``, False selects every sequence of the reference
    :param nprocs: number of worker processes to use
    :param final_dir: directory handed to ``process_chrom`` as its second argument;
        defaults to the location that was previously hard-coded
    :return: the directory created by ``prepare_directory`` for the output
    """
    start = time.time()
    # opened up front even when only the primary chromosomes are used
    # NOTE(review): presumably this also makes sure the FASTA index exists
    # before the workers start - confirm
    fa = Fasta(fasta_path)
    if primary_chroms:
        keys = get_primary_chroms_grch38(fasta_path)
    else:
        keys = fa.keys()
    directory = prepare_directory(new_folder='./ref_var_dict/')
    args = [(key, final_dir, fasta_path, directory) for key in keys]
    pool = mp.Pool(nprocs)
    try:
        # blocks until every chromosome is processed; worker errors re-raise
        pool.starmap(process_chrom, args)
    finally:
        # always shut the workers down, also when a task failed
        pool.close()
        pool.join()
    print('Done processing variants in %f' % (time.time() - start), flush=True)
    return directory
def calc_bkgd_counts(fasta_filename, region_size_min,
                    region_size_max, ignore_chroms,
                    only_chroms, verbose):
    ''' calculate nuc frequencies for normalization.

        Every window of each size from region_size_min to region_size_max
        (inclusive) is counted along every chromosome that is not in
        ignore_chroms and, when only_chroms is non-empty, is listed in
        only_chroms. Windows cut short by the end of a sequence are
        dropped. The verbose argument is accepted but not used here.

        Returns: dict mapping each region size to a Counter of the
        sequences of that size.
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename, as_raw = True)

    for chrom in fasta.keys():

        # skip data based on specified chromosomes
        if chrom in ignore_chroms: continue

        if only_chroms and chrom not in only_chroms: continue

        record = fasta[chrom]
        seq_len = len(record)
        for idx in range(seq_len + 1):

            for region_size in range(region_size_min,
                                     region_size_max + 1):

                nucs = record[idx:idx+region_size]

                nuc_counts[region_size][nucs] += 1

    # remove entries that are not equal to region_size; the keys to drop
    # are collected first because removing entries while iterating over
    # the dictionary raises RuntimeError on Python 3
    for region_size, nuc_dict in nuc_counts.items():
        truncated = [nuc for nuc in nuc_dict if len(nuc) != region_size]
        for nuc in truncated:
            del nuc_dict[nuc]

    return nuc_counts
def calc_bkgd_counts(fasta_filename, region_size_min, region_size_max,
                     ignore_chroms, only_chroms, verbose):
    ''' calculate nuc frequencies for normalization.

        Every window of each size from region_size_min to region_size_max
        (inclusive) is counted along every chromosome that is not in
        ignore_chroms and, when only_chroms is non-empty, is listed in
        only_chroms. Windows cut short by the end of a sequence are
        dropped. The verbose argument is accepted but not used here.

        Returns: dict mapping each region size to a Counter of the
        sequences of that size.
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename, as_raw=True)

    for chrom in fasta.keys():

        # skip data based on specified chromosomes
        if chrom in ignore_chroms: continue

        if only_chroms and chrom not in only_chroms: continue

        record = fasta[chrom]
        seq_len = len(record)
        for idx in range(seq_len + 1):

            for region_size in range(region_size_min, region_size_max + 1):

                nucs = record[idx:idx + region_size]

                nuc_counts[region_size][nucs] += 1

    # remove entries that are not equal to region_size; the keys to drop
    # are collected first because removing entries while iterating over
    # the dictionary raises RuntimeError on Python 3
    for region_size, nuc_dict in nuc_counts.items():
        truncated = [nuc for nuc in nuc_dict if len(nuc) != region_size]
        for nuc in truncated:
            del nuc_dict[nuc]

    return nuc_counts
Exemplo n.º 14
0
    def test_split_seq(self):
        """ Fetch a spliced sequence block by block and compare it with the
        first record of the expected gene FASTA. """
        fa = Fasta('data/chr17.hg19.part.fa')

        gene = Fasta("data/gene.bed12.fasta")
        first_gene_name = list(gene.keys())[0]
        expect = gene[first_gene_name][:].seq

        # only the first line of the bed12 file is used
        with open("data/gene.bed12") as bed_handle:
            fields = bed_handle.readline().strip().split("\t")

        chrom = fields[0]
        tx_start = int(fields[1])
        strand = fields[5]

        # parse bed12 format: block offsets (column 12) and sizes (column 11)
        # are comma-terminated lists, hence the dropped last element
        block_offsets = [int(item) for item in fields[11].split(",")[:-1]]
        block_sizes = [int(item) for item in fields[10].split(",")[:-1]]
        block_starts = [tx_start + offset for offset in block_offsets]
        block_ends = [
            begin + size for begin, size in zip(block_starts, block_sizes)
        ]

        # bed half-open
        if strand == "-":
            block_starts = [begin + 1 for begin in block_starts]
        else:
            block_ends = [finish - 1 for finish in block_ends]

        intervals = zip(block_starts, block_ends)
        result = fa.get_spliced_seq(chrom, intervals, rc=True)
        print(result.seq)
        print("====")
        print(expect)

        assert result.seq == expect
Exemplo n.º 15
0
def pairwise_align(folder, work_dir):
    """Align the first sequence of every FASTA in ``folder`` to the rest.

    For each ``*.fasta`` file in ``folder`` the file is rewritten to
    ``temp.fa`` in the current directory with ``reformat.sh``, its first
    sequence is extracted to ``<work_dir>/main.fa`` and the remaining ones
    to ``<work_dir>/temp.fa`` with ``samtools faidx``, and ``lastz`` writes
    the alignment to ``<work_dir>/<n>.maf`` where ``n`` is the position of
    the file in the directory listing.

    ``work_dir`` is created when it does not exist yet.
    """
    try:
        os.mkdir(work_dir)
    except OSError:
        # an already existing directory is fine, anything else is not
        if not os.path.isdir(work_dir):
            raise
    fastas = [
        folder + '/' + fname for fname in os.listdir(folder)
        if fname.endswith('.fasta')
    ]

    for count, fasta in enumerate(fastas):
        # "rm -f" succeeds when there is no stale index yet; a plain "rm"
        # fails in that case and "&&" would then skip reformat.sh entirely
        subprocess.call(
            "rm -f temp.fa.fai && reformat.sh in=%s out=temp.fa addunderscore overwrite=true"
            % fasta,
            shell=True)
        f = Fasta('temp.fa')
        # materialise the names: a keys view cannot be indexed or sliced
        seqs = list(f.keys())
        subprocess.call("samtools faidx temp.fa %s > %s/main.fa" %
                        (seqs[0], work_dir),
                        shell=True)
        subprocess.call(
            'samtools faidx temp.fa %s > %s/temp.fa && lastz --format=maf %s/main.fa %s/temp.fa > %s/%d.maf'
            % (' '.join(
                seqs[1:]), work_dir, work_dir, work_dir, work_dir, count),
            shell=True)
Exemplo n.º 16
0
class TestFeatureKeyFunction:
    """Checks that a key function is applied to the FASTA record names."""

    def __init__(self):
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys are the names produced by the key function."""
        expect = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3',
            'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
            'NM_001282549.1', 'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        assert sorted(self.genes.keys()) == expect

    def test_key_function_by_dictionary_get_key(self):
        """A transformed name can be used for dictionary-style access."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fragment = self.genes['MDM4'][99:150]
        assert str(fragment) == expect

    def test_key_function_by_fetch(self):
        """A transformed name can be used with Faidx.fetch."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fragment = self.faidx.fetch('MDM4', 100, 150)
        assert str(fragment) == expect

    @raises(ValueError)
    def test_duplicated_keys(self):
        """A key function producing duplicated names raises ValueError."""
        Fasta(self.fasta, key_function=get_duplicated_gene_name)
Exemplo n.º 17
0
def get_refseq(ref):
    """Return the last sequence of the FASTA file ``ref`` as a string.

    Every record is converted in turn and the last one wins; ``None`` is
    returned when the file holds no sequence.
    """
    records = Fasta(ref)
    sequence = None
    for record_name in records.keys():
        sequence = str(records[record_name])
    return sequence
Exemplo n.º 18
0
def remakeProt(fasta, outfile, idfile, id):
    """Write the longest isoform of every gene under a new numbered name.

    Record names are split into gene and isoform with ``stripName``. For
    each gene the longest record is kept (the first one on ties) and is
    written to ``outfile`` as ``>{id}_{n}``. The mapping from the new name
    to the gene is appended to ``idfile``.
    """
    fasta_index = Fasta(fasta)

    # gene -> (length, isoform) of the longest record seen so far
    longest = {}
    for protein in fasta_index.keys():
        length = len(fasta_index[protein])
        gene, isoform = stripName(protein)
        best = longest.get(gene)
        if best is None or length > best[0]:
            longest[gene] = (length, isoform)

    with open(outfile, "w") as f, open(idfile, "a") as q:
        for i, (gene, (_, isoform)) in enumerate(longest.items()):
            # try the "_P" name first, then fall back to the bare gene name
            # or the "_T" form
            name = gene + "_P" + isoform
            if name not in fasta_index:
                name = gene if gene == isoform else gene + "_T" + isoform
            q.write("{}_{}: {}\n".format(id, i, gene))
            f.write(">{}_{}\n".format(id, i))
            for line in fasta_index[name]:
                f.write("{}\n".format(str(line)))
Exemplo n.º 19
0
class FastaChunkReader:
    """Iterator over the sequences of a FASTA file in overlapping chunks.

    Each item is a dict with the keys "seqname", "start", "end" and "seq".
    Consecutive chunks of one sequence overlap by ``kmer_size`` positions;
    the last chunk of a sequence ends at the sequence end.
    """

    def __init__(self, filename, chunk_size=10000, kmer_size=31):
        self.fasta = Fasta(filename)
        self.seqnames = list(self.fasta.keys())
        self.chunk_size = chunk_size
        self.kmer_size = kmer_size
        # position of the next chunk: index into seqnames and start offset
        self.current_ref = 0
        self.current_start = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.current_ref == len(self.seqnames):
            # all sequences are consumed: release the file and stop
            self.fasta.close()
            raise StopIteration

        seqname = self.seqnames[self.current_ref]
        record = self.fasta[seqname]
        seq_length = len(record)

        start = self.current_start
        end = start + self.chunk_size
        if end >= seq_length:
            # last chunk of this sequence: clip it and move to the next one
            end = seq_length
            self.current_ref += 1
            self.current_start = 0
        else:
            # step back so that the next chunk overlaps this one
            self.current_start = end - self.kmer_size

        return {
            "seqname": seqname,
            "start": start,
            "end": end,
            "seq": record[start:end].seq,
        }
Exemplo n.º 20
0
def write_sequence(args):
    """Write the requested regions of ``args.fasta`` to the chosen output.

    ``args`` is the parsed command line namespace. The attributes read
    here are ``fasta``, ``default_seq``, ``lazy``, ``delimiter`` and
    ``split_files``.

    Regions come from ``split_regions(args)``; when it yields none, every
    sequence of the file is written. Output goes to one file per region
    when ``split_files`` is set and to standard output otherwise.

    Raises
    ------
    FetchError
        Re-raised with a hint about ``--lazy`` when a region cannot be
        fetched.
    """
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = tuple(fasta.keys())

    try:
        for region in regions_to_fetch:
            name, start, end = split_function(region)
            if args.split_files:  # open output file based on sequence name
                filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
                filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
                outfile = open(filename, 'w')
            else:
                outfile = sys.stdout
            try:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
            except FetchError as e:
                raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
            finally:
                # only files opened here are closed, never stdout
                if args.split_files:
                    outfile.close()
    finally:
        # release the FASTA handle even when a fetch failed
        fasta.__exit__()
Exemplo n.º 21
0
def main(options):
    """Print a per-base transcript strand code for every reference sequence.

    Transcripts are read with ``read_strand_file(options.strand)`` and
    looked up under ``"chr" + <sequence name>``. Each transcript is indexed
    as ``(strand, start, end)``: the slice ``start:end`` is marked on the
    plus or the minus mask depending on whether the strand is "+" or "-".
    For every sequence of ``options.ref`` a ">name" line and one line of
    codes, taken from ``MAP`` at index ``plus + 2 * minus``, are written
    to standard output; progress messages go to standard error.

    NOTE(review): MAP is presumably an array of four one-character codes
    indexed 0..3 - confirm against its definition.
    """
    transcripts = read_strand_file(options.strand)
    ref = Fasta(options.ref)
    for chrom in ref.keys():
        print(chrom, file=sys.stderr)
        print(">" + chrom)

        chrom_len = len(ref[chrom])
        plus = np.zeros(chrom_len, dtype=bool)
        minus = np.zeros(chrom_len, dtype=bool)

        for ti, transcript in enumerate(transcripts["chr" + chrom]):
            if not ti % 1000:
                print("\r" + chrom + ":trans" + str(ti), file=sys.stderr)
            if transcript[0] == "+":
                plus[transcript[1]:transcript[2]] = True
            elif transcript[0] == "-":
                minus[transcript[1]:transcript[2]] = True

        print(chrom + ":writing", file=sys.stderr)
        # 0: no transcript, 1: plus only, 2: minus only, 3: both strands
        chrom_tx_strand = "".join(MAP[1 * plus + 2 * minus])
        print(chrom_tx_strand)
        print(chrom + ":done", file=sys.stderr)
Exemplo n.º 22
0
    def flaimapper(self, settings, first_task):
        '''
        Run flaimapper_by_library for every library in a process pool.

        The chromosome lengths of settings["genome"] are collected first
        and passed to each job together with settings["overlap_range"],
        settings["size_range"] and first_task. The pool has
        settings["CPUs"] workers. After all jobs have finished, every
        result is fetched so that an error raised in a worker is raised
        here as well.

        According to the original description the per-library step makes
        and fragments bam files, runs Flaimapper on them and counts reads
        with Bedtools intersect (not visible in this method).
        '''
        overlap = settings["overlap_range"]
        size_range = settings["size_range"]

        # Example values kept from the original notes (size_range has to
        # be shorter by one than overlap):
        #   overlap = [0.888,0.916,0.939,0.957,0.97,0.979,0.985,0.9898,
        #              0.9931,0.9953,0.99685,0.99789]
        #   size_range = [24,33,47,67,97,140,197,292,432,636,950]

        # get chrom lengths
        genome = Fasta(settings["genome"], one_based_attributes=False)
        genome_lengths = {
            chrom: len(genome[chrom]) for chrom in genome.keys()
        }

        pool = mp.Pool(processes=settings["CPUs"])
        pending = []
        for library in sorted(settings["libraries"]):
            job_args = (settings, library, overlap, size_range,
                        genome_lengths, first_task)
            pending.append(
                pool.apply_async(self.flaimapper_by_library, args=job_args))
        pool.close()
        pool.join()
        for job in pending:
            job.get()
Exemplo n.º 23
0
    def test_split_seq(self):
        """ Fetch a spliced sequence block by block and compare it with the
        first record of the expected gene FASTA. """
        fa = Fasta('data/chr17.hg19.part.fa')

        gene = Fasta("data/gene.bed12.fasta")
        first_gene_name = list(gene.keys())[0]
        expect = gene[first_gene_name][:].seq

        # only the first line of the bed12 file is used
        with open("data/gene.bed12") as bed_handle:
            fields = bed_handle.readline().strip().split("\t")

        chrom = fields[0]
        tx_start = int(fields[1])
        strand = fields[5]

        # parse bed12 format: block offsets (column 12) and sizes (column 11)
        # are comma-terminated lists, hence the dropped last element
        block_offsets = [int(item) for item in fields[11].split(",")[:-1]]
        block_sizes = [int(item) for item in fields[10].split(",")[:-1]]
        block_starts = [tx_start + offset for offset in block_offsets]
        block_ends = [
            begin + size for begin, size in zip(block_starts, block_sizes)
        ]

        # bed half-open
        if strand == "-":
            block_starts = [begin + 1 for begin in block_starts]
        else:
            block_ends = [finish - 1 for finish in block_ends]

        intervals = zip(block_starts, block_ends)
        result = fa.get_spliced_seq(chrom, intervals, rc=True)
        print(result.seq)
        print("====")
        print(expect)

        assert result.seq == expect
Exemplo n.º 24
0
def prepare_reference_dict(fasta_path, variants, delim='\t', primary_chroms=True):
    """
    Run ``zip_chrom`` for every selected chromosome and save one CSV per chromosome.

    :param primary_chroms: True selects the names returned by ``get_primary_chroms``,
        False selects every sequence of the reference
    :param delim: indicates how your variant file is separated
    :param fasta_path: path to file containing reference sequence
    :param variants: path to bed file containing the variants in the reference sequence
    :return: the directory the per-chromosome ``<name>.csv`` files were written to
    """
    start = time.time()
    fa = Fasta(fasta_path)
    var_df = pd.read_csv(variants, sep=delim)
    if primary_chroms:
        keys = get_primary_chroms(fasta_path)
    else:
        keys = fa.keys()
    directory = prepare_directory(new_folder='./ref_var_dict/')
    # each task gets only the variant rows whose first column is its chromosome
    args = [(key, var_df[var_df.iloc[:, 0] == key], fasta_path, directory)
            for key in keys]
    pool = mp.Pool(mp.cpu_count())
    try:
        # blocks until every chromosome is processed; worker errors re-raise
        results = pool.starmap(zip_chrom, args)
    finally:
        # always shut the workers down, also when a task failed
        pool.close()
        pool.join()
    print('Done processing variants in %f' % (time.time() - start))
    directory = prepare_directory(parent='./ref_var_dict/')
    for chrom_key in results:
        # chrom_key[0] names the output file, chrom_key[1] holds its records
        with open(directory + chrom_key[0] + '.csv', 'w') as fp:
            # terminate the header line so that the first record does not
            # end up on it
            fp.write('REF_fasta\tALT\tREF_vcf\n')
            for rec in chrom_key[1]:
                for i in rec:
                    fp.write(str(i) + '\t')
                fp.write('\n')
    print('Reference dictionary prepared in %f' % (time.time() - start))
    return directory
Exemplo n.º 25
0
def parse_fasta(f, output, window_size, chromosome, line_len):
    """Split every record of a FASTA file into fixed-size window files.

    For each record, consecutive windows of ``window_size`` bases are appended
    to ``output / "<chromosome>/<chromosome>_<start>_<end>.fasta"``, wrapped at
    ``line_len`` characters per line. While the first record is processed, any
    pre-existing window file is removed first so that stale content is not
    appended to. Windows that contain no sequence are skipped without creating
    an empty file.

    Parameters
    ----------
    f : path-like
        Input FASTA file.
    output : path-like supporting ``/``
        Base output directory; the ``<chromosome>`` sub-directory must exist.
    window_size : int
        Number of bases per window.
    chromosome : str
        Name used for the sub-directory and file prefix.
    line_len : int
        Maximum line width of the written sequence.
    """
    # ----  Load in input information ----
    fasta_file = Fasta(str(f))

    first_record = True
    for header in list(fasta_file.keys()):
        print( f"Parsing out {header} into {window_size} base pair windows")

        sequence = fasta_file[str(header)][:].seq
        number_of_out_seqs = len(sequence) // window_size + 1

        for window_index in range(number_of_out_seqs):
            start_pos = window_index * window_size
            end_pos = start_pos + window_size
            current_file_path = output / f"{chromosome}/{chromosome}_{start_pos}_{end_pos}.fasta"
            if first_record and current_file_path.is_file():
                os.remove(current_file_path)

            seq = textwrap.wrap(sequence[start_pos:end_pos], line_len)
            if not seq:
                # Nothing left in this window: do not create an empty file.
                continue

            with open(current_file_path, 'a') as current_file:
                current_file.write(">{}\n".format(header))
                current_file.write("{}\n".format("\n".join(seq)))
        first_record = False
    return
Exemplo n.º 26
0
def locate(args):
    """Report every occurrence of the given k-mers in a FASTA database.

    ``args.kmer`` is a comma-separated list of k-mers, ``args.db`` the FASTA
    file to search and ``args.out`` the output table. Each k-mer is searched
    together with its reverse complement. If ``args.fg`` is a non-empty path,
    only the sequence ids listed in the first column of that file (and present
    in the database) are searched; otherwise all sequences are searched.

    The output is tab-separated with columns ``kmer, sid, start, end, srd``:
    ``start`` is 1-based, ``end`` is inclusive, and ``srd`` is ``+`` when the
    matched text is one of the given k-mers and ``-`` otherwise.
    """
    kmers, fd, fo = args.kmer, args.db, args.out
    fg = args.fg
    db = Fasta(fd)
    #
    kseqs = kmers.split(',')
    kseqs2 = [Sequence(name='kmer',seq=kseq).reverse.complement.seq for kseq in kseqs]
    ptn = re.compile("|".join([ "("+k+")" for k in kseqs+kseqs2 ]))
    #
    if fg != '':
        seqs = []
        with open(fg, 'r') as fhg:
            for line in fhg:
                fields = line.rstrip("\n").split()
                # Skip blank / whitespace-only lines.
                if not fields:
                    continue
                gid = fields[0]
                # 'gid' is the header row of the id list.
                if gid == 'gid' or gid not in db:
                    continue
                seqs.append(gid)
    else:
        seqs = db.keys()

    with open(fo, 'w') as fho:
        fho.write('kmer\tsid\tstart\tend\tsrd\n')
        for seqid in seqs:
            seq = db[seqid][0:].seq
            for m in ptn.finditer(seq):
                start, end = m.start()+1, m.end()
                srd = "+" if m.group(0) in kseqs else "-"
                fho.write(f"{m.group(0)}\t{seqid}\t{start}\t{end}\t{srd}\n")
Exemplo n.º 27
0
def get_fasta_length(filename):
    """Return the length of the first record in the FASTA file."""
    reference = Fasta(filename)
    record_names = list(reference.keys())
    return len(reference[record_names[0]])
Exemplo n.º 28
0
    def __init__(self, assembly, data_manager):
        """Locate and open the FASTA file configured for ``assembly``.

        The assembly name is normalised by dropping ``.`` and ``_`` (names
        starting with ``R6`` are mapped to ``R627``) and compared with the sub
        data types of the ``'FASTA'`` config obtained from ``data_manager``.
        If no sub type matches, a warning is logged and the process exits
        with status 3.
        """
        sub_type = assembly.replace('.', '').replace('_', '')
        if sub_type.startswith('R6'):
            sub_type = 'R627'
        fasta_config = data_manager.get_config('FASTA')

        # Start from None so that "no match" reaches the error branch below
        # instead of raising UnboundLocalError.
        filepath = None
        for sub_type_config in fasta_config.get_sub_type_objects():
            if not sub_type == sub_type_config.get_sub_data_type():
                self.logger.info(sub_type_config.get_sub_data_type())
                continue
            filepath = sub_type_config.get_filepath()
            self.logger.info(filepath)
            break

        if filepath is None:
            self.logger.warning("Can't find Assembly filepath for %s",
                                assembly)
            sys.exit(3)

        self.assembly = assembly
        self.filepath = filepath
        fasta_data = Fasta(filepath)
        # An index with no records is treated as stale: wait, delete the
        # ``.fai`` file and reopen until at least one record is present.
        while len(fasta_data.keys()) == 0:
            time.sleep(6)
            os.remove(filepath + ".fai")
            fasta_data = Fasta(filepath)
        self.fasta_data = fasta_data
Exemplo n.º 29
0
def generate_sizes(name, genome_dir):
    """Write ``<genome_dir>/<name>/<name>.fa.sizes`` with one
    ``seqname<TAB>length`` line per sequence of the FASTA file."""
    fasta_path = os.path.join(genome_dir, name, "{}.fa".format(name))
    sizes_path = fasta_path + ".sizes"
    genome = Fasta(fasta_path)
    with open(sizes_path, "w") as handle:
        handle.writelines(
            "{}\t{}\n".format(seqname, len(genome[seqname]))
            for seqname in genome.keys()
        )
Exemplo n.º 30
0
def generate_sizes(name, genome_dir):
    """Create the ``.sizes`` companion of ``<genome_dir>/<name>/<name>.fa``.

    Every line of the output holds a sequence name and its length, separated
    by a tab.
    """
    fa = os.path.join(genome_dir, name, "{}.fa".format(name))
    genome = Fasta(fa)
    format_row = "{}\t{}\n".format
    with open(fa + ".sizes", "w") as out:
        for seqname in genome.keys():
            seq_len = len(genome[seqname])
            out.write(format_row(seqname, seq_len))
Exemplo n.º 31
0
def ref_genome_as_string(ref_fasta, keys=None):
    """Return the selected records of a FASTA file concatenated into one string.

    :param ref_fasta: path to the reference FASTA file
    :param keys: iterable of record names to concatenate, in order; all
        records of the file (in file order) when None
    :return: the concatenated sequence as a ``str``
    """
    ref_genome = Fasta(ref_fasta)
    if keys is None:
        keys = ref_genome.keys()
    # join() avoids the quadratic cost of repeated ``+=`` on large genomes.
    return "".join(str(ref_genome[chrom]) for chrom in keys)
Exemplo n.º 32
0
class FastaWrapper(GenomeWrapper):
    """Genome wrapper that serves sequence slices from a FASTA file.

    Depending on the constructor flags, ``self.fasta`` is one of:

    * an open ``Fasta`` handle (default),
    * a dict mapping record name to a character array (``in_mem=True``),
    * the FASTA file path (``thread_safe=True``), in which case a fresh
      handle is opened for every lookup.

    NOTE(review): ``self.in_mem`` and ``self.thread_safe`` are read here but
    presumably set by ``GenomeWrapper.__init__`` - confirm.
    """

    def __init__(self,
                 fasta_file,
                 alpha='dna',
                 one_hot=True,
                 channel_last=True,
                 in_mem=False,
                 thread_safe=False,
                 read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
        self.fasta = Fasta(fasta_file,
                           as_raw=True,
                           sequence_always_upper=True,
                           read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead
        if in_mem:
            fasta_dict = self._encode_seqs(self.fasta)
            self.fasta.close()
            self.fasta = fasta_dict
            # No file handle is kept once everything is in memory.
            self.thread_safe = True
        else:
            if thread_safe:
                # Keep only the path; handles are opened per lookup.
                self.fasta.close()
                self.fasta = fasta_file

    def close(self):
        """Close the underlying FASTA handle, if one is being held."""
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        """Load every record of ``fasta`` into a dict of character arrays.

        Keys are record names; values are ``np.array(list(record[:]))``.
        No one-hot encoding is performed here.
        """
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            seq = record[:]
            seq = np.array(list(seq))
            fasta_dict[record.name] = seq
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        """Return the characters of ``chrom[start:stop]`` as a numpy array."""
        if self.in_mem:
            seq = self.fasta[chrom][start:stop]
        else:
            if self.thread_safe:
                fasta = Fasta(self.fasta,
                              as_raw=True,
                              sequence_always_upper=True,
                              read_ahead=self.read_ahead)
                try:
                    seq = np.array(list(fasta[chrom][start:stop]))
                finally:
                    # Do not leak the per-call handle if the lookup fails.
                    fasta.close()
            else:
                seq = np.array(list(self.fasta[chrom][start:stop]))
        return seq
Exemplo n.º 33
0
    def test_ncbiseqrename_fasta(self):
        """
        Run the ``ncbirenameseq`` sub-command on a FASTA file and compare the
        resulting sequence names with the expected UCSC-named file.
        """
        positional = ['', 'ncbirenameseq', self.__fasta, 'genbank',
                      self.__output, 'ucsc']
        options = ['--chr', self.__chr, '--unloc', self.__unloc,
                   '--unpl', self.__unpl, '--fasta']
        sys.argv = positional + options

        bioformats.cli.bioformats()

        # the renamed file must list the same names as the reference file
        expected_names = Fasta(self.__ucsc_fasta).keys()
        obtained_names = Fasta(self.__output).keys()
        for expected, obtained in zip(expected_names, obtained_names):
            self.assertEqual(expected, obtained)
Exemplo n.º 34
0
def get_lengths(input, cutoff):
    """Return ``{contig_name: 1}`` for every contig shorter than ``cutoff``."""
    contigs = Fasta(input)
    return {
        contig: 1
        for contig in contigs.keys()
        if len(contigs[contig]) < cutoff
    }
Exemplo n.º 35
0
def get_fasta_sequence(filename, start, end, key=0):
    """Get chunk of indexed fasta sequence at start/end points.

    :param filename: path to the FASTA file
    :param start: slice start passed to the record
    :param end: slice end passed to the record
    :param key: either the integer position of the record within the file
        (default: the first record) or the record name itself
    :return: the sequence of the requested slice
    """

    from pyfaidx import Fasta
    refseq = Fasta(filename)
    if isinstance(key, int):
        chrom = list(refseq.keys())[key]
    else:
        # A non-integer key is used directly as the record name; previously
        # this path failed because ``chrom`` was never assigned.
        chrom = key
    return refseq[chrom][start:end].seq
Exemplo n.º 36
0
def tadpole_fastqs(f1,
                   out,
                   verbose=False,
                   k=66,
                   threads=1,
                   tadpole_bin='tadpole.sh',
                   bm1=1,
                   bm2=1,
                   mincontig="auto",
                   mincountseed=100,
                   return_contigs=False):
    ''' Assemble the reads of ``f1`` with tadpole (bbtools) into ``out``.

    Returns False when tadpole reports a RuntimeException or generates zero
    contigs. Otherwise the tadpole stderr is saved to a log file (``out`` with
    'contig' replaced by 'log', or ``out + '.log'``) and the function returns
    True, or a list of ``(name, sequence)`` tuples if ``return_contigs``.
    '''
    #tadpole.sh in=tmp/f1_ACTTCGCCAGAGTTGG_GTGCGAGAGGGTA.fastq out=mini k=66 overwrite=True bm1=1 bm2=1 mincountseed=4
    cmd = f"{tadpole_bin} in={f1} out={out} k={k} overwrite=True bm1={bm1} bm2={bm2} t={threads} -Xmx6g mincontig={mincontig} mincountseed={mincountseed} rcomp=f"
    if verbose:
        print(cmd)
    pp = subprocess.run(cmd.split(),
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
    stderr_text = pp.stderr.decode()

    if "RuntimeException" in stderr_text:
        #means the command failed
        print("RunTimeEx")
        return False

    # The count is the second tab-separated field on the marker line.
    after_marker = stderr_text.split("Contigs generated:")[1]
    marker_line = after_marker.split('\n')[0]
    n_contigs = int(marker_line.split('\t')[1])
    if n_contigs == 0:
        return False

    log_ofn = out.replace('contig', 'log') if "contig" in out else out + '.log'
    if verbose:
        print(stderr_text)
        print("tadpole log %s" % (log_ofn))

    with open(log_ofn, 'w') as logout:
        logout.write(stderr_text)

    if not return_contigs:
        return True

    from pyfaidx import Fasta
    assembled = Fasta(out, as_raw=True)
    return [(name, assembled[name][:]) for name in assembled.keys()]
Exemplo n.º 37
0
def format_alleles_from_sam(ref_path, fa_ofn):
    """Prepend ``@SQ`` header lines to ``fa_ofn`` and write a per-read table.

    Step 1: a temporary file is written with one ``@SQ`` line per record of
    the FASTA at ``ref_path`` followed by the existing content of ``fa_ofn``;
    the temporary file is then moved over ``fa_ofn`` with ``mv``.

    Step 2: ``fa_ofn`` is read with ``pysam.AlignmentFile`` and, for every
    read, one tab-separated row is written to the path obtained by replacing
    every occurrence of ``'fa'`` in ``fa_ofn`` with ``'concise'``.

    The read name must split on ``'_'`` into exactly seven fields
    (cbc, umi, molid, contig_str, contig_cov, reads, contig_len).
    NOTE(review): ``fa_ofn`` is presumably a header-less SAM text file - confirm.
    """
    # we add the right header #TODO improve
    import uuid
    from pyfaidx import Fasta
    import subprocess
    import pysam

    unique_filename = "tmpfa_"+str(uuid.uuid4())

    with open(unique_filename, 'w') as ofa, open(fa_ofn) as infile:
        FF = Fasta(ref_path)
        # (name, length) for every reference record
        entries = [(f, len(FF[f][:])) for f in FF.keys()]
        for entry in entries:
            ofa.write(f'@SQ\tSN:{entry[0]}\tLN:{entry[1]}\n')
            print(entry)

        for line in infile.readlines():
            #print(line)
            ofa.write(line)

    # Replace the original file with the header-prefixed copy.
    # NOTE(review): the command is split on whitespace, so paths containing
    # spaces would be broken apart.
    subprocess.run(f'mv {unique_filename} {fa_ofn}'.split())
    
    with open(fa_ofn.replace('fa', 'concise'), 'w') as tab:
        tab.write("\t".join(['cbc', 'umi', 'molid', 'contig_str', 'contig_cov', 'reads', 'contig_len','cigar', 'allele', 'pruneda']) + '\n')

        for read in pysam.AlignmentFile(fa_ofn):
            start = read.reference_start
            trim_start = 1111
            trim_end = 1515
            # refi is never used and quei is only incremented, never read.
            refi = 0
            quei = 0
            #if start<trim_start:
            # Running position, advanced by the length of every handled op.
            current = start
            allele = []
            for i in read.cigartuples:
                # i is an (operation, length) pair
                if i[0] == 0: #match
                    quei += i[1]
                    if len(allele) == 0:
                        # first entry is reported relative to trim_start
                        allele.append(f'{current+i[1]-trim_start}M')
                    # NOTE(review): allele gains at most one entry per cigar
                    # tuple, so this condition looks unreachable - confirm.
                    elif len(allele) == 100*len(read.cigartuples)-1:
                        allele.append(f'{trim_end -current+i[1]}M')
                    else:
                        allele.append(f'{current+i[1]}M')
                    current = current + i[1]
                if i[0] == 1: #insertion
                    allele.append(f'{current+i[1]}I')
                    current = current + i[1]
                if i[0] == 2: #deletion
                    allele.append(f'{current+i[1]}D')
                    current = current + i[1]

            cbc, umi, molid, contig_str, contig_cov, reads, contig_len = read.query_name.split('_')
            # "pruned" allele: all entries except the first and the last
            pruneda = ":".join(allele[1:-1])
            if len(pruneda) == 0 or len(allele) == 0:
                pruneda = 'NA'
            tab.write("\t".join([cbc, umi, molid, contig_str, contig_cov, reads, contig_len, read.cigarstring, ':'.join(allele), pruneda]) + '\n')
Exemplo n.º 38
0
def main(options):
    """
    Print every record of ``options.ref`` in upper case with each match of
    the module-level ``motif`` replaced by ``motif_length`` N characters.
    """
    reference = Fasta(options.ref)
    pattern = re.compile(motif)
    for chrom in reference.keys():
        print(">"+chrom)
        mask = "N"*motif_length
        upper_seq = reference[chrom][:].seq.upper()
        print(pattern.sub(mask, upper_seq))
def get_prot_lens(faa_file, phage):
    """Map the locus tag of every protein in ``faa_file`` to its length."""
    digits = get_digits(faa_file)
    proteins = Fasta(faa_file)
    return {
        get_locus_tag(rec_id, digits=digits, phage=phage): len(str(proteins[rec_id]))
        for rec_id in proteins.keys()
    }
def get_prot_lens(faa_file, phage):
    """Return a dict of ``locus tag -> sequence length`` for a protein FASTA.

    The locus tag of each record is derived with ``get_locus_tag`` using the
    digit count reported by ``get_digits`` for the file.
    """
    n_digits = get_digits(faa_file)
    records = Fasta(faa_file)
    lengths = {}
    for seq_id in records.keys():
        tag = get_locus_tag(seq_id, digits=n_digits, phage=phage)
        lengths[tag] = len(str(records[seq_id]))
    return lengths
Exemplo n.º 41
0
Arquivo: UM.py Projeto: tzeitim/ogtk
def clean_fasta(fn, prefix):
    """Rewrite the FASTA file ``fn`` with each sequence on a single line.

    The records are written to ``<prefix>tmp.fa`` and that file is then moved
    over ``fn`` with ``mv``.

    :param fn: path of the FASTA file to rewrite in place
    :param prefix: prefix of the temporary file name
    """
    tmp_fa_fn = prefix+'tmp.fa'
    fa = Fasta(fn)
    try:
        with open(tmp_fa_fn, 'w') as tmp_fa:
            for entry in fa.keys():
                tmp_fa.write(">%s\n%s\n"%(entry, fa[entry][:].seq.replace('\n', '')))
    finally:
        # Release the FASTA handle even if writing fails.
        fa.close()
    # Pass the arguments as a list so paths containing spaces stay intact.
    subprocess.run(["mv", tmp_fa_fn, fn])
Exemplo n.º 42
0
    def test_ncbiseqrename_fasta(self):
        """
        Run ``ncbirenameseq`` on a FASTA file, compare the resulting sequence
        names with the expected UCSC-named file and remove the files created.
        """
        positional = ['', self.__fasta, 'genbank', self.__output, 'ucsc']
        options = ['--chr', self.__chr, '--unloc', self.__unloc,
                   '--unpl', self.__unpl, '--fasta']
        sys.argv = positional + options

        bioformats.cli.ncbirenameseq()

        # the renamed file must list the same names as the reference file
        expected_names = Fasta(self.__ucsc_fasta).keys()
        obtained_names = Fasta(self.__output).keys()
        for expected, obtained in zip(expected_names, obtained_names):
            self.assertEqual(expected, obtained)

        # clean up the output and the index files created by Fasta()
        for leftover in (self.__ucsc_fasta + '.fai',
                         self.__output,
                         self.__output + '.fai'):
            os.unlink(leftover)
Exemplo n.º 43
0
def binding_sites(kmer, genome_fp):
    """Return ``{record: indices}`` of ``kmer`` and its reverse complement.

    For every record of the genome, the indices reported by
    ``substr_indices`` for the k-mer are followed by those reported for
    ``revcomp(kmer)``. Raises ValueError when the genome has no records.
    """
    genome = Fasta(genome_fp)
    kmer = str(kmer)
    locations = {}
    for record in genome.keys():
        seq = str(genome[record])
        hits = substr_indices(kmer, seq)
        # append reversed primer locations as well
        hits += substr_indices(revcomp(kmer), seq)
        locations[record] = hits
    if not locations:
        raise ValueError(
            "No locations for {} found in fg genome!".format(kmer))
    return locations
def write_read_lengths_to_file(read_fasta_files, output_file):
    """Write a table with the length of every sequence in the given FASTA files.

    :param read_fasta_files: iterable of FASTA file paths
    :param output_file: path of the tab-separated output table with columns
        ``fasta_file`` (base name of the path), ``seq_id`` and ``read_len``
    """
    # The context manager closes the table even if a FASTA fails to load.
    with open(output_file, "w") as out:
        out.write("fasta_file\tseq_id\tread_len\n")
        for r in list(read_fasta_files):
            f = Fasta(r)
            fasta = r.split("/")[-1]
            for i in f.keys():
                length = len(str(f[i]))
                out.write("%s\t%s\t%s\n" % (fasta, i, length))
Exemplo n.º 45
0
def chromosome_ends(genome_fp):
    '''
    Map every record of the genome to ``[start, end]`` (0-based, inclusive)
    within the concatenation of all records in file order, so the second
    record starts at the length of the first.
    '''
    genome = Fasta(genome_fp)
    boundaries = {}
    offset = 0
    for record in genome.keys():
        length = len(genome[record])
        boundaries[record] = [offset, offset + length - 1]
        offset += length
    return boundaries
def split_fasta(number_files, fasta_file):
    # Split ``fasta_file`` into ``number_files`` FASTA files holding
    # consecutive runs of records. Output names are derived from the input
    # name by replacing everything from ".fa" onwards with ".<k>.fasta"
    # (k = 1..number_files).
    # NOTE: this block uses Python 2 syntax (print statement) and relies on
    # range() and keys() returning lists.
    try:
        fasta=Fasta(fasta_file)
    except:
        print "could not open fasta"
        exit()
    number_seqs=len(fasta.keys())
    # records per output file
    splits=int(np.ceil(number_seqs/number_files))
    #print(splits)
    # start index of every chunk
    ranges=range(0, number_seqs, splits)
    print(ranges)
    # the last entry is overwritten so it serves as the final stop index
    # NOTE(review): for some inputs ``ranges`` may have fewer than
    # number_files + 1 entries, making ranges[i+1] below fail - verify.
    ranges[-1]=number_seqs
    print(ranges)

    
    for i in range(0, number_files):
        start=ranges[i]
        stop=ranges[i+1]
        label=re.sub(r"\.fa.*","."+str(i+1)+".fasta", fasta_file)
        out=open(label,"w")
        
        for f in fasta.keys()[start:stop]:
            out.write(">"+f+"\n"+str(fasta[f])+"\n")
        out.close()
Exemplo n.º 47
0
def fillgaps(consensusdict, fasta):
    """
    Overwrite regions of a FASTA file, opened as mutable, with the given
    sequences.

    Every key of ``consensusdict`` has the form ``"start:end"``; for each
    record of the file the slice ``[start:end]`` is asserted to have length
    ``end - start`` and is then replaced by the mapped value.
    """
    print("filling consensus...")
    scaffolds = Fasta(fasta, mutable=True)
    for chrom in scaffolds.keys():
        for region in consensusdict.keys():
            bounds = region.split(":")
            begin = int(bounds[0])
            end = int(bounds[1])
            assert (end - begin) == len(scaffolds[chrom][begin:end].seq)
            scaffolds[chrom][begin:end] = consensusdict[region]
    return None
Exemplo n.º 48
0
def generate_gap_bed(fname, outname):
    """ Write the location of every run of N characters to a BED file.

    Parameters
    ----------
    fname : str
        Filename of input FASTA file.

    outname : str
        Filename of output BED file.
    """
    genome = Fasta(fname)
    with open(outname, "w") as bed:
        for chrom in genome.keys():
            sequence = genome[chrom][:].seq
            for gap in re.finditer(r'N+', sequence):
                row = "{}\t{}\t{}\n".format(chrom, gap.start(0), gap.end(0))
                bed.write(row)
Exemplo n.º 49
0
def fasta_stats(fasta_fp):
    """
    Return ``(number of bases, number of records)`` for a FASTA file. Opening
    the file with Fasta also creates its index (.fai) for later searching.
    May be slow for very large files.
    """
    # pyfaidx can't handle blank lines within records, so we have to check :(
    check_empty_lines(fasta_fp)
    try:
        indexed = Fasta(fasta_fp)
        total_bases = fasta_len_quick(fasta_fp)
        return total_bases, len(indexed.keys())
    except:
        # report the problem, then let the original error propagate
        message = "\nError reading %s: invalid FASTA format?" % fasta_fp
        click.secho(message, fg="red")
        raise
Exemplo n.º 50
0
def readFASTA(x, splitKey = None):
    """
    Load sequences from a FASTA file, or pass a literal sequence through.

    If ``x`` is the path of an existing file it is opened with ``Fasta``,
    keyed by the first whitespace-separated word of each header. Otherwise
    ``x`` is assumed to be a sequence and is returned unchanged.

    :param x: file name or sequence (``str``)
    :param splitKey: optional separator; when given, a plain dict is returned
        whose keys are the part of each record key before ``splitKey``
    :return: the ``Fasta`` object, a dict of records, or ``x`` itself
    :raises TypeError: if ``x`` is not a string
    """

    # isinstance also accepts str subclasses, which the exact type check
    # used to reject.
    if not isinstance(x, str):
        raise TypeError("input must be type str. filename or sequence")
    if not os.path.isfile(x):
        return x
    tmp_o = Fasta(x, key_function=lambda key: key.split()[0])
    if splitKey is None:
        return tmp_o
    return { i.split(splitKey)[0] : tmp_o[i] for i in tmp_o.keys() }
Exemplo n.º 51
0
def size(args):
    """Print the name and length of every sequence in ``args.fi``.

    With ``args.header`` a ``seqid<TAB>size`` header is printed first; with
    ``args.bed`` each row is printed as ``name, 0, length``. Standard input
    and ``.gz``/``.bz2`` files are parsed with SeqIO; files with a supported
    FASTA extension are read through pyfaidx; anything else is logged as an
    error.
    """
    def emit(sid, length):
        if args.bed:
            print("%s\t%d\t%d" % (sid, 0, length))
        else:
            print("%s\t%d" % (sid, length))

    if args.header:
        print("seqid\tsize")
    fname, fext = op.splitext(args.fi)
    if args.fi in ['stdin', '-'] or fext in ['.gz','.bz2']:
        fh = must_open(args.fi)
        for rcd in SeqIO.parse(fh, "fasta"):
            emit(rcd.id, len(rcd))
    elif fext in [".%s" % x for x in FastaExt]:
        from pyfaidx import Fasta
        fas = Fasta(args.fi)
        for sid in fas.keys():
            emit(sid, len(fas[sid]))
    else:
        logging.error("%s is not a supported format" % fext)
Exemplo n.º 52
0
    EXAMPLE:   genomeiden_combined.py my_fasta.fa [OPTIONS]'
    print >>sys.stderr, msg
    sys.exit(1)

# Script options; 'windowed' is switched on by the --windowed flag below.
options = { 'windowed': False }
# genomeiden_combined.py my_fasta.fa [OPTIONS]
# First command-line argument: the FASTA file to open.
fasta_filename = sys.argv[1] 
if len(sys.argv) >2:
    # Only the second argument is inspected for an option flag.
    coord_format = sys.argv[2] 

    if coord_format == '--windowed':
        options['windowed'] = True

test_fasta = Fasta(fasta_filename)

# Record names of the FASTA file (printed with a Python 2 print statement).
names = test_fasta.keys()
print names
def find_all_slippery_seq(seq,n=7):
    """Collect candidate windows of length ``n`` from ``seq``.

    NOTE: Python 2 code (``xrange``, eager ``map``). The body is cut off
    after the last ``for`` header in this copy of the source.
    NOTE(review): ``-seq[i:i+n]`` presumably yields the reverse complement
    of a sequence object - confirm against the type of ``seq``.
    """
   
    slippery_seqs = []
    seqs = []

    # searches for slip seq every nucleotide
    for i in xrange(len(seq)): 
        # forward and revcomp seqs
        strands = ('+', '-')
        # start index recorded for the '+' and '-' window respectively
        starts = (i, i + n) 
        zipped_seqs = zip( (seq[i:i+n], -seq[i:i+n]), strands, starts )
        # appends each (sequence, strand, start) tuple to seqs
        map(seqs.append,  zipped_seqs)
    i = 0 
    for zseq, zstrand, zidx in seqs:
 def test_keys(self):
     """Keys of genes.fasta opened with split_char='|' match the expected sorted list."""
     genes = Fasta('data/genes.fasta', split_char='|')
     expected_keys = [
         '530364724', '530364725', '530364726', '530373235', '530373237',
         '530384534', '530384536', '530384538', '530384540', '543583738',
         '543583740', '543583785', '543583786', '543583788', '543583794',
         '543583795', '543583796', '557361097', '557361099', '563317589',
         'AB821309.1', 'KF435149.1', 'KF435150.1', 'NM_000465.3',
         'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
         'NM_001282549.1', 'NR_104212.1', 'NR_104215.1', 'NR_104216.1',
         'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
         'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
         'XR_241079.1', 'XR_241080.1', 'XR_241081.1', 'dbj',
     ]
     assert sorted(genes.keys()) == expected_keys
#!usr/bin/python

from pyfaidx import Fasta
from Bio.Seq import translate

# Translate every record of the nucleotide FASTA and write the result as a
# FASTA file, one sequence per line.
tara=Fasta("./databases/OM-RGC_seq.release.fna")

# The context manager closes the output even if a translation fails.
with open("./databases/OM-RGC_seq.translated.fasta","w") as tara_aa:
    for s in tara.keys():
        tara_aa.write(">"+s+"\n"+translate(tara[s])+"\n")
Exemplo n.º 55
0
 def test_keys(self):
     """Keys of genes.fasta produced by get_gene_name match the expected sorted list."""
     fasta = Fasta('data/genes.fasta', key_function=get_gene_name)
     expected_keys = [
         'BARD1', 'FGFR2', 'MDM4',
         'gi|530364724|ref|XR_241079.1|',
         'gi|530364725|ref|XR_241080.1|',
         'gi|530364726|ref|XR_241081.1|',
         'gi|530373235|ref|XM_005265507.1|',
         'gi|530373237|ref|XM_005265508.1|',
         'gi|530384534|ref|XM_005249642.1|',
         'gi|530384536|ref|XM_005249643.1|',
         'gi|530384538|ref|XM_005249644.1|',
         'gi|530384540|ref|XM_005249645.1|',
         'gi|543583738|ref|NM_001282548.1|',
         'gi|543583740|ref|NM_001282549.1|',
         'gi|543583785|ref|NM_000465.3|',
         'gi|543583786|ref|NM_001282543.1|',
         'gi|543583788|ref|NM_001282545.1|',
         'gi|543583794|ref|NR_104212.1|',
         'gi|543583795|ref|NR_104215.1|',
         'gi|557361097|gb|KF435149.1|',
     ]
     assert sorted(fasta.keys()) == expected_keys
Exemplo n.º 56
0
    CRICK_MAX =  count_crickMAX(args)
    print "now starting Fasta import"
    seq_in = Fasta(args.seqin)
    print "done with Fasta import"
    clusters = open(args.clusters)
    outsam = args.samout


# path = '/Volumes/data/epiGBS/Baseclear/Athal/'
# path = '/Volumes/data/epiGBS/DNAVISION/Project_DNA11032___140919_SN170_0407_AC52R6ACXX/Sample_DNA11032-001-L1/output/seqykJJfz/scabiosa/'
# path = '/tmp/'
# path = '/Volumes/data/epiGBS/FINAL/Scabiosa/BASECLEAR/'
# seq_in = Fasta(path+'Scabiosa_combined.fa')
#fasta_in = SeqIO.parse(open('/tmp/test.fa', 'r'), 'fasta')
# Map the part of every record key before the first ';' to the full key.
seq_in_keymap = {}
for key in seq_in.keys():
    seq_in_keymap[key.split(';')[0]] = key
    # The record is looked up for every key, but the value is not used in
    # the code visible here.
    faidx_rec = seq_in[key]

# clusters = open(path +'derep.uc', 'r')
# outsam = path+'derep_out.sam'

#clusters = open('/Volumes/data/epiGBS/test_scabi/cluster_sorted_a.uc', 'r')
#out_fa = open('/Volumes/data/epiGBS/test_scabi/output3.fa', 'w')
#outsam = '/Volumes/data/epiGBS/test_scabi/output3.sam'

#seq_in = SeqIO.parse(open('/Volumes/data/galaxy/database/files/009/dataset_9152.dat', 'r'), 'fastq')
#clusters = open('/Volumes/data/epiGBS/test_scabi/cluster_923.uc', 'r')
#cluster_records = pickle.load(open( "/tmp/save.p", "rb" ))
#
#print 'boe'
Exemplo n.º 57
0
def rename(args):
    """Rename the sequences of a FASTA file to numbered chromosomes/contigs.

    Sequences whose id matches ``^(chr)?NN`` or whose long name contains
    ``chromosome NN`` are treated as chromosomes and renamed to
    ``<prefix_chr><number>``; all remaining sequences are numbered in file
    order and renamed to ``<prefix_ctg><number>``. With ``args.merge_short``
    the remaining sequences are instead concatenated, separated by
    ``args.gap`` N characters, into a single record named ``<prefix_chr>x``
    (or ``<prefix_chr>99`` when chromosome numbers have two digits).

    Inputs are read from ``args``: ``fi`` (input FASTA), ``fo`` (output
    FASTA), ``fmf`` (forward map old -> new) and ``fmb`` (backward map
    new -> old). Exits with status 1 when no chromosome is detected or the
    input extension is not supported.
    """
    import re
    from pyfaidx import Fasta

    fi, fo, fmf, fmb = args.fi, args.fo, args.fmf, args.fmb
    merge_short, gap = args.merge_short, args.gap
    prefix_chr, prefix_ctg = args.prefix_chr, args.prefix_ctg

    db = Fasta(fi)

    ptn1 = "^(chr)?([0-9]{1,2})"
    ptn2 = "chromosome *([0-9]{1,2})"

    # sdic: chromosome id -> [chromosome number, size]
    # cdic: other sequence id -> [running number, size]
    sdic, cdic = dict(), dict()
    ccnt = 1
    for sid in db.keys():
        size = len(db[sid])
        res1 = re.search(ptn1, sid, re.IGNORECASE)
        if res1:
            sdic[sid] = [int(res1.group(2)), size]
        else:
            sid_long = db[sid].long_name
            res2 = re.search(ptn2, sid_long, re.IGNORECASE)
            if res2:
                sdic[sid] = [int(res2.group(1)), size]
            else:
                cdic[sid] = [ccnt, size]
                ccnt += 1

    if not sdic:
        print("Error: no chromosomes detected")
        sys.exit(1)

    slst = sorted(sdic.items(), key = lambda t: t[1][0])
    clst = sorted(cdic.items(), key = lambda t: t[1][0])

    nchrom = slst[-1][1][0]
    sdigits = ndigit(nchrom)
    cdigits = ndigit(clst[-1][1][0]) if len(clst) > 0 else 1
    sfmt = "%s%%0%dd" % (prefix_chr, sdigits)
    cfmt = "%s%%0%dd" % (prefix_ctg, cdigits)
    logging.debug("%d chromosomes, %d scaffolds/contigs" % (len(sdic), len(cdic)))

    _, fext = op.splitext(fi)
    if fext not in [".%s" % x for x in FastaExt]:
        logging.error("%s is not a supported format" % fext)
        sys.exit(1)

    # One row format for both maps: source id/start/end, strand, target
    # id/start/end, running index.
    map_fmt = "%s\t%d\t%d\t+\t%s\t%d\t%d\t%d\n"

    # The context manager closes all three outputs even if an assertion or
    # a write fails part-way through.
    with open(fo, "w") as fho, open(fmf, "w") as fhf, open(fmb, "w") as fhb:
        for sid, sval in slst:
            scnt, size = sval
            nsid = sfmt % scnt
            fhf.write(map_fmt % (sid, 0, size, nsid, 0, size, scnt))
            fhb.write(map_fmt % (nsid, 0, size, sid, 0, size, scnt))
            nrcd = SeqRecord(Seq(str(db[sid])), id = nsid, description = '')
            SeqIO.write(nrcd, fho, "fasta")
        i = nchrom + 1
        if len(clst) > 0 and merge_short:
            zid = "%sx" % prefix_chr
            if sdigits == 2:
                zid = "%s99" % prefix_chr
            else:
                assert sdigits == 1, "wrong number of chroms: %d" % sdigits
            pos = 0
            # Collect the pieces and join once instead of growing a string.
            seq_parts = []
            for cid, sval in clst:
                ccnt, size = sval
                start, end = pos, pos + size
                if pos > 0:
                    start += gap
                    end += gap
                    seq_parts.append("N" * gap)
                seq_parts.append(str(db[cid]))
                fhf.write(map_fmt % (cid, 0, size, zid, start, end, i))
                fhb.write(map_fmt % (zid, start, end, cid, 0, size, i))
                pos = end
                i += 1
            nrcd = SeqRecord(Seq("".join(seq_parts)), id = zid, description = '')
            SeqIO.write(nrcd, fho, "fasta")
        else:
            for cid, sval in clst:
                ccnt, size = sval
                ncid = cfmt % ccnt
                fhf.write(map_fmt % (cid, 0, size, ncid, 0, size, i))
                fhb.write(map_fmt % (ncid, 0, size, cid, 0, size, i))
                nrcd = SeqRecord(Seq(str(db[cid])), id = ncid, description = '')
                SeqIO.write(nrcd, fho, "fasta")
                i += 1
from pyfaidx import Fasta

# Open the FASTA file at the literal path 'name'.
maysFasta = Fasta('name')

# The list of record names is requested but the result is discarded.
maysFasta.keys()
        print species
        x[species] = 0
        y[species] = 0
        z[species] = []
        for file in os.listdir('.'):
            if file.endswith('.fai') and protId[species] in file:
                with open(file,'r') as f:
                    lines = f.readlines()
                    for line in lines:
                        if line:
                            x[species] += int(line.split('\t')[1])#abs(int(line.split('\t')[3]) - int(line.split('\t')[2]))#max(map(int,line.split('\t')[2:4]))-min(map(int,line.split('\t')[2:4]))
        for folder in [folder2 for folder2 in fastaFolders if species+'.fa' in os.listdir(folder2)]:
            try:
                fa = Fasta(folder+species+'.fa')
                #bedText = '\n'.join('\t'.join(['_'.join(line.split('_')[:-2])] + line.split('_')[-2:]) for line in fa.keys())
                y[species] += sum([len(fa[key][:].seq) for key in fa.keys()])#findlen(BedTool(bedText, from_string=True))
            except:
                print 'Error for ' + folder+species+'.fa'
            print y[species]

    """
    with open('finalSyntenyMultipleSpecies.bed','r') as f:
        print 'Bed Open...'
        lineOut = []
        for line in f.readlines():
            lineOut.append('-'.join(line.split('\t')[0:4])+'|'+line[line.rfind('\t')+1:])
        for line in lineOut:
            for seq in line.split('|'):
                y[specId[seq.split('-')[0]]] += abs(int(seq.split('-')[3]) - int(seq.split('-')[2]))
    """