Example No. 1
 def test_reverse_insertion(self):
     ''' check that reverse_indel works correctly for insertions
     '''
     genome = Fasta(self.fa)
     var = self.Var(pos=11, chrom='N', ref='G', alts=['GAA'])
     rev = reverse_indel(var, genome)
     self.assertEqual(rev.pos, 10)
     self.assertEqual(rev.ref, 'G')
     self.assertEqual(rev.alts, ['GTT'])
     genome.close()
Example No. 2
 def test_reverse_deletion(self):
     ''' check that reverse_indel works correctly for deletions
     '''
     genome = Fasta(self.fa)
     var = self.Var(pos=10, chrom='N', ref='CTA', alts=['C'])
     rev = reverse_indel(var, genome)
     self.assertEqual(rev.pos, 7)
     self.assertEqual(rev.ref, 'CTA')
     self.assertEqual(rev.alts, ['C'])
     genome.close()
Example No. 3
def INDEX_GENOME(OUTDIR, GENOME_FILE):
	LOGGER.info('Indexing the genome')
	GENOMEIDX = Fasta(GENOME_FILE)
	GENOMEPREFIX = os.path.splitext(GENOME_FILE)[0]
	FAIDX = pd.read_csv(GENOME_FILE + '.fai', sep='\t', names=['SCAFFOLD', 'SCAFF_LENGTH', 'three', 'four', 'five'])
	#FAIDX = FAIDX[['SCAFFOLD', 'SCAFF_LENGTH']]
	FILE = GENOMEPREFIX + '.fai'
	INDEX = os.path.join(OUTDIR, FILE)
	FAIDX.to_csv(INDEX, sep='\t', header=False, index=False)
	return INDEX
Example No. 4
 def parse_fasta(self, file_name: str) -> None:
     tf = tempfile.NamedTemporaryFile()
     if self.naked:
         tf.seek(0)
         tf.write(bytes(file_name, 'utf-8'))
         tf.flush()
         file_name = tf.name
     self.store = Fasta(file_name)
     self.ids = self.store.keys()
     tf.close()
Example No. 5
def extract_chromosome_data(chromosome_key, start, end):
    data_path = path_for_chromosome_data(chromosome_key)

    if not data_path.exists():
        raise Exception("Chromosome data not downloaded")

    all_data = Fasta(str(data_path))
    sliced_data = all_data[chromosome_key][start:end].seq

    return sliced_data
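Record slicing in pyfaidx, as used above, follows Python conventions: 0-based start, end-exclusive. A minimal usage sketch (the chromosome key and coordinates are illustrative only):

# Sketch with an illustrative chromosome key: Python-style slicing is 0-based
# and end-exclusive, so this returns the first ten bases of chr1.
first_ten = extract_chromosome_data('chr1', 0, 10)
# For 1-based inclusive coordinates, pyfaidx also offers Fasta.get_seq:
# Fasta(str(path_for_chromosome_data('chr1'))).get_seq('chr1', 1, 10)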
Example No. 6
def gc_correct(input, output, reference, frac_n, frac_r, iter, frac_lowess):
    fasta = Fasta(reference)
    bed_lines = [
        BedLine(*map(attempt_numeric, x.split("\t"))) for x in open(input)
    ]
    corrected = correct(bed_lines, fasta, frac_n, frac_r, iter, frac_lowess)

    with open(output, "wb") as ohandle:
        for line in corrected:
            ohandle.write(bytes(str(line) + "\n", 'utf-8'))
Example No. 7
def get_prot_lens(faa_file, phage):
    len_dict = {}
    digits = get_digits(faa_file)
    #def make_seq_len_dict(faa):
    f = Fasta(faa_file)
    for i in f.keys():
        name = get_locus_tag(i, digits=digits, phage=phage)
        length = len(str(f[i]))
        len_dict[name] = length
    return len_dict
Example No. 8
    def post(self):
        gene_ids = request.get_json(force=True)['gene_ids']
        edit = request.get_json(force=True)['edit']
        genome = request.get_json(force=True)['genome']
        if not gene_ids:  # TODO improve
            raise BadRequest('gene_ids not set')

        if genome not in ['hg19', 'mm10']:  #
            raise BadRequest(f'{genome} not supported')

        if edit and len(gene_ids) != 1:
            raise BadRequest('gene_ids needs to have length 1 if editing')

        # TODO here goes all the computation for checking whether SNP and CNSD
        # influence the guides. For now return the 6 best guides
        aggregation_pipeline = [
            # filter our genes
            {
                '$match': {
                    '$and': [{
                        'gene_id': {
                            '$in': gene_ids
                        }
                    }, {
                        'genome': genome
                    }]
                }
            },
            # unwind guides so we can access their score
            # {'$unwind': '$guides'},
            # # sort by score
            # {'$sort': {'guides.score': -1}},
            # # group guides together again (contrary of unwind)
            # {'$group': {
            #     '_id': '$_id',
            #     'gene_id': {'$first': '$gene_id'},
            #     'chromosome': {'$first': '$chromosome'},
            #     'pdbs': {'$first': '$pdbs'},
            #     'exons': {'$first': '$exons'},
            #     'guides': {'$push': '$guides'}
            # }},
        ]
        result = list(guide_collection.aggregate(aggregation_pipeline))
        if edit:
            df = gencode_exons(genome)
            exons = df[(df.gene_id == gene_ids[0])]
            chromosome = exons.seqname.iloc[0]
            # TODO here i have to change things..
            fasta = Fasta(GENOME_FILE.format(genome), as_raw=True)

            seq = fasta[chromosome][min(exons.start):max(exons.end)]
            # if self.strand == '-':  # i think this is done on the client...
            #     seq = seq.reverse.complement
            result[0]['sequence'] = seq
        return result
Example No. 9
def raw_error_rate(fig_fn):
	n = 0
	tmp_out = os.path.dirname(os.path.abspath(fig_fn)) + '/raw_cons_error.out'
	for sample, read_fn, ref_fn, info_fn, cons_ep_fn in zip(samples, read_fas, ref_fns, cons_info_fns, cons_ep_fn):
		read_fa = Fasta(read_fn)
		ref_fa = Fasta(ref_fn)
		with open(ref_fn) as ref_fp, open(cons_ep_fn) as cons_ep_fp, open(info_fn) as info_fp, open(tmp_out, 'w') as out_fp:
			out_fp.write('Sample\tCopyNum\tRawError\tConsError\n')
			last_name = ''
			for cons_name in ref_fa.keys():
				read_name = cons_name.rsplit('_')[0]
				if read_name == last_name:
					continue
				copy_num, raw_error, cons_error = 0, 0, 0
				ref_seq = ref_fa[cons_name][:].seq.upper()
				read_seq = read_fa[read_name][:].seq.upper()
				raw_error = get_mp_error_rate(ref_seq, read_seq)
				if raw_error < 0: continue

				for eline in cons_ep_fp:
					if eline.startswith('#'): continue
					ele = eline.rsplit()
					name, error = ele[ep_idx['#READ_NAME']], float(ele[ep_idx['ERR_RATE']][:-1])/100.0
					if name == cons_name:
						cons_error = error
					else:
						continue

				for sline in info_fp:
					ele = sline.rsplit()
					name, num = ele[info_idx['CONS_NAME']], ele[info_idx['COPY_NUM']]
					if name == cons_name:
						copy_num = int(num)
					else:
						continue
				out_fp.write('{}\t{}\t{}\t{}\n'.format(sample, copy_num, raw_error, cons_error))
				last_name = read_name
				n += 1
				if n == 10:
					sys.exit(1)
	cmd = 'Rscript /home/gaoy1/program/circ_plot/error_rate.R {} {}'.format(tmp_out, fig_fn)
	print(cmd)
Example No. 10
    def __init__(self, ref_fasta_fn):
        """"""

        # Init dict with chromosomes names
        self.sites = OrderedDict()
        with Fasta(ref_fasta_fn) as fa:
            for ref in fa:
                self.sites[ref.name] = OrderedDict()

        # Init other self variables
        self.counter = Counter()
Example No. 11
 def __init__(self, reference, annot_file, desc):
     """
     Usage:  PrimerDesign(reference, annotation, description)
     Initialise a design object with a reference assembly and
     annotation file(s)
     """
     self.reference = Fasta(reference)
     self.annotations = BedTool(annot_file)
     self.desc = desc
     self.genome = re.sub("fasta$", "fasta.fai",
                          re.sub("fa$", "fa.fai", self.reference.filename))
Example No. 12
 def regex_filer(_fname, _regex, _v):
     infa = _fname + "_to_regex"
     os.rename(_fname, infa)
     # filter the fasta and store the output's keys
     keys_out = filter_fasta(infa,
                             outfa=_fname,
                             regex=_regex,
                             v=_v,
                             force=True).keys()
     keys_in = Fasta(infa).keys()
     return [k for k in keys_in if k not in keys_out]
Example No. 13
def fasta_extract_regions(fa_fname, intervals):
    """Extract an iterable of regions from an indexed FASTA file.

    Input: FASTA file name; iterable of (seq_id, start, end) (1-based)
    Output: iterable of string sequences.
    """
    with Fasta(fa_fname, as_raw=True) as fa_file:
        for chrom, subarr in intervals.by_chromosome():
            logging.info("Extracting sequences from chromosome %s", chrom)
            for _chrom, start, end in subarr.coords():
                yield fa_file[_chrom][start.item():end.item()]
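The as_raw=True flag used here makes pyfaidx slicing return plain strings instead of Sequence objects; a minimal sketch of the underlying call (the reference path and region are hypothetical):

# Sketch with a hypothetical reference path and region: as_raw=True means the
# slice below is already a plain str, not a pyfaidx Sequence object.
with Fasta('reference.fa', as_raw=True) as fa:
    fragment = fa['chr1'][999:1100]  # 0-based, end-exclusive slice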
Example No. 14
 def test_revcomp_whole_entry(self):
     fasta = Fasta('data/genes.fasta')
     if test_bio:
         with open('data/genes.fasta', "rU") as fh:
             seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
         assert str(
             fasta['gi|557361099|gb|KF435150.1|'][:].reverse.complement
         ) == str(
             seqio['gi|557361099|gb|KF435150.1|'].reverse_complement().seq)
     else:
         raise SkipTest
Example No. 15
def fasta(fasta_file):
    """Load organism fasta file for use in pyfaidx module

	Args:
	fasta_file = the full filepath, including the file itself, to the organism's fasta file

	Note: an index of the fasta file should also be present in the same directory. This can
		  be produced using samtools faidx command and will have the suffix .fai
	"""
    org = Fasta(fasta_file)
    return ('%s accessed' % fasta_file), org
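In practice the .fai does not have to be built beforehand: pyfaidx creates it on first access if it is missing and the directory is writable (Example No. 16 builds it explicitly with Faidx). A hedged usage sketch with a hypothetical path:

# Sketch with a hypothetical path: Fasta() writes genome.fa.fai on first use if
# it is absent, equivalent to running `samtools faidx genome.fa` beforehand.
msg, org = fasta('/data/genome.fa')
print(msg)         # '/data/genome.fa accessed'
print(org.keys())  # record names available in the FASTA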
Example No. 16
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    Faidx(target_fasta_name)
    genome_size = 0
    target_fasta = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    for value in target_fasta.values():
        genome_size += len(value)
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
    return genome_size
Example No. 17
def get_transcripts(reference_file, transcript_file, vcf_file):
    """Take a FASTA reference file and a VCF file, and generate a FASTA file
    with changes from the vcf file"""
    shutil.copyfile(reference_file, transcript_file)
    transcripts = Fasta(transcript_file, mutable=True)
    with open(vcf_file) as f:
        for (accession, pos, ref, alt) in get_variations(f):
            if accession not in transcripts:
                raise ValueError('VCF accession {0} not found in reference'.\
                                 format(accession))
            transcripts[accession][(pos - 1):pos] = alt
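A hedged usage sketch (file names are illustrative); because the Fasta is opened with mutable=True, the copied file itself is patched in place:

# Sketch with illustrative file names: reference.fa is copied to transcripts.fa,
# then each single-base REF/ALT change from variants.vcf is written into the copy.
get_transcripts('reference.fa', 'transcripts.fa', 'variants.vcf')
edited = Fasta('transcripts.fa')  # reflects the applied variants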
Example No. 18
 def test_fetch_whole_entry(self):
     fasta = Fasta('data/genes.fasta')
     if test_bio:
         with open('data/genes.fasta', "rU") as fh:
             seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
         assert str(fasta['gi|557361099|gb|KF435150.1|']) == str(
             seqio['gi|557361099|gb|KF435150.1|'].seq)
         assert fasta['gi|557361099|gb|KF435150.1|'].name == str(
             seqio['gi|557361099|gb|KF435150.1|'].name)
     else:
         raise SkipTest
Example No. 19
def processMAF(args, subtypes_dict):
    
    fasta_reader = Fasta(args.fastafile, read_ahead=1000000)
    
    nbp = (args.length-1)//2
    samples_dict = {}

    # M = np.zeros((len(samples), len(subtypes_dict)))
    numsites_keep = 0
    numsites_skip = 0
    chrseq = '0'

    f = open(args.input, 'r', encoding = "ISO-8859-1")

    reader = csv.DictReader(filter(lambda row: row[0]!='#', f), delimiter='\t')
    counter = 0
    for row in reader:

        if(row['Variant_Type'] != "SNP"): continue
            
        pos = int(row['Start_position'])
        ref = row['Reference_Allele']
        alt = row['Tumor_Seq_Allele2']
        sample = row[args.groupvar]
        
        if row['Chromosome'] != chrseq:
            sequence = fasta_reader[row['Chromosome']]
            chrseq = row['Chromosome']
        
        counter += 1
        mu_type = ref + alt
        category = getCategory(mu_type)
        lseq = sequence[pos-(nbp+1):pos+nbp].seq
        
        motif_a = getMotif(pos, lseq)
        subtype = str(category + "." + motif_a)
        st = subtypes_dict[subtype]

        if sample not in samples_dict:
            samples_dict[sample] = {}

        if subtype not in samples_dict[sample]:
            samples_dict[sample][subtype] = 1
        else:
            samples_dict[sample][subtype] += 1

        if (counter%1000 != 0): continue
        util_log.debug(args.input + ": " + str(counter) + " sites counted")

    M = DataFrame(samples_dict).T.fillna(0).values
    samples = sorted(samples_dict)

    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
    return out
Example No. 20
def read_pep_fa(protein_file):
    import pandas as pd
    proteins = Fasta(str(protein_file))
    pl = []
    for v in proteins:
        names = v.long_name.split(" ", 8)
        d = {"protein_id": names[0], 'protein_type': names[1]}
        d = {**d, **dict([n.split(":", 1) for n in names[2:]])}
        d['seq'] = str(proteins[v.name])
        pl.append(d)
    return pd.DataFrame(pl)
Example No. 21
def generate_fasta(intersection_bedtool, fasta_filename, revcomp, verbose):

    if verbose:
        print(">> generating fasta of positions ...", file=sys.stderr)

    # -s: force strandedness
    fasta_seqs = intersection_bedtool.sequence(fi=fasta_filename, s=True)

    fasta = Fasta(fasta_seqs.seqfn)

    return fasta
Example No. 22
def fasta_extract_regions(fa_fname, intervals):
    """Extract an iterable of regions from an indexed FASTA file.

    Input: FASTA file name; iterable of (seq_id, start, end) (1-based)
    Output: iterable of string sequences.
    """
    with Fasta(fa_fname, as_raw=True) as fa_file:
        for chrom, rows in groupby(intervals, lambda cse: cse[0]):
            logging.info("Extracting sequences from chromosome %s", chrom)
            for _chrom, start, end in rows:
                yield fa_file[_chrom][start:end]
Example No. 23
    def set_peak_sequences_using_fasta(self,
                                       fasta_file_location="grch38.fasta"):
        logging.info("Setting peak sequences using fasta index")
        genome = Fasta(fasta_file_location)
        i = 0
        for peak in self.peaks:
            if i % 10000 == 0:
                logging.info("%d/%d peaks processed" % (i, len(self.peaks)))
            i += 1

            peak.set_sequence_using_fasta_index(genome)
Example No. 24
    def __select_ref(self, ref_reads, min_coverage, min_ref_length,
                     downsample_high_coverage):
        """Select ref_id with a minimal coverage in both sample + downsample if needed"""
        valid_ref_reads = OrderedDict()
        c = Counter()
        with Fasta(self._fasta_fn) as fasta:
            for ref_id, ref_dict in ref_reads.items():
                try:
                    # Discard reference transcripts shorter than the threshold
                    assert len(fasta[ref_id]) > min_ref_length
                    valid_dict = OrderedDict()
                    for cond_lab, cond_dict in ref_dict.items():
                        valid_dict[cond_lab] = OrderedDict()
                        for sample_lab, read_list in cond_dict.items():
                            logger.trace(
                                f"Asserting if {ref_id} has enough coverage in {sample_lab}"
                            )
                            # Filter out if coverage too low
                            assert len(read_list) >= min_coverage
                            logger.trace(
                                f"ref_id {ref_id} has {len(read_list)} reads in {sample_lab}"
                            )
                            # Downsample if coverage too high
                            if downsample_high_coverage and len(
                                    read_list) > downsample_high_coverage:
                                read_list = random.sample(
                                    read_list, downsample_high_coverage)
                            valid_dict[cond_lab][sample_lab] = read_list

                    # If all valid add to new dict
                    logger.trace(
                        f"ref_id {ref_id} has enough coverage in all samples: keeping it"
                    )
                    valid_ref_reads[ref_id] = valid_dict

                    # Save extra info for debug
                    c["valid_ref_id"] += 1
                    for cond_lab, cond_dict in valid_dict.items():
                        for sample_lab, read_list in cond_dict.items():
                            lab = "{} {} Reads".format(cond_lab, sample_lab)
                            c[lab] += len(read_list)

                except AssertionError:
                    logger.trace(
                        f"ref_id {ref_id} does not have enough coverage in at least one sample: discarding it"
                    )
                    c["invalid_ref_id"] += 1

        logger.debug(counter_to_str(c))
        logger.info(
            "\tReferences remaining after reference coverage filtering: {}".
            format(len(valid_ref_reads)))
        return valid_ref_reads
Example No. 25
 def __init__(self, ref_fasta_path, vcf_path, kmer_size, nprocs):
     self.vcf_path = vcf_path
     self.fasta_path = ref_fasta_path
     self.ref = Fasta(ref_fasta_path)
     self.vcf = VCF(vcf_path)
     self.kmer_size = kmer_size
     self.nprocs = nprocs
     self.keys = [c for c in self.vcf.seqnames if c in self.ref.keys()]
     self.directory = None
     if len(self.keys) == 0:
         self.keys = self.ref.keys()
         print('No common keys found. Using reference.')
Example No. 26
def faabed(faa, output):
    """
    create a fake bed file to keep backwards compatibility
    """
    fa = Fasta(faa)
    chrom = 1
    with open(output, "w") as fbed:
        for seq in fa:
            s = "chrom_{}\t1\t{}\t.\t{}\n".format(chrom, len(seq), seq.name)
            fbed.write(s)
            chrom += 1
    return output
Example No. 27
def load_fasta_sequences(fasta_file, return_keys=False):
    """
    Reads a FASTA file and returns list of string sequences
    """
    fasta = Fasta(fasta_file, as_raw=True, sequence_always_upper=True)
    seqs = [seq[:] for seq in fasta]
    if return_keys:
        keys = list(fasta.keys())
    fasta.close()
    if return_keys:
        return seqs, keys
    return seqs
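For instance (hypothetical file name), the optional return_keys flag pairs each sequence with its record name:

# Sketch with a hypothetical file name: sequences are returned as upper-cased
# plain strings because of as_raw=True and sequence_always_upper=True.
seqs, names = load_fasta_sequences('contigs.fa', return_keys=True)
for name, seq in zip(names, seqs):
    print(name, len(seq))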
Example No. 28
 def __get_kmer_list(self, ref_id, start, end, kmer_size=5):
     """ Extract fasta record corresponding to ref with error handling """
     try:
         with Fasta(self._fasta_fn) as fasta:
             fasta = fasta[ref_id]
             seq = str(fasta[start:end + kmer_size])
             kmer_list = []
             for i in range(end - start):
                 kmer_list.append(seq[i:i + kmer_size])
             return kmer_list
     except KeyError:
         raise NanocomporeError("Reference id not present in fasta file")
Example No. 29
 def __init__(self, speciesNumber, genomeFileList, gffFileList, speciesName,
              speciesShortName):
     self.speciesNumber = speciesNumber
     for file in genomeFileList:
         if self.speciesNumber in file:
             self.genome = Fasta(file)
     for file in gffFileList:
         if self.speciesNumber in file and 'PAC' in file:
             self.gffFile = file
     self.speciesName = speciesName
     self.speciesShortName = speciesShortName
     self.conservedElementsBed = '%s_ConservedElements.bed' % self.speciesName
Example No. 30
def main():
    """ read fasta of 26bp seq of 3' pacbio transcript """
    fasta = Fasta("../data/pacbio/pacbio_new_gene_model.bam.down26.fasta",
                  duplicate_action="longest")
    for name in fasta.keys():
        seq = str(fasta[name])
        m = re.search('^(A+)', seq)
        if m:
            p = str(m.group(1))
            print(p + "\t" + str(len(p)))
        else:
            print("N" + "\t" + str(0))