Пример #1
0
 def scaffold_dict_init(self, file, introns):
     """Build a mapping from scaffold id to the full scaffold sequence.

     Only scaffolds referenced by at least one element of ``introns`` are
     fetched, and each scaffold is fetched once.

     :param file: path handed to ``FastaFile``
     :param introns: iterable of objects exposing a ``scaffold_id`` attribute
     :return: dict ``{scaffold_id: sequence}``
     """
     sequences = {}
     # Close the handle as soon as every needed scaffold has been read; the
     # fetched sequences no longer depend on it.
     with FastaFile(file) as fasta:
         for intron in introns:
             scaffold_id = intron.scaffold_id
             if scaffold_id not in sequences:
                 sequences[scaffold_id] = fasta.fetch(scaffold_id)
     return sequences
def data_iter(genome_path, label_file):
    """
    Extract 1000 base pair long sequences with their corresponding labels.

    Every region produced by ``iter_peaks_and_labels`` is replaced by a
    window of exactly 1000 bp centred on the original region, and the
    sequence of that window is fetched from the genome.

    Parameters
    ----------
    genome_path :  '.genome.fa' file with genome sequence
    label_file  :  '.bed' file with label & location information

    Returns
    -------
    Generator of extracted 1000 bp long sequences
    format: ((region, label), sequence) where region is
    (region[0], expanded_start, expanded_stop)
    """

    min_region_size = 1000

    genome = FastaFile(genome_path)

    for region, label in iter_peaks_and_labels(label_file):
        # Floor division keeps the coordinates integral on Python 3 as well;
        # "/" would hand float positions to fetch() there.
        center = region[1] + (region[2] - region[1]) // 2
        expanded_start = center - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        # NOTE(review): expanded_start is not clamped, so a region close to
        # the start of a contig yields a negative start - confirm how
        # fetch() should treat that.
        region = (region[0], expanded_start, expanded_stop)
        yield ((region, label), genome.fetch(*region))
class FastaHandler:
    """
    Thin wrapper around a ``FastaFile`` handle for region queries.

    NOTE(review): the previous docstring mentioned the pyfaidx API, but the
    calls used here (``fetch(region=..., start=..., end=...)``,
    ``get_reference_length``) look like pysam's ``FastaFile`` - confirm.
    """
    def __init__(self, reference_file_path):
        """
        Create the fasta file object for a reference file.
        :param reference_file_path: full path to a fasta reference file
        :raises IOError: if the file cannot be opened
        """

        self.fasta_file_path = reference_file_path

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception:
            # Only translate ordinary failures; KeyboardInterrupt/SystemExit
            # must not be rewritten into an IOError.
            raise IOError("FASTA FILE READ ERROR")

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the upper-cased sequence of a query region
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        region_sequence = self.fasta.fetch(region=chromosome_name,
                                           start=start,
                                           end=stop)
        return region_sequence.upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get sequence length of a chromosome. This is used for selecting windows of parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)
Пример #4
0
def extract_fasta_to_file(fasta,
                          output_dir,
                          mode='2D_transpose_bcolz',
                          overwrite=False):
    """One-hot encode every reference of a fasta file and write it to disk.

    For each reference a ``(length, NUM_SEQ_CHARS)`` float32 array is filled
    by ``one_hot_encode_sequence`` and handed to the writer registered under
    ``mode`` in ``_array_writer``. A ``metadata.json`` describing the output
    is written last.

    Args:
        fasta: path of the fasta file to convert
        output_dir: directory receiving one entry per reference plus
            ``metadata.json``
        mode: key into ``_array_writer`` selecting the on-disk layout
        overwrite: passed to ``makedirs`` as ``exist_ok``
    """
    assert mode in _array_writer

    makedirs(output_dir, exist_ok=overwrite)
    reader = FastaFile(fasta)
    transposed_shapes = {}
    for chrom, size in zip(reader.references, reader.lengths):
        encoded = np.zeros((size, NUM_SEQ_CHARS), dtype=np.float32)
        one_hot_encode_sequence(reader.fetch(chrom), encoded)
        # The metadata records the transposed shape, not the in-memory one.
        transposed_shapes[chrom] = encoded.shape[::-1]
        destination = os.path.join(output_dir, chrom)
        _array_writer[mode](encoded, destination)

    metadata_path = os.path.join(output_dir, 'metadata.json')
    with open(metadata_path, 'w') as fp:
        metadata = {
            'file_shapes': transposed_shapes,
            'type': 'array_{}'.format(mode),
            'source': fasta
        }
        json.dump(metadata, fp)
Пример #5
0
 def __init__(self,
              ref_fa_path=None,
              vcf_path=None,
              idx_path=None,
              batch_size=32,
              bin_size=100,
              tie='r'):
     '''
     Open the variant and reference files and prepare the variant index.

     :param str ref_fa_path: Path to indexed reference fasta
     :param str vcf_path: Path to indexed vcf
     :param str idx_path: Path to bed-file which will contain the names and locations of compatible variants
     :param int batch_size: Batch size
     :param int bin_size: Length of the DNA-sequences (centered on the start position of the variant)
     :param str tie: 'l' or 'r'; only affects ``offset`` when ``bin_size`` is even
     '''
     self.vcf = VariantFile(vcf_path)
     self.ref = FastaFile(ref_fa_path)
     fai_path = ref_fa_path + '.fai'
     assert os.path.isfile(fai_path), \
         'Error: no index found for Fasta-file: {}'.format(ref_fa_path)
     self.idx_path = idx_path
     self.batch_size = batch_size
     self.bin_size = bin_size
     assert tie in ('l', 'r')
     self.tie = tie
     # offset is 1 only for an even bin size with a left tie, otherwise 0.
     self.offset = 1 if (bin_size % 2 == 0 and tie != 'r') else 0
     self.n_variants = self._initialize_index()
     self._verify_refmatch()
Пример #6
0
def load_sequences_and_labels(regions_fname, genome_fa_fname, balanced):
    """Load one-hot encoded 1000 bp sequences and binary labels.

    Each region from ``iter_peaks_and_labels(regions_fname)`` is replaced by
    a 1000 bp window centred on it, and the window's sequence is fetched from
    the genome. Labels are encoded as ``promoter -> 1`` and ``enhancer -> 0``.

    When ``balanced`` is true, promoters are randomly down-sampled to the
    number of enhancers and the combined set is shuffled. This assumes there
    are at least as many promoters as enhancers.

    :param regions_fname: file with regions and labels
    :param genome_fa_fname: path handed to ``FastaFile``
    :param balanced: whether to balance the two classes
    :return: tuple ``(encoded_sequences[:, None, :, :], labels)``
    :raises AssertionError: on a label other than 'promoter' or 'enhancer'
    """
    min_region_size = 1000
    label_codes = {'promoter': 1, 'enhancer': 0}

    seqs, labels = [], []
    genome = FastaFile(genome_fa_fname)
    # Read the regions from the argument; the function used to ignore
    # regions_fname and read sys.argv[1] instead.
    for region, label in iter_peaks_and_labels(regions_fname):
        # Create a new region exactly min_region_size base pairs long centred
        # on region. Floor division keeps coordinates integral on Python 3.
        center = region[1] + (region[2] - region[1]) // 2
        expanded_start = center - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        seqs.append(genome.fetch(*region))

        if label not in label_codes:
            # Raised explicitly so the check survives "python -O"; otherwise
            # seqs and labels would silently go out of step.
            raise AssertionError("unexpected label: {0!r}".format(label))
        labels.append(label_codes[label])

    if not balanced:
        return one_hot_encode_sequences(seqs)[:, None, :, :], np.array(labels)

    sequences = pd.DataFrame(seqs)
    sequences['Labels'] = pd.Series(labels)
    p_seqs = sequences[sequences['Labels'].isin([1])].reset_index(drop=True)
    e_seqs = sequences[sequences['Labels'].isin([0])].reset_index(drop=True)

    p_seqs_sample = p_seqs.sample(len(e_seqs))

    # pd.concat / .values replace DataFrame.append / as_matrix, which newer
    # pandas releases no longer provide.
    balanced_seqs = pd.concat([p_seqs_sample, e_seqs]).reset_index(drop=True)

    shuffled = balanced_seqs.reindex(
        np.random.permutation(balanced_seqs.index)).reset_index(drop=True)

    encoded = one_hot_encode_sequences(shuffled.iloc[:, 0].values)
    return encoded[:, None, :, :], np.array(shuffled['Labels'].values)
Пример #7
0
def shotgun_library(fasta_file, mu, sigma, direction=(1, -1)):
    """Generate random fragment sequences from the references of a fasta file

    References are chosen with probability proportional to their length and
    fragment lengths are drawn from a lognormal distribution parameterised
    by ``mu`` and ``sigma``.

    :param fasta_file: path of the fasta file to sample from.
    :param mu: mean fragment length.
    :param sigma: stdv of fragment length.
    :param direction: tuple represention direction of output sequences with
        respect to the input sequence.

    :yields: tuples ``(fragment, reference_name, start, end, strand)`` where
        ``strand`` is '+' for direction 1 and '-' otherwise. The generator
        never terminates on its own.

    .. note:: Could be made more efficient using buffers for random samples
        and handling cases separately.
    """
    handle = FastaFile(fasta_file)
    try:
        refs = handle.references
        seq_lens = [handle.get_reference_length(x) for x in refs]
        # FastaFile.fetch is proper slow, just read everything
        fasta = {k: handle.fetch(k) for k in refs}
    finally:
        # Everything is in memory now; do not keep the file open for the
        # (unbounded) lifetime of the generator.
        handle.close()
    total_len = sum(seq_lens)
    seq_probs = [x / total_len for x in seq_lens]

    def random_buffer(probs, size=10000):
        # Endlessly yield reference indices, drawn in shuffled batches.
        while True:
            buf = []
            for x, n in zip(range(len(probs)),
                            np.random.multinomial(size, probs)):
                buf.extend([x] * n)
            np.random.shuffle(buf)
            for x in buf:
                yield x

    seq_chooser = random_buffer(seq_probs)

    # parameters for lognormal
    mean = np.log(mu / np.sqrt(1 + sigma**2 / mu**2))
    stdv = np.sqrt(np.log(1 + sigma**2 / mu**2))

    while True:
        # choose a seq based on length
        seq_i = next(seq_chooser)
        seq = fasta[refs[seq_i]]
        seq_len = seq_lens[seq_i]

        start = np.random.randint(0, seq_len)
        frag_length = int(np.random.lognormal(mean, stdv))
        move = np.random.choice(direction)
        end = max(0, start + move * frag_length)
        start, end = sorted([start, end])

        if end - start < 2:
            # Expand a bit to ensure we grab at least one base.
            start = max(0, start - 1)
            end += 1

        frag_seq = seq[start:end]
        # Slicing silently truncates at the end of the reference; report the
        # coordinate that matches the fragment actually returned.
        end = min(end, seq_len)
        if move == -1:
            frag_seq = reverse_complement(frag_seq)
        yield frag_seq, refs[seq_i], start, end, '+' if move == 1 else '-'
Пример #8
0
class FastaHandler:
    """
    Thin wrapper around a ``FastaFile`` handle for region queries.

    NOTE(review): the previous docstring mentioned the pyfaidx API, but the
    calls used here (``fetch(region=...)``, ``get_reference_length``,
    ``references``) look like pysam's ``FastaFile`` - confirm.
    """
    def __init__(self, reference_file_path):
        """
        Create the fasta file object for a reference file.
        :param reference_file_path: full path to a fasta reference file
        :raises IOError: if the file cannot be opened
        """

        self.fasta_file_path = reference_file_path

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception:
            # Only translate ordinary failures; KeyboardInterrupt/SystemExit
            # must not be rewritten into an IOError.
            raise IOError("FASTA FILE READ ERROR")

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the upper-cased sequence of a query region
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        return self.fasta.fetch(region=chromosome_name, start=start,
                                end=stop).upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get sequence length of a chromosome. This is used for selecting windows of parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)

    def get_contig_names(self):
        """
        Return the reference names reported by the fasta handle.
        """
        return self.fasta.references

    def get_ref_of_region(self, contig, site):
        """
        Return the upper-cased reference sequence of a site.

        Fetch failures are reported on stdout and signalled through the
        second element of the result instead of raising.
        :param contig: Contig [ex chr3]
        :param site: Site, appended verbatim to ``contig`` to form the
            region string passed to ``fetch``
        :return: tuple ``(sequence, error_flag)``; ``("", 1)`` on failure,
            ``(sequence, 0)`` otherwise
        """
        ret_val = ""
        error_val = 0
        try:
            ret_val = self.fasta.fetch(region=contig + site).upper()
        except Exception:
            # Best-effort lookup: keep reporting ordinary errors, but let
            # KeyboardInterrupt/SystemExit propagate.
            print("ERROR IN REF FETCH: ", contig, site)
            error_val = 1
        return ret_val, error_val

    def close(self):
        """
        Close the underlying fasta handle.
        """
        self.fasta.close()
Пример #9
0
def fasta_extract_exons(fasta_file, database_file, output, raw=False):
    """Write the sequence of every exon found in ``database_file``.

    Transcripts whose ``seqid`` is not a reference of the fasta file are
    skipped. Exons of transcripts on strand ``-1`` are written reverse
    complemented.

    :param fasta_file: an open ``FastaFile`` or a path to a fasta file
    :param database_file: file handed to ``get_transcripts_simple``
    :param output: output file name; a false value writes to stdout
    :param raw: when true write the bare sequences (no header line and no
        line breaks); otherwise write fasta records with wrapped sequence
    :raises G2GValueError: re-raised after being logged
    :raises G2GFastaError: re-raised after being logged
    """
    start = time.time()

    # Remember whether the fasta handle was opened here so that only our own
    # handle is closed afterwards; a caller-supplied handle stays open.
    owns_fasta = not isinstance(fasta_file, FastaFile)
    if owns_fasta:
        fasta_file = g2g_fu.check_file(fasta_file)
        fasta = FastaFile(fasta_file)
    else:
        fasta = fasta_file

    fasta_out = sys.stdout

    try:
        database_file = g2g_fu.check_file(database_file)

        if output:
            output = g2g_fu.check_file(output, 'w')
            fasta_out = open(output, "w")

        LOG.info("FASTA FILE: {0}".format(fasta.filename))
        LOG.info("DATABASE FILE: {0}".format(database_file))
        LOG.info("OUTPUT FILE: {0}".format(fasta_out.name))

        # Build the lookup once instead of scanning the reference list for
        # every transcript.
        known_references = set(fasta.references)

        for transcript in get_transcripts_simple(database_file):

            if transcript.seqid not in known_references:
                continue

            # .values() works on Python 2 and 3 (iteritems is Python 2 only);
            # the keys were not used.
            for exon in transcript.exons.values():
                LOG.debug("Exon={0}".format(exon))

                # exon.start - 1 is passed as the fetch start.
                partial_seq = fasta.fetch(exon.seqid, exon.start-1, exon.end)
                partial_seq_str = partial_seq

                if transcript.strand == -1:
                    partial_seq_str = str(reverse_complement_sequence(partial_seq))

                LOG.debug("{0}:{1}-{2} (Length: {3})\n{4}".format(exon.seqid, exon.start, exon.end, len(partial_seq), partial_seq_str))

                if raw:
                    fasta_out.write(partial_seq_str)
                else:
                    fasta_id = ">{0} {1}:{2}-{3}\n".format(exon.ensembl_id, exon.seqid, exon.start, exon.end)
                    fasta_out.write(fasta_id)

                    for line in wrap_sequence(partial_seq_str):
                        fasta_out.write(line.strip())
                        fasta_out.write('\n')

    except G2GValueError as e:
        LOG.info(e.msg.rstrip())
        raise
    except G2GFastaError as e:
        LOG.info(e.msg.rstrip())
        raise
    finally:
        # Never close stdout; only the file opened above.
        if fasta_out is not sys.stdout:
            fasta_out.close()
        if owns_fasta:
            fasta.close()

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
Пример #10
0
def count_callable(callable_file, chrom, start=None, end=None):
    """Count the '0' characters in a region of a fasta-formatted mask.

    :param callable_file: path handed to ``FastaFile``
    :param chrom: reference name to fetch
    :param start: optional region start passed to ``fetch``
    :param end: optional region end passed to ``fetch``
    :return: tuple ``(length of the fetched sequence, number of '0' in it)``
    """
    # The handle is only needed for the single fetch; close it right after.
    with FastaFile(callable_file) as fasta:
        seq = fasta.fetch(chrom, start, end)

    chrom_length = len(seq)
    callable_sites = seq.count('0')

    return chrom_length, callable_sites
Пример #11
0
    def _extract(self, intervals, out, **kwargs):
        """Fill ``out`` with the one-hot encoding of each interval.

        A new ``FastaFile`` is opened from ``self._datafile`` on every call.
        Row ``i`` of ``out`` (``out[i, :, :, 0]``) receives the encoding of
        the ``i``-th interval. ``out`` is modified in place and returned.
        """
        reader = FastaFile(self._datafile)

        for row, region in enumerate(intervals):
            chrom = str(region.chrom)
            bases = reader.fetch(chrom, region.start, region.stop)
            out[row, :, :, 0] = one_hot_encode_sequence(bases)

        return out
Пример #12
0
class IndexedFasta(DataSource):
    """Data source exposing each reference of a fasta file as one partition."""

    # NOTE(review): "indexed_bedfile" looks copied from a BED source while
    # the description talks about fasta. Left unchanged because the name is
    # a runtime identifier - confirm before renaming.
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "python"
    partition_access = True
    description = "A bgzipped and indexed fasta file"

    def __init__(self, urlpath, metadata=None):
        """Store the location; the file is only opened on first use."""
        self._urlpath = urlpath
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super().__init__(metadata=metadata)

    def _open_dataset(self):
        """Open the fasta file at ``self._urlpath``."""
        self._dataset = FastaFile(self._urlpath)

    def _get_schema(self):
        """Open the file if needed and describe it as one partition per reference."""
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.references)
        chrom_lengths = [{
            "chrom": chrom,
            "length": length
        } for chrom, length in zip(self._dataset.references,
                                   self._dataset.lengths)]
        return Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=len(self._chroms),
            extra_metadata={"chroms": chrom_lengths},
        )

    def _get_partition(self, i):
        """Return a one-element list with the id and full sequence of reference ``i``."""
        if self._dataset is None:
            # The handle is dropped by _close(); reopen instead of failing.
            self._open_dataset()
        chrom = self._chroms[i]
        return [{"seqid": chrom, "seq": self._dataset.fetch(chrom)}]

    def read_chunked(self):
        """Yield the partitions one after another."""
        self._load_metadata()
        for i in range(self.npartitions):
            yield self._get_partition(i)

    def to_dask(self):
        """Return a dask bag with one partition per reference."""
        # Imported here so the name ``dask`` is bound regardless of what the
        # module imports at the top; ``from dask import bag`` alone does not
        # bind it.
        import dask
        from dask import bag as db

        self._load_metadata()
        # NOTE: _get_partition(i) is evaluated right here, so every sequence
        # is read eagerly; delayed() only wraps the already-loaded result.
        return db.from_delayed([
            dask.delayed(self._get_partition(i))
            for i in range(self.npartitions)
        ])

    def _close(self):
        """Close the fasta handle, if one is open."""
        # close any files, sockets, etc
        if self._dataset is not None:
            self._dataset.close()
            # Forget the closed handle so the next access reopens the file
            # instead of reusing it.
            self._dataset = None
Пример #13
0
def shotgun_library(fasta_file, mu, sigma, direction=(1,-1)):
    """Generate random fragment sequences from the references of a fasta file

    References are chosen with probability proportional to their length and
    fragment lengths are drawn from a normal distribution.

    :param fasta_file: path of the fasta file to sample from.
    :param mu: mean fragment length.
    :param sigma: stdv of fragment length.
    :param direction: tuple represention direction of output sequences with
        respect to the input sequence.

    :yields: sequence fragments. The generator never terminates on its own.

    .. note:: Could be made more efficient using buffers for random samples
        and handling cases separately.
    """
    handle = FastaFile(fasta_file)
    try:
        refs = handle.references
        seq_lens = [handle.get_reference_length(x) for x in refs]
        # FastaFile.fetch is proper slow, just read everything
        fasta = {k: handle.fetch(k) for k in refs}
    finally:
        # Everything is in memory now; do not keep the file open for the
        # (unbounded) lifetime of the generator.
        handle.close()
    total_len = sum(seq_lens)
    seq_probs = [x / total_len for x in seq_lens]

    def random_buffer(probs, size=10000):
        # Endlessly yield reference indices, drawn in shuffled batches.
        while True:
            buf = []
            for x, n in zip(range(len(probs)), np.random.multinomial(size, probs)):
                buf.extend([x]*n)
            np.random.shuffle(buf)
            for x in buf:
                yield x
    seq_chooser = random_buffer(seq_probs)

    while True:
        # choose a seq based on length
        seq_i = next(seq_chooser)
        seq = fasta[refs[seq_i]]
        seq_len = seq_lens[seq_i]

        start = np.random.randint(0, seq_len)
        # NOTE(review): a normal draw can be negative, which flips the side
        # of ``start`` the fragment is taken from - confirm this is intended.
        frag_length = int(np.random.normal(mu, sigma))
        move = np.random.choice(direction)
        end = max(0, start + move*frag_length)
        start, end = sorted([start, end])

        if end - start < 2:
            # Expand a bit to ensure we grab at least one base.
            start = max(0, start - 1)
            end += 1

        frag_seq = seq[start:end]
        if move == -1:
            frag_seq = reverse_complement(frag_seq)
        yield frag_seq
Пример #14
0
def _chrom_sizes(fasta_file):
    """Get the chromosome sizes for a fasta file

    Returns an ``OrderedDict`` mapping reference name to length, in the
    order reported by the fasta file.

    Raises:
        ValueError: if the file reports no references at all
    """
    from pysam import FastaFile
    # The context manager closes the handle on the error path as well.
    with FastaFile(fasta_file) as fa:
        chrom_lens = OrderedDict(zip(fa.references, fa.lengths))
    if not chrom_lens:
        # Every fragment that interpolates needs its own f prefix; the last
        # one used to print the literal text "{fasta_file}".
        raise ValueError(f"no chromosomes found in fasta file: {fasta_file}. "
                         "Make sure the file path is correct and that the fasta index "
                         f"file {fasta_file}.fai is up to date")
    return chrom_lens
Пример #15
0
    def __init__(self, reference_file_path):
        """
        Create the fasta file object for a reference file.
        :param reference_file_path: full path to a fasta reference file
        :raises IOError: if the file cannot be opened
        """

        self.fasta_file_path = reference_file_path

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception:
            # Only translate ordinary failures; KeyboardInterrupt/SystemExit
            # must not be rewritten into an IOError.
            raise IOError("FASTA FILE READ ERROR")
def main():
    """Print a 1000 bp window and its label for every region in sys.argv[1].

    For each region two lines are printed: the expanded region followed by
    its label, then the sequence fetched for the expanded region.
    """
    min_region_size = 1000
    genome = FastaFile("GRCh38.genome.fa")
    for region, label in iter_peaks_and_labels(sys.argv[1]):
        # create a new region exactly min_region_size basepairs long centered
        # on region; floor division keeps the coordinates integral on
        # Python 3 as well
        center = region[1] + (region[2] - region[1]) // 2
        expanded_start = center - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        # Single-argument print() calls produce the same output on Python 2
        # and Python 3; the old print statements were Python 2 only.
        print("{0} {1}".format(region, label))
        print(genome.fetch(*region))
    return
def _homopolymer_run_length(sequence, start):
    """Return the length of the run of identical characters beginning at ``start``."""
    base = sequence[start]
    end = start
    while end < len(sequence) and sequence[end] == base:
        end += 1
    return end - start


def generate_homopolymer_plots(bed_file, fasta_file, bam_file):
    """Print reference and per-read homopolymer lengths for BED records.

    Despite the name nothing is plotted: for every accepted BED record one
    tab-separated line is printed with contig, start, end, the reference
    homopolymer length and a comma-separated list of the lengths observed
    in the reads.

    Records starting before position 1000 or spanning more than 50 bases
    are skipped.

    The BAM and FASTA files are opened once for the whole run and closed on
    exit, rather than being reopened for every record and never closed.
    They are therefore opened even when no record is accepted.

    :param bed_file: tab-separated file with exactly three columns
        (contig, start, end) per line
    :param fasta_file: path handed to ``FastaFile``
    :param bam_file: path handed to ``pysam.AlignmentFile``
    """
    with open(bed_file, 'r') as bed_file_records, \
            pysam.AlignmentFile(bam_file, "rb") as samfile, \
            FastaFile(fasta_file) as assembly_fasta_file:
        for line in bed_file_records:
            contig, start_pos, end_pos = line.rstrip().split('\t')
            start_pos = int(start_pos)
            end_pos = int(end_pos)
            if start_pos < 1000:
                continue
            if end_pos - start_pos > 50:
                continue

            reference_sequence = assembly_fasta_file.fetch(
                reference=contig, start=start_pos, end=start_pos + 200)

            # The reference run is measured from index 1 of the fetched
            # window, i.e. one base after start_pos.
            reference_homopolymer_length = _homopolymer_run_length(
                reference_sequence, 1)

            all_reads = samfile.fetch(contig, start_pos - 1, end_pos)

            read_homopolymers = []
            for read in all_reads:
                aligned_pairs = read.get_aligned_pairs()

                # Find the read index aligned to start_pos and step one base
                # past it; stays 0 when start_pos is not aligned in the read.
                start_index = 0
                for index, position in aligned_pairs:
                    if index is None:
                        continue
                    if position == start_pos:
                        start_index = index + 1
                        break
                if read.query_sequence is None:
                    continue
                if start_index == len(read.query_sequence):
                    continue
                read_homopolymers.append(
                    _homopolymer_run_length(read.query_sequence, start_index))

            print(contig + "\t" + str(start_pos) + "\t" + str(end_pos) + "\t" + str(reference_homopolymer_length) + "\t" + ','.join([str(x) for x in read_homopolymers]))
Пример #18
0
 def __init__(self, datafile, use_strand=False, **kwargs):
     """Fasta file extractor

     NOTE: The extractor is not thread-safe.
     If you wish to use it with multiprocessing,
     create a new extractor object in each process.

     Args:
       datafile (str): path to the fasta file
       use_strand (bool): if True, the extracted sequence
         is reverse complemented in case interval.strand == "-"
       **kwargs: forwarded unchanged to the parent constructor
     """
     super(FastaExtractor, self).__init__(datafile, **kwargs)
     self.use_strand = use_strand
     # self._datafile is not assigned in this method; presumably the parent
     # constructor sets it from ``datafile`` - confirm.
     self.fasta = FastaFile(self._datafile)
Пример #19
0
    def close(self):
        """Finalize the written file: compress, index and make read-only.

        Does nothing when no file handle is open. Otherwise the handle is
        closed, the base file is compressed with the configured bgzip
        executable and renamed to ``self.filename``, the indexes are created
        and all three files are made read-only.
        """
        if not self._fh:
            return

        self._fh.close()
        self._fh = None
        subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
        os.rename(self._basepath + ".gz", self.filename)

        # Opening the file with FastaFile creates the indexes; the handle
        # itself is not needed afterwards.
        FastaFile(self.filename).close()

        # Make the data file and both index files read-only.
        for suffix in ("", ".fai", ".gzi"):
            os.chmod(self.filename + suffix,
                     stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

        logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
Пример #20
0
    def close(self):
        """Finalize the written file: compress, index and make read-only.

        Does nothing when no file handle is open. Otherwise the handle is
        closed, the base file is compressed with the configured bgzip
        executable and renamed to ``self.filename``, the indexes are created
        and all three files are made read-only.
        """
        if not self._fh:
            return

        self._fh.close()
        self._fh = None
        subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
        os.rename(self._basepath + ".gz", self.filename)

        # Opening the file with FastaFile creates the indexes; the handle
        # itself is not needed afterwards.
        FastaFile(self.filename).close()

        # Make the data file and both index files read-only.
        for suffix in ("", ".fai", ".gzi"):
            os.chmod(self.filename + suffix,
                     stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

        logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
Пример #21
0
def main():
    """Print a 1000 bp sequence and its label for every training region.

    Every file in ``./train_data/`` is read with ``iter_peaks_and_labels``;
    for each region one line "<sequence> <label>" is printed.
    """
    min_region_size = 1000
    genome = FastaFile("./genome/GRCh38.genome.fa")
    train_path = "./train_data/"
    list_dir = os.listdir(train_path)
    for filename in list_dir:
        for region, label in iter_peaks_and_labels(train_path + filename):
            # create a new region exactly min_region_size basepairs long
            # centered on region; floor division keeps the coordinates
            # integral on Python 3 as well
            center = region[1] + (region[2] - region[1]) // 2
            expanded_start = center - min_region_size // 2
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)
            # A single-argument print() call produces the same output on
            # Python 2 and Python 3; the print statement was Python 2 only.
            print("{0} {1}".format(genome.fetch(*region), label))
    return
Пример #22
0
def data_generator_pysam(my_args, name, start, stop, is_bulk):
    """Yield one pileup record per covered position in ``[start, stop)``.

    Each record is the list
    ``[name, pos, reference_base, depth_as_str, read_bases, base_q, map_q]``.
    For a column without aligned reads the last three fields are ``'*'``.
    After the last column a single ``None`` is yielded as an end marker.

    :param my_args: object providing ``fasta`` and ``bulk`` / ``bam`` paths
    :param name: contig name
    :param start: first position to report
    :param stop: position at which reporting stops (exclusive)
    :param is_bulk: read from ``my_args.bulk`` with no mapping-quality
        filter when true, otherwise from ``my_args.bam`` with a minimum
        mapping quality of 20
    """
    fasta_file = FastaFile(my_args.fasta)
    ref = fasta_file.fetch(name, start, stop)

    pileup_options = dict(
        fastafile=fasta_file,
        stepper='samtools',
        adjust_capq_threshold=50,
        contig=name,
        start=start,
        stop=stop,
        min_mapping_quality=0 if is_bulk else 20,
        min_base_quality=13,
    )

    bam_path = my_args.bulk if is_bulk else my_args.bam
    bam_file = AlignmentFile(bam_path, 'rb')

    for column in bam_file.pileup(**pileup_options):
        pos = column.reference_pos

        # Columns outside the requested window are skipped; once past the
        # end there is nothing left to report.
        if pos >= stop:
            break
        if pos < start:
            continue

        read_bases = ''.join(
            column.get_query_sequences(mark_matches=True,
                                       mark_ends=True,
                                       add_indels=True)).upper()
        depth = column.get_num_aligned()
        if depth == 0:
            read_bases = base_q = map_q = '*'
        else:
            base_q = ''.join(
                chr(int(q) + PHREDSCORE)
                for q in column.get_query_qualities())
            map_q = ''.join(
                chr(int(q) + PHREDSCORE)
                for q in column.get_mapping_qualities())

        # ref was fetched starting at ``start``, hence the shifted index.
        yield [name, pos, ref[pos - start], str(depth), read_bases, base_q, map_q]

    yield None
Пример #23
0
class FastaExtractor(BaseExtractor):
    def __init__(self, datafile, use_strand=False, **kwargs):
        """Fasta file extractor

        NOTE: The extractor is not thread-safe.
        If you wish to use it with multiprocessing,
        create a new extractor object in each process.

        Args:
          datafile (str): path to the fasta file
          use_strand (bool): if True, the extracted sequence
            is reverse complemented in case interval.strand == "-"
          **kwargs: forwarded unchanged to the parent constructor
        """
        super(FastaExtractor, self).__init__(datafile, **kwargs)
        self.use_strand = use_strand
        self.fasta = FastaFile(self._datafile)

    def _extract(self, intervals, out, **kwargs):
        """One-hot encode every interval into its row of ``out``.

        ``out`` is modified in place and returned.
        """
        for row, region in enumerate(intervals):
            bases = self.fasta.fetch(str(region.chrom), region.start,
                                     region.stop)
            one_hot_encode_sequence(bases, out[row, :, :])

            # Flipping both axes of the encoded row reverse-complements it
            # for intervals on the negative strand.
            on_minus_strand = self.use_strand and region.strand == "-"
            if on_minus_strand:
                out[row, :, :] = out[row, ::-1, ::-1]

        return out

    @staticmethod
    def _get_output_shape(num_intervals, width):
        """Return the output shape ``(num_intervals, width, NUM_SEQ_CHARS)``."""
        shape = (num_intervals, width, NUM_SEQ_CHARS)
        return shape
Пример #24
0
def main():
    """Print a 1000 bp sequence and its label for every training region.

    Every file in ``./train_data/`` is read with ``iter_peaks_and_labels``;
    for each region one line "<sequence> <label>" is printed.
    """
    min_region_size = 1000
    genome = FastaFile("./genome/GRCh38.genome.fa")
    train_path = "./train_data/"
    list_dir = os.listdir(train_path)
    for filename in list_dir:
        for region, label in iter_peaks_and_labels(train_path + filename):
            # create a new region exactly min_region_size basepairs long
            # centered on region; floor division keeps the coordinates
            # integral on Python 3 as well
            center = region[1] + (region[2] - region[1]) // 2
            expanded_start = center - min_region_size // 2
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)
            # A single-argument print() call produces the same output on
            # Python 2 and Python 3; the print statement was Python 2 only.
            print("{0} {1}".format(genome.fetch(*region), label))
    return
Пример #25
0
def split_variants_to_files(vcf_file, genome_file, bi_file, multi_file):
    """Split the wanted variants of a VCF into two files.

    The VCF header (extended with the ``multi`` and ``duplicated`` INFO
    flags, a ``GT`` format and a ``Genotype`` sample) is written to both
    files. Variants flagged as duplicated are dropped; variants flagged as
    multi-allelic go to ``multi_file`` and all others to ``bi_file``. Every
    written record gets the extra columns ``GT`` and ``0/1``.

    :param vcf_file: input VCF, opened with ``VariantFile``
    :param genome_file: reference fasta, opened with ``FastaFile``
    :param bi_file: output path for the remaining (non multi-allelic) variants
    :param multi_file: output path for multi-allelic variants
    """
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple allele">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")

    with open(bi_file, 'wt') as outbi, open(multi_file, 'wt') as outmu:
        for sink in (outbi, outmu):
            sink.write(str(vcf.header))

        wanted = iter_wanted_variants(vcf, genome)
        for multi_alleles, duplicated, record in wanted:
            line = '\t'.join(record_to_string(record) + ['GT', '0/1'])
            if duplicated:
                continue
            sink = outmu if multi_alleles else outbi
            sink.write(line + '\n')
Пример #26
0
def initWorker(localWindowSize, fastaFile, k, N, M):
    """Initialise the module-level state used by a worker.

    Stores the window size, an open ``FastaFile`` for ``fastaFile`` and the
    remaining settings in the globals ``windowSize``, ``FA``, ``kSize``,
    ``useN`` and ``method``.
    """
    global FA, windowSize, kSize, useN, method
    windowSize = localWindowSize
    FA = FastaFile(fastaFile)
    kSize, useN, method = k, N, M
Пример #27
0
def method2(basefl):
    """Write a window around every feature position to ``<basefl>/20bp.fa``.

    Features are read from ``FLAGS.input + ".feature.tsv"``; the first
    whitespace-separated field of each line is split on ``|`` into an id, a
    position and one trailing part that is ignored. The window
    ``[pos - 30, pos + 30)`` is fetched from
    ``FLAGS.input + ".feature.fa"``. Records that cannot be fetched or
    written are reported on stdout and skipped. ``align_hisat2`` is called
    once the output file is closed.

    NOTE(review): the output is named "20bp.fa" although the fetched window
    spans 60 positions - confirm which one is intended.

    :param basefl: directory receiving ``20bp.fa``
    """
    fa = FLAGS.input + ".feature.fa"
    fl1 = FLAGS.input + ".feature.tsv"
    # The context managers close all three files even when a line cannot be
    # parsed; previously they stayed open in that case.
    with FastaFile(fa) as loader, \
            open("%s/20bp.fa" % (basefl), "w") as output, \
            open(fl1, "r") as features:
        for i in features:
            ele = i.rstrip().split()
            ids, pos = ele[0].split("|")[:-1]
            pos = int(pos)
            try:
                seq = loader.fetch(ids, pos - 30, pos + 30)
                output.write(">%s|%s\n%s\n" % (ids, pos, seq))
            except Exception:
                # Best-effort: report and continue, but do not swallow
                # KeyboardInterrupt/SystemExit.
                print("ids %s %s,error" % (ids, pos))
    align_hisat2()
    align_hisat2()
Пример #28
0
def _chrom_names(fasta_file):
    """Return the reference (chromosome) names of a fasta file as a list.

    The file is closed again before returning.
    """
    from pysam import FastaFile
    with FastaFile(fasta_file) as handle:
        return list(handle.references)
Пример #29
0
def split_variants(vcf_file, genome_file):
    """Print the wanted variants of a VCF, tagging special cases.

    The VCF header (extended with the ``multi`` and ``duplicated`` INFO
    flags, a ``GT`` format and a ``Genotype`` sample) is printed first.
    Every record is printed tab-separated with the extra columns ``GT`` and
    ``0/1``; field 6 of a record is extended with ``multi`` and/or
    ``duplicated`` when the corresponding flag is set.

    :param vcf_file: input VCF, opened with ``VariantFile``
    :param genome_file: reference fasta, opened with ``FastaFile``
    """
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple allele">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")

    print(vcf.header, end='')

    for multi_alleles, duplicated, record in iter_wanted_variants(vcf, genome):
        fields = record_to_string(record) + ['GT', '0/1']

        # NOTE(review): the ";" separator is added when field 6 is EMPTY and
        # omitted when it already has content, which looks inverted. Kept
        # as-is because the format produced by record_to_string is not
        # visible here - confirm.
        tags = (
            (multi_alleles, "multi", ";multi"),
            (duplicated, "duplicated", ";duplicated"),
        )
        for is_set, bare, prefixed in tags:
            if is_set:
                fields[6] += bare if fields[6] else prefixed

        print('\t'.join(fields))
Пример #30
0
def main():
    """Build the peak-overlap and motif-score matrices for a set of regions.

    ``sys.argv[1]`` is the genome fasta and ``sys.argv[2]`` a BED file with
    regions. Two files named after the basename of the regions file are
    written to the current directory:

    * ``<regions>.peaks.txt``: one row per region with 1/0 per motif,
      depending on whether the motif's factor is among the region's
      overlapping peaks.
    * ``<regions>.TFscores.txt``: one row per region with the scores
      returned by ``score_region``; regions that cannot be scored are
      skipped.

    Both files start with two header rows (motif names, motif factors).
    """
    genome_fname = sys.argv[1]
    regions_fname = sys.argv[2]

    # Single-argument print() calls produce the same output on Python 2 and
    # Python 3; the old print statements were Python 2 only.
    genome = FastaFile(genome_fname)
    print("Loaded genome")
    motifs = load_all_motifs()
    tfname_id_map = load_tfname_tfid_mapping()
    print("Loaded Motifs")
    with open(regions_fname) as fp:
        regions = load_regions_in_bed(fp)
    print("Loaded regions")

    with open(os.path.basename(regions_fname)+".peaks.txt", "w") as ofp:
        ofp.write("\t".join(["region".ljust(30),] + [
            motif.name for motif in motifs]) +"\n")
        ofp.write("\t".join(["region".ljust(30),] + [
            motif.factor for motif in motifs]) +"\n")
        for i, region in enumerate(regions):
            print("{0} {1}".format(i, region))
            overlapping_peaks = load_overlapping_peaks(*region)
            motif_overlap_scores = [
                1 if motif.factor in overlapping_peaks else 0
                for motif in motifs
            ]
            ofp.write("%s\t%s\n" % (
                      "_".join(map(str, region)).ljust(30),
                      "\t".join("%i" % motif_overlap
                                for motif_overlap in motif_overlap_scores)))
    print("Finished building peak overlap matrix")

    with open(os.path.basename(regions_fname) + ".TFscores.txt", "w") as ofp:
        ofp.write("\t".join(["region".ljust(30),] + [
            motif.name for motif in motifs]) +"\n")
        ofp.write("\t".join(["region".ljust(30),] + [
            motif.factor for motif in motifs]) +"\n")
        for i, region in enumerate(regions):
            if i%100 == 0:
                print("{0} {1} {2}".format(
                    i, len(regions), os.path.basename(regions_fname)))
            seq = genome.fetch(*region).upper()
            try:
                scores = score_region(motifs, seq)
            except Exception:
                # Best-effort: skip regions that cannot be scored, but do
                # not swallow KeyboardInterrupt/SystemExit.
                continue
            ofp.write("%s\t%s\n" % (
                      "_".join(map(str, region)).ljust(30),
                      "\t".join("%.4f" % score for score in scores)))
    print("Finished building score matrix")
Пример #31
0
def extract_fasta_to_file(fasta, output_dir, overwrite):
    """
    Create, or reuse, a compressed one-hot memory map of a fasta file.

    When ``overwrite`` is true, every reference sequence of ``fasta`` is
    encoded with ``one_hot_encode_sequence`` and written as a bcolz carray
    under ``output_dir/<reference name>``; a ``metadata.json`` describing
    the arrays is written next to them. When ``overwrite`` is false the
    existing ``metadata.json`` is loaded instead; if it cannot be opened,
    the memory map is rebuilt.

    Args:
        fasta: fasta file to be converted
        output_dir: output directory for memory map location
        overwrite: boolean - whether to overwrite current memory map

    Returns:
        The metadata dictionary of the memory map (keys ``file_shapes``,
        ``type``, ``extractor`` and ``source``). When ``overwrite`` is passed
        as true, the second pass re-reads the freshly written metadata from
        disk, so the returned value is the JSON round-tripped dictionary.
    """
    metadata_path = os.path.join(output_dir, 'metadata.json')

    # At most two passes: build-then-read, or read-then-(re)build on failure.
    for _ in range(2):
        if overwrite:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            file_shapes = {}
            fasta_file = FastaFile(fasta)
            try:
                for chrom in fasta_file.references:
                    data = one_hot_encode_sequence(fasta_file.fetch(chrom))
                    file_shapes[chrom] = data.shape
                    bcolz.carray(data,
                                 rootdir=os.path.join(output_dir, chrom),
                                 cparams=_blosc_params,
                                 mode='w').flush()
            finally:
                # Release the fasta handle even if encoding or writing fails.
                fasta_file.close()
            mode = '2D_transpose_bcolz'
            metadata = {
                'file_shapes': file_shapes,
                'type': 'array_{}'.format(mode),
                'extractor': 'CompressedFastaExtractor',
                'source': fasta
            }
            with open(metadata_path, 'w') as fp:
                json.dump(metadata, fp)
            overwrite = False
        else:
            try:
                with open(metadata_path, 'r') as fp:
                    metadata = json.load(fp)
                break
            except IOError as e:
                print("I/O error({0}): {1} for {2}".format(
                    e.errno, e.strerror, output_dir))
                print(
                    "There is a problem with opening the metadata. Recreating the mmap files and overwriting..."
                )
                overwrite = True
    return metadata
Пример #32
0
def get_contig_list_from_fasta(fasta_path, with_length=False):
    """Obtain list of contigs from a fasta file,
        all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO

    Args:
        fasta_path (str or pysam.FastaFile) : Path or handle to fasta file

        with_length(bool): return list of lengths

    Returns:
        contig_list (list ) : List of contigs + ['MISC_ALT_CONTIGS_SCMO'] if any alt contig is present in the fasta file

        When with_length is true a tuple (contig_list, lens) is returned
        instead; lens holds one length per entry of contig_list, with None
        for the pooled 'MISC_ALT_CONTIGS_SCMO' entry.

    Raises:
        TypeError: if fasta_path is neither a str nor a FastaFile.
        """

    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str / FastaFile.
    opened_here = isinstance(fasta_path, str)
    if opened_here:
        fa = FastaFile(fasta_path)
    elif isinstance(fasta_path, FastaFile):
        fa = fasta_path
    else:
        raise TypeError('Supply pysam.FastaFile or str')

    contig_list = []
    lens = []
    has_alt = False

    try:
        for reference, length in zip(fa.references, fa.lengths):
            if is_main_chromosome(reference):
                contig_list.append(reference)
                lens.append(length)
            else:
                has_alt = True
    finally:
        # Close the handle only if we opened it; a caller-supplied handle
        # stays open for the caller.
        if opened_here:
            fa.close()

    if has_alt:
        contig_list.append('MISC_ALT_CONTIGS_SCMO')
        lens.append(None)

    if with_length:
        return contig_list, lens

    return contig_list
Пример #33
0
def load_sequences_and_labels(regions_fname, genome_fa_fname):
    """
    Load fixed-width sequences and binary labels for the regions in a file.

    Each (region, label) pair produced by ``iter_peaks_and_labels`` is
    replaced by a window of ``min_region_size`` (1000) base pairs centred on
    the region, and the corresponding sequence is fetched from the genome.

    Parameters
    ----------
    regions_fname   : file with region and label information, read through
                      ``iter_peaks_and_labels``
    genome_fa_fname : fasta file with the genome sequence

    Returns
    -------
    Tuple ``(encoded, labels)``: the result of ``one_hot_encode_sequences``
    on the fetched sequences with an extra axis inserted at position 1, and
    a numpy array of labels (1 = 'promoter', 0 = 'enhancer').

    Raises
    ------
    ValueError : if a label is neither 'promoter' nor 'enhancer'
    """
    min_region_size = 1000
    label_codes = {'promoter': 1, 'enhancer': 0}

    seqs, labels = [], []
    genome = FastaFile(genome_fa_fname)
    try:
        # Read the file named by the argument, not sys.argv[1].
        for region, label in iter_peaks_and_labels(regions_fname):
            if label not in label_codes:
                # Raised explicitly so the check survives `python -O`.
                raise ValueError(
                    "Unexpected label %r for region %r" % (label, region))

            # create a new region exactly min_region_size basepairs long
            # centered on region; floor division keeps the coordinates
            # integral on Python 3 as well as Python 2
            expanded_start = (region[1] + (region[2] - region[1]) // 2
                              - min_region_size // 2)
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)

            seqs.append(genome.fetch(*region))
            labels.append(label_codes[label])
    finally:
        genome.close()

    return one_hot_encode_sequences(seqs)[:,None,:,:], np.array(labels)
Пример #34
0
def extract_fasta_to_npy(fasta, output_dir):
    """
    Write every reference sequence of a fasta file as a ``.npy`` array.

    For each reference a float32 array of shape (NUM_SEQ_CHARS, length) is
    allocated, handed to ``one_hot_encode_sequence`` together with the
    sequence (presumably filled in place -- confirm against that helper),
    and saved as ``<output_dir>/<reference name>.npy``. A ``metadata.json``
    listing the array shapes and the source fasta is written alongside.

    Args:
        fasta: fasta file to be converted
        output_dir: directory receiving the ``.npy`` files and metadata
    """
    file_shapes = {}
    fasta_file = FastaFile(fasta)
    try:
        for chrom, size in zip(fasta_file.references, fasta_file.lengths):
            data = np.empty((NUM_SEQ_CHARS, size), dtype=np.float32)
            seq = fasta_file.fetch(chrom)
            one_hot_encode_sequence(seq, data)
            np.save('{}.npy'.format(os.path.join(output_dir, chrom)), data)
            file_shapes[chrom] = data.shape
    finally:
        # Release the fasta handle even if encoding or saving fails.
        fasta_file.close()

    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(
            {
                'file_shapes': file_shapes,
                'type': 'array',
                'source': fasta
            }, fp)
Пример #35
0
    def __init__(self, reference_file_path):
        """
        Open the fasta reference located at the given path.

        :param reference_file_path: full path to a fasta reference file
        :raises AssertionError: if the path does not exist
        :raises IOError: if the file cannot be opened as a fasta file
        """
        self.fasta_file_path = reference_file_path

        path_found = os.path.exists(reference_file_path)
        assert path_found, "Reference path does not exist: {}".format(
            reference_file_path)

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception as open_error:
            # Show the underlying error, then point at the usual remedy.
            print(open_error)
            hint = "Fasta File Read Error: Try indexing reference with 'samtools faidx {}'"
            raise IOError(hint.format(reference_file_path))
def generate_header(reference_fa: str, tag: str) -> VariantHeader:
    """
    Build the header of the minimal VCF.

    The header carries one filter (``tag``) and one contig entry, with its
    length, for every reference sequence in the fasta file.

    :param reference_fa: Path to reference fasta file.
    :param tag: The filter tag to use.
    :return: The populated header.
    """
    vcf_header = VariantHeader()
    vcf_header.filters.add(tag, None, None, "Failed dToxoG")

    reference = FastaFile(reference_fa)
    try:
        for name in reference.references:
            contig_length = reference.get_reference_length(name)
            vcf_header.contigs.add(name, length=contig_length)
    finally:
        # Always release the fasta handle, even if adding a contig fails.
        reference.close()

    return vcf_header
Пример #37
0
def extract_seq(interval, variant, fasta_file, one_hot=False):
    """
    Extract the sequence of an interval, optionally carrying a variant.

    If the variant lies inside the interval, its alternative allele replaces
    the reference allele; the amount of sequence fetched is adjusted by the
    allele length difference so that the result always has the length of the
    interval.

    Note: in case the variant is an indel, the anchorpoint at the beginning is used

    Args:
      interval: pybedtools.Interval where to extract the sequence from
      variant: Variant class with attributes: chr, pos, ref, alt
        (may be None, in which case the plain reference sequence is returned)
      fasta_file: file path or pysam.FastaFile instance. A path is opened
        and closed by this function; an instance passed in is left open.
      one_hot: if True, one-hot-encode the output sequence

    Returns:
      sequence

    Raises:
      ValueError: if the reference allele of the variant does not match the
        sequence found in the fasta file at that position.
    """
    opened_here = isinstance(fasta_file, str)
    if opened_here:
        from pysam import FastaFile
        fasta_file = FastaFile(fasta_file)

    try:
        # variant.pos is 1-based, the interval is 0-based half-open.
        inside = (variant is not None
                  and variant.pos - 1 >= interval.start
                  and variant.pos <= interval.stop)
        lendiff = len(variant.alt) - len(variant.ref) if inside else 0
        seq = fasta_file.fetch(str(interval.chrom), interval.start,
                               interval.stop - lendiff)
    finally:
        # Only close a handle this function opened itself.
        if opened_here:
            fasta_file.close()

    if inside:
        # now, mutate the sequence
        pos = variant.pos - interval.start - 1
        expect_ref = seq[pos:(pos + len(variant.ref))]
        if expect_ref != variant.ref:
            raise ValueError(
                f"Expected reference: {expect_ref}, observed reference: {variant.ref}"
            )
        # Anchor at the beginning
        out = seq[:pos] + variant.alt + seq[(pos + len(variant.ref)):]
    else:
        out = seq

    assert len(
        out
    ) == interval.stop - interval.start  # sequence length has to be correct at the end
    if one_hot:
        out = encodeDNA([out.upper()])[0]
    return out
Пример #38
0
def main(args):
    """
    Convert a tab-separated CNV table into a VCF file.

    Reads ``args.input_path``, uses the fasta at ``args.genome_ref`` as the
    reference and writes the VCF headers followed by one line per CNV row to
    ``args.output_path``.
    """
    sample = extract_sample_name(args.input_path)
    with open(args.input_path) as cnv_handle, \
            FastaFile(args.genome_ref) as reference, \
            open(args.output_path, 'w') as vcf_handle:
        # Whether the reference names its sequences 'chr1' rather than '1'.
        first_contig = reference.references[0]
        chr_prefixed = first_contig.startswith('chr')

        rows = csv.DictReader(cnv_handle, delimiter='\t')

        header_lines = get_vcf_headers(sample, reference)
        vcf_handle.write('\n'.join(header_lines) + '\n')

        for row in rows:
            record = get_vcf_line(row, reference, chr_prefixed)
            vcf_handle.write(record + '\n')
Пример #39
0
    def __call__(self, intervals, to_mirror=None, **kwargs):
        """
        Extract the sequences of the given intervals into one array.

        Args:
            intervals: non-empty sequence of intervals exposing ``chrom``,
                ``start`` and ``stop``; the width of the first interval
                fixes the last axis of the result.
            to_mirror: optional sequence of flags, one per interval; rows
                whose flag is true are reversed along both of the last two
                axes.
            **kwargs: accepted but not used here.

        Returns:
            Array of shape (len(intervals), 1, 4, width).
        """
        NUM_SEQ_CHARS = 4

        width = intervals[0].stop - intervals[0].start
        data = np.zeros((len(intervals), 1, NUM_SEQ_CHARS, width))

        fasta = FastaFile(self._datafile)
        try:
            for index, interval in enumerate(intervals):
                seq = fasta.fetch(str(interval.chrom), interval.start,
                                  interval.stop)
                one_hot_encode_sequence(seq, data[index, 0, :, :])
        finally:
            # The handle is opened per call, so release it per call too.
            fasta.close()

        # This is performing a reverse complement operation
        # (presumably relies on the channel order being its own complement
        # when reversed, e.g. A, C, G, T -- confirm against the encoder)
        if to_mirror is not None:
            for index, mirror in enumerate(to_mirror):
                if mirror:
                    data[index, :, :, :] = data[index, :, ::-1, ::-1]

        return data
Пример #40
0
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False, passed=False, quality=False, diploid=False):
    """
    Set up the conversion of a VCF file into a chain file for one strain.

    NOTE(review): only the setup part of this function is visible in this
    listing (input checks, logging, opening the files and locating the
    strain's sample column); the description below is limited to that part.

    :param input_file: the VCF file; checked with g2g_fu.check_file and
        opened with TabixFile
    :param fasta_file: the fasta file; checked with g2g_fu.check_file and
        opened as a FastaFile if it is not already one
    :param strain: name of the sample to use; must match one of the sample
        columns of the VCF '#CHROM' header line
    :param output_file: the chain file to write; checked with
        g2g_fu.check_file in 'w' mode
    :param vcf_keep: when true, the path of a discard file named
        '<input file name>.errors.vcf' in the output directory is computed
    :param passed: logged as "PASS FILTER ON"; use not visible here
    :param quality: logged as "QUALITY FILTER ON"; use not visible here
    :param diploid: logged as "DIPLOID"; use not visible here
    :return: not visible in this listing
    :raises G2GValueError: if no strain is given
    :raises G2GVCFError: if the strain is not among the samples of the VCF
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None

    if vcf_keep:
        # The discard file lives next to the output file.
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)
    sample_index = None

    # Find the column header line and map the strain to its sample position.
    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                # Columns from index 9 onwards are treated as sample names.
                samples = elems[9:]
                samples = dict(zip(samples, (x for x in xrange(len(samples)))))
                sample_index = samples[strain]
            except KeyError, ke:
                raise G2GVCFError("Unknown strain '{0}', valid strains are: {1}".format(strain, ", ".join(samples)))
Пример #41
0
def main():
    """
    Build a k-mer attribute matrix and a label vector from labelled regions.

    Regions and labels are read from the file named by ``sys.argv[1]``. Each
    region is replaced by a window of ``min_region_size`` base pairs centred
    on it (shifted right if it would start before position 0), its sequence
    is fetched from the genome, and the label is stored as 1 for "promoter"
    and 0 for anything else. The attribute matrix returned by
    ``createAttributeMatrix`` and the label vector are written as CSV files
    under /srv/scratch/zho/.
    """
    min_region_size = 1000
    cell_type = "combination"
    k = 8  # k-mer length
    output_prefix = ("/srv/scratch/zho/" + str(min_region_size) + "_"
                     + cell_type + "_" + str(k))

    sequence_list = []
    labels_list = []
    attributes_map = get_attributes_map(['A','C','G','T'],k)

    genome = FastaFile("/srv/scratch/zho/GRCh38.genome.fa")
    try:
        for region, label in iter_peaks_and_labels(sys.argv[1]):
            # create a new region exactly min_region_size basepairs long
            # centered on region; floor division keeps the coordinates
            # integral on Python 3 as well as Python 2
            expanded_start = (region[1] + (region[2] - region[1]) // 2
                              - min_region_size // 2)
            if expanded_start < 0:
                expanded_start = 0
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)
            print("%s %s" % (region, label))

            # note: 1 = promoter, 0 = enhancer
            if label == "promoter":
                labels_list.append(1)
            else:
                labels_list.append(0)

            # Fetch once and reuse the result for both printing and storage.
            sequence = genome.fetch(*region)
            print(sequence)
            sequence_list.append(sequence)
    finally:
        genome.close()

    sequence_series = pd.Series(sequence_list)
    X = createAttributeMatrix(sequence_series, k, attributes_map)
    X.to_csv(output_prefix + "mer_train_matrix.csv")

    labels_series = pd.Series(labels_list)
    labels_series.to_csv(output_prefix + "mer_output_vector.csv")
Пример #42
0
def test_vutil_homoRunForOneVariant():
    """Check the homopolymer-run helpers on literal strings and on ex1 data."""

    # Literal sequences and the run size expected for each of them.
    hrun_cases = [
        ('tcggg', 0),
        ('ttcggg', 2),
        ('AATTGAGACTACAGAGCAAC', 2),
        ('ACTCACAGGTTTTATAAAAC'[::-1], 0),
    ]
    for bases, expected_size in hrun_cases:
        assert_equal(vutil._calHrunSize(bases), expected_size)

    fa = FastaFile('tests/data/ex1.fa')
    vcf_readers = vcreader(['tests/data/ex1.vcf.gz'])

    # The first chr1 variant only has to be processed without raising.
    chr1_variants = vcf_readers.variants(chrom='chr1', nosnp=False)
    vutil.homoRunForOneVariant(fa, chr1_variants[0])

    chr2_variants = vcf_readers.variants(chrom='chr2', nosnp=False)
    variant = chr2_variants[2]
    assert_equal(784, variant.POS)

    upstream = 'ACTCACAGGTTTTATAAAAC'
    downstream = 'AATTGAGACTACAGAGCAAC'
    assert_equal(upstream, fa.fetch('chr2', variant.POS - 20, variant.POS))
    assert_equal(downstream, fa.fetch('chr2', variant.POS, variant.POS + 20))
    assert_equal(upstream + downstream,
                 fa.fetch('chr2', variant.POS - 20, variant.POS + 20))

    assert_equal(2, vutil.homoRunForOneVariant(fa, variant))

    # Move the variant to the last position of chr2.
    variant.POS = fa.get_reference_length('chr2')
    assert_equal(0, vutil.homoRunForOneVariant(fa, variant))
Пример #43
0
class FabgzReader(object):
    """Read-only, mapping-like access to the sequences of a fasta file."""

    def __init__(self, filename):
        """Open ``filename`` with FastaFile and keep the handle."""
        handle = FastaFile(filename)
        self._fh = handle

    def fetch(self, seq_id, start=None, end=None):
        """Return the sequence ``seq_id``, optionally limited to start/end.

        The identifier is ASCII-encoded before it is handed to the handle.
        """
        ascii_id = seq_id.encode("ascii")
        return self._fh.fetch(ascii_id, start, end)

    def keys(self):
        """Return the reference names known to the underlying handle."""
        handle = self._fh
        return handle.references

    def __len__(self):
        """Return the number of references in the underlying handle."""
        handle = self._fh
        return handle.nreferences

    def __getitem__(self, ac):
        """Return the whole sequence stored under ``ac``."""
        whole_sequence = self.fetch(ac)
        return whole_sequence

    @property
    def filename(self):
        """File name reported by the underlying handle."""
        handle = self._fh
        return handle.filename
Пример #44
0
    ac_dict = {}
    af_dict = {}
    sites_dict = {}
    ac_list = []
    af_list = []
    sites_list = []

    if args.bed[-3:] == '.gz':
        bed_file = gzip.open(args.bed, 'r')
    elif args.bed[-4:] == '.bed':
        bed_file = open(args.bed, 'r')
    else:
        sys.exit("\nIs this a bed file? Is it compressed?\n")

    callable_f = FastaFile(args.callable)

    for rec in bed_file:
        col = rec.split()
        if ';' in col[4]:
            feature_name = get_feature_name(col[4])
        else:
            feature_name = col[4]

        if feature_name not in bed_dict:
            bed_dict[feature_name] = []

        bed_dict[feature_name].append((col[0], int(col[1]), int(col[2])))

    with open(args.outfile, 'w') as outfile:
        print('Population', 'Chromosome', 'Feature_name', 'Feature_type', 'Sites', 'S', 'thetaW', 'pi', 'tajd', sep='\t',
# Compute confusion matrix
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
# Standardize features by removing the mean and scaling to unit variance
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
# Grid Search Random Forest parameters
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score

# Module-level configuration and state for the script.
# NOTE(review): the code that consumes these names is not visible in this
# section; the remarks below describe only what is assigned here.

# Region size in base pairs (presumably the fixed window width cut around
# each region -- confirm against the code that uses it).
min_region_size = 1000
# Directories holding the genome and the training data, relative to the
# current working directory.
genomeDirectory = './genome/'
dataDirectory = './train_data/'
# The genome fasta is opened at import time.
genome = FastaFile("./genome/GRCh38.genome.fa")
# Data files in use; the names after the '#' are currently left out.
dataFiles = ['E114.bed', 'E116.bed', 'E117.bed', 'E118.bed', 'E119.bed']#, 'E120.bed', 'E121.bed', 'E122.bed', 'E123.bed', 'E124.bed', 'E126.bed', 'E127.bed', 'E128.bed', 'E129.bed']
# Counter and accumulators, initially empty.
c = 0
regions = []
labels = []

def iter_peaks_and_labels(fname):
    """
    Iterate over the regions and labels stored in a whitespace-separated file.

    Each non-blank line must hold at least four fields: sequence name, start,
    end and label; further fields are ignored. Blank and whitespace-only
    lines (for example a trailing empty line) are skipped.

    Parameters
    ----------
    fname : path of the file to read

    Yields
    ------
    ((name, start, end), label), for example
    (('chrY', 20575266, 20576266), 'promoter')
    """
    with open(fname) as fp:
        for line in fp:
            data = line.split()
            if not data:
                # Nothing on this line; indexing it would raise IndexError.
                continue
            yield (data[0], int(data[1]), int(data[2])), data[3]

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Draw the matrix ``cm`` as an image on the current matplotlib figure.

    The image is rendered with nearest-neighbour interpolation and the given
    colour map, and ``title`` is set as the plot title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
Пример #46
0
 def __init__(self, filename):
     """Open ``filename`` with FastaFile and keep the handle in ``_fh``."""
     self._fh = FastaFile(filename)
Пример #47
0
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Write a new fasta file by applying the edits of a chain file to a fasta.

    For every location the fragments listed in the chain are copied from the
    input fasta; inserted bases from the chain are added and deleted
    stretches of the input are skipped. One fasta record is written per
    location once the final (single-field) chain line is reached.

    :param fasta_file: a FastaFile, or a path that is checked with
        g2g_fu.check_file and opened as a FastaFile
    :param chain_file: a ChainIter, or a path that is checked with
        g2g_fu.check_file and opened as a ChainIter
    :param locations: Location objects and/or strings accepted by
        parse_location; when empty, one location covering each reference of
        the fasta file is generated
    :param output_file: the fasta file to write; it and its index files are
        passed to g2g_fu.delete_file / g2g_fu.delete_index_files first
    :param bgzip: when true, g2g_fu.bgzip_index is called on the written
        file and the final name ends in '.gz'
    :param reverse: passed to ChainIter, and swaps which chain columns are
        read as the dt/dq sizes and bases
    :return: nothing is returned in the code visible here
    :raises G2GLocationError: logged at debug level and re-raised

    NOTE(review): in the code visible here ``fasta_out`` is never closed,
    and bgzip_index runs while it is still open -- confirm whether a
    flush/close happens elsewhere.
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        # Normalise the input so that every entry is a Location object.
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        # No locations given: cover every reference from 1 to its length.
        locations = [parse_location("{0}:1-{1}".format(a, fasta.get_reference_length(a)), 1) for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        # Write uncompressed to the name without '.gz'; the compressed file
        # carries the '.gz' extension.
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            # Lines with more than 7 fields start a new chain (header);
            # shorter lines are collected under the current chain header.
            if len(line) > 7:
                LOG.debug("Adding chromosome {0}".format(chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                                  'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                                  'header_chain':chain_file.current_chain_header, 'lines': []}
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            # last_pos tracks the read position in the input sequence.
            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line
                    fragment = chain_line[0]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Length's do not match, chromosome length in chain: {0}, sequence length: {1}".format(chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(location.seqid, location.seqid, chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break

                else:

                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases

                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(location.seqid, last_pos, last_pos + fragment, len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(same, partial_seq[-(len(same)):]))

                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))

                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, DIFF={3}".format(last_pos, insertion_bases, deletion_bases, (insertion_bases - deletion_bases)))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')

    except G2GLocationError, le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
Пример #48
0
def test_vutil_get_sequence_context():

    fa = FastaFile('tests/data/ex1.fa')
    vcf_readers = vcreader(['tests/data/ex1.vcf.gz'], 'options')
    varlist = vcf_readers.variants('chr2')
    vutil.get_sequence_context(fa.fetch('chr2'), varlist[0])