Exemplo n.º 1
0
    def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand, name):
        """
        Pull the sequence flanking a predicted gene from a fasta file.

        Uses pyfaidx:
        # https://github.com/mdshw5/pyfaidx
        # pip install pyfaidx

        Args:
            fasta_file (str): input fasta file
            header (str): ORF header; everything from its last "_" onwards is
                stripped to obtain the fasta record name
            start (int): start coordinate (used in integer arithmetic)
            stop (int): stop coordinate (used in integer arithmetic)
            nterminus (int): length of missing sequence
            strand (str): strand, "+" or "-"
            name (str): gene name (not used by the current implementation)

        Returns:
            str or None: the interval [stop, stop + nterminus] for the "-"
            strand, the interval [start - nterminus, start] for the "+"
            strand, or None for any other strand value.
        """
        # Prodigal appends "_<n>" to the contig header; drop that suffix.
        # rsplit() leaves a header without "_" untouched, whereas slicing with
        # rfind() == -1 would silently chop the header's last character.
        header = header.rsplit("_", 1)[0]

        genes = Fasta(fasta_file, sequence_always_upper=False, read_long_names=False, one_based_attributes=True)

        logger.info(json.dumps({"strand":strand, "start":start, "stop":stop, "nterminus":nterminus}, indent=2))
        if strand == "-":
            return str(genes.get_spliced_seq(header, [[stop, stop + nterminus]]))
        if strand == "+":
            return str(genes.get_spliced_seq(header, [[start - nterminus, start]]))
        # Any other strand value yields no sequence.
        return None
Exemplo n.º 2
0
    def test_split_seq(self):
        """ Spliced fetch of the bed12 blocks must equal the stored gene sequence """
        genome = Fasta('data/chr17.hg19.part.fa')

        # Expected value: full sequence of the first record in the gene fasta
        reference = Fasta("data/gene.bed12.fasta")
        first_name = list(reference.keys())[0]
        expect = reference[first_name][:].seq

        # Only the first line of the bed file is used
        with open("data/gene.bed12") as handle:
            fields = handle.readline().strip().split("\t")

        chrom, tx_start, strand = fields[0], int(fields[1]), fields[5]

        # parse bed12 format: the last comma-separated item of each list is
        # discarded (presumably the empty string after a trailing comma)
        offsets = [int(tok) for tok in fields[11].split(",")[:-1]]
        lengths = [int(tok) for tok in fields[10].split(",")[:-1]]
        block_starts = [tx_start + off for off in offsets]
        block_ends = [begin + span for begin, span in zip(block_starts, lengths)]

        # bed half-open
        if strand == "-":
            block_starts = [begin + 1 for begin in block_starts]
        else:
            block_ends = [finish - 1 for finish in block_ends]

        spliced = genome.get_spliced_seq(
            chrom, zip(block_starts, block_ends), rc=True).seq
        print(spliced)
        print("====")
        print(expect)

        assert spliced == expect
Exemplo n.º 3
0
    def test_split_seq(self):
        """ Check get_spliced_seq against a pre-computed gene sequence """
        fa = Fasta('data/chr17.hg19.part.fa')

        # The first (index 0) record of the gene fasta is the expected output
        gene = Fasta("data/gene.bed12.fasta")
        record_names = list(gene.keys())
        expect = gene[record_names[0]][:].seq

        bed = "data/gene.bed12"
        with open(bed) as bed_handle:
            first_line = bed_handle.readline()
        columns = first_line.strip().split("\t")

        chrom = columns[0]
        gene_start = int(columns[1])
        strand = columns[5]

        # bed12 columns: [11] holds block offsets, [10] holds block sizes;
        # the final split item of each is dropped before conversion
        raw_offsets = columns[11].split(",")[:-1]
        raw_sizes = columns[10].split(",")[:-1]
        starts = [gene_start + int(item) for item in raw_offsets]
        sizes = [int(item) for item in raw_sizes]
        ends = [lo + width for lo, width in zip(starts, sizes)]

        # bed half-open: shift starts on the minus strand, ends otherwise
        minus_strand = strand == "-"
        if minus_strand:
            starts = [lo + 1 for lo in starts]
        else:
            ends = [hi - 1 for hi in ends]

        result = fa.get_spliced_seq(chrom, zip(starts, ends), rc=True)
        observed = result.seq
        print(observed)
        print("====")
        print(expect)

        assert observed == expect
Exemplo n.º 4
0
    def get_part_sequence(self, fasta_file, header, start, stop, nterminus,
                          strand, name):
        """
        Pull part sequence from fasta file
        # https://github.com/mdshw5/pyfaidx
        # pip install pyfaidx

        Args:
            fasta_file (str): input fasta file
            header (str): ORF header; everything from its last "_" onwards is
                stripped to obtain the fasta record name
            start (int): start coordinate (used in integer arithmetic)
            stop (int): stop coordinate (used in integer arithmetic)
            nterminus (int): length of missing sequence
            strand (str): strand, "+" or "-"
            name (str): gene name (not used by the current implementation)

        Returns:
            tuple or None: ``(sequence, first, last)`` where ``sequence`` is
            the fetched string and ``first``/``last`` are the coordinates that
            were requested:

            * ``nterminus == 0``: the interval ``[start, stop]`` itself;
            * "-" strand: ``[stop + 1, stop + nterminus]``;
            * "+" strand: ``[start - nterminus, start - 1]``, with both ends
              clamped to a minimum of 1.

            None is returned when the fasta file cannot be opened (the error
            is logged) or when ``strand`` is neither "+" nor "-".
        """
        # Prodigal appends "_<n>" to the contig header; drop that suffix.
        # rsplit() leaves a header without "_" untouched, whereas slicing with
        # rfind() == -1 would silently chop the header's last character.
        header = header.rsplit("_", 1)[0]

        try:
            genes = Fasta(fasta_file,
                          sequence_always_upper=False,
                          read_long_names=False,
                          one_based_attributes=True)
        except Exception as e:
            # Best effort: a fasta that cannot be opened/indexed is logged and
            # reported to the caller as "no result" instead of raising.
            logger.error(e)
            return None

        # Same truthiness guard as before: a falsy Fasta object gives no result.
        if not genes:
            return None

        if strand not in ("-", "+"):
            return None

        if nterminus == 0:
            # Nothing is missing: return the original interval unchanged.
            first, last = start, stop
        elif strand == "-":
            # Bases immediately after the stop coordinate.
            first, last = stop + 1, stop + nterminus
        else:
            # Bases immediately before the start coordinate; coordinates are
            # clamped so they never fall below 1.
            first = max(start - nterminus, 1)
            last = max(start - 1, 1)

        return str(genes.get_spliced_seq(header, [[first, last]])), first, last
Exemplo n.º 5
0
	# cds = cds.sort_values(['chr','startGene'])

	# Process every row of `cds`; each row supplies 'id', 'chr' and 'coordinates'.
	for index,row in cds.iterrows():
		print(index,row['id'])
		# Timestamp taken at the top of the iteration (presumably for timing; its use is outside this excerpt)
		start = time.time()

		# Convert the comma-separated coordinate string into a flat list of ints,
		# then group consecutive values into [a, b] pairs
		# (presumably start/end of each CDS block - confirm against the input table)
		coordinates = array(row['coordinates'].split(',')).astype(int).tolist()
		coordinates =  [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]

		# Open ref and outgroup fastas for this row's chromosome (sequences forced to upper case)
		ref = Fasta('/data/shared/dgn/ref/Chr' + row['chr'] +'.fasta',sequence_always_upper=True)
		outgroup = Fasta(outgroupFastas + '/Chr' + row['chr'] +'_dsim.fasta',sequence_always_upper=True)

		## Extract ref and outgroup seq: the intervals are spliced from the first record of each file
		refSeq = ref.get_spliced_seq(list(ref.keys())[0], coordinates).seq
		outgroupSeq = outgroup.get_spliced_seq(list(outgroup.keys())[0], coordinates).seq
		
		# Only continue when the spliced reference length is divisible by 3
		if((len(refSeq) % 3) == 0): 

			# Open the multi-fasta alignment for the selected population and chromosome
			multiFasta = Fasta('/data/shared/dgn/alignments/'+ args.population + '_Chr' + row['chr'] +'.seq',sequence_always_upper=True)

			# Extract sample names (record keys) from the multi-fasta
			samples = list(multiFasta.keys())
						
			# Create an uninitialised array with one row per sample plus one extra row
			# and one column per reference base
			# NOTE(review): the extra row is presumably for the outgroup/reference - confirm
			matrix = np.empty([len(samples) + 1, len(refSeq)],dtype='str')
			
			positions=[]
Exemplo n.º 6
0
    # Process every row of `cds`; each row supplies 'id', 'chr' and 'coordinates'.
    for index, row in cds.iterrows():
        print(index, row['id'])
        # Timestamp taken at the top of the iteration (presumably for timing; its use is outside this excerpt)
        start = time.time()

        # Convert the comma-separated coordinate string into a flat list of ints,
        # then group consecutive values into [a, b] pairs
        # (presumably start/end of each CDS block - confirm against the input table)
        coordinates = array(row['coordinates'].split(',')).astype(int).tolist()
        coordinates = [
            coordinates[i:i + 2] for i in range(0, len(coordinates), 2)
        ]

        # Open ref and outgroup fastas for this row's chromosome
        ref = Fasta('/data/shared/dgn/ref/Chr' + row['chr'] + '.fasta')
        outgroup = Fasta(outgroupFastas + '/Chr' + row['chr'] + '_dsim.fasta')

        ## Extract ref and outgroup seq, upper-cased after splicing.
        ## The reference record is looked up by the row's 'chr' value.
        ## NOTE(review): `outputHeader` is not defined in this excerpt - confirm it
        ## names the intended outgroup record.
        refSeq = ref.get_spliced_seq(row['chr'], coordinates).seq.upper()
        outgroupSeq = outgroup.get_spliced_seq(outputHeader,
                                               coordinates).seq.upper()

        # Skip rows whose reference sequence contains the character 'M'
        # (presumably an ambiguity/masking symbol - confirm)
        if ('M' in refSeq):
            continue
        else:
            # Only continue when the reference length is a whole multiple of 3
            if ((len(refSeq) / 3).is_integer()):

                # Open population multifasta
                popFasta = Fasta('/data/shared/dgn/alignments/' +
                                 args.population + '_Chr' + row['chr'] +
                                 '.seq')

                # Extract sample names (record keys)
                samples = list(popFasta.keys())