def splicejunctions_to_gff3(inputBedFile, chrom_sizes, output):
    """
    Convert a splice-junction BED file to GFF3.

    Every non-blank row becomes one ``junction`` feature written with
    ``subtools.write_features``; ``subtools.child_blocks`` is then called
    with the same row to emit its ``exon_junction`` children. A
    ``##sequence-region`` pragma is written the first time a seqid is seen.

    Columns read from each tab-separated row (0-based index):
    0 seqid, 1 start, 2 end, 3 name (used as source and Name), 5 strand,
    9 block count, 10 block sizes, 11 block starts, 12 score.

    :param inputBedFile: path of the BED file to read.
    :param chrom_sizes: handed to ``subtools.sequence_region``; the value
        it returns is indexed by seqid to obtain the region end.
    :param output: path of the GFF3 file to write (truncated if present).
    :raises KeyError: if a seqid is missing from the chromosome sizes
        (assuming ``subtools.sequence_region`` returns a dict).
    :raises IndexError: if a row has fewer than 13 columns.
    """
    # Context managers make sure both files are closed (and the output
    # flushed) even when a malformed row raises part-way through.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on the missing columns.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'junction'
                # The first base in a chromosome is numbered 0 in BED
                # format, so shift the start by one.
                field['start'] = int(li[1]) + 1
                field['end'] = li[2]
                field['score'] = li[12]
                field['strand'] = li[5]
                field['phase'] = '.'
                attribute['ID'] = li[0] + '_' + li[3]
                attribute['Name'] = li[3]
                attribute['blockcount'] = li[9]
                attribute['blocksizes'] = li[10]
                attribute['chromstarts'] = li[11]
                subtools.write_features(field, attribute, gff3)
                subtools.child_blocks(field, attribute, gff3,
                                      'exon_junction')
def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
    """
    Convert gtf file output from StringTie to gff3 format

    Comment lines (starting with ``#``) and blank lines are skipped. Every
    other row is written with ``subtools.write_features``. A
    ``##sequence-region`` pragma is written the first time a seqid is seen.

    The GTF attribute column is read by position, not by key:
    item 0 is the gene id and item 1 the transcript id; for ``transcript``
    rows items 2, 3 and 4 are stored as coverage, fpkm and tpm; for
    ``exon`` rows item 3 is stored as coverage.
    NOTE(review): this assumes StringTie's default attribute order --
    confirm before feeding GTF files from other sources.

    :param gtf_file: path of the GTF file to read.
    :param gff3_file: path of the GFF3 file to write (truncated if present).
    :param chrom_sizes: handed to ``subtools.sequence_region``; the value
        it returns is indexed by seqid to obtain the region end.
    :raises KeyError: if a seqid is missing from the chromosome sizes, or
        an exon appears before any transcript of its gene.
    :raises IndexError: if a row has fewer columns/attributes than expected.
    """
    # Context managers make sure both files are closed (and the output
    # flushed) even when a malformed row raises part-way through.
    with open(gff3_file, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        # gene_id -> ID of the most recently seen transcript of that gene
        parents = dict()
        # transcript_id -> ID of that transcript's feature
        transcript_parents = dict()
        with open(gtf_file, 'r') as gtf:
            for line in gtf:
                if line.startswith('#') or not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[1]
                field['type'] = li[2]
                # GTF and GFF3 both use 1-based inclusive coordinates, so
                # start and end are copied through unchanged.
                field['start'] = li[3]
                field['end'] = li[4]
                field['score'] = li[5]
                field['strand'] = li[6]
                field['phase'] = li[7]
                attr_li = li[8].split(';')
                gene_id = attr_li[0].split()[1].strip('"')
                attribute['ID'] = (gene_id + '_' + field['type'] + '_' +
                                   str(field['start']) + '_' +
                                   str(field['end']))
                if field['type'] == 'transcript':
                    transcript_id = attr_li[1].split()[1].strip('"')
                    parents[gene_id] = attribute['ID']
                    transcript_parents[transcript_id] = attribute['ID']
                    attribute['transcript_id'] = transcript_id
                    attribute['coverage'] = attr_li[2].split()[1].strip('"')
                    attribute['fpkm'] = attr_li[3].split()[1].strip('"')
                    attribute['tpm'] = attr_li[4].split()[1].strip('"')
                elif field['type'] == 'exon':
                    transcript_id = attr_li[1].split()[1].strip('"')
                    # Prefer the exon's own transcript so exons are not
                    # attached to a sibling transcript when several
                    # transcripts of a gene precede their exons; fall back
                    # to the last transcript seen for the gene.
                    if transcript_id in transcript_parents:
                        attribute['Parent'] = transcript_parents[
                            transcript_id]
                    else:
                        attribute['Parent'] = parents[gene_id]
                    attribute['transcript_id'] = transcript_id
                    attribute['coverage'] = attr_li[3].split()[1].strip('"')
                subtools.write_features(field, attribute, gff3)
def bigpsl_to_gff3(self):
    """
    Convert the bigPsl-style BED file at ``self.inputFile`` to GFF3.

    The result is written to the path ``self.gff3_file.name`` (truncated if
    present). Every non-blank row becomes one ``match`` feature written
    with ``subtools.write_features``; ``subtools.child_blocks`` is then
    called with the same row to emit its ``match_part`` children. A
    ``##sequence-region`` pragma is written the first time a seqid is
    seen, using sizes from ``subtools.sequence_region(self.chromSizesFile)``.

    :raises KeyError: if a seqid is missing from the chromosome sizes
        (assuming ``subtools.sequence_region`` returns a dict).
    :raises IndexError: if a row has fewer than 24 columns.
    """
    # (attribute key, 0-based column index), in the order the attributes
    # must appear in the output.
    attribute_columns = (
        ('Name', 3),
        ('blockcount', 9),
        ('blocksizes', 10),
        ('chromstarts', 11),
        ('ochrom_start', 12),
        ('ochrom_end', 13),
        ('ochrom_strand', 14),
        ('ochrom_size', 15),
        ('ochrom_starts', 16),
        ('sequence on other chromosome', 17),
        ('cds in ncbi format', 18),
        ('size of target chromosome', 19),
        ('number of bases matched', 20),
        ("number of bases that don't match", 21),
        ('number of bases that match but are part of repeats', 22),
        ("number of 'N' bases", 23),
    )
    # Context managers make sure both files are closed (and the output
    # flushed) even when a malformed row raises part-way through.
    with open(self.gff3_file.name, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(self.chromSizesFile)
        seq_regions = dict()
        with open(self.inputFile, 'r') as bed:
            for line in bed:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on the missing columns.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = 'UCSC BLAT alignment tool'
                field['type'] = 'match'
                # The first base in a chromosome is numbered 0 in BED
                # format, so shift the start by one.
                field['start'] = str(int(li[1]) + 1)
                field['end'] = li[2]
                field['score'] = li[4]
                field['strand'] = li[5]
                field['phase'] = '.'
                attribute['ID'] = li[0] + '_' + li[3]
                for key, column in attribute_columns:
                    attribute[key] = li[column]
                subtools.write_features(field, attribute, gff3)
                subtools.child_blocks(field, attribute, gff3, 'match_part')
def trfbig_to_gff3(inputBedFile, chrom_sizes, output):
    """
    Convert a tandem-repeat BED file to GFF3.

    Every non-blank row becomes one ``tandem_repeat`` feature on the ``+``
    strand, written with ``subtools.write_features``. A
    ``##sequence-region`` pragma is written the first time a seqid is seen.

    Columns read from each tab-separated row (0-based index):
    0 seqid, 1 start, 2 end, 3 name (used as source), 9 score, and
    4-8 / 10-15 as the descriptive attributes listed below.

    :param inputBedFile: path of the BED file to read.
    :param chrom_sizes: handed to ``subtools.sequence_region``; the value
        it returns is indexed by seqid to obtain the region end.
    :param output: path of the GFF3 file to write (truncated if present).
    :raises KeyError: if a seqid is missing from the chromosome sizes
        (assuming ``subtools.sequence_region`` returns a dict).
    :raises IndexError: if a row has fewer than 16 columns.
    """
    # (attribute key, 0-based column index), in the order the attributes
    # must appear in the output. Column 9 is the score, not an attribute.
    attribute_columns = (
        ('length of repeat unit', 4),
        ('mean number of copies of repeat', 5),
        ('length of consensus sequence', 6),
        ('percentage match', 7),
        ('percentage indel', 8),
        ("percent of a's in repeat unit", 10),
        ("percent of c's in repeat unit", 11),
        ("percent of g's in repeat unit", 12),
        ("percent of t's in repeat unit", 13),
        ('entropy', 14),
        ('sequence of repeat unit element', 15),
    )
    # Context managers make sure both files are closed (and the output
    # flushed) even when a malformed row raises part-way through.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on the missing columns.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'tandem_repeat'
                # The first base in a chromosome is numbered 0 in BED
                # format, so shift the start by one.
                field['start'] = str(int(li[1]) + 1)
                field['end'] = li[2]
                field['score'] = li[9]
                field['strand'] = '+'
                field['phase'] = '.'
                for key, column in attribute_columns:
                    attribute[key] = li[column]
                subtools.write_features(field, attribute, gff3)
def _strand_from_frame(frame):
    """Return '+' or '-' for a two-element HSP frame pair.

    Frames with the same sign give '+', frames with opposite signs give
    '-'. When at least one frame is 0 the sign of their sum decides, so
    (0, 0) maps to '+'.
    """
    product = frame[0] * frame[1]
    if product > 0:
        return '+'
    if product < 0:
        return '-'
    return '+' if frame[0] + frame[1] >= 0 else '-'


def gff3_writer(blast_records, gff3_file):
    """
    Append BLAST results to ``gff3_file`` in GFF3 format.

    For every alignment one parent ``match`` feature is written, followed
    by one ``match_part`` feature per HSP, sorted by (start, end). The
    parent spans the lowest HSP start to the highest HSP end. Each
    ``match_part`` carries an ``overlap`` attribute: ``"start,prev_end"``
    when it starts at or before the end of the previously written part,
    otherwise an empty string. A ``##sequence-region`` pragma is written
    the first time a contig is seen, using that alignment's ``length``.

    :param blast_records: iterable of records exposing ``query``,
        ``application``, ``matrix`` and ``alignments``; alignments expose
        ``title``, ``length`` and ``hsps``; HSPs expose ``sbjct``,
        ``query``, ``match``, ``sbjct_start``, ``query_start``, ``score``,
        ``expect`` and ``frame`` (presumably Biopython NCBIXML records --
        confirm against the caller).
    :param gff3_file: path of the output file; it is opened in append
        mode and a ``##gff-version 3`` header is written on every call.
    """
    # The context manager makes sure the output is flushed and closed even
    # when a record raises part-way through.
    with open(gff3_file, 'a') as gff3:
        gff3.write("##gff-version 3\n")
        seq_regions = dict()
        for blast_record in blast_records:
            query_name = blast_record.query.split(" ")[0]
            source = blast_record.application
            method = blast_record.matrix
            for alignment in blast_record.alignments:
                # The contig name is the last space-separated token of
                # the alignment title.
                contig_name = alignment.title.split(" ")[-1]
                length = alignment.length
                parent_id = contig_name + '_' + query_name

                parent_field = OrderedDict()
                parent_field['seqid'] = contig_name
                parent_field['source'] = source
                parent_field['type'] = 'match'

                parent_attribute = OrderedDict()
                parent_attribute['ID'] = parent_id
                parent_attribute['Name'] = query_name
                parent_attribute['method'] = method
                parent_attribute['length'] = length

                if contig_name not in seq_regions:
                    gff3.write("##sequence-region " + contig_name + ' 1 ' +
                               str(length) + '\n')
                    seq_regions[contig_name] = length

                parts = []
                lowest_start = length
                highest_end = 0
                for hsp in alignment.hsps:
                    ref = hsp.sbjct
                    query = hsp.query

                    start = hsp.sbjct_start
                    ref_length = len(ref.replace('-', ''))
                    # if run tblastn, the actual length of reference
                    # should be multiplied by 3
                    if source.lower() == "tblastn":
                        ref_length *= 3
                    end = start + ref_length - 1
                    lowest_start = min(lowest_start, start)
                    highest_end = max(highest_end, end)

                    field = OrderedDict()
                    field['seqid'] = contig_name
                    field['source'] = source
                    field['type'] = 'match_part'
                    field['start'] = start
                    field['end'] = end
                    field['score'] = hsp.score
                    # Same-sign frames => '+', opposite signs => '-'.
                    field['strand'] = _strand_from_frame(hsp.frame)
                    field['phase'] = '.'

                    target_start = hsp.query_start
                    target_len = len(query.replace('-', ''))
                    # if run blastx, the actual length of query should be
                    # multiplied by 3
                    if source.lower() == "blastx":
                        target_len *= 3
                    target_end = target_start + target_len - 1

                    attribute = OrderedDict()
                    attribute['ID'] = parent_id + '_match_' + str(len(parts))
                    attribute['Parent'] = parent_id
                    attribute['Target'] = (query_name + " " +
                                           str(target_start) + " " +
                                           str(target_end))
                    attribute['Gap'] = align2cigar(query, ref)
                    # store the query sequence and match string in the
                    # file in order to display alignment with the
                    # BlastAlignment plugin
                    attribute['subject'] = hsp.sbjct
                    attribute['query'] = hsp.query
                    attribute['match'] = hsp.match
                    attribute['gaps'] = attribute['match'].count(' ')
                    similar = attribute['match'].count('+')
                    attribute['identities'] = (len(attribute['match']) -
                                               similar - attribute['gaps'])
                    attribute['positives'] = (attribute['identities'] +
                                              similar)
                    attribute['expect'] = hsp.expect
                    # NOTE: the frame is always written, including when
                    # it is 0.
                    attribute['frame'] = hsp.frame[1]

                    parts.append({'field': field, 'attribute': attribute})

                parent_field['start'] = lowest_start
                parent_field['end'] = highest_end
                parent_field['score'] = '.'
                parent_field['strand'] = '.'
                parent_field['phase'] = '.'
                parent_attribute['match_num'] = len(parts)

                parts.sort(
                    key=lambda x: (x['field']['start'], x['field']['end']))
                subtools.write_features(parent_field, parent_attribute, gff3)

                prev_end = -1
                for part in parts:
                    overlap = ''
                    if part['field']['start'] <= prev_end:
                        overlap = (str(part['field']['start']) + ',' +
                                   str(prev_end))
                    prev_end = part['field']['end']
                    part['attribute']['overlap'] = overlap
                    subtools.write_features(part['field'],
                                            part['attribute'], gff3)