def BEDIterator(handle):
    """Iterate over a tab-delimited feature file, yielding SeqFeature objects.

    Despite the function name, every data line must hold exactly nine
    tab-separated columns, in this order::

        ref  source  type  start  end  score  strand  frame  attributes

    Blank lines and lines whose first non-blank character is "#" are
    skipped. The attributes column is parsed as ";"-separated "key=value"
    pairs; only the first "=" of a pair separates key from value.

    handle - input file object; only readline() is required. Its ``name``
             attribute, when present, is used in error messages.

    Each yielded SeqFeature is built from int(start) and int(end) exactly as
    given in the file, gets ``id`` and ``name`` from the "ID" and "Name"
    attributes (None when absent), and carries the whole attribute dict on
    ``attributes``. The score and frame columns are read but not used.

    Raises FormatError when a data line does not split into nine columns.
    """
    line_no = 0
    while True:
        line_no += 1
        raw_line = handle.readline()
        if not raw_line:
            # readline() returns an empty string only at end of file, so a
            # blank line ("\n") no longer terminates the iteration.
            return
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            (ref, source, feature_type, start, end,
             _score, strand, _frame, attributes) = line.split("\t")
        except ValueError:
            # Wrong number of columns. Call syntax for raise is valid on both
            # Python 2 and Python 3.
            raise FormatError(
                "Problem with line %d in %s. Line was\n%s"
                % (line_no, getattr(handle, "name", "<unknown>"), line))
        attr_dict = {}
        for pair in attributes.strip(";").split(";"):
            if not pair:
                # Tolerate doubled separators such as "ID=x;;Name=y".
                continue
            # Split on the first "=" only so values may themselves contain "=".
            key, value = pair.split("=", 1)
            attr_dict[key] = value
        result = SeqFeature(location=FeatureLocation(int(start), int(end)),
                            type=feature_type,
                            strand=_gff3_strand_to_numeric[strand],
                            ref=ref,
                            ref_db=source)
        result.id = attr_dict.get("ID", None)
        result.name = attr_dict.get("Name", None)
        result.attributes = attr_dict  # not an official property of SeqFeature.
        yield result
def ncrna_gene(self, ncrna):
    """Wrap an ncRNA feature in a new parent feature of type ncRNA_gene.

    The parent shares the ncRNA's location, "source" qualifier and id, and
    holds the ncRNA as its only sub-feature.
    """
    parent = SeqFeature(ncrna.location, type="ncRNA_gene")
    parent.qualifiers["source"] = ncrna.qualifiers["source"]
    parent.id = ncrna.id
    parent.sub_features = [ncrna]
    return parent
def cds_gene(self, cds):
    """Build a gene > mRNA > (CDS, exon) hierarchy around a lone CDS.

    Every created feature reuses the CDS location and its "source"
    qualifier. The returned gene gets a freshly generated stable id.
    """

    def _spanning(feature_type):
        # New feature covering the same location as the CDS, same source.
        feature = SeqFeature(cds.location, type=feature_type)
        feature.qualifiers["source"] = cds.qualifiers["source"]
        return feature

    # Transcript holding the CDS plus a matching exon
    mrna = _spanning("mRNA")
    matching_exon = _spanning("exon")
    mrna.sub_features = [cds, matching_exon]

    # Gene holding the transcript
    wrapper = _spanning("gene")
    wrapper.sub_features = [mrna]
    wrapper.id = self.generate_stable_id()
    return wrapper
def gbk2gff(genbank_path, new_gff_path, species_id):
    """Convert a single-record GenBank file into a headerless GFF-style table.

    The record is read with SeqIO.read(..., "genbank"), its features are
    sorted by start position, and one row is collected per emitted feature:

    * 'gene' features (except those whose gene qualifier is 'rps12') get the
      id ``species_id`` + a three-digit running counter;
    * for a gene whose matching child is a CDS: a gene row plus one CDS row
      per location part;
    * for a gene whose matching child is anything else: a gene row, a row
      for the child itself, and exon rows when the child has more than one
      location part;
    * 'repeat_region' features get the ids IR1, IR2, ...

    Args:
        genbank_path: path of the GenBank file to read.
        new_gff_path: path the tab-separated output is written to.
        species_id: prefix used when building feature ids.

    Returns:
        The sequence of the parsed record (``genome.seq``).

    NOTE(review): remove_none_location, fix_location and get_record are
    defined outside this block; comments about them below are assumptions.
    """
    print('Start change', os.path.basename(new_gff_path))
    records_list = []
    genome = SeqIO.read(genbank_path, "genbank")
    # Helper defined elsewhere; presumably drops features that have no
    # location so the sort below cannot fail -- TODO confirm.
    remove_none_location(genome)
    genome.features.sort(key=lambda x: x.location.start)
    gene_count = 0
    IR_count = 0
    for ele in genome.features:
        if ele.type == 'gene':
            # rps12 genes are skipped entirely and do not consume a gene
            # number. NOTE(review): the reason is not visible in this block.
            if ele.qualifiers['gene'][0] == 'rps12':
                continue
            gene_count += 1
            ele.id = species_id + '%03d' % gene_count
            # Walk the features of the sub-record sliced to the gene's span.
            for child_feature in genome[ele.location.start:ele.location.
                                        end].features:
                # Presumably shifts the child's coordinates back by the gene
                # start so they can be compared with ele.location below --
                # TODO confirm against fix_location.
                fix_location(child_feature, ele.location.start)
                # Only non-gene children covering exactly the gene's
                # start..end are considered.
                if child_feature.type != 'gene' and \
                        child_feature.location.start == ele.location.start and \
                        child_feature.location.end == ele.location.end:
                    # A CDS child is relabelled 'mRNA'; this label selects the
                    # protein-coding branch below.
                    child_feature.type = 'mRNA' if child_feature.type == 'CDS' else child_feature.type
                    if child_feature.qualifiers['gene'][0] == ele.qualifiers[
                            'gene'][0]:
                        # Protein-coding gene: emit the gene row, then one CDS
                        # row per location part. No separate mRNA row is
                        # emitted; CDS rows use the gene id as Parent.
                        # NOTE(review): the gene row is appended once per
                        # matching child, so several matching children would
                        # repeat it.
                        if child_feature.type == 'mRNA':
                            gene_attributes = [
                                'ID=' + ele.id,
                                'Name=' + ele.qualifiers['gene'][0],
                                'gene_biotype=protein_coding'
                            ]
                            records_list.append(
                                get_record(ele, gene_attributes))
                            cds_count = 0
                            # Parts are numbered in reversed order of
                            # location.parts.
                            for cds in reversed(child_feature.location.parts):
                                cds_count += 1
                                cds_feature = SeqFeature(
                                    cds,
                                    type='CDS',
                                    qualifiers={
                                        'codon_start':
                                        child_feature.qualifiers['codon_start']
                                        [0]
                                    })
                                cds_feature.id = 'cds_' + species_id + '%03d' % gene_count + '_' + '%d' % cds_count
                                cds_attributes = [
                                    'ID=' + cds_feature.id,
                                    'Parent=' + ele.id, 'product=' +
                                    child_feature.qualifiers['product'][0]
                                ]
                                records_list.append(
                                    get_record(cds_feature, cds_attributes))
                        # Any other child type (rRNA and tRNA according to the
                        # original author's note).
                        else:
                            # Gene row; the biotype is the child's feature type.
                            gene_attributes = [
                                'ID=' + ele.id,
                                'Name=' + ele.qualifiers['gene'][0],
                                'gene_biotype=' + child_feature.type
                            ]
                            records_list.append(
                                get_record(ele, gene_attributes))
                            # Row for the RNA feature itself, parented to the
                            # gene.
                            child_feature.id = 'rna_' + species_id + '%03d' % gene_count
                            child_attributes = [
                                'ID=' + child_feature.id,
                                'Parent=' + ele.id, 'product=' +
                                child_feature.qualifiers['product'][0]
                            ]
                            records_list.append(
                                get_record(child_feature, child_attributes))
                            exon_list = []
                            exon_count = 0
                            # Exon rows, one per location part, parented to
                            # the RNA feature.
                            for exon in reversed(child_feature.location.parts):
                                exon_count += 1
                                exon_feature = SeqFeature(exon, type='exon')
                                exon_feature.id = 'exon_' + species_id + '%03d' % gene_count + '_' + '%d' % exon_count
                                exon_attributes = [
                                    'ID=' + exon_feature.id,
                                    'Parent=' + child_feature.id
                                ]
                                exon_list.append(
                                    get_record(exon_feature, exon_attributes))
                            # Exon rows are kept only for multi-part
                            # locations; a single-part RNA gets none.
                            if exon_count > 1:
                                records_list += exon_list
        elif ele.type == 'repeat_region':
            # Every repeat_region is labelled as an inverted repeat.
            IR_count += 1
            gene_attributes = [
                'ID=IR' + str(IR_count), 'note=Inverted repeats'
            ]
            records_list.append(get_record(ele, gene_attributes))
    # Build a DataFrame with one row per collected record, keyed by its
    # position in records_list.
    records_dict = {index: record for index, record in enumerate(records_list)}
    result_gff = pd.DataFrame.from_dict(records_dict, 'index')
    # Constant columns: same seqid, score and source on every row.
    result_gff['seqid'] = genome.id
    result_gff['score'] = '.'
    result_gff['source'] = 'PGA'
    # Put the columns in output order. The type/start/end/strand/phase/
    # attributes columns are presumably supplied by get_record -- TODO confirm.
    result_gff = result_gff[[
        "seqid", "source", "type", "start", "end", "score", "strand", "phase",
        "attributes"
    ]]
    # Written without a header row or index column.
    result_gff.to_csv(new_gff_path,
                      sep='\t',
                      header=False,
                      index=False,
                      encoding='utf8')
    return genome.seq
# st and en are unmodified from the blast file # they can be backwards # original row as of nov23 read #feature = SeqFeature(FeatureLocation(st-1,en), strand=framepart2,type="repeathit") # start location should not be modified by -1 in this case # changed Featurelocation start, older line: #feature = SeqFeature(FeatureLocation(st,en), strand=framepart2,type="repeathit") # Genbank magic adds one to start position? feature = SeqFeature(FeatureLocation(st-1,en), strand=framepart2,type="repeathit") #feature.id=rename #feature.qualifiers["hit"]=rename feature.id=isname feature.qualifiers["hit"]=isname feature.qualifiers["sequence_length"]=str(len(contigseq[myhsp.sbjct_start-1:myhsp.sbjct_end])) # feature.qualifiers["score"]=str(myhsp.score) if myhsp.score>bestscorehsp.score: bestscorehsp=myhsp if myhsp.expect<bestexphsp.expect: bestexphsp=myhsp # hit length is disance from hits start plus hit end feature.qualifiers["hitlength"]=str(1+max(st,en)-min(st,en)) # feature.qualifiers["expect"]=str(myhsp.expect) feature.qualifiers["query_start"]=str(myhsp.query_start)
for feature in feature_lambda(record.features, feature_test_true, {}): if feature.type in args.changeList: #if "Parent" in feature.qualifiers.keys(): #endOfChain = False #while endOfChain == False: # parentFeat = record.features(feature.qualifiers["Parent"][0]) #for x in range(0, len(args.changeList)): #feature.qualifiers["Parent"] = [] newChain = [feature] tempParent = "" if "Parent" in feature.qualifiers.keys(): tempParent = feature.qualifiers["Parent"][0] for x in range(0, len(args.changeTo)): tempFeat = SeqFeature(location=feature.location) tempFeat.type = args.changeTo[len(args.changeTo) - 1 - x] tempFeat.id = feature.id + "_p" + str( len(args.changeTo) - x) tempFeat.ref_db = feature.ref_db tempFeat.ref = feature.ref #tempFeat.sub_features.append(newChain[x]) if "Parent" in newChain[x].qualifiers.keys(): newChain[x].qualifiers["Parent"][ 0] = feature.id + "_p" + str( len(args.changeTo) - x) else: newChain[x].qualifiers["Parent"] = [ feature.id + "_p" + str(len(args.changeTo) - x) ] tempFeat.qualifiers["ID"] = [tempFeat.id] if "Name" in newChain[x].qualifiers.keys(): tempFeat.qualifiers["Name"] = [ feature.qualifiers["Name"][0] + "_p" +