def main():
    """
    Combine several gene predictions into a single gene prediction file.

    Overview:

    1. Read gene predictions from .gtf files.
    2. Read evidence from .gff files.
    3. Detect overlapping transcripts.
    4. Create a feature vector (for a list of all features see features.py)
       for all transcripts.
    5. Compare the feature vectors of all pairs of overlapping transcripts.
    6. Exclude transcripts based on the 'transcript comparison rule' and 5.
    7. Remove transcripts with low evidence support.
    8. Create the combined gene prediction (all transcripts that weren't
       excluded).

    The names ``gtf``, ``hintfiles``, ``quiet``, ``v`` and ``out`` are read
    as module-level globals; presumably ``init(args)`` sets them -- confirm
    against ``init``. ``anno`` is appended to, ``parameter`` receives a
    default weight of 1 for every evidence source that has none, and
    ``graph`` is rebound to the newly built overlap graph.

    Progress messages go to stderr unless ``quiet`` is set; the combined
    prediction is written as tab-separated rows to the path ``out``.
    """
    # Project modules are imported here rather than at file level, as in the
    # original code.
    from genome_anno import Anno
    from overlap_graph import Graph
    from evidence import Evidence

    global anno, graph, parameter

    args = parseCmd()
    init(args)

    if v > 0:
        print(gtf)

    # read gene prediction files; each one gets the id 'anno<k>', k = 1, 2, ...
    for number, gtf_path in enumerate(gtf, start=1):
        if not quiet:
            sys.stderr.write(
                '### READING GENE PREDICTION: [{}]\n'.format(gtf_path))
        prediction = Anno(gtf_path, 'anno{}'.format(number))
        anno.append(prediction)
        prediction.addGtf()
        prediction.norm_tx_format()

    # read hint files
    evi = Evidence()
    for hint_path in hintfiles:
        if not quiet:
            sys.stderr.write(
                '### READING EXTRINSIC EVIDENCE: [{}]\n'.format(hint_path))
        evi.add_hintfile(hint_path)

    # every evidence source needs a weight; fall back to 1 and warn about it
    for src in evi.src:
        if src not in parameter:
            sys.stderr.write(
                'ConfigError: No weight for src={}, it is set to 1\n'.format(src))
            parameter[src] = 1

    # create graph with a node for each unique transcript
    # and an edge if two transcripts overlap
    # two transcripts overlap if they share at least 3 adjacent
    # protein coding nucleotides
    graph = Graph(anno, para=parameter, verbose=v)
    if not quiet:
        sys.stderr.write('### BUILD OVERLAP GRAPH\n')
    graph.build()

    # add features
    if not quiet:
        sys.stderr.write('### ADD FEATURES TO TRANSCRIPTS\n')
    graph.add_node_features(evi)

    # apply decision rule to exclude a set of transcripts
    if not quiet:
        sys.stderr.write('### SELECT TRANSCRIPTS\n')
    combined_prediction = graph.get_decided_graph()

    if v > 0:
        sys.stderr.write(str(combined_prediction.keys()) + '\n')
        for a in anno:
            sys.stderr.write('Numb_tx in {}: {}\n'.format(
                a.id, len(combined_prediction[a.id])))

    # write result to output file
    if not quiet:
        sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
    combined_gtf = []
    for a in anno:
        combined_gtf.extend(a.get_subset_gtf(combined_prediction[a.id]))

    # newline='' is required for files handed to csv.writer; without it,
    # platforms that translate '\n' would emit '\r\r\n' row endings.
    # NOTE(review): csv.writer's default line terminator is '\r\n', so rows
    # end in CRLF; that existing behaviour is deliberately left unchanged.
    with open(out, 'w', newline='') as out_file:
        out_writer = csv.writer(out_file, delimiter='\t', quotechar="'")
        out_writer.writerows(combined_gtf)

    if not quiet:
        sys.stderr.write('### FINISHED\n\n')
        sys.stderr.write(
            '### The combined gene prediciton is located at {}.\n'.format(out))
def test_get_hint():
    """
    Load hint3.gff from the test directory and query the intron hint
    3R:801-899 on the '+' strand; the values returned by get_hint must
    add up to 28.
    """
    evidence = Evidence()
    evidence.add_hintfile(testDir + '/evidence/hint3.gff')

    hint = evidence.get_hint('3R', '801', '899', 'intron', '+')
    total = sum(hint.values())

    assert total == 28