def load_gff(gff):
    """Read a GFF file and group its features by gene name.

    Lines that start with '#' or contain the text 'contig' are skipped.
    Every other line is parsed with ``GFF(line)``. Each feature's ``id`` is
    set from its "ID" attribute, except that a CDS lying wholly inside a
    previously seen exon of the same gene is given the id ``<exon id>_CDS``
    (when several exons match, the last one wins).

    Args:
        gff: Path of the GFF file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``feature.genename`` to the parsed
        features for that gene, in the order they were read.

    Raises:
        SystemExit: With status 1, after a message on stderr, if the file
            cannot be opened or read.
    """
    genes = defaultdict(list)
    # genename -> {exon id: (start, end)} for the exons seen so far.
    gene_exon_positions = defaultdict(dict)
    try:
        with open(gff) as g:
            for line in g:
                if line.startswith('#') or 'contig' in line:
                    continue
                feature = GFF(line)
                gene_id = get_gene_attribute(feature, "ID")

                # An exon without an ID cannot lend its name to a CDS
                # (None + "_CDS" would raise TypeError), so it is not recorded.
                if feature.featuretype == 'exon' and gene_id is not None:
                    gene_exon_positions[feature.genename][gene_id] = (
                        feature.start, feature.end)

                if feature.featuretype == 'CDS':
                    exons = gene_exon_positions[feature.genename]
                    for exon, (exon_start, exon_end) in exons.items():
                        if (exon_start <= feature.start <= exon_end
                                and exon_start <= feature.end <= exon_end):
                            gene_id = exon + "_CDS"
                    # NOTE(review): the warning is issued for CDS features
                    # only, as its wording indicates - confirm.
                    if gene_id is None:
                        print("No gene id for CDS found", feature, end="")

                feature.id = gene_id
                genes[feature.genename].append(feature)
    except IOError:
        # Report on stderr and exit non-zero so that a failed load is not
        # mistaken for success by a calling shell or pipeline.
        print("Failed to load GFF file {}".format(gff), file=sys.stderr)
        sys.exit(1)
    return genes
def load_crossmap(crossmapout):
    """Read CrossMap output for a GFF file and group features by gene name.

    Two kinds of line are recognised:

    * ``<feature>\\t->\\t<mapped feature>``: the left side is parsed as the
      feature and the right side as its CrossMap hit, whose attributes get
      ``;Note=CrossMap`` appended.
    * a line containing ``fail``: the text before ``\\tfail`` is parsed as the
      feature and it has no CrossMap hit.

    Lines matching neither form (blank lines, comments) are skipped.

    Each feature's ``id`` is set from its "ID" attribute, except that a CDS
    lying wholly inside a previously seen exon of the same gene is given the
    id ``<exon id>_CDS`` (last matching exon wins). A CDS whose CrossMap hit
    has a different length from the CDS itself loses that hit.

    Args:
        crossmapout: Path of the CrossMap output file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``feature.genename`` to the parsed
        features for that gene, in the order they were read. Every feature
        carries a ``crossmap`` attribute (a GFF object or None).

    Raises:
        SystemExit: With status 1, after a message on stderr, if the file
            cannot be opened or read.
    """
    genes = defaultdict(list)
    # genename -> {exon id: (start, end)} for the exons seen so far.
    gene_exon_positions = defaultdict(dict)
    try:
        with open(crossmapout) as c:
            for line in c:
                cm = None
                if '\t->\t' in line:
                    featureline, _, cmline = line.partition('\t->\t')
                    cm = GFF(cmline)
                    cm.attributes += ';Note=CrossMap'
                elif 'fail' in line:
                    featureline = line.split('\tfail')[0]
                else:
                    # Not a CrossMap record. Without this guard the previous
                    # line's feature would be parsed and stored again (or the
                    # name would be unbound on the first line).
                    continue

                feature = GFF(featureline)
                feature.crossmap = cm
                gene_id = get_gene_attribute(feature, "ID")

                # An exon without an ID cannot lend its name to a CDS
                # (None + "_CDS" would raise TypeError), so it is not recorded.
                if feature.featuretype == 'exon' and gene_id is not None:
                    gene_exon_positions[feature.genename][gene_id] = (
                        feature.start, feature.end)

                if feature.featuretype == 'CDS':
                    exons = gene_exon_positions[feature.genename]
                    for exon, (exon_start, exon_end) in exons.items():
                        if (exon_start <= feature.start <= exon_end
                                and exon_start <= feature.end <= exon_end):
                            gene_id = exon + "_CDS"

                    # NOTE(review): the length check and the warning below
                    # are applied to CDS features only, as their wording
                    # indicates - confirm.
                    if feature.crossmap:
                        # Lose CrossMap features where the CDSs aren't the
                        # same length
                        cds_len = feature.end - feature.start + 1
                        cm_len = (feature.crossmap.end
                                  - feature.crossmap.start + 1)
                        if cds_len != cm_len:
                            feature.crossmap = None

                    if gene_id is None:
                        print("No gene id for CDS found", feature, end="")

                feature.id = gene_id
                genes[feature.genename].append(feature)
    except IOError:
        # Report on stderr and exit non-zero so that a failed load is not
        # mistaken for success by a calling shell or pipeline.
        print("Failed to load CrossMap output {}".format(crossmapout),
              file=sys.stderr)
        sys.exit(1)
    return genes
def make_output_feature(outstart, outend, feature, new_parts):
    """Build the transferred GFF record for a feature, if it has coordinates.

    Args:
        outstart: Transferred start coordinate, or None if not found.
        outend: Transferred end coordinate, or None if not found.
        feature: The feature being transferred; its source, featuretype,
            score, strand, phase and attributes are carried over.
        new_parts: Parts overlapping the feature. Only the first part is
            consulted, for its ``oldname`` and ``strand``.

    Returns:
        A new ``GFF`` on scaffold ``new_parts[0].oldname`` with start <= end,
        or None when either coordinate is None. The strand is flipped
        ('+' <-> '-') when the first part's strand is -1; any other strand
        value is passed through unchanged.
    """
    if outstart is None or outend is None:
        return None

    # Always emit the coordinates in ascending order.
    low, high = (outend, outstart) if outstart > outend else (outstart, outend)

    strand = feature.strand
    lead_part = new_parts[0]
    if lead_part.strand == -1:
        strand = {'+': '-', '-': '+'}.get(strand, strand)

    return GFF(lead_part.oldname, feature.source, feature.featuretype,
               low, high, feature.score, strand, feature.phase,
               feature.attributes)
def transfer_gff_feature(feature, genome, crossmap=False):
    """Transfer one GFF feature and report how the transfer went.

    Args:
        feature: The feature to transfer. Its ``crossmap`` attribute, when
            truthy, is a GFF-like hit that can be used instead.
        genome: Passed straight through to ``get_parts``.
        crossmap: When true and the feature has a CrossMap hit, that hit is
            returned immediately without looking up any parts.

    Returns:
        A ``(GFF, status)`` tuple. ``status`` is one of:

        * 'crossmap': the returned feature is a copy of the CrossMap hit;
        * 'ok': coordinates were taken from the overlapping part(s);
        * 'missing': no parts and no haps were found;
        * ``haps[0].parttype``: no parts were found but haps were;
        * 'removed': the single overlapping part is marked removed;
        * 'multiscaffold': start and end fall on different old scaffolds;
        * 'broken': a CDS spanning several parts, or no usable coordinates.

        For every status other than 'ok' and 'crossmap' the returned
        feature is a copy of the input feature with its original
        coordinates.
    """

    def _crossmap_copy():
        # Fresh GFF built from the fields of the feature's CrossMap hit.
        hit = feature.crossmap
        return GFF(hit.scaffold, hit.source, hit.featuretype, hit.start,
                   hit.end, hit.score, hit.strand, hit.phase, hit.attributes)

    if crossmap and feature.crossmap:
        return _crossmap_copy(), 'crossmap'

    outstart = outend = status = None
    new_parts, haps = get_parts(feature.scaffold, feature.start, feature.end,
                                genome)

    if not new_parts:
        status = haps[0].parttype if haps else 'missing'
    elif len(new_parts) == 1 and not haps:
        sole_part = new_parts[0]
        if 'removed' in sole_part.parttype:
            status = 'removed'
        else:
            outstart, outend = sole_part.oldstart, sole_part.oldend
            status = 'ok'
    else:
        seen_scaffolds = set()
        for part in new_parts:
            if 'removed' in part.parttype:
                continue
            if part.newstart <= feature.start <= part.newend:
                outstart = part.oldstart if part.strand == 1 else part.oldend
                seen_scaffolds.add(part.oldname)
            if part.newstart <= feature.end <= part.newend:
                outend = part.oldend if part.strand == 1 else part.oldstart
                seen_scaffolds.add(part.oldname)

        if outstart is not None and outend is not None:
            status = 'ok'

        if len(seen_scaffolds) > 1:
            outstart = outend = None
            status = 'multiscaffold'

        if feature.featuretype == 'CDS':
            # CDSs must not span multiple parts; discard the coordinates so
            # that a CrossMap hit, if any, is tried below.
            outstart = outend = None
            status = 'broken'

    output_feature = make_output_feature(outstart, outend, feature, new_parts)

    if output_feature is None and feature.crossmap:
        output_feature = _crossmap_copy()
        status = 'crossmap'

    if output_feature is not None:
        return output_feature, status

    # Nothing could be transferred: hand back a copy of the input feature.
    if status is None:
        status = 'broken'
    untouched = GFF(feature.scaffold, feature.source, feature.featuretype,
                    feature.start, feature.end, feature.score,
                    feature.strand, feature.phase, feature.attributes)
    return untouched, status