# This excerpt assumes the GenePredEntry, Bed, Locus and Loci classes are
# available from the surrounding project; their module paths are not shown here.
import argparse
import sys


def get_loci(transcripts_genepred):
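    """Group the transcripts of a genePred file into overlapping loci.

    Each transcript becomes a Bed range (chrom, txStart, txEnd) whose payload
    is the transcript name; Loci.update_loci() then merges overlapping ranges
    into loci.

    Returns [locus2name, name2locus], where locus2name maps a 1-based locus
    index to the set of transcript names in that locus and name2locus maps
    each transcript name back to its locus index.
    """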
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            gpd = GenePredEntry(line.rstrip())
            rng = Bed(gpd.value('chrom'), gpd.value('txStart'),
                      gpd.value('txEnd'))
            rng.set_payload(gpd.value('name'))
            loc1 = Locus()
            loc1.add_member(rng)
            loci.add_locus(loc1)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

    m = 0
    locus2name = {}
    name2locus = {}
    for locus in loci.loci:
        m += 1
        for member in locus.members:
            name = member.get_payload()
            if m not in locus2name: locus2name[m] = set()
            locus2name[m].add(name)
            name2locus[name] = m
    return [locus2name, name2locus]
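

# A minimal usage sketch for get_loci (the file and transcript names below are
# hypothetical):
#   locus2name, name2locus = get_loci('transcripts.gpd')
#   locus_id = name2locus['some_transcript']   # locus containing that transcript
#   siblings = locus2name[locus_id]            # all transcript names at that locus
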
def process_read(mpa, args):
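    """Pick the best-scoring alignment path for one read.

    mpa is expected to hold alignment entries (mpa.entries) that provide
    get_coverage(), get_quality(), get_query_bed(), get_target_bed(),
    query_overlap_size(), target_overlap_size(), value() and get_line(),
    along with compatible_graph() and get_root_paths() for enumerating
    candidate multi-alignment paths.

    Returns None if no entry passes the coverage filter; otherwise returns
    [report, lines], where report is a tab-separated summary string and
    lines are the raw alignment lines of the chosen path.
    """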
    # Filter entries by a minimum alignment coverage
    mpa.entries = [
        e for e in mpa.entries
        if e.get_coverage() > args.minimum_alignment_coverage
    ]

    # Find best singles
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion: my_max_intron = -1  # we can look any distance for a group
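    # Build the compatibility graph over the filtered alignments and enumerate
    # candidate multi-alignment paths from it.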
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    allscores = []
    allcov = []
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum(mpa.entries[i].get_coverage() for i in path)
        weightedcov = sum(
            float(mpa.entries[i].get_coverage()) *
            float(mpa.entries[i].get_quality()) for i in path)
        allscores.append(weightedcov)
        allcov.append(totalcov)
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    #if not bestpath: return None
    otherpaths = []
    for i in range(0, len(ps)):
        if i != best_path_index:
            otherpaths.append(ps[i])
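    # For every non-best path, total how much of it overlaps the best path on
    # the query and on the target (reported below as "qcov/tcov" strings).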
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))

    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]
    #print mpa.g.get_status_string()
    #print [mpa.entries[i].get_target_bed().get_range_string() for i in bestpath]
    #print [mpa.entries[i].get_query_bed().get_range_string() for i in bestpath]
    #print [mpa.entries[i].get_quality() for i in bestpath]
    #print [mpa.entries[i].get_coverage() for i in bestpath]
    #print gapsizes
    #print bestscore
    #print bestsinglescore

    # See if we should use the single-path score instead
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
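    # Summarize the chosen path: count distinct target loci (target ranges
    # within args.maximum_intron of each other are merged into one locus) and
    # merge the path's query ranges into a single query span.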
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
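    # Tab-separated report columns: query name, number of alignments in the
    # path, number of target loci, merged query span, per-alignment strands,
    # query ranges, target ranges, qualities, coverages, query gap sizes,
    # total coverage, path score, best single-alignment score, and the
    # query/target overlap of each competing path with the chosen path.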
    report = ''
    report += mpa.entries[bestpath[0]].value('qName') + "\t"
    report += str(len(bestpath)) + "\t"
    report += str(len(loci.loci)) + "\t"
    report += query_span.get_range_string() + "\t"
    report += ','.join([mpa.entries[i].value('strand')
                        for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_query_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_target_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_quality())
                        for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_coverage())
                        for i in bestpath]) + "\t"
    report += ','.join([str(x) for x in gapsizes]) + "\t"
    report += str(besttotalcov) + "\t"
    report += str(bestscore) + "\t"
    report += str(bestsinglescore) + "\t"
    report += ','.join(query_target_coverages) + "\t"
    #if args.best_report:
    #  best_report_fh.write(report+"\n")
    #for i in bestpath:
    #  args.output.write(mpa.entries[i].get_line()+"\n")
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
def main():
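    """Remove positionally duplicate genePred entries and rename redundant
    transcript and gene names, writing the cleaned genePred to --output (or
    STDOUT); report files are written alongside a named output file.
    """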
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of a genePred file that are redundant. "
        "Specify an output file if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o',
        '--output',
        help=
        "OUTPUT FILE; default is STDOUT, but report files are only generated "
        "when an output file is specified"
    )
    parser.add_argument(
        '--minimum_locus_distance',
        type=int,
        default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates',
        action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names',
        action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names',
        action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-': inf = open(args.input)
    of = sys.stdout
    if args.output: of = open(args.output, 'w')
    txdef = {}
    gfams = {}
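    # txdef: exon-structure key (chrom:exonStarts-exonEnds/strand) -> entries
    #        sharing that exact structure
    # gfams: gene name -> list of transcript names belonging to that gene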
    for line in inf:
        if line[0] == '#': continue
        g = GenePredEntry(line)
        loc = (g.value('chrom') + ':' +
               ','.join([str(x) for x in g.value('exonStarts')]) + '-' +
               ','.join([str(x) for x in g.value('exonEnds')]) + '/' +
               g.value('strand'))
        if loc not in txdef:
            txdef[loc] = []
        txdef[loc].append(g)
        if g.value('gene_name') not in gfams: gfams[g.value('gene_name')] = []
        gfams[g.value('gene_name')].append(g.value('name'))
    # now we have cataloged all transcripts by unique locations
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        if args.keep_positional_duplicates:  # We don't want to omit anything here
            for g in txdef[loc]:
                keepers.append(g)
            continue  #basically skipping this part by populating keepers
        num = len(txdef[loc])
        if num > 1:
            sys.stderr.write("Found " + str(num) + " entries at location\n")
            sys.stderr.write(loc + "\n")
            sys.stderr.write("They are:\n")
            largest = 0
            keepgene = None
            keepindex = -1
            i = 0
            for e in txdef[loc]:
                famsize = len(gfams[e.value('gene_name')])
                sys.stderr.write("     " + e.value('gene_name') + "\t" +
                                 e.value('name') + "\t" + str(famsize) + "\n")
                if famsize > largest:
                    keepgene = e
                    largest = famsize
                    keepindex = i
                i += 1
            for j in range(0, len(txdef[loc])):
                if j != keepindex: omissions.append(txdef[loc][j])
                else: keepers.append(txdef[loc][j])
            sys.stderr.write("     Biggest gene family is " +
                             keepgene.value('gene_name') + " with " +
                             str(largest) + " transcripts\n")
            sys.stderr.write("     so keep that one.\n")
        else:
            keepers.append(txdef[loc][0])
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        of1 = open(args.output + '.positional_duplicate_omissions', 'w')
        for g in omissions:
            of1.write(g.get_line() + "\n")
        of1.close()
    # Now keepers contains transcripts with unique locations.
    # Let's provide unique names to the remaining transcripts.
    tnames = {}
    renametx = {}
    for g in keepers:
        tx = g.value('name')
        if tx not in tnames: tnames[tx] = []
        tnames[tx].append(g)
    for name in tnames:
        if args.keep_transcript_names: continue  # We don't want to rename them
        nsize = len(tnames[name])
        if nsize > 1:
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i in range(0, len(tnames[name])):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                tnames[name][i].entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        of1 = open(args.output + '.renamed_transcripts', 'w')
        for name in sorted(renametx.keys()):
            of1.write(name + "\t" + renametx[name] + "\n")
        of1.close()
    #now we need to arrange into gene families
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gene = g.value('gene_name')
            if gene not in gnames: gnames[gene] = []
            gnames[gene].append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        if args.keep_gene_names:
            for g in gnames[gene]:
                finished.append(g)
            continue  # We don't want to rename genes
        if len(gnames[gene]) == 1:
            finished.append(gnames[gene][0])
            continue
        # Now we need to make sure these genes are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in gnames[gene]:
            r = g.locus_range.copy()
            r.set_payload(g)
            loc = Locus()
            loc.add_member(r)
            loci.add_locus(loc)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            for g in gnames[gene]:
                finished.append(g)
            continue
        # need to rename some genes
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    if args.output and not args.keep_gene_names:
        of1 = open(args.output + '.renamed_genes', 'w')
        for name in sorted(renamegene.keys()):
            of1.write(name + "\t" + renamegene[name] + "\n")
        of1.close()
    # Now let's re-sort by gene
    bygene = {}
    for g in finished:
        gene = g.value('gene_name')
        if gene not in bygene: bygene[gene] = []
        bygene[gene].append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    of.close()
    inf.close()
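

# Entry-point guard (an assumption; the surrounding module may already have one):
if __name__ == '__main__':
    main()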