# This excerpt assumes the GenePredEntry, Bed, Locus and Loci classes are
# available from the surrounding project; their module paths are not shown here.
import argparse
import sys


def get_loci(transcripts_genepred):
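    """Group the transcripts of a genePred file into overlapping loci.

    Each transcript becomes a Bed range (chrom, txStart, txEnd) whose payload
    is the transcript name; Loci.update_loci() then merges overlapping ranges
    into loci.

    Returns [locus2name, name2locus], where locus2name maps a 1-based locus
    index to the set of transcript names in that locus and name2locus maps
    each transcript name back to its locus index.
    """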
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            gpd = GenePredEntry(line.rstrip())
            rng = Bed(gpd.value('chrom'), gpd.value('txStart'),
                      gpd.value('txEnd'))
            rng.set_payload(gpd.value('name'))
            loc1 = Locus()
            loc1.add_member(rng)
            loci.add_locus(loc1)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")

    m = 0
    locus2name = {}
    name2locus = {}
    for locus in loci.loci:
        m += 1
        for member in locus.members:
            name = member.get_payload()
            if m not in locus2name: locus2name[m] = set()
            locus2name[m].add(name)
            name2locus[name] = m
    return [locus2name, name2locus]
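

# A minimal usage sketch for get_loci (the file and transcript names below are
# hypothetical):
#   locus2name, name2locus = get_loci('transcripts.gpd')
#   locus_id = name2locus['some_transcript']   # locus containing that transcript
#   siblings = locus2name[locus_id]            # all transcript names at that locus
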
def process_read(mpa, args):
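    """Pick the best-scoring alignment path for one read.

    mpa is expected to hold alignment entries (mpa.entries) that provide
    get_coverage(), get_quality(), get_query_bed(), get_target_bed(),
    query_overlap_size(), target_overlap_size(), value() and get_line(),
    along with compatible_graph() and get_root_paths() for enumerating
    candidate multi-alignment paths.

    Returns None if no entry passes the coverage filter; otherwise returns
    [report, lines], where report is a tab-separated summary string and
    lines are the raw alignment lines of the chosen path.
    """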
    # Filter entries by a minimum alignment coverage
    mpa.entries = [
        e for e in mpa.entries
        if e.get_coverage() > args.minimum_alignment_coverage
    ]

    # Find best singles
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion: my_max_intron = -1  # we can look any distance for a group
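    # Build the compatibility graph over the filtered alignments and enumerate
    # candidate multi-alignment paths from it.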
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    allscores = []
    allcov = []
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum(mpa.entries[i].get_coverage() for i in path)
        weightedcov = sum(
            float(mpa.entries[i].get_coverage()) *
            float(mpa.entries[i].get_quality()) for i in path)
        allscores.append(weightedcov)
        allcov.append(totalcov)
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    #if not bestpath: return None
    otherpaths = []
    for i in range(0, len(ps)):
        if i != best_path_index:
            otherpaths.append(ps[i])
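    # For every non-best path, total how much of it overlaps the best path on
    # the query and on the target (reported below as "qcov/tcov" strings).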
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))

    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]
    #print mpa.g.get_status_string()
    #print [mpa.entries[i].get_target_bed().get_range_string() for i in bestpath]
    #print [mpa.entries[i].get_query_bed().get_range_string() for i in bestpath]
    #print [mpa.entries[i].get_quality() for i in bestpath]
    #print [mpa.entries[i].get_coverage() for i in bestpath]
    #print gapsizes
    #print bestscore
    #print bestsinglescore

    # See if we should use the single-path score instead
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
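    # Summarize the chosen path: count distinct target loci (target ranges
    # within args.maximum_intron of each other are merged into one locus) and
    # merge the path's query ranges into a single query span.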
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
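    # Tab-separated report columns: query name, number of alignments in the
    # path, number of target loci, merged query span, per-alignment strands,
    # query ranges, target ranges, qualities, coverages, query gap sizes,
    # total coverage, path score, best single-alignment score, and the
    # query/target overlap of each competing path with the chosen path.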
    report = ''
    report += mpa.entries[bestpath[0]].value('qName') + "\t"
    report += str(len(bestpath)) + "\t"
    report += str(len(loci.loci)) + "\t"
    report += query_span.get_range_string() + "\t"
    report += ','.join([mpa.entries[i].value('strand')
                        for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_query_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join(
        [mpa.entries[i].get_target_bed().get_range_string()
         for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_quality())
                        for i in bestpath]) + "\t"
    report += ','.join([str(mpa.entries[i].get_coverage())
                        for i in bestpath]) + "\t"
    report += ','.join([str(x) for x in gapsizes]) + "\t"
    report += str(besttotalcov) + "\t"
    report += str(bestscore) + "\t"
    report += str(bestsinglescore) + "\t"
    report += ','.join(query_target_coverages) + "\t"
    #if args.best_report:
    #  best_report_fh.write(report+"\n")
    #for i in bestpath:
    #  args.output.write(mpa.entries[i].get_line()+"\n")
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
def main():
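    """Remove positionally duplicate genePred entries and rename redundant
    transcript and gene names, writing the cleaned genePred to --output (or
    STDOUT); report files are written alongside a named output file.
    """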
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of a genePred file that are redundant. "
        "Specify an output file if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o',
        '--output',
        help=
        "OUTPUT FILE; default is STDOUT, but report files are only generated "
        "when an output file is specified"
    )
    parser.add_argument(
        '--minimum_locus_distance',
        type=int,
        default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates',
        action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names',
        action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names',
        action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-': inf = open(args.input)
    of = sys.stdout
    if args.output: of = open(args.output, 'w')
    txdef = {}
    gfams = {}
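    # txdef: exon-structure key (chrom:exonStarts-exonEnds/strand) -> entries
    #        sharing that exact structure
    # gfams: gene name -> list of transcript names belonging to that gene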
    for line in inf:
        if line[0] == '#': continue
        g = GenePredEntry(line)
        loc = (g.value('chrom') + ':' +
               ','.join([str(x) for x in g.value('exonStarts')]) + '-' +
               ','.join([str(x) for x in g.value('exonEnds')]) + '/' +
               g.value('strand'))
        if loc not in txdef:
            txdef[loc] = []
        txdef[loc].append(g)
        if g.value('gene_name') not in gfams: gfams[g.value('gene_name')] = []
        gfams[g.value('gene_name')].append(g.value('name'))
    # now we have cataloged all transcripts by unique locations
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        if args.keep_positional_duplicates:  # We don't want to omit anything here
            for g in txdef[loc]:
                keepers.append(g)
            continue  #basically skipping this part by populating keepers
        num = len(txdef[loc])
        if num > 1:
            sys.stderr.write("Found " + str(num) + " entries at location\n")
            sys.stderr.write(loc + "\n")
            sys.stderr.write("They are:\n")
            largest = 0
            keepgene = None
            keepindex = -1
            i = 0
            for e in txdef[loc]:
                famsize = len(gfams[e.value('gene_name')])
                sys.stderr.write("     " + e.value('gene_name') + "\t" +
                                 e.value('name') + "\t" + str(famsize) + "\n")
                if famsize > largest:
                    keepgene = e
                    largest = famsize
                    keepindex = i
                i += 1
            for j in range(0, len(txdef[loc])):
                if j != keepindex: omissions.append(txdef[loc][j])
                else: keepers.append(txdef[loc][j])
            sys.stderr.write("     Biggest gene family is " +
                             keepgene.value('gene_name') + " with " +
                             str(largest) + " transcripts\n")
            sys.stderr.write("     so keep that one.\n")
        else:
            keepers.append(txdef[loc][0])
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        of1 = open(args.output + '.positional_duplicate_omissions', 'w')
        for g in omissions:
            of1.write(g.get_line() + "\n")
        of1.close()
    # Now keepers contains transcripts with unique locations.
    # Let's provide unique names to the remaining transcripts.
    tnames = {}
    renametx = {}
    for g in keepers:
        tx = g.value('name')
        if tx not in tnames: tnames[tx] = []
        tnames[tx].append(g)
    for name in tnames:
        if args.keep_transcript_names: continue  # We don't want to rename them
        nsize = len(tnames[name])
        if nsize > 1:
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i in range(0, len(tnames[name])):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                tnames[name][i].entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        of1 = open(args.output + '.renamed_transcripts', 'w')
        for name in sorted(renametx.keys()):
            of1.write(name + "\t" + renametx[name] + "\n")
        of1.close()
    #now we need to arrange into gene families
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gene = g.value('gene_name')
            if gene not in gnames: gnames[gene] = []
            gnames[gene].append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        if args.keep_gene_names:
            for g in gnames[gene]:
                finished.append(g)
            continue  # We don't want to rename genes
        if len(gnames[gene]) == 1:
            finished.append(gnames[gene][0])
            continue
        # Now we need to make sure these genes are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in gnames[gene]:
            r = g.locus_range.copy()
            r.set_payload(g)
            loc = Locus()
            loc.add_member(r)
            loci.add_locus(loc)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            for g in gnames[gene]:
                finished.append(g)
            continue
        # need to rename some genes
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    if args.output and not args.keep_gene_names:
        of1 = open(args.output + '.renamed_genes', 'w')
        for name in sorted(renamegene.keys()):
            of1.write(name + "\t" + renamegene[name] + "\n")
        of1.close()
    # Now let's re-sort by gene
    bygene = {}
    for g in finished:
        gene = g.value('gene_name')
        if gene not in bygene: bygene[gene] = []
        bygene[gene].append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    of.close()
    inf.close()
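

# Entry-point guard (an assumption; the surrounding module may already have one):
if __name__ == '__main__':
    main()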