def test_database(self):
    if find_cmd("sqlite3"):
        with make_workdir() as workdir:
            arg = namedtuple('args', 'debug print_debug json')
            args = arg(True, True, "../../data/examples/seqcluster.json")
            initialize_logger(".", args.debug, args.print_debug)
            logger = mylog.getLogger(__name__)
            logger.info(args)
            logger.info("Reading data")
            data = load_data(args.json)
            logger.info("Create database")
            make_database(data)
def test_database(self):
    if find_cmd("sqlite3"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        arg = namedtuple('args', 'debug print_debug json')
        args = arg(True, True, "seqcluster.json")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Create database")
        make_database(data)
def report(args):
    """
    Create report in html format
    """
    logger.info("Reading sequences")
    data = load_data(args.json)
    logger.info("Create profile")
    data = make_profile(data, os.path.join(args.out, "profiles"), args)
    logger.info("Create database")
    make_database(data, "seqcluster.db", args.out)
    logger.info("Done. Download https://github.com/lpantano/seqclusterViz/archive/master.zip to browse the output.")
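# A minimal sketch (not from the source) of driving report() directly, mirroring
# the namedtuple pattern used in the tests. Only args.json and args.out are read
# by name above; make_profile() also receives the full args object, so a real run
# may need extra fields. The paths here are assumptions for illustration.
from collections import namedtuple

arg = namedtuple('args', 'json out')
report(arg("seqcluster.json", "report_out"))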
def test_predict(self):
    if find_cmd("tRNAscan-SE"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        out_dir = os.path.join(mod_dir, "data/examples/predictions")
        arg = namedtuple('args', 'debug print_debug MIN_SEQ json reference')
        args = arg(True, True, 1, "seqcluster.json", "../genomes/genome.fa")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Start prediction")
        make_predictions(data, out_dir, args)
def test_predict(self):
    if find_cmd("tRNAscan-SE"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        out_dir = os.path.join(mod_dir, "data/examples/test_out_predictions")
        arg = namedtuple('args', 'debug print_debug MIN_SEQ json reference')
        args = arg(True, True, 1, "seqcluster.json", "../genomes/genome.fa")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Start prediction")
        is_tRNA(data, out_dir, args)
def explore(args):
    """Create mapping of sequences of two clusters
    """
    logger.info("Reading sequences")
    data = load_data(args.json)
    logger.info("Get sequences from json")
    c1, c2 = args.names.split(",")
    seqs, names = get_sequences_from_cluster(c1, c2, data[0])
    loci = get_precursors_from_cluster(c1, c2, data[0])
    logger.info("Map all sequences to all loci")
    print("%s" % (loci))
    map_to_precursors(seqs, names, loci, os.path.join(args.out, "map.tsv"), args)
    # map_sequences_w_bowtie(sequences, precursors)
    logger.info("Plot sequences on loci")
    # get_matrix_position()
    # plot_sequences()
    logger.info("Done")
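# A minimal sketch (not from the source) of calling explore(); args.json,
# args.names and args.out are the fields read above. args.names is a
# comma-separated pair of cluster ids; map_to_precursors() also receives the
# full args object, so a real run may need extra fields. Values are assumptions.
from collections import namedtuple

arg = namedtuple('args', 'json names out')
explore(arg("seqcluster.json", "1,2", "explore_out"))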
def predictions(args):
    """
    Create predictions of clusters
    """
    logger.info(args)
    logger.info("Reading sequences")
    out_file = os.path.abspath(os.path.splitext(args.json)[0] + "_prediction.json")
    data = load_data(args.json)
    out_dir = os.path.abspath(safe_dirs(os.path.join(args.out, "predictions")))
    logger.info("Make predictions")
    data = is_tRNA(data, out_dir, args)
    if args.coral:
        logger.info("Make CoRaL predictions")
        run_coral(data, out_dir, args)
    write_data(data[0], out_file)
    logger.info("Done")
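# A minimal sketch (not from the source) of calling predictions(). args.json,
# args.out and args.coral are read directly above; reference and MIN_SEQ mirror
# the args built in test_predict and are assumptions about what is_tRNA() needs.
from collections import namedtuple

arg = namedtuple('args', 'json out coral reference MIN_SEQ')
predictions(arg("seqcluster.json", "pred_out", False, "../genomes/genome.fa", 1))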
def cluster(args):
    """
    Creating clusters
    """
    args = _check_args(args)
    read_stats_file = op.join(args.dir_out, "read_stats.tsv")
    if file_exists(read_stats_file):
        os.remove(read_stats_file)
    bam_file, seq_obj = _clean_alignment(args)

    logger.info("Parsing matrix file")
    seqL, y, l = parse_ma_file(seq_obj, args.ffile)
    # y, l = _total_counts(seqL.keys(), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    # dict views must be materialized with list() for DataFrame under Python 3
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'aligned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    if len(seqL.keys()) < 10:
        logger.error("It seems you have low coverage. Please check your fastq files have enough sequences.")
        raise ValueError("So few sequences.")

    logger.info("Cleaning bam file")
    y, l = _total_counts(list(seqL.keys()), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'cleaned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    clusL = _create_clusters(seqL, bam_file, args)
    y, l = _total_counts(list(clusL.seq.keys()), clusL.seq, aligned=True)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'clusters'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    logger.info("Solving multi-mapping events in the network of clusters")
    clusLred = _cleaning(clusL, args.dir_out)
    y, l = _total_counts(clusLred.clus, seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'meta-cluster'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')
    logger.info("Clusters up to %s" % (len(clusLred.clus.keys())))

    if args.show:
        logger.info("Creating sequences alignment to precursor")
        clusLred = show_seq(clusLred, args.index)

    clusLred = peak_calling(clusLred)
    clusLred = _annotate(args, clusLred)

    logger.info("Creating json and count matrix")
    json_file = _create_json(clusLred, args)
    logger.info("Output file in: %s" % args.dir_out)

    if args.db:
        name = args.db + ".db"
        logger.info("Create database: database/" + name)
        data = load_data(json_file)
        out_dir = op.join(args.dir_out, "database")
        make_database(data, name, out_dir)
    logger.info("Finished")
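# A minimal sketch (assumptions, not from the source) for inspecting the
# read_stats.tsv that cluster() appends to. The file is written with
# header=False, so the column names are supplied manually here; the order
# (sample, counts, step) assumes insertion-ordered DataFrame columns.
import pandas as pd

stats = pd.read_csv("read_stats.tsv", sep="\t",
                    names=["sample", "counts", "step"])
# one row per sample and step -> wide table of counts per pipeline step
print(stats.pivot(index="sample", columns="step", values="counts"))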