Example #1
import sqlite3

# `unique_filename_in`, `new` and `load` come from the surrounding project
# (bein / track helpers) and are assumed to be imported alongside sqlite3.
def fix_sqlite_db(database):
    db      = sqlite3.connect(database)
    cursor  = db.cursor()
    chr_name = cursor.execute("SELECT name FROM chrNames")
    for chromosome in chr_name.fetchall():
        # Table names cannot be bound as SQL parameters, hence the concatenation.
        exist = cursor.execute("SELECT COUNT(name) FROM sqlite_master WHERE type='table' AND name='" + chromosome[0] + "';").fetchone()[0]
        if exist == 0:
            # Drop chromosome entries that have no backing data table.
            cursor.execute("DELETE FROM chrNames WHERE name=?", (chromosome[0],))
    #~ result  = cursor.execute("SELECT name FROM chrNames")
    #~ for i in result.fetchall():
        #~ cursor.execute("CREATE TABLE IF NOT EXISTS '"+i[0]+"'(start INTEGER, end INTEGER, score REAL);")
    db.commit()
    cursor.close()
    db.close()
    track_scanned_signal_tmp = unique_filename_in()
    track_scanned_signal = unique_filename_in()
    # temporary result, possibly containing duplicate elements
    with new(track_scanned_signal_tmp, format = "sql", datatype = "quantitative") as t:
        with load(database) as b:
            t.chrmeta = b.chrmeta
            for chrom in b:
                for value in b.read(chrom, fields=['start','end','score','strand']):
                    if value[3] == 1:
                        # Mirror the interval across the chromosome length
                        # (t.chrmeta[chrom] is assumed to hold that length here).
                        t.write(chrom, ((t.chrmeta[chrom] - (value[1] + 1), t.chrmeta[chrom] - value[0], value[2]),))
                    else:
                        t.write(chrom, (value[0:3],))
    # final result without duplicates: when duplicates exist, keep the higher score
    with new(track_scanned_signal, format = "sql", datatype = "quantitative") as t:
        with load(database) as b:
            t.chrmeta = b.chrmeta
    db1     = sqlite3.connect(track_scanned_signal)
    db2     = sqlite3.connect(track_scanned_signal_tmp)
    cursor1 = db1.cursor()
    cursor2 = db2.cursor()
    chr_name = cursor2.execute("SELECT name FROM chrNames")
    for chromosome in chr_name.fetchall():
        cursor1.execute("CREATE TABLE '"+chromosome[0]+"' (start INTEGER, end INTEGER, score REAL);")
        #~ values = cursor2.execute("SELECT DISTINCT start, end, score FROM '"+chromosome[0]+"'")
        # For each start position, keep only the row carrying the maximum score.
        values = cursor2.execute("""
            SELECT t.start, t.end, t.score FROM '""" + chromosome[0] + """' t INNER JOIN (
                SELECT start, end, MAX(score) AS MAXSCORE FROM '""" + chromosome[0] + """' GROUP BY start
            ) groupedt ON t.start = groupedt.start AND t.score = MAXSCORE;
        """)
        for v in values.fetchall():
            cursor1.execute("INSERT INTO '"+chromosome[0]+"' VALUES (?,?,?) ", (v[0], v[1], v[2],) )
    db1.commit()
    db2.commit()
    cursor1.close()
    cursor2.close()
    db1.close()
    db2.close()
    return track_scanned_signal
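
A minimal usage sketch (the filename below is hypothetical; any quantitative sqlite track produced by the scanning step would do):

# Hypothetical call: deduplicate a scanned track and get the cleaned copy back.
cleaned_track = fix_sqlite_db("scanned_track.sql")
print "cleaned track written to", cleaned_track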
Example #2
# Relies on the script's module-level imports (sys, getopt, logging, urllib2,
# the os.path helpers, and the bein / bbcflib APIs used below).
def main(argv = None):
    """
    Entry point when the program starts
    """
    genrep              = None
    assembly            = None
    lims                = None
    job                 = None
    config              = None
    config_file         = None
    background          = ""
    matrix              = ""
    original_sql_data   = ""
    random_sql_data     = ""
    track_filtered      = ""
    track_scanned       = ""
    project             = ""
    username            = ""
    identity_file       = ""
    host                = ""
    website             = ""
    remote_path         = ""
    result_path         = ""
    track_regions_path  = ""
    via                 = ""
    limspath            = ""
    fdr                 = 0
    runs                = {}
    logging.basicConfig(filename='run_scanning.log',level=logging.INFO)
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                                            argv[1:], "hu:c:",
                                            [
                                                "help", "via=", "host=",
                                                "remote_path=", "website=",
                                                "minilims=", "config=",
                                                "matrix=", "username=",
                                                "identity_file=", "project="
                                            ]
                                        )
        except getopt.error, msg:
            raise Usage(msg)
        for option, value in opts:
            if option in ("-h", "--help"):
                print __doc__
                print USAGE
                sys.exit(0)
            elif option == "--via":
                if value == "local":
                    via = "local"
                elif value == "lsf":
                    via = "lsf"
                else:
                    raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (value,))
            elif option == "--website":
                website = normalize_url(value)
            elif option == "--minilims":
                limspath = normcase(expanduser(value))
            elif option == "--host":
                host = value
            elif option == "--identity_file":
                identity_file = value
            elif option == "--remote_path":
                remote_path = normcase(expanduser(value))
                if not remote_path.endswith(sep):
                    remote_path += sep
            elif option == "--matrix":
                matrix = {basename(value):normcase(expanduser(value))}
            elif option == "--username":
                username = value
            elif option == "--project":
                project = value
            elif option in ("-c", "--config"):
                config_file = normcase(expanduser(value))
            else:
                raise Usage("Unhandled option: " + option)

        # read config file
        if config_file is None or not exists(config_file) or not isfile(config_file):
            raise Usage("Config file missing")
        else:
            job, config = parseConfig(normcase(expanduser(config_file)))

        if project == "":
            project = job.description
        if matrix == "":
            if "matrix" in job.options:
                path = normcase(expanduser(job.options["matrix"]))
                matrix = {basename(path): path}
            else:
                raise Usage("You need give value matrix file ")
        if limspath == "":
            if "minilims" in job.options:
                limspath = job.options["minilims"]
            else:
                raise Usage("You need give value minilims path/name")
        if via == "":
            if "via" in job.options:
                via = job.options["via"]
            else:
                via = "lsf"
        if host == "" and "host" in job.options:
            host = job.options["host"]
        if identity_file == "" and "identity_file" in job.options:
            identity_file = job.options["identity_file"]
        if remote_path == "" and "remote_path" in job.options:
            remote_path = job.options["remote_path"]
        if username == "" and "username" in job.options:
            username = job.options["username"]
        if website == "" and "website" in job.options:
            website = job.options["website"]

        genrep      = GenRep(config = config)
        assembly    = genrep.assembly(job.assembly_id)
        lims        = MiniLIMS(limspath)
        json        = create_gdv_project(
                                            config["gdv"]["key"], config["gdv"]["email"],
                                            project,
                                            assembly.nr_assembly_id,
                                            config["gdv"]["url"],
                                            public = True
                                        )
        project_id  = get_project_id( json )
        # compute false discovery rate
        with execution(lims, description = job.description) as ex:
            background = genrep.statistics  (
                                                assembly,
                                                output = unique_filename_in(),
                                                frequency = True,
                                                matrix_format = True
                                            )
            if len(job.groups) > 2:
                raise ValueError("There are more than 2 groups in the config file")

            for group_number in job.groups:
                group = job.groups[group_number]
                for run_number in group["runs"]:
                    run_iter = job.groups[group_number]["runs"][run_number]
                    if "url" in run_iter:
                        url = run_iter["url"]
                        uri = ""
                        if run_iter["run"] not in runs:
                            runs[run_iter["run"]] = {"name":None, "control":None, "experimental":None}
                        if url.startswith("http") or url.startswith("www."):
                            url = normalize_url(url)
                            # download data
                            data    = urllib2.urlopen(url)
                            uri     = unique_filename_in()
                            with open(uri, "w") as opening_file:
                                opening_file.write(data.read())
                        else:
                            uri = normcase(expanduser(url))
                        if group["control"]:
                            runs[run_iter["run"]]["control"]   = uri
                            runs[run_iter["run"]]["name"]      = basename(uri)
                        else:
                            runs[run_iter["run"]]["experimental"] = uri

            for run in runs:
                current_run         = runs[run]
                original_sql_data   = unique_filename_in()
                random_sql_data     = unique_filename_in()
                track_filtered      = unique_filename_in()
                logging.info( "[%s]" % job.description )
                logging.info( "alias %s => %s" % (current_run["experimental"], track_filtered) )

                # convert data to sql
                with Track(current_run["experimental"], chrmeta = assembly.chromosomes) as track:
                    # Get sqlite file if is not arleady in this format
                    if track.format != "sql" or track.format != "db" or track.format != "sqlite":
                        track.convert(original_sql_data, format = "sql")
                    else:
                        original_sql_data = current_run["experimental"]
                    # Generate a random population from the original if none is given in the config file
                    if current_run["control"] is None:
                        # create random track
                        track.shuffle_track(random_sql_data, repeat_number = 5)
                    else:
                        with Track(current_run["control"], chrmeta = assembly.chromosomes) as track_random:
                            # Get sqlite file if is not arleady in this format
                            if track_random.format != "sql" or \
                                track_random.format != "db" or \
                                track_random.format != "sqlite":
                                track_random.convert(random_sql_data, format = "sql")
                            else:
                                random_sql_data = current_run["control"]
                track_scanned, fdr, p_value = sqlite_to_false_discovery_rate(
                                                                                ex,
                                                                                matrix,
                                                                                background,
                                                                                genrep,
                                                                                assembly.chromosomes,
                                                                                original_sql_data,
                                                                                random_sql_data,
                                                                                threshold = -100,
                                                                                via = via,
                                                                                keep_max_only = False,
                                                                                alpha = 0.05,
                                                                                nb_sample = 5.0
                                                                            )

                # filter the track using the fdr as threshold
                with new(track_filtered, format = "sql", datatype = "qualitative") as track_out:
                    chromosome_used     = {}
                    track_out.meta_track = {"source": basename(current_run["experimental"])}
                    track_out.meta_track.update({"k":"v"})
                    with Track(track_scanned, format = "sql", chrmeta = assembly.chromosomes) as track_in:
                        meta = dict([(v["name"], dict([("length", v["length"])])) for v in track_in.chrmeta.values()])
                        for chromosome in track_in.all_chrs:
                            data_list = []
                            for data in track_in.read   (
                                                            {"chr": chromosome, "score": (fdr, sys.maxsize)},
                                                            fields = Track.qualitative_fields
                                                        ):
                                data_list.append(data)
                                chromosome_used[chromosome] = meta[chromosome]
                            if len(data_list) > 0:
                                track_out.write(chromosome, data_list)
                        track_out.chrmeta = chromosome_used
                ex.add(track_filtered,      "sql: filtered %s" % track_filtered)
                logging.info( "scanned: %s" % track_scanned )
                logging.info( "score selected: %f with p: %.3f" % (fdr, p_value) )
                logging.info( "filtred: %s" % track_filtered )


                # fix track
                track_scanned_signal = fix_sqlite_db(track_scanned)
                logging.info( "scanned signal: %s" % track_scanned_signal )
                ex.add(track_scanned_signal, description="%s: sql track signal %s" % (job.description, track_scanned_signal))

                # send the filtered track and the scanned track to the remote host
                if host != "" and remote_path != "" and username != "":
                    args = []
                    if identity_file != "":
                        args = ["-i", normcase(expanduser(identity_file)), "-C" ]
                    source_filtered     = normcase(expanduser(track_filtered))
                    source_scanned      = normcase(expanduser(track_scanned_signal))
                    result_destination          = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered)
                    result_path                 = "%s%s%s.db" % (website, sep, track_filtered)
                    track_regions_destination   = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal)
                    track_regions_path          = "%s%s%s.db" % (website, sep, track_scanned_signal)
                    scp(ex, source_filtered, result_destination, args = args)
                    scp(ex, source_scanned, track_regions_destination, args = args)
                else:
                    result_path = track_filtered
                # Send the filtered track to GDV
                add_gdv_track  (
                                    config["gdv"]["key"], config["gdv"]["email"],
                                    project_id, result_path,
                                    name    = "filtred_%s" % (splitext( basename( current_run["experimental"] ) )[0]),
                                    gdv_url = config["gdv"]["url"]
                                )
                # Send the scanned track to GDV
                add_gdv_track  (
                                    config["gdv"]["key"], config["gdv"]["email"],
                                    project_id, track_regions_path,
                                    name    = "regions_%s" % (splitext( basename( current_run["experimental"] ) )[0]),
                                    gdv_url = config["gdv"]["url"]
                                )
                logging.info( "++++++++++++")
            logging.info( "-------------------END--------------------")
Example #3
def restore(self, ex, filename):
    # Copy a memoized file from the LIMS memopad back into the execution's
    # working directory under a fresh unique name (uses os and shutil).
    target_filename = unique_filename_in(ex.working_directory)
    shutil.copyfile(os.path.join(ex.lims.memopad_path, filename),
                    os.path.join(ex.working_directory, target_filename))
    return target_filename
Example #4
def serialize(self, ex, value):
    # Copy the produced file from the working directory into the LIMS
    # memopad under a fresh unique name, and return that name.
    file_to_copy = os.path.join(ex.working_directory, value)
    target_filename = unique_filename_in(ex.lims.memopad_path)
    shutil.copyfile(file_to_copy,
                    os.path.join(ex.lims.memopad_path, target_filename))
    return target_filename
Example #5
def serialize(self, ex, value):
    # Pickle an arbitrary value into the LIMS memopad under a fresh unique
    # name ('wb' would be safer than 'w' for binary pickle protocols).
    pickle_filename = unique_filename_in(ex.lims.memopad_path)
    with open(os.path.join(ex.lims.memopad_path, pickle_filename),
              'w') as pickle_file:
        cPickle.dump(value, pickle_file)
    return pickle_filename
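
For symmetry, a minimal sketch of the matching restore for this pickled value (an assumed counterpart, not part of the original snippet):

def restore(self, ex, filename):
    # Hypothetical counterpart: load the pickled value back from the memopad.
    with open(os.path.join(ex.lims.memopad_path, filename)) as pickle_file:
        return cPickle.load(pickle_file)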