Example #1
def rsync_files(to_copy, logfile, group, dry):
    # Iterate over the files to copy and create directories and copy files as necessary
    successful = 0
    uid = os.getuid()
    gid = os.getgid()
    if group is not None and len(group) > 0:
        gid = grp.getgrnam(group).gr_gid
    for src_file, dst_dir, dst_name in to_copy:
        dst_file = os.path.join(dst_dir, dst_name)
        print "Will copy (rsync) ", src_file, "to ", dst_file
        if not dry:

            # Create the destination directory if necessary
            logfile.write("[{:s}] - Creating run-level delivery directory: {:s} " \
                          "(or leaving it in place if already present)\n".format(utc_time(),
                                                                                 dst_dir))
            if os.path.exists(dst_dir):
                print("Directory {:s} already exists!".format(dst_dir))
            else:
                try:
                    # Create directory hierarchy with ug+rwX permissions
                    os.makedirs(dst_dir, 0o770)
                    os.chown(dst_dir, uid, gid)
                except OSError:
                    print("Could not create run-level delivery directory!")
                    clean_exit(1, logfile, dry)

            # Rsync the file across
            command_to_execute = ['rsync', '-ac', src_file, dst_file]

            logfile.write("[{:s}] - Executing command: {:s}\n".format(
                utc_time(), " ".join(command_to_execute)))
            logfile.flush()
            try:
                check_call(command_to_execute)
            except CalledProcessError as e:
                logfile.write(
                    "[{:s}] - rsync exited with exit code {:d}\n".format(
                        utc_time(), e.returncode))
                raise

            logfile.write("[{:s}] - rsync exited with exit code 0\n".format(
                utc_time()))
            successful += 1

            print("{:d} of {:d} files copied successfully".format(
                successful, len(to_copy)))

            # Modify the permissions to ug+rw
            os.chown(dst_file, uid, gid)
            os.chmod(dst_file,
                     stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
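All of the examples on this page log timestamps via a utc_time() helper that the snippets themselves do not define. A minimal sketch of such a helper, assuming it returns an ISO 8601 UTC timestamp string (the actual scilifelab implementation may differ):

from datetime import datetime

def utc_time():
    """Return the current UTC time as an ISO 8601 string, e.g. 2012-08-24T12:00:00Z."""
    return datetime.utcnow().isoformat() + "Z"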
Example #2
 def __init__(self, **kw):
     self["_id"] = kw.get("_id", uuid4().hex)
     self["entity_type"] = self._entity_type
     self["name"] = kw.get("name", None)
     self["creation_time"] = kw.get("creation_time", utc_time())
     self["modification_time"] = kw.get("modification_time", utc_time())
     for f in self._fields:
         self[f] = kw.get(f, None)
     for f in self._dict_fields:
         self[f] = kw.get(f, {})
     for f in self._list_fields:
         self[f] = kw.get(f, [])  # list fields should default to an empty list, not {}
     self = _update(self, kw)  # _update presumably mutates in place; rebinding the local name has no external effect
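This constructor evidently belongs to a dict subclass that declares its schema as class attributes. A minimal sketch of the surrounding context, with illustrative class, field, and helper names that are assumptions rather than the actual scilifelab definitions (timestamp fields omitted for brevity):

from uuid import uuid4

def _update(d, kw):
    # assumed behavior: fold any extra keyword arguments into the document
    d.update(kw)
    return d

class SampleDocument(dict):
    # illustrative schema
    _entity_type = "sample_run_metrics"
    _fields = ["project"]        # scalar fields, default None
    _dict_fields = ["metrics"]   # mapping fields, default {}
    _list_fields = ["lanes"]     # sequence fields, default []

    def __init__(self, **kw):
        self["_id"] = kw.get("_id", uuid4().hex)
        self["entity_type"] = self._entity_type
        self["name"] = kw.get("name", None)
        for f in self._fields:
            self[f] = kw.get(f, None)
        for f in self._dict_fields:
            self[f] = kw.get(f, {})
        for f in self._list_fields:
            self[f] = kw.get(f, [])
        _update(self, kw)

doc = SampleDocument(name="P001_101", lanes=["1", "2"])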
Example #3
def update_fn(db, obj):
    t_utc = utc_time()
    def equal(a, b):
        a_keys = [str(x) for x in a.keys() if x not in ["_id", "_rev", "creation_time", "modification_time"]]
        b_keys = [str(x) for x in b.keys() if x not in ["_id", "_rev", "creation_time", "modification_time"]]
        keys = list(set(a_keys + b_keys))
        return {k:a.get(k, None) for k in keys} == {k:b.get(k, None) for k in keys}

    # Both document types map to the same view; anything else would leave
    # `view` undefined below, so fail loudly instead
    if isinstance(obj, (FlowcellRunMetrics, SampleRunMetrics)):
        view = db.view("names/id_to_name")
    else:
        raise TypeError("unsupported object type: {}".format(type(obj).__name__))

    d_view = {k.value: k for k in view}
    dbid = d_view.get(obj["name"], None)
    dbobj = None
    if dbid:
        dbobj = db.get(dbid.id, None)
    if dbobj is None:
        obj["creation_time"] = t_utc
        return obj
    if equal(obj, dbobj):
        return None
    else:
        obj["creation_time"] = dbobj.get("creation_time")
        obj["modification_time"] = t_utc
        obj["_rev"] = dbobj.get("_rev")
        obj["_id"] = dbobj.get("_id")
        return obj
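The nested equal() treats two documents as the same when they agree on every key except the CouchDB bookkeeping fields (_id, _rev) and the timestamps. An illustrative check of those semantics, assuming equal were lifted to module level:

a = {"_id": "1", "_rev": "r1", "creation_time": "t0", "name": "s1", "lane": "1"}
b = {"_id": "2", "_rev": "r2", "creation_time": "t1", "name": "s1", "lane": "1"}
assert equal(a, b)        # only ignored keys differ
b["lane"] = "2"
assert not equal(a, b)    # a substantive key now differs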
Example #4
 def remove_finished(self):
     if not self._check_pargs(["project"]):
         return
     # Don't filter out files
     def filter_fn(f):
         return True
     slist = os.listdir(os.path.join(self._meta.root_path, self._meta.path_id))
     for s in slist:
         spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
         if not os.path.isdir(spath):
             continue
         if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
             self.app.log.info("Sample {} not finished; skipping".format(s))
             continue
         if os.path.exists(os.path.join(spath, REMOVED_FILE)):
             self.app.log.info("Sample {} already removed; skipping".format(s))
             continue
         flist = filtered_walk(spath, filter_fn)
         dlist = filtered_walk(spath, filter_fn, get_dirs=True)
         if len(flist) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(s, len(flist)), force=self.pargs.force):
             continue
         self.app.log.info("Removing {} files from {}".format(len(flist), spath))            
         for f in flist:
             if f == os.path.join(spath, FINISHED_FILE):
                 continue
             self.app.cmd.safe_unlink(f)
         self.app.log.info("Removing {} directories from {}".format(len(dlist), spath))
         for d in sorted(dlist, reverse=True):
             self.app.cmd.safe_rmdir(d)
         if not self.pargs.dry_run:
             with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                 t_utc = utc_time()
                 fh.write(t_utc)
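FINISHED_FILE and REMOVED_FILE are module-level constants naming the per-sample flag files that this method and the next one read and write. A plausible sketch, assuming simple marker-file names (the actual scilifelab values may differ):

FINISHED_FILE = "FINISHED_AND_DELIVERED"
REMOVED_FILE = "FINISHED_AND_REMOVED"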
Example #5
 def touch_finished(self):
     if not self._check_pargs(["project", "sample"]):
         return
     if os.path.exists(self.pargs.sample) and os.path.isfile(self.pargs.sample):
         with open(self.pargs.sample) as fh:
             slist = [x.rstrip() for x in fh.readlines()]
     else:
         slist = [self.pargs.sample]
     for s in slist:
         spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
         if not os.path.exists(spath):
             self.app.log.warn("No such path {}; skipping".format(spath))
             continue
         rsync_src = os.path.join(self._meta.root_path, self._meta.path_id, s) + os.sep
         rsync_tgt = os.path.join(self.app.config.get("runqc", "root"), self.pargs.project, s) + os.sep
         cl = ["rsync {} {} {}".format(self.app.config.get("runqc", "rsync_sample_opts"), rsync_src, rsync_tgt)]
         self.app.log.info("Checking if runqc uptodate with command '{}'".format(" ".join(cl)))
         out = self.app.cmd.command(cl, **{'shell':True})
         if not self.pargs.dry_run and not out.find("total size is 0"):
             self.app.log.info("Some files need to be updated. Rsync output:")
             print "********"
             print out
             print "********"
             continue
         if not query_yes_no("Going to touch file {} for sample {}; continue?".format(FINISHED_FILE, s), force=self.pargs.force):
             continue
         self.app.log.info("Touching file {} for sample {}".format(FINISHED_FILE, s))
         with open(os.path.join(spath, FINISHED_FILE), "w") as fh:
             t_utc = utc_time()
             fh.write(t_utc)
Example #6
def update_fn(cls, db, obj, viewname="names/id_to_name", key="name"):
    """Compare object with object in db if present.

    :param cls: calling class
    :param db: couch database
    :param obj: database object to save

    :returns: database object to save and database id if present
    """
    t_utc = utc_time()
    def equal(a, b):
        a_keys = [str(x) for x in a.keys() if x not in ["_id", "_rev", "creation_time", "modification_time"]]
        b_keys = [str(x) for x in b.keys() if x not in ["_id", "_rev", "creation_time", "modification_time"]]
        keys = list(set(a_keys + b_keys))
        return {k:a.get(k, None) for k in keys} == {k:b.get(k, None) for k in keys}

    view = db.view(viewname)
    d_view = {k.value: k for k in view}
    dbid = d_view.get(obj[key], None)
    dbobj = None

    if dbid:
        dbobj = db.get(dbid.id, None)
    if dbobj is None:
        obj["creation_time"] = t_utc
        return (obj, dbid)
    if equal(obj, dbobj):
        return (None, dbid)
    else:
        obj["creation_time"] = dbobj.get("creation_time")
        obj["modification_time"] = t_utc
        obj["_rev"] = dbobj.get("_rev")
        obj["_id"] = dbobj.get("_id")
        return (obj, dbid)
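A sketch of how a caller might consume this (obj, dbid) convention, assuming a couchdb-python style database object with a save method (the actual calling code in scilifelab may differ):

obj, dbid = update_fn(cls, db, new_doc)
if obj is None:
    pass  # document already in the database and unchanged; nothing to save
else:
    # either a brand-new document (creation_time just set) or an update
    # carrying the _id/_rev of the existing database document
    db.save(obj)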
Example #7
def update_fn(cls, db, obj, viewname="names/id_to_name", key="name"):
    """Compare object with object in db if present.

    :param cls: calling class
    :param db: couch database
    :param obj: database object to save

    :returns: database object to save and database id if present
    """
    t_utc = utc_time()

    def equal(a, b):
        ignore = ["_id", "_rev", "creation_time", "modification_time"]
        a_keys = [str(x) for x in a.keys() if x not in ignore]
        b_keys = [str(x) for x in b.keys() if x not in ignore]
        keys = list(set(a_keys + b_keys))
        return {k: a.get(k, None) for k in keys} == {k: b.get(k, None) for k in keys}

    view = db.view(viewname)
    d_view = {k.value: k for k in view}
    dbid = d_view.get(obj[key], None)
    dbobj = None

    if dbid:
        dbobj = db.get(dbid.id, None)
    if dbobj is None:
        obj["creation_time"] = t_utc
        return (obj, dbid)
    if equal(obj, dbobj):
        return (None, dbid)
    else:
        # Merge the newly created object with the one found in the database,
        # letting values from the new object replace the database values
        # whenever the same key is present in both
        merge(obj, dbobj)
        # We need the original times and id from the DB object though
        obj["creation_time"] = dbobj.get("creation_time")
        obj["modification_time"] = t_utc
        obj["_rev"] = dbobj.get("_rev")
        obj["_id"] = dbobj.get("_id")
        return (obj, dbid)
Example #8
def main():
    parser = argparse.ArgumentParser(description="A script to help with the deliveries, now using the Casava directory structure. " \
                                     "The user is asked to provide a project ID, a run name, and an UPPMAX project")

    parser.add_argument('-c',
                        '--casava-path',
                        action="store",
                        dest="caspath",
                        default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l',
                        '--log-path',
                        action="store",
                        dest="logpath",
                        default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i',
                        '--interactive',
                        action="store_true",
                        dest="interactive",
                        default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d',
                        '--dry-run',
                        action="store_true",
                        dest="dry",
                        default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a',
                        '--deliver-all-fcs',
                        action="store_true",
                        dest="deliver_all_fcs",
                        default=False,
                        help="rsync samples from all flow cells. Default is to "
                             "only deliver from the specified flowcell")
    parser.add_argument('-p',
                        '--nophix',
                        action="store_true",
                        dest="deliver_nophix",
                        default=False,
                        help="Deliver fastq files from the nophix subdirectory. "
                             "Default is to deliver from the run directory")
    parser.add_argument('-g',
                        '--group',
                        action="store",
                        dest="group",
                        default="uppmax",
                        help="Group membership to set on copied files")
    parser.add_argument('project_name',
                        action='store',
                        help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id',
                        action='store',
                        help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id',
                        action='store',
                        help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()

    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)

    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))

    dt = datetime.now()
    time_str = "_".join([
        str(dt.year),
        str(dt.month),
        str(dt.day),
        str(dt.hour),
        str(dt.minute),
        str(dt.second)
    ])

    logfilename = os.path.join(os.path.normpath(args.logpath),
                               "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout

    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(
        utc_time(), args.project_name))
    logfile.flush()

    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir),
                                default="no"):
                skip_list.append(sample_dir)

    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name

    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid,
                                 args.deliver_all_fcs, args.deliver_nophix,
                                 skip_list)

    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)

    rsync_files(to_copy, logfile, args.group, args.dry)

    clean_exit(0, logfile, args.dry)
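The time_str assembled with str() and "_".join above yields unpadded components (e.g. 2012_8_4_9_5_7). If zero-padded fields are acceptable for the log file name, strftime expresses the same thing in one call:

time_str = dt.strftime("%Y_%m_%d_%H_%M_%S")  # e.g. 2012_08_04_09_05_07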
Example #9
File: deliver.py Project: ewels/scilifelab
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                           sample_prj=self.pargs.project),
                         key=lambda k:
                         (k.get('project_sample_name', 'NA'),
                          k.get('flowcell', 'NA'), k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnex_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnex_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info(
                    "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                    .format(sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn(
                "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
                .format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"),
                sample.get("flowcell", "NA")))

            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            # write the md5sum to a file at the destination and verify the transfer
            passed = True
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(
                    mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))

                # if dry-run, make sure verification pass
                if self.pargs.dry_run:
                    dm = m
                else:
                    dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
                if m != dm:
                    self.log.warn(
                        "md5sum verification FAILED for {}. Source: {}, Target: {}"
                        .format(dstfile, m, dm))
                    self.log.warn(
                        "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                        .format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(
                        f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath,
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(
                    os.path.dirname(md5[0][3]),
                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                        sample.get("date"), sample.get("flowcell"),
                        sample.get("project_sample_name"),
                        sample.get("sequence"), sample.get("lane")))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
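This example and several of the following ones verify transfers with an md5sum() helper that the page does not show. A minimal sketch of such a helper, assuming it returns the hex digest of a file read in blocks (the actual scilifelab utility may differ):

import hashlib

def md5sum(path, blocksize=1024 * 1024):
    """Return the hex md5 digest of the file at path, reading in blocks."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()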
Example #10
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error("Uppmax project was not specified and could not be fetched from project database")
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project), key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
        assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
        assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
        destination_root = os.path.join(destination_root,self.pargs.project)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname,
                                                                                                           index,
                                                                                                           fcid,
                                                                                                           lane,
                                                                                                           date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
        if len(uncompressed) > 0:
            self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir,
                                          destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for f in files:
                    m = md5sum(f[0])
                    mfile = "{}.md5".format(f[1])
                    md5.append([m,mfile,f[2],f[0]])
                    self.log.debug("md5sum for source file {}: {}".format(f[0],m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, read, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile,"{}  {}".format(m,os.path.basename(dstfile)),True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info("Logging delivery to StatusDB document {}".format(id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath,
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                        "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                       sample.get("flowcell"),
                                                                                       sample.get("project_sample_name"),
                                                                                       sample.get("sequence"),
                                                                                       sample.get("lane")))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
                self.log.debug("Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con,sample)
                self.log.debug(jsonstr)
Example #11
def main():
    parser = argparse.ArgumentParser(description="A script to help with the deliveries, now using the Casava directory structure. " \
                                     "The user is asked to provide a project ID, a run name, and an UPPMAX project")

    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/', 
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs', 
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False, 
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False, 
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False, 
                        help="rsync samples from all flow cells. Default is to only deliver from specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False, 
                        help="Deliver fastq files from nophix subdirectory. Default is to deliver from run directory")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()

    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)

    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))

    dt = datetime.now()
    time_str = "_".join([str(dt.year),
                         str(dt.month),
                         str(dt.day),
                         str(dt.hour),
                         str(dt.minute),
                         str(dt.second)])

    logfilename = os.path.join(os.path.normpath(args.logpath),"{:s}.log".format(time_str)) 
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
         
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(utc_time(), args.project_name))
    logfile.flush()

    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir,sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)
    
    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name

    to_copy = get_file_copy_list(proj_base_dir,
                                 del_path_top,
                                 fcid,
                                 args.deliver_all_fcs,
                                 args.deliver_nophix,
                                 skip_list)
    
    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)
            
    # NB: this earlier variant of rsync_files takes no group argument
    # (compare the four-argument definition in Example #1)
    rsync_files(to_copy, logfile, args.dry)

    clean_exit(0, logfile, args.dry)
Example #12
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project database"
        self.log.debug("Connecting to flowcell database")
        f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
        assert f_con, "Could not get connection to flowcell database"
        self.log.debug("Connecting to x_flowcell database")
        x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
        assert x_con, "Could not get connection to x_flowcell database"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Setup paths and verify parameters
        self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
            "production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir=proj_base_dir,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)
        if len(uncompressed) > 0:
            self.log.error(
                "There are uncompressed fastq files for the project; please "
                "check that all files are properly compressed before delivery")
            return

        # Extract the list of samples and runs associated with the project and sort them
        samples = self.samples_to_copy(
            pid=p_con.get_entry(self.pargs.project, "project_id"),
            pod=p_con.get_entry(self.pargs.project, "open_date"),
            fc_dict={
                'HiSeq2500': f_con.proj_list,
                'HiSeqX': x_con.proj_list
            },
            proj_base_dir=proj_base_dir,
            destination_root=destination_root,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = {}
            for sample in samples:
                if query_yes_no("Deliver sample {} ?".format(sample),
                                default="no"):
                    to_process[sample] = samples[sample]
            samples = to_process

        if self.pargs.sample:
            sample = samples.get(self.pargs.sample)
            if not sample:
                self.log.error(
                    "There is no such sample {} for project {}".format(
                        self.pargs.sample, self.pargs.project))
                return
            samples = {self.pargs.sample: sample}

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample
        for sample, flowcells in samples.items():
            for fc, files in flowcells.items():
                self.log.info("Processing sample {} and flowcell {}".format(
                    sample, fc))

                # transfer files
                self.log.debug("Transferring {} fastq files".format(
                    len(files['src'])))
                self._transfer_files(sources=files['src'],
                                     targets=files['dst'])

                passed = True
                if self.pargs.link or self.pargs.dry_run:
                    passed = False
                else:
                    # calculate md5sums on the source side and write it on the destination
                    md5 = []
                    for s, d in zip(files['src'], files['dst']):
                        m = md5sum(s)
                        mfile = "{}.md5".format(d)
                        md5.append([m, mfile, s])
                        self.log.debug("md5sum for source file {}: {}".format(
                            s, m))

                    # write the md5sum to a file at the destination and verify the transfer
                    for m, mfile, srcpath in md5:
                        dstfile = os.path.splitext(mfile)[0]
                        self.log.debug(
                            "Writing md5sum to file {}".format(mfile))
                        self.app.cmd.write(
                            mfile, "{}  {}".format(m,
                                                   os.path.basename(dstfile)),
                            True)
                        self.log.debug(
                            "Verifying md5sum for file {}".format(dstfile))
                        dm = md5sum(dstfile)
                        self.log.debug(
                            "md5sum for destination file {}: {}".format(
                                dstfile, dm))
                        if m != dm:
                            self.log.warn(
                                "md5sum verification FAILED for {}. Source: {}, Target: {}"
                                .format(dstfile, m, dm))
                            self.log.warn(
                                "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                                .format(dstfile))
                            self.app.cmd.safe_unlink(dstfile)
                            self.app.cmd.safe_unlink(mfile)
                            passed = False
                            continue

                        # Modify the permissions to ug+rw
                        for f in [dstfile, mfile]:
                            self.app.cmd.chmod(
                                f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                                | stat.S_IWGRP)

                # touch the flag to trigger uppmax inbox permission fix
                self.app.cmd.safe_touchfile(
                    os.path.join("/sw", "uppmax", "var", "inboxfix",
                                 "schedule", self.pargs.uppmax_project))

                # log the transfer to statusdb if verification passed
                if passed:
                    data = {
                        'raw_data_delivery': {
                            'timestamp': utc_time(),
                            'files': {
                                os.path.splitext(os.path.basename(srcpath))[0]: {
                                    'md5': m,
                                    'path': os.path.splitext(mfile)[0],
                                    'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                    'source_location': srcpath,
                                }
                                for m, mfile, srcpath in md5
                            }
                        }
                    }
                    jsonstr = json.dumps(data)
                    jsonfile = os.path.join(
                        proj_base_dir, sample, fc,
                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                    self.log.debug(
                        "Writing delivery to json file {}".format(jsonfile))
                    self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                    # the original logged the builtin `id` here by mistake;
                    # the actual document id is logged a few lines below
                    self.log.debug("Saving delivery for flowcell {}".format(fc))
                    if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                        fc_con = x_con
                    else:
                        fc_con = f_con
                    fc_obj = fc_con.get_entry(fc)
                    self.log.info(
                        "Logging delivery to StatusDB document {}".format(
                            fc_obj.get('_id')))
                    fc_raw_data = fc_obj.get('raw_data_delivery', {})
                    fc_raw_data.update(data['raw_data_delivery'])
                    fc_obj['raw_data_delivery'] = fc_raw_data
                    self._save(fc_con, fc_obj)
                    self.log.debug(jsonstr)