예제 #1
0
    def raw_data(self):
        """Deliver the raw fastq files of a project to its Uppnex delivery folder.

        The method reads its parameters from ``self.pargs`` (``project``,
        ``flowcell``, ``group``, ``uppmax_project``, ``interactive``,
        ``rsync``, ``link``, ``dry_run``). For every sample run returned by
        the samples database it transfers the files, writes an ``.md5`` file
        next to each destination file, verifies the checksum of the
        destination against the source and, when every file verified, records
        the delivery in a json file and in the sample's StatusDB document.

        Returns:
            None. The method returns early when a required argument is
            missing, when no Uppnex project can be determined, or when the
            operator answers "no" to one of the confirmation prompts.

        Raises:
            AssertionError: if a database connection is falsy or one of the
                production/delivery directories does not exist.
            KeyError: from ``grp.getgrnam`` if ``self.pargs.group`` is not a
                known group name.
            ValueError: if an id in the copy list does not match exactly one
                of the selected samples.
        """
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier (keep the part after the last "_")
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        # NOTE(review): uid/gid are not used further down in this method; the
        # lookup is kept because it validates the requested group name.
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group:
            # the group name comes from the parsed arguments; the bare name
            # `group` previously used here was undefined and raised NameError
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error("Uppmax project was not specified and could not be fetched from project database")
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(
            s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
            key=lambda k: (k.get('project_sample_name', 'NA'),
                           k.get('flowcell', 'NA'),
                           k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(self._meta.production_root), \
            "No such directory {}; check your production config".format(self._meta.production_root)
        assert os.path.exists(proj_base_dir), \
            "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)

        # Fall back to default locations when the config lookup fails
        try:
            self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        # The delivery folder itself must exist; the project subfolder is appended afterwards
        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(destination_root), \
            "Delivery destination folder {} does not exist".format(destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If interactively select, keep only the samples the operator confirms
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(
                    sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info("Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir,
                                          destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for doc_id, files in to_copy.items():
            # get the sample database object (exactly one match is required)
            [sample] = [s for s in samples if s.get('_id') == doc_id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))

            # transfer files; each entry is indexed as (source, destination, read)
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                # nothing is verified for links or dry runs, so nothing is logged either
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for f in files:
                    m = md5sum(f[0])
                    mfile = "{}.md5".format(f[1])
                    md5.append([m, mfile, f[2], f[0]])
                    self.log.debug("md5sum for source file {}: {}".format(f[0], m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, read, srcpath in md5:
                    # stripping the ".md5" suffix gives back the destination file
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for path in [dstfile, mfile]:
                        self.app.cmd.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info("Logging delivery to StatusDB document {}".format(doc_id))
                data = {'raw_data_delivery': {'timestamp': utc_time(),
                                              'files': {'R{}'.format(read): {'md5': m,
                                                                             'path': os.path.splitext(mfile)[0],
                                                                             'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                             'source_location': srcpath}
                                                        for m, mfile, read, srcpath in md5},
                                              }
                        }
                jsonstr = json.dumps(data)
                # the json record is written next to the first source file
                jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                        "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                       sample.get("flowcell"),
                                                                                       sample.get("project_sample_name"),
                                                                                       sample.get("sequence"),
                                                                                       sample.get("lane")))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug("Saving delivery in StatusDB document {}".format(doc_id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
예제 #2
0
파일: deliver.py 프로젝트: ewels/scilifelab
    def raw_data(self):
        """Deliver the raw fastq files of a project to its Uppnex delivery folder.

        The method reads its parameters from ``self.pargs`` (``project``,
        ``flowcell``, ``group``, ``uppmax_project``, ``interactive``,
        ``rsync``, ``dry_run``). For every sample run returned by the samples
        database it computes the source md5sums, transfers the files, writes
        an ``.md5`` file next to each destination file and verifies the
        destination checksum. When every file verified, the delivery is
        recorded in a json file and in the sample's StatusDB document.

        Returns:
            None. The method returns early when a required argument is
            missing, when no Uppnex project can be determined, or when the
            operator answers "no" to one of the confirmation prompts.

        Raises:
            AssertionError: if a database connection is falsy or one of the
                production/delivery directories does not exist.
            KeyError: from ``grp.getgrnam`` if ``self.pargs.group`` is not a
                known group name.
            ValueError: if an id in the copy list does not match exactly one
                of the selected samples.
        """
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier (keep the part after the last "_")
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        # NOTE(review): uid/gid are not used further down in this method; the
        # lookup is kept because it validates the requested group name.
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group:
            # the group name comes from the parsed arguments; the bare name
            # `group` previously used here was undefined and raised NameError
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                           sample_prj=self.pargs.project),
                         key=lambda k:
                         (k.get('project_sample_name', 'NA'),
                          k.get('flowcell', 'NA'), k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        # Fall back to default locations when the config lookup fails
        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        # The delivery folder itself must exist; the project subfolder is appended afterwards
        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If interactively select, keep only the samples the operator confirms
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info(
                    "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                    .format(sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn(
                "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
                .format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for doc_id, files in to_copy.items():
            # get the sample database object (exactly one match is required)
            [sample] = [s for s in samples if s.get('_id') == doc_id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"),
                sample.get("flowcell", "NA")))

            # calculate md5sums on the source side and write it on the destination;
            # each entry of `files` is indexed as (source, destination, read)
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            # write the md5sum to a file at the destination and verify the transfer
            passed = True
            for m, mfile, read, srcpath in md5:
                # stripping the ".md5" suffix gives back the destination file
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(
                    mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))

                # if dry-run, make sure verification pass
                if self.pargs.dry_run:
                    dm = m
                else:
                    dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
                if m != dm:
                    self.log.warn(
                        "md5sum verification FAILED for {}. Source: {}, Target: {}"
                        .format(dstfile, m, dm))
                    self.log.warn(
                        "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                        .format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for path in [dstfile, mfile]:
                    self.app.cmd.chmod(
                        path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(doc_id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5':
                                m,
                                'path':
                                os.path.splitext(mfile)[0],
                                'size_in_bytes':
                                self._getsize(os.path.splitext(mfile)[0]),
                                'source_location':
                                srcpath
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                # the json record is written next to the first source file
                jsonfile = os.path.join(
                    os.path.dirname(md5[0][3]),
                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                        sample.get("date"), sample.get("flowcell"),
                        sample.get("project_sample_name"),
                        sample.get("sequence"), sample.get("lane")))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(doc_id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
예제 #3
0
    def raw_data(self):
        """Deliver the raw fastq files of a project to its Uppnex delivery folder.

        The method reads its parameters from ``self.pargs`` (``project``,
        ``sample``, ``flowcell``, ``group``, ``root``, ``uppmax_project``,
        ``interactive``, ``rsync``, ``link``, ``dry_run``). For every sample
        and flowcell returned by ``self.samples_to_copy`` it transfers the
        files, writes an ``.md5`` file next to each destination file and
        verifies the destination checksum against the source. When every file
        verified, the delivery is recorded in a json file and merged into the
        ``raw_data_delivery`` entry of the flowcell's StatusDB document.

        Returns:
            None. The method returns early when a required argument is
            missing, when no Uppnex project can be determined, when
            uncompressed fastq files are found, when the requested sample is
            unknown, or when the operator answers "no" to a prompt.

        Raises:
            AssertionError: if a database connection is falsy or one of the
                production/delivery directories does not exist.
            KeyError: from ``grp.getgrnam`` if ``self.pargs.group`` is not a
                known group name.
        """
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier (keep the part after the last "_")
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        # NOTE(review): uid/gid are not used further down in this method; the
        # lookup is kept because it validates the requested group name.
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group:
            # the group name comes from the parsed arguments; the bare name
            # `group` previously used here was undefined and raised NameError
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project database"
        self.log.debug("Connecting to flowcell database")
        f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
        assert f_con, "Could not get connection to flowcell database"
        self.log.debug("Connecting to x_flowcell database")
        x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
        assert x_con, "Could not get connection to x_flowcell database"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Setup paths and verify parameters; an explicit root argument wins over the config
        self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
            "production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        # Fall back to default locations when the config lookup fails
        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        # The delivery folder itself must exist; the project subfolder is appended afterwards
        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # Find uncompressed fastq; any hit aborts the delivery
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir=proj_base_dir,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)
        if len(uncompressed) > 0:
            self.log.error(
                "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
            )
            return

        # Extract the list of samples and runs associated with the project and sort them
        samples = self.samples_to_copy(
            pid=p_con.get_entry(self.pargs.project, "project_id"),
            pod=p_con.get_entry(self.pargs.project, "open_date"),
            fc_dict={
                'HiSeq2500': f_con.proj_list,
                'HiSeqX': x_con.proj_list
            },
            proj_base_dir=proj_base_dir,
            destination_root=destination_root,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)

        # If interactively select, keep only the samples the operator confirms
        if self.pargs.interactive:
            to_process = {}
            for sample in samples:
                if query_yes_no("Deliver sample {} ?".format(sample),
                                default="no"):
                    to_process[sample] = samples[sample]
            samples = to_process

        # Restrict the delivery to the single requested sample, if one was given
        if self.pargs.sample:
            sample = samples.get(self.pargs.sample)
            if not sample:
                self.log.error(
                    "There is no such sample {} for project {}".format(
                        self.pargs.sample, self.pargs.project))
                return
            samples = {self.pargs.sample: sample}

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample; `samples` maps sample -> flowcell -> {'src': [...], 'dst': [...]}
        # as used below. items() works on both Python 2 and 3.
        for sample, flowcells in samples.items():
            for fc, files in flowcells.items():
                self.log.info("Processing sample {} and flowcell {}".format(
                    sample, fc))

                # transfer files
                self.log.debug("Transferring {} fastq files".format(
                    len(files['src'])))
                self._transfer_files(sources=files['src'],
                                     targets=files['dst'])

                passed = True
                if self.pargs.link or self.pargs.dry_run:
                    # nothing is verified for links or dry runs, so nothing is logged either
                    passed = False
                else:
                    # calculate md5sums on the source side and write it on the destination
                    md5 = []
                    for s, d in zip(files['src'], files['dst']):
                        m = md5sum(s)
                        mfile = "{}.md5".format(d)
                        md5.append([m, mfile, s])
                        self.log.debug("md5sum for source file {}: {}".format(
                            s, m))

                    # write the md5sum to a file at the destination and verify the transfer
                    for m, mfile, srcpath in md5:
                        # stripping the ".md5" suffix gives back the destination file
                        dstfile = os.path.splitext(mfile)[0]
                        self.log.debug(
                            "Writing md5sum to file {}".format(mfile))
                        self.app.cmd.write(
                            mfile, "{}  {}".format(m,
                                                   os.path.basename(dstfile)),
                            True)
                        self.log.debug(
                            "Verifying md5sum for file {}".format(dstfile))
                        dm = md5sum(dstfile)
                        self.log.debug(
                            "md5sum for destination file {}: {}".format(
                                dstfile, dm))
                        if m != dm:
                            self.log.warn(
                                "md5sum verification FAILED for {}. Source: {}, Target: {}"
                                .format(dstfile, m, dm))
                            self.log.warn(
                                "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                                .format(dstfile))
                            self.app.cmd.safe_unlink(dstfile)
                            self.app.cmd.safe_unlink(mfile)
                            passed = False
                            continue

                        # Modify the permissions to ug+rw
                        for path in [dstfile, mfile]:
                            self.app.cmd.chmod(
                                path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                                | stat.S_IWGRP)

                # touch the flag to trigger uppmax inbox permission fix
                self.app.cmd.safe_touchfile(
                    os.path.join("/sw", "uppmax", "var", "inboxfix",
                                 "schedule", self.pargs.uppmax_project))

                # log the transfer to statusdb if verification passed
                if passed:
                    # files are keyed by the source file name without its last extension
                    data = {
                        'raw_data_delivery': {
                            'timestamp': utc_time(),
                            'files': {
                                os.path.splitext(
                                    (os.path.basename(srcpath)))[0]:
                                {
                                    'md5':
                                    m,
                                    'path':
                                    os.path.splitext(mfile)[0],
                                    'size_in_bytes':
                                    self._getsize(os.path.splitext(mfile)[0]),
                                    'source_location':
                                    srcpath
                                }
                                for m, mfile, srcpath in md5
                            }
                        }
                    }
                    jsonstr = json.dumps(data)
                    jsonfile = os.path.join(
                        proj_base_dir, sample, fc,
                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                    self.log.debug(
                        "Writing delivery to json file {}".format(jsonfile))
                    self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)

                    # pick the database that holds this flowcell
                    # NOTE(review): self.proj_flowcells is not set in this method;
                    # presumably populated by samples_to_copy - confirm.
                    if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                        fc_con = x_con
                    else:
                        fc_con = f_con
                    fc_obj = fc_con.get_entry(fc)
                    self.log.info(
                        "Logging delivery to StatusDB document {}".format(
                            fc_obj.get('_id')))
                    # merge into any delivery record already stored on the flowcell document
                    fc_raw_data = fc_obj.get('raw_data_delivery', {})
                    fc_raw_data.update(data['raw_data_delivery'])
                    fc_obj['raw_data_delivery'] = fc_raw_data
                    # report the document id; the builtin `id` function was
                    # formatted into this message before
                    self.log.debug(
                        "Saving delivery in StatusDB document {}".format(
                            fc_obj.get('_id')))
                    self._save(fc_con, fc_obj)
                    self.log.debug(jsonstr)