Example #1
    def record_to_csv(self,
                      tarfile_name,
                      filepath,
                      fullpath,
                      archive_path,
                      filesize,
                      result,
                      ext=None):
        """Append a row describing one archived file to the includes CSV."""

        flowcell_id = SFHelper.get_flowcell_id(tarfile_name)
        normalized_filepath = fullpath.split("uploads/")[-1]

        # For html files the metadata path sits between the flowcell_id and
        # 'all/'; for everything else derive it with the standard helper.
        if filepath.endswith('html'):
            head, sep, tail = fullpath.partition('all/')
            path = head.split(flowcell_id + '/')[-1]
        else:
            path = SFUtils.get_meta_path(fullpath, False)

        # Append one row: tarfile, file path, archive path, flowcell, PI,
        # project id, project name, sample, run, platform, size and result.
        with open(self.includes_csv_path, "a") as includes_csv:
            includes_csv.write(tarfile_name + ", " + normalized_filepath + ", " +
                               archive_path + ", " + flowcell_id + ", " +
                               SFHelper.get_pi_name(path) + ", " +
                               SFHelper.get_project_id(path) + ", " +
                               SFHelper.get_project_name(path, tarfile_name, ext) +
                               ", " + SFHelper.get_sample_name(path) + ", " +
                               SFHelper.get_run_name(tarfile_name) + ", " +
                               SFHelper.get_sequencing_platform(tarfile_name) +
                               "," + str(filesize) + "," + result + "\n")
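For orientation, here is a minimal, hypothetical sketch of how this method might be called from an archiving loop. The object name and every argument value are invented purely to illustrate the expected shapes (tar name, path inside the tarball, full extracted path, archive destination, size in bytes, result string):

    # Hypothetical call; object name and values are illustrative, not from the project.
    recorder.record_to_csv(tarfile_name='H7YVLBCXY_archive.tar',
                           filepath='Sample_X/read_1.fastq.gz',
                           fullpath='/mnt/uploads/H7YVLBCXY/Sample_X/read_1.fastq.gz',
                           archive_path='/Archive/PI_Lab/Project/Flowcell/read_1.fastq.gz',
                           filesize=1048576,
                           result='SUCCESS')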
Example #2
File: app.py Project: CBIIT/HPC_DME_APIs
def main(args):

    if len(sys.argv) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun include_file_size"
        )
        return

    # The file containing the tarlist
    tarfile_list = args[1]

    # The directory containing the tarfiles
    tarfile_dir = args[2]

    # Path containing the extracted files
    extract_path = args[3]

    # sub-directory to hold the log and audit files
    audit_dir = args[4]

    # Whether this is a dry run
    dryrun = args[5].lower() == 'true'

    # Whether the file size has to be recorded - applicable only for a dry run
    if dryrun:
        if len(sys.argv) > 6:
            record_file_size = args[6].lower() == 'true'
        else:
            record_file_size = False
    else:
        record_file_size = True

    bytes_stored = 0
    files_registered = 0

    #if (args[6] is not None):
    #    bytes_stored = args[6]
    #    if (args[7] is not None):
    #        files_registered = args[7]

    sf_audit = SFAudit(audit_dir, extract_path, bytes_stored, files_registered)
    sf_audit.prep_for_audit()

    for line_filepath in open(tarfile_list).readlines():

        tarfile_name = line_filepath.rstrip()

        tarfile_path = tarfile_dir + '/' + tarfile_name.rstrip()

        # This is a valid tarball, so process
        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):

            # Register PI collection
            register_collection(tarfile_path, "PI_Lab", tarfile_name, False,
                                sf_audit, dryrun)

            # Register Flowcell collection with Project type parent
            register_collection(tarfile_path, "Flowcell", tarfile_name, True,
                                sf_audit, dryrun)

            # create Object metadata with Flowcell type parent and register object
            register_object(tarfile_path, "Flowcell", tarfile_name, False,
                            tarfile_path, sf_audit, dryrun)

            logging.info('Done processing file: ' + tarfile_path)

            continue

        #Get or create list file (if not present)
        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Extract all files and store in extract_path directory
        if record_file_size:
            if not (SFUtils.extract_files_from_tar(tarfile_path,
                                                   extract_path)):
                # Something wrong with this file path, go to
                # next one and check logs later
                continue

        #loop through each line in the contents file of this tarball
        #We need to do an upload for each fastq.gz or BAM file
        for line in tarfile_contents.readlines():
            logging.info('processing line in tarfile: ' + line)

            if (line.rstrip().endswith("/")):
                #This is a directory, nothing to do
                continue

            #Get full path of the extracted file
            filepath = SFUtils.get_filepath_to_archive(line.rstrip(),
                                                       extract_path)
            logging.info('filepath to archive: ' + filepath)

            #if SFUtils.path_contains_exclude_str(tarfile_name, line.rstrip()):
            exclusion_list = ['10X', 'demux', 'demultiplex']
            if any(ext in line.rstrip() for ext in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, line.rstrip(), filepath,
                    'Path contains substring from exclusion list')
                continue

            if filepath.endswith('fastq') or filepath.endswith(
                    'fastq.gz') or filepath.endswith('fastq.gz.md5'):

                # Extract the info for PI metadata
                path = SFUtils.get_meta_path(filepath)
                ext = SFUtils.get_unaligned_ext(filepath)

                # Register PI collection
                register_collection(path, "PI_Lab", tarfile_name, False,
                                    sf_audit, dryrun)

                #Register Flowcell collection with Project type parent
                register_collection(path, "Flowcell", tarfile_name, True,
                                    sf_audit, dryrun, ext)

                #create Object metadata with Sample type parent and register object
                register_object(path, "Sample", tarfile_name, True, filepath,
                                sf_audit, dryrun, ext)

            elif line.rstrip().endswith(
                    'laneBarcode.html'
            ) and '/all/' in line and not 'Control_Sample' in line:

                # Remove the string after the first 'all/' because the metadata path, if present, will be before it
                head, sep, tail = line.partition('all/')

                # Remove everything up to the flowcell_id, because the metadata path, if present, will be after it
                flowcell_id = SFHelper.get_flowcell_id(tarfile_name)
                if flowcell_id in head:

                    path = head.split(flowcell_id + '/')[-1]
                    ext = SFUtils.get_unaligned_ext(filepath)

                    # Ensure that the metadata path does not include the Sample sub-directory and that it is valid
                    if path.count('/') == 1 and '_' in path:

                        #Register the html in flowcell collection

                        path = path + 'laneBarcode.html'
                        logging.info('metadata base: ' + path)

                        # Register PI collection
                        register_collection(path, "PI_Lab", tarfile_name,
                                            False, sf_audit, dryrun)

                        # Register Flowcell collection with Project type parent
                        register_collection(path, "Flowcell", tarfile_name,
                                            True, sf_audit, dryrun, ext)

                        # create Object metadata with Flowcell type parent and register object
                        register_object(path, "Flowcell", tarfile_name, False,
                                        filepath, sf_audit, dryrun, ext)

                    else:
                        # ignore this html
                        sf_audit.record_exclusion(
                            tarfile_name, line.rstrip(), filepath,
                            'html path not valid - may have other sub-directory'
                        )
                        continue

                else:
                    #ignore this html
                    sf_audit.record_exclusion(
                        tarfile_name, line.rstrip(), filepath,
                        'html path not valid - could not extract flowcell_id')
                    continue

            else:
                #For now, we ignore files that are not fastq.gz or html
                sf_audit.record_exclusion(tarfile_name, line.rstrip(),
                                          filepath,
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)

        # Delete the extracted files
        if record_file_size:
            os.system("rm -rf " + extract_path + "*")

    sf_audit.audit_summary()
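Going by the usage string printed in main above, a dry-run invocation of this script might look like the following; every path and file name here is illustrative:

    python app.py tarfiles.txt /data/tars /data/extracted audit_logs true true

The trailing 'true' is the optional include_file_size argument, which is only consulted during a dry run; in a real run the code always records file sizes.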
Example #3
def main(args):

    if len(sys.argv) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun initial_bytes initial_file_count"
        )
        return

    # The file containing the tarlist
    tarfile_list = args[1]

    # The directory containing the tarfiles
    tarfile_dir = args[2]

    # Path containing the extracted files
    extract_path = args[3]

    # sub-directory to hold the log and audit files
    audit_dir = args[4]

    # Whether this is a dry run
    dryrun = args[5].lower() == 'true'

    sf_audit = SFAudit(audit_dir, extract_path, 0, 0)
    sf_audit.prep_for_audit()

    for line_filepath in open(tarfile_list).readlines():

        tarfile_name = line_filepath.rstrip()

        tarfile_path = tarfile_dir + '/' + tarfile_name.rstrip()

        # This is a valid tarball, so process
        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):

            # Register Flowcell collection
            register_collection(tarfile_path, "Flowcell", tarfile_name, False,
                                tarfile_path, sf_audit, dryrun)

            logging.info('Done processing file: ' + tarfile_path)

            continue

        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Loop through each line in the contents file of this tarball
        # We need to do an upload for each fastq.gz or BAM file
        for line in tarfile_contents.readlines():

            if (line.rstrip().endswith("/")):
                #This is a directory, nothing to do
                continue

            #if SFUtils.path_contains_exclude_str(tarfile_name, line.rstrip()):
            exclusion_list = ['10X', 'Phix', 'PhiX', 'demux', 'demultiplex']
            if any(ext in line.rstrip() for ext in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, line.rstrip(),
                    'Path contains substring from exclusion list')
                continue

            filepath = SFUtils.get_filepath_to_archive(line.rstrip(),
                                                       extract_path)

            if filepath.endswith('fastq.gz') or filepath.endswith(
                    'fastq.gz.md5'):

                # Extract the info for PI metadata
                path = SFUtils.get_meta_path(filepath)

                #Register Flowcell collection with Project type parent
                register_collection(path, "Flowcell", tarfile_name, False,
                                    filepath, sf_audit, dryrun)

            else:
                #For now, we ignore files that are not fastq.gz or html
                sf_audit.record_exclusion(tarfile_name, line.rstrip(),
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)
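An invocation of this variant, following its own usage string, might be (again with illustrative paths):

    python app.py tarfiles.txt /data/tars /data/extracted audit_logs true

Note that initial_bytes and initial_file_count appear in the usage message, but this version of main constructs SFAudit with hard-coded zeros, so those trailing arguments are never read.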