示例#1
0
def main(args):
    """Register a Flowcell collection for the tarballs named in a list file.

    Tarballs whose name ends with "supplement.tar", or contains
    'singlecell' or '10x', are registered as a whole. For every other
    tarball the contents listing is walked and each 'fastq.gz' /
    'fastq.gz.md5' entry is registered; everything else is recorded as an
    exclusion on the audit object.

    Args:
        args: argv-style list laid out as
            [program, tarfile_list, tarfile_dir, extract_path, audit_dir,
            dryrun]. ``dryrun`` is true only when the string equals 'true'
            (case-insensitive). The usage text also names initial_bytes and
            initial_file_count, but this function does not read them.

    Returns:
        None. Prints the usage text and returns early when fewer than the
        five arguments read below are supplied.
    """
    # args[5] (dryrun) is read below, so the program name plus five
    # arguments are required; validate the list we actually index into.
    if len(args) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun initial_bytes initial_file_count"
        )
        return

    # The file containing the tarlist
    tarfile_list = args[1]

    # The directory containing the tarfiles
    tarfile_dir = args[2]

    # Path containing the extracted files
    extract_path = args[3]

    # Sub-directory to hold the log and audit files
    audit_dir = args[4]

    # Whether this is a dry run or not
    dryrun = args[5].lower() == 'true'

    sf_audit = SFAudit(audit_dir, extract_path, 0, 0)
    sf_audit.prep_for_audit()

    # Substrings that cause an entry of a tarball to be skipped
    exclusion_list = ['10X', 'Phix', 'PhiX', 'demux', 'demultiplex']

    # Read the whole list up front so the handle is closed before the
    # (potentially long) processing starts.
    with open(tarfile_list) as tarlist_file:
        tarfile_names = [name.rstrip() for name in tarlist_file]

    for tarfile_name in tarfile_names:

        tarfile_path = tarfile_dir + '/' + tarfile_name

        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):

            # These tarballs are registered as a single unit
            register_collection(tarfile_path, "Flowcell", tarfile_name, False,
                                tarfile_path, sf_audit, dryrun)

            logging.info('Done processing file: ' + tarfile_path)
            continue

        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Loop through each line in the contents file of this tarball;
        # only fastq.gz (and their md5) entries are registered.
        for line in tarfile_contents.readlines():
            entry = line.rstrip()

            if entry.endswith("/"):
                # This is a directory, nothing to do
                continue

            if any(token in entry for token in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, entry,
                    'Path contains substring from exclusion list')
                continue

            filepath = SFUtils.get_filepath_to_archive(entry, extract_path)

            if filepath.endswith(('fastq.gz', 'fastq.gz.md5')):
                # Extract the info for the metadata path
                path = SFUtils.get_meta_path(filepath)

                register_collection(path, "Flowcell", tarfile_name, False,
                                    filepath, sf_audit, dryrun)
            else:
                # Anything that is not a fastq.gz entry is only audited
                sf_audit.record_exclusion(tarfile_name, entry,
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)
示例#2
0
文件: app.py 项目: CBIIT/HPC_DME_APIs
def main(args):
    """Register PI_Lab/Flowcell collections and objects for listed tarballs.

    Tarballs whose name ends with "supplement.tar", or contains
    'singlecell' or '10x', are registered as a whole. For every other
    tarball the contents listing is walked: fastq entries and qualifying
    'laneBarcode.html' entries are registered, everything else is recorded
    as an exclusion on the audit object. An audit summary is written at the
    end.

    Args:
        args: argv-style list laid out as
            [program, tarfile_list, tarfile_dir, extract_path, audit_dir,
            dryrun, include_file_size]. ``dryrun`` and the optional
            ``include_file_size`` are true only when the string equals
            'true' (case-insensitive). ``include_file_size`` is consulted
            only for a dry run; a real run always extracts the files.

    Returns:
        None. Prints the usage text and returns early when fewer than five
        arguments are supplied.
    """
    if len(args) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun include_file_size"
        )
        return

    # The file containing the tarlist
    tarfile_list = args[1]

    # The directory containing the tarfiles
    tarfile_dir = args[2]

    # Path containing the extracted files
    extract_path = args[3]

    # Sub-directory to hold the log and audit files
    audit_dir = args[4]

    # Whether this is a dry run or not
    dryrun = args[5].lower() == 'true'

    # Whether the tarball has to be extracted so file sizes can be recorded.
    # Only optional for a dry run. The flag is parsed like `dryrun`: the raw
    # string would be truthy even for "false".
    if dryrun:
        record_file_size = len(args) > 6 and args[6].lower() == 'true'
    else:
        record_file_size = True

    bytes_stored = 0
    files_registered = 0

    sf_audit = SFAudit(audit_dir, extract_path, bytes_stored, files_registered)
    sf_audit.prep_for_audit()

    # Substrings that cause an entry of a tarball to be skipped
    exclusion_list = ['10X', 'demux', 'demultiplex']

    # Read the whole list up front so the handle is closed before the
    # (potentially long) processing starts.
    with open(tarfile_list) as tarlist_file:
        tarfile_names = [name.rstrip() for name in tarlist_file]

    for tarfile_name in tarfile_names:

        tarfile_path = tarfile_dir + '/' + tarfile_name

        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):

            # Register PI collection
            register_collection(tarfile_path, "PI_Lab", tarfile_name, False,
                                sf_audit, dryrun)

            # Register Flowcell collection with Project type parent
            register_collection(tarfile_path, "Flowcell", tarfile_name, True,
                                sf_audit, dryrun)

            # Create Object metadata with Flowcell type parent and register
            # the tarball itself as the object
            register_object(tarfile_path, "Flowcell", tarfile_name, False,
                            tarfile_path, sf_audit, dryrun)

            logging.info('Done processing file: ' + tarfile_path)
            continue

        # Get or create list file (if not present)
        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Extract all files and store in extract_path directory
        if record_file_size:
            if not SFUtils.extract_files_from_tar(tarfile_path, extract_path):
                # Something wrong with this file path, go to
                # next one and check logs later
                continue

        # Loop through each line in the contents file of this tarball
        for line in tarfile_contents.readlines():
            logging.info('processing line in tarfile: ' + line)

            entry = line.rstrip()

            if entry.endswith("/"):
                # This is a directory, nothing to do
                continue

            # Get full path of the extracted file
            filepath = SFUtils.get_filepath_to_archive(entry, extract_path)
            logging.info('filepath to archive: ' + filepath)

            if any(token in entry for token in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, entry, filepath,
                    'Path contains substring from exclusion list')
                continue

            if filepath.endswith(('fastq', 'fastq.gz', 'fastq.gz.md5')):

                # Extract the info for PI metadata
                path = SFUtils.get_meta_path(filepath)
                ext = SFUtils.get_unaligned_ext(filepath)

                # Register PI collection
                register_collection(path, "PI_Lab", tarfile_name, False,
                                    sf_audit, dryrun)

                # Register Flowcell collection with Project type parent
                register_collection(path, "Flowcell", tarfile_name, True,
                                    sf_audit, dryrun, ext)

                # Create Object metadata with Sample type parent and
                # register object
                register_object(path, "Sample", tarfile_name, True, filepath,
                                sf_audit, dryrun, ext)

            elif (entry.endswith('laneBarcode.html') and '/all/' in line
                  and 'Control_Sample' not in line):

                # Keep only what precedes the first 'all/': the metadata
                # path, if present, is before that
                head = line.partition('all/')[0]

                # Drop everything up to the flowcell id: the metadata path,
                # if present, is after that
                flowcell_id = SFHelper.get_flowcell_id(tarfile_name)
                if flowcell_id in head:

                    path = head.split(flowcell_id + '/')[-1]
                    ext = SFUtils.get_unaligned_ext(filepath)

                    # Ensure that metadata path does not have the Sample
                    # sub-directory and that it is valid
                    if path.count('/') == 1 and '_' in path:

                        # Register the html in the flowcell collection
                        path = path + 'laneBarcode.html'
                        logging.info('metadata base: ' + path)

                        # Register PI collection
                        register_collection(path, "PI_Lab", tarfile_name,
                                            False, sf_audit, dryrun)

                        # Register Flowcell collection with Project type
                        # parent
                        register_collection(path, "Flowcell", tarfile_name,
                                            True, sf_audit, dryrun, ext)

                        # Create Object metadata with Flowcell type parent
                        # and register object
                        register_object(path, "Flowcell", tarfile_name, False,
                                        filepath, sf_audit, dryrun, ext)

                    else:
                        # Ignore this html
                        sf_audit.record_exclusion(
                            tarfile_name, entry, filepath,
                            'html path not valid - may have other sub-directory'
                        )

                else:
                    # Ignore this html
                    sf_audit.record_exclusion(
                        tarfile_name, entry, filepath,
                        'html path not valid - could not extract flowcell_id')

            else:
                # For now, we ignore files that are not fastq.gz or html
                sf_audit.record_exclusion(tarfile_name, entry, filepath,
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)

        # Delete the extracted files.
        # NOTE(review): this relies on the shell expanding the trailing '*',
        # so extract_path must not contain spaces or shell metacharacters.
        if record_file_size:
            os.system("rm -rf " + extract_path + "*")

    sf_audit.audit_summary()
示例#3
0
文件: app.py 项目: CBIIT/HPC_DME_APIs
def main(args):
    """Stage sequencing files from the PI directories named in a list file.

    Every directory named in the list file is walked. Tarballs whose name
    starts with 'Seq' and ends with '.tar' are extracted into the work
    directory and their fastq / fastq.gz / 'laneBarcode.html' (under
    '/all/') entries are handed to ``copy_file``; loose bam, bai and fastq
    files are handed to ``copy_file`` directly. An audit summary is written
    at the end.

    Args:
        args: argv-style list laid out as [program, base_dir, pi_dir_list].
            ``pi_dir_list`` is a file name relative to ``base_dir``; staged
            output goes under ``base_dir + "/staged"`` and working/audit
            files under ``base_dir + "/work"``.

    Returns:
        None. Prints the usage text and returns early unless exactly two
        arguments are supplied.
    """
    # Local import: only needed for the cleanup step below.
    import subprocess

    if len(args) != 3:
        print("\n Usage: python app.py base_dir pi_dir_list")
        return

    # The root dir of the source
    base_dir = args[1]

    # The file containing the PI directory names
    pi_dir_list = args[2]

    # Root directory the selected files are copied under
    dest_base_dir = base_dir + "/staged"

    # Path containing the extracted files
    extract_path = base_dir + "/work"

    # Sub-directory to hold the log and audit files
    audit_dir = base_dir + "/work"

    bytes_stored = 0
    files_registered = 0

    sf_audit = SFAudit(audit_dir, extract_path, bytes_stored, files_registered)
    sf_audit.prep_for_audit()

    # Read the whole list up front so the handle is closed before the
    # (potentially long) processing starts.
    pi_dir_list_path = base_dir + "/" + pi_dir_list
    with open(pi_dir_list_path) as pi_list_file:
        pi_dir_names = [name.rstrip() for name in pi_list_file]

    for pi_dir_name in pi_dir_names:

        pi_dir_path = base_dir + '/' + pi_dir_name
        destDir = dest_base_dir + "/" + pi_dir_name
        print("destDir = " + destDir)
        if not os.path.exists(destDir):
            # makedirs also creates the "staged" parent when it is missing
            os.makedirs(destDir)

        logging.info("Processing pi dir: " + pi_dir_path)

        for dirName, _subdirs, fileList in os.walk(pi_dir_path):
            for fileName in fileList:
                if fileName.startswith('Seq') and fileName.endswith('.tar'):
                    logging.info("fileName: %s", fileName)
                    tarPath = dirName + "/" + fileName

                    # Get or create list file (if not present)
                    tarfile_contents = SFUtils.get_tarball_contents(
                        fileName, dirName, sf_audit, extract_path)
                    if tarfile_contents is None:
                        continue

                    # Untar to extract_path
                    if not SFUtils.extract_files_from_tar(
                            tarPath, extract_path):
                        # Something wrong with this file path, go to
                        # next one and check logs later
                        continue

                    # Loop through each line in the contents file of this
                    # tarball and copy the files of interest
                    for line in tarfile_contents.readlines():
                        logging.info('processing line in tarfile: ' + line)

                        entry = line.rstrip()

                        if entry.endswith("/"):
                            # This is a directory, nothing to do
                            continue

                        # Get full path of the extracted file
                        filePath = SFUtils.get_filepath_to_archive(
                            entry, extract_path)
                        print('Extracted filePath ' + filePath)

                        is_fastq = filePath.endswith(('fastq', 'fastq.gz'))
                        is_lane_html = (filePath.endswith('laneBarcode.html')
                                        and '/all/' in filePath)
                        if is_fastq or is_lane_html:
                            copy_file(tarPath, line, filePath, destDir,
                                      sf_audit)

                    # Remove the extracted tree. An argument list (no shell)
                    # keeps tarball names with spaces or shell
                    # metacharacters from breaking or altering the command.
                    subprocess.call([
                        "rm", "-rf",
                        extract_path + "/" + fileName.split(".tar")[0]
                    ])

                elif fileName.endswith(('bam', 'bai', 'fastq')):
                    # Loose file: copy it straight from where it was found
                    filePath = dirName + "/" + fileName
                    copy_file(None, None, filePath, destDir, sf_audit)

        logging.info('Done processing directory: ' + pi_dir_path)

    sf_audit.audit_summary()