def record_to_csv(self, tarfile_name, filepath, fullpath, archive_path, filesize, result, ext=None):
    """Append one row describing a processed file to the includes CSV.

    The row is written to ``self.includes_csv_path`` in append mode and
    contains, in order: tarfile name, path after ``uploads/``, archive path,
    flowcell id, PI name, project id, project name, sample name, run name,
    sequencing platform, file size and result.

    Args:
        tarfile_name: Name of the tarball the file came from; passed to the
            ``SFHelper`` lookups for flowcell id, run name and platform.
        filepath: Path used only to decide whether the entry is an html file.
        fullpath: Full path of the file; the part after the last
            ``uploads/`` is recorded, and the metadata path is derived
            from it.
        archive_path: Destination path recorded verbatim.
        filesize: Size value; converted with ``str()`` before writing.
        result: Outcome text recorded verbatim (must be a ``str``).
        ext: Optional value forwarded to ``SFHelper.get_project_name``.
    """
    flowcell_id = SFHelper.get_flowcell_id(tarfile_name)
    normalized_filepath = fullpath.split("uploads/")[-1]

    if filepath.endswith('html'):
        # For html files the metadata path sits between the flowcell id
        # and the first 'all/' component of the full path.
        before_all = fullpath.partition('all/')[0]
        path = before_all.split(flowcell_id + '/')[-1]
    else:
        path = SFUtils.get_meta_path(fullpath, False)

    # Build the whole row before touching the file so that a failing helper
    # lookup cannot leave an open handle behind.
    text_columns = [
        tarfile_name,
        normalized_filepath,
        archive_path,
        flowcell_id,
        SFHelper.get_pi_name(path),
        SFHelper.get_project_id(path),
        SFHelper.get_project_name(path, tarfile_name, ext),
        SFHelper.get_sample_name(path),
        SFHelper.get_run_name(tarfile_name),
        SFHelper.get_sequencing_platform(tarfile_name),
    ]
    # NOTE: the last two columns are joined with "," (no space) while the
    # others use ", "; this is kept so existing consumers see the same bytes.
    row = ", ".join(text_columns) + "," + str(filesize) + "," + result + "\n"

    with open(self.includes_csv_path, "a") as includes_csv:
        includes_csv.write(row)
def main(args):
    """Walk a list of tarballs and register their contents, auditing each step.

    Expected ``args`` layout (``args[0]`` is the program name):

    * ``args[1]`` - file listing one tarball name per line
    * ``args[2]`` - directory containing the tarballs
    * ``args[3]`` - path the tarballs are extracted into
    * ``args[4]`` - directory for log and audit files
    * ``args[5]`` - ``'true'`` (case-insensitive) for a dry run
    * ``args[6]`` - optional, dry run only: ``'true'`` to extract the files
      so that sizes can be recorded

    For every tarball, entries ending in ``fastq``, ``fastq.gz`` or
    ``fastq.gz.md5`` and qualifying ``laneBarcode.html`` entries are passed to
    ``register_collection`` / ``register_object``; everything else is handed
    to ``sf_audit.record_exclusion``. Prints a usage message and returns
    ``None`` when fewer than five arguments follow the program name.
    """
    # Function-scope imports: only the post-extraction cleanup needs these.
    import glob
    import shutil

    # Validate the list that is actually indexed below (not sys.argv), so the
    # function behaves the same whether or not it is called with sys.argv.
    if len(args) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun include_file_size"
        )
        return

    tarfile_list = args[1]
    tarfile_dir = args[2]
    extract_path = args[3]
    audit_dir = args[4]
    dryrun = args[5].lower() == 'true'

    # File sizes are always recorded on a real run. On a dry run they are
    # recorded only when the optional flag is 'true'; parsing it the same way
    # as `dryrun` keeps the string "false" from being treated as truthy.
    if dryrun:
        record_file_size = len(args) > 6 and args[6].lower() == 'true'
    else:
        record_file_size = True

    bytes_stored = 0
    files_registered = 0
    sf_audit = SFAudit(audit_dir, extract_path, bytes_stored, files_registered)
    sf_audit.prep_for_audit()

    # Read the tarball names up front so the list file is closed promptly.
    with open(tarfile_list) as list_file:
        tarfile_names = [raw_name.rstrip() for raw_name in list_file]

    exclusion_list = ('10X', 'demux', 'demultiplex')
    fastq_suffixes = ('fastq', 'fastq.gz', 'fastq.gz.md5')

    for tarfile_name in tarfile_names:
        if not tarfile_name:
            # Blank line in the list file: there is no tarball to process.
            continue

        tarfile_path = tarfile_dir + '/' + tarfile_name
        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):
            # These tarballs are registered as a single object rather than
            # being walked entry by entry.
            register_collection(tarfile_path, "PI_Lab", tarfile_name, False,
                                sf_audit, dryrun)
            register_collection(tarfile_path, "Flowcell", tarfile_name, True,
                                sf_audit, dryrun)
            register_object(tarfile_path, "Flowcell", tarfile_name, False,
                            tarfile_path, sf_audit, dryrun)
            logging.info('Done processing file: ' + tarfile_path)
            continue

        # Get (or create, if not present) the contents listing of the tarball.
        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Extraction is only needed when file sizes have to be recorded.
        if record_file_size:
            if not SFUtils.extract_files_from_tar(tarfile_path, extract_path):
                # Something is wrong with this tarball; move on to the next
                # one and check the logs later.
                continue

        for line in tarfile_contents.readlines():
            logging.info('processing line in tarfile: ' + line)
            entry = line.rstrip()

            if entry.endswith("/"):
                # Directory entry, nothing to register.
                continue

            filepath = SFUtils.get_filepath_to_archive(entry, extract_path)
            logging.info('filepath to archive: ' + filepath)

            if any(token in entry for token in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, entry, filepath,
                    'Path contains substring from exclusion list')
                continue

            if filepath.endswith(fastq_suffixes):
                path = SFUtils.get_meta_path(filepath)
                ext = SFUtils.get_unaligned_ext(filepath)

                register_collection(path, "PI_Lab", tarfile_name, False,
                                    sf_audit, dryrun)
                register_collection(path, "Flowcell", tarfile_name, True,
                                    sf_audit, dryrun, ext)
                register_object(path, "Sample", tarfile_name, True, filepath,
                                sf_audit, dryrun, ext)

            elif (entry.endswith('laneBarcode.html') and '/all/' in line
                  and 'Control_Sample' not in line):
                # The metadata path, if present, lies before the first 'all/'
                # and after the flowcell id.
                head = line.partition('all/')[0]
                flowcell_id = SFHelper.get_flowcell_id(tarfile_name)

                if flowcell_id not in head:
                    sf_audit.record_exclusion(
                        tarfile_name, entry, filepath,
                        'html path not valid - could not extract flowcell_id')
                    continue

                path = head.split(flowcell_id + '/')[-1]
                ext = SFUtils.get_unaligned_ext(filepath)

                # A valid metadata path has exactly one '/' (no Sample
                # sub-directory) and contains an underscore.
                if path.count('/') != 1 or '_' not in path:
                    sf_audit.record_exclusion(
                        tarfile_name, entry, filepath,
                        'html path not valid - may have other sub-directory'
                    )
                    continue

                # Register the html in the flowcell collection.
                path = path + 'laneBarcode.html'
                logging.info('metadata base: ' + path)

                register_collection(path, "PI_Lab", tarfile_name, False,
                                    sf_audit, dryrun)
                register_collection(path, "Flowcell", tarfile_name, True,
                                    sf_audit, dryrun, ext)
                register_object(path, "Flowcell", tarfile_name, False,
                                filepath, sf_audit, dryrun, ext)

            else:
                # For now, files that are not fastq.gz or html are ignored.
                sf_audit.record_exclusion(tarfile_name, entry, filepath,
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)

        # Delete whatever was extracted for this tarball. This removes every
        # path matching `extract_path*`, as the former `rm -rf` shell command
        # did, but without building a shell command from a path string.
        if record_file_size:
            for leftover in glob.glob(extract_path + "*"):
                if os.path.isdir(leftover) and not os.path.islink(leftover):
                    shutil.rmtree(leftover, ignore_errors=True)
                else:
                    try:
                        os.remove(leftover)
                    except OSError as err:
                        logging.warning(
                            'Could not remove %s: %s', leftover, err)

    sf_audit.audit_summary()
def main(args):
    """Walk a list of tarballs and register a Flowcell collection per file.

    Expected ``args`` layout (``args[0]`` is the program name):

    * ``args[1]`` - file listing one tarball name per line
    * ``args[2]`` - directory containing the tarballs
    * ``args[3]`` - path used to build the archive file paths
    * ``args[4]`` - directory for log and audit files
    * ``args[5]`` - ``'true'`` (case-insensitive) for a dry run

    Entries ending in ``fastq.gz`` or ``fastq.gz.md5`` are passed to
    ``register_collection``; all other entries are handed to
    ``sf_audit.record_exclusion``. Prints a usage message and returns ``None``
    when fewer than five arguments follow the program name.

    NOTE(review): the usage text mentions ``initial_bytes`` and
    ``initial_file_count``, but this function never reads them and always
    starts the audit at 0/0 - confirm whether the message is stale.
    """
    # args[5] is read below, so at least six entries are required. The former
    # `< 5` guard let a five-entry call through to an IndexError.
    if len(args) < 6:
        print(
            "\n Usage: python app.py tarfile_list tarfile_dir extract_path audit_dir dryrun initial_bytes initial_file_count"
        )
        return

    tarfile_list = args[1]
    tarfile_dir = args[2]
    extract_path = args[3]
    audit_dir = args[4]
    dryrun = args[5].lower() == 'true'

    sf_audit = SFAudit(audit_dir, extract_path, 0, 0)
    sf_audit.prep_for_audit()

    # Read the tarball names up front so the list file is closed promptly.
    with open(tarfile_list) as list_file:
        tarfile_names = [raw_name.rstrip() for raw_name in list_file]

    exclusion_list = ('10X', 'Phix', 'PhiX', 'demux', 'demultiplex')

    for tarfile_name in tarfile_names:
        if not tarfile_name:
            # Blank line in the list file: there is no tarball to process.
            continue

        tarfile_path = tarfile_dir + '/' + tarfile_name
        logging.info("Processing file: " + tarfile_path)

        if (tarfile_name.endswith("supplement.tar")
                or 'singlecell' in tarfile_name or '10x' in tarfile_name):
            # These tarballs are registered whole instead of entry by entry.
            register_collection(tarfile_path, "Flowcell", tarfile_name, False,
                                tarfile_path, sf_audit, dryrun)
            logging.info('Done processing file: ' + tarfile_path)
            continue

        tarfile_contents = SFUtils.get_tarball_contents(
            tarfile_name, tarfile_dir, sf_audit)
        if tarfile_contents is None:
            continue

        # Loop through each line of this tarball's contents listing; an
        # upload is needed for each fastq.gz (or its md5) file.
        for line in tarfile_contents.readlines():
            entry = line.rstrip()

            if entry.endswith("/"):
                # Directory entry, nothing to register.
                continue

            if any(token in entry for token in exclusion_list):
                sf_audit.record_exclusion(
                    tarfile_name, entry,
                    'Path contains substring from exclusion list')
                continue

            filepath = SFUtils.get_filepath_to_archive(entry, extract_path)

            if filepath.endswith(('fastq.gz', 'fastq.gz.md5')):
                path = SFUtils.get_meta_path(filepath)
                register_collection(path, "Flowcell", tarfile_name, False,
                                    filepath, sf_audit, dryrun)
            else:
                # For now, files that are not fastq.gz or html are ignored.
                sf_audit.record_exclusion(tarfile_name, entry,
                                          'Not fastq.gz or valid html file')

        logging.info('Done processing file: ' + tarfile_path)