def _parse_fastq_name(path):
    """Extract metadata fields from the basename of a fastq path.

    Returns a ``(sample, run_id, lane, pair, encoding)`` tuple, or ``None``
    when the basename follows neither of the supported naming schemes.
    Dots in the suffix are escaped and the patterns are anchored at the end,
    so only names that really end in ``.gz`` are accepted.
    """
    name = os.path.basename(path)

    # Older scheme: <run_id>_<sample>_s_<lane>_<pair>_sequence.txt.gz
    old_style = re.match(
        r"([^_/]+)_([a-zA-Z0-9.-]+)_s_([0-9]+)_(1|2)_sequence\.txt\.gz$",
        name)
    if old_style:
        return (old_style.group(2), old_style.group(1),
                int(old_style.group(3)), old_style.group(4), 'I')

    # Newer scheme: <sample>_<run_id>_<CAGTN letters>_L<lane>_R<pair>.fastq.gz
    new_style = re.match(
        r"([a-zA-Z0-9.-]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2)\.fastq\.gz$",
        name)
    if new_style:
        return (new_style.group(1), new_style.group(2),
                int(new_style.group(3)), new_style.group(4), 'S')

    return None


def parse_and_link(file, symlink_dir, metadata_dict):
    """Parse metadata out of input filename and construct symlink.

    Takes a fastq filename, destination directory, and a metadata dict
    (typically a defaultdict(dict); a plain dict works as well).

    The filename is parsed to get the sample name, run id, lane and read
    number.  Those fields, plus an encoding flag ('I' for the older naming
    scheme, 'S' for the newer one), are stored in
    ``metadata_dict[<link basename>]``.  The same fields are used to build
    the symlink name, to guarantee filename uniqueness and a regular naming
    structure.

    Currently parsing by assuming AGRF naming structure and paired-end reads.
    Currently will ONLY handle gzipped files, to avoid multiple links to the
    same data.

    Args:
        file: path of the source fastq file.
        symlink_dir: directory in which the link is created.
        metadata_dict: mapping updated in place, keyed by link basename.

    Returns:
        Path of the new link inside ``symlink_dir``.

    Raises:
        SystemExit: with status 1 when the filename cannot be parsed.
    """
    parsed = _parse_fastq_name(file)
    if parsed is None:
        print("Unable to parse name of fastq file %s ." % file)
        sys.exit(1)
    sample, run_id, lane, pair, encoding = parsed

    link_name = "%s_%s_L%d_%s.fastq.gz" % (sample, run_id, lane, pair)
    newfile = os.path.join(symlink_dir, link_name)

    # setdefault keeps any fields already recorded for this link and works
    # for both defaultdict and plain dict inputs.
    metadata_dict.setdefault(link_name, {}).update({
        'sample': sample,
        'run_id': run_id,
        'lane': lane,
        'pair': pair,
        'encoding': encoding,
    })

    # The link target is expressed relative to the link's own directory.
    relative_sourcefile = os.path.relpath(file, symlink_dir)
    mkLink(relative_sourcefile, newfile)
    return newfile
def parse_and_link(file, symlink_dir, metadata_dict):
    """Parse metadata out of input filename and construct symlink.

    Takes a fastq filename, destination directory, and a metadata dict
    (typically a defaultdict(dict); a plain dict works as well).

    The basename of ``file`` is parsed to get the sample name, run id, lane
    and read number.  These, together with an encoding flag ('I' for the
    older naming scheme, 'S' for the newer one), are recorded in
    ``metadata_dict`` under the basename of the new link.  The parsed fields
    also form the link name, to guarantee filename uniqueness and a regular
    naming structure.

    Currently parsing by assuming AGRF naming structure and paired-end reads.
    Currently will ONLY handle gzipped files, to avoid multiple links to the
    same data; the suffix dots are therefore matched literally and the
    patterns are anchored at the end of the name.

    Args:
        file: path of the source fastq file.
        symlink_dir: directory in which the link is created.
        metadata_dict: mapping updated in place, keyed by link basename.

    Returns:
        Path of the new link inside ``symlink_dir``.

    Raises:
        SystemExit: with status 1 when the filename cannot be parsed.
    """
    basename = os.path.basename(file)

    # Older scheme: <run_id>_<sample>_s_<lane>_<pair>_sequence.txt.gz
    match_old = re.match(
        r"([^_/]+)_([a-zA-Z0-9.-]+)_s_([0-9]+)_(1|2)_sequence\.txt\.gz$",
        basename)
    # Newer scheme: <sample>_<run_id>_<CAGTN letters>_L<lane>_R<pair>.fastq.gz
    match_new = re.match(
        r"([a-zA-Z0-9.-]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2)\.fastq\.gz$",
        basename)

    if match_old:
        run_id, sample, lane, pair = match_old.groups()
        encoding = 'I'
    elif match_new:
        sample, run_id, lane, pair = match_new.groups()
        encoding = 'S'
    else:
        print("Unable to parse name of fastq file %s ." % file)
        sys.exit(1)
    lane = int(lane)

    link_name = "%s_%s_L%d_%s.fastq.gz" % (sample, run_id, lane, pair)
    newfile = os.path.join(symlink_dir, link_name)

    # Look the entry up once; setdefault preserves existing fields and does
    # not require metadata_dict to be a defaultdict.
    entry = metadata_dict.setdefault(link_name, {})
    entry['sample'] = sample
    entry['run_id'] = run_id
    entry['lane'] = lane
    entry['pair'] = pair
    entry['encoding'] = encoding

    # The link target is expressed relative to the link's own directory.
    relative_sourcefile = os.path.relpath(file, symlink_dir)
    mkLink(relative_sourcefile, newfile)
    return newfile
snpeff_dir = os.path.join(output_dir, "snpeff")
mkDir(snpeff_dir)

# directory for final summary tables
results_dir = os.path.join(output_dir, "results")
mkDir(results_dir)

# Pipeline declarations

# Making references

# Reference file setup: each reference file is linked (via mkLink) into
# ref_dir under its own base name, and the resulting path is kept in
# fasta_reference / fasta_dict.
RefName = os.path.basename(ref_files['fasta_reference'])
print(RefName)
fasta_reference = os.path.join(ref_dir, RefName)
mkLink(ref_files['fasta_reference'], fasta_reference)

# NOTE: the dictionary file name comes from ref_files['fasta_dict'] itself;
# it is not derived from the FASTA file name.
RefDict = os.path.basename(ref_files['fasta_dict'])
print(RefDict)
fasta_dict = os.path.join(ref_dir, RefDict)
mkLink(ref_files['fasta_dict'], fasta_dict)