def gse_task(gse_id, gsefiltdict=None, timestamp=None):
    """ gse_task

        GSE-based task for the celery job queue: downloads the study SOFT
        file and sample idats, then updates rmdb.

        Arguments:
            * gse_id : A single valid GSE id (str).
            * gsefiltdict : GSE filtered query object, as dictionary read
                using querydict() (dict). If None, it is loaded lazily via
                get_queryfilt_dict().
            * timestamp : NTP timestamp for versioning file downloads (str).
                If None, a fresh NTP timestamp is retrieved.

        Returns:
            * rl, a list of download dictionaries and rmdb update statuses.
    """
    # Fix: defaults were previously evaluated at definition time
    # (gsefiltdict=get_queryfilt_dict(), timestamp=gettime_ntp()), which runs
    # file I/O at import and freezes one timestamp for all calls.
    if not timestamp:
        run_timestamp = gettime_ntp()
    else:
        run_timestamp = timestamp
    if gsefiltdict is None:
        gsefiltdict = get_queryfilt_dict()
    print('Beginning GSE task, ID: ' + gse_id)
    rl = []
    rl.append(gse_id)
    if gsefiltdict:
        print('File gsefiltdict provided, continuing...')
        gsmlist = gsefiltdict[gse_id]
        print('Detected N = ' + str(len(gsmlist)) + ' GSM IDs...')
        if len(gsmlist) > 0:
            rl.append(True)
            print("Beginning soft file download...")
            ddsoft = dl_soft(gse_list=[gse_id], timestamp=run_timestamp)
            rl.append(True)
            print('Beginning idat download...')
            ddidat = dl_idat(input_list=gsmlist, timestamp=run_timestamp)
            rl.append(True)
            print('updating rmdb...')
            updateobj = update_rmdb(ddidat=ddidat, ddsoft=ddsoft)
            rl.append(True)
        else:
            print('No valid GSM IDs detected for study GSE ID ', gse_id,
                  ', skipping...')
            rl.append(None)
        print('Task completed! Returning...')
        return rl
    else:
        print("Error: no GSE query filt file provided. Returning...")
        rl.append(None)
        return rl
def firsttime_run(filedir='recount-methylation-files', run_timestamp=None):
    """ firsttime_run

        On first setup, run new equeries and the query filter.

        Arguments:
            * filedir (str): Dir name for db files (unused in this body;
                retained for caller compatibility).
            * run_timestamp (str) : NTP timestamp (unused in this body;
                retained for caller compatibility). Previously the default
                called gettime_ntp() at definition time; now None.

        Returns:
            * gseidlist (list): List of valid GSE IDs, or None on error.
    """
    print("Beginning first time server run...")
    equery_dest = settings.equerypath
    gse_query()
    gsm_query()
    # NOTE: the previous version also looked up the latest gse/gsm query
    # file paths here but never used them; those no-op lookups are removed.
    gsequery_filter()
    gsefiltpath = getlatest_filepath(equery_dest, 'gsequery_filt')
    if gsefiltpath:
        gsefiltd = querydict(querypath=gsefiltpath, splitdelim=' ')
        gseidlist = list(gsefiltd.keys())
        print("GSE id list of len " + str(len(gseidlist)) +
              " found. Returning...")
        return gseidlist
    # Fix: the original had an unreachable trailing 'return None' after
    # this branch; the early returns above make a single fallthrough clear.
    print("Error retrieving gse query filtered file. Returning...")
    return None
def scheduled_run(eqfilt_path=False, run_timestamp=None):
    """ scheduled_run

        Tasks performed on a regular schedule, after first setup. For the
        job queue, a list of GSE IDs is returned. The id list is filtered
        on existing GSE SOFT files to prioritize unrepresented experiments
        for download.

        Arguments:
            * eqfilt_path (str) : Filepath to edirect query filter file
                (unused in this body; retained for caller compatibility).
            * run_timestamp (str) : NTP timestamp (unused in this body;
                retained for caller compatibility).

        Returns:
            * gseid_filt (list) : List of valid GSE IDs, or None on error.
    """
    # Fix: eqpath was referenced below but never assigned, so the fallback
    # branch raised NameError; bind it from settings up front.
    eqpath = settings.equerypath
    try:
        gsefiltd = get_queryfilt_dict()
    except Exception:  # fix: was a bare 'except:'
        print("No gse query filt file found, checking for GSE and GSM " +
              "queries...")
        gsequery_latest = getlatest_filepath(filepath=eqpath,
                                             filestr='gse_edirectquery')
        if not gsequery_latest:
            gse_query()
        gsmquery_latest = getlatest_filepath(eqpath, 'gsm_edirectquery')
        if not gsmquery_latest:
            gsm_query()
        print("Running filter on GSE query...")
        gsequery_filter()
        gsefiltd = get_queryfilt_dict()
    # Collect GSE IDs already represented by downloaded SOFT files.
    gsesoftfiles = os.listdir(settings.gsesoftpath)
    print("GSE SOFT files: " + str(gsesoftfiles))
    rxgse = re.compile('GSE[0-9]*')
    gseid_softexists = [
        str(rxgse.findall(softfn)[0]) for softfn in gsesoftfiles
        if rxgse.findall(softfn)
    ]
    if gsefiltd:
        gseid_listall = list(gsefiltd.keys())
        print("GSE ID list of len " + str(len(gseid_listall)) +
              " found. Filtering..")
        if gseid_softexists:
            gseid_filt = [
                gseid for gseid in gseid_listall
                if gseid not in gseid_softexists
            ]
        else:
            gseid_filt = gseid_listall
        print("After filtering existing SOFT files, N = " +
              str(len(gseid_filt)) + " GSE IDs remain. Returning ID list...")
        # If all GSE IDs are represented, return all IDs for a fresh run.
        if len(gseid_filt) == len(gseid_listall):
            gseid_filt = gseid_listall
        return gseid_filt
    print("Error forming equery filt dictionary. Returning...")
    return None
def gsequery_filter(splitdelim='\t', timestamp=None):
    """ gsequery_filter

        Prepare an edirect query file. Filter a GSE query file on its GSM
        membership.

        Arguments:
            * splitdelim (str) : Delimiter to split ids in the querydict()
                call.
            * timestamp (str) : NTP timestamp used in the output filename.
                If None, a fresh timestamp is retrieved.

        Returns:
            * gsefiltl (list): Filtered GSE query object (list); writes the
              filtered query file as a side effect.
    """
    if timestamp is None:
        timestamp = gettime_ntp()  # lazy default; was evaluated at def time
    eqpath = settings.equerypath
    gsequerystr = settings.gsequerystr
    gsmquerystr = settings.gsmquerystr
    # Locate the latest GSM query file and extract its GSM IDs.
    gsmqueryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsmquerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsmqueryf_latestpath:
        print("Latest gsmquery file detected: " + str(gsmqueryf_latestpath))
    else:
        print("Error detecting latest gsmquery file! Returning...")
        return
    with open(gsmqueryf_latestpath[0]) as gsmfile:
        gsmlines = [line.rstrip('\n') for line in gsmfile]
    gsmlist = [line.split('\t')[1::][0] for line in gsmlines]
    # Locate the latest GSE query file and parse it into a dict.
    gsequeryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsequerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if gsequeryf_latestpath:
        print("Latest gsequery file detected: " + str(gsequeryf_latestpath))
    else:
        print("Error detecting latest gsequery file! Returning...")
        return
    # Fix: honor the splitdelim argument (it was documented but the call
    # hard-coded '\t'); default '\t' preserves prior behavior.
    gsed_obj = querydict(querypath=gsequeryf_latestpath[0],
                         splitdelim=splitdelim)
    gsmset = set(gsmlist)  # set gives O(1) membership tests in the loop
    gsefiltl = []
    for gsekey in list(gsed_obj.keys()):
        samplelist_filt = [
            sample for sample in gsed_obj[gsekey] if sample in gsmset
        ]
        if samplelist_filt:
            gsefiltl.append(' '.join([gsekey, ' '.join(samplelist_filt)]))
    print('writing filt file...')
    if eqpath:
        filtfn = ".".join(["gsequery_filt", timestamp])
        with open(os.path.join(eqpath, filtfn), 'w') as filtfile:
            for item in gsefiltl:
                filtfile.write("%s\n" % item)
    return gsefiltl
def eqd_gsm_exclude(equery_dest=settings.equerypath,
                    filesdir=settings.filesdir, gsmv_fname="gsmv.txt",
                    exclude_dpath=os.path.join("inst", "freeze_gsmv")):
    """ eqd_gsm_exclude

        Exclude GSM IDs from edirect query objects.

        Arguments:
            * equery_dest : Path to the equery directory (unused in this
                body; retained for caller compatibility).
            * filesdir : Root files directory (unused in this body; retained
                for caller compatibility).
            * gsmv_fname: Name of the file to load. Should include only
                space-separated sample/GSM IDs on a single line.
            * exclude_dpath: Path to directory containing gsmv_fname.

        Returns:
            * Path to the new filtered file at settings.equerypath, or None
              if the exclude file was not found.
    """
    gsmv_fpath = os.path.join(exclude_dpath, gsmv_fname)
    if not os.path.exists(gsmv_fpath):
        # Fix: previously only printed and fell through, crashing on open().
        print("Couldn't find sample ID file")
        return None
    with open(gsmv_fpath) as gsmvfile:
        # First (only) line holds space-separated GSM IDs; a set makes the
        # per-token exclusion test O(1).
        gsmv_exclude = set(
            [line.rstrip('\n').split(" ") for line in gsmvfile][0])
    eqpath = settings.equerypath
    gsefilt_latest = getlatest_filepath(eqpath, 'gsequery_filt',
                                        embeddedpattern=True, tslocindex=1,
                                        returntype='returnlist')[0]
    print("Starting with latest detected filter file: " + gsefilt_latest)
    with open(gsefilt_latest) as filtfile:
        querylines = [line.rstrip('\n') for line in filtfile]
    qlnew = []
    print("Applying filter...")
    for line in querylines:
        tokens = line.split(" ")
        ldat = [gid for gid in tokens if gid not in gsmv_exclude]
        # Keep the study only if at least one GSM remains after the GSE id.
        if len(ldat) > 1:
            qlnew.append(ldat)
    print("After filter, retained " + str(len(qlnew)) + " studies.")
    nts = gettime_ntp()
    newfpath = os.path.join(eqpath, ".".join(["gsequery_filt", nts]))
    print("Writing new filter file: ", newfpath)
    with open(newfpath, "w") as wf:
        for line in qlnew:
            wf.write(" ".join(line) + "\n")
    return newfpath
def gsm_query(validate=True, timestamp=None):
    """ gsm_query

        Get GSM level query object, from edirect query.

        Arguments:
            * validate (bool) : Whether to validate the file after download.
            * timestamp (str) : NTP timestamp for the query filename. If
                None, a fresh timestamp is retrieved (previously the default
                called gettime_ntp() at definition time).

        Returns:
            * dldict (dict) : Records the query filename, raw subprocess
              output, and (when validate is True) a True/False new-file flag.
    """
    if timestamp is None:
        timestamp = gettime_ntp()
    eqdestpath = settings.equerypath
    temppath = settings.temppath
    os.makedirs(eqdestpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_make = tempfile.mkdtemp(dir=temppath)
    atexit.register(shutil.rmtree, temp_make)
    dldict = {}
    dldict['gsmquery'] = []
    dlfilename = ".".join(['gsm_edirectquery', timestamp])
    dldict['gsmquery'].append(dlfilename)
    # edirect pipeline: esearch | efetch | xtract, redirected to a temp file.
    # NOTE: shell=True is required for the pipe/redirect; settings.platformid
    # is trusted configuration, not user input.
    subp_strlist1 = [
        "esearch", "-db", "gds", "-query",
        "'" + settings.platformid + "[ACCN] AND idat[suppFile] AND gsm[ETYP]'"
    ]
    subp_strlist2 = ["efetch", "-format", "docsum"]
    subp_strlist3 = [
        "xtract", "-pattern", "DocumentSummary", "-element", "Id Accession",
        ">", os.path.join(temp_make, dlfilename)
    ]
    args = " | ".join([
        " ".join(subp_strlist1), " ".join(subp_strlist2),
        " ".join(subp_strlist3)
    ])
    output = subprocess.check_output(args, shell=True)
    dldict['gsmquery'].append(output)
    if validate:
        gsmquery_filewritten = os.path.join(temp_make, dlfilename)
        gsmquery_old = glob.glob('.'.join([
            os.path.join(eqdestpath, 'gsm_edirectquery'), '*',
        ]))
        if gsmquery_old:
            if len(gsmquery_old) > 1:
                # Sort by embedded NTP timestamp (second dot-delimited
                # token of the filename) to find the most recent file.
                gsmquery_old.sort(key=lambda x: int(x.split('.')[1]))
                gsmquery_old_mostrecent = gsmquery_old[-1]
            else:
                gsmquery_old_mostrecent = gsmquery_old[0]
            # filecmp works here because equery file row order is preserved
            # between repeat downloads.
            if filecmp.cmp(gsmquery_old_mostrecent, gsmquery_filewritten):
                print("Downloaded gsm query file same as most recent stored."
                      + " Removing...")
                os.remove(gsmquery_filewritten)
                dldict['gsmquery'].append(False)
            else:
                print("Downloaded file is new, moving to dest...")
                shutil.move(
                    gsmquery_filewritten,
                    os.path.join(eqdestpath,
                                 os.path.basename(gsmquery_filewritten)))
                dldict['gsmquery'].append(True)
        else:
            print("Downloaded file is new, moving...")
            shutil.move(
                gsmquery_filewritten,
                os.path.join(eqdestpath,
                             os.path.basename(gsmquery_filewritten)))
            dldict['gsmquery'].append(True)
    return dldict
def write_cjson(jffnv, ts=None, newfilefn="cjson", tempdname="cjsontemp",
                jsonfiltpath=None, msrap_destpath=None):
    """ write_cjson

        Write a composite JSON file aggregating multiple samples' filtered
        JSON metadata into a single JSON array.

        Arguments:
            * jffnv : Vector of filtered JSON filenames (list).
            * ts : Timestamp for output and input files (str). If None, a
                fresh NTP timestamp is retrieved.
            * newfilefn : Filename stem of the new file to write (str).
            * tempdname : Name of dir, under msrap_destpath, to contain
                composite JSON files (str).
            * jsonfiltpath : Path to filtered GSM JSON files (str). If None,
                settings.gsmjsonfiltpath is used.
            * msrap_destpath : Path to MetaSRA-pipeline output files (str).
                If None, settings.gsmmsrapoutpath is used.

        Returns:
            * Path to the new composite JSON file (str). The file is only
              written when at least one input file had content.
    """
    # Lazy defaults: previously ts/jsonfiltpath/msrap_destpath defaults were
    # evaluated at definition time (import-time settings/NTP access).
    if ts is None:
        ts = gettime_ntp()
    if jsonfiltpath is None:
        jsonfiltpath = settings.gsmjsonfiltpath
    if msrap_destpath is None:
        msrap_destpath = settings.gsmmsrapoutpath
    os.makedirs(jsonfiltpath, exist_ok=True)
    temppath_write = os.path.join(msrap_destpath, tempdname)
    os.makedirs(temppath_write, exist_ok=True)
    ll = []   # per-file lists of raw lines
    fnl = []  # file paths matching entries of ll
    for fn in jffnv:
        fpath = os.path.join(jsonfiltpath, fn)
        if os.path.exists(fpath):
            with open(fpath, "r") as openjson:
                linesi = openjson.readlines()
            if len(linesi) > 0:
                ll.append(linesi)
                fnl.append(fpath)
    newfn = ".".join([newfilefn, ts])
    # Fix: local was previously misspelled 'wite_fpath'.
    write_fpath = os.path.join(temppath_write, newfn)
    if len(ll) > 0:
        print("Read data for " + str(len(ll)) + " files. Writing data")
        with open(write_fpath, "w") as opencj:
            opencj.write("[\n")  # open the JSON array
            for fi, file in enumerate(ll):
                ld = []
                for line in file:
                    if line == "}\n":
                        # End of one sample's object: emit collected fields.
                        opencj.write("\t{\n")
                        ld = ld[1::]
                        jfname = os.path.basename(fnl[fi])
                        gsmid = '"' + jfname.split(".")[1] + '"'
                        lpath = ":".join(['"gsm"', gsmid])
                        opencj.write("\t\t" + lpath + ",\n")  # sample id
                        for ii, ldi in enumerate(ld):
                            lf = ldi.split(":")
                            if lf[0] == ' !Sample_source_name_ch1':
                                lf = ['source'] + lf[1::]
                            elif lf[0] == ' !Sample_title':
                                lf = ['title'] + lf[1::]
                            else:
                                lf = lf[1::]
                            lf = ['"' + tok + '"' for tok in lf]
                            lf = ':'.join(lf[0:2])
                            if ii == len(ld) - 1:
                                lf = lf + "\n"  # no comma after final value
                            else:
                                lf = lf + ",\n"
                            opencj.write("\t\t" + lf)
                        if fi == len(ll) - 1:
                            opencj.write("\t}\n")  # no comma, final entry
                        else:
                            opencj.write("\t},\n")
                    else:
                        # Accumulate a cleaned field line for this sample.
                        ldi = line
                        ldi = ldi.replace(']', '')
                        ldi = ldi.replace('[', '')
                        ldi = ldi.replace('"', '')
                        ldi = ldi.replace('\n', '')
                        ldi = ldi.replace(',', '')
                        if not ldi == "":
                            ld.append(ldi)
            opencj.write("]")  # close the JSON array
    return write_fpath
def run_msrap_compjson(json_flist=None, njint=500, jsonpatt=".*json.filt$",
                       gsm_jsonpath=settings.gsmjsonfiltpath,
                       tempdname="cjsontemp",
                       msrap_destpath=settings.gsmmsrapoutpath,
                       newfnpattern="msrap.cjson"):
    """ run_msrap_compjson

        Run MetaSRA-pipeline on composite JSON files.

        Runs the MetaSRA-pipeline on composite JSON files containing njint
        samples' JSON-formatted metadata. The composite JSON files and the
        composite metadata outputs are both written to tempdname at
        msrap_destpath. After mapping, get_gsm_outputs() is called to make
        the GSM-specific files, output to the top level of msrap_destpath.

        Arguments:
            * json_flist : List of JSON filename(s) to process. If None or
                empty, automatically targets all JSON files at gsm_jsonpath
                (list, optional). Fix: the old default was a mutable [].
            * njint : Number of JSON files per composite file (int).
            * jsonpatt : Filename pattern for valid filtered JSON files (str).
            * gsm_jsonpath : Path to filtered GSM JSON files dir (str).
            * tempdname : Dir, at msrap_destpath, for composite JSON files
                and outputs (str).
            * msrap_destpath : Path for mapped metadata output files (str).
            * newfnpattern : Filename pattern for mapped metadata output (str).

        Returns:
            * None; produces the composite file pairs and GSM metadata files.
    """
    if json_flist is None:
        json_flist = []
    # Retained for its side effect: raises early if no query filter exists.
    eqfiltdict = get_queryfilt_dict()
    msrap_runpath = settings.msraprunscriptpath
    # Collect GSM IDs that already have mapped output, to skip them below.
    msrap_oldgsm = []
    if os.path.exists(msrap_destpath):
        mld = os.listdir(msrap_destpath)
        mld = [i for i in mld if not i == "cjsontemp"]
        if len(mld) > 0:
            msrap_oldgsm = [
                fn.split(".")[1] for fn in mld if len(fn.split(".")) > 1
            ]
    if not json_flist:
        if os.path.exists(gsm_jsonpath):
            json_flist = os.listdir(gsm_jsonpath)
        else:
            print("Couldn't find JSON file dir at " + gsm_jsonpath)
    print("Filtering GSM JSON filenames on pattern, existing msrap files...")
    gsm_json_fn_list = list(filter(re.compile(jsonpatt).match, json_flist))
    gsm_json_fn_list = [
        fn for fn in gsm_json_fn_list
        if not fn.split(".")[1] in msrap_oldgsm
    ]
    cjsonpath = os.path.join(msrap_destpath, tempdname)
    os.makedirs(cjsonpath, exist_ok=True)
    process_list = []
    print("Running pipeline for composite JSON files...")
    for r in range(0, len(gsm_json_fn_list), njint):
        ts = gettime_ntp()  # fresh ts for each new composite file pair
        jsonflist = gsm_json_fn_list[r:r + njint]
        cjreadpath = write_cjson(jffnv=jsonflist, jsonfiltpath=gsm_jsonpath,
                                 msrap_destpath=msrap_destpath, ts=ts,
                                 tempdname=tempdname)
        newfn = ".".join([newfnpattern, ts])
        cjwritepath = os.path.join(cjsonpath, newfn)
        # List-form argv with shell=False avoids shell injection.
        cmdlist = [
            'python2', msrap_runpath, "--fnvread", cjreadpath, "--fnvwrite",
            cjwritepath
        ]
        process_list.append(subprocess.call(cmdlist, shell=False))
        print("Finished index " + str(r))
    print("Extracting GSM data from composite JSON results...")
    get_gsm_outputs()
    return None
def rmdb_fpaths():
    """ rmdb_fpaths

        Get filepaths for existing sample idats and create hardlinks for
        expanded idat pairs lacking them.

        Returns:
            * hlinklist, list of new hlink files created at
              settings.idatspath.
    """
    timestamp = gettime_ntp()
    # List all previously expanded idat files directly from the idats dir.
    instpath_allidats = os.listdir(settings.idatspath)
    # Compressed idats (counted for reporting only).
    instpath_compidats = list(
        filter(re.compile(r'.*\.idat\.gz$').match, instpath_allidats))
    # Expanded idats.
    instpath_expidat = list(
        filter(re.compile(r'.*\.idat$').match, instpath_allidats))
    # Idat hardlinks.
    instpath_hlink = list(
        filter(re.compile(r'.*hlink.*').match, instpath_expidat))
    # Expanded idats without hardlinks.
    instpath_nohlink = [i for i in instpath_expidat
                        if i not in instpath_hlink]
    print("Detected " + str(len(instpath_compidats)) + " compressed IDATs, "
          + str(len(instpath_expidat)) + " expanded IDATs, and "
          + str(len(instpath_nohlink)) + " expanded IDATs without hlinks.")
    print("Getting GSM IDs for IDATs without hlinks...")
    # Unique GSM IDs (first dot-delimited token of each filename).
    gsmlist = list(set([i.split(".")[0] for i in instpath_nohlink]))
    instpath_idatspathlist = [
        i for i in instpath_nohlink if i.split(".")[0] in gsmlist
    ]
    hlinklist = []
    for gsmid in gsmlist:
        print("Processing GSM ID " + gsmid + "...")
        gsm_idats = [
            i for i in instpath_idatspathlist if i.split(".")[0] == gsmid
            and os.path.exists(os.path.join(settings.idatspath, i))
        ]
        try:
            # IndexError when a Grn or Red channel file is missing.
            igrn_fn = list(
                filter(re.compile(r".*Grn\.idat$").match, gsm_idats))[0]
            ired_fn = list(
                filter(re.compile(r".*Red\.idat$").match, gsm_idats))[0]
            basename_grn = "_".join(igrn_fn.split(".")[2].split("_")[0:-1])
            basename_red = "_".join(ired_fn.split(".")[2].split("_")[0:-1])
            # Only link when both channels share a non-empty basename.
            if (basename_grn == basename_red and not basename_grn == ""
                    and not basename_red == ""):
                print("Making new IDAT hlinks for GSM ID " + gsmid)
                rlist = new_idat_hlinks(gsmid=gsmid, ts=timestamp,
                                        igrn_fn=igrn_fn, ired_fn=ired_fn)
                hlinklist.append(rlist)
        except IndexError:  # fix: was a bare 'except'
            print("Couldn't find Red and Grn IDATs for GSM ID " + gsmid)
        print("Finished with GSM ID " + gsmid)
    print("Made " + str(len(hlinklist)) + " new IDAT hlinks. Returning...")
    return hlinklist
# NOTE(review): top-level scratch/debug fragment. It references names that
# are not defined at module level in this chunk (gseid, gseid_processnew,
# gsefn_processnew, gsesoftl, i) and will raise NameError if executed as-is
# — presumably leftover from an interactive session; verify before keeping.
if gseid in gseid_processnew:
    gsefn_processnew.append(gsesoftl[i])
len(gsefn_processnew)  # bare expression; result is discarded (REPL residue)
#---------------------------------
# process a single gse soft file
#---------------------------------
# Regex markers delimiting the per-sample section of a SOFT file.
softopenindex='.*!Sample_title.*'
softcloseindex='.*!Sample_data_row_count.*'
timestamp=gettime_ntp()
gse_softpath = settings.gsesoftpath
gsm_softpath = settings.gsmsoftpath
# NOTE(review): gsmsoft_destpath duplicates gsm_softpath — confirm intended.
gsmsoft_destpath = settings.gsmsoftpath
validate=True
which_gsefn = 0  # index of the single GSE SOFT file to process
gsefn = gsefn_processnew[which_gsefn]
gsesoft_flist=[gsefn]
# Valid GSM IDs are the union of all GSM lists in the query filter dict.
eqfiltdict=get_queryfilt_dict()
validgsmlist = list(set([gsmid for gselist in list(eqfiltdict.values())
    for gsmid in gselist ]))
print("length validgsmlist : "+str(len(validgsmlist)))
def compile_rsheet(gsmfpathdict):
    """ compile_rsheet

        Takes a dictionary of GSM IDs mapped to filepaths. Compiles valid
        GSM IDs, filenames, and paths into an rsheet object.

        Arguments:
            * gsmfpathdict: gsm paths dict obj output from rmdb_fpaths()
                (dict). Values are expected to be lists of filepaths;
                None/False entries are tolerated and filtered out.

        Returns:
            * lsheet (list of row strings), or None if no valid GSM IDs
              were detected; writes the rsheet file as a side effect.
    """
    timestamp = gettime_ntp()
    print("Getting equery filter...")
    eqd = get_queryfilt_dict()
    # Valid GSM IDs: union of all GSM lists across studies in the filter.
    gsmvalidlist = list(set([gsmid for gselist in list(eqd.values())
        for gsmid in gselist ]))
    sheetspath = settings.sheetspath; sheetfn_ext = settings.sheetfnstem
    os.makedirs(sheetspath, exist_ok = True)
    sheets_fpath = os.path.join(sheetspath,
        ".".join([timestamp, sheetfn_ext]))
    # Table is built as a list of row strings; first row is the header.
    print("Forming table list for rsheet..."); lsheet = []
    lsheet.append(" ".join(["gsmid", "gseid", "idats_fn", "msrapmd_fn",
        "msrapmd_flatjson", "SENTRIX_ID", "ARRAY_ID", "Basename"]))
    lsheet[0] = lsheet[0]+"\n"
    print("Forming filtered GSM dictionary...")
    # Keep only GSM entries present in the equery filter.
    gsmvalid_fpathlist = {key:value for (key,value) in gsmfpathdict.items()
        if key in gsmvalidlist}
    if gsmvalid_fpathlist:
        print("Starting iterations on gsm filepaths list of len = "
            +str(len(list(gsmvalid_fpathlist.keys()))))
        for gsmindex, gsmid in enumerate(gsmvalid_fpathlist, 1):
            print("Beginning GSM num "+str(gsmindex)+", id: "+str(gsmid))
            # Drop None/False placeholder paths for this GSM.
            gsmvalid_fp = [fp for fp in gsmvalid_fpathlist[gsmid]
                if not fp==None and not fp==False]
            if gsmvalid_fp:
                print("Getting GSE ID...")
                # A GSM may belong to several studies; join ids with ';'.
                gseid = ';'.join(list(set([gsek for gsek in list(eqd.keys())
                    if gsmid in eqd[gsek] ] ) ) )
                print("GSE id found: "+str(gseid))
                gsm_fpaths = gsmvalid_fp
                gsmi_redidatpath = [fp for fp in gsm_fpaths
                    if "_Red.idat" in fp]
                gsmi_grnidatpath = [fp for fp in gsm_fpaths
                    if "_Grn.idat" in fp]
                gsmi_msrappath = [fp for fp in gsm_fpaths
                    if "soft.msrapout" in fp]
                if gsmi_redidatpath and gsmi_grnidatpath:
                    print("Found paired channel idats for GSM...")
                    gsmi_redidatpath = gsmi_redidatpath[0]
                    gsmi_grnidatpath = gsmi_grnidatpath[0]
                    # idat filenames
                    grn_idatfn = os.path.basename(gsmi_grnidatpath)
                    red_idatfn = os.path.basename(gsmi_redidatpath)
                    # Sample basename (common stem of channel filenames).
                    print("Getting sample basename..."); gsmimdd = []
                    gsmi_basename = "_".join(red_idatfn.split("_")[0:3])
                    if gsmi_msrappath:
                        print("Detected metadata file for GSM.")
                        gsmi_msrappath = gsmi_msrappath[0]
                        gsmi_msrappath_var = os.path.basename(gsmi_msrappath)
                        print("Forming flattened sample metadata...")
                        try:
                            # Load the msrap mapped-terms JSON file.
                            with open(gsmi_msrappath, 'r') as msrapmd:
                                gsmimdd = json.load(msrapmd)
                        except json.decoder.JSONDecodeError:
                            print("Error, cannot load non-json file: "
                                +gsmi_msrappath)
                            gsmi_msrappath_var = "NA"
                    else:
                        gsmi_msrappath_var = "NA"
                    if gsmimdd and not gsmi_msrappath_var == "NA":
                        gsmi_md = gsmimdd[0]; gmd = []
                        # Coerce JSON metadata to one flat ';'-joined string.
                        for key in list(gsmi_md.keys()):
                            print(str(key))
                            kval = ''
                            if key == 'sample type':
                                print("key = 'sample type'")
                                print("kval = "+str(gsmi_md[key]))
                                gmd.append(str("sampletype="
                                    +str(gsmi_md[key])))
                            if key == 'real-value properties':
                                print("key = "+str(key))
                                kval = gsmi_md[key]
                                print("kval : "+str(kval))
                                for index, val in enumerate(kval):
                                    subkval = kval[index]
                                    print("subkval : "+str(subkval))
                                    gmd.append(str(subkval['property_id']))
                            if key == 'mapped ontology terms':
                                print("key = "+str(key))
                                kval = gsmi_md[key]
                                gmd.append(";".join([term for term in kval]))
                            if key == 'sample-type confidence':
                                print("key = "+str(key))
                                gmd.append('sampletypeconf='
                                    +str(gsmi_md[key]))
                        gsmi_mdvar = ";".join(gmd) # long metadata string
                    else:
                        gsmi_mdvar = "NA"
                    # Form the table row entry for gsmid as one long string.
                    print("Adding row to table list...")
                    lgsmi = " ".join([gsmid, # gsm id
                        gseid, # gse id
                        ";".join([red_idatfn,grn_idatfn]), # idat filenames
                        gsmi_msrappath_var, # metadata filename
                        gsmi_mdvar, # flattened json file
                        grn_idatfn.split("_")[-2], # sentrix id
                        grn_idatfn.split("_")[-3], # array id
                        gsmi_basename # minfi path Basename, for arrays
                        ])
                    lgsmi = lgsmi+"\n"
                    lsheet.append(lgsmi)
                else:
                    print("Error: GSM is missing one or more valid "
                        "filepaths. Continuing...")
    else:
        print("No valid GSM IDs detected. Check idats and pipeline folders.")
        return None
    print("Finished processing the GSM files dictionary, writing new "
        "rsheet...")
    with open(sheets_fpath,'+w') as fsheet:
        for gsmitem in lsheet:
            fsheet.write(gsmitem)
    return lsheet
def preprocess_mdat(bnlistpass, timelim=40, nsampproc=10, nprocmax=4,
                    statint=2):
    """ preprocess_mdat

        Preprocess mdat files via background subprocesses, monitoring, and
        logging.

        Arguments:
            * bnlistpass (list) : List of valid basenames.
            * timelim (int) : Time limit for running processes, in minutes
                (not used directly here; see monitor_processes).
            * nsampproc (int) : Number of samples per launched process.
            * nprocmax (int) : Max total processes to launch.
            * statint (int) : Seconds between monitor status updates (not
                used directly here; see monitor_processes).

        Returns:
            * None; produces a status log in stdout and a new logfile as a
              side effect.
    """
    # Chunk the basenames into space-joined groups of nsampproc entries,
    # capped at nprocmax groups.
    print("Forming basenames array...")
    n = nsampproc
    bnscreenarray = [
        ' '.join(bnlistpass[i * n:(i + 1) * n])
        for i in range((len(bnlistpass) + n - 1) // n)
    ]
    bnscreenarray = bnscreenarray[0:nprocmax]
    print("Finished forming basenames array of length = "
          + str(len(bnscreenarray)))
    print("Getting timestamp...")
    timestamp = gettime_ntp()
    process_list = []  # Popen handles for status monitoring and stderr
    # Query the OS arg-length limit so we never exceed it when launching.
    print("Getting args maxstr...")
    # Fix: decode the bytes output properly instead of string-munging the
    # bytes repr (str(b'...') produced "b'...\\n'" that needed hacks).
    argmaxstr = int(
        subprocess.check_output(['getconf', 'ARG_MAX']).decode().strip())
    print("Detected argmaxstr of " + str(argmaxstr) + ". Continuing...")
    print("Launching background subprocesses...")
    for bi, bnstr in enumerate(bnscreenarray, 0):
        cmd = [
            'Rscript', settings.mdatscriptpath, timestamp, str(bi),
            '"' + bnstr + '"'
        ]
        cmdcharlen = len(''.join(cmd))
        print("Formed cmd str of len = " + str(cmdcharlen)
              + ", checking args str limit...")
        if cmdcharlen <= argmaxstr:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            process_list.append(proc)
            print("Launched background subprocess and appended poll to "
                  + "statuslist. Continuing...")
        else:
            print("Error! Char length of cmd exceeds system limit for args. "
                  + "Try modifying argument 'nsampproc'. Continuing...")
    # Process monitoring start.
    monitor_processes(process_list=process_list,
                      logpath=settings.mdatlogspath)
    print("Completed preprocessing. Returning...")
    return None
def dl_idat(input_list, retries_connection=3, retries_files=3,
        interval_con=.1, interval_file=.01, validate=True,
        timestamp=gettime_ntp()):
    """ dl_idat

        Download idats, reading in either list of GSM IDs or ftp addresses.

        Arguments:
            * input_list (list, required) : A list of valid GSM IDs.
            * retries_connection (int) : Number of ftp connection retries
                allowed.
            * retries_files (int) : Number of retry attempts allowed for
                sample file downloads.
            * interval_con (float) : Time (in seconds) to sleep before
                retrying a database connection.
            * interval_file (float) : Time (in seconds) to sleep before
                retrying a file connection.
            * validate (bool): Validate new files against existing idats?
            * timestamp (str) : An NTP timestamp for versioning.
                NOTE(review): this default is evaluated once at definition
                time, so all calls that omit it share one stale timestamp —
                consider a None sentinel.

        Returns:
            * dldict (dictionary) : Records, dates, and exit statuses of ftp
              calls, OR error string over connection issues. Downloads and
              moves new and validated files as side effect.
    """
    idatspath = settings.idatspath
    temppath = settings.temppath
    os.makedirs(idatspath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    # Sanity-check only the first entry for the GSM prefix.
    item = input_list[0]
    if not item.startswith('GSM'):
        raise RuntimeError("GSM IDs must begin with \"GSM\".")
    ftptoken_login = '******'
    # Connect to the FTP host, retrying up to retries_connection times.
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = '
                    + str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                # Connection failed permanently; return the error string.
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection, used for stored file-date lookups below.
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    files_written = []
    for gsm_id in input_list:
        print('Starting GSM: ' + gsm_id)
        dldict[gsm_id] = []
        # GEO supplementary-file path: samples/GSMxxx -> GSMxxxnnn bucket.
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'samples', gsm_id[:-3] + 'nnn',
            gsm_id, 'suppl'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        filenames = []
        retries_left_files = retries_files
        try:
            filenames = ftp.nlst(id_ftpadd)
            if len(filenames) > 0:
                filestr = '; '.join(str(e) for e in filenames)
                print("files found: " + filestr)
                dldict[gsm_id].append([
                    gsm_id, id_ftpadd,
                    "connection success, valid num idats found"
                ])
                print("Idat filenames detected for " + gsm_id
                    + ", continuing...")
                for file in filenames:
                    print("Beginning iteration for file: " + file)
                    filedate = ""
                    filedate_estat = ""
                    filedl_estat = ""
                    file_tokens = file.split('/')
                    try:
                        # Fetch the remote modification time (MDTM) and
                        # compare against the stored date in mongodb.
                        filedate = ftp.sendcmd("MDTM /"
                            + '/'.join(file_tokens))
                        filedate = datetime.datetime.strptime(
                            filedate[4:], "%Y%m%d%H%M%S")
                        mongo_date = idat_mongo_date(gsm_id, file, client)
                        if filedate in mongo_date:
                            # Remote file unchanged; skip this GSM's files.
                            filedate_estat = "same_as_local_date"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            print('Online date same as local date. '
                                'Breaking..')
                            break
                        else:
                            filedate_estat = "new_date"
                            to_write = os.path.join(
                                temp_dir_make, '.'.join(
                                    [gsm_id, str(timestamp),
                                    file_tokens[-1]]))
                            file_ftpadd = '/'.join(file_tokens[:-1])
                            file_ftpadd = file_ftpadd + '/' + file_tokens[
                                -1:][0]
                            print('Attempting file download, for file: '
                                + file)
                            try:
                                # Stream the file to the temp dir.
                                with open(to_write, 'wb') as output_stream:
                                    filedl_estat = ftp.retrbinary(
                                        "RETR /" + file_ftpadd,
                                        output_stream.write)
                                dldict[gsm_id].append([
                                    gsm_id, file_ftpadd, to_write,
                                    filedl_estat, filedate, filedate_estat
                                ])
                                if '226 Transfer complete' in filedl_estat:
                                    # Record index of this dldict entry so
                                    # validation can update it later.
                                    files_written.append(
                                        (gsm_id, to_write,
                                        len(dldict[gsm_id]) - 1))
                                    print("File successfully downloaded. "
                                        + "Continuing...")
                                    continue
                            except ftplib.all_errors as efiledl:
                                if retries_left_files:
                                    retries_left_files -= 1
                                    print('ftp file dl error, retries left'
                                        ' = ' + str(retries_left_files))
                                    time.sleep(interval_file)
                                    continue
                                else:
                                    print('File retries exhausted. '
                                        'Breaking...')
                                    filedl_estat = str(efiledl)
                                    dldict[gsm_id].append([
                                        gsm_id, file_ftpadd, to_write,
                                        filedl_estat, filedate,
                                        filedate_estat
                                    ])
                                    break
                            break
                        break
                    except ftplib.all_errors as efiledate:
                        if retries_left_files:
                            retries_left_files -= 1
                            print('ftplib file date error, retries left = '
                                + str(retries_left_files))
                            time.sleep(interval_file)
                            continue
                        else:
                            print('File retries exhausted. Breaking...')
                            filedate_estat = str(efiledate)
                            filedate = "not_available"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            break
                continue
            else:
                dldict[gsm_id].append([gsm_id, "no files at ftp address"])
                break
        except ftplib.error_temp as eid:
            if retries_left_files:
                retries_left_files -= 1
                print('ftplib filenames error, retries left = '
                    + str(retries_left_files))
                time.sleep(interval_file)
                continue
            else:
                print('File retries exhausted. Breaking...')
                dldict[gsm_id].append([gsm_id, id_ftpadd, str(eid)])
                break
    if validate:
        # Compare each downloaded temp file to the latest stored idat;
        # keep it only if new (move to idatspath), else remove it.
        print("Validating downloaded files...")
        for gsm_id, file_written, index in files_written:
            print("file written is " + file_written)
            filestr = os.path.basename(file_written).split('.')[2::]
            filestr = str('.'.join(filestr))
            print('filestr written : ' + filestr)
            print('dir to search latest: ' + idatspath)
            gsmidat_latest = getlatest_filepath(idatspath, filestr,
                embeddedpattern=True, returntype='returnlist', tslocindex=1)
            print('gsm latest: ' + str(gsmidat_latest))
            if gsmidat_latest:
                gsmidat_latest = gsmidat_latest[0]
                print('cmp result: '
                    + str(filecmp.cmp(gsmidat_latest, file_written)))
                if filecmp.cmp(gsmidat_latest, file_written):
                    print("Downloaded file is same as recent file. "
                        "Removing...")
                    os.remove(file_written)
                    # If flag is False, the download matched the stored copy.
                    dldict[gsm_id][index].append(False)
                else:
                    print("Downloaded file is new, moving to idatspath...")
                    shutil.move(file_written,
                        os.path.join(idatspath,
                            os.path.basename(file_written)))
                    dldict[gsm_id][index].append(True)
                    dldict[gsm_id][index][2] = os.path.join(
                        idatspath, os.path.basename(file_written))
            else:
                print("Downloaded file is new, moving...")
                shutil.move(file_written,
                    os.path.join(idatspath,
                        os.path.basename(file_written)))
                dldict[gsm_id][index].append(True)
                dldict[gsm_id][index][2] = os.path.join(
                    idatspath, os.path.basename(file_written))
    shutil.rmtree(temp_dir_make)
    return dldict
def dl_soft(gse_list=None, retries_connection=3, retries_files=3,
    interval_con=.1, interval_file=.01, validate=True, timestamp=None):
    """ dl_soft
    Download GSE soft file(s) for a list of GSE IDs from the GEO ftp server.
    Arguments:
    * gse_list (list, required) : A list of valid GSE id(s). (Default None
        avoids a shared mutable default; an empty/missing list raises.)
    * retries_connection (int) : Number of ftp connection retries allowed.
    * retries_files (int) : Number of retry attempts allowed for sample file
        downloads.
    * interval_con (float) : Time (in seconds) to sleep before retrying a
        database connection.
    * interval_file (float) : Time (in seconds) to sleep before retrying a
        file connection.
    * validate (Bool.): Validate new files against existing soft files?
    * timestamp (str) : An NTP timestamp for versioning; if None, a fresh
        timestamp is obtained at call time (previously the default was
        evaluated once at import time).
    Returns:
    * Dictionary showing records, dates, and exit statuses of ftp calls OR
        error string over connection issues.
    Raises:
    * RuntimeError : If gse_list is empty or an ID lacks the 'GSE' prefix.
    """
    if gse_list is None:
        gse_list = []
    if not gse_list:
        # previously an empty list crashed later with IndexError
        raise RuntimeError("GSE IDs must begin with \"GSE\".")
    if timestamp is None:
        timestamp = gettime_ntp()
    gsesoftpath = settings.gsesoftpath
    temppath = settings.temppath
    os.makedirs(gsesoftpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    item = gse_list[0]
    if not item.startswith('GSE'):
        raise RuntimeError("GSE IDs must begin with \"GSE\".")
    ftptoken_login = '******'
    # connection retry loop; decrement BEFORE testing so the exhausted branch
    # is reachable (the old `if` was always true inside the while and the
    # function fell through with `ftp` unbound after exhaustion)
    retries_left_connection = retries_connection
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            retries_left_connection -= 1
            if retries_left_connection:
                print('continuing with connection retries left = '
                    + str(retries_left_connection))
                time.sleep(interval_con)
                continue
            print('connection retries exhausted, returning...')
            return str(e)
    # mongodb connection, used to compare remote dates with stored ones
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    # collect (gse, filepath, record index) across ALL GSE IDs so the
    # validation pass below covers every downloaded file (this list was
    # previously reset per-GSE, validating only the final GSE)
    files_written = []
    print('beginning iterations over gse list...')
    for gse in gse_list:
        print('beginning download for gse: ' + gse)
        retries_left_files = retries_files
        dldict[gse] = []
        filenames = []
        # tokens for soft file ftp address
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'series', gse[:-3] + 'nnn', gse,
            'soft'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        while retries_left_files:
            try:
                filenames = ftp.nlst(id_ftpadd)
                # filter for only soft file names; guard against an address
                # with no family.soft file (previously raised IndexError)
                softlist = list(
                    filter(lambda x: 'family.soft' in x, filenames))
                if not softlist:
                    dldict[gse].append([gse, id_ftpadd, "no soft file found"])
                    break
                file = softlist[0]
                dldict[gse].append([gse, id_ftpadd, "success"])
                filedate = ""
                filedate_estat = ""
                filedl_estat = ""
                file_tokens = file.split('/')
                try:
                    print('getting date from ' + '/'.join(file_tokens))
                    filedate = ftp.sendcmd("MDTM /" + '/'.join(file_tokens))
                    # MDTM reply is '213 YYYYMMDDHHMMSS'; skip the code
                    filedate = datetime.datetime.strptime(
                        filedate[4:], "%Y%m%d%H%M%S")
                    mongo_date = soft_mongo_date(gse, file, client)
                    if filedate in mongo_date:
                        print('online date same as local date,' +
                            'breaking...')
                        filedate_estat = "same_as_local_date"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
                    else:
                        print('new online date found, continuing...')
                        filedate_estat = "new_date"
                        to_write = os.path.join(
                            temp_dir_make,
                            '.'.join([gse, timestamp, file_tokens[-1]]))
                        file_ftpadd = '/'.join(file_tokens[:-1])
                        file_ftpadd = (file_ftpadd + '/'
                            + file_tokens[-1:][0])
                        try:
                            print('downloading soft from ' + file_ftpadd)
                            with open(to_write, 'wb') as output_stream:
                                filedl_estat = ftp.retrbinary(
                                    "RETR /" + file_ftpadd,
                                    output_stream.write)
                            dldict[gse].append([
                                gse, file_ftpadd, to_write, filedl_estat,
                                filedate, filedate_estat
                            ])
                            if '226 Transfer complete' in filedl_estat:
                                # remember index of the record to patch
                                # during validation
                                files_written.append(
                                    (gse, to_write, len(dldict[gse]) - 1))
                            print('total files written = '
                                + str(len(files_written)))
                            print('soft transfer successful for '
                                + to_write + ', breaking...')
                            break
                        except ftplib.all_errors as efiledl:
                            print('file download error from ' + file_ftpadd)
                            retries_left_files -= 1
                            if retries_left_files:
                                print('continuing with file retries left ='
                                    + str(retries_left_files))
                                time.sleep(interval_file)
                                continue
                            print('file retries exhausted, breaking..')
                            filedl_estat = str(efiledl)
                            dldict[gse].append([
                                gse, file_ftpadd, to_write, filedl_estat,
                                filedate, filedate_estat
                            ])
                            break
                except ftplib.all_errors as efiledate:
                    print('error getting date from '
                        + '/'.join(file_tokens))
                    retries_left_files -= 1
                    if retries_left_files:
                        print('continuing with file retries left = '
                            + str(retries_left_files))
                        time.sleep(interval_file)
                        continue
                    print('file retries exhausted, breaking..')
                    filedate_estat = str(efiledate)
                    filedate = "not_available"
                    dldict[gse].append(
                        [gse, file, filedate, filedate_estat])
                    break
            except ftplib.error_temp as eid:
                print('error making ftp connection to ' + id_ftpadd)
                # decrement the FILE counter (the original decremented the
                # connection counter here, leaving retries_left_files
                # untouched and looping forever on persistent errors)
                retries_left_files -= 1
                if retries_left_files:
                    print('ftplib error encountered, file retries left = '
                        + str(retries_left_files))
                    time.sleep(interval_file)
                    continue
                print('file retries exhausted, breaking..')
                dldict[gse].append([gse, id_ftpadd, str(eid)])
                break
    if validate:
        print('commencing file validation...')
        for gse, new_filepath, index in files_written:
            # gse id is the first dot-delimited token of the written name
            filestr = os.path.basename(new_filepath).split('.')[0]
            gsesoft_latest = getlatest_filepath(gsesoftpath, filestr)
            if gsesoft_latest and not gsesoft_latest == 0:
                if filecmp.cmp(gsesoft_latest, new_filepath):
                    # identical to existing version; discard the new copy
                    print('identical file found in dest_dir, removing...')
                    dldict[gse].append(False)
                    os.remove(new_filepath)
                else:
                    print('new file detected in temp_dir, moving to '
                        + 'dest_dir...')
                    dldict[gse].append(True)
                    dldict[gse][index][2] = os.path.join(
                        gsesoftpath, os.path.basename(new_filepath))
                    # destination is gsesoftpath (the original referenced an
                    # undefined name 'dest_dir' here, raising NameError)
                    shutil.move(
                        new_filepath,
                        os.path.join(gsesoftpath,
                            os.path.basename(new_filepath)))
            else:
                print('new file detected in temp_dir, moving to dest_dir..')
                dldict[gse].append(True)
                dldict[gse][index][2] = os.path.join(
                    gsesoftpath, os.path.basename(new_filepath))
                shutil.move(
                    new_filepath,
                    os.path.join(gsesoftpath,
                        os.path.basename(new_filepath)))
    shutil.rmtree(temp_dir_make)
    return dldict
Null, provides status updates over run. """
# Script entry point: bootstrap the project import path, initialize settings,
# and parse the optional --gseid argument before starting the server run.
if __name__ == "__main__":
    print("Starting server.py...")
    import subprocess, glob, sys, os, re
    # make the project source dir importable before project-local imports
    sys.path.insert(0, os.path.join("recountmethylation_server", "src"))
    import edirect_query, settings, argparse
    # settings.init() presumably sets up shared paths/db config — confirm
    settings.init()
    from edirect_query import gsm_query, gse_query, gsequery_filter
    from utilities import gettime_ntp, getlatest_filepath, querydict
    from utilities import get_queryfilt_dict
    from gse_celerytask import gse_task
    from random import shuffle
    gselist = [] # queue input, gse-based
    qstatlist = [] # job status object, also stored at sqlite db
    print("Getting timestamp...")
    run_timestamp = gettime_ntp() # pass this result to child functions
    # Parse the specified GSE ID.
    parser = argparse.ArgumentParser(description='Arguments for server.py')
    parser.add_argument("--gseid", type=str, required=False, default=None,
        help='Option to enter valid GSE ID for immediate download.')
    args = parser.parse_args()
def compile_rsheet(eqfiltd=None, sheetfn_ext='rsheet', msrapfn_ext='msrapout',
    msrapfn='msrapout', idatsfn_ext='idat', timestamp=None):
    """ compile_rsheet
    Knits poised file data together into a sheet to be read into R using
    minfi. Steps taken include:
    1. Grab msrap file list
    2. Grab idats file list
    3. Intersect files lists
    4. Subset eqfilt dict on gse
    5. Form and write new sheet files, one per gse
    Arguments
    * eqfiltd (dict) : Equery filter dictionary object; if None, it is read
        fresh via get_queryfilt_dict() at call time (previously the default
        was evaluated once at import time).
    * sheetfn_ext (str) : Filename extension for new sheet files.
    * msrapfn_ext (str) : Filename extension of valid MetaSRA-pipeline
        datafiles.
    * msrapfn (str) : File name stem for MetaSRA-pipeline files.
    * idatsfn_ext (str) : Filename extension of valid idat files.
    * timestamp (str) : NTP timestamp for file versioning; if None, a fresh
        timestamp is obtained at call time.
    Returns:
    * lsheet (list) : The written sheet rows, or 0 if no valid GSM IDs were
        found; produces sheet files as a side effect.
    """
    # resolve call-time defaults (avoids import-time I/O / stale timestamp)
    if eqfiltd is None:
        eqfiltd = get_queryfilt_dict()
    if timestamp is None:
        timestamp = gettime_ntp()
    # form the sheet path and make dir as needed
    sheetspath = settings.sheetspath
    os.makedirs(sheetspath, exist_ok=True)
    sheets_fpath = os.path.join(sheetspath,
        ".".join([timestamp, sheetfn_ext]))
    # form msrap and idat paths and get filenames
    msrap_path = settings.gsmmsrapoutpath
    rxmsrap = re.compile(".*" + msrapfn_ext + "$")
    msrap_fnlist = list(filter(rxmsrap.match, os.listdir(msrap_path)))
    print("msrap_fnlist : " + str(msrap_fnlist))
    # idats fn
    idats_path = settings.idatspath
    rxidat = re.compile(".*" + idatsfn_ext + "$")
    idats_fnlist = list(filter(rxidat.match, os.listdir(idats_path)))
    # extract gsm ids; idat names carry the id in token 0, msrap names in
    # token 1 — TODO confirm against the actual filename conventions
    rxgsm = re.compile(".*GSM[0-9]")
    idats_splitlist = [
        idatfn.split(".")[0] for idatfn in idats_fnlist
        if len(idatfn.split(".")) > 1
    ]
    idats_gsmlist_filt = list(set(filter(rxgsm.match, idats_splitlist)))
    msrap_splitlist = [
        msrapfn.split(".")[1] for msrapfn in msrap_fnlist
        if len(msrapfn.split(".")) > 1
    ]
    msrap_gsmlist_filt = list(set(filter(rxgsm.match, msrap_splitlist)))
    print("idats_gsmlist_filt : " + str(idats_gsmlist_filt))
    print("msrap_gsmlist_filt : " + str(msrap_gsmlist_filt))
    # only GSM ids with both metadata and idats are usable
    gsmvalid = [
        gsmid for gsmid in msrap_gsmlist_filt
        if gsmid in idats_gsmlist_filt
    ]
    if len(gsmvalid) > 0:
        rxgrn = re.compile(".*Grn.idat$")
        rxred = re.compile(".*Red.idat$")
        lsheet = [] # list object to write rsheet, one row per gsmid
        # append colnames
        lsheet.append(" ".join([
            "gsmid", "gseid", "idats_fn", "msrapmd_fn", "msrapmd_flatjson",
            "SENTRIX_ID", "ARRAY_ID", "Basename"
        ]))
        lsheet[0] = lsheet[0] + "\n"
        for gsmid in gsmvalid:
            # compile the file info for this gsm
            rxgsmi = re.compile(".*" + gsmid + ".*")
            gsmi_idats = list(filter(rxgsmi.match, idats_fnlist))
            gsmi_red_idats = list(filter(rxred.match, gsmi_idats))
            gsmi_grn_idats = list(filter(rxgrn.match, gsmi_idats))
            # skip GSMs missing either channel file (previously raised
            # IndexError on the [0] access below)
            if not (gsmi_red_idats and gsmi_grn_idats):
                continue
            # get the latest file versions
            gsmi_red_pattern = gsmi_red_idats[0].split(".")[2]
            gsmi_grn_pattern = gsmi_grn_idats[0].split(".")[2]
            gsmi_red_latest = getlatest_filepath(filepath=idats_path,
                filestr=gsmi_red_pattern, embeddedpattern=True)
            gsmi_grn_latest = getlatest_filepath(filepath=idats_path,
                filestr=gsmi_grn_pattern, embeddedpattern=True)
            # get the latest msrap file
            gsmi_msrap_latest = getlatest_filepath(filepath=msrap_path,
                filestr=gsmid, embeddedpattern=True)
            print(gsmi_msrap_latest)
            if (gsmi_red_latest and not gsmi_red_latest == 0
                    and gsmi_grn_latest and not gsmi_grn_latest == 0
                    and gsmi_msrap_latest and not gsmi_msrap_latest == 0):
                # form the rsheets with valid gsm ids
                with open(gsmi_msrap_latest, 'r') as msrapmd:
                    gsmi_metadata_dict = json.load(msrapmd)
                gsmi_md = gsmi_metadata_dict[0] # weird dictionary
                # flatten metadata to 'key:value' tokens joined by ';'
                grows = []
                for key in list(gsmi_md.keys()):
                    kval = gsmi_md[key]
                    if type(kval) is list:
                        grows.append(";".join(kval))
                    else:
                        grows.append(":".join([str(key),
                            str(gsmi_md[key])]))
                gsmi_mdvar = "'" + ";".join(grows) + "'"
                # grab the gse id for this gsm
                gseid = str([
                    gsek for gsek in list(eqfiltd.keys())
                    if gsmid in eqfiltd[gsek]
                ][0])
                # make the gsm arrays path Basename for minfi
                gsmi_bn = "_".join(gsmi_red_latest.split("_")[0:3])
                # one entry per gsm
                lgsmi = " ".join([
                    gsmid, # gsm id
                    gseid, # gse id
                    ";".join([
                        os.path.basename(gsmi_red_latest),
                        os.path.basename(gsmi_grn_latest)
                    ]), # idat filenames
                    os.path.basename(gsmi_msrap_latest), # metadata filename
                    gsmi_mdvar, # flattened json file
                    os.path.basename(gsmi_red_latest).split(
                        "_")[-2], # sentrix id
                    os.path.basename(gsmi_red_latest).split(
                        "_")[-3], # array id
                    gsmi_bn # minfi path Basename, for arrays
                ])
                lgsmi = lgsmi + "\n"
                lsheet.append(lgsmi)
    else:
        print(
            "No valid GSM IDs detected. Check idats and MetaSRA-pipeline GSM "
            + "files directories.")
        return 0
    # write the final sheet files
    with open(sheets_fpath, 'w') as fsheet:
        for item in lsheet:
            fsheet.write(item)
    return lsheet
def rmdb_fpaths_old(rmhlinks=False):
    """ rmdb_fpaths_old
    Get filepaths for existant sample idats and msrap outfiles, resolving
    hard links ('hlink' files) for the latest idat pair of each GSM.
    Arguments:
    * rmhlinks (bool) : Whether to remove old hardlinks and form new ones,
        regardless of whether current hlinks exist.
    Returns:
    * gsm_fpaths_dd (dict) : Dictionary mapping GSM id to a list of
        validated filepaths (or None/False placeholders where files are
        missing).
    """
    timestamp = gettime_ntp()
    # connect to RMDB mongodb
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dbcon = client.recount_methylation; idatscon = dbcon.gsm.idats
    softcon = dbcon.gse.soft; idatslist = list(idatscon.find())
    # grab unique gsm ids from records that actually carry a 'gsmid' key
    idatslist = [record for record in idatslist if 'gsmid' in record.keys()]
    gsmindex = list(set([record['gsmid'] for record in idatslist]))
    print("from idats db, found n = "+str(len(gsmindex))+" gsm ids")
    # fname catch patterns for re (read but not used below — legacy?)
    grnidatcatch = settings.grnidat_expcatch
    redidatcatch = settings.redidat_expcatch
    msrapoutcatch = settings.msrapoutfnpattern
    # filter all records for gsm on most recent update datetime
    gsm_fpaths_dd = {}
    # list all previously expanded idat files directy from idats dir
    allidatslist = os.listdir(settings.idatspath)
    allidatslist = list(filter(re.compile('.*\.idat$').match, allidatslist))
    print("found n = "+str((len(allidatslist)))+" expanded idat filenames...")
    # grab and filter idats and msrap outfiles lists
    if rmhlinks:
        print("Beginning sample iterations with hlink removal.")
    else:
        print("Beginning sample iterations without hlink removal.")
    for gi, gsmid in enumerate(gsmindex, 1):
        print("Getting fpaths for gsm: "+str(gsmid)+", num: "+str(gi),
            end="\r")
        gsm_fpaths_dd[gsmid] = []
        # all idat records for the GSM id
        recordsgsm = [record for record in idatslist
            if record['gsmid']==gsmid]
        # filter records by channel type,
        # note most records are for compressed files
        idatsrec_gsmgrn = [record for record in recordsgsm
            if isinstance(record['date'],datetime.datetime)
            and re.search('.*Grn\.idat.*',
                os.path.basename(record['filepath']))
        ]
        idatsrec_gsmred = [record for record in recordsgsm
            if isinstance(record['date'],datetime.datetime)
            and re.search('.*Red\.idat.*',
                os.path.basename(record['filepath']))
        ]
        if idatsrec_gsmgrn and idatsrec_gsmred:
            # get latest records for each channel (sorted by 'date')
            irec_filtgrn = sorted(idatsrec_gsmgrn,
                key=lambda k: k['date'])[-1]
            irec_filtred = sorted(idatsrec_gsmred,
                key=lambda k: k['date'])[-1]
            # valid record file basenames
            igrnrec_bn = os.path.basename(irec_filtgrn['filepath'])
            iredrec_bn = os.path.basename(irec_filtred['filepath'])
            # check for expanded versions of compressed files
            # ([:-3] presumably strips a '.gz' suffix — TODO confirm)
            igrn_fn = [fn for fn in allidatslist if igrnrec_bn[:-3] in fn]
            ired_fn = [fn for fn in allidatslist if iredrec_bn[:-3] in fn]
            if igrn_fn and ired_fn:
                igrn_fn = igrn_fn[0]
                ired_fn = ired_fn[0]
                hllist = []
                if rmhlinks:
                    # remove old hard links to sample idats
                    grnhl_torm = [fn for fn in allidatslist
                        if "hlink" in fn
                        and '.'.join(igrn_fn.split('.')[2:]) in fn]
                    redhl_torm = [fn for fn in allidatslist
                        if "hlink" in fn
                        and '.'.join(ired_fn.split('.')[2:]) in fn]
                    if grnhl_torm:
                        for hlfn in grnhl_torm:
                            os.remove(os.path.join(settings.idatspath,
                                hlfn))
                    if redhl_torm:
                        for hlfn in redhl_torm:
                            os.remove(os.path.join(settings.idatspath,
                                hlfn))
                    # new hlinks
                    hllist = new_idat_hlinks(gsmid, ts=timestamp,
                        igrn_fn=igrn_fn, ired_fn=ired_fn)
                else:
                    # check if hlinks exist, create new ones otherwise
                    grnhllist = [fn for fn in allidatslist
                        if "hlink" in fn
                        and '.'.join(igrn_fn.split('.')[2:]) in fn]
                    redhllist = [fn for fn in allidatslist
                        if "hlink" in fn
                        and '.'.join(ired_fn.split('.')[2:]) in fn]
                    # get matching grn and red hlink fn's if they exist
                    status_hlink = None
                    grnfnpass = None
                    redfnpass = None
                    if grnhllist and redhllist:
                        grnhllistfilt = list(set(grnhllist))
                        redhllistfilt = []
                        # pair red hlinks to grn hlinks by shared prefix;
                        # [:-9] appears to strip the 9-char channel suffix
                        # ('_Grn.idat'/'_Red.idat') — TODO confirm.
                        # NOTE(review): non-matches append "" so list sizes
                        # grow with every comparison; preserved as-is.
                        for ghl in grnhllistfilt:
                            for rhl in redhllist:
                                # check that base array ids identical
                                if ghl[:-9]==rhl[:-9]:
                                    redhllistfilt.append(rhl)
                                else:
                                    redhllistfilt.append("")
                        rhlfiltsub = [rhl[:-9] for rhl in redhllistfilt]
                        grnhllistfilt = [ghl for ghl in grnhllistfilt
                            if ghl[:-9] in rhlfiltsub]
                        redhllistfilt = [rhl for rhl in redhllistfilt
                            if not rhl==""]
                        if grnhllistfilt and redhllistfilt:
                            grnfnpass = grnhllistfilt[0]
                            redfnpass = redhllistfilt[0]
                            # pass hlinks to return dictionary
                            hllist.append(os.path.join(settings.idatspath,
                                grnfnpass))
                            hllist.append(os.path.join(settings.idatspath,
                                redfnpass))
                        else:
                            # make new hlinks
                            hllist = new_idat_hlinks(gsmid, ts=timestamp,
                                igrn_fn=igrn_fn, ired_fn=ired_fn)
                    else:
                        # make new hlinks
                        hllist = new_idat_hlinks(gsmid, ts=timestamp,
                            igrn_fn=igrn_fn, ired_fn=ired_fn)
                # finally, pass listed hlinks to return dictionary
                gsm_fpaths_dd[gsmid].append(hllist[0])
                gsm_fpaths_dd[gsmid].append(hllist[1])
            else:
                # no expanded idat pair found for this GSM
                gsm_fpaths_dd[gsmid].append(None)
                gsm_fpaths_dd[gsmid].append(None)
        else:
            # missing one or both channel records in the db
            gsm_fpaths_dd[gsmid].append(False)
        # check for valid MetaSRA-pipeline filepaths
        # NOTE(review): bare except and the no-append path when msraplatest
        # is falsy mean per-GSM list lengths can differ; preserved as-is.
        try:
            msraplatest = getlatest_filepath(
                filepath=settings.gsmmsrapoutpath, filestr=gsmid,
                embeddedpattern=True, tslocindex=0, returntype='returnlist')
            if msraplatest and len(msraplatest)==1:
                gsm_fpaths_dd[gsmid].append(msraplatest[0])
        except:
            gsm_fpaths_dd[gsmid].append(False)
        print("Finished with sample num "+str(gi), end="\r")
    print("Finished sample iterations. Returning...")
    # return gsmid dictionary with lists of filtered results or valid fpaths
    return gsm_fpaths_dd