def gse_task(gse_id, gsefiltdict=None, timestamp=None):
    """ gse_task

        GSE based task for celery job queue.

        Arguments
            * gse_id : A single valid GSE id (str).
            * gsefiltdict : GSE filtered query object, as dictionary read
                using querydict() (dict). If None, it is loaded lazily with
                get_queryfilt_dict().
            * timestamp : NTP timestamp for versioning file downloads (str).
                If None or falsy, a fresh NTP timestamp is retrieved.

        Returns
            * rl, a list of download dictionaries and rmdb update statuses.

    """
    # fix: both defaults were previously evaluated once at definition time
    # (def-time calls of get_queryfilt_dict()/gettime_ntp()); resolve lazily
    # so each task run gets fresh values.
    if gsefiltdict is None:
        gsefiltdict = get_queryfilt_dict()
    run_timestamp = timestamp if timestamp else gettime_ntp()
    print('Beginning GSE task, ID: ' + gse_id)
    rl = [gse_id]
    if not gsefiltdict:
        print("Error: no GSE query filt file provided. Returning...")
        rl.append(None)
        return rl
    print('File gsefiltdict provided, continuing...')
    gsmlist = gsefiltdict[gse_id]
    print('Detected N = ' + str(len(gsmlist)) + ' GSM IDs...')
    if len(gsmlist) > 0:
        rl.append(True)
        print("Beginning soft file download...")
        ddsoft = dl_soft(gse_list=[gse_id], timestamp=run_timestamp)
        rl.append(True)
        print('Beginning idat download...')
        ddidat = dl_idat(input_list=gsmlist, timestamp=run_timestamp)
        rl.append(True)
        print('updating rmdb...')
        # update the methylation db records; return value was unused
        update_rmdb(ddidat=ddidat, ddsoft=ddsoft)
        rl.append(True)
    else:
        print('No valid GSM IDs detected for study GSE ID ', gse_id,
              ', skipping...')
        rl.append(None)
    print('Task completed! Returning...')
    return rl
# Exemplo n.º 2 (scrape artifact; commented out to keep the file importable)
def firsttime_run(filedir='recount-methylation-files', run_timestamp=None):
    """ firsttime_run

        On first setup, run new equeries and query filter.

        Arguments:
        * filedir (str): Dir name for db files (currently unused in the
            body; retained for backward compatibility).
        * run_timestamp (str) : NTP timestamp (currently unused; retained
            for backward compatibility -- the old default called
            gettime_ntp() once at definition time).

        Returns:
        * gseidlist (list): List of valid GSE IDs, or None on error.
    """
    print("Beginning first time server run...")
    equery_dest = settings.equerypath
    gse_query()
    gsm_query()
    # NOTE(review): these lookups' results were unused in the original;
    # calls retained in case getlatest_filepath has side effects --
    # TODO confirm and remove.
    getlatest_filepath(equery_dest, 'gse_edirectquery')
    getlatest_filepath(equery_dest, 'gsm_edirectquery')
    gsequery_filter()
    gsefiltpath = getlatest_filepath(equery_dest, 'gsequery_filt')
    if not gsefiltpath:
        print("Error retrieving gse query filtered file. Returning...")
        return None
    gsefiltd = querydict(querypath=gsefiltpath, splitdelim=' ')
    gseidlist = list(gsefiltd.keys())
    print("GSE id list of len " + str(len(gseidlist)) +
          " found. Returning...")
    return gseidlist
# Exemplo n.º 3 (scrape artifact; commented out to keep the file importable)
def scheduled_run(eqfilt_path=False, run_timestamp=None):
    """ scheduled_run

        Tasks performed on regular schedule, after first setup. For the job
        queue, a list of GSE IDs is returned. The id list is filtered on
        existing GSE soft files to prioritize unrepresented experiments for
        download.

        Arguments:
        * eqfilt_path (str) : Filepath to edirect query filter file
            (currently unused in the body; retained for compatibility).
        * run_timestamp (str) : NTP timestamp (currently unused; retained
            for compatibility -- the old default called gettime_ntp() once
            at definition time).

        Returns:
        * gseid_filt (list) : list of valid GSE IDs, or None if error occurs.
    """
    # fix: eqpath was referenced below but never defined (NameError on the
    # fallback path); resolve it from settings up front.
    eqpath = settings.equerypath
    try:
        gsefiltd = get_queryfilt_dict()
    except Exception:  # narrowed from a bare except
        print("No gse query filt file found, checking for GSE and GSM " +
              "queries...")
        gsequery_latest = getlatest_filepath(filepath=eqpath,
                                             filestr='gse_edirectquery')
        if not gsequery_latest:
            gse_query()
        gsmquery_latest = getlatest_filepath(eqpath, 'gsm_edirectquery')
        if not gsmquery_latest:
            gsm_query()
        print("Running filter on GSE query...")
        gsequery_filter()
        gsefiltd = get_queryfilt_dict()
    # get list of GSE IDs from existing SOFT files
    gsesoftfiles = os.listdir(settings.gsesoftpath)
    print("GSE SOFT files: " + str(gsesoftfiles))
    rxgse = re.compile('GSE[0-9]*')
    gseid_softexists = [
        str(rxgse.findall(softfn)[0]) for softfn in gsesoftfiles
        if rxgse.findall(softfn)
    ]
    if not gsefiltd:
        print("Error forming equery filt dictionary. Returning...")
        return None
    gseid_listall = list(gsefiltd.keys())
    print("GSE ID list of len " + str(len(gseid_listall)) +
          " found. Filtering..")
    if gseid_softexists:
        # set for O(1) membership tests while filtering
        softexists_set = set(gseid_softexists)
        gseid_filt = [
            gseid for gseid in gseid_listall if gseid not in softexists_set
        ]
    else:
        gseid_filt = gseid_listall
    print("After filtering existing SOFT files, N = " +
          str(len(gseid_filt)) + " GSE IDs remain. Returning ID list...")
    # if all GSE IDs represented, return all GSE IDs for brand new run
    if len(gseid_filt) == len(gseid_listall):
        gseid_filt = gseid_listall
    return gseid_filt
def gsequery_filter(splitdelim='\t', timestamp=None):
    """ gsequery_filter

        Prepare an edirect query file. Filter a GSE query file on its GSM
            membership.

        Arguments:
            * splitdelim (str) : Delimiter to split ids in querydict() call.
            * timestamp (str) : NTP timestamp; if None, a fresh one is
                retrieved (the old default called gettime_ntp() once at
                definition time).

        Returns:
            * gsefiltl (list): Filtered GSE query object (list), writes
                filtered query file as side effect. Returns None when a
                query file cannot be located.
    """
    if timestamp is None:
        timestamp = gettime_ntp()
    eqpath = settings.equerypath
    gsequerystr = settings.gsequerystr
    gsmquerystr = settings.gsmquerystr
    # get GSM list from gsm query file
    gsmqueryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsmquerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if not gsmqueryf_latestpath:
        print("Error detecting latest gsmquery file! Returning...")
        return None
    print("Latest gsmquery file detected: " + str(gsmqueryf_latestpath))
    # fix: close the file handle (was a bare open() in a comprehension)
    with open(gsmqueryf_latestpath[0]) as gsmfile:
        gsmlines = [line.rstrip('\n') for line in gsmfile]
    # GSM accession is the second tab-delimited token on each line
    gsmlist = [line.split('\t')[1::][0] for line in gsmlines]
    gsmset = set(gsmlist)  # O(1) membership tests for the filter below
    # get GSE dictionary object
    gsequeryf_latestpath = getlatest_filepath(filepath=eqpath,
                                              filestr=gsequerystr,
                                              embeddedpattern=True,
                                              tslocindex=1,
                                              returntype='returnlist')
    if not gsequeryf_latestpath:
        print("Error detecting latest gsequery file! Returning...")
        return None
    print("Latest gsequery file detected: " + str(gsequeryf_latestpath))
    gsed_obj = querydict(querypath=gsequeryf_latestpath[0], splitdelim='\t')
    gsefiltl = []
    for gsekey in list(gsed_obj.keys()):
        # keep only samples that appear in the GSM query
        samplelist_filt = [
            sample for sample in gsed_obj[gsekey] if sample in gsmset
        ]
        if samplelist_filt:
            gsefiltl.append(' '.join([gsekey, ' '.join(samplelist_filt)]))
    print('writing filt file...')
    if eqpath:
        filtfn = ".".join(["gsequery_filt", timestamp])
        with open(os.path.join(eqpath, filtfn), 'w') as filtfile:
            for item in gsefiltl:
                filtfile.write("%s\n" % item)
    return gsefiltl
def eqd_gsm_exclude(equery_dest=settings.equerypath, filesdir=settings.filesdir,
    gsmv_fname="gsmv.txt", exclude_dpath=os.path.join("inst", "freeze_gsmv")):
    """ eqd_gsm_exclude

        Exclude GSM IDs from edirect query objects.

        Arguments:
        * equery_dest : Path to equery files (currently unused in the body;
            retained for backward compatibility).
        * filesdir : Files dir name (currently unused; retained for
            backward compatibility).
        * gsmv_fname: Name of the file to load. Should include only
            space-separated sample/GSM IDs in a single line.
        * exclude_dpath: Path to directory containing the file gsmv_fname.

        Returns:
        * Path to the new filtered file at settings.equerypath, or None if
            the exclude file is missing.

    """
    gsmv_fpath = os.path.join(exclude_dpath, gsmv_fname)
    if not os.path.exists(gsmv_fpath):
        # fix: previously only printed and then crashed on open()
        print("Couldn't find sample ID file")
        return None
    # single line of space-separated GSM IDs to exclude
    with open(gsmv_fpath) as excludefile:
        gsmv_exclude = [line.rstrip('\n').split(" ")
                        for line in excludefile][0]
    gsmv_exclude = set(gsmv_exclude)  # O(1) membership tests below
    eqpath = settings.equerypath
    gsefilt_latest = getlatest_filepath(eqpath, 'gsequery_filt',
            embeddedpattern=True, tslocindex=1, returntype='returnlist'
        )[0]
    print("Starting with latest detected filter file: " + gsefilt_latest)
    with open(gsefilt_latest) as filtfile:
        querylines = [line.rstrip('\n') for line in filtfile]
    qlnew = []
    print("Applying filter...")
    for line in querylines:
        # drop excluded GSM IDs; first token is the GSE ID
        ldat = [gid for gid in line.split(" ") if gid not in gsmv_exclude]
        # keep studies that retain at least one GSM after the GSE ID
        if len(ldat) > 1:
            qlnew.append(ldat)
    print("After filter, retained " + str(len(qlnew)) + " studies.")
    nts = gettime_ntp()
    newfpath = os.path.join(eqpath, ".".join(["gsequery_filt", nts]))
    print("Writing new filter file: ", newfpath)
    with open(newfpath, "w") as wf:
        for line in qlnew:
            wf.write(" ".join(line) + "\n")
    # fix: return moved out of the with block
    return newfpath
def gsm_query(validate=True, timestamp=None):
    """ gsm_query

        Get GSM level query object, from edirect query.

        Arguments:
            * validate (bool) : Whether to validate the downloaded file
                against the most recent stored query file.
            * timestamp (str) : NTP timestamp for file versioning; if None,
                a fresh one is retrieved (the old default called
                gettime_ntp() once at definition time).

        Returns:
            * dldict (dict) : Download object containing the filename, raw
                subprocess output, and (when validate) True/False for
                whether the new file was kept.
    """
    if timestamp is None:
        timestamp = gettime_ntp()
    eqdestpath = settings.equerypath
    temppath = settings.temppath
    os.makedirs(eqdestpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_make = tempfile.mkdtemp(dir=temppath)
    atexit.register(shutil.rmtree, temp_make)
    dldict = {'gsmquery': []}
    dlfilename = ".".join(['gsm_edirectquery', timestamp])
    dldict['gsmquery'].append(dlfilename)
    # NOTE(review): shell=True with interpolated settings.platformid is
    # only safe if platformid is trusted configuration -- TODO confirm.
    subp_strlist1 = [
        "esearch", "-db", "gds", "-query",
        "'" + settings.platformid + "[ACCN] AND idat[suppFile] AND gsm[ETYP]'"
    ]
    subp_strlist2 = ["efetch", "-format", "docsum"]
    subp_strlist3 = [
        "xtract", "-pattern", "DocumentSummary", "-element", "Id Accession",
        ">",
        os.path.join(temp_make, dlfilename)
    ]
    args = " | ".join([
        " ".join(subp_strlist1), " ".join(subp_strlist2),
        " ".join(subp_strlist3)
    ])
    output = subprocess.check_output(args, shell=True)
    dldict['gsmquery'].append(output)
    if validate:
        gsmquery_filewritten = os.path.join(temp_make, dlfilename)
        gsmquery_old = glob.glob('.'.join([
            os.path.join(eqdestpath, 'gsm_edirectquery'),
            '*',
        ]))
        if gsmquery_old:
            # newest stored query by embedded integer timestamp; replaces
            # the previous sort-then-index logic
            gsmquery_old_mostrecent = max(
                gsmquery_old, key=lambda x: int(x.split('.')[1]))
            # filecmp should work (equery file order preserved on reps)
            if filecmp.cmp(gsmquery_old_mostrecent, gsmquery_filewritten):
                print("Downloaded gsm query file same as most recent stored." +
                      " Removing...")
                os.remove(gsmquery_filewritten)
                dldict['gsmquery'].append(False)
            else:
                print("Downloaded file is new, moving to dest...")
                shutil.move(
                    gsmquery_filewritten,
                    os.path.join(eqdestpath,
                                 os.path.basename(gsmquery_filewritten)))
                dldict['gsmquery'].append(True)
        else:
            print("Downloaded file is new, moving...")
            shutil.move(
                gsmquery_filewritten,
                os.path.join(eqdestpath,
                             os.path.basename(gsmquery_filewritten)))
            dldict['gsmquery'].append(True)
    return dldict
def write_cjson(jffnv,
                ts=gettime_ntp(),
                newfilefn="cjson",
                tempdname="cjsontemp",
                jsonfiltpath=settings.gsmjsonfiltpath,
                msrap_destpath=settings.gsmmsrapoutpath):
    """ write_cjson

        Write a composite JSON file combining multiple samples' filtered
        JSON files into a single JSON array.

        Arguments: 
        * jffnv : Vector of filtered JSON filenames (list).
        * ts : Timestamp appended to the output file name (str). NOTE: the
            default is evaluated once at definition time; pass explicitly
            for a fresh timestamp.
        * newfilefn : File name stem of new file to write (str).
        * tempdname : Name of dir, at msrap_destpath, to contain composite 
            JSON files (str).
        * jsonfiltpath : Path to filtered GSM JSON files (str).
        * msrap_destpath : Path to MetaSRA-pipeline output files (str).

        Returns:
        * Path to new composite JSON file. The file itself is only written
            when at least one input file had content.

    """
    temppath_read = os.path.join(jsonfiltpath)
    if not os.path.exists(temppath_read):
        os.makedirs(temppath_read)
    temppath_write = os.path.join(msrap_destpath, tempdname)
    if not os.path.exists(temppath_write):
        os.makedirs(temppath_write)
    # ll accumulates each readable file's lines; fnl the matching paths
    ll = []
    fnl = []
    for fn in jffnv:
        fpath = os.path.join(jsonfiltpath, fn)
        if os.path.exists(fpath):
            with open(fpath, "r") as openjson:
                linesi = openjson.readlines()
                if len(linesi) > 0:
                    ll.append(linesi)
                    fnl.append(fpath)
    newfn = ".".join([newfilefn, ts])
    wite_fpath = os.path.join(temppath_write, newfn)
    if len(ll) > 0:
        print("Read data for " + str(len(ll)) + " files. Writing data")
        lform = []
        with open(wite_fpath, "w") as opencj:
            opencj.write("[\n")  # first line
            for fi, file in enumerate(ll):
                ld = []
                for line in file:
                    # a bare closing brace ends one sample record: flush the
                    # accumulated (cleaned) lines as one JSON object
                    if line == "}\n":
                        opencj.write("\t{\n")
                        ld = ld[1::]
                        # GSM ID is the second dot-delimited filename token
                        jfname = os.path.basename(fnl[fi])
                        gsmid = '"' + jfname.split(".")[1] + '"'
                        lpath = ":".join(['"gsm"', gsmid])
                        opencj.write("\t\t" + lpath + ",\n")  # sample id
                        for ii, ldi in enumerate(ld):
                            lf = ldi.split(":")
                            # map known SOFT keys to friendly JSON keys;
                            # otherwise drop the key's first token
                            if lf[0] == '  !Sample_source_name_ch1':
                                lf = ['source'] + lf[1::]
                            elif lf[0] == '  !Sample_title':
                                lf = ['title'] + lf[1::]
                            else:
                                lf = lf[1::]
                            lf = ['"' + i + '"' for i in lf]
                            lf = ':'.join(lf[0:2])
                            if ii == len(ld) - 1:
                                lf = lf + "\n"  # comma for values before last
                            else:
                                lf = lf + ",\n"
                            opencj.write("\t\t" + lf)
                        if fi == len(ll) - 1:
                            opencj.write("\t}\n")  # no comma for final entry
                        else:
                            opencj.write("\t},\n")
                    else:
                        # accumulate a cleaned metadata line (strip brackets,
                        # quotes, commas, and the trailing newline)
                        ldi = line
                        ldi = ldi.replace(']', '')
                        ldi = ldi.replace('[', '')
                        ldi = ldi.replace('"', '')
                        ldi = ldi.replace('\n', '')
                        ldi = ldi.replace(',', '')
                        if not ldi == "":
                            ld.append(ldi)
            opencj.write("]")  # last line
    return wite_fpath
def run_msrap_compjson(json_flist=None,
                       njint=500,
                       jsonpatt=".*json.filt$",
                       gsm_jsonpath=settings.gsmjsonfiltpath,
                       tempdname="cjsontemp",
                       msrap_destpath=settings.gsmmsrapoutpath,
                       newfnpattern="msrap.cjson"):
    """ run_msrap_compjson

        Run MetaSRA-pipeline on composite JSON files

        Runs the MetaSRA-pipeline on composite JSON files containing njint 
        samples' JSON-formatted metadata. The composite JSON files and the 
        composite metadata outputs are both written to tempfname at 
        msrap_destpath. After mapping, get_gsm_outputs() is called to make the
        GSM-specific files, which are output to the top level of msrap_destpath.

        Arguments:
        * json_flist : List of JSON filename(s) to process. If not provided, 
            automatically targets all JSON files at gsm_jsonpath (list, 
            optional). Default changed from a mutable [] to None.
        * njint : Number of JSON files per composite file to process (int).
        * jsonpatt : File name pattern for valid filtered JSON files (str).
        * gsm_jsonpath : Path to the filtered GSM JSON files directory (str).
        * tempdname : Dir, located at msrap_destpath, where composite JSON files 
            and outputs are to be written (str).
        * msrap_destpath : Path where mapped metadata output files will be 
            written (str).     
        * newfnpattern : File name pattern for mapped metadata output (str).

        Returns:
        * None, produces the composite file pairs and GSM metadata files.

    """
    # NOTE(review): eqfiltdict/validgsmlist were computed but unused in the
    # original; the query call is retained in case it has side effects --
    # TODO confirm and remove.
    get_queryfilt_dict()
    msrap_runpath = settings.msraprunscriptpath
    msrap_oldgsm = []
    if os.path.exists(msrap_destpath):
        # fix: exclude the configured tempdname rather than the hard-coded
        # "cjsontemp" literal
        mld = [i for i in os.listdir(msrap_destpath) if not i == tempdname]
        if len(mld) > 0:
            # GSM ID is the second dot-delimited filename token
            msrap_oldgsm = [
                fn.split(".")[1] for fn in mld if len(fn.split(".")) > 1
            ]
    if not json_flist:
        if os.path.exists(gsm_jsonpath):
            json_flist = os.listdir(gsm_jsonpath)
        else:
            print("Couldn't find JSON file dir at " + gsm_jsonpath)
            json_flist = []
    print("Filtering GSM JSON filenames on pattern, existing msrap files...")
    gsm_json_fn_list = list(filter(re.compile(jsonpatt).match, json_flist))
    gsm_json_fn_list = [
        fn for fn in gsm_json_fn_list if not fn.split(".")[1] in msrap_oldgsm
    ]
    cjsonpath = os.path.join(msrap_destpath, tempdname)
    os.makedirs(cjsonpath, exist_ok=True)
    process_list = []
    print("Running pipeline for composite JSON files...")
    for r in range(0, len(gsm_json_fn_list), njint):
        ts = gettime_ntp()  # use new ts for each new composite file pair
        jsonflist = gsm_json_fn_list[r:r + njint]
        cjreadpath = write_cjson(jffnv=jsonflist,
                                 jsonfiltpath=gsm_jsonpath,
                                 msrap_destpath=msrap_destpath,
                                 ts=ts,
                                 tempdname=tempdname)
        newfn = ".".join([newfnpattern, ts])
        cjwritepath = os.path.join(cjsonpath, newfn)
        # pipeline is a python2 script; run without a shell
        cmdlist = [
            'python2', msrap_runpath, "--fnvread", cjreadpath, "--fnvwrite",
            cjwritepath
        ]
        process_list.append(subprocess.call(cmdlist, shell=False))
        print("Finished index " + str(r))
    print("Extracting GSM data from composite JSON results...")
    get_gsm_outputs()
    return None
# Exemplo n.º 9 (scrape artifact; commented out to keep the file importable)
def rmdb_fpaths():
    """ rmdb_fpaths

        Create new hard links for expanded sample IDAT files lacking them.

        Scans settings.idatspath for expanded IDAT files without 'hlink'
        entries, pairs the Grn/Red channel files per GSM ID, and calls
        new_idat_hlinks() for each complete pair.

        Returns:
        * hlinklist, list of new hlink files created at settings.idatspath.

    """
    timestamp = gettime_ntp()
    # NOTE(review): MongoDB lookup code was commented out upstream; all
    # filepaths come directly from the idats dir listing.
    instpath_allidats = os.listdir(settings.idatspath)
    # compressed idats (fix: '.gz' dot escaped -- old pattern '.idat.gz$'
    # would also match names like 'idatXgz')
    instpath_compidats = list(filter(re.compile(r'.*\.idat\.gz$').match,
        instpath_allidats))
    # expanded idats
    instpath_expidat = list(filter(re.compile(r'.*\.idat$').match,
        instpath_allidats))
    # idat hlinks
    instpath_hlink = list(filter(re.compile('.*hlink.*').match,
        instpath_expidat))
    # expanded idats without hlinks (set for O(1) membership tests)
    hlink_set = set(instpath_hlink)
    instpath_nohlink = [i for i in instpath_expidat if i not in hlink_set]
    print("Detected " + str(len(instpath_compidats)) +
        " compressed IDATs, " + str(len(instpath_expidat)) +
        " expanded IDATs, and " + str(len(instpath_nohlink)) +
        " expanded IDATs without hlinks.")
    print("Getting GSM IDs for IDATs without hlinks...")
    # GSM ID is the first dot-delimited filename token; the original
    # computed this list twice under two names
    gsmlist = list(set([i.split(".")[0] for i in instpath_nohlink]))
    hlinklist = []
    for gsmid in gsmlist:
        print("Processing GSM ID " + gsmid + "...")
        gsm_idats = [i for i in instpath_nohlink
            if i.split(".")[0] == gsmid and
            os.path.exists(os.path.join(settings.idatspath, i))]
        try:
            igrn_fn = list(filter(re.compile(r".*Grn\.idat$").match,
                gsm_idats))[0]
            ired_fn = list(filter(re.compile(r".*Red\.idat$").match,
                gsm_idats))[0]
            # channel basenames must agree (shared array-file stem)
            basename_grn = "_".join(igrn_fn.split(".")[2].split("_")[0:-1])
            basename_red = "_".join(ired_fn.split(".")[2].split("_")[0:-1])
            if basename_grn == basename_red and not basename_grn == "":
                print("Making new IDAT hlinks for GSM ID " + gsmid)
                rlist = new_idat_hlinks(gsmid=gsmid, ts=timestamp,
                    igrn_fn=igrn_fn, ired_fn=ired_fn)
                hlinklist.append(rlist)
        except IndexError:  # narrowed from bare except: channel file missing
            print("Couldn't find Red and Grn IDATs for GSM ID " + gsmid)
        print("Finished with GSM ID " + gsmid)
    print("Made " + str(len(hlinklist)) + " new IDAT hlinks. Returning...")
    return hlinklist
# Exemplo n.º 10 (scrape artifact; commented out to keep the file importable)
    # NOTE(review): scrape fragment -- the enclosing for-loop (over
    # gsesoftl, with index i) is missing from this chunk; gseid,
    # gseid_processnew, and gsefn_processnew are defined upstream.
    # TODO: restore the missing loop header or remove this fragment.
    if gseid in gseid_processnew:
        gsefn_processnew.append(gsesoftl[i])

len(gsefn_processnew)




#---------------------------------
#  process a single gse soft file
#---------------------------------
# parameters for parsing one GSE SOFT file into GSM-level SOFT files
softopenindex='.*!Sample_title.*'
softcloseindex='.*!Sample_data_row_count.*'
timestamp=gettime_ntp()
gse_softpath = settings.gsesoftpath
gsm_softpath = settings.gsmsoftpath
gsmsoft_destpath = settings.gsmsoftpath
validate=True

# select a single new GSE SOFT file to process
which_gsefn = 0
gsefn = gsefn_processnew[which_gsefn]

gsesoft_flist=[gsefn]

# valid GSM IDs come from the equery filter dictionary
eqfiltdict=get_queryfilt_dict()
validgsmlist = list(set([gsmid for gselist in list(eqfiltdict.values()) 
    for gsmid in gselist
]))
print("length validgsmlist : "+str(len(validgsmlist)))
# Exemplo n.º 11 (scrape artifact; commented out to keep the file importable)
def compile_rsheet(gsmfpathdict):
    """ compile_rsheet

        Takes dictionary of GSM IDs. Compiles valid GSM IDs, filenames, and 
        path into an rsheet object.

        Arguments:
            * gsmfpathdict: gsm paths dict obj output from rmdb_fpaths() 
                (dict). Keys are GSM IDs; values are lists of filepaths.

        Returns:
            * lsheet (list of row strings), produces rsheet file as a side
                effect. Returns None when no valid GSM IDs are detected.

    """
    timestamp = gettime_ntp()
    print("Getting equery filter...")
    eqd = get_queryfilt_dict()
    # flat, deduplicated list of all GSM IDs in the equery filter
    gsmvalidlist = list(set([gsmid for gselist in list(eqd.values()) 
        for gsmid in gselist
    ]))
    sheetspath = settings.sheetspath; sheetfn_ext = settings.sheetfnstem
    os.makedirs(sheetspath, exist_ok = True)
    sheets_fpath = os.path.join(sheetspath, ".".join([timestamp, sheetfn_ext]))
    # table written as list of row strings
    print("Forming table list for rsheet..."); lsheet = []
    # header row, space-delimited
    lsheet.append(" ".join(["gsmid",
        "gseid",
        "idats_fn",
        "msrapmd_fn",
        "msrapmd_flatjson",
        "SENTRIX_ID",
        "ARRAY_ID",
        "Basename"]))
    lsheet[0] = lsheet[0]+"\n"; print("Forming filtered GSM dictionary...")
    # keep only GSM entries present in the equery filter
    gsmvalid_fpathlist = {key:value for (key,value) in gsmfpathdict.items() 
        if key in gsmvalidlist}
    if gsmvalid_fpathlist:
        print("Starting iterations on gsm filepaths list of len = "
            +str(len(list(gsmvalid_fpathlist.keys()))))
        for gsmindex, gsmid in enumerate(gsmvalid_fpathlist, 1):
            print("Beginning GSM num "+str(gsmindex)+", id: "+str(gsmid))
            # drop None/False placeholder paths
            gsmvalid_fp = [fp for fp in gsmvalid_fpathlist[gsmid] 
                if not fp==None
                and not fp==False]
            if gsmvalid_fp:
                print("Getting GSE ID...")
                # all GSE IDs containing this GSM, ';'-joined
                gseid = ';'.join(list(set([gsek for gsek in list(eqd.keys()) 
                            if gsmid in eqd[gsek]
                            ]
                        )      
                    )
                )
                print("GSE id found: "+str(gseid))
                gsm_fpaths = gsmvalid_fp
                # separate the channel idats and MetaSRA-pipeline output
                gsmi_redidatpath = [fp for fp in gsm_fpaths if "_Red.idat" in fp]
                gsmi_grnidatpath = [fp for fp in gsm_fpaths if "_Grn.idat" in fp]
                gsmi_msrappath = [fp for fp in gsm_fpaths if "soft.msrapout" in fp]
                if gsmi_redidatpath and gsmi_grnidatpath:
                    print("Found paired channel idats for GSM...")
                    gsmi_redidatpath = gsmi_redidatpath[0]
                    gsmi_grnidatpath = gsmi_grnidatpath[0]
                    # idat filenames
                    grn_idatfn = os.path.basename(gsmi_grnidatpath)
                    red_idatfn = os.path.basename(gsmi_redidatpath)
                    # sample basename (common stem of channel array filenames)
                    print("Getting sample basename..."); gsmimdd = []
                    gsmi_basename = "_".join(red_idatfn.split("_")[0:3]) # basename
                    if gsmi_msrappath:
                        print("Detected metadata file for GSM.")
                        gsmi_msrappath = gsmi_msrappath[0]
                        gsmi_msrappath_var = os.path.basename(gsmi_msrappath)   
                        print("Forming flattened sample metadata...")
                        try:
                            # load msrap mapped terms json file
                            with open(gsmi_msrappath, 'r') as msrapmd:
                                gsmimdd = json.load(msrapmd)
                        except json.decoder.JSONDecodeError:
                            print("Error, cannot load non-json file: "+gsmi_msrappath)
                            gsmi_msrappath_var = "NA"
                    else:
                        gsmi_msrappath_var = "NA"
                    if gsmimdd and not gsmi_msrappath_var == "NA":
                        gsmi_md = gsmimdd[0]; gmd = []
                        # coerce json metadata to flat string
                        for key in list(gsmi_md.keys()):
                            print(str(key))
                            kval = ''
                            if key == 'sample type':
                                print("key = 'sample type'")
                                print("kval = "+str(gsmi_md[key]))
                                gmd.append(str("sampletype="+str(gsmi_md[key])))
                            if key == 'real-value properties':
                                print("key = "+str(key))
                                kval = gsmi_md[key]
                                print("kval : "+str(kval))
                                for index, val in enumerate(kval):
                                    subkval = kval[index]
                                    print("subkval : "+str(subkval))
                                    gmd.append(str(subkval['property_id']))
                            if key == 'mapped ontology terms':
                                print("key = "+str(key))
                                kval = gsmi_md[key]
                                gmd.append(";".join([term for term in kval]))
                            if key == 'sample-type confidence':
                                print("key = "+str(key))
                                gmd.append('sampletypeconf='+str(gsmi_md[key]))
                        gsmi_mdvar = ";".join(gmd) # long metadata string
                    else:
                        gsmi_mdvar = "NA"     
                    # form table row entry for gsmid as long string
                    print("Adding row to table list...")
                    lgsmi = " ".join([gsmid, # gsm id
                        gseid, # gse id
                        ";".join([red_idatfn,grn_idatfn]), # idat filenames
                        gsmi_msrappath_var, # metadata filename
                        gsmi_mdvar, # flattened json file
                        grn_idatfn.split("_")[-2], # sentrix id
                        grn_idatfn.split("_")[-3], # array id
                        gsmi_basename # minfi path Basename, for arrays
                    ])
                    lgsmi = lgsmi+"\n"
                    lsheet.append(lgsmi)
                else:
                    print("Error: GSM is missing one or more valid filepaths. Continuing...")
    else:
        print("No valid GSM IDs detected. Check idats and pipeline folders.")
        return None
    print("Finished processing the GSM files dictionary, writing new rsheet...")
    with open(sheets_fpath,'+w') as fsheet:
        for gsmitem in lsheet:
            fsheet.write(gsmitem)
    return lsheet
def preprocess_mdat(bnlistpass,
                    timelim=40,
                    nsampproc=10,
                    nprocmax=4,
                    statint=2):
    """ preprocess_mdat

        Preprocess mdat files via background subprocesses, monitoring, and 
        logging.

        Arguments
            * bnlistpass (list) : List of valid basenames.
            * timelim (int) : Time limit for running processes, in minutes
                (unused here; presumably consumed by the monitor -- TODO
                confirm; retained for interface compatibility).
            * nsampproc (int): Number of samples per process launched.
            * nprocmax (int): Total processes to launch.
            * statint (int): Seconds to wait before monitor status updates
                (unused here; retained for interface compatibility).

        Returns
            * None, produces status log in stdout and new logfile as side
                effect.
    """
    # batch the basenames into space-joined groups of nsampproc, capped at
    # nprocmax batches
    print("Forming basenames array...")
    n = nsampproc
    bnscreenarray = [
        ' '.join(bnlistpass[i * n:(i + 1) * n])
        for i in range((len(bnlistpass) + n - 1) // n)
    ]
    bnscreenarray = bnscreenarray[0:nprocmax]
    print("Finished forming basenames array of length = " +
          str(len(bnscreenarray)))
    # new screen deployment
    print("Getting timestamp...")
    timestamp = gettime_ntp()
    process_list = []  # process list for status monitoring and stderr
    # system limit on argument-string length
    print("Getting args maxstr...")
    # fix: decode the subprocess bytes output instead of parsing the
    # str(b'...') repr with chained replace() calls
    argmaxstr = int(
        subprocess.check_output(['getconf', 'ARG_MAX']).decode().strip())
    print("Detected argmaxstr of " + str(argmaxstr) + ". Continuing...")
    print("Launching background subprocesses...")
    for bi, bnstr in enumerate(bnscreenarray, 0):
        cmd = [
            'Rscript', settings.mdatscriptpath, timestamp,
            str(bi), '"' + bnstr + '"'
        ]
        cmdcharlen = len(''.join(cmd))
        print("Formed cmd str of len = " + str(cmdcharlen) +
              ", checking args str limit...")
        if cmdcharlen <= argmaxstr:
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            process_list.append(proc)
            print("Launched background subprocess and appended poll to " +
                  "statuslist. Continuing...")
        else:
            print("Error! Char length of cmd exceeds system limit for args. " +
                  "Try modifying argument 'nsampproc'. Continuing...")
    # process monitoring start
    monitor_processes(process_list=process_list, logpath=settings.mdatlogspath)
    print("Completed preprocessing. Returning...")
    return None
# Exemplo n.º 13 (scrape artifact; commented out to keep the file importable)
def dl_idat(input_list,
            retries_connection=3,
            retries_files=3,
            interval_con=.1,
            interval_file=.01,
            validate=True,
            timestamp=gettime_ntp()):
    """ dl_idat
        
        Download idats, reading in either list of GSM IDs or ftp addresses.
        
        Arguments
            * input_list (list, required) : A list of valid GSM IDs.
            * retries_connection (int) : Number of ftp connection retries 
                allowed.
            * retries_files (int) : Number of retry attempts allowed for sample
                file downloads.
            * interval_con (float) : Time (in seconds) to sleep before retrying 
                a database connection.
            * interval_file (float) : Time (in seconds) to sleep before retrying 
                a file connection. 
            * validate (Bool.): Validate new files against existing idats?
            * timestamp (str) : An NTP timestamp for versioning.
                NOTE(review): this default is evaluated once at module import,
                so every default call shares the same timestamp — confirm
                whether a per-call timestamp was intended.
        
        Returns 
            * dldict (dictionary) : Records, dates, and exit statuses of ftp 
                calls, OR error string over connection issues. Downloads and 
                moves new and validated files as side effect. 
    """
    # Ensure destination and temp dirs exist; files download into a fresh
    # temp subdir and only move into idatspath after validation below.
    idatspath = settings.idatspath
    temppath = settings.temppath
    os.makedirs(idatspath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    # Only the first ID's prefix is checked (raises IndexError on empty list).
    item = input_list[0]
    if not item.startswith('GSM'):
        raise RuntimeError("GSM IDs must begin with \"GSM\".")
    # NOTE(review): '******' looks like an FTP host string scrubbed by the
    # scraper — presumably 'ftp.ncbi.nlm.nih.gov' (see id_ftptokens below);
    # confirm before running.
    ftptoken_login = '******'
    retries_left_connection = retries_connection
    # Connection retry loop: break on success; sleep and retry on failure.
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            # NOTE(review): this condition is always truthy inside the loop,
            # so the else branch is unreachable; on exhaustion the while
            # exits instead, and if FTP() never succeeded, 'ftp' is unbound
            # when used below (NameError) — confirm intended.
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection, used to compare online file dates against the
    # locally recorded ones (via idat_mongo_date).
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    files_written = []
    for gsm_id in input_list:
        print('Starting GSM: ' + gsm_id)
        dldict[gsm_id] = []
        # GEO supplementary-file path tokens, e.g.
        # geo/samples/GSM123nnn/GSM123456/suppl/
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'samples', gsm_id[:-3] + 'nnn',
            gsm_id, 'suppl'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        filenames = []
        retries_left_files = retries_files
        try:
            filenames = ftp.nlst(id_ftpadd)
            if len(filenames) > 0:
                filestr = '; '.join(str(e) for e in filenames)
                print("files found: " + filestr)
                dldict[gsm_id].append([
                    gsm_id, id_ftpadd,
                    "connection success, valid num idats found"
                ])
                print("Idat filenames detected for " + gsm_id +
                      ", continuing...")
                for file in filenames:
                    print("Beginning iteration for file: " + file)
                    filedate = ""
                    filedate_estat = ""
                    filedl_estat = ""
                    file_tokens = file.split('/')
                    try:
                        # MDTM reply is '213 YYYYMMDDHHMMSS'; [4:] strips
                        # the reply code before parsing.
                        filedate = ftp.sendcmd("MDTM /" +
                                               '/'.join(file_tokens))
                        filedate = datetime.datetime.strptime(
                            filedate[4:], "%Y%m%d%H%M%S")
                        mongo_date = idat_mongo_date(gsm_id, file, client)
                        if filedate in mongo_date:
                            filedate_estat = "same_as_local_date"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            print('Online date same as local date. Breaking..')
                            # NOTE(review): this break exits the whole file
                            # loop for this GSM, skipping remaining files —
                            # confirm 'continue' was not intended.
                            break
                        else:
                            filedate_estat = "new_date"
                            # Versioned temp name: <gsm>.<timestamp>.<fname>
                            to_write = os.path.join(
                                temp_dir_make, '.'.join(
                                    [gsm_id,
                                     str(timestamp), file_tokens[-1]]))
                            file_ftpadd = '/'.join(file_tokens[:-1])
                            file_ftpadd = file_ftpadd + '/' + file_tokens[-1:][
                                0]
                            print('Attempting file download, for file: ' +
                                  file)
                            try:
                                with open(to_write, 'wb') as output_stream:
                                    filedl_estat = ftp.retrbinary(
                                        "RETR /" + file_ftpadd,
                                        output_stream.write)
                                dldict[gsm_id].append([
                                    gsm_id, file_ftpadd, to_write,
                                    filedl_estat, filedate, filedate_estat
                                ])
                                # Record successful transfers for validation;
                                # index points at the record appended above.
                                if '226 Transfer complete' in filedl_estat:
                                    files_written.append(
                                        (gsm_id, to_write,
                                         len(dldict[gsm_id]) - 1))
                                print("File successfully downloaded. " +
                                      "Continuing...")
                                continue
                            except ftplib.all_errors as efiledl:
                                if retries_left_files:
                                    retries_left_files -= 1
                                    print(
                                        'ftp file dl error, retries left = ' +
                                        str(retries_left_files))
                                    time.sleep(interval_file)
                                    continue
                                else:
                                    print(
                                        'File retries exhausted. Breaking...')
                                    filedl_estat = str(efiledl)
                                    dldict[gsm_id].append([
                                        gsm_id, file_ftpadd, to_write,
                                        filedl_estat, filedate, filedate_estat
                                    ])
                                    break
                            # NOTE(review): the two breaks below are dead
                            # code — every path above already ends in a
                            # continue or break.
                            break
                        break
                    except ftplib.all_errors as efiledate:
                        if retries_left_files:
                            retries_left_files -= 1
                            print('ftplib file date error, retries left = ' +
                                  str(retries_left_files))
                            time.sleep(interval_file)
                            continue
                        else:
                            print('File retries exhausted. Breaking...')
                            filedate_estat = str(efiledate)
                            filedate = "not_available"
                            dldict[gsm_id].append(
                                [gsm_id, file, filedate, filedate_estat])
                            break
                    continue
            else:
                dldict[gsm_id].append([gsm_id, "no files at ftp address"])
                # NOTE(review): this break aborts the remaining GSM IDs, not
                # just this one — confirm 'continue' was not intended.
                break
        except ftplib.error_temp as eid:
            if retries_left_files:
                retries_left_files -= 1
                print('ftplib filenames error, retries left = ' +
                      str(retries_left_files))
                time.sleep(interval_file)
                # NOTE(review): this continue advances to the NEXT gsm_id
                # rather than retrying the current one.
                continue
            else:
                print('File retries exhausted. Breaking...')
                dldict[gsm_id].append([gsm_id, id_ftpadd, str(eid)])
                break
    if validate:
        print("Validating downloaded files...")
        for gsm_id, file_written, index in files_written:
            print("file written is " + file_written)
            # Strip the '<gsm>.<timestamp>.' prefix to recover the original
            # idat filename used for the latest-version lookup.
            filestr = os.path.basename(file_written).split('.')[2::]
            filestr = str('.'.join(filestr))
            print('filestr written : ' + filestr)
            print('dir to search latest: ' + idatspath)
            gsmidat_latest = getlatest_filepath(idatspath,
                                                filestr,
                                                embeddedpattern=True,
                                                returntype='returnlist',
                                                tslocindex=1)
            print('gsm latest: ' + str(gsmidat_latest))
            if gsmidat_latest:
                gsmidat_latest = gsmidat_latest[0]
                print('cmp result: ' +
                      str(filecmp.cmp(gsmidat_latest, file_written)))
                if filecmp.cmp(gsmidat_latest, file_written):
                    print(
                        "Downloaded file is same as recent file. Removing...")
                    os.remove(file_written)
                    # If filename is false, we found it was the same
                    dldict[gsm_id][index].append(False)
                else:
                    print("Downloaded file is new, moving to idatspath...")
                    shutil.move(
                        file_written,
                        os.path.join(idatspath,
                                     os.path.basename(file_written)))
                    dldict[gsm_id][index].append(True)
                    dldict[gsm_id][index][2] = os.path.join(
                        idatspath, os.path.basename(file_written))
            else:
                print("Downloaded file is new, moving...")
                shutil.move(
                    file_written,
                    os.path.join(idatspath, os.path.basename(file_written)))
                dldict[gsm_id][index].append(True)
                dldict[gsm_id][index][2] = os.path.join(
                    idatspath, os.path.basename(file_written))
        # NOTE(review): temp dir is only removed when validate=True; with
        # validate=False, downloads are left behind in temppath.
        shutil.rmtree(temp_dir_make)
    return dldict
Exemplo n.º 14
0
def dl_soft(gse_list=None,
            retries_connection=3,
            retries_files=3,
            interval_con=.1,
            interval_file=.01,
            validate=True,
            timestamp=None):
    """ dl_soft
        
        Download GSE soft file(s) for a list of GSE IDs.
        
        Arguments:
            * gse_list (list, required) : A list of valid GSE id(s).
            * retries_connection (int) : Number of ftp connection retries 
                allowed. 
            * retries_files (int) : Number of retry attempts allowed for sample
                file downloads. 
            * interval_con (float) : Time (in seconds) to sleep before retrying 
                a database connection. 
            * interval_file (float) : Time (in seconds) to sleep before retrying 
                a file connection. 
            * validate (Bool.): Validate new files against existing soft files?
            * timestamp (str) : An NTP timestamp for versioning; if None, a
                fresh timestamp is obtained per call.
        
        Returns: 
            * Dictionary showing records, dates, and exit statuses of ftp calls
                OR error string over connection issues
    """
    # Resolve defaults at call time. The original signature used a mutable
    # default ([]) and called gettime_ntp() at import, freezing one shared
    # timestamp for every default call.
    if gse_list is None:
        gse_list = []
    if timestamp is None:
        timestamp = gettime_ntp()
    gsesoftpath = settings.gsesoftpath
    temppath = settings.temppath
    os.makedirs(gsesoftpath, exist_ok=True)
    os.makedirs(temppath, exist_ok=True)
    # Downloads land in a fresh temp subdir; validated files move to
    # gsesoftpath below.
    temp_dir_make = tempfile.mkdtemp(dir=temppath)
    # Prefix check on the first ID (raises IndexError on an empty list, as
    # in the original).
    item = gse_list[0]
    if not item.startswith('GSE'):
        raise RuntimeError("GSE IDs must begin with \"GSE\".")
    # NCBI GEO FTP host. NOTE(review): the scraped original had this literal
    # scrubbed to '******'; restored from the ftp path tokens used below.
    ftptoken_login = 'ftp.ncbi.nlm.nih.gov'
    retries_left_connection = retries_connection
    # Connection retry loop: break on success, return the error string on
    # exhaustion.
    while retries_left_connection:
        print('trying ftp connection')
        try:
            ftp = ftplib.FTP(ftptoken_login)
            loginstat = ftp.login()
            print('connection successful, continuing...')
            break
        except ftplib.all_errors as e:
            if retries_left_connection:
                retries_left_connection -= 1
                print('continuing with connection retries left = ' +
                      str(retries_left_connection))
                time.sleep(interval_con)
                continue
            else:
                print('connection retries exhausted, returning...')
                return str(e)
    # mongodb connection, used to compare online file dates against the
    # locally recorded ones (via soft_mongo_date).
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dldict = {}
    # Fix: initialize files_written BEFORE the GSE loop. The original reset
    # it inside the loop, so validation below only ever saw the last GSE's
    # downloads (dl_idat initializes it before its loop).
    files_written = []
    print('beginning iterations over gse list...')
    for gse in gse_list:
        print('beginning download for gse: ' + gse)
        retries_left_files = retries_files
        dldict[gse] = []
        filenames = []
        # tokens for soft file ftp address, e.g.
        # geo/series/GSE123nnn/GSE123456/soft/
        id_ftptokens = [
            'ftp.ncbi.nlm.nih.gov', 'geo', 'series', gse[:-3] + 'nnn', gse,
            'soft'
        ]
        id_ftpadd = '/'.join(id_ftptokens[1::]) + '/'
        while retries_left_files:
            try:
                filenames = ftp.nlst(id_ftpadd)
                # filter for only soft file names
                file = list(filter(lambda x: 'family.soft' in x, filenames))[0]
                dldict[gse].append([gse, id_ftpadd, "success"])
                filedate = ""
                filedate_estat = ""
                filedl_estat = ""
                file_tokens = file.split('/')
                try:
                    print('getting date from ' + '/'.join(file_tokens))
                    # MDTM reply is '213 YYYYMMDDHHMMSS'; [4:] strips the
                    # reply code before parsing.
                    filedate = ftp.sendcmd("MDTM /" + '/'.join(file_tokens))
                    filedate = datetime.datetime.strptime(
                        filedate[4:], "%Y%m%d%H%M%S")
                    mongo_date = soft_mongo_date(gse, file, client)
                    if filedate in mongo_date:
                        print('online  date same as local date,' +
                              'breaking...')
                        filedate_estat = "same_as_local_date"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
                    else:
                        print('new online date found, continuing...')
                        filedate_estat = "new_date"
                        # Versioned temp name: <gse>.<timestamp>.<fname>
                        to_write = os.path.join(
                            temp_dir_make,
                            '.'.join([gse, timestamp, file_tokens[-1]]))
                        file_ftpadd = '/'.join(file_tokens[:-1])
                        file_ftpadd = file_ftpadd + '/' + file_tokens[-1:][0]
                        try:
                            print('downloading soft from ' + file_ftpadd)
                            with open(to_write, 'wb') as output_stream:
                                filedl_estat = ftp.retrbinary(
                                    "RETR /" + file_ftpadd,
                                    output_stream.write)
                            dldict[gse].append([
                                gse, file_ftpadd, to_write, filedl_estat,
                                filedate, filedate_estat
                            ])
                            # Record successful transfers for validation;
                            # index points at the record appended above.
                            if '226 Transfer complete' in filedl_estat:
                                files_written.append(
                                    (gse, to_write, len(dldict[gse]) - 1))
                            print('total files written = ' +
                                  str(len(files_written)))
                            print('soft transfer successful for ' + to_write +
                                  ', breaking...')
                            break
                        except ftplib.all_errors as efiledl:
                            print('file download error from ' + file_ftpadd)
                            if retries_left_files:
                                retries_left_files -= 1
                                print('continuing with file retries left =' +
                                      str(retries_left_files))
                                time.sleep(interval_file)
                                continue
                            else:
                                print('file retries exhausted, breaking..')
                                filedl_estat = str(efiledl)
                                dldict[gse].append([
                                    gse, file_ftpadd, to_write, filedl_estat,
                                    filedate, filedate_estat
                                ])
                                break
                except ftplib.all_errors as efiledate:
                    print('error getting date from ' + '/'.join(file_tokens))
                    if retries_left_files:
                        retries_left_files -= 1
                        print('continuing with file retries left = ' +
                              str(retries_left_files))
                        time.sleep(interval_file)
                        continue
                    else:
                        print('file retries exhausted, breaking..')
                        filedate_estat = str(efiledate)
                        filedate = "not_available"
                        dldict[gse].append(
                            [gse, file, filedate, filedate_estat])
                        break
            except ftplib.error_temp as eid:
                print('error making ftp connection to ' + id_ftpadd)
                if retries_left_files:
                    # Fix: the original decremented retries_left_connection
                    # here, so retries_left_files never reached zero and the
                    # while loop spun forever on a persistent error_temp.
                    retries_left_files -= 1
                    print('ftplib error encountered, file retries left = ' +
                          str(retries_left_files))
                    time.sleep(interval_file)
                    continue
                else:
                    print('file retries exhausted, breaking..')
                    dldict[gse].append([gse, id_ftpadd, str(eid)])
                    break
    if validate:
        print('commencing file validation...')
        for gse, new_filepath, index in files_written:
            filestr = os.path.basename(new_filepath).split('.')[0]
            gsesoft_latest = getlatest_filepath(gsesoftpath, filestr)
            if gsesoft_latest and not gsesoft_latest == 0:
                if filecmp.cmp(gsesoft_latest, new_filepath):
                    print('identical file found in dest_dir, removing...')
                    dldict[gse].append(False)
                    os.remove(new_filepath)
                else:
                    print('new file detected in temp_dir, moving to ' +
                          'dest_dir...')
                    dldict[gse].append(True)
                    dldict[gse][index][2] = os.path.join(
                        gsesoftpath, os.path.basename(new_filepath))
                    # Fix: the original referenced an undefined name
                    # 'dest_dir' here (NameError); the destination is
                    # gsesoftpath, as in the parallel branch below.
                    shutil.move(
                        new_filepath,
                        os.path.join(gsesoftpath,
                                     os.path.basename(new_filepath)))
            else:
                print('new file detected in temp_dir, moving to dest_dir..')
                dldict[gse].append(True)
                dldict[gse][index][2] = os.path.join(
                    gsesoftpath, os.path.basename(new_filepath))
                shutil.move(
                    new_filepath,
                    os.path.join(gsesoftpath, os.path.basename(new_filepath)))
            continue
        # NOTE: temp dir is only removed when validate=True, mirroring the
        # original (and dl_idat's) behavior.
        shutil.rmtree(temp_dir_make)
    return dldict
Exemplo n.º 15
0
        Null, provides status updates over run.

    """


if __name__ == "__main__":
    # Script entry point: wire up project paths and settings, then parse an
    # optional GSE ID from the command line for immediate download.
    print("Starting server.py...")
    import subprocess, glob, sys, os, re
    # Make project modules importable from the repo root layout.
    sys.path.insert(0, os.path.join("recountmethylation_server", "src"))
    import edirect_query, settings, argparse
    # NOTE(review): settings.init() presumably populates module-level config
    # (paths, db host/port) used by the imported helpers — confirm.
    settings.init()
    from edirect_query import gsm_query, gse_query, gsequery_filter
    from utilities import gettime_ntp, getlatest_filepath, querydict
    from utilities import get_queryfilt_dict
    from gse_celerytask import gse_task
    from random import shuffle
    gselist = []  # queue input, gse-based
    qstatlist = []  # job status object, also stored at sqlite db
    print("Getting timestamp...")
    run_timestamp = gettime_ntp()  # pass this result to child functions
    # Parse the specified GSE ID.
    parser = argparse.ArgumentParser(description='Arguments for server.py')
    parser.add_argument(
        "--gseid",
        type=str,
        required=False,
        default=None,
        help='Option to enter valid GSE ID for immediate download.')
    args = parser.parse_args()
Exemplo n.º 16
0
def compile_rsheet(eqfiltd=None,
                   sheetfn_ext='rsheet',
                   msrapfn_ext='msrapout',
                   msrapfn='msrapout',
                   idatsfn_ext='idat',
                   timestamp=gettime_ntp() if False else None):
    """ compile_rsheet

        Knits poised file data together into a sheet to be read into R using 
        minfi. Steps taken include: 
            1. Grab msrap file list
            2. Grab idats file list
            3. Intersect files lists
            4. Subset eqfilt dict on gse
            5. Form and write new sheet files, one per gse
        
        Arguments
        * eqfiltd (dict) : Equery filter dictionary object; if None, it is
            loaded with get_queryfilt_dict() at call time.
        * sheetfn_ext (str) : Filename extension for new sheet files.
        * msrapfn_ext (str) : Filename extension of valid MetaSRA-pipeline
            datafiles.
        * msrapfn (str) : File name stem for MetaSRA-pipeline files
            (currently unused by the body; retained for compatibility).
        * idatsfn_ext (str) : Filename extension of valid idat files.
        * timestamp (str) : NTP timestamp for file versioning; if None, a
            fresh timestamp is obtained at call time.
        
        Returns:
        * lsheet (list) : The sheet rows written, or 0 if no valid GSM IDs
            were detected. Produces sheet files as a side effect.
    """
    # Resolve def-time defaults at call time. The original signature called
    # get_queryfilt_dict() and gettime_ntp() at import, freezing one query
    # dict and one timestamp for every default call.
    if eqfiltd is None:
        eqfiltd = get_queryfilt_dict()
    if timestamp is None:
        timestamp = gettime_ntp()
    # form the sheet path and make dir as needed
    sheetspath = settings.sheetspath
    os.makedirs(sheetspath, exist_ok=True)
    sheets_fpath = os.path.join(sheetspath, ".".join([timestamp, sheetfn_ext]))
    # form msrap and idat paths and get filenames
    msrap_path = settings.gsmmsrapoutpath
    rxmsrap = re.compile(".*" + msrapfn_ext + "$")
    msrap_fnlist = list(filter(rxmsrap.match, os.listdir(msrap_path)))
    print("msrap_fnlist : " + str(msrap_fnlist))
    # idats fn
    idats_path = settings.idatspath
    rxidat = re.compile(".*" + idatsfn_ext + "$")
    idats_fnlist = list(filter(rxidat.match, os.listdir(idats_path)))
    # extract gsm ids from dot-delimited filenames
    rxgsm = re.compile(".*GSM[0-9]")
    idats_splitlist = [
        idatfn.split(".")[0] for idatfn in idats_fnlist
        if len(idatfn.split(".")) > 1
    ]
    idats_gsmlist_filt = list(set(filter(rxgsm.match,
                                         idats_splitlist)))  # unique gsm ids
    # NOTE: loop variable renamed from the original 'msrapfn', which
    # shadowed the function parameter of the same name.
    msrap_splitlist = [
        mfn.split(".")[1] for mfn in msrap_fnlist
        if len(mfn.split(".")) > 1
    ]
    msrap_gsmlist_filt = list(set(filter(rxgsm.match,
                                         msrap_splitlist)))  # unique gsm ids
    print("idats_gsmlist_filt : " + str(idats_gsmlist_filt))
    print("msrap_gsmlist_filt : " + str(msrap_gsmlist_filt))
    # keep only GSMs that have BOTH idats and msrap metadata
    gsmvalid = [
        gsmid for gsmid in msrap_gsmlist_filt if gsmid in idats_gsmlist_filt
    ]
    if len(gsmvalid) > 0:
        rxgrn = re.compile(".*Grn.idat$")
        rxred = re.compile(".*Red.idat$")
        lsheet = []  # list object to write rsheet, one row per gsmid
        # append colnames
        lsheet.append(" ".join([
            "gsmid", "gseid", "idats_fn", "msrapmd_fn", "msrapmd_flatjson",
            "SENTRIX_ID", "ARRAY_ID", "Basename"
        ]))
        lsheet[0] = lsheet[0] + "\n"
        for gsmid in gsmvalid:
            # compile the file info for this gsm
            rxgsmi = re.compile(".*" + gsmid + ".*")
            gsmi_idats = list(filter(rxgsmi.match, idats_fnlist))
            gsmi_red_idats = list(filter(rxred.match, gsmi_idats))
            gsmi_grn_idats = list(filter(rxgrn.match, gsmi_idats))
            if not gsmi_red_idats or not gsmi_grn_idats:
                # Robustness: a GSM missing either channel file would have
                # raised IndexError below in the original; skip it instead.
                print("Missing Red or Grn idat for " + gsmid +
                      ", skipping...")
                continue
            # get the latest file versions
            gsmi_red_pattern = gsmi_red_idats[0].split(".")[2]
            gsmi_grn_pattern = gsmi_grn_idats[0].split(".")[2]
            gsmi_red_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_red_pattern,
                                                 embeddedpattern=True)
            gsmi_grn_latest = getlatest_filepath(filepath=idats_path,
                                                 filestr=gsmi_grn_pattern,
                                                 embeddedpattern=True)
            # get the latest msrap file
            gsmi_msrap_latest = getlatest_filepath(filepath=msrap_path,
                                                   filestr=gsmid,
                                                   embeddedpattern=True)
            print(gsmi_msrap_latest)
            if (gsmi_red_latest and not gsmi_red_latest == 0
                    and gsmi_grn_latest and not gsmi_grn_latest == 0
                    and gsmi_msrap_latest and not gsmi_msrap_latest == 0):
                # form the rsheets with valid gsm ids
                with open(gsmi_msrap_latest, 'r') as msrapmd:
                    gsmi_metadata_dict = json.load(msrapmd)
                gsmi_md = gsmi_metadata_dict[0]  # weird dictionary
                # flatten metadata into a single quoted 'key:value;...' field
                grows = []
                for key in list(gsmi_md.keys()):
                    kval = gsmi_md[key]
                    if type(kval) is list:
                        grows.append(";".join(kval))
                    else:
                        grows.append(":".join([str(key), str(gsmi_md[key])]))
                gsmi_mdvar = "'" + ";".join(grows) + "'"
                # grab the gse id for this gsm
                gseid = str([
                    gsek for gsek in list(eqfiltd.keys())
                    if gsmid in eqfiltd[gsek]
                ][0])
                # make the gsm arrays path Basename for minfi
                gsmi_bn = "_".join(gsmi_red_latest.split("_")[0:3])
                # one entry per gsm
                lgsmi = " ".join([
                    gsmid,  # gsm id
                    gseid,  # gse id
                    ";".join([
                        os.path.basename(gsmi_red_latest),
                        os.path.basename(gsmi_grn_latest)
                    ]),  # idat filenames
                    os.path.basename(gsmi_msrap_latest),  # metadata filename
                    gsmi_mdvar,  # flattened json file
                    os.path.basename(gsmi_red_latest).split(
                        "_")[-2],  # sentrix id
                    os.path.basename(gsmi_red_latest).split("_")
                    [-3],  # array id
                    gsmi_bn  # minfi path Basename, for arrays
                ])
                lgsmi = lgsmi + "\n"
                lsheet.append(lgsmi)
    else:
        print(
            "No valid GSM IDs detected. Check idats and MetaSRA-pipeline GSM "
            + "files directories.")
        return 0
    # write the final sheet file
    with open(sheets_fpath, 'w') as fsheet:
        for item in lsheet:
            fsheet.write(item)

    return lsheet
Exemplo n.º 17
0
def rmdb_fpaths_old(rmhlinks=False):
    """ rmdb_fpaths_old

        Get filepaths for existant sample idats and msrap outfiles.

        Arguments:
        * rmhlinks : Whether to remove old hardlinks and form new ones, 
                regardless of whether current hlinks exist (boolean).

        Returns:
        * gsm_fpaths_dd (dict) : Map of GSM id to a list of validated
            filepaths: [grn idat hlink path, red idat hlink path, msrap
            outfile path], with None/False placeholders where the files
            or db records were not found.
    """
    timestamp = gettime_ntp()
    # connect to RMDB mongodb and fetch all idat records
    client = pymongo.MongoClient(settings.rmdbhost, settings.rmdbport)
    dbcon = client.recount_methylation
    idatscon = dbcon.gsm.idats
    idatslist = list(idatscon.find())
    # grab unique gsm ids from records that carry a gsmid field
    idatslist = [record for record in idatslist if 'gsmid' in record.keys()]
    gsmindex = list(set([record['gsmid'] for record in idatslist]))
    print("from idats db, found n = "+str(len(gsmindex))+" gsm ids")
    # filter all records for gsm on most recent update datetime
    gsm_fpaths_dd = {}
    # list all previously expanded idat files directly from idats dir
    allidatslist = os.listdir(settings.idatspath)
    # raw string avoids the invalid '\.' string escape (DeprecationWarning)
    allidatslist = list(filter(re.compile(r'.*\.idat$').match, allidatslist))
    print("found n = "+str((len(allidatslist)))+" expanded idat filenames...")

    def _hlink_matches(idat_fn):
        # Existing hardlink fnames sharing this idat's array identifiers.
        # NOTE(review): assumes fname fields are dot-delimited with the
        # array info starting at index 2 — confirm against hlink naming.
        idsub = '.'.join(idat_fn.split('.')[2:])
        return [fn for fn in allidatslist if "hlink" in fn and idsub in fn]

    def _latest_channel_record(records, channel_regex):
        # Most recent record (by 'date') whose filepath basename matches
        # the channel pattern; None when no valid record exists.
        chanrec = [record for record in records
            if isinstance(record['date'], datetime.datetime)
            and re.search(channel_regex, os.path.basename(record['filepath']))
        ]
        return sorted(chanrec, key=lambda k: k['date'])[-1] if chanrec else None

    if rmhlinks:
        print("Beginning sample iterations with hlink removal.")
    else:
        print("Beginning sample iterations without hlink removal.")
    for gi, gsmid in enumerate(gsmindex, 1):
        print("Getting fpaths for gsm: "+str(gsmid)+", num: "+str(gi), end="\r")
        gsm_fpaths_dd[gsmid] = []
        # all idat records for the GSM id
        recordsgsm = [record for record in idatslist if record['gsmid']==gsmid]
        # latest valid record per channel; most records are compressed files
        irec_filtgrn = _latest_channel_record(recordsgsm, r'.*Grn\.idat.*')
        irec_filtred = _latest_channel_record(recordsgsm, r'.*Red\.idat.*')
        if irec_filtgrn and irec_filtred:
            # valid record file basenames
            igrnrec_bn = os.path.basename(irec_filtgrn['filepath'])
            iredrec_bn = os.path.basename(irec_filtred['filepath'])
            # check for expanded versions of compressed files
            # ([:-3] strips the trailing '.gz' from the record basename)
            igrn_fn = [fn for fn in allidatslist if igrnrec_bn[:-3] in fn]
            ired_fn = [fn for fn in allidatslist if iredrec_bn[:-3] in fn]
            if igrn_fn and ired_fn:
                igrn_fn = igrn_fn[0]
                ired_fn = ired_fn[0]
                hllist = []
                if rmhlinks:
                    # remove any old hardlinks, then form fresh ones
                    for hlfn in _hlink_matches(igrn_fn) + _hlink_matches(ired_fn):
                        os.remove(os.path.join(settings.idatspath, hlfn))
                    hllist = new_idat_hlinks(gsmid, ts=timestamp,
                            igrn_fn=igrn_fn, ired_fn=ired_fn)
                else:
                    # reuse existing hlinks when a matched grn/red pair
                    # exists; otherwise create new ones
                    grnhllist = _hlink_matches(igrn_fn)
                    redhllist = _hlink_matches(ired_fn)
                    if grnhllist and redhllist:
                        grnhllistfilt = list(set(grnhllist))
                        redhllistfilt = []
                        for ghl in grnhllistfilt:
                            for rhl in redhllist:
                                # check that base array ids are identical
                                # ([:-9] drops the channel/ext suffix)
                                if ghl[:-9]==rhl[:-9]:
                                    redhllistfilt.append(rhl)
                                else:
                                    redhllistfilt.append("")
                        rhlfiltsub = [rhl[:-9] for rhl in redhllistfilt]
                        grnhllistfilt = [ghl for ghl in grnhllistfilt 
                            if ghl[:-9] in rhlfiltsub]
                        redhllistfilt = [rhl for rhl in redhllistfilt
                            if not rhl==""]
                        if grnhllistfilt and redhllistfilt:
                            # pass existing hlinks to return dictionary
                            grnfnpass = grnhllistfilt[0]
                            redfnpass = redhllistfilt[0]
                            hllist.append(os.path.join(settings.idatspath, grnfnpass))
                            hllist.append(os.path.join(settings.idatspath, redfnpass))
                        else:
                            # no matched pair survived filtering
                            hllist = new_idat_hlinks(gsmid, ts=timestamp, 
                                igrn_fn=igrn_fn, ired_fn=ired_fn)
                    else:
                        # no existing hlinks for one or both channels
                        hllist = new_idat_hlinks(gsmid, ts=timestamp, 
                            igrn_fn=igrn_fn, ired_fn=ired_fn)
                # finally, pass listed hlinks to return dictionary
                gsm_fpaths_dd[gsmid].append(hllist[0])  
                gsm_fpaths_dd[gsmid].append(hllist[1])    
            else:
                # expanded idats not found for one or both channels
                gsm_fpaths_dd[gsmid].append(None)
                gsm_fpaths_dd[gsmid].append(None)
        else:
            # missing valid db records for one or both channels
            gsm_fpaths_dd[gsmid].append(False)
        # check for valid MetaSRA-pipeline filepaths
        try:
            msraplatest = getlatest_filepath(filepath=settings.gsmmsrapoutpath,
                filestr=gsmid, embeddedpattern=True, tslocindex=0, 
                returntype='returnlist'
            )
            if msraplatest and len(msraplatest)==1:
                gsm_fpaths_dd[gsmid].append(msraplatest[0])
        except Exception:
            # narrowed from a bare 'except' so KeyboardInterrupt/SystemExit
            # still propagate; any lookup failure records False
            gsm_fpaths_dd[gsmid].append(False)
        print("Finished with sample num "+str(gi), end="\r")
    print("Finished sample iterations. Returning...")
    # return gsmid dictionary with lists of filtered results or valid fpaths
    return gsm_fpaths_dd