def getLocalGeo(fsave, fill_or_not=False, xmlPath="geo_gse", DataType=False, refresh=False):
    """Scan the local GEO XML files and write a table of the matching series.

    The function first asks ``getGEOSamples_byType_gse.getGeoSamples_byTypes``
    for every series found under ``xmlPath``, dumps the returned ids to
    ``fsave + '_gse.txt'``, and then re-queries each id individually with the
    ``'sc-atac-seq'`` data type. For every key of a non-empty per-series
    result, the annotation returned by
    ``scrna_parser_detail_gse.update_one_sample`` plus the stringified match
    value is appended to ``fsave`` as one tab-separated row.

    Args:
        fsave: Path of the output table. It is truncated and a header line is
            written before any row is appended.
        fill_or_not: Not used by this function; kept for interface
            compatibility.
        xmlPath: Directory handed to the helpers as ``ddir``.
        DataType: When truthy, the initial scan is restricted to
            ``['sc-atac-seq']``; otherwise ``False`` is passed as the data
            type filter.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.

    Returns:
        None. Results are written to ``fsave`` and ``fsave + '_gse.txt'``.
    """
    header = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
              'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
              'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(header) + '\n')

    getSyncLog(
        "# 1. go through all the xml and check the type in the path of %s" % xmlPath)
    local_repo_path = "repository_samples.pickle"
    datatype = ['sc-atac-seq'] if DataType else False
    local_repo_samples_dict = getGEOSamples_byType_gse.getGeoSamples_byTypes(
        path=local_repo_path, datatype=datatype, ddir=xmlPath, refresh=refresh)
    local_repo_samples = set(local_repo_samples_dict.keys())
    with open(fsave + '_gse.txt', 'w') as out:
        out.write('\n'.join(local_repo_samples))

    getSyncLog("# 2. calculate new samples")
    for gseid in local_repo_samples:
        print(gseid)
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path=local_repo_path, datatype=['sc-atac-seq'], ddir=xmlPath,
            gseids=[gseid], refresh=refresh)
        if not getType:
            continue
        for key in getType.keys():
            # parse sample annotation
            # presumably a list of strings, one per table column -- verify
            # against scrna_parser_detail_gse.update_one_sample
            list_sample = scrna_parser_detail_gse.update_one_sample(
                gseid=gseid, ddir=xmlPath)
            try:
                list_sample.append(str(getType[key]))  # add matched key words
                with open(fsave, 'a') as table:
                    table.write('\t'.join(list_sample) + '\n')
            except Exception as err:
                # Best effort: a row that cannot be written is logged and
                # skipped. The previous handler formatted the message with an
                # undefined name and therefore raised NameError itself.
                getSyncLog("Error when writing in table: %s (%s)" % (gseid, err))
def sync_samples_from_gse_factor(infile, gse_col, fsave, xmlPath='geo_gse', exludeFile=False, fill_or_not=False, refresh=False):
    """Build the series table for the GSE ids listed in a tab-separated file.

    Each non-blank line of ``infile`` is split on tabs; the value in column
    ``gse_col`` is used as the series id. Duplicate rows are dropped (first
    occurrence wins, file order is kept) and rows matched by the exclude file
    are skipped. For every remaining row the series is queried through
    ``getGEOSamples_byType_gse.getGeoSamples_byTypes`` with the
    ``'sc-rna-seq'`` and ``'sc-atac-seq'`` data types:

    * non-empty result: one row per result key is appended to ``fsave``,
      made of the annotation from
      ``scrna_parser_detail_gse.update_one_sample`` plus the stringified
      match value;
    * empty result: the input row is appended to ``fsave + '_others.txt'``.

    Args:
        infile: Path of the tab-separated input table.
        gse_col: Zero-based column index of the series id; converted with
            ``int()``.
        fsave: Path of the output table. It is truncated and a header line is
            written before any row is appended.
        xmlPath: Directory handed to the helpers as ``ddir``.
        exludeFile: Optional path of a file with one entry per line. A row is
            skipped when an entry equals either its series id or the whole
            tab-joined row.
        fill_or_not: Not used by this function; kept for interface
            compatibility.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.

    Returns:
        None. Results are written to ``fsave`` and ``fsave + '_others.txt'``.

    Raises:
        IndexError: If a non-blank row has no column ``gse_col``.
    """
    getSyncLog(
        "try to add samples based on outside table which contain gsm ID and factor name"
    )
    header = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
              'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
              'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(header) + '\n')

    exclude = set()
    if exludeFile:
        with open(exludeFile) as handle:
            exclude = {line.rstrip() for line in handle}

    gse_index = int(gse_col)
    with open(infile) as handle:
        # Rows are stored as tuples so they are hashable; the former
        # set-of-lists construction always raised TypeError.
        rows = [tuple(line.rstrip().split('\t'))
                for line in handle if line.strip()]
    # dict.fromkeys removes duplicates while keeping the file order.
    need_added_samples = [
        row for row in dict.fromkeys(rows)
        if row[gse_index] not in exclude and '\t'.join(row) not in exclude
    ]
    getSyncLog('totally %d samples need to be added' % len(need_added_samples))

    for n, iterm in enumerate(need_added_samples, 1):
        print(n)  # progress indicator
        gseid = iterm[gse_index]
        print(gseid)
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path="repository_samples.pickle",
            datatype=['sc-rna-seq', 'sc-atac-seq'],
            gseids=[gseid], refresh=refresh, ddir=xmlPath)
        if getType:
            for key in getType.keys():
                # parse sample annotation
                # presumably a list of strings, one per table column -- verify
                # against scrna_parser_detail_gse.update_one_sample
                list_sample = scrna_parser_detail_gse.update_one_sample(
                    gseid=gseid, ddir=xmlPath)
                try:
                    list_sample.append(str(getType[key]))  # add matched key words
                    with open(fsave, 'a') as table:
                        table.write('\t'.join(list_sample) + '\n')
                except Exception as err:
                    # Best effort: log and skip the row. The previous handler
                    # referenced an undefined name and raised NameError.
                    getSyncLog("Error when writing in table: %s (%s)" % (gseid, err))
        else:
            # No requested data type matched: keep the input row for review.
            with open(fsave + '_others.txt', 'a') as out:
                out.write('\t'.join(iterm) + '\n')
def _sync_gse(fsave, fill_or_not=False, DataType=False, dateRegion=False, refresh=False, exludeFile=False, xmlPath='./geo_gse'):
    """Walk the GDS ids returned by ``getGDSSamples`` and tabulate the series.

    Every GDS id is converted to a GSE accession with ``gse_idToAcc``.
    Accessions listed in the exclude file are skipped. For the others
    ``getGeoXML`` is called and the series is queried through
    ``getGEOSamples_byType_gse.getGeoSamples_byTypes`` with the
    ``'sc-rna-seq'`` and ``'sc-atac-seq'`` data types:

    * non-empty result: one row per result key is appended to ``fsave``,
      made of the annotation from
      ``scrna_parser_detail_gse.update_one_sample`` plus the stringified
      match value;
    * empty result: the accession is appended to ``fsave + '_others.txt'``.

    Args:
        fsave: Path of the output table. It is truncated and a header line is
            written before any row is appended.
        fill_or_not: Not used by this function; kept for interface
            compatibility.
        DataType: Not used by this function; kept for interface
            compatibility.
        dateRegion: Passed unchanged to ``getGDSSamples``.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.
        exludeFile: Optional path of a file with one GSE accession per line
            that must not be parsed again.
        xmlPath: Directory handed to the helpers as ``ddir``.

    Returns:
        None. Results are written to ``fsave`` and ``fsave + '_others.txt'``.
    """
    # get all GDS ids of GSE from API
    gdsSamples = getGDSSamples(dateRegion)
    total = len(gdsSamples)
    getSyncLog('start: There are %s GDS Samples in sum' % total)

    # the gse maybe existed; a set gives O(1) membership tests in the loop
    exclude = set()
    if exludeFile:
        with open(exludeFile) as handle:
            exclude = {line.rstrip() for line in handle}

    # open a file to save information
    header = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
              'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
              'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(header) + '\n')

    # convert gds id to GSE and download XML file
    last_percent = 0
    for cnt, gds in enumerate(gdsSamples, 1):
        # Integer arithmetic: the former ``len(...)/100`` step was a float on
        # Python 3 and zero on Python 2 for fewer than 100 ids.
        percent = 100 * cnt // total
        if percent != last_percent:
            getSyncLog("%s%%" % percent)
            last_percent = percent
        # NOTE(review): the flattened source does not show whether this pause
        # belonged to the progress branch or to every iteration; it is kept
        # per request on the assumption that it throttles the remote API --
        # confirm.
        time.sleep(3)
        gseid = gse_idToAcc(gds)
        if gseid in exclude:  # if existed, don't parse again
            continue
        # Called for its effect only (the result was never used); presumably
        # it stores the series XML that the helpers below read -- verify.
        getGeoXML(gseid)
        print(gseid)
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path="repository_samples.pickle",
            datatype=['sc-rna-seq', 'sc-atac-seq'],
            gseids=[gseid], refresh=refresh, ddir=xmlPath)
        if getType:
            for key in getType.keys():
                # parse sample annotation
                # presumably a list of strings, one per table column -- verify
                # against scrna_parser_detail_gse.update_one_sample
                list_sample = scrna_parser_detail_gse.update_one_sample(
                    gseid=gseid, ddir=xmlPath)
                try:
                    list_sample.append(str(getType[key]))  # add matched key words
                    with open(fsave, 'a') as table:
                        table.write('\t'.join(list_sample) + '\n')
                except Exception as err:
                    # Best effort: log and skip the row. The previous handler
                    # referenced an undefined name and raised NameError.
                    getSyncLog("Error when writing in table: %s (%s)" % (gseid, err))
        else:
            # gseid is a single string; joining it with tabs would split it
            # into characters, so it is written as-is.
            with open(fsave + '_others.txt', 'a') as out:
                out.write(gseid + '\n')
    getSyncLog('done!')