예제 #1
0
def getLocalGeo(fsave,
                fill_or_not=False,
                xmlPath="geo_gse",
                DataType=False,
                refresh=False):
    """Build a sample table from GEO series XML files already stored locally.

    The function first asks ``getGEOSamples_byType_gse.getGeoSamples_byTypes``
    for the series found under ``xmlPath`` and writes the returned keys, one
    per line, to ``fsave + '_gse.txt'``.  It then queries each series again
    with ``datatype=['sc-atac-seq']`` and, for every entry of a non-empty
    result, appends one annotated row to ``fsave``.

    Args:
        fsave: Path of the tab-separated output table.  The file is truncated
            and a header row is written before any sample row.
        fill_or_not: Unused; kept so existing callers keep working.
        xmlPath: Directory handed to the parsers as ``ddir``.
        DataType: When truthy the initial scan is restricted to
            ``['sc-atac-seq']``; otherwise ``datatype=False`` is passed.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.
    """
    columns = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
               'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
               'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(columns) + '\n')

    getSyncLog(
        "# 1. go through all the xml and check the type in the path of %s" %
        xmlPath)
    local_repo_path = "repository_samples.pickle"
    datatype = ['sc-atac-seq'] if DataType else False
    local_repo_samples_dict = getGEOSamples_byType_gse.getGeoSamples_byTypes(
        path=local_repo_path, datatype=datatype, ddir=xmlPath, refresh=refresh)

    # Sorted so that both the ID list and the processing order are
    # reproducible between runs (set iteration order is arbitrary).
    local_repo_samples = sorted(set(local_repo_samples_dict.keys()))
    with open(fsave + '_gse.txt', 'w') as out:
        out.write('\n'.join(local_repo_samples))

    getSyncLog("# 2. calculate new samples")

    for gseid in local_repo_samples:
        print(gseid)
        # NOTE(review): this lookup always uses ['sc-atac-seq'], regardless of
        # the DataType argument used for the initial scan -- confirm intended.
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path=local_repo_path,
            datatype=['sc-atac-seq'],
            ddir=xmlPath,
            gseids=[gseid],
            refresh=refresh)
        if not getType:
            continue
        for key in getType.keys():
            # parse sample annotation
            list_sample = scrna_parser_detail_gse.update_one_sample(
                gseid=gseid, ddir=xmlPath)
            try:
                list_sample.append(str(getType[key]))  # add matched key words
                # Append row by row so rows already written survive a crash.
                with open(fsave, 'a') as table:
                    table.write('\t'.join(list_sample) + '\n')
            except Exception as err:
                # Best effort per record: report the failing series and go on.
                # (The original formatted an undefined name here, which turned
                # every failure into a NameError.)
                getSyncLog("Error when writing in table: %s (%s)" %
                           (gseid, err))
예제 #2
0
def sync_samples_from_gse_factor(infile,
                                 gse_col,
                                 fsave,
                                 xmlPath='geo_gse',
                                 exludeFile=False,
                                 fill_or_not=False,
                                 refresh=False):
    """Annotate the GEO series listed in an external tab-separated table.

    Every distinct, non-blank row of ``infile`` is processed once.  The GSE
    accession is read from column ``gse_col``; rows whose accession is listed
    in ``exludeFile`` are skipped.  For each remaining row the series is
    looked up with ``getGEOSamples_byType_gse.getGeoSamples_byTypes``; a
    non-empty result produces one annotated row per entry in ``fsave``, an
    empty result copies the input row to ``fsave + '_others.txt'``.

    Args:
        infile: Path of the tab-separated input table.
        gse_col: Zero-based index (int or numeric string) of the column that
            holds the GSE accession.
        fsave: Path of the output table.  The file is truncated and a header
            row is written before any sample row.
        xmlPath: Directory handed to the parsers as ``ddir``.
        exludeFile: Optional path of a file with one accession per line to
            skip; falsy disables exclusion.
        fill_or_not: Unused; kept so existing callers keep working.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.
    """
    getSyncLog(
        "try to add samples based on outside table which contain gsm ID and factor name"
    )
    columns = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
               'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
               'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(columns) + '\n')

    exclude = set()
    if exludeFile:
        with open(exludeFile) as handle:
            exclude = {line.rstrip() for line in handle}

    gse_index = int(gse_col)

    # Rows are lists, which cannot be put in a set; de-duplicate on a tuple
    # key instead and keep the input order.  Exclusion is applied to the GSE
    # accession of the row, not to the row as a whole.
    need_added_samples = []
    seen = set()
    with open(infile) as handle:
        for line in handle:
            stripped = line.rstrip()
            if not stripped:
                continue  # blank line, nothing to look up
            fields = stripped.split('\t')
            try:
                accession = fields[gse_index]
            except IndexError:
                getSyncLog("skip row without column %d: %s" %
                           (gse_index, stripped))
                continue
            row_key = tuple(fields)
            if accession in exclude or row_key in seen:
                continue
            seen.add(row_key)
            need_added_samples.append(fields)

    getSyncLog('totally %d samples need to be added' % len(need_added_samples))
    for n, iterm in enumerate(need_added_samples, 1):
        print(n)  # progress: index of the row being processed
        gseid = iterm[gse_index]
        print(gseid)
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path="repository_samples.pickle",
            datatype=['sc-rna-seq', 'sc-atac-seq'],
            gseids=[gseid],
            refresh=refresh,
            ddir=xmlPath)
        if getType:
            for key in getType.keys():
                # parse sample annotation
                list_sample = scrna_parser_detail_gse.update_one_sample(
                    gseid=gseid, ddir=xmlPath)
                try:
                    list_sample.append(str(
                        getType[key]))  # add matched key words
                    # Append row by row so written rows survive a crash.
                    with open(fsave, 'a') as table:
                        table.write('\t'.join(list_sample) + '\n')
                except Exception as err:
                    # Best effort per record: report the failing series and
                    # continue with the next one.
                    getSyncLog("Error when writing in table: %s (%s)" %
                               (gseid, err))
        else:
            with open(fsave + '_others.txt', 'a') as out:
                out.write('\t'.join(iterm) + '\n')
예제 #3
0
def _sync_gse(fsave, fill_or_not=False, DataType=False, dateRegion=False, refresh=False, exludeFile=False, xmlPath='./geo_gse'):
    """Walk the GDS IDs returned by ``getGDSSamples`` and tabulate their series.

    Each GDS ID is converted to a GSE accession with ``gse_idToAcc``.
    Accessions listed in ``exludeFile`` are skipped.  For the others
    ``getGeoXML`` is called and the series is looked up with
    ``getGEOSamples_byType_gse.getGeoSamples_byTypes``; a non-empty result
    produces one annotated row per entry in ``fsave``, an empty result
    appends the accession to ``fsave + '_others.txt'``.

    Args:
        fsave: Path of the tab-separated output table.  The file is truncated
            and a header row is written before any sample row.
        fill_or_not: Unused; kept so existing callers keep working.
        DataType: Unused; kept so existing callers keep working.
        dateRegion: Forwarded unchanged to ``getGDSSamples``.
        refresh: Forwarded unchanged to ``getGeoSamples_byTypes``.
        exludeFile: Optional path of a file with one accession per line to
            skip; falsy disables exclusion.
        xmlPath: Directory handed to the parsers as ``ddir``.
    """
    # get all GDS ids of GSE from API
    gdsSamples = getGDSSamples(dateRegion)
    total = len(gdsSamples)
    getSyncLog('start: There are %s GDS Samples in sum' % total)

    # Accessions that may already exist; a set keeps the per-item test O(1).
    exclude = set()
    if exludeFile:
        with open(exludeFile) as handle:
            exclude = {line.rstrip() for line in handle}

    # open a file to save information
    columns = ('GSE', 'Species', 'PMID', 'Paper', 'Title', 'CellType',
               'Tissue', 'Disease', 'Cell_Pop', 'Release_Date',
               'Last_Update_Date', 'GSMs', 'fields_dataType', 'Key_Match')
    with open(fsave, 'w') as table:
        table.write('\t'.join(columns) + '\n')

    # convert to gds id to GSE and download XML file
    last_percent = 0
    for cnt, gds in enumerate(gdsSamples, 1):
        # Report (and pause) each time the whole-number percentage advances.
        # Integer arithmetic avoids the float modulo of the original, which
        # fired erratically, and caps the number of pauses at 100 per run.
        percent = cnt * 100 // total
        if percent != last_percent:
            last_percent = percent
            getSyncLog("%s%%" % percent)
            time.sleep(3)
        gseid = gse_idToAcc(gds)
        if gseid in exclude:  # if existed, don't parse again
            continue
        # The return value is not needed here; the call is kept because the
        # lookup below presumably relies on it having fetched the XML into
        # xmlPath -- TODO confirm against getGeoXML.
        getGeoXML(gseid)
        print(gseid)
        getType = getGEOSamples_byType_gse.getGeoSamples_byTypes(
            path="repository_samples.pickle",
            datatype=['sc-rna-seq', 'sc-atac-seq'],
            gseids=[gseid],
            refresh=refresh,
            ddir=xmlPath)
        if getType:
            for key in getType.keys():
                # parse sample annotation
                list_sample = scrna_parser_detail_gse.update_one_sample(gseid=gseid, ddir=xmlPath)
                try:
                    list_sample.append(str(getType[key]))  # add matched key words
                    # Append row by row so written rows survive a crash.
                    with open(fsave, 'a') as table:
                        table.write('\t'.join(list_sample) + '\n')
                except Exception as err:
                    # Best effort per record: report the failing series and
                    # continue with the next one.
                    getSyncLog("Error when writing in table: %s (%s)" % (gseid, err))
        else:
            # gseid is a single string: write it as is.  Joining it with tabs
            # would put a tab between every character of the accession.
            with open(fsave + '_others.txt', 'a') as out:
                out.write(gseid + '\n')

    getSyncLog('done!')