示例#1
0
def parseGeo(filename):
    """Open a GEO SOFT file, gzip-compressed or plain, and return its records.

    Compression is detected from the two-byte gzip magic number instead of
    trying ``gzip.open`` first. ``gzip.open`` does not read the file when it
    is opened and Biopython's ``Geo.parse`` is a generator, so with the old
    try/except a plain-text file was accepted by the gzip branch and only
    failed later, when the caller started iterating.

    Args:
        filename: path to the SOFT file.

    Returns:
        The record iterator produced by ``Geo.parse``. The file handle is
        intentionally left open because the records are read from it lazily.

    Exits:
        Prints a message and calls ``sys.exit(3)`` if the file cannot be
        opened.
    """
    try:
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        # NOTE: on Python 3 gzip.open defaults to binary mode; the call is
        # kept as in the original, which targets Python 2.
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
示例#2
0
文件: GEO2csv.py 项目: danshea/python
def parseGeo(filename):
    """Open a GEO SOFT file, gzip-compressed or plain, and return its records.

    Compression is detected from the two-byte gzip magic number instead of
    trying ``gzip.open`` first. ``gzip.open`` does not read the file when it
    is opened and Biopython's ``Geo.parse`` is a generator, so with the old
    try/except a plain-text file was accepted by the gzip branch and only
    failed later, when the caller started iterating.

    Args:
        filename: path to the SOFT file.

    Returns:
        The record iterator produced by ``Geo.parse``. The file handle is
        intentionally left open because the records are read from it lazily.

    Exits:
        Prints a message and calls ``sys.exit(3)`` if the file cannot be
        opened.
    """
    try:
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        # NOTE: on Python 3 gzip.open defaults to binary mode; the call is
        # kept as in the original, which targets Python 2.
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
示例#3
0
文件: GEO2csv.py 项目: danshea/python
def parseGeo(filename):
    """Open a GEO SOFT file, gzip-compressed or plain, and return its records.

    Compression is detected from the two-byte gzip magic number instead of
    trying ``gzip.open`` first. ``gzip.open`` does not read the file when it
    is opened and Biopython's ``Geo.parse`` is a generator, so with the old
    try/except a plain-text file was accepted by the gzip branch and only
    failed later, when the caller started iterating.

    Args:
        filename: path to the SOFT file.

    Returns:
        The record iterator produced by ``Geo.parse``. The file handle is
        intentionally left open because the records are read from it lazily.

    Exits:
        Prints a message and calls ``sys.exit(3)`` if the file cannot be
        opened.
    """
    try:
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        # NOTE: on Python 3 gzip.open defaults to binary mode; the call is
        # kept as in the original, which targets Python 2.
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
示例#4
0
def parseGeo(filename):
    """Print every record of a GEO SOFT file, gzip-compressed or plain.

    Compression is detected from the two-byte gzip magic number, so a
    failure while parsing or printing is no longer mistaken for "not a gzip
    file" and retried as plain text. Only failures to open the file are
    turned into an exit; any other error propagates to the caller.

    Args:
        filename: path to the SOFT file.

    Exits:
        Prints a message and calls ``sys.exit(3)`` if the file cannot be
        opened.
    """
    try:
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        # NOTE: on Python 3 gzip.open defaults to binary mode; the call is
        # kept as in the original, which targets Python 2.
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)

    # The handle is closed as soon as the last record has been printed.
    with fh:
        for record in Geo.parse(fh):
            print(record)
示例#5
0
def parseGeo(filename):
    """Print every record of a GEO SOFT file, gzip-compressed or plain.

    Compression is detected from the two-byte gzip magic number, so a
    failure while parsing or printing is no longer mistaken for "not a gzip
    file" and retried as plain text. Only failures to open the file are
    turned into an exit; any other error propagates to the caller.

    Args:
        filename: path to the SOFT file.

    Exits:
        Prints a message and calls ``sys.exit(3)`` if the file cannot be
        opened.
    """
    try:
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        # NOTE: on Python 3 gzip.open defaults to binary mode; the call is
        # kept as in the original, which targets Python 2.
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)

    # The handle is closed as soon as the last record has been printed.
    with fh:
        for record in Geo.parse(fh):
            print(record)
def read_gene_expr_data(geneset_dict):
    """
    Read gene expression data from GENE_EXPR_FILENAME.

    Two records of the file are used, selected by their position: the third
    record lists the sample ids of the normal subjects, and the seventh
    record holds the expression table.

    geneset_dict - genesets to keep; iterated for the geneset keys and passed
                   to get_geneset_from_dict to look up each gene id

    Returns
    X - list of matrices of unnormalized gene expression data, one per
        geneset (rows are samples, columns are the genes of that geneset)
    y - column matrix with CONTROL_LABEL for samples listed as normal and
        DISEASE_LABEL for all others
    geneset - filtered genesets that match the given geneset dictionary, same order as returned X array

    Raises AssertionError if the third record does not describe the "normal"
    subset.
    """
    # gsm ids of the normal subjects (a set: membership is tested per column)
    normal_subjects = set()

    # geneset key -> indices of its genes (rows of X before transposing)
    X_groups = {name: set() for name in geneset_dict}

    X = []
    y = []

    # The context manager closes the file once all records have been read.
    with open(GENE_EXPR_FILENAME) as handle:
        # A dedicated counter name is used on purpose: the original reused
        # `i` inside a list comprehension, which in Python 2 leaks into the
        # enclosing scope and overwrote the record counter.
        for record_no, record in enumerate(Geo.parse(handle), 1):
            if record_no == 3:
                # Read patient labels so we can make the y vector
                attr = record.entity_attributes
                assert attr["subset_description"] == "normal"
                normal_subjects = set(attr["subset_sample_id"].split(","))

            elif record_no == 7:
                # Read actual gene expression data
                col_names = record.table_rows[0]
                gsm_idxs = []
                for idx, col_name in enumerate(col_names):
                    if col_name.startswith("GSM"):
                        gsm_idxs.append(idx)
                        # one label per sample column, in column order
                        y.append(CONTROL_LABEL if col_name in normal_subjects
                                 else DISEASE_LABEL)

                geneid_idx = col_names.index("Gene ID")

                # feature_idx counts only the genes that are kept, so it
                # always equals the row of X the gene is stored in.
                feature_idx = 0
                for row in record.table_rows[1:]:
                    geneset = get_geneset_from_dict(geneset_dict,
                                                    row[geneid_idx])
                    if geneset is None:
                        continue
                    X_groups[geneset].add(feature_idx)
                    X.append([float(row[col]) for col in gsm_idxs])
                    feature_idx += 1

    # Make feature groups: after transposing, rows are samples and columns
    # are the kept genes.
    X = np.matrix(X).T
    X_genesets = []
    genesets_included = []
    for geneset_key, geneset_col_idxs in X_groups.items():
        if not geneset_col_idxs:
            continue
        X_genesets.append(X[:, list(geneset_col_idxs)])
        genesets_included.append(geneset_key)

    y = np.matrix(y).T
    return X_genesets, y, genesets_included
示例#7
0
from Bio import Geo
# Print every record of the GDS3292 SOFT file.
# The context manager closes the file even if parsing or printing fails.
with open('./GDS3292/GDS3292_full.soft') as handle:
    for record in Geo.parse(handle):
        print(record)
def read_gene_expr_data(geneset_dict):
    """
    Read gene expression data from GENE_EXPR_FILENAME.

    Two records of the file are used, selected by their position: the third
    record lists the sample ids of the normal subjects, and the seventh
    record holds the expression table.

    geneset_dict - genesets to keep; iterated for the geneset keys and passed
                   to get_geneset_from_dict to look up each gene id

    Returns
    X - list of matrices of unnormalized gene expression data, one per
        geneset (rows are samples, columns are the genes of that geneset)
    y - column matrix with CONTROL_LABEL for samples listed as normal and
        DISEASE_LABEL for all others
    geneset - filtered genesets that match the given geneset dictionary, same order as returned X array

    Raises AssertionError if the third record does not describe the "normal"
    subset.
    """
    # gsm ids of the normal subjects (a set: membership is tested per column)
    normal_subjects = set()

    # geneset key -> indices of its genes (rows of X before transposing)
    X_groups = {name: set() for name in geneset_dict}

    X = []
    y = []

    # The context manager closes the file once all records have been read.
    with open(GENE_EXPR_FILENAME) as handle:
        # A dedicated counter name is used on purpose: the original reused
        # `i` inside a list comprehension, which in Python 2 leaks into the
        # enclosing scope and overwrote the record counter.
        for record_no, record in enumerate(Geo.parse(handle), 1):
            if record_no == 3:
                # Read patient labels so we can make the y vector
                attr = record.entity_attributes
                assert attr["subset_description"] == "normal"
                normal_subjects = set(attr["subset_sample_id"].split(","))

            elif record_no == 7:
                # Read actual gene expression data
                col_names = record.table_rows[0]
                gsm_idxs = []
                for idx, col_name in enumerate(col_names):
                    if col_name.startswith("GSM"):
                        gsm_idxs.append(idx)
                        # one label per sample column, in column order
                        y.append(CONTROL_LABEL if col_name in normal_subjects
                                 else DISEASE_LABEL)

                geneid_idx = col_names.index("Gene ID")

                # feature_idx counts only the genes that are kept, so it
                # always equals the row of X the gene is stored in.
                feature_idx = 0
                for row in record.table_rows[1:]:
                    geneset = get_geneset_from_dict(geneset_dict,
                                                    row[geneid_idx])
                    if geneset is None:
                        continue
                    X_groups[geneset].add(feature_idx)
                    X.append([float(row[col]) for col in gsm_idxs])
                    feature_idx += 1

    # Make feature groups: after transposing, rows are samples and columns
    # are the kept genes.
    X = np.matrix(X).T
    X_genesets = []
    genesets_included = []
    for geneset_key, geneset_col_idxs in X_groups.items():
        if not geneset_col_idxs:
            continue
        X_genesets.append(X[:, list(geneset_col_idxs)])
        genesets_included.append(geneset_key)

    y = np.matrix(y).T
    return X_genesets, y, genesets_included