Пример #1
0
def loadFromFile(filename, dataset, verbose=False, run_pca=True, explain_rat=4., ret_var=False):
    """Parse a tab-separated tissue file into a Tissue registered on `dataset`.

    The tissue name is the file's base name up to the first '.'.  The first
    line of the file is a header: its first four tab-separated fields are
    skipped and the remaining fields are read as patient ids.  Every following
    non-blank line has its first four fields skipped as well and the rest
    parsed as floats.  The parsed matrix is transposed before use, so each row
    of the stored values lines up with one patient column of the file.

    Patients not yet present in `dataset.patients` are created and added via
    `dataset.addPatient`.  Each patient gets the new tissue attached, and the
    tissue records the patient together with its row index.

    Args:
        filename: Path of the tissue file to read.
        dataset: Object exposing `patients` (mapping id -> patient) and
            `addPatient(patient)`.
        verbose: If true, print a one-line summary after parsing.
        run_pca: If true, replace the values by their projection onto the
            leading principal components (see the selection rule below).
        explain_rat: Threshold for the selection rule.  A dimension `d`
            qualifies when
            `cumulative_variance_ratio[d] / (d + 1) * num_patients` exceeds
            this value; the last qualifying dimension is kept.
        ret_var: If true, also return the cumulative explained-variance ratio
            of the kept components (0. when `run_pca` is false).

    Returns:
        The Tissue, or a `(tissue, var_exp)` tuple when `ret_var` is true.
    """
    # TODO: arg check

    tissue_name = os.path.basename(filename).split('.')[0]
    tissue = Tissue(tissue_name, dataset)

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(filename, 'r') as tissue_file:
        patient_ids = tissue_file.readline().strip().split('\t')[4:]

        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped; they would otherwise produce empty rows and a ragged matrix.
        raw_t = [[float(val_str)
                  for val_str in line.strip().split('\t')[4:]]
                 for line in tissue_file
                 if line.strip()]

    for patient_id in patient_ids:
        if patient_id not in dataset.patients:
            dataset.addPatient(Patient(patient_id))
        patient = dataset.patients[patient_id]
        patient.addTissue(tissue)

        # The row index must be recorded before the patient is registered on
        # the tissue: it is taken from the tissue's current patient count.
        tissue._rows[patient_id] = tissue.numPatients
        tissue._patients[patient_id] = patient

    val = np.array(raw_t).T

    var_exp = 0.
    if run_pca:
        num_patients = len(patient_ids)

        # PCA cannot extract more components than min(n_samples, n_features),
        # so the historical cap of 50 is lowered for small inputs.
        max_components = min(50, *val.shape)
        pca_model = PCA(n_components=max_components, copy=False)

        # Keep the projection returned by fit_transform.  With copy=False the
        # input array is overwritten during fitting and is NOT the transformed
        # data, so slicing `val` itself would not give principal components.
        projected = pca_model.fit_transform(val)

        cum_var = np.cumsum(pca_model.explained_variance_ratio_)

        # Last dimension whose average explained variance per component,
        # scaled by the number of patients, still exceeds the threshold.
        best_dim = 0
        for dim, cum in enumerate(cum_var):
            if float(cum) / float(dim + 1) * num_patients > explain_rat:
                best_dim = dim

        # Keep at least 8 components, but never more than were fitted.
        n_components = min(max(best_dim + 1, 8), len(cum_var))

        val = projected[:, :n_components]
        var_exp = cum_var[n_components - 1]

        if verbose:
            print(tissue_name + ' has {} components to explain {}% variance for {} patients'.format(n_components, 100.*var_exp, num_patients))

    elif verbose:
        print(tissue_name + ' parsed')

    tissue._value = val

    if ret_var:
        return tissue, var_exp
    return tissue