Exemplo n.º 1
0
 def get_change(self, retest_dataset):
     """Compute demographic change between baseline and a retest dataset.

     Args:
         retest_dataset: dataset name handed to get_demographics to load
             the retest measurements

     Returns:
         tuple of (factor_change, raw_change):
             factor_change: DataFrame of factor-score differences with
                 columns suffixed with ' Change'
             raw_change: DataFrame of raw variable differences
                 (retest - baseline)
     """
     baseline = self.data
     # load the retest demographics and residualize them the same way
     # the baseline data was
     followup = residualize_baseline(get_demographics(retest_dataset),
                                     self.residualize_vars)
     # once BMI exists its raw components are redundant
     if 'BMI' in followup.columns:
         followup.drop(['WeightPounds', 'HeightInches'], axis=1, inplace=True)
     # restrict both timepoints to the shared subjects and variables
     shared_rows = sorted(set(baseline.index) & set(followup.index))
     shared_cols = sorted(set(baseline.columns) & set(followup.columns))
     baseline = baseline.loc[shared_rows, shared_cols]
     followup = followup.loc[shared_rows, shared_cols]
     raw_change = followup - baseline
     # project both timepoints onto the baseline factor solution and
     # take the difference in factor scores
     weights = get_attr(self.results['factor_tree_Rout_oblimin'][self.get_c()],
                        'weights')
     score_delta = scale(followup).dot(weights) - scale(baseline).dot(weights)
     factor_change = pd.DataFrame(score_delta,
                                  index=shared_rows,
                                  columns=self.get_scores().columns)
     factor_change = self.reorder_factors(factor_change)
     factor_change.columns = [col + ' Change' for col in factor_change.columns]
     return factor_change, raw_change
 def get_change(self, retest_dataset):
     """Return (factor_change, raw_change) between baseline and retest demographics.

     Args:
         retest_dataset: dataset name handed to get_demographics to load
             the retest measurements

     Returns:
         factor_change: DataFrame of factor-score differences, columns
             suffixed with ' Change'
         raw_change: DataFrame of raw variable differences (retest - baseline)
     """
     demographics = self.data
     
     retest = get_demographics(retest_dataset)
     retest = residualize_baseline(retest, self.residualize_vars)
     # WeightPounds/HeightInches are redundant once BMI has been computed
     if 'BMI' in retest.columns:
         retest.drop(['WeightPounds', 'HeightInches'], axis=1, inplace=True)
     # get common variables
     common_index = sorted(list(set(demographics.index) & set(retest.index)))
     common_columns = sorted(list(set(demographics.columns) & set(retest.columns)))
     demographics = demographics.loc[common_index, common_columns] 
     retest = retest.loc[common_index, common_columns]
     raw_change = retest-demographics
     # convert to scores
     c = self.get_c()
     # factor weights from the oblimin solution at the chosen dimensionality;
     # the same baseline weights are applied to both timepoints
     demographic_factor_weights = get_attr(self.results['factor_tree_Rout_oblimin'][c],'weights')
     demographic_scores = scale(demographics).dot(demographic_factor_weights)
     retest_scores = scale(retest).dot(demographic_factor_weights)
     
     
     factor_change = pd.DataFrame(retest_scores-demographic_scores,
                           index=common_index,
                           columns = self.get_scores().columns)
     factor_change = self.reorder_factors(factor_change)
     factor_change.columns = [i + ' Change' for i in factor_change.columns]
     return factor_change, raw_change
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info

# correlation of ravens and literature
# replication of "Intelligence and socioeconomic success: A meta-analytic
# review of longitudinal research"

base_dir = get_info('base_directory')
ext = 'png'

# behavioral data restricted to subjects that have demographics
data = get_behav_data()
demographics = get_demographics()
data = data.loc[demographics.index]
# combine the raven's measures with the demographic targets
df = pd.concat([data.filter(regex='raven'), demographics], axis=1)

# raven's test-retest reliability: mean ICC over bootstrap iterations
reliability = get_behav_data(dataset='Retest_02-03-2018',
                             file='bootstrap_merged.csv.gz')
raven_reliability = reliability.groupby('dv').icc.mean().filter(regex='raven')[0]
# demographic reliabilities are taken as 1.0 for every column
demo_reliabilities = [1.0 for _ in range(demographics.shape[1])]

# raven's correlation with each demographic, sorted ascending;
# the last row is the raven-with-itself correlation, so drop it
correlations = (df.corr()
                  .filter(regex='raven')
                  .sort_values(by='ravens.score')
                  .iloc[:-1])
Exemplo n.º 4
0
 def __init__(self, 
              datafile=None, 
              loading_thresh=None,
              dist_metric=distcorr,
              boot_iter=1000,
              name='',
              filter_regex='.',
              ID=None,
              results_dir=None,
              residualize_vars=None,
              saved_obj_file=None
              ):
     """
     Args:
         datafile: name of a directory in "Data"
         loading_thresh: threshold to use for factor analytic result
         dist_metric: distance metric for hierarchical clustering that is 
         passed to pdist
         boot_iter: number of bootstrap iterations passed to the analyses
         name: string to append to ID, default to empty string
         filter_regex: regex string passed to data.filter
         ID: specify if a specific ID is desired
         results_dir: where to save results
         residualize_vars: demographic variables to residualize out;
             defaults to ['Age', 'Sex']
         saved_obj_file: previously saved object to initialize from
     """
     assert datafile is not None or saved_obj_file is not None
     # avoid the shared-mutable-default-argument pitfall; behavior is
     # unchanged for callers using the default
     if residualize_vars is None:
         residualize_vars = ['Age', 'Sex']
     # initialize with the saved object if available
     if saved_obj_file:
         self._load_init(saved_obj_file)
     else:
         # set vars
         self.dataset = datafile
         # BUG FIX: was hard-coded to None, silently discarding the
         # loading_thresh argument documented above
         self.loading_thresh = loading_thresh
         self.dist_metric = dist_metric
         self.boot_iter = boot_iter
         self.residualize_vars = residualize_vars
         # random 16-bit ID unless one was supplied explicitly
         if ID is None:
             self.ID = '%s_%s' % (name, str(random.getrandbits(16)))
         else:
             self.ID = '%s_%s' % (name, str(ID))
         # set up output files
         self.results_dir = results_dir
         # load data
         self.data = get_behav_data(dataset=datafile, 
                                   file='meaningful_variables_imputed.csv',
                                   filter_regex=filter_regex,
                                   verbose=True)
         self.data_no_impute = get_behav_data(dataset=datafile,
                                              file='meaningful_variables_clean.csv',
                                              filter_regex=filter_regex,
                                              verbose=True)
         self.demographics = get_demographics()
         
     # initialize analysis classes
     self.DA = Demographic_Analysis(self.demographics, 
                                    residualize_vars=self.residualize_vars,
                                    boot_iter=self.boot_iter)
     self.EFA = EFA_Analysis(self.data, 
                             self.data_no_impute, 
                             boot_iter=self.boot_iter)
     self.HCA = HCA_Analysis(dist_metric=self.dist_metric)
     
     # load the results from the saved object
     if saved_obj_file:
         self._load_results(saved_obj_file)
Exemplo n.º 5
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 13 18:41:59 2019

@author: ian
"""
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from selfregulation.utils.utils import get_demographics, get_recent_dataset

# load full demographics, keeping categorical columns such as Race
demo=get_demographics(get_recent_dataset(), drop_categorical=False)
race_info = np.unique(demo.Race, return_counts=True)
# map race label (leading whitespace stripped) -> count
race_info = {k.lstrip():v for k,v in zip(race_info[0], race_info[1])}
# convert counts to percentage of the sample, rounded to 2 decimals
race_percentiles = {k:np.round(v/demo.shape[0]*100,2) for k,v in race_info.items()}
age_stats = demo.Age.describe()

print('** Race Statistics **')
for x,y in race_percentiles.items():
    print (x, ':',  y)
# NOTE(review): treats HispanicLatino and Sex as 0/1 indicators so the mean
# is a proportion -- confirm against the dataset's coding
print('Hispanic %', demo.HispanicLatino.mean().round(3))
print('** Age and Sex **')
print(age_stats)
print('Female %', demo.Sex.mean().round(3))


# plots
sns.set_context('paper')
size=5
def style_ax(ax):
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from selfregulation.utils.plot_utils import beautify_legend, format_num, format_variable_names
from selfregulation.utils.utils import filter_behav_data, get_behav_data, get_demographics, get_info

# correlation of ravens and literature
# replication of "Intelligence and socioeconomic success: A meta-analytic
# review of longitudinal research"

base_dir = get_info('base_directory')
ext= 'png'
# behavioral data restricted to subjects that have demographics
data = get_behav_data()     
demographics = get_demographics()                    
data = data.loc[demographics.index]     
# get dataframe of intelligence measure (raven's progressive matrices) and demographics)                              
df = pd.concat([data.filter(regex='raven'), demographics], axis=1)

# get raven's reliability
reliability = get_behav_data(dataset='Retest_02-03-2018', file='bootstrap_merged.csv.gz')
# mean ICC across bootstrap iterations for the raven's measure
raven_reliability = reliability.groupby('dv').icc.mean().filter(regex='raven')[0]
# demographic reliabilities 
demo_reliabilities = [1.0]*demographics.shape[1]

# correlations (last row is raven-with-itself, so drop it)
correlations = df.corr().filter(regex='raven').sort_values(by='ravens.score').iloc[:-1]
correlations.insert(0, 'target_reliability', demo_reliabilities)
# disattenuate for measurement unreliability: r / sqrt(r_xx * r_yy)
# (Spearman correction for attenuation)
adjusted = correlations['ravens.score']/(raven_reliability*correlations['target_reliability'])**.5
correlations.insert(0, 'adjusted_correlation', adjusted)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 13 18:41:59 2019

@author: ian
"""
import numpy as np
from selfregulation.utils.utils import get_demographics, get_recent_dataset

# load demographics with categorical columns (e.g. Race) intact
demo = get_demographics(get_recent_dataset(), drop_categorical=False)
labels, counts = np.unique(demo.Race, return_counts=True)
# race label (leading whitespace stripped) -> count
race_info = dict(zip((label.lstrip() for label in labels), counts))
# express each count as a percentage of the sample, rounded to 2 decimals
race_percentiles = {label: np.round(count / demo.shape[0] * 100, 2)
                    for label, count in race_info.items()}
age_stats = demo.Age.describe()

print('** Race Statistics **')
for label, pct in race_percentiles.items():
    print(label, ':', pct)
print('Hispanic %', demo.HispanicLatino.mean().round(3))
print('** Age and Sex **')
print(age_stats)
print('Female %', demo.Sex.mean().round(3))
 def __init__(self, 
              datafile=None, 
              loading_thresh=None,
              dist_metric=distcorr,
              boot_iter=1000,
              name='',
              filter_regex='.',
              ID=None,
              results_dir=None,
              residualize_vars=['Age', 'Sex'],
              saved_obj_file=None
              ):
     """
     Args:
         datafile: name of a directory in "Data"
         loading_thresh: threshold to use for factor analytic result
         dist_metric: distance metric for hierarchical clustering that is 
         passed to pdist
         boot_iter: number of bootstrap iterations passed to the analyses
         name: string to append to ID, default to empty string
         filter_regex: regex string passed to data.filter
         ID: specify if a specific ID is desired
         results_dir: where to save results
         residualize_vars: demographic variables to residualize out
         saved_obj_file: previously saved object to initialize from
     """
     # at least one data source is required
     assert datafile is not None or saved_obj_file is not None
     # initialize with the saved object if available
     if saved_obj_file:
         self._load_init(saved_obj_file)
     else:
         # set vars
         self.dataset = datafile
         # NOTE(review): the loading_thresh argument is ignored here and
         # None is stored unconditionally -- confirm whether this is
         # intentional or should be `loading_thresh`
         self.loading_thresh = None
         self.dist_metric = dist_metric
         self.boot_iter = boot_iter
         self.residualize_vars = residualize_vars
         # random 16-bit ID unless one was supplied explicitly
         if ID is None:
             self.ID =  '%s_%s' % (name, str(random.getrandbits(16)))
         else:
             self.ID = '%s_%s' % (name, str(ID))
         # set up output files
         self.results_dir = results_dir
         # load data
         self.data = get_behav_data(dataset=datafile, 
                                   file='meaningful_variables_imputed.csv',
                                   filter_regex=filter_regex,
                                   verbose=True)
         self.data_no_impute = get_behav_data(dataset=datafile,
                                              file='meaningful_variables_clean.csv',
                                              filter_regex=filter_regex,
                                              verbose=True)
         self.demographics = get_demographics()
         
     
     # initialize analysis classes
     self.DA = Demographic_Analysis(self.demographics, 
                                    residualize_vars=self.residualize_vars,
                                    boot_iter=self.boot_iter)
     self.EFA = EFA_Analysis(self.data, 
                             self.data_no_impute, 
                             boot_iter=self.boot_iter)
     self.HCA = HCA_Analysis(dist_metric=self.dist_metric)
     
     # load the results from the saved object
     if saved_obj_file:
         self._load_results(saved_obj_file)