    # Tail of a sampling helper: pick one central IV at random together with
    # its cognate variable and the GSS years in which that cognate appears.
    # NOTE(review): the enclosing function's 'def' line is outside this chunk;
    # cIVCogPairs presumably maps {centralIV: (cognate, GSSYears)} -- confirm.
    # (Python 2: dict.items() returns a list, so random.choice works on it.)
    cIV, (cognate, GSSYearsWithCognate) = random.choice(cIVCogPairs.items())
    return cIV, cognate, GSSYearsWithCognate

############################################################

if __name__ == "__main__":
    # define the storage containers for outputs:
    # output[group][outcome] accumulates one value per processed article
    output = defaultdict(dict)
    groups = ['group1', 'group2']
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues', 'numTotal']
    for group in groups:
        for outcome in outcomes:
            output[group][outcome] = []

    # Re-filter: keep only articles with known GSS years used and with
    # identified central IVs (stricter than the module-level filter).
    articleClasses = filterArticles(articleClasses, GSSYearsUsed=True, GSSYearsPossible=False, centralIVs=True)

    # Process a random subsample of 400 articles.
    # NOTE(review): random.sample raises ValueError if fewer than 400
    # articles survive the filter -- confirm intended.
    for article in random.sample(articleClasses, 400):
    # for article in articleClasses:
    # for article in [a for a in articleClasses if a.articleID == 6197]:
        print 'Processing article:', article.articleID

        # define the outcomes I'm interested in for the two groups;
        # td holds per-article tallies before they are folded into 'output'
        td = defaultdict(dict)
        for group in groups:
            td[group]['numTotal'] = 0.0
            # td[group]['coeffsSig'] = []
            td[group]['numSig'] = 0.0  # proportions of significant coeffs
            # td[group]['paramSizes'] = []
            td[group]['paramSizesNormed'] = []
            td[group]['Rs'] = []
# Make the local Code directory importable for project modules.
sys.path.append('../Code/')
from articleClass import *
from filterArticleClasses import filterArticles

pathToData = '../Data/'

# Load pre-pickled GSS metadata.
# NOTE(review): 'cp' is presumably cPickle imported earlier in the file;
# some opens lack 'rb' (fine for Python 2 text-protocol pickles) -- confirm.
ALL_VARIABLE_NAMES = cp.load(open(pathToData + 'ALL_VARIABLE_NAMES.pickle'))
ALL_VARIABLE_NAMES = [str.upper(el) for el in ALL_VARIABLE_NAMES]
MISSING_VALUES_DICT = cp.load(open(pathToData + 'MISSING_VALUES_DICT.pickle', 'rb'))
MEASURE_LEVELS = cp.load(open(pathToData + 'MEASURE_LEVELS.pickle'))
articleIDAndGSSYearsUsed = cp.load(open(pathToData + 'articleIDAndGssYearsUsed-cleaned.pickle'))  # load the years used
VARS_BY_YEAR = cp.load(open(pathToData + 'VARS_BY_YEAR.pickle'))
# structure of the dictionary above: { year (int) : [ set of variable names (strs), [variable_i, metadata_i] ] }
YEAR_INDICES = cp.load(open(pathToData + 'YEAR_INDICES.pickle'))
VAR_INDICES = cp.load(open(pathToData + 'VAR_INDICES_binary.pickle', 'rb'))

# Load the article objects and apply the default (unfiltered-argument) filter.
articleClasses = cp.load(open(pathToData + 'articleClasses.pickle', 'rb'))
articleClasses = filterArticles(articleClasses)

'''
# load GSS data
GSSFilename = 'GSS Dataset/GSS7212_R2.sav'
data = srw.SavReader(pathToData + GSSFilename)
df = pd.DataFrame(data.all(), index=data[:,0], columns=ALL_VARIABLE_NAMES)
with data:  # this makes sure the file will be closed, memory cleaned up after the program is run
    data = np.array(data.all())  # this makes sure the entire dataset is loaded into RAM, which makes accessing much faster
'''

from collections import defaultdict

# Storage containers for per-group outcome lists.
output = defaultdict(dict)
groups = ['group1', 'group2']
outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues', 'numTotal']
# NOTE(review): this loop's body is truncated at the end of this chunk and
# continues in the following part of the file.
for group in groups: