Exemplo n.º 1
0
 def __init__(self,
              group_key='Group',
              outcome_key='Outcome',
              groups = {'1':[], '2':[]},
              population_of_interest=None,
              *args,
              **kwargs
              ):
     """Prepare a two-group comparison from a CSV file or explicit scores.

     If ``self.input_file`` is truthy after ``ExactTest.__init__`` has run,
     the file is parsed with ``csv.DictReader``: rows whose ``group_key``
     column equals the population of interest form one group and every
     other row is pooled into a second group labelled ``'Not <label>'``.
     Otherwise the ``groups`` argument is used as given.

     Parameters:
         group_key: name of the CSV column holding each row's group label.
         outcome_key: name of the CSV column holding each row's outcome.
         groups: dictionary mapping exactly two group labels to non-empty
             sequences of outcome values. Ignored when an input file is
             used. The dictionary is never modified (the shared default
             included); list copies of its values are stored instead.
         population_of_interest: label of the group the test parameter
             refers to. When it is falsy, or (file mode) not present in
             the file, a group is picked automatically.
         *args, **kwargs: forwarded unchanged to ``ExactTest.__init__``.

     Raises:
         ValueError: a column key is empty, the file has no rows, there
             are not exactly two groups, a group has no scores, or
             ``population_of_interest`` is not one of the group labels.
         TypeError: ``groups`` is not a dictionary or a group's scores are
             not accepted by ``is_array_like``.
         KeyError: a CSV row lacks ``group_key`` or ``outcome_key``.
     """
     ExactTest.__init__(self, *args, **kwargs)
     
     # validate input parameters
     if self.input_file:
         if not group_key or not outcome_key:
             raise ValueError('Keys must be defined for group and outcome columns.')
         # read every row once; the handle is closed as soon as parsing ends
         with open(self.input_file) as handle:
             rows = list(csv.DictReader(handle))
         # NOTE: set ordering is arbitrary, so the fallback group chosen
         # below is not necessarily the first label appearing in the file
         labels = list(set(row[group_key] for row in rows))
         if not labels:
             raise ValueError('No rows found in file %s.'%self.input_file)
         try:
             group_of_interest = labels.index(population_of_interest)
         except ValueError:
             # requested label is absent (or none was given): use labels[0]
             group_of_interest = 0
             population_of_interest = labels[0]
         label_of_interest = labels[group_of_interest]
         g1 = list()
         g2 = list()
         for row in rows:
             if row[group_key] == label_of_interest:
                 g1.append(row[outcome_key])
             else:
                 g2.append(row[outcome_key])
             if not row[group_key]:
                 sys.stderr.write('Warning: row contains no group assignment in file %s: %s.\n'%(self.input_file, row))
         groups = { label_of_interest:g1, 'Not %s'%label_of_interest:g2 }
     if not isinstance(groups, dict):
         raise TypeError('Groups must be supplied as a dictionary of pairs Group Label : Outcome Values.')
     if len(groups) != 2:
         raise ValueError('There must be exactly 2 groups; found %s'%len(groups))
     # snapshot the key order once; dict views cannot be indexed on Python 3
     keys = list(groups)
     for k in keys:
         if not groups[k]:
             raise ValueError('No scores found for Group: %s.'%k)
         if not self.__class__.is_array_like(groups[k]):
             raise TypeError('Scores for Group: %s must be a list or tuple.'%k)
     # copy into a fresh dictionary so the caller's argument (and the shared
     # default) is left untouched
     groups = dict((k, list(groups[k])) for k in keys)
     if not population_of_interest: population_of_interest = keys[0]
     if population_of_interest not in keys:
         raise ValueError('Population of interest %s is not one of the groups: %s.'%(population_of_interest, keys))
     self.population_of_interest = population_of_interest
     
     # store group scores as a data attribute
     self.groups = groups
     # set the test parameter to median of the population of interest
     self.test_parameter = 'M %s'%population_of_interest
     # store the outcome key as a data attribute
     self.outcome_key = outcome_key
     # store the key of the smaller group (ties go to the second key)
     first_key, second_key = keys
     if len(groups[first_key]) < len(groups[second_key]):
         self.smaller_group_key = first_key
     else:
         self.smaller_group_key = second_key
     # with exactly two keys, "index - 1" always selects the other group
     # TODO: need to adjust when taking actual hypothesized value into account
     self.hypothesized_value = 'M %s'%keys[keys.index(population_of_interest)-1]
     # order all of the scores based on rank; fall back to the raw values
     # when any of them cannot be converted to float
     raw_scores = [x for k in keys for x in groups[k]]
     try:
         self.combined_scores = [float(x) for x in raw_scores]
     except (TypeError, ValueError, OverflowError):
         self.combined_scores = raw_scores
     self.combined_scores.sort()
     # store sample size
     self.n = len(self.combined_scores)
Exemplo n.º 2
0
 def __init__(self,
              pre_key='Pre', post_key='Post',    # keys for the two columns to compare in the csv file
              pre=None, post=None,               # optionally specify the pre and post scores manually
              hypothesized_value=0,              # the value under test
              *args,
              **kwargs):
     """Prepare a paired (pre/post) comparison from a CSV file or explicit scores.

     If ``self.input_file`` is truthy after ``ExactTest.__init__`` has run,
     the pre and post scores are read from the ``pre_key`` and ``post_key``
     columns of that file, replacing any ``pre``/``post`` arguments. Rows
     where both cells are empty are skipped silently; rows where only one
     cell is filled are skipped with a message on stderr.

     Parameters:
         pre_key: name of the CSV column holding pre-test scores.
         post_key: name of the CSV column holding post-test scores.
         pre: sequence of pre-test scores, used when there is no input file.
         post: sequence of post-test scores, same length as ``pre``.
         hypothesized_value: accepted for interface compatibility; this
             initialiser does not read it.
         *args, **kwargs: forwarded unchanged to ``ExactTest.__init__``.

     Raises:
         ValueError: a column key is empty or missing from the file, no
             scores are available, a cell is not numeric, or ``pre`` and
             ``post`` differ in length.
         TypeError: ``pre`` or ``post`` is not accepted by ``is_array_like``.
     """
     ExactTest.__init__(self, *args, **kwargs)
     
     # validate input parameters
     if self.input_file:
         if not pre_key:
             raise ValueError('You must specify the name of the column containing pre-test scores.')
         if not post_key:
             raise ValueError('You must specify the name of the column containing post-test scores.')
         pre = list()
         post = list()
         try:
             # the with-block guarantees the handle is closed, even on error
             with open(self.input_file) as handle:
                 for row in csv.DictReader(handle):
                     if pre_key not in row: raise ValueError('The pre-test key %s was not found.'%pre_key)
                     if post_key not in row: raise ValueError('The post-test key %s was not found.'%post_key)
                     if row[pre_key] and row[post_key]:
                         pre.append(float(row[pre_key]))
                         post.append(float(row[post_key]))
                     elif row[pre_key] or row[post_key]:
                         # exactly one of the two cells is filled
                         sys.stderr.write('Row does not contain matched pairs (%s). Data is being skipped.\n'%row)
         except KeyError as e:
             # defensive only: the membership checks above already cover
             # both column lookups
             raise KeyError('Unable to locate key %s in file %s.'%(e, self.input_file))
     if not pre or not post:
         raise ValueError('You must supply scores for both pre and post.')
     if not self.__class__.is_array_like(pre):
         raise TypeError('pre must be a list or tuple.')
     if not self.__class__.is_array_like(post):
         raise TypeError('post must be a list or tuple.')
     if len(pre) != len(post):
         raise ValueError('pre and post must be sequences of equal length.')
     
     # set the test parameter to median difference
     self.test_parameter = 'Md'
     # store difference scores as data attribute
     self.difference_scores = [after - before for before, after in zip(pre, post)]
     # sort difference scores by absolute value (stable, so ties keep their
     # original relative order)
     self.difference_scores.sort(key=abs)
     # store sample size
     self.n = len(self.difference_scores)
     # generate replacement values and weights; replacement_values is the
     # same list object as difference_scores, not a copy
     self.replacement_values = self.difference_scores
     # weight 1.0 for positive differences, 0.0 for negative, 0.5 for zero
     self.weights = [1.0 if x > 0.0 else (0.0 if x < 0.0 else 0.5) for x in self.difference_scores]