def parse_bestworst_data(file, bestCol="best", worstCol="worst", sep=None):
    """Parse best-worst trial data from a delimited file.

    Each trial is returned as a tuple:
        (best, worst, (unchosen1, unchosen2, ..., unchosen[K-2]))

    Args:
        file: path to the input file. If ``sep`` is not given, a ".tsv"
            extension implies tab-separated; anything else is comma-separated.
        bestCol: name of the column holding the 'best' choice.
        worstCol: name of the column holding the 'worst' choice.
        sep: explicit column separator; overrides the extension heuristic.

    Returns:
        A list of parsed (best, worst, tuple_of_unchosen) trials.
    """
    # figure out our separator first (infer from extension if not supplied)
    if sep is None:
        sep = "\t" if file.endswith(".tsv") else ","

    # read in the trials
    ss = Spreadsheet.read_csv(file, delimiter=sep)
    trials = []
    for row in ss:
        # read best and worst choices
        best = row[bestCol]
        worst = row[worstCol]

        # read all options that were choosable: option1, option2, ... for as
        # many consecutive "option%d" columns as the spreadsheet defines
        opts = []
        i = 1
        while ("option%d" % i) in ss.header:
            opts.append(row["option%d" % i])
            i += 1

        # strip best and worst choices from the list of other options
        if best in opts:
            opts.remove(best)
        if worst in opts:
            opts.remove(worst)

        trials.append((best, worst, tuple(opts)))

    # return the parsed trials
    return trials
def main(argv=None):
    """Generate best-worst trials from a list of items and write them to
    all_trials_reduced.csv in the current directory.

    Args:
        argv: list of command-line argument strings. Defaults to None, in
            which case argparse falls back to sys.argv[1:] at call time.

    Raises:
        Exception: if a structured input file is given without --column, or
            if --generator names an unknown generation method.
    """
    parser = argparse.ArgumentParser(
        description=
        'Command line for generating best-worst trials from a list of items.')
    parser.add_argument(
        "input",
        type=str,
        help=
        "Path to a file containing input data (i.e., a list of words or other text stimuli)."
    )
    parser.add_argument(
        "N",
        nargs="?",
        type=int,
        default=None,
        help=
        "Number of best-worst trials to generate. Suggested amount: number of items, times 8. So if you have 1000 items, 8000 trials. You may get *slightly* better results up to an N of 16, but gains are marginal. You may need fewer trials if your K is larger. Empirical testing not yet done."
    )
    parser.add_argument("K",
                        nargs="?",
                        type=int,
                        default=4,
                        help="Number of items per best-worst trial.")
    parser.add_argument(
        "--generator",
        type=str,
        default="norepeateven",
        help=
        "Method for generating trials. Don't screw with unless you know what you are doing. Options are: random, even, norepeat, norepeateven."
    )
    parser.add_argument(
        "--column",
        type=str,
        default=None,
        help=
        "If inputting a structured text file, indicate which column to pull data from."
    )
    parser.add_argument(
        "--sep",
        type=str,
        default=None,
        help=
        "Specify the column separator. If None specified, use default (tab for .tsv, comma for all else)"
    )
    # BUG FIX: argv was previously accepted but never passed to parse_args(),
    # so callers supplying their own arguments were silently ignored.
    args = parser.parse_args(argv)

    # read in the items to build trials from
    items = []
    if args.input.endswith(".txt") and args.sep is None and args.column is None:
        # plain .txt input: one item per whitespace-separated token.
        # `with` ensures the file handle is closed (was previously leaked).
        with open(args.input, "r") as f:
            items = [item.strip() for item in f.read().split()]
    elif args.column is None:
        raise Exception(
            "You must specify a column name with --column= to generate trials this way."
        )
    else:
        # comma by default or if specified, tab if .tsv
        sep = args.sep
        if sep is None:
            sep = "\t" if args.input.endswith(".tsv") else ","
        ss = Spreadsheet.read_csv(args.input, delimiter=sep)
        items = ss[args.column]

    # filter out empty strings
    items = [item for item in items if len(item) > 0]

    # parse our N and K.
    # NOTE(review): the fallback N is 3x the item count, although the --help
    # text suggests 8x -- confirm which is intended.
    K = args.K
    N = args.N
    if N is None:
        N = len(items) * 3

    # generate trials from items with the requested method
    if args.generator == "norepeateven":
        trials = trialgen.build_trials_even_bigram_norepeat(items, N=N, K=K)
    elif args.generator == 'even':
        trials = trialgen.build_trials_even(items, N=N, K=K)
    elif args.generator == 'random':
        trials = trialgen.build_trials_random(items, N=N, K=K)
    elif args.generator == "norepeat":
        trials = trialgen.build_trials_random_bigram_norepeat(items, N=N, K=K)
    else:
        raise Exception(
            "You must specify a proper generation method: norepeateven, even, random, norepeat."
        )

    # write the generated trials, one row per trial
    df = pd.DataFrame(trials)
    df.to_csv("all_trials_reduced.csv")
def main(argv=None):
    """Run a best-worst scoring simulation over items with known latent values.

    Reads (item, latent value) pairs from the input file, generates trials,
    simulates noisy best/worst decisions by sorting each trial on latent
    value plus noise, scores the trials with every available method, and
    prints a CSV of per-item results to stdout.

    Args:
        argv: list of command-line argument strings. Defaults to None, in
            which case argparse falls back to sys.argv[1:] at call time.

    Raises:
        Exception: if --generator names an unknown generation method.
    """

    def _parse_bool(value):
        # BUG FIX: argparse's type=bool treats ANY non-empty string (including
        # "False") as True. Interpret common affirmative strings explicitly.
        return str(value).strip().lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(
        description='Interface for best-worst simulation')
    parser.add_argument(
        "input",
        type=str,
        help=
        "A .csv or .tsv input file containing two columns, named by default Item and LatentValue. Item is an identifying label and LatentValue is the item's True value along the dimension to be evaluated."
    )
    parser.add_argument(
        "N",
        type=int,
        help="Number of trials to generate for the simulation.")
    parser.add_argument("K",
                        type=int,
                        default=4,
                        help="Number of items per trial, defaults to 4.")
    parser.add_argument(
        "--noise",
        type=float,
        default=0.0,
        help=
        "the sd to use for generating noise on each decision (noise is normally distributed)."
    )
    parser.add_argument(
        "--generator",
        type=str,
        default="even",
        help=
        "The type of trial generation method for running the simulation. Options are: random, even, norepeat, norepeateven. See Hollis (2017) for details."
    )
    parser.add_argument("--sep",
                        type=str,
                        default=None,
                        help="Column seperator for the input file")
    parser.add_argument("--item",
                        type=str,
                        default="Item",
                        help="Column corresponding to item name.")
    parser.add_argument("--latentvalue",
                        type=str,
                        default="LatentValue",
                        help="Column corresponding to latent value name.")
    parser.add_argument(
        "--dummy",
        type=_parse_bool,
        default=True,
        help="use a dummy player to bound tournament-based scores.")
    parser.add_argument(
        "--iters",
        type=int,
        default=100,
        help=
        "Number of iterations to run tournament-based methods for. 100 is likely sufficient to ensure convergence, if not a little overkill."
    )
    # BUG FIX: argv was previously accepted but never passed to parse_args(),
    # so callers supplying their own arguments were silently ignored.
    args = parser.parse_args(argv)

    # determine the column seperator for our input data
    sep = args.sep
    if sep is None:
        sep = "\t" if args.input.endswith(".tsv") else ","

    # read in latent values from the input data
    latent_values = {}
    ss = Spreadsheet.read_csv(args.input, delimiter=sep)
    for row in ss:
        latent_values[str(row[args.item])] = row[args.latentvalue]

    # get the names of our unique items (materialized as a list so it can be
    # consumed more than once by the trial generators)
    items = list(latent_values.keys())

    # parse our N and K
    K = args.K
    N = args.N

    # generate trials from items
    if args.generator == "norepeateven":
        trials = trialgen.build_trials_even_bigram_norepeat(items, N=N, K=K)
    elif args.generator == 'even':
        trials = trialgen.build_trials_even(items, N=N, K=K)
    elif args.generator == 'random':
        trials = trialgen.build_trials_random(items, N=N, K=K)
    elif args.generator == "norepeat":
        trials = trialgen.build_trials_random_bigram_norepeat(items, N=N, K=K)
    else:
        raise Exception(
            "You must specify a proper generation method: norepeateven, even, random, norepeat."
        )

    # sort words in each trial by their latent value, plus noise
    trials = [sort_words(trial, latent_values, args.noise) for trial in trials]

    # convert the trials into format: (best, worst, (others,))
    trials = [(trial[0], trial[-1], tuple(trial[1:-1])) for trial in trials]

    # perform scoring. This takes awhile.
    methods = [
        "Value", "Elo", "RW", "Best", "Worst", "Unchosen", "BestWorst", "ABW",
        "David", "ValueLogit", "RWLogit", "BestWorstLogit"
    ]
    results = scoring.score_trials(trials,
                                   methods,
                                   iters=args.iters,
                                   dummy=args.dummy)

    # print the header and results (.items()/print() are valid on both
    # Python 2 and 3, unlike the previous .iteritems()/print statement)
    header = [args.item, args.latentvalue] + methods
    print(",".join(header))
    for name, data in results.items():
        # skip dummy items (their keys are not plain strings)
        if type(name) != str:
            continue
        scores = [scoring.scoring_methods[method](data) for method in methods]
        out = [name, latent_values[name]] + [str(score) for score in scores]
        print(",".join([str(v) for v in out]))
def main(argv=None):
    """Filter noncompliant participants from best-worst trial data.

    Compliance is the fraction of implied pairwise orderings in a user's
    trials (best beats everything; everything beats worst) that agree with a
    precomputed score file. With no --filter, prints a CSV of per-user
    compliance; with --filter, prints the trials of every user whose
    compliance meets the threshold.

    Args:
        argv: list of command-line argument strings. Defaults to None, in
            which case argparse falls back to sys.argv[1:] at call time.
    """
    parser = argparse.ArgumentParser(description='Command line for filtering noncompliant participants from best-worst data.')
    parser.add_argument("scores", type=str, help="Path to a file containing scores computed over all users (including noncompliant ones).")
    parser.add_argument("input", nargs="*", type=str, help="Path to a file(s) containing trial-level data.")
    parser.add_argument("--id_column", type=str, default=None, help="A column in your input data that specifies user ID. If no value is supplied, uses the name of the file.")
    parser.add_argument("--best", type=str, default="best", help="Name of column that holds string of 'best' choice.")
    parser.add_argument("--worst", type=str, default="worst", help="Name of column that holds string of 'worst' choice.")
    parser.add_argument("--score_method", type=str, default="Value", help="The scoring method to calculate compliance by.")
    parser.add_argument("--filter", type=float, default=None, help="If you want to filter users by compliance, specify the threshold here. The output will be the trials for just users who met your threshold of compliance, as a single file. This can be submitted to score_trials.py for rescoring.")
    # BUG FIX: argv was previously accepted but never passed to parse_args(),
    # so callers supplying their own arguments were silently ignored.
    args = parser.parse_args(argv)

    # read the scores; the first column of each row is the item label
    ss = Spreadsheet.read_csv(args.scores)
    scores = {}
    for row in ss:
        scores[row[0]] = row[args.score_method]

    # go through each file and calculate participant compliance and log their
    # trials
    compliance = {}
    paircount = {}
    user_trials = {}
    for path in args.input:
        # if a participant ID column is not specified, use the name of the
        # file. Otherwise, use values in the specified column to determine ID.
        id_col = None
        if args.id_column is not None:
            ss = Spreadsheet.read_csv(path)
            id_col = ss[args.id_column]

        # read in the user's data, calculate compliance for each trial
        trials = scoring.parse_bestworst_data(path,
                                              bestCol=args.best,
                                              worstCol=args.worst)

        # generate a user ID based on the file name
        if id_col is None:
            id_col = [path] * len(trials)

        # check for agreement on each trial
        for uid, trial in zip(id_col, trials):
            best, worst, others = trial

            # best == worst means something funny is going on in the data;
            # skip the trial
            if best == worst:
                continue

            # set default values for participant if needed
            if uid not in compliance:
                compliance[uid] = 0
                paircount[uid] = 0
                user_trials[uid] = []

            # calculate compliance: best should outscore every other option,
            # and every other option should outscore worst
            pairs = ([(best, other) for other in ((worst,) + others)] +
                     [(other, worst) for other in others])
            consistent = [scores[a] > scores[b] for a, b in pairs]
            compliance[uid] += sum(consistent)
            paircount[uid] += len(pairs)
            user_trials[uid].append(trial)

    # calculate overall accuracy for each participant
    accuracy = {}
    for user in compliance:
        accuracy[user] = float(compliance[user]) / paircount[user]

    # print compliance for each person
    if args.filter is None:
        # sort users by their accuracy, most compliant first
        users = sorted(accuracy, key=lambda u: accuracy[u], reverse=True)
        print("ID,Compliance")
        for user in users:
            print("%s,%0.3f" % (str(user), accuracy[user]))
    # print trials for users that meet the filter threshold
    else:
        user_count = 0
        for user in accuracy:
            # skip by everyone who doesn't make the cut
            if accuracy[user] < args.filter:
                continue

            # print the header before the first qualifying user, sized by
            # that user's first trial's unchosen-option count
            if user_count == 0:
                best, worst, others = user_trials[user][0]
                optCols = ["option%d" % (i + 1) for i in range(len(others))]
                header = ["User", "best", "worst"] + optCols
                print(",".join(header))

            # print out each trial for the user
            for trial in user_trials[user]:
                best, worst, others = trial
                out = [str(v) for v in [user, best, worst] + list(others)]
                print(",".join(out))

            # increment our users
            user_count += 1