Example no. 1
0
def parse_bestworst_data(file, bestCol="best", worstCol="worst", sep=None):
    """Parse best-worst trial data from a delimited text file.

    Each trial in the returned list is a tuple:
        (best, worst, (unchosen1, unchosen2, ..., unchosen[K-2]))

    Parameters:
        file:     path to the input file
        bestCol:  name of the column holding the 'best' choice
        worstCol: name of the column holding the 'worst' choice
        sep:      column separator; if None, inferred from the extension
                  (tab for .tsv, comma for everything else)

    Returns:
        A list of parsed trials.
    """
    # infer the separator from the file extension when none was given
    if sep is None:
        sep = "\t" if file.endswith(".tsv") else ","

    # read in the trials
    ss = Spreadsheet.read_csv(file, delimiter=sep)
    trials = []
    for row in ss:
        # read best and worst choices
        best = row[bestCol]
        worst = row[worstCol]

        # collect every choosable option; columns are named option1,
        # option2, ... and stop at the first missing index
        opts = []
        i = 1
        while ("option%d" % i) in ss.header:
            opts.append(row["option%d" % i])
            i += 1

        # strip best and worst choices from the list of other options
        if best in opts:
            opts.remove(best)
        if worst in opts:
            opts.remove(worst)

        trials.append((best, worst, tuple(opts)))

    # return the parsed trials
    return trials
Example no. 2
0
def main(argv=sys.argv[1:]):
    """Command-line entry point: generate best-worst trials from a list of items.

    Reads items either from a plain .txt file (one item per whitespace-
    separated token) or from a named column of a structured .csv/.tsv
    file, generates N trials of K items each using the requested
    generator, and writes them to all_trials_reduced.csv.
    """
    parser = argparse.ArgumentParser(
        description=
        'Command line for generating best-worst trials from a list of items.')
    parser.add_argument(
        "input",
        type=str,
        help=
        "Path to a file containing input data (i.e., a list of words or other text stimuli)."
    )
    parser.add_argument(
        "N",
        nargs="?",
        type=int,
        default=None,
        help=
        "Number of best-worst trials to generate. Suggested amount: number of items, times 8. So if you have 1000 items, 8000 trials. You may get *slightly* better results up to an N of 16, but gains are marginal. You may need fewer trials if your K is larger. Empirical testing not yet done."
    )
    parser.add_argument("K",
                        nargs="?",
                        type=int,
                        default=4,
                        help="Number of items per best-worst trial.")
    parser.add_argument(
        "--generator",
        type=str,
        default="norepeateven",
        help=
        "Method for generating trials. Don't screw with unless you know what you are doing. Options are: random, even, norepeat, norepeateven."
    )
    parser.add_argument(
        "--column",
        type=str,
        default=None,
        help=
        "If inputting a structured text file, indicate which column to pull data from."
    )
    parser.add_argument(
        "--sep",
        type=str,
        default=None,
        help=
        "Specify the column separator. If None specified, use default (tab for .tsv, comma for all else)"
    )

    args = parser.parse_args()

    # read in the items to build trials from
    items = []
    if args.input.endswith(".txt") and args.sep is None and args.column is None:
        # plain text: split on any whitespace, one item per token
        with open(args.input, "r") as f:
            items = [item.strip() for item in f.read().split()]
    elif args.column is None:
        raise Exception(
            "You must specify a column name with --column= to generate trials this way."
        )
    else:
        # comma by default or if specified, tab if .tsv
        sep = args.sep
        if sep is None:
            sep = "\t" if args.input.endswith(".tsv") else ","
        ss = Spreadsheet.read_csv(args.input, delimiter=sep)

        # pull the items from the requested column
        items = ss[args.column]

    # filter out empty strings
    items = [item for item in items if len(item) > 0]

    # parse our N and K; default N to 3 trials per item when unspecified
    K = args.K
    N = args.N
    if N is None:
        N = len(items) * 3

    # generate trials from items with the requested method
    if args.generator == "norepeateven":
        trials = trialgen.build_trials_even_bigram_norepeat(items, N=N, K=K)
    elif args.generator == 'even':
        trials = trialgen.build_trials_even(items, N=N, K=K)
    elif args.generator == 'random':
        trials = trialgen.build_trials_random(items, N=N, K=K)
    elif args.generator == "norepeat":
        trials = trialgen.build_trials_random_bigram_norepeat(items, N=N, K=K)
    else:
        raise Exception(
            "You must specify a proper generation method: norepeateven, even, random, norepeat."
        )

    # write the generated trials out as CSV
    df = pd.DataFrame(trials)
    df.to_csv("all_trials_reduced.csv")
Example no. 3
0
def main(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser(
        description='Interface for best-worst simulation')
    parser.add_argument(
        "input",
        type=str,
        help=
        "A .csv or .tsv input file containing two columns, named by default Item and LatentValue. Item is an identifying label and LatentValue is the item's True value along the dimension to be evaluated."
    )
    parser.add_argument(
        "N", type=int, help="Number of trials to generate for the simulation.")
    parser.add_argument("K",
                        type=int,
                        default=4,
                        help="Number of items per trial, defaults to 4.")
    parser.add_argument(
        "--noise",
        type=float,
        default=0.0,
        help=
        "the sd to use for generating noise on each decision (noise is normally distributed)."
    )
    parser.add_argument(
        "--generator",
        type=str,
        default="even",
        help=
        "The type of trial generation method for running the simulation. Options are: random, even, norepeat, norepeateven. See Hollis (2017) for details."
    )
    parser.add_argument("--sep",
                        type=str,
                        default=None,
                        help="Column seperator for the input file")
    parser.add_argument("--item",
                        type=str,
                        default="Item",
                        help="Column corresponding to item name.")
    parser.add_argument("--latentvalue",
                        type=str,
                        default="LatentValue",
                        help="Column corresponding to latent value name.")
    parser.add_argument(
        "--dummy",
        type=bool,
        default=True,
        help="use a dummy player to bound tournament-based scores.")
    parser.add_argument(
        "--iters",
        type=int,
        default=100,
        help=
        "Number of iterations to run tournament-based methods for. 100 is likely sufficient to ensure convergence, if not a little overkill."
    )

    args = parser.parse_args()

    # determine the column seperator for our input data
    sep = args.sep
    if sep == None and args.input.endswith(".tsv"):
        sep = "\t"
    elif sep == None:
        sep = ","

    # read in latent values from the input data
    latent_values = {}
    ss = Spreadsheet.read_csv(args.input, delimiter=sep)
    for row in ss:
        latent_values[str(row[args.item])] = row[args.latentvalue]

    # get the names of our unique items
    items = latent_values.keys()

    # parse our N and K
    K = args.K
    N = args.N

    # generate trials from items
    trials = []
    if args.generator == "norepeateven":
        trials = trialgen.build_trials_even_bigram_norepeat(items, N=N, K=K)
    elif args.generator == 'even':
        trials = trialgen.build_trials_even(items, N=N, K=K)
    elif args.generator == 'random':
        trials = trialgen.build_trials_random(items, N=N, K=K)
    elif args.generator == "norepeat":
        trials = trialgen.build_trials_random_bigram_norepeat(items, N=N, K=K)
    else:
        raise Exception(
            "You must specify a proper generation method: norepeateven, even, random, norepeat."
        )

    # sort words in each trial by their latent value, plus noise
    trials = [sort_words(trial, latent_values, args.noise) for trial in trials]

    # convert the trials into format: (best, worst, (others,))
    trials = [(trial[0], trial[-1], tuple(trial[1:-1])) for trial in trials]

    # perform scoring. This takes awhile.
    methods = [
        "Value", "Elo", "RW", "Best", "Worst", "Unchosen", "BestWorst", "ABW",
        "David", "ValueLogit", "RWLogit", "BestWorstLogit"
    ]
    results = scoring.score_trials(trials,
                                   methods,
                                   iters=args.iters,
                                   dummy=args.dummy)

    # print the header and results
    header = [args.item, args.latentvalue] + methods
    print ",".join(header)
    for name, data in results.iteritems():
        # skip dummy items
        if type(name) != str:
            continue

        scores = [scoring.scoring_methods[method](data) for method in methods]
        out = [name, latent_values[name]] + [str(score) for score in scores]
        print ",".join([str(v) for v in out])
def main(argv = sys.argv[1:]):
    parser = argparse.ArgumentParser(description='Command line for filtering noncompliant participants from best-worst data.')
    parser.add_argument("scores", type=str, help="Path to a file containing scores computed over all users (including noncompliant ones).")
    parser.add_argument("input", nargs="*", type=str, help="Path to a file(s) containing trial-level data.")
    parser.add_argument("--id_column", type=str, default=None, help="A column in your input data that specifies user ID. If no value is supplied, uses the name of the file.")
    parser.add_argument("--best", type=str, default="best", help="Name of column that holds string of 'best' choice.")
    parser.add_argument("--worst", type=str, default="worst", help="Name of column that holds string of 'worst' choice.")
    parser.add_argument("--score_method", type=str, default="Value", help="The scoring method to calculate compliance by.")
    parser.add_argument("--filter", type=float, default=None, help="If you want to filter users by compliance, specify the threshold here. The output will be the trials for just users who met your threshold of compliance, as a single file. This can be submitted to score_trials.py for rescoring.")

    args = parser.parse_args()

    # read the scores
    ss = Spreadsheet.read_csv(args.scores)
    scores = { }
    for row in ss:
        scores[row[0]] = row[args.score_method]

    # go through each file and calculate participant compliance and log their
    # trials
    compliance  = { }
    paircount   = { }
    user_trials = { }
    
    for file in args.input:
        # if a participant ID column is not specified, use the name of the file.
        # Otherwise, use values in the specified column to determine ID.
        id_col = None
        if args.id_column != None:
            ss = Spreadsheet.read_csv(file)
            id_col = ss[args.id_column]
                        
        # read in the user's data, calculate compliance for each trial
        trials = scoring.parse_bestworst_data(file, bestCol=args.best, worstCol=args.worst)
        # generate a user ID based on the file name
        if id_col == None:
            id_col = [ file ] * len(trials)

        # check for agreement on each trial
        for i in xrange(len(trials)):
            id    = id_col[i]
            trial = trials[i]
            best, worst, others = trial
            
            # something funny going on in your data; check it out
            duplic = best == worst
            # skip trial if best == worst
            if duplic == True:
                continue

            # set default values for participant if needed
            if id not in compliance:
                compliance[id] = 0
                paircount[id]  = 0
                user_trials[id]= [ ]

            # calculate compliance
            pairs = [ (best, other) for other in ((worst,) + others) ] + [ (other, worst) for other in others ]
            consistent = [ scores[pair[0]] > scores[pair[1]] for pair in pairs ]
            compliance[id] += sum(consistent)
            paircount[id]  += len(pairs)
            user_trials[id].append(trial)
                
    # calculate overall accuracy for each participant
    accuracy = { }
    for user in compliance.iterkeys():
        accuracy[user] = float(compliance[user]) / paircount[user]

    # print compliance for each person
    if args.filter == None:
        # sort users by their accuracy
        users = accuracy.keys()
        users.sort(lambda a,b: cmp(accuracy[a], accuracy[b]), reverse=True)

        print "ID,Compliance"
        for user in users:
            print "%s,%0.3f" % (str(user), accuracy[user])
            
    # print trials for users that meet the filter threshold
    else:
        user_count = 0
        for user in accuracy.iterkeys():
            # skip by everyone who doesn't make the cut
            if accuracy[user] < args.filter:
                continue

            # print out data for everyone else
            best,worst,others = user_trials[user][0]
            options = list(others)

            # do we need to print the header?
            if user_count == 0:
                optCols = [ "option%d" % (i+1) for i in xrange(len(options)) ]
                header  = [ "User", "best", "worst" ] + optCols
                print ",".join(header)

            # print out each trial for the user
            for trial in user_trials[user]:
                best,worst, others = trial
                options = list(others)

                out = [ user, best, worst ] + options
                out = [ str(v) for v in out ]
                print ",".join(out)

            # increment our users
            user_count += 1