def main():
    """Command-line entry point: compute distance metrics for word pairs.

    Reads a vector space (``--input``) and a set of word pairs, gathered
    from a tab-separated pairs file (``--pairsfile``) and/or consecutive
    positional WORD arguments, then writes a TSV of the requested distance
    metrics to stdout.

    Raises:
        ValueError: if an odd number of positional words is given.
    """
    parser = argparse.ArgumentParser(
        description='Computes distances between word pairs.')
    parser.add_argument("--input", "-i", type=openfile, metavar="FILE",
                        help='The input vector space.')
    parser.add_argument('--pairsfile', '-w', metavar='FILE', type=openfile,
                        help='The list of tab separated word pairs.')
    parser.add_argument(
        'words', nargs='*', metavar='WORD',
        help=('Additional word pairs specified at the command line. '
              'Every two specifies an additional word pair. Must be '
              'given an even number of words.'))
    parser.add_argument('--pos', '-p', action='store_true',
                        help='Marks that the word pairs are POS tagged.')
    parser.add_argument('--distance-metric', '-d', action='append',
                        choices=METRICS.keys(),
                        help='Distance metrics to use.')
    args = parser.parse_args()

    pairs = set()
    if args.pairsfile:
        pairs.update(read_pairs(args.pairsfile))

    # Positional words are consumed two at a time; an odd count is an error.
    # NOTE: parenthesized raise is valid in both Python 2 and 3; the original
    # `raise ValueError, msg` form is a SyntaxError under Python 3.
    if len(args.words) % 2 != 0:
        raise ValueError("You need to specify an even number of pair words.")

    # Default to cosine distance when no metric was requested.
    if not args.distance_metric:
        args.distance_metric = ['cosine']

    pairs.update(zip(args.words[::2], args.words[1::2]))

    vecspace = read_vector_file(args.input)
    if not args.pos:
        # need to strip the POS from the targets
        vecspace = df_remove_pos(vecspace)

    distance_metric_names = args.distance_metric
    distance_metrics = [METRICS[name] for name in distance_metric_names]

    output_measures = calculate_distance_metrics(vecspace, pairs,
                                                 distance_metrics)
    output_measures.to_csv(sys.stdout, sep="\t", index=False)
def main():
    """Command-line entry point: compute distance metrics for word pairs.

    Builds the pair set from an optional tab-separated pairs file plus any
    positional WORD arguments (taken two at a time), loads the vector space,
    and writes one TSV row per pair and metric to stdout.

    Raises:
        ValueError: if an odd number of positional words is given.
    """
    parser = argparse.ArgumentParser(
        description='Computes distances between word pairs.')
    parser.add_argument("--input", "-i", type=openfile, metavar="FILE",
                        help='The input vector space.')
    parser.add_argument('--pairsfile', '-w', metavar='FILE', type=openfile,
                        help='The list of tab separated word pairs.')
    parser.add_argument(
        'words', nargs='*', metavar='WORD',
        help=('Additional word pairs specified at the command line. '
              'Every two specifies an additional word pair. Must be '
              'given an even number of words.'))
    parser.add_argument('--pos', '-p', action='store_true',
                        help='Marks that the word pairs are POS tagged.')
    parser.add_argument('--distance-metric', '-d', action='append',
                        choices=METRICS.keys(),
                        help='Distance metrics to use.')
    args = parser.parse_args()

    pairs = set()
    if args.pairsfile:
        pairs.update(read_pairs(args.pairsfile))

    # Words pair up positionally, so the count must be even.
    # Fixed: the original `raise ValueError, msg` is Python-2-only syntax;
    # the call form below is valid in both Python 2 and 3.
    if len(args.words) % 2 != 0:
        raise ValueError("You need to specify an even number of pair words.")

    # Fall back to cosine when no --distance-metric was supplied.
    if not args.distance_metric:
        args.distance_metric = ['cosine']

    pairs.update(zip(args.words[::2], args.words[1::2]))

    vecspace = read_vector_file(args.input)
    if not args.pos:
        # need to strip the POS from the targets
        vecspace = df_remove_pos(vecspace)

    distance_metric_names = args.distance_metric
    distance_metrics = [METRICS[name] for name in distance_metric_names]

    output_measures = calculate_distance_metrics(vecspace, pairs,
                                                 distance_metrics)
    output_measures.to_csv(sys.stdout, sep="\t", index=False)
def main():
    """Command-line entry point: correlate model distances with human
    compositionality ratings.

    Loads one or more vector spaces, computes compound/constituent
    distances in each (plus a "<concat>" space when several inputs are
    given), joins them with the human ratings, and optionally emits the
    joined TSV, a correlation table, and scatter plots (PDF).
    """
    parser = argparse.ArgumentParser(
        description='Computes correlations with compositionality ratings.')
    parser.add_argument('--input', '-i', action="append", type=openfile,
                        metavar="FILE", help='Input vector space.')
    parser.add_argument('--ratings', '-r', metavar='COMPFILE', type=openfile,
                        help='The compositionality ratings file.')
    parser.add_argument('--self', '-s', action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument(
        '--no-tsv', '-T', action="store_true",
        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument(
        '--corrs', '-c', action="store_true",
        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf', '-p', metavar="FILE", default=None,
                        help='Output plots as a PDF to the given filename.')
    args = parser.parse_args()

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # Drop self-comparison rows (compound rated against itself).
        compratings = compratings[
            compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [
        (basename(f.name), norm2_matrix(df_remove_pos(read_vector_file(f))))
        for f in args.input]

    if len(named_vector_spaces) > 1:
        # need to do concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures
    # independently named (column "<space>:<metric>")
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC]).rename(
            columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces]

    # now we need to join all the distance calculations:
    joined_measures = reduce(pd.merge, distances).rename(
        columns={"left": "compound", "right": "const"})

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # nicer output: separator between the TSV dump and the correlations.
    # Fixed: the bare `print` statement is Python-2-only; a single
    # parenthesized argument prints identically under Python 2 and 3.
    if not args.no_tsv and args.corrs:
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        # `to_csv` writes to stdout and returns None, so the original
        # `corrs = ...` assignment was dead; dropped.
        correlations(dm_and_comp).to_csv(sys.stdout, index=False, sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)
def main():
    """Command-line entry point: correlate model distances with human
    compositionality ratings.

    Reads the ratings table, computes the compound/constituent distance in
    every input vector space (adding a "<concat>" concatenated space when
    more than one is given), merges everything into one frame, and then —
    depending on flags — prints the TSV, the correlations, and/or saves
    scatter plots to a PDF.
    """
    parser = argparse.ArgumentParser(
        description='Computes correlations with compositionality ratings.')
    parser.add_argument('--input', '-i', action="append", type=openfile,
                        metavar="FILE", help='Input vector space.')
    parser.add_argument('--ratings', '-r', metavar='COMPFILE', type=openfile,
                        help='The compositionality ratings file.')
    parser.add_argument('--self', '-s', action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument(
        '--no-tsv', '-T', action="store_true",
        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument(
        '--corrs', '-c', action="store_true",
        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf', '-p', metavar="FILE", default=None,
                        help='Output plots as a PDF to the given filename.')
    args = parser.parse_args()

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # Unless --self was given, exclude compound==const rows.
        compratings = compratings[
            compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [
        (basename(f.name), norm2_matrix(df_remove_pos(read_vector_file(f))))
        for f in args.input]

    if len(named_vector_spaces) > 1:
        # need to do concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures
    # independently named (prefix each metric column with its space name)
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC]).rename(
            columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces]

    # now we need to join all the distance calculations:
    joined_measures = reduce(pd.merge, distances).rename(
        columns={"left": "compound", "right": "const"})

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # nicer output: blank separator line between TSV and correlations.
    # Fixed: the bare `print` statement is a SyntaxError under Python 3;
    # the single-argument parenthesized form prints the same text in both.
    if not args.no_tsv and args.corrs:
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        # Dead `corrs =` assignment removed: to_csv(sys.stdout, ...)
        # returns None.
        correlations(dm_and_comp).to_csv(sys.stdout, index=False, sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)