def main():
    r"""Entry point: read target pairs and input data, print the merged result.

    Both output streams are wrapped so everything is written in FILE_ENC
    (Python 2 stream-encoding idiom).
    """
    sys.stdout = codecs.getwriter(FILE_ENC)(sys.stdout)
    sys.stderr = codecs.getwriter(FILE_ENC)(sys.stderr)
    cli_args = parser.parse_args()
    # Collect the (target, pair) entries first; they drive the merge below.
    pair_collector = csv.parse_csv(TargetPairCollector(),
                                   input_file=cli_args.target_pairs)
    target_pairs = pair_collector.pairs
    data_collector = csv.parse_csv(DataCollector(),
                                   input_file=cli_args.input_file)
    data_collector.print_merged(target_pairs, cli_args.column_name)
def run(self):
    r"""Load the thesaurus, process every stdin line, then optionally
    report accuracy/empty percentages."""
    self.thesaurus = csv.parse_csv(
        Thesaurus(self.args.column_name), self.args.thesaurus_file)
    for i, raw in enumerate(sys.stdin):
        self.treat_line(raw[:-1], i)  # raw[:-1] strips the trailing '\n'
    if self.args.out_all_stats:
        accuracy = 100 * self.n_correct / self.n_lines
        empties = 100 * self.n_empty / self.n_lines
        print("Accuracy: {:2.2f}%".format(accuracy))
        print("Empty: {:2.2f}%".format(empties))
def run(self):
    r"""Drive the evaluation: read the thesaurus, handle each stdin line,
    and print summary statistics when requested."""
    thesaurus_parser = Thesaurus(self.args.column_name)
    self.thesaurus = csv.parse_csv(thesaurus_parser, self.args.thesaurus_file)
    line_num = 0
    for line in sys.stdin:
        self.treat_line(line[:-1], line_num)  # line[:-1] strips '\n'
        line_num += 1
    if not self.args.out_all_stats:
        return
    print("Accuracy: {:2.2f}%".format(100 * self.n_correct / self.n_lines))
    print("Empty: {:2.2f}%".format(100 * self.n_empty / self.n_lines))
def main():
    r"""Entry point: read addition triples, pick the input parser by
    format, then merge and print.

    Both output streams are wrapped to emit FILE_ENC (Python 2 idiom).
    """
    sys.stdout = codecs.getwriter(FILE_ENC)(sys.stdout)
    sys.stderr = codecs.getwriter(FILE_ENC)(sys.stderr)
    opts = parser.parse_args()
    triples = csv.parse_csv(
        TriplesCollector(),
        input_file=opts.target_addition_triples).triples
    # word2vec input needs its dedicated parser; everything else is CSV.
    parse = word2vec_parser if opts.input_format == "word2vec" \
        else csv.parse_csv
    collector = parse(DataCollector(opts, triples), input_file=opts.input_file)
    collector.print_merged()
def check_missing(self):
    r"""Warn about keys present in one file but absent from the other.

    Only the first declared value column of each file is sampled for keys.
    """
    gold_keys = self.columns_gold[self.parser_gold.colnames[0]]
    pred_keys = self.columns_pred[self.parser_pred.colnames[0]]
    missing_in_pred = [k for k in gold_keys if k not in pred_keys]
    missing_in_gold = [k for k in pred_keys if k not in gold_keys]
    if missing_in_gold:
        warn("{n} keys (e.g. `{key}`) not found in gold file; "
             "will use 0.0",
             n=len(missing_in_gold), key=missing_in_gold[0])
    if missing_in_pred:
        warn("{n} keys (e.g. `{key}`) not found in prediction file; "
             "will use avg(predictions)",
             n=len(missing_in_pred), key=missing_in_pred[0])


#####################################################

if __name__ == "__main__":
    # Build one NumValuesParser per input file, collect their values,
    # then hand everything to the Main driver.
    args = parser.parse_args()
    parser_gold = NumValuesParser(
        id_col=args.gold_id_column,
        colnames=args.gold_value_columns)
    parser_pred = NumValuesParser(
        id_col=args.pred_id_column,
        colnames=args.pred_value_columns,
        inverted_scales=args.inverted_scales)
    csv.parse_csv(parser_gold, input_file=args.gold_file)
    csv.parse_csv(parser_pred, input_file=args.pred_file)
    Main(args, parser_gold, parser_pred).run()
def handle_data(self, line, data_namedtuple):
    r"""Print the data row with a WordNet path-similarity column appended,
    keeping only the first `best_k` neighbors of each target
    (input is presumably grouped by target)."""
    tgt = data_namedtuple.target
    ngb = data_namedtuple.neighbor
    if tgt != self.current_target:
        # New target: restart the per-target neighbor counter.
        self.current_target = tgt
        self.current_target_count = 0
    self.current_target_count += 1
    if self.current_target_count > self.args.best_k:
        return  # already emitted the k best neighbors for this target
    sim = "{0:.10f}".format(self.wnpath(tgt, ngb))
    print("\t".join(data_namedtuple).encode("utf8"), sim, sep="\t")


def wnpath(self, target, neighbor):
    r"""Return the best path_similarity between `target` and `neighbor`.

    Returns 0 when either word has no synsets for the configured POS tag,
    or when WordNet yields no similarity at all.
    """
    synsets_t = wn.synsets(target, self.args.wordnet_pos_tag)
    synsets_n = wn.synsets(neighbor, self.args.wordnet_pos_tag)
    if not (synsets_t and synsets_n):
        return 0  # XXX no synsets for `target` and/or `neighbor`
    best = max(wn.path_similarity(s1, s2)
               for s1 in synsets_t for s2 in synsets_n)
    return best or 0  # `wn` may return None; treat that as sim==0


#####################################################

if __name__ == "__main__":
    csv.parse_csv(WnAdder(parser.parse_args()))
        # NOTE(review): the line below is the tail of a statement whose
        # opening was lost in this paste (it reads like the arguments of a
        # column-validation call) — recover the full line from VCS.
        (columns, self.args.column_name)

    def handle_data(self, line, data_tuple):
        # Rows are grouped by the tuple of `discriminate_by` column values;
        # a change of discriminant flushes the previous group's statistics,
        # so the input is presumably already sorted by those columns —
        # TODO confirm against the caller.
        discriminant = tuple(
            getattr(data_tuple, col_name)
            for col_name in self.args.discriminate_by)
        if discriminant != self.current_discriminant:
            self.print_stats()
            self.current_discriminant = discriminant
            self.stats = statistics.Statistics()
        self.stats.add(float(getattr(data_tuple, self.args.column_name)))

    def end(self):
        # Flush the final group; optionally print global statistics, which
        # are accumulated over the per-group averages (see print_stats).
        self.print_stats()
        if self.args.print_global:
            print("GLOBAL:", self.global_stats.n, self.global_stats.avg,
                  self.global_stats.stddev_sample)

    def print_stats(self):
        # The `is not None` guard means nothing is printed before the first
        # data row (empty input / first group not yet started).
        if self.stats is not None:
            fields = self.current_discriminant + (
                self.stats.n, self.stats.avg, self.stats.stddev_sample)
            # Feed this group's average into the global (avg-of-avgs) stats.
            self.global_stats.add(self.stats.avg)
            print(*fields, sep="\t")


#####################################################

if __name__ == "__main__":
    csv.parse_csv(StatsPrinter(parser.parse_args()))
        # NOTE(review): this chunk starts mid-method — the enclosing `def`
        # was lost upstream of this view.
        # Accumulate, per column, the sum of squared values over all rows
        # buffered in `self.current_group`; non-numeric cells are skipped.
        sum_squares = [0] * len(self.header_names)
        for data_namedtuple in self.current_group:
            for i, elem in enumerate(data_namedtuple):
                try:
                    sum_squares[i] += float(elem)**2
                except ValueError:
                    pass  # Value cannot be converted to float
        # Each column's denominator is its Euclidean (L2) norm over the group.
        denominators = [math.sqrt(s) for s in sum_squares]
        for data_namedtuple in self.current_group:
            print(*[self.divide(i, value, denom)
                    for (i, (value, denom))
                    in enumerate(zip(data_namedtuple, denominators))],
                  sep="\t")
        # Clear the buffer in place, ready for the next group.
        self.current_group[:] = []

    def divide(self, index, stringvalue, denominator):
        r"""Return float(stringvalue)/denominator.

        Columns not flagged in `header_chosen`, and values that cannot be
        parsed as float, are returned unchanged.
        """
        if not self.header_chosen[index]:
            return stringvalue
        # NOTE(review): a zero denominator (all-zero chosen column) would
        # raise ZeroDivisionError here — confirm inputs rule that out.
        try:
            return float(stringvalue) / denominator
        except ValueError:
            return stringvalue


#####################################################

if __name__ == "__main__":
    csv.parse_csv(NormalizingPrinter(parser.parse_args()))
                # NOTE(review): this chunk opens mid-`try` — the enclosing
                # def/for/try lines were lost upstream of this view.
                sum_squares[i] += float(elem)**2
            except ValueError:
                pass  # Value cannot be converted to float
        # Each column's denominator is its L2 norm over the buffered group.
        denominators = [math.sqrt(s) for s in sum_squares]
        for data_namedtuple in self.current_group:
            print(*[
                self.divide(i, value, denom)
                for (i, (value, denom))
                in enumerate(zip(data_namedtuple, denominators))
            ], sep="\t")
        self.current_group[:] = []  # clear in place for the next group

    def divide(self, index, stringvalue, denominator):
        r"""Return float(stringvalue)/denominator.

        Columns not flagged in `header_chosen`, and values that cannot be
        parsed as float, are returned unchanged.
        """
        if not self.header_chosen[index]:
            return stringvalue
        try:
            return float(stringvalue) / denominator
        except ValueError:
            return stringvalue


#####################################################

if __name__ == "__main__":
    csv.parse_csv(NormalizingPrinter(parser.parse_args()))
def handle_data(self, line, data_namedtuple):
    r"""Append a WordNet path-similarity column to the row, emitting only
    the first `best_k` neighbors seen for each target."""
    target = data_namedtuple.target
    neighbor = data_namedtuple.neighbor
    if target != self.current_target:
        self.current_target, self.current_target_count = target, 0
    self.current_target_count += 1
    if self.current_target_count <= self.args.best_k:
        score = self.wnpath(target, neighbor)
        print("\t".join(data_namedtuple).encode('utf8'),
              "{0:.10f}".format(score), sep="\t")


def wnpath(self, target, neighbor):
    r"""Return the best path_similarity between `target` and `neighbor`."""
    synsetsT = wn.synsets(target, self.args.wordnet_pos_tag)
    synsetsN = wn.synsets(neighbor, self.args.wordnet_pos_tag)
    if not synsetsT:
        return 0  # XXX no synsets for `target`
    if not synsetsN:
        return 0  # XXX no synsets for `neighbor`
    # `wn.path_similarity` may return None, which we coerce to sim==0.
    pairwise = (wn.path_similarity(sT, sN)
                for sT in synsetsT for sN in synsetsN)
    return max(pairwise) or 0


#####################################################

if __name__ == "__main__":
    csv.parse_csv(WnAdder(parser.parse_args()))
def handle_comment(self, line):
    r"""Echo comment lines unchanged (encoded as utf8)."""
    print(line.encode('utf8'))


def handle_header(self, line, header_list):
    r"""Echo the header line with a new `w2v_cosine` column appended."""
    print(line.encode('utf8'), "w2v_cosine", sep="\t")


def handle_data(self, line, data_namedtuple):
    r"""Print the row plus its embedding similarity, but only for the
    first `best_k` neighbors of each target (input presumably grouped
    by target — TODO confirm)."""
    target = data_namedtuple.target
    neighbor = data_namedtuple.neighbor
    if target != self.current_target:
        # New target: restart the per-target neighbor counter.
        self.current_target = target
        self.current_target_count = 0
    self.current_target_count += 1
    if self.current_target_count <= self.args.best_k:
        cosine = "{0:.10f}".format(self.compare(target, neighbor))
        print("\t".join(data_namedtuple).encode('utf8') + "\t" + cosine)


def compare(self, target, neighbor):
    r"""Return the embedding similarity between `target` and `neighbor`
    (printed as the `w2v_cosine` column).

    Falls back to 0.0 when either word is missing from the embedding set
    (KeyError). The previous docstring wrongly described this method as
    WordNet path_similarity; it delegates to `self.embedding_set.compare`.
    """
    try:
        return self.embedding_set.compare(target, neighbor)
    except KeyError:
        return 0.0


#####################################################

if __name__ == "__main__":
    csv.parse_csv(EmbeddingsCmpAdder(parser.parse_args()))
        # NOTE(review): this chunk starts mid-method — the enclosing `def`
        # (a missing-keys check) was lost upstream of this view.
        # Sample keys from the first declared value column of each file;
        # presumably all columns of a file share the same key set — verify.
        sample_col_gold = self.columns_gold[self.parser_gold.colnames[0]]
        sample_col_pred = self.columns_pred[self.parser_pred.colnames[0]]
        missing_in_pred = [
            kG for kG in sample_col_gold if kG not in sample_col_pred
        ]
        missing_in_gold = [
            kP for kP in sample_col_pred if kP not in sample_col_gold
        ]
        if missing_in_gold:
            warn("{n} keys (e.g. `{key}`) not found in gold file; "
                 "will use 0.0",
                 n=len(missing_in_gold), key=missing_in_gold[0])
        if missing_in_pred:
            warn("{n} keys (e.g. `{key}`) not found in prediction file; "
                 "will use avg(predictions)",
                 n=len(missing_in_pred), key=missing_in_pred[0])


#####################################################

if __name__ == "__main__":
    # Build one parser per input file, collect their values, then run
    # the evaluation driver.
    args = parser.parse_args()
    parser_gold = NumValuesParser(id_col=args.gold_id_column,
                                  colnames=args.gold_value_columns)
    parser_pred = NumValuesParser(id_col=args.pred_id_column,
                                  colnames=args.pred_value_columns,
                                  inverted_scales=args.inverted_scales)
    csv.parse_csv(parser_gold, input_file=args.gold_file)
    csv.parse_csv(parser_pred, input_file=args.pred_file)
    Main(args, parser_gold, parser_pred).run()
        # NOTE(review): the line below is the tail of a statement whose
        # opening was lost in this paste — recover the full line from VCS.
        (columns, self.args.column_name)

    def handle_data(self, line, data_tuple):
        # Group rows by the tuple of `discriminate_by` column values; a
        # change of discriminant flushes the previous group's statistics,
        # so rows are presumably pre-sorted by those columns — confirm.
        discriminant = tuple(getattr(data_tuple, col_name)
                             for col_name in self.args.discriminate_by)
        if discriminant != self.current_discriminant:
            self.print_stats()
            self.current_discriminant = discriminant
            self.stats = statistics.Statistics()
        self.stats.add(float(getattr(data_tuple, self.args.column_name)))

    def end(self):
        # Flush the last group; optionally report the global statistics
        # (accumulated over per-group averages in print_stats).
        self.print_stats()
        if self.args.print_global:
            print("GLOBAL:", self.global_stats.n, self.global_stats.avg,
                  self.global_stats.stddev_sample)

    def print_stats(self):
        # `self.stats` can still be unset before the first data row; the
        # guard makes empty input print nothing.
        if self.stats is not None:
            fields = self.current_discriminant + (
                self.stats.n, self.stats.avg, self.stats.stddev_sample)
            self.global_stats.add(self.stats.avg)
            print(*fields, sep="\t")


#####################################################

if __name__ == "__main__":
    csv.parse_csv(StatsPrinter(parser.parse_args()))