示例#1
0
def main():
    """Parse the target-pairs file and the input file, then print the merge.

    Both standard streams are first wrapped in a ``FILE_ENC`` codec writer.
    """
    make_writer = codecs.getwriter(FILE_ENC)
    sys.stdout = make_writer(sys.stdout)
    sys.stderr = make_writer(sys.stderr)

    args = parser.parse_args()

    # First pass: gather the pairs from the file named by `target_pairs`.
    pair_collector = csv.parse_csv(
        TargetPairCollector(), input_file=args.target_pairs)
    target_pairs = pair_collector.pairs

    # Second pass: gather the data and let the collector print the result.
    data_collector = csv.parse_csv(
        DataCollector(), input_file=args.input_file)
    data_collector.print_merged(target_pairs, args.column_name)
示例#2
0
def main():
    """Script body: collect target pairs, collect data, print the merge.

    stdout and stderr are replaced by ``FILE_ENC`` codec writers before any
    output is produced.
    """
    writer_factory = codecs.getwriter(FILE_ENC)
    sys.stdout = writer_factory(sys.stdout)
    sys.stderr = writer_factory(sys.stderr)

    args = parser.parse_args()

    pairs_handler = csv.parse_csv(
        TargetPairCollector(),
        input_file=args.target_pairs,
    )
    target_pairs = pairs_handler.pairs

    collector = csv.parse_csv(
        DataCollector(),
        input_file=args.input_file,
    )
    collector.print_merged(target_pairs, args.column_name)
示例#3
0
 def run(self):
     r"""Load the thesaurus, pass every stdin line to `treat_line`,
     and optionally print summary percentages.

     The thesaurus is read from `self.args.thesaurus_file`.  When
     `self.args.out_all_stats` is set, the "Accuracy" and "Empty"
     percentages are derived from `self.n_correct`, `self.n_empty`
     and `self.n_lines`.
     """
     self.thesaurus = csv.parse_csv(Thesaurus(self.args.column_name),
                                    self.args.thesaurus_file)
     for line_num, line in enumerate(sys.stdin):
         # Remove the line terminator only when it is actually there: a
         # final line without a trailing '\n' must keep its last character.
         if line.endswith("\n"):
             line = line[:-1]
         self.treat_line(line, line_num)
     if self.args.out_all_stats:
         print("Accuracy: {:2.2f}%".format(100 * self.n_correct /
                                           self.n_lines))
         print("Empty: {:2.2f}%".format(100 * self.n_empty / self.n_lines))
示例#4
0
 def run(self):
     r"""Load the thesaurus, pass every stdin line to `treat_line`,
     and optionally print summary percentages.

     The thesaurus is read from `self.args.thesaurus_file`.  When
     `self.args.out_all_stats` is set, the "Accuracy" and "Empty"
     percentages are derived from `self.n_correct`, `self.n_empty`
     and `self.n_lines`.
     """
     self.thesaurus = csv.parse_csv(
             Thesaurus(self.args.column_name),
             self.args.thesaurus_file)
     for line_num, line in enumerate(sys.stdin):
         # Only strip a terminator that is really present; the last line
         # of the input may legitimately lack its '\n'.
         if line.endswith("\n"):
             line = line[:-1]
         self.treat_line(line, line_num)
     if self.args.out_all_stats:
         print("Accuracy: {:2.2f}%".format(100 * self.n_correct/self.n_lines))
         print("Empty: {:2.2f}%".format(100 * self.n_empty/self.n_lines))
示例#5
0
def main():
    """Collect the addition triples, parse the input, print the merge.

    stdout and stderr are replaced by ``FILE_ENC`` codec writers first.
    """
    make_writer = codecs.getwriter(FILE_ENC)
    sys.stdout = make_writer(sys.stdout)
    sys.stderr = make_writer(sys.stderr)

    args = parser.parse_args()

    triples_collector = csv.parse_csv(
        TriplesCollector(), input_file=args.target_addition_triples)
    triples = triples_collector.triples

    # "word2vec" input has its own reader; every other format uses parse_csv.
    if args.input_format == "word2vec":
        data_parser = word2vec_parser
    else:
        data_parser = csv.parse_csv

    data_collector = data_parser(
        DataCollector(args, triples), input_file=args.input_file)
    data_collector.print_merged()
示例#6
0
def main():
    """Script body: read the triples, read the data, print the merge.

    Output streams are wrapped in ``FILE_ENC`` codec writers up front.
    """
    writer_factory = codecs.getwriter(FILE_ENC)
    sys.stdout = writer_factory(sys.stdout)
    sys.stderr = writer_factory(sys.stderr)

    args = parser.parse_args()

    triples_handler = csv.parse_csv(
        TriplesCollector(),
        input_file=args.target_addition_triples,
    )
    triples = triples_handler.triples

    # Pick the reader matching the declared input format.
    use_word2vec = args.input_format == "word2vec"
    data_parser = word2vec_parser if use_word2vec else csv.parse_csv

    collector = data_parser(
        DataCollector(args, triples),
        input_file=args.input_file,
    )
    collector.print_merged()
示例#7
0

    def check_missing(self):
        r"""Warn about keys that appear in only one of gold/prediction.

        Only the first value column of each parser is inspected.
        """
        gold_keys = self.columns_gold[self.parser_gold.colnames[0]]
        pred_keys = self.columns_pred[self.parser_pred.colnames[0]]
        absent_from_pred = [key for key in gold_keys if key not in pred_keys]
        absent_from_gold = [key for key in pred_keys if key not in gold_keys]

        if absent_from_gold:
            warn("{n} keys (e.g. `{key}`) not found in gold file; "
                 "will use 0.0",
                 n=len(absent_from_gold),
                 key=absent_from_gold[0])

        if absent_from_pred:
            warn("{n} keys (e.g. `{key}`) not found in prediction file; "
                 "will use avg(predictions)",
                 n=len(absent_from_pred),
                 key=absent_from_pred[0])


#####################################################

if __name__ == "__main__":
    # Script entry point: build one value parser per file, run each through
    # csv.parse_csv, then hand both to Main.
    args = parser.parse_args()
    parser_gold = NumValuesParser(
        id_col=args.gold_id_column,
        colnames=args.gold_value_columns,
    )
    parser_pred = NumValuesParser(
        id_col=args.pred_id_column,
        colnames=args.pred_value_columns,
        inverted_scales=args.inverted_scales,
    )
    csv.parse_csv(parser_gold, input_file=args.gold_file)
    csv.parse_csv(parser_pred, input_file=args.pred_file)
    Main(args, parser_gold, parser_pred).run()
示例#8
0
    def handle_data(self, line, data_namedtuple):
        r"""Print the row followed by its `wnpath` score, for at most
        `best_k` consecutive rows sharing the same target."""
        target, neighbor = data_namedtuple.target, data_namedtuple.neighbor

        if target != self.current_target:
            # A new run of rows begins: restart the per-target counter.
            self.current_target = target
            self.current_target_count = 0
        self.current_target_count += 1

        if self.current_target_count > self.args.best_k:
            return  # Already printed `best_k` rows for this target

        score = self.wnpath(target, neighbor)
        wnpath = "{0:.10f}".format(score)
        row = "\t".join(data_namedtuple).encode("utf8")
        print(row, wnpath, sep="\t")

    def wnpath(self, target, neighbor):
        r"""Return the best path_similarity between
        `target` and `neighbor`.

        Synsets are looked up with `self.args.wordnet_pos_tag`.  The result
        is 0 when either word has no synsets, or when `wn` yields no
        similarity for any synset pair.
        """
        synsetsT = wn.synsets(target, self.args.wordnet_pos_tag)
        synsetsN = wn.synsets(neighbor, self.args.wordnet_pos_tag)
        if not synsetsT:
            return 0  # XXX no synsets for `target`
        if not synsetsN:
            return 0  # XXX no synsets for `neighbor`
        # When `wn` returns None, we just say sim==0.  The substitution is
        # done per pair, *before* max(): comparing None with a number only
        # works through Python 2's implicit ordering and raises TypeError
        # on Python 3.
        return max(
            (wn.path_similarity(sT, sN) or 0)
            for sT in synsetsT for sN in synsetsN
        )


#####################################################

if __name__ == "__main__":
    # Script entry point: the command-line options configure the WnAdder
    # handler, which is then passed to csv.parse_csv.
    csv.parse_csv(
        WnAdder(parser.parse_args())
    )
                (columns, self.args.column_name)

    def handle_data(self, line, data_tuple):
        r"""Add this row's `column_name` value to the running statistics,
        starting a fresh `Statistics` whenever the discriminant changes."""
        discriminant = tuple(
            [getattr(data_tuple, name) for name in self.args.discriminate_by])

        if discriminant != self.current_discriminant:
            # Group boundary: report the finished group, then start anew.
            self.print_stats()
            self.current_discriminant = discriminant
            self.stats = statistics.Statistics()

        record = self.stats.add
        record(float(getattr(data_tuple, self.args.column_name)))

    def end(self):
        r"""Print the pending group and, if `print_global` is set,
        the "GLOBAL:" summary line."""
        self.print_stats()
        if not self.args.print_global:
            return
        overall = self.global_stats
        print("GLOBAL:", overall.n, overall.avg, overall.stddev_sample)

    def print_stats(self):
        r"""Print one tab-separated line for the current group (if any)
        and add the group's average to `global_stats`."""
        if self.stats is None:
            return  # Nothing collected yet
        fields = self.current_discriminant + (
            self.stats.n,
            self.stats.avg,
            self.stats.stddev_sample,
        )
        self.global_stats.add(self.stats.avg)
        print(*fields, sep="\t")


#####################################################

if __name__ == "__main__":
    # Script entry point: the command-line options configure the
    # StatsPrinter handler, which is then passed to csv.parse_csv.
    csv.parse_csv(
        StatsPrinter(parser.parse_args())
    )
示例#10
0
        sum_squares = [0] * len(self.header_names)
        for data_namedtuple in self.current_group:
            for i, elem in enumerate(data_namedtuple):
                try:
                    sum_squares[i] += float(elem)**2
                except ValueError:
                    pass  # Value cannot be converted to float

        denominators = [math.sqrt(s) for s in sum_squares]
        for data_namedtuple in self.current_group:
            print(*[self.divide(i, value, denom) for (i, (value, denom))
                    in enumerate(zip(data_namedtuple, denominators))], sep="\t")

        self.current_group[:] = []


    def divide(self, index, stringvalue, denominator):
        r"""Return float(stringvalue)/denominator.

        `stringvalue` is returned untouched when column `index` is not
        flagged in `self.header_chosen`, or when it cannot be converted
        to float.  When `denominator` is zero there is nothing to scale
        by, so the converted value is returned undivided instead of
        raising ZeroDivisionError.
        """
        if not self.header_chosen[index]:
            return stringvalue
        try:
            value = float(stringvalue)
        except ValueError:
            return stringvalue  # Value cannot be converted to float
        if not denominator:
            return value
        return value / denominator


#####################################################

if __name__ == "__main__":
    # Script entry point: the command-line options configure the
    # NormalizingPrinter handler, which is then passed to csv.parse_csv.
    csv.parse_csv(
        NormalizingPrinter(parser.parse_args())
    )
示例#11
0
                    sum_squares[i] += float(elem)**2
                except ValueError:
                    pass  # Value cannot be converted to float

        denominators = [math.sqrt(s) for s in sum_squares]
        for data_namedtuple in self.current_group:
            print(*[
                self.divide(i, value, denom)
                for (i,
                     (value,
                      denom)) in enumerate(zip(data_namedtuple, denominators))
            ],
                  sep="\t")

        self.current_group[:] = []

    def divide(self, index, stringvalue, denominator):
        r"""Return float(stringvalue)/denominator.

        `stringvalue` is returned untouched when column `index` is not
        flagged in `self.header_chosen`, or when it cannot be converted
        to float.  A zero `denominator` leaves the converted value
        undivided rather than raising ZeroDivisionError.
        """
        if not self.header_chosen[index]:
            return stringvalue
        try:
            number = float(stringvalue)
        except ValueError:
            return stringvalue  # Value cannot be converted to float
        # A zero denominator gives nothing to scale by.
        return number / denominator if denominator else number


#####################################################

if __name__ == "__main__":
    # Script entry point: wrap the parsed command-line options in a
    # NormalizingPrinter and give that handler to csv.parse_csv.
    csv.parse_csv(
        NormalizingPrinter(
            parser.parse_args()))
示例#12
0
    def handle_data(self, line, data_namedtuple):
        r"""Print the row plus its `wnpath` score while the current target
        has had no more than `best_k` consecutive rows."""
        target = data_namedtuple.target
        neighbor = data_namedtuple.neighbor

        same_target = not (target != self.current_target)
        if not same_target:
            # First row of a new target: reset the counter.
            self.current_target = target
            self.current_target_count = 0
        self.current_target_count += 1

        within_limit = self.current_target_count <= self.args.best_k
        if within_limit:
            wnpath = "{0:.10f}".format(self.wnpath(target, neighbor))
            encoded_row = "\t".join(data_namedtuple).encode('utf8')
            print(encoded_row, wnpath, sep="\t")

    def wnpath(self, target, neighbor):
        r"""Return the best path_similarity between
        `target` and `neighbor`.

        Synsets are looked up with `self.args.wordnet_pos_tag`.  The result
        is 0 when either word has no synsets, or when `wn` yields no
        similarity for any synset pair.
        """
        synsetsT = wn.synsets(target, self.args.wordnet_pos_tag)
        synsetsN = wn.synsets(neighbor, self.args.wordnet_pos_tag)
        if not synsetsT:
            return 0  # XXX no synsets for `target`
        if not synsetsN:
            return 0  # XXX no synsets for `neighbor`
        # When `wn` returns None, we just say sim==0.  Replace None pair by
        # pair so that max() never has to order None against a number
        # (a TypeError on Python 3).
        return max((wn.path_similarity(sT, sN) or 0)
                   for sT in synsetsT for sN in synsetsN)


#####################################################

if __name__ == "__main__":
    # Script entry point: wrap the parsed command-line options in a WnAdder
    # and give that handler to csv.parse_csv.
    csv.parse_csv(
        WnAdder(
            parser.parse_args()))
示例#13
0
    def handle_comment(self, line):
        r"""Print the comment line, encoded as UTF-8."""
        encoded = line.encode('utf8')
        print(encoded)

    def handle_header(self, line, header_list):
        r"""Print the header line with a trailing "w2v_cosine" column."""
        columns = (line.encode('utf8'), "w2v_cosine")
        print(*columns, sep="\t")

    def handle_data(self, line, data_namedtuple):
        r"""Print the row followed by its `compare` score, for at most
        `best_k` consecutive rows sharing the same target."""
        target, neighbor = data_namedtuple.target, data_namedtuple.neighbor

        if target != self.current_target:
            # A new run of rows begins: restart the per-target counter.
            self.current_target = target
            self.current_target_count = 0
        self.current_target_count += 1

        if self.current_target_count > self.args.best_k:
            return  # Already printed `best_k` rows for this target

        score = self.compare(target, neighbor)
        cosine = "{0:.10f}".format(score)
        row = "\t".join(data_namedtuple).encode('utf8')
        print(row + "\t" + cosine)

    def compare(self, target, neighbor):
        r"""Return `self.embedding_set.compare(target, neighbor)`,
        or 0.0 when that lookup raises KeyError."""
        try:
            similarity = self.embedding_set.compare(target, neighbor)
        except KeyError:
            similarity = 0.0
        return similarity


#####################################################

if __name__ == "__main__":
    # Script entry point: the command-line options configure the
    # EmbeddingsCmpAdder handler, which is then passed to csv.parse_csv.
    csv.parse_csv(
        EmbeddingsCmpAdder(parser.parse_args())
    )
示例#14
0
        sample_col_gold = self.columns_gold[self.parser_gold.colnames[0]]
        sample_col_pred = self.columns_pred[self.parser_pred.colnames[0]]
        missing_in_pred = [
            kG for kG in sample_col_gold if kG not in sample_col_pred
        ]
        missing_in_gold = [
            kP for kP in sample_col_pred if kP not in sample_col_gold
        ]

        if missing_in_gold:
            warn("{n} keys (e.g. `{key}`) not found in gold file; " \
                    "will use 0.0", n=len(missing_in_gold), key=missing_in_gold[0])

        if missing_in_pred:
            warn("{n} keys (e.g. `{key}`) not found in prediction file; " \
                    "will use avg(predictions)", n=len(missing_in_pred), key=missing_in_pred[0])


#####################################################

if __name__ == "__main__":
    # Script entry point: one NumValuesParser per file, each run through
    # csv.parse_csv, then both given to Main.
    args = parser.parse_args()
    parser_gold = NumValuesParser(
        id_col=args.gold_id_column,
        colnames=args.gold_value_columns,
    )
    parser_pred = NumValuesParser(
        id_col=args.pred_id_column,
        colnames=args.pred_value_columns,
        inverted_scales=args.inverted_scales,
    )
    csv.parse_csv(parser_gold, input_file=args.gold_file)
    csv.parse_csv(parser_pred, input_file=args.pred_file)
    Main(args, parser_gold, parser_pred).run()
示例#15
0
                (columns, self.args.column_name)

    def handle_data(self, line, data_tuple):
        r"""Feed the row's `column_name` value into the current statistics;
        a change of discriminant first prints and resets them."""
        key_columns = self.args.discriminate_by
        discriminant = tuple(
            [getattr(data_tuple, col_name) for col_name in key_columns])

        if discriminant != self.current_discriminant:
            # The discriminating columns changed: close the previous group.
            self.print_stats()
            self.current_discriminant = discriminant
            self.stats = statistics.Statistics()

        add_value = self.stats.add
        add_value(float(getattr(data_tuple, self.args.column_name)))

    def end(self):
        r"""Print the last group, then the "GLOBAL:" line when
        `print_global` is set."""
        self.print_stats()
        if not self.args.print_global:
            return
        totals = self.global_stats
        print("GLOBAL:", totals.n, totals.avg, totals.stddev_sample)

    def print_stats(self):
        r"""If statistics exist, print them after the discriminant values
        (tab-separated) and add their average to `global_stats`."""
        if self.stats is None:
            return  # No group has been started
        fields = self.current_discriminant + (
            self.stats.n,
            self.stats.avg,
            self.stats.stddev_sample,
        )
        self.global_stats.add(self.stats.avg)
        print(*fields, sep="\t")



#####################################################

if __name__ == "__main__":
    # Script entry point: wrap the parsed command-line options in a
    # StatsPrinter and give that handler to csv.parse_csv.
    csv.parse_csv(
        StatsPrinter(
            parser.parse_args()))
示例#16
0
    def handle_comment(self, line):
        r"""Echo a comment line as UTF-8 encoded output."""
        utf8_line = line.encode('utf8')
        print(utf8_line)

    def handle_header(self, line, header_list):
        r"""Echo the header line, adding a "w2v_cosine" column at the end."""
        utf8_line = line.encode('utf8')
        print(utf8_line, "w2v_cosine", sep="\t")

    def handle_data(self, line, data_namedtuple):
        r"""Print the row plus its `compare` score while the current target
        has had no more than `best_k` consecutive rows."""
        target = data_namedtuple.target
        neighbor = data_namedtuple.neighbor

        same_target = not (target != self.current_target)
        if not same_target:
            # First row of a new target: reset the counter.
            self.current_target = target
            self.current_target_count = 0
        self.current_target_count += 1

        within_limit = self.current_target_count <= self.args.best_k
        if within_limit:
            cosine = "{0:.10f}".format(self.compare(target, neighbor))
            encoded_row = "\t".join(data_namedtuple).encode('utf8')
            print(encoded_row + "\t" + cosine)

    def compare(self, target, neighbor):
        r"""Return the `embedding_set` comparison of `target` and
        `neighbor`; a KeyError from that call yields 0.0 instead."""
        result = 0.0
        try:
            result = self.embedding_set.compare(target, neighbor)
        except KeyError:
            pass  # Keep the 0.0 fallback
        return result


#####################################################

if __name__ == "__main__":
    # Script entry point: wrap the parsed command-line options in an
    # EmbeddingsCmpAdder and give that handler to csv.parse_csv.
    csv.parse_csv(
        EmbeddingsCmpAdder(
            parser.parse_args()))