def read_corpus(corpus):
    """
    Load a parallel text corpus together with its two document trees

    @param corpus: either a HitaextDoc instance, which is used as is, or a
    value that is passed as the C{file} argument of the HitaextDoc
    constructor

    @return: tuple of (corpus, from_tree, to_tree); C{update()} has been
    called on both trees and C{corpus.inject_alignments()} has been called
    on the pair
    """
    if isinstance(corpus, HitaextDoc):
        doc = corpus
    else:
        doc = HitaextDoc(file=corpus)

    # the "from" side is fetched and updated before the "to" side
    source_tree = doc.get_doc_tree("from")
    source_tree.update()
    target_tree = doc.get_doc_tree("to")
    target_tree.update()

    doc.inject_alignments(source_tree, target_tree)
    return doc, source_tree, target_tree
def eval_alignment(filename_pairs, tag, labels=None):
    """
    Print an evaluation of the alignment w.r.t. tag

    One table row is printed per pair of corpora, followed by a final row
    with the scores computed over the summed counts.

    @param filename_pairs: list of filename pairs consisting of a true and a
    predicted parallel text corpus

    @param tag: only alignments involving this tag are considered

    @keyword labels: list of string labels, one per filename pair; when
    empty or None, the basename of the second filename of each pair is used
    """
    rule = 128 * "-"
    overall_true = overall_pred = overall_common = 0

    # Every print below receives a single parenthesised string, so it behaves
    # identically as a Python 2 print statement and as the Python 3 print
    # function.
    print(
        " #true: #pred: #common: ratio:"
        " prec: rec: f-score: label:")
    print(rule)

    if not labels:
        labels = [os.path.basename(pair[1]) for pair in filename_pairs]

    # zip stops at the shorter sequence, so surplus pairs or labels are
    # silently ignored
    for (true_fn, pred_fn), label in zip(filename_pairs, labels):
        true_corpus = HitaextDoc(file=true_fn)
        pred_corpus = HitaextDoc(file=pred_fn)

        n_true, n_pred, n_common = count_alignment(true_corpus, pred_corpus,
                                                   tag)
        overall_true += n_true
        overall_pred += n_pred
        overall_common += n_common

        ratio, prec, rec, f = compute_scores(n_true, n_pred, n_common)
        print("%10d%10d%10d%10.2f%10.2f%10.2f%10.2f %s" % (
            n_true, n_pred, n_common, ratio, prec, rec, f, label))

    # summary row over the accumulated counts
    ratio, prec, rec, f = compute_scores(overall_true, overall_pred,
                                         overall_common)
    print(rule)
    print("%10d%10d%10d%10.2f%10.2f%10.2f%10.2f" % (
        overall_true, overall_pred, overall_common, ratio, prec, rec, f))
)
# NOTE(review): the ")" above closes a call whose start lies outside this
# fragment
parser.add_argument(
    "-V", "--verbose", action="store_true", help="verbose output"
)
args = parser.parse_args()

if args.verbose:
    print >>stderr, "Reading corpus from", args.corpus

# load the corpus and fetch the document tree of each side
corpus = HitaextDoc(file=args.corpus)
from_tree = corpus.get_doc_tree("from")
to_tree = corpus.get_doc_tree("to")
from_tree.update()
to_tree.update()

# inject the existing alignments into the trees, set the "method" attribute
# of the alignment to "id", then extract the alignments from the trees again
# NOTE(review): presumably this re-expresses the alignments in terms of
# element ids -- confirm against HitaextDoc
corpus.inject_alignments(from_tree, to_tree)
corpus.alignment.set("method", "id")
corpus.extract_alignments(from_tree, to_tree)

if args.verbose:
    print >>stderr, "Writing corpus to", args.corpus
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a
    pair of graphbanks

    Alignments are skipped when their tags differ from the focus tags, when
    either side has more than max_token_len tokens, or when min_token_diff
    is non-zero and the token difference of the two sides is below it.

    @param text_corpus_file: parallel text corpus filename

    @param source_graphbank_file: source graphbank filename

    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags

    @keyword graph_formats: pair of graphbank formats

    @keyword relations: list of alignment relations

    @keyword min_token_diff: minimum number of different tokens

    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # only alignments between the two focus tags are of interest
        if (alignment.get("from_tag") != focus_tags.source or
                alignment.get("to_tag") != focus_tags.target):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks, so each id is read once and used
        # both for the token lookup and for the graph lookup
        source_id = alignment.get("from_id")
        target_id = alignment.get("to_id")

        source_tokens = _get_elem_tokens(doc_trees.source,
                                         focus_tags.source,
                                         source_id)
        target_tokens = _get_elem_tokens(doc_trees.target,
                                         focus_tags.target,
                                         target_id)

        if (len(source_tokens) > max_token_len or
                len(target_tokens) > max_token_len):
            continue

        # a min_token_diff of zero disables the difference filter
        if (min_token_diff and
                _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))
        graph_corpus.append(GraphPair(graph_banks, graphs))

    return graph_corpus
parser.add_argument(
    "-V", "--verbose", action="store_true", help="verbose output"
)
args = parser.parse_args()

# process every Hitaext document given on the command line
for fn in args.corpus:
    # progress messages are shown in verbose mode as well as in test mode
    if args.verbose or args.test:
        print >>stderr, "Reading Hitaext document", fn
    htdoc = HitaextDoc(file=fn)

    for side in ("from", "to"):
        path = htdoc.get_filename(side)
        if args.verbose or args.test:
            print >>stderr, "Current %s path is %s" % (side, path)

        # a heuristic to deal with windows paths
        # NOTE(review): the nesting of the two statements below was
        # reconstructed from whitespace-collapsed source; confirm that the
        # backslash replacement belongs inside this branch
        # NOTE(review): indexing path[0] and path[1] assumes the path has at
        # least two characters
        if path[0] in uppercase and path[1] == ":":
            # strip drive letter
            path = path[2:]
            path = path.replace("\\", "/")

        # keep only the file name and place it under the directory given
        # by args.dir
        path = join(args.dir, basename(path))
# The description of the positional argument belongs in help=; a default is
# never used for a required nargs="+" positional.
parser.add_argument("corpus", nargs="+", help="parallel text corpus")
parser.add_argument("-V", "--verbose", action="store_true",
                    help="verbose output")
args = parser.parse_args()

if args.verbose:
    print >> stderr, "Reading corpus from", args.corpus

# NOTE(review): nargs="+" makes args.corpus a list, yet it is passed here as
# a single file argument -- confirm that HitaextDoc accepts this, or process
# the files one at a time
corpus = HitaextDoc(file=args.corpus)

# fetch the document tree of each side and update it
from_tree = corpus.get_doc_tree("from")
to_tree = corpus.get_doc_tree("to")
from_tree.update()
to_tree.update()

# inject the existing alignments into the trees, set the "method" attribute
# of the alignment to "id", then extract the alignments from the trees again
corpus.inject_alignments(from_tree, to_tree)
corpus.alignment.set("method", "id")
corpus.extract_alignments(from_tree, to_tree)

if args.verbose:
    print >> stderr, "Writing corpus to", args.corpus
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a
    pair of graphbanks

    Alignments are skipped when their tags differ from the focus tags, when
    either side has more than max_token_len tokens, or when min_token_diff
    is non-zero and the token difference of the two sides is below it.

    @param text_corpus_file: parallel text corpus filename

    @param source_graphbank_file: source graphbank filename

    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags

    @keyword graph_formats: pair of graphbank formats

    @keyword relations: list of alignment relations

    @keyword min_token_diff: minimum number of different tokens

    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # only alignments between the two focus tags are of interest
        if (alignment.get("from_tag") != focus_tags.source or
                alignment.get("to_tag") != focus_tags.target):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks, so each id is read once and used
        # both for the token lookup and for the graph lookup
        source_id = alignment.get("from_id")
        target_id = alignment.get("to_id")

        source_tokens = _get_elem_tokens(doc_trees.source,
                                         focus_tags.source,
                                         source_id)
        target_tokens = _get_elem_tokens(doc_trees.target,
                                         focus_tags.target,
                                         target_id)

        if (len(source_tokens) > max_token_len or
                len(target_tokens) > max_token_len):
            continue

        # a min_token_diff of zero disables the difference filter
        if (min_token_diff and
                _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        graphs = Pair(source_bank.get_graph(source_id),
                      target_bank.get_graph(target_id))
        graph_corpus.append(GraphPair(graph_banks, graphs))

    return graph_corpus
"--test", action="store_true",
help="perform a dry run without actually changing the files (implies -v)")
# NOTE(review): the two lines above are the tail of a call whose start lies
# outside this fragment
parser.add_argument("-V", "--verbose", action="store_true",
                    help="verbose output")
args = parser.parse_args()

# process every Hitaext document given on the command line
for fn in args.corpus:
    # progress messages are shown in verbose mode as well as in test mode
    if args.verbose or args.test:
        print >> stderr, "Reading Hitaext document", fn
    htdoc = HitaextDoc(file=fn)

    for side in ("from", "to"):
        path = htdoc.get_filename(side)
        if args.verbose or args.test:
            print >> stderr, "Current %s path is %s" % (side, path)

        # a heuristic to deal with windows paths
        # NOTE(review): the nesting of the two statements below was
        # reconstructed from whitespace-collapsed source; confirm that the
        # backslash replacement belongs inside this branch
        # NOTE(review): indexing path[0] and path[1] assumes the path has at
        # least two characters
        if path[0] in uppercase and path[1] == ":":
            # strip drive letter
            path = path[2:]
            path = path.replace("\\", "/")

        # keep only the file name and place it under the directory given
        # by args.dir
        path = join(args.dir, basename(path))