) args = parser.parse_args() pgc_fns = multiglob(args.file) def log(s): if args.verbose: print >>sys.stderr, "***", s log("Reading corpus from " + pgc_fns[0]) corpus = ParallelGraphCorpus(inf=pgc_fns[0]) for fn in pgc_fns[1:]: log("Joining corpus from " + fn) # __iadd__ also checks if another corpus is compatible w.r.t. relations # and meta-data corpus += ParallelGraphCorpus(inf=fn) # Purge the corpus of duplicate graphbanks held in memory log("Purging corpus") corpus.purge() log("Writing corpus") corpus.write(pprint=args.format)
"--config", metavar="FILE", help="configuration file to set up a corpus aligner") parser.add_argument("-x", "--clear", action="store_true", help="remove all existing alignments") parser.add_argument("-i", "--in-place", action="store_true", help="modify input file(s)") args = parser.parse_args() if args.config: config = imp.load_source("config", args.config) corpus_aligner = set_up_corpus_aligner(config) else: from daeso_nl.ga.corpus import CorpusAligner corpus_aligner = CorpusAligner() for inf in multiglob(args.pgc_files): corpus = ParallelGraphCorpus(inf=inf) corpus_aligner.align(corpus, clear=args.clear) if args.in_place: corpus.write(outf=inf, pprint=True) else: corpus.write(pprint=True)
if not exists(source): stderr.write("warning: source " + repr(source) + " does not exist " "(not copied)\n") continue try: corpus = ParallelGraphCorpus(inf=source, graph_loading=LOAD_NONE) except Exception, inst: stderr.write(str(inst) + "\n") stderr.write("warning: source " + repr(source) + " is not a valid parallel graph corpus (not copied) \n") continue if isdir(args.target): target = join(args.target, basename(source)) else: target = args.target if exists(target) and samefile(source, target): stderr.write("warning: source " + repr(source) + " and target " + repr(target) + " are the same file (not copied) \n") continue if exists(target) and not args.overwrite: stderr.write("warning: target " + repr(target) + "exists " "(not copied); use --overwrite to force copy\n") continue corpus.write(outf=target)
class Aligner(object):
    """
    the Algraeph application model

    Keeps the loaded parallel graph corpus, the current graph pair and the
    current node selection. State changes are announced by calling send()
    with the originating bound method and a topic string.
    """

    def __init__(self):
        self._corpus = ParallelGraphCorpus()  # the domain model
        self._changed = False
        self._filename = None
        self._graph_pair = None
        self._graph_pair_index = None
        self._graphs = Pair(None, None)
        self._nodes = Pair(None, None)
        # the special relation which stands for "no relation"
        self._no_relation = "none"
        self._co_node_selection = False

    # ------------------------------------------------------------------------------
    # Corpus
    # ------------------------------------------------------------------------------

    def open_corpus(self, filename):
        """
        Read the corpus in filename, make it the current corpus and go to
        its first graph pair.

        Raises AlgraephException if the corpus read is empty; errors raised
        by reading itself are not caught here.
        """
        send(self.open_corpus, "statusDescription",
             "Loading corpus %s ..." % filename)

        # May raise errors such as IOErrors, not an xml file, corrupt format,
        # etc. Use of relax_gb_paths allows graphbank files to be located in
        # the same directory as the corpus file instead of the location
        # specified in the <file> element
        corpus = ParallelGraphCorpus()
        corpus.read(inf=filename, relax_gb_paths=True)

        if not corpus:
            raise AlgraephException(
                "Parallel graph corpus contains no alignments")

        # Only replace the current state once reading has succeeded
        self._corpus = corpus
        self._filename = filename
        self._changed = False

        send(self.open_corpus, "statusDescription")
        send(self.open_corpus, "newCorpus")
        send(self.open_corpus, "newCorpusName")

        self.goto_graph_pair(0)
        # implies send("newGraphPair"), and sets self._graph_pair,
        # self._graph_pair_index, self._graphs and self._nodes

    def save_corpus(self, filename=None):
        """
        Write the corpus to filename, or to the current filename if no
        filename is given, and clear the "changed" flag.
        """
        if filename:
            self._filename = filename
            send(self.save_corpus, "newCorpusName")

        send(self.save_corpus, "statusDescription",
             "Saving corpus %s ..." % self._filename)

        self._corpus.write(self._filename, pprint=True)
        self._changed = False

        send(self.save_corpus, "statusDescription")

    def get_corpus_len(self):
        """ returns the number of graph pairs in the corpus """
        return len(self._corpus)

    def get_corpus_filename(self):
        """ returns the current corpus filename, or None if there is none """
        return self._filename

    def get_corpus_dir(self):
        """
        returns the directory part of the corpus filename, or None if no
        filename is set
        """
        try:
            return dirname(self._filename)
        except (AttributeError, TypeError):
            # self._filename is still None
            return None

    def corpus_changed(self):
        """ returns True if the corpus has unsaved changes """
        return self._changed

    # ------------------------------------------------------------------------------
    # Treebanks
    # ------------------------------------------------------------------------------

    def get_graphbanks_format(self):
        """ returns the format of the source graphbank of the first pair """
        # The ParallelGraphCorpus class in principle supports graphbanks in
        # different formats, although untested for the time being. Formats
        # are therefore stored as a property of the graphbanks, but there is
        # no global format defined as a property of the corpus. So getting
        # "the graphbanks format" is not straightforward. We will make the
        # assumption that all graphbanks are in the same format, and
        # therefore it is sufficient to look at any graphbank linked to an
        # arbitrary graph pair.
        return self._corpus[0].get_source_bank().get_format()

    # ------------------------------------------------------------------------------
    # Graphs (GraphPair and DaesoGraph)
    # ------------------------------------------------------------------------------

    def get_graph_pair(self):
        """ returns the current graph pair """
        return self._graph_pair

    def goto_prev_graph_pair(self):
        """ go to the graph pair before the current one, if any """
        self.goto_graph_pair(self._graph_pair_index - 1)

    def goto_next_graph_pair(self):
        """ go to the graph pair after the current one, if any """
        self.goto_graph_pair(self._graph_pair_index + 1)

    def goto_graph_pair(self, index):
        """
        Make the graph pair at index the current one and reset the node
        selection; an index outside the corpus is silently ignored.
        """
        # don't use try-except here, because negative index is allowed for
        # list
        if 0 <= index < len(self._corpus):
            self._graph_pair = self._corpus[index]
            self._graph_pair_index = index
            self._graphs = self._graph_pair.get_graphs()
            self._nodes = Pair(None, None)
            send(self.goto_graph_pair, "newGraphPair.viz")
            send(self.goto_graph_pair, "newGraphPair.gui")

    def get_from_graph(self):
        """ returns the source graph of the current graph pair """
        return self._graphs.source

    def get_to_graph(self):
        """ returns the target graph of the current graph pair """
        return self._graphs.target

    def get_from_graph_tokens(self):
        """ returns the token string of the source graph """
        return self._graphs.source.get_graph_token_string()

    def get_to_graph_tokens(self):
        """ returns the token string of the target graph """
        return self._graphs.target.get_graph_token_string()

    def get_graph_pair_counter(self):
        """ returns a tuple (number of current graph pair, corpus size) """
        # counting starts from 1
        return (self._graph_pair_index + 1, len(self._corpus))

    # ------------------------------------------------------------------------------
    # Nodes
    # ------------------------------------------------------------------------------

    def co_node_selection_mode(self, state=False):
        """
        Switch co-node selection on or off. When on, selecting a node on one
        side also selects the node aligned to it on the other side.
        """
        self._co_node_selection = state

    def set_from_node(self, node=None):
        """ select node as the 'from' node (None clears the selection) """
        self._nodes.source = node

        if self._co_node_selection:
            self._nodes.target = self.get_aligned_to_node()

        send(self.set_from_node, "newNodeSelect.viz")
        send(self.set_from_node, "newNodeSelect.gui")

    def set_to_node(self, node=None):
        """ select node as the 'to' node (None clears the selection) """
        self._nodes.target = node

        if self._co_node_selection:
            self._nodes.source = self.get_aligned_from_node()

        send(self.set_to_node, "newNodeSelect.viz")
        send(self.set_to_node, "newNodeSelect.gui")

    def get_from_node(self):
        """ returns the selected 'from' node """
        return self._nodes.source

    def get_to_node(self):
        """ returns the selected 'to' node """
        return self._nodes.target

    def nodes_are_selected(self):
        """ returns True if both a 'from' and a 'to' node are selected """
        return all(self._nodes)

    def get_from_node_tokens(self):
        """ returns the token string of the 'from' node, or "" if none """
        return (self._graphs.source.get_node_token_string(self._nodes.source)
                or "")

    def get_to_node_tokens(self):
        """ returns the token string of the 'to' node, or "" if none """
        return (self._graphs.target.get_node_token_string(self._nodes.target)
                or "")

    # ------------------------------------------------------------------------------
    # Alignment
    # ------------------------------------------------------------------------------

    def get_relation_set(self):
        """
        returns the list of relations, starting with the special "no
        relation" value
        """
        try:
            return [self._no_relation] + self._corpus.get_relations()
        except TypeError:
            # get_relations() returned something that is not a list
            return [self._no_relation]

    def get_node_pair_relation(self):
        """
        returns the relation between the selected nodes, or the special "no
        relation" value if they are not aligned
        """
        return self._graph_pair.get_align(self._nodes) or self._no_relation

    def set_node_pair_relation(self, relation):
        """
        Align the selected nodes with relation, or remove their alignment if
        relation is the special "no relation" value. Does nothing unless
        both nodes are selected.
        """
        if self.nodes_are_selected():
            if relation != self._no_relation:
                self._graph_pair.add_align(self._nodes, relation)
            else:
                self._graph_pair.del_align(self._nodes)

            self._changed = True
            send(self.set_node_pair_relation, "newRelation.viz")
            send(self.set_node_pair_relation, "newRelation.gui")

    def get_aligned_to_node(self):
        """
        Get 'to' node aligned to the selected 'from' node
        """
        return self._graph_pair.get_aligned_target_node(self._nodes.source)

    def get_aligned_from_node(self):
        """
        Get 'from' node aligned to the selected 'to' node
        """
        return self._graph_pair.get_aligned_source_node(self._nodes.target)

    def get_auto_fold_equal_nodes(self):
        """
        Get lists of non-terminal 'from' and 'to' nodes aligned with an
        'equals' relation
        """
        # ignoring terminals, so the list may be of unequal size
        from_nodes = []
        to_nodes = []

        for (nodes, rel) in self._graph_pair.alignments_iter():
            if rel == "equals":
                if self._graphs.source.node_is_non_terminal(nodes.source):
                    from_nodes.append(nodes.source)
                if self._graphs.target.node_is_non_terminal(nodes.target):
                    to_nodes.append(nodes.target)

        return from_nodes, to_nodes

    # ------------------------------------------------------------------------------
    # Comments
    # ------------------------------------------------------------------------------

    def get_comment(self):
        """
        returns the comment on the current graph pair, or "" if there is no
        graph pair, no comment element, or a comment element without text
        """
        try:
            # an element without text has text None; always return a string
            return self._graph_pair.get_meta_data().find("comment").text or ""
        except AttributeError:
            # no current graph pair, or no <comment> element
            return ""

    def set_comment(self, text):
        """
        Store text as the comment on the current graph pair; text that is
        empty or only whitespace removes an existing comment element.
        """
        meta_data_elem = self._graph_pair.get_meta_data()
        comment_elem = meta_data_elem.find("comment")

        if text.strip():
            if comment_elem is None:
                comment_elem = SubElement(meta_data_elem, "comment")
            comment_elem.text = text
        elif comment_elem is not None:
            # Compare with None explicitly: the truth value of an element
            # reflects whether it has children, so a plain truth test would
            # never remove a childless <comment> element.
            meta_data_elem.remove(comment_elem)

        self._changed = True
# Write each source corpus to the target. A source that cannot be copied is
# reported on stderr and skipped, so the remaining sources are still handled.
for source in args.source:
    if not exists(source):
        stderr.write("warning: source " + repr(source) +
                     " does not exist (not copied)\n")
        continue

    try:
        corpus = ParallelGraphCorpus(inf=source, graph_loading=LOAD_NONE)
    except Exception as inst:
        # Any failure to read the source is reported as "not a valid
        # corpus"; the original error text is printed first.
        stderr.write(str(inst) + "\n")
        stderr.write("warning: source " + repr(source) +
                     " is not a valid parallel graph corpus (not copied) \n")
        continue

    # A directory target receives a file named after the source
    if isdir(args.target):
        target = join(args.target, basename(source))
    else:
        target = args.target

    # samefile() needs both paths to exist, hence the exists() guard
    if exists(target) and samefile(source, target):
        stderr.write("warning: source " + repr(source) +
                     " and target " + repr(target) +
                     " are the same file (not copied) \n")
        continue

    if exists(target) and not args.overwrite:
        stderr.write("warning: target " + repr(target) +
                     " exists (not copied); use --overwrite to force copy\n")
        continue

    corpus.write(outf=target)
parser.add_argument("-x", "--clear",
                    action="store_true",
                    help="remove all existing alignments")

parser.add_argument("-i", "--in-place",
                    action="store_true",
                    help="modify input file(s)")

args = parser.parse_args()

# Without a configuration file the default CorpusAligner is used; otherwise
# the aligner is built from the loaded configuration module.
if not args.config:
    from daeso_nl.ga.corpus import CorpusAligner
    corpus_aligner = CorpusAligner()
else:
    config = imp.load_source("config", args.config)
    corpus_aligner = set_up_corpus_aligner(config)

for pgc_path in multiglob(args.pgc_files):
    corpus = ParallelGraphCorpus(inf=pgc_path)
    corpus_aligner.align(corpus, clear=args.clear)

    # With --in-place the input file is named as the output file; without
    # it no output file is passed to write() at all.
    write_options = {"pprint": True}
    if args.in_place:
        write_options["outf"] = pgc_path
    corpus.write(**write_options)
class Aligner(object):
    """
    the Algraeph application model

    Keeps the loaded parallel graph corpus, the current graph pair and the
    current node selection. State changes are announced by calling send()
    with the originating bound method and a topic string.
    """

    def __init__(self):
        self._corpus = ParallelGraphCorpus()  # the domain model
        self._changed = False
        self._filename = None
        self._graph_pair = None
        self._graph_pair_index = None
        self._graphs = Pair(None, None)
        self._nodes = Pair(None, None)
        # the special relation which stands for "no relation"
        self._no_relation = "none"
        self._co_node_selection = False

    # ------------------------------------------------------------------------------
    # Corpus
    # ------------------------------------------------------------------------------

    def open_corpus(self, filename):
        """
        Read the corpus in filename, make it the current corpus and go to
        its first graph pair.

        Raises AlgraephException if the corpus read is empty; errors raised
        by reading itself are not caught here.
        """
        send(self.open_corpus, "statusDescription",
             "Loading corpus %s ..." % filename)

        # May raise errors such as IOErrors, not an xml file, corrupt format,
        # etc. Use of relax_gb_paths allows graphbank files to be located in
        # the same directory as the corpus file instead of the location
        # specified in the <file> element
        corpus = ParallelGraphCorpus()
        corpus.read(inf=filename, relax_gb_paths=True)

        if not corpus:
            raise AlgraephException(
                "Parallel graph corpus contains no alignments")

        # Only replace the current state once reading has succeeded
        self._corpus = corpus
        self._filename = filename
        self._changed = False

        send(self.open_corpus, "statusDescription")
        send(self.open_corpus, "newCorpus")
        send(self.open_corpus, "newCorpusName")

        self.goto_graph_pair(0)
        # implies send("newGraphPair"), and sets self._graph_pair,
        # self._graph_pair_index, self._graphs and self._nodes

    def save_corpus(self, filename=None):
        """
        Write the corpus to filename, or to the current filename if no
        filename is given, and clear the "changed" flag.
        """
        if filename:
            self._filename = filename
            send(self.save_corpus, "newCorpusName")

        send(self.save_corpus, "statusDescription",
             "Saving corpus %s ..." % self._filename)

        self._corpus.write(self._filename, pprint=True)
        self._changed = False

        send(self.save_corpus, "statusDescription")

    def get_corpus_len(self):
        """ returns the number of graph pairs in the corpus """
        return len(self._corpus)

    def get_corpus_filename(self):
        """ returns the current corpus filename, or None if there is none """
        return self._filename

    def get_corpus_dir(self):
        """
        returns the directory part of the corpus filename, or None if no
        filename is set
        """
        try:
            return dirname(self._filename)
        except (AttributeError, TypeError):
            # self._filename is still None
            return None

    def corpus_changed(self):
        """ returns True if the corpus has unsaved changes """
        return self._changed

    # ------------------------------------------------------------------------------
    # Treebanks
    # ------------------------------------------------------------------------------

    def get_graphbanks_format(self):
        """ returns the format of the source graphbank of the first pair """
        # The ParallelGraphCorpus class in principle supports graphbanks in
        # different formats, although untested for the time being. Formats
        # are therefore stored as a property of the graphbanks, but there is
        # no global format defined as a property of the corpus. So getting
        # "the graphbanks format" is not straightforward. We will make the
        # assumption that all graphbanks are in the same format, and
        # therefore it is sufficient to look at any graphbank linked to an
        # arbitrary graph pair.
        return self._corpus[0].get_source_bank().get_format()

    # ------------------------------------------------------------------------------
    # Graphs (GraphPair and DaesoGraph)
    # ------------------------------------------------------------------------------

    def get_graph_pair(self):
        """ returns the current graph pair """
        return self._graph_pair

    def goto_prev_graph_pair(self):
        """ go to the graph pair before the current one, if any """
        self.goto_graph_pair(self._graph_pair_index - 1)

    def goto_next_graph_pair(self):
        """ go to the graph pair after the current one, if any """
        self.goto_graph_pair(self._graph_pair_index + 1)

    def goto_graph_pair(self, index):
        """
        Make the graph pair at index the current one and reset the node
        selection; an index outside the corpus is silently ignored.
        """
        # don't use try-except here, because negative index is allowed for
        # list
        if 0 <= index < len(self._corpus):
            self._graph_pair = self._corpus[index]
            self._graph_pair_index = index
            self._graphs = self._graph_pair.get_graphs()
            self._nodes = Pair(None, None)
            send(self.goto_graph_pair, "newGraphPair.viz")
            send(self.goto_graph_pair, "newGraphPair.gui")

    def get_from_graph(self):
        """ returns the source graph of the current graph pair """
        return self._graphs.source

    def get_to_graph(self):
        """ returns the target graph of the current graph pair """
        return self._graphs.target

    def get_from_graph_tokens(self):
        """ returns the token string of the source graph """
        return self._graphs.source.get_graph_token_string()

    def get_to_graph_tokens(self):
        """ returns the token string of the target graph """
        return self._graphs.target.get_graph_token_string()

    def get_graph_pair_counter(self):
        """ returns a tuple (number of current graph pair, corpus size) """
        # counting starts from 1
        return (self._graph_pair_index + 1, len(self._corpus))

    # ------------------------------------------------------------------------------
    # Nodes
    # ------------------------------------------------------------------------------

    def co_node_selection_mode(self, state=False):
        """
        Switch co-node selection on or off. When on, selecting a node on one
        side also selects the node aligned to it on the other side.
        """
        self._co_node_selection = state

    def set_from_node(self, node=None):
        """ select node as the 'from' node (None clears the selection) """
        self._nodes.source = node

        if self._co_node_selection:
            self._nodes.target = self.get_aligned_to_node()

        send(self.set_from_node, "newNodeSelect.viz")
        send(self.set_from_node, "newNodeSelect.gui")

    def set_to_node(self, node=None):
        """ select node as the 'to' node (None clears the selection) """
        self._nodes.target = node

        if self._co_node_selection:
            self._nodes.source = self.get_aligned_from_node()

        send(self.set_to_node, "newNodeSelect.viz")
        send(self.set_to_node, "newNodeSelect.gui")

    def get_from_node(self):
        """ returns the selected 'from' node """
        return self._nodes.source

    def get_to_node(self):
        """ returns the selected 'to' node """
        return self._nodes.target

    def nodes_are_selected(self):
        """ returns True if both a 'from' and a 'to' node are selected """
        return all(self._nodes)

    def get_from_node_tokens(self):
        """ returns the token string of the 'from' node, or "" if none """
        return (self._graphs.source.get_node_token_string(self._nodes.source)
                or "")

    def get_to_node_tokens(self):
        """ returns the token string of the 'to' node, or "" if none """
        return (self._graphs.target.get_node_token_string(self._nodes.target)
                or "")

    # ------------------------------------------------------------------------------
    # Alignment
    # ------------------------------------------------------------------------------

    def get_relation_set(self):
        """
        returns the list of relations, starting with the special "no
        relation" value
        """
        try:
            return [self._no_relation] + self._corpus.get_relations()
        except TypeError:
            # get_relations() returned something that is not a list
            return [self._no_relation]

    def get_node_pair_relation(self):
        """
        returns the relation between the selected nodes, or the special "no
        relation" value if they are not aligned
        """
        return self._graph_pair.get_align(self._nodes) or self._no_relation

    def set_node_pair_relation(self, relation):
        """
        Align the selected nodes with relation, or remove their alignment if
        relation is the special "no relation" value. Does nothing unless
        both nodes are selected.
        """
        if self.nodes_are_selected():
            if relation != self._no_relation:
                self._graph_pair.add_align(self._nodes, relation)
            else:
                self._graph_pair.del_align(self._nodes)

            self._changed = True
            send(self.set_node_pair_relation, "newRelation.viz")
            send(self.set_node_pair_relation, "newRelation.gui")

    def get_aligned_to_node(self):
        """
        Get 'to' node aligned to the selected 'from' node
        """
        return self._graph_pair.get_aligned_target_node(self._nodes.source)

    def get_aligned_from_node(self):
        """
        Get 'from' node aligned to the selected 'to' node
        """
        return self._graph_pair.get_aligned_source_node(self._nodes.target)

    def get_auto_fold_equal_nodes(self):
        """
        Get lists of non-terminal 'from' and 'to' nodes aligned with an
        'equals' relation
        """
        # ignoring terminals, so the list may be of unequal size
        from_nodes = []
        to_nodes = []

        for (nodes, rel) in self._graph_pair.alignments_iter():
            if rel == "equals":
                if self._graphs.source.node_is_non_terminal(nodes.source):
                    from_nodes.append(nodes.source)
                if self._graphs.target.node_is_non_terminal(nodes.target):
                    to_nodes.append(nodes.target)

        return from_nodes, to_nodes

    # ------------------------------------------------------------------------------
    # Comments
    # ------------------------------------------------------------------------------

    def get_comment(self):
        """
        returns the comment on the current graph pair, or "" if there is no
        graph pair, no comment element, or a comment element without text
        """
        try:
            # an element without text has text None; always return a string
            return self._graph_pair.get_meta_data().find("comment").text or ""
        except AttributeError:
            # no current graph pair, or no <comment> element
            return ""

    def set_comment(self, text):
        """
        Store text as the comment on the current graph pair; text that is
        empty or only whitespace removes an existing comment element.
        """
        meta_data_elem = self._graph_pair.get_meta_data()
        comment_elem = meta_data_elem.find("comment")

        if text.strip():
            if comment_elem is None:
                comment_elem = SubElement(meta_data_elem, "comment")
            comment_elem.text = text
        elif comment_elem is not None:
            # Compare with None explicitly: the truth value of an element
            # reflects whether it has children, so a plain truth test would
            # never remove a childless <comment> element.
            meta_data_elem.remove(comment_elem)

        self._changed = True
"--verbose", action="store_true", help="verbose ouput to stderr") args = parser.parse_args() pgc_fns = multiglob(args.file) def log(s): if args.verbose: print >> sys.stderr, "***", s log("Reading corpus from " + pgc_fns[0]) corpus = ParallelGraphCorpus(inf=pgc_fns[0]) for fn in pgc_fns[1:]: log("Joining corpus from " + fn) # __iadd__ also checks if another corpus is compatible w.r.t. relations # and meta-data corpus += ParallelGraphCorpus(inf=fn) # Purge the corpus of duplicate graphbanks held in memory log("Purging corpus") corpus.purge() log("Writing corpus") corpus.write(pprint=args.format)