def pgc_stats(files, with_empty_nodes=False, with_failed_parses=False,
              with_punc=False, with_unaligned_roots=False, threshold=0,
              with_unaligned_graphs=False):
    """
    Collect statistics over a list of parallel graph corpus files.

    Each file in files is read into a fresh ParallelGraphCorpus with
    graph_loading=LOAD_ALL. One row of the corpus table is filled per file
    (via graph_pair_stats) and one row of the graphbank table is filled per
    graphbank of that corpus (via graph_stats). The option flags and the
    threshold are passed through unchanged to both helpers;
    with_unaligned_graphs is passed to graph_stats only.

    Returns the tuple (pgc_table, gb_table) after both tables have been
    summarized.
    """
    pgc_table = PgcStatsTable(size=len(files))
    gb_table = GbStatsTable()

    # Options shared by both helpers, in the positional order they expect.
    pair_opts = (with_empty_nodes, with_failed_parses, with_punc,
                 with_unaligned_roots, threshold)
    graph_opts = pair_opts + (with_unaligned_graphs,)

    # Graphbank rows are numbered consecutively across all corpus files.
    first_gb_row = 0

    for file_row, path in enumerate(files):
        corpus = ParallelGraphCorpus()
        corpus.read(inf=path, graph_loading=LOAD_ALL)
        graph_pair_stats(os.path.basename(path), corpus, pgc_table, file_row,
                         *pair_opts)

        banks = corpus._graphbanks()
        bank_count = len(banks)
        gb_table.grow(bank_count)

        # Somewhat messy to process pgc and gb files intertwined, but
        # otherwise all graphbanks would have to be kept in memory.
        for bank_row, bank in enumerate(banks, first_gb_row):
            graph_stats(bank, gb_table, bank_row, *graph_opts)

        first_gb_row += bank_count

    pgc_table.summarize()
    gb_table.summarize()

    return pgc_table, gb_table
def test_parser_load_relaxed(self):
    """
    Check loading of a corpus with relax_gb_paths=True.

    The corpus file and its source and target graphbank files are copied
    side by side into a scratch directory, and the corpus is read from
    there with relax_gb_paths=True. Every graph of every graph pair must
    then be a real graph (not a GraphStub), and every bank must be a
    SparseGraphBank of length 3.
    """
    # Use a private scratch directory rather than fixed file names in the
    # shared system temp directory: this cannot overwrite or delete
    # unrelated files that happen to have the same names.
    tmp_dir = tempfile.mkdtemp()

    try:
        corpus_path = os.path.join(tmp_dir, "corpus-2.pgc")

        shutil.copy("data/corpus-2.pgc", corpus_path)
        shutil.copy("../gb/data/source-gb-2.xml",
                    os.path.join(tmp_dir, "source-gb-2.xml"))
        shutil.copy("../gb/data/target-gb-2.xml",
                    os.path.join(tmp_dir, "target-gb-2.xml"))

        pg_corpus = ParallelGraphCorpus()
        pg_corpus.read(corpus_path, relax_gb_paths=True)

        for graph_pair in pg_corpus:
            for graph in graph_pair._graphs:
                self.assertFalse(isinstance(graph, GraphStub))

            for bank in graph_pair._banks:
                self.assertEqual(bank.__class__, SparseGraphBank)
                self.assertEqual(len(bank), 3)
    finally:
        # Clean up even when a copy, the read or an assertion fails, so
        # that a failing run does not leave fixture files behind.
        shutil.rmtree(tmp_dir)
def open_corpus(self, filename):
    """
    Load the parallel graph corpus in filename and make it the current one.

    A "statusDescription" message announcing the load is sent first. If the
    loaded corpus is empty (falsy), AlgraephException is raised and the
    current state is left untouched. Otherwise the corpus and filename are
    stored, the changed flag is reset, "statusDescription", "newCorpus" and
    "newCorpusName" are sent, and the first graph pair is selected.

    Reading may raise errors such as IOError, not an xml file, corrupt
    format, etc.; these are not caught here.
    """
    loading_msg = "Loading corpus %s ..." % filename
    send(self.open_corpus, "statusDescription", loading_msg)

    # relax_gb_paths allows graphbank files to be located in the same
    # directory as the corpus file instead of the location specified in
    # the <file> element.
    loaded = ParallelGraphCorpus()
    loaded.read(inf=filename, relax_gb_paths=True)

    if not loaded:
        raise AlgraephException("Parallel graph corpus contains no alignments")

    self._corpus = loaded
    self._filename = filename
    self._changed = False

    # Sent without a message argument, in this fixed order.
    for signal in ("statusDescription", "newCorpus", "newCorpusName"):
        send(self.open_corpus, signal)

    self.goto_graph_pair(0)
def open_corpus(self, filename):
    """
    Read a parallel graph corpus from filename and install it as current.

    Sends "statusDescription" with a loading message before reading. An
    empty (falsy) corpus results in an AlgraephException and no state
    change. On success the corpus, its filename and a cleared changed flag
    are stored; then "statusDescription", "newCorpus" and "newCorpusName"
    are sent and graph pair 0 becomes the current pair.

    Errors raised while reading (such as IOError, not an xml file, corrupt
    format, etc.) propagate to the caller.
    """
    send(self.open_corpus, "statusDescription",
         "Loading corpus %s ..." % filename)

    # With relax_gb_paths, graphbank files may be located in the same
    # directory as the corpus file instead of the location specified in
    # the <file> element.
    new_corpus = ParallelGraphCorpus()
    new_corpus.read(inf=filename, relax_gb_paths=True)

    if new_corpus:
        self._corpus = new_corpus
        self._filename = filename
        self._changed = False
    else:
        raise AlgraephException(
            "Parallel graph corpus contains no alignments")

    # Sent without a message argument, in this fixed order.
    for topic in ("statusDescription", "newCorpus", "newCorpusName"):
        send(self.open_corpus, topic)

    self.goto_graph_pair(0)
def test_parser_load_relaxed(self):
    """
    Check loading of a corpus with relax_gb_paths=True.

    The corpus file and its source and target graphbank files are copied
    side by side into a scratch directory, and the corpus is read from
    there with relax_gb_paths=True. Every graph of every graph pair must
    then be a real graph (not a GraphStub), and every bank must be a
    SparseGraphBank of length 3.
    """
    # Use a private scratch directory rather than fixed file names in the
    # shared system temp directory: this cannot overwrite or delete
    # unrelated files that happen to have the same names.
    tmp_dir = tempfile.mkdtemp()

    try:
        corpus_path = os.path.join(tmp_dir, "corpus-2.pgc")

        shutil.copy("data/corpus-2.pgc", corpus_path)
        shutil.copy("../gb/data/source-gb-2.xml",
                    os.path.join(tmp_dir, "source-gb-2.xml"))
        shutil.copy("../gb/data/target-gb-2.xml",
                    os.path.join(tmp_dir, "target-gb-2.xml"))

        pg_corpus = ParallelGraphCorpus()
        pg_corpus.read(corpus_path, relax_gb_paths=True)

        for graph_pair in pg_corpus:
            for graph in graph_pair._graphs:
                self.assertFalse(isinstance(graph, GraphStub))

            for bank in graph_pair._banks:
                self.assertEqual(bank.__class__, SparseGraphBank)
                self.assertEqual(len(bank), 3)
    finally:
        # Clean up even when a copy, the read or an assertion fails, so
        # that a failing run does not leave fixture files behind.
        shutil.rmtree(tmp_dir)