class TestAnnotatedWikiExtractor(unittest.TestCase):
    """Compare extractor output with pre-processed results in resources/."""

    def setUp(self):
        """Store a new AnnotatedWikiExtractor and a new WikiExtractor."""
        self.annotated_wikiextractor = AnnotatedWikiExtractor()
        self.wikiextractor = WikiExtractor()

    def _extract_sample_document(self):
        """Read the sample page and turn it into a wiki document.

        The page is read inside a ``with`` block so the file handle is
        closed, and the lines are gathered in a list comprehension so the
        result is a list regardless of what ``map`` returns.
        """
        with open("resources/singlepage_wikien.txt", "r") as page_file:
            page = [line.rstrip("\n") for line in page_file]
        return annotated_wikiextractor.wikiextractor.extract_document(page)

    def test_extract_original(self):
        """Test the extraction process against singlepage_original.xml.

        The result is compared with a pre-processed result serialized in
        the file singlepage_original.xml.
        This test targets the script in wikiextractor.py
        """
        wiki_document = self._extract_sample_document()
        wiki_document = self.wikiextractor.extract(wiki_document)
        # create test file:
        # open("resources/singlepage_original.xml", "w").write(wiki_document.__str__())
        with open("resources/singlepage_original.xml") as expected_file:
            expected = expected_file.read()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, wiki_document.__str__())

    def test_extract_annotated(self):
        """Test the extraction process against singlepage_annotated.json.

        The result is compared with a pre-processed result serialized in
        the file singlepage_annotated.json.
        This test targets the script in annotated_wikiextractor.py
        """
        wiki_document = self._extract_sample_document()
        annotated_wiki_document = self.annotated_wikiextractor.extract(
            wiki_document)
        # create test file:
        # json.dump(json.loads(str(annotated_wiki_document)), open("resources/singlepage_annotated.json", "w"))
        with open("resources/singlepage_annotated.json") as expected_file:
            expected = expected_file.read()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, json.dumps(annotated_wiki_document))
class TestAnnotatedWikiExtractor(unittest.TestCase):
    """Compare extractor output with pre-processed results in resources/."""

    def setUp(self):
        """Store a new AnnotatedWikiExtractor and a new WikiExtractor."""
        self.annotated_wikiextractor = AnnotatedWikiExtractor()
        self.wikiextractor = WikiExtractor()

    def _extract_sample_document(self):
        """Read the sample page and turn it into a wiki document.

        The page is read inside a ``with`` block so the file handle is
        closed, and the lines are gathered in a list comprehension so the
        result is a list regardless of what ``map`` returns.
        """
        with open("resources/singlepage_wikien.txt", "r") as page_file:
            page = [line.rstrip("\n") for line in page_file]
        return annotated_wikiextractor.wikiextractor.extract_document(page)

    def test_extract_original(self):
        """Test the extraction process against singlepage_original.xml.

        The result is compared with a pre-processed result serialized in
        the file singlepage_original.xml.
        This test targets the script in wikiextractor.py
        """
        wiki_document = self._extract_sample_document()
        wiki_document = self.wikiextractor.extract(wiki_document)
        # create test file:
        # open("resources/singlepage_original.xml", "w").write(wiki_document.__str__())
        with open("resources/singlepage_original.xml") as expected_file:
            expected = expected_file.read()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, wiki_document.__str__())

    def test_extract_annotated(self):
        """Test the extraction process against singlepage_annotated.json.

        The result is compared with a pre-processed result serialized in
        the file singlepage_annotated.json.
        This test targets the script in annotated_wikiextractor.py
        """
        wiki_document = self._extract_sample_document()
        annotated_wiki_document = self.annotated_wikiextractor.extract(
            wiki_document)
        # create test file:
        # json.dump(json.loads(str(annotated_wiki_document)), open("resources/singlepage_annotated.json", "w"))
        with open("resources/singlepage_annotated.json") as expected_file:
            expected = expected_file.read()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, json.dumps(annotated_wiki_document))
def setUp(self):
    """Store a new AnnotatedWikiExtractor and a new WikiExtractor on self."""
    # Construction order is kept: annotated extractor first, plain one second.
    self.annotated_wikiextractor = AnnotatedWikiExtractor()
    self.wikiextractor = WikiExtractor()
# -*- coding: utf-8 -*-
"""Read marshalled pages from stdin and print one annotated document per page.

Each output line is the document URL with the wikiextractor prefix removed,
a tab character, and the annotated document.
"""
import sys
import os
import marshal

# Add the folder containing annotated_wikiextractor to the PYTHON_PATH, so
# it can be executed in hadoop
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

# Import annotated_wikiextractor from the path added in the last step
import annotated_wikiextractor
from annotated_wikiextractor.annotated_wikiextractor import AnnotatedWikiExtractor

# Use the standard AnnotatedWikiExtractor
wiki_extractor = AnnotatedWikiExtractor()

for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    if not line:
        # A blank line carries no payload; marshal.loads("") would raise
        # EOFError and abort the whole run, so skip it instead.
        continue

    # NOTE(review): the marshal module is not meant to be secure against
    # malicious data -- confirm stdin only ever carries trusted input.
    page = marshal.loads(line)
    wiki_document = annotated_wikiextractor.wikiextractor.extract_document(
        page)
    annotated_wiki_document = wiki_extractor.extract(wiki_document)

    # Key: the document URL without the extractor's URL prefix.
    key = annotated_wiki_document["url"].replace(
        annotated_wikiextractor.wikiextractor.prefix, "")
    # Single parenthesised argument: prints the same on Python 2 and also
    # parses on Python 3.
    print("%s\t%s" % (key, annotated_wiki_document))
# -*- coding: utf-8 -*-
"""Read marshalled pages from stdin and print one annotated document per page.

Each output line is the document URL with the wikiextractor prefix removed,
a tab character, and the annotated document.
"""
import sys
import os
import marshal

# Add the folder containing annotated_wikiextractor to the PYTHON_PATH, so
# it can be executed in hadoop
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

# Import annotated_wikiextractor from the path added in the last step
import annotated_wikiextractor
from annotated_wikiextractor.annotated_wikiextractor import AnnotatedWikiExtractor

# Use the standard AnnotatedWikiExtractor
wiki_extractor = AnnotatedWikiExtractor()

for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    if not line:
        # A blank line carries no payload; marshal.loads("") would raise
        # EOFError and abort the whole run, so skip it instead.
        continue

    # NOTE(review): the marshal module is not meant to be secure against
    # malicious data -- confirm stdin only ever carries trusted input.
    page = marshal.loads(line)
    wiki_document = annotated_wikiextractor.wikiextractor.extract_document(
        page)
    annotated_wiki_document = wiki_extractor.extract(wiki_document)

    # Key: the document URL without the extractor's URL prefix.
    key = annotated_wiki_document["url"].replace(
        annotated_wikiextractor.wikiextractor.prefix, "")
    # Single parenthesised argument: prints the same on Python 2 and also
    # parses on Python 3.
    print("%s\t%s" % (key, annotated_wiki_document))