示例#1
0
class TestAnnotatedWikiExtractor(unittest.TestCase):
    """Compare extractor output for a single page against stored fixtures."""

    def setUp(self):
        """Create fresh extractor instances before each test."""
        self.annotated_wikiextractor = AnnotatedWikiExtractor()
        self.wikiextractor = WikiExtractor()

    def test_extract_original(self):
        """
        Test the extraction process by comparing the result with a
        pre-processed result serialized in the file singlepage_original.xml

        This test targets the script in wikiextractor.py
        """
        # Read the input inside a context manager so the handle is closed
        # deterministically; the list is built before the file is closed.
        with open("resources/singlepage_wikien.txt", "r") as page_file:
            page = [line.rstrip("\n") for line in page_file]
        wiki_document = annotated_wikiextractor.wikiextractor.extract_document(
            page)
        wiki_document = self.wikiextractor.extract(wiki_document)

        # To regenerate the fixture:
        # open("resources/singlepage_original.xml", "w").write(str(wiki_document))

        with open("resources/singlepage_original.xml") as expected_file:
            expected = expected_file.read()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(expected, str(wiki_document))

    def test_extract_annotated(self):
        """
        Test the extraction process by comparing the result with a
        pre-processed result serialized in the file singlepage_annotated.json

        This test targets the script in annotated_wikiextractor.py
        """
        with open("resources/singlepage_wikien.txt", "r") as page_file:
            page = [line.rstrip("\n") for line in page_file]
        wiki_document = annotated_wikiextractor.wikiextractor.extract_document(
            page)
        annotated_wiki_document = self.annotated_wikiextractor.extract(
            wiki_document)

        # To regenerate the fixture:
        # json.dump(json.loads(str(annotated_wiki_document)), open("resources/singlepage_annotated.json", "w"))

        with open("resources/singlepage_annotated.json") as expected_file:
            expected = expected_file.read()
        self.assertEqual(expected, json.dumps(annotated_wiki_document))
class TestAnnotatedWikiExtractor(unittest.TestCase):
    """Compare extractor output for a single page against stored fixtures."""

    def setUp(self):
        """Create fresh extractor instances before each test."""
        self.annotated_wikiextractor = AnnotatedWikiExtractor()
        self.wikiextractor = WikiExtractor()

    @staticmethod
    def _read_lines(path):
        """Return the lines of *path* with trailing newlines removed."""
        # The context manager closes the handle even if reading fails.
        with open(path, "r") as handle:
            return [line.rstrip("\n") for line in handle]

    @staticmethod
    def _read_text(path):
        """Return the full content of *path* as one string."""
        with open(path) as handle:
            return handle.read()

    def test_extract_original(self):
        """
        Test the extraction process by comparing the result with a
        pre-processed result serialized in the file singlepage_original.xml

        This test targets the script in wikiextractor.py
        """
        page = self._read_lines("resources/singlepage_wikien.txt")
        wiki_document = annotated_wikiextractor.wikiextractor.extract_document(page)
        wiki_document = self.wikiextractor.extract(wiki_document)

        # To regenerate the fixture:
        # open("resources/singlepage_original.xml", "w").write(str(wiki_document))

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            self._read_text("resources/singlepage_original.xml"),
            str(wiki_document))

    def test_extract_annotated(self):
        """
        Test the extraction process by comparing the result with a
        pre-processed result serialized in the file singlepage_annotated.json

        This test targets the script in annotated_wikiextractor.py
        """
        page = self._read_lines("resources/singlepage_wikien.txt")
        wiki_document = annotated_wikiextractor.wikiextractor.extract_document(page)
        annotated_wiki_document = self.annotated_wikiextractor.extract(wiki_document)

        # To regenerate the fixture:
        # json.dump(json.loads(str(annotated_wiki_document)), open("resources/singlepage_annotated.json", "w"))

        self.assertEqual(
            self._read_text("resources/singlepage_annotated.json"),
            json.dumps(annotated_wiki_document))
示例#3
0
 def setUp(self):
     """Create the two extractor instances used by the tests."""
     annotated = AnnotatedWikiExtractor()
     self.annotated_wikiextractor = annotated
     plain = WikiExtractor()
     self.wikiextractor = plain
 def setUp(self):
     """Instantiate an annotated and a plain extractor on the test case."""
     annotated_extractor = AnnotatedWikiExtractor()
     self.annotated_wikiextractor = annotated_extractor
     original_extractor = WikiExtractor()
     self.wikiextractor = original_extractor
# -*- coding: utf-8 -*-

import sys
import os
import marshal

#Add the folder containing annotated_wikiextractor to the PYTHON_PATH, so
#it can be executed in hadoop 
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),"../..")))

#Import annotated_wikiextractor from the path added in the last step
import annotated_wikiextractor
from annotated_wikiextractor.annotated_wikiextractor import AnnotatedWikiExtractor

# Use the standard AnnotatedWikiExtractor for every page read from stdin.
wiki_extractor = AnnotatedWikiExtractor()

# Each stdin line carries one marshalled page; each page produces one
# "<key>\t<document>" record on stdout.
for line in sys.stdin:
    # Skip blank lines instead of letting marshal.loads fail on empty input.
    if not line.strip():
        continue
    # Pass the line through unstripped: marshal.loads ignores extra trailing
    # bytes such as the newline, whereas strip() could remove whitespace
    # bytes that belong to the end of the payload and truncate it.
    # NOTE(review): marshal is not safe against malformed or malicious data;
    # this assumes stdin comes from a trusted producer -- confirm.
    page = marshal.loads(line)
    wiki_document = annotated_wikiextractor.wikiextractor.extract_document(page)
    annotated_wiki_document = wiki_extractor.extract(wiki_document)
    # The key is the document URL with the wikiextractor prefix removed.
    key = annotated_wiki_document["url"].replace(
        annotated_wikiextractor.wikiextractor.prefix, "")
    # Single parenthesised argument: prints the same text on Python 2.
    print("%s\t%s" % (key, annotated_wiki_document))
# -*- coding: utf-8 -*-

import sys
import os
import marshal

#Add the folder containing annotated_wikiextractor to the PYTHON_PATH, so
#it can be executed in hadoop
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

#Import annotated_wikiextractor from the path added in the last step
import annotated_wikiextractor
from annotated_wikiextractor.annotated_wikiextractor import AnnotatedWikiExtractor

# Use the standard AnnotatedWikiExtractor for every page read from stdin.
wiki_extractor = AnnotatedWikiExtractor()

# Each stdin line carries one marshalled page; each page produces one
# "<key>\t<document>" record on stdout.
for line in sys.stdin:
    # Skip blank lines instead of letting marshal.loads fail on empty input.
    if not line.strip():
        continue
    # Pass the line through unstripped: marshal.loads ignores extra trailing
    # bytes such as the newline, whereas strip() could remove whitespace
    # bytes that belong to the end of the payload and truncate it.
    # NOTE(review): marshal is not safe against malformed or malicious data;
    # this assumes stdin comes from a trusted producer -- confirm.
    page = marshal.loads(line)
    wiki_document = annotated_wikiextractor.wikiextractor.extract_document(
        page)
    annotated_wiki_document = wiki_extractor.extract(wiki_document)
    # The key is the document URL with the wikiextractor prefix removed.
    key = annotated_wiki_document["url"].replace(
        annotated_wikiextractor.wikiextractor.prefix, "")
    # Single parenthesised argument: prints the same text on Python 2.
    print("%s\t%s" % (key, annotated_wiki_document))