Exemplo n.º 1
0
class TestBuildLinks(unittest.TestCase):
    """Checks that ``Cleaner.build_links`` turns ``[[...]]`` wiki links into plain text."""

    def setUp(self):
        # The fixtures are long single-paragraph strings; lift the diff size
        # limit so a failure shows the whole comparison.
        self.maxDiff = None
        self.cleaner = Cleaner()

    def test_build_links(self):
        """Plain and piped links in a paragraph are reduced to their visible text."""
        markup = (
            "[[印欧语系|西方语言]]中“數學”(μαθηματικά)一詞源自於[[古希臘語]]的μάθημα(máthēma),其"
            "有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μα"
            "θηματικός(mathēmatikós),意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在[[英语]]中"
            "表面上的複數形式,及在[[法语]]中的表面複數形式''les mathématiques'',可溯至[[拉丁文]]的中性複數''mathe"
            "matica'',由[[西塞罗]]譯自希臘文複數τα μαθηματικά(ta mathēmatiká),此一希臘語被[[亚里士多德]]拿來指"
            "「[[萬物皆數]]」的概念。"
        )
        expected = (
            "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科"
            "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós),"
            "意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在英语中表面上的複數形式,及在法语中的表面複數"
            "形式''les mathématiques'',可溯至拉丁文的中性複數''mathematica'',由西塞罗譯自希臘文複數τα μαθηματικά(t"
            "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        )
        result, _ = self.cleaner.build_links(markup)
        self.assertEqual(expected, result)

    def test_no_links(self):
        """Link-free text passes through unchanged and is handed back as the same object."""
        source = (
            "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科"
            "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós),"
            "意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在英语中表面上的複數形式,及在法语中的表面複數"
            "形式''les mathématiques'',可溯至拉丁文的中性複數''mathematica'',由西塞罗譯自希臘文複數τα μαθηματικά(t"
            "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        )
        # The expected output is character-for-character the input.
        expected = source
        first_pass, _ = self.cleaner.build_links(source)
        second_pass, _ = self.cleaner.build_links(first_pass)
        self.assertEqual(expected, second_pass)
        # Identity, not just equality: the second pass must return its argument.
        self.assertEqual(id(second_pass), id(first_pass))

    def test_category(self):
        """Namespace-prefixed links (User:, MediaWiki:, Category:) lose their prefix."""
        cases = [
            (
                "2004年6月28日 [[User:Shizhao|Shizhao]] [[MediaWiki:Categoryarticlecount]]被保护",
                "2004年6月28日 Shizhao Categoryarticlecount被保护",
            ),
            (
                "[[Category:未被普遍承認的歷史國家]]",
                "未被普遍承認的歷史國家",
            ),
            (
                "柏拉圖的著作(其中大多數都是對話錄)曾經被以好幾種不同方式出版過;因此對於柏拉圖著作的命名和引用也有數種不同的"
                "方式。有獨立條目的柏拉圖對話錄介紹可以在[[:Category:柏拉圖對話錄]]找到。",
                "柏拉圖的著作(其中大多數都是對話錄)曾經被以好幾種不同方式出版過;因此對於柏拉圖著作的命名和引用也有數種不同"
                "的方式。有獨立條目的柏拉圖對話錄介紹可以在柏拉圖對話錄找到。",
            ),
        ]
        for markup, expected in cases:
            # Each fixture goes through build_links twice; the assertion is
            # made on the output of the second pass.
            first_pass, _ = self.cleaner.build_links(markup)
            second_pass, _ = self.cleaner.build_links(first_pass)
            self.assertEqual(expected, second_pass)
Exemplo n.º 2
0
class TestIterate(unittest.TestCase):
    """Runs ``iterate`` over the XML dumps stored in ``wikis/`` next to this module."""

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()
        # Fixture directories are resolved relative to this file, not the CWD.
        module_file = os.path.abspath(__file__)
        self.current_path = os.path.dirname(module_file)
        self.sample_file_path = os.path.join(
            self.current_path, 'wikis', 'zhwiki-test-pages.xml')

    def read_target(self, name):
        """Return the UTF-8 decoded contents of ``targets/<name>.txt``."""
        target_file = os.path.join(self.current_path, 'targets', name + '.txt')
        with codecs.open(target_file, 'r', 'utf8') as handle:
            return handle.read()

    def save_temp(self, name, text):
        """Write ``text`` as UTF-8 to ``targets/<name>.tmp``."""
        temp_file = os.path.join(self.current_path, 'targets', name + '.tmp')
        with codecs.open(temp_file, 'w', 'utf8') as handle:
            handle.write(text)

    def test_broken(self):
        """``iterate`` yields nothing for the four ``zhwiki-broken-N.xml`` dumps."""
        for number in range(1, 5):
            path = os.path.join(self.current_path, 'wikis',
                                'zhwiki-broken-%d.xml' % number)
            for _ in iterate(path):
                # Reaching the loop body means a page was produced.
                self.assertTrue(False)

    def test_clean(self):
        """Cleaned sample pages match the stored target files.

        Every page of the sample dump is cleaned on each pass, so an exception
        raised on a non-target page fails the test as well.
        """
        targets = {
            '数学': 'Mathematics',
            '哲学': 'Philosophy',
            '文學': 'Literature',
        }
        for wanted_title, target in targets.items():
            found = False
            for title, text in iterate(self.sample_file_path):
                cleaned = self.cleaner.clean_text(text)
                outcome = self.cleaner.build_links(cleaned)
                if title != wanted_title:
                    continue
                found = True
                actual, _ = outcome
                expected = self.read_target(target)
                if actual != expected:
                    # Keep the mismatching output on disk for inspection.
                    self.save_temp(target, actual)
                self.assertEqual(expected, actual, target)
            self.assertTrue(found)
Exemplo n.º 3
0
class TestCleanText(unittest.TestCase):
    """End-to-end checks: ``clean_text`` followed by ``build_links`` on wiki markup."""

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()

    def _to_plain_text(self, markup):
        """Run ``markup`` through ``clean_text`` then ``build_links``; return the text part."""
        cleaned = self.cleaner.clean_text(markup)
        plain, _ = self.cleaner.build_links(cleaned)
        return plain

    def test_case_1(self):
        """``{{lang...}}`` templates, ``''`` quote markers and links are all stripped."""
        markup = (
            "[[印欧语系|西方语言]]中“數學”({{lang-el|μαθηματικά}})一詞源自於[[古希臘語]]的{{lang|el|μάθημα}}({"
            "{lang|la|máthēma}}),其有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」,"
            "即使在其語源內。其形容詞{{lang|el|μαθηματικός}}({{lang|la|mathēmatikós}}),意思為''和學習有關的''或"
            "''用功的'',亦會被用來指''數學的''。其在[[英语]]中表面上的複數形式,及在[[法语]]中的表面複數形式''{{lang|f"
            "r|les mathématiques}}'',可溯至[[拉丁文]]的中性複數''{{lang|la|mathematica}}'',由[[西塞罗]]譯自希臘"
            "文複數{{lang|el|τα μαθηματικά}}({{lang|la|ta mathēmatiká}}),此一希臘語被[[亚里士多德]]拿來指「[[萬"
            "物皆數]]」的概念。"
        )
        expected = (
            "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科"
            "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós),"
            "意思為和學習有關的或用功的,亦會被用來指數學的。其在英语中表面上的複數形式,及在法语中的表面複數"
            "形式les mathématiques,可溯至拉丁文的中性複數mathematica,由西塞罗譯自希臘文複數τα μαθηματικά(t"
            "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        )
        self.assertEqual(expected, self._to_plain_text(markup))

    def test_case_3(self):
        """A ``-{A|zh:...;zh-cn:...;...}-`` variant block collapses to a single name."""
        markup = (
            "例如,[[全球資訊網]]是在[[歐洲核子研究組織]]由-{A|zh:[[蒂姆·伯纳斯-李]];zh-cn:[[蒂姆·伯纳斯-李]];zh-tw:[[提"
            "姆·柏納-李]];zh-hk:[[添·柏納-李]];}-創始與發展成功的,原先設計目标為向組織內部和全世界的物理學者提供資訊傳播服務。"
            "廣受歡迎的[[arXiv]]網站也是在類似狀況下創立的。"
        )
        expected = (
            "例如,全球資訊網是在歐洲核子研究組織由蒂姆·伯纳斯-李創始與發展成功的,原先設計目标為向組織內部和全世界的物理學"
            "者提供資訊傳播服務。廣受歡迎的arXiv網站也是在類似狀況下創立的。"
        )
        self.assertEqual(expected, self._to_plain_text(markup))

    def test_case_4(self):
        """``{{link-en|a|b}}`` and ``{{le|a|b}}`` templates keep only their first argument."""
        markup = (
            "亚里士多德死后,整个哲学界陷入了独立时期,称为{{link-en|希腊化哲学|Hellenistic_philosophy}}时期。因为整个社会"
            "和政治陷入混乱。这段时期产生了[[斯多葛学派]]和[[伊壁鸠鲁学派]],以及[[皮浪主义|怀疑主义派]]、[[新柏拉图主义|新柏"
            "拉图派]]和{{le|新毕达哥拉斯主义|Neopythagoreanism}}。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制"
            "。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有[[宗教]]主义的哲学,并逐渐产"
            "生融化[[基督教]]和希腊哲学于一体的理论,即为后来的[[基督教哲学]]。"
        )
        expected = (
            "亚里士多德死后,整个哲学界陷入了独立时期,称为希腊化哲学时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学"
            "派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和新毕达哥拉斯主义。这些学派的共同特点是伦理化。斯多葛学派主要是顺应"
            "自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学,"
            "并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。"
        )
        self.assertEqual(expected, self._to_plain_text(markup))
Exemplo n.º 4
0
    def load_files(self):
        """Read articles from the wiki dump and accumulate per-article word counts.

        At most ``self.files_number`` articles are taken from
        ``wiki/simplewiki-20191120-pages-articles.xml``. For each article:

        * its title is appended to ``self.texts``;
        * its cleaned text is cut to ``self.characters_per_file`` characters,
          lower-cased, has every ``self.pattern`` match replaced by a space,
          and is split on single spaces;
        * tokens found in ``self.stop_words`` are dropped and the rest stemmed;
        * a ``Counter`` of the stems is appended to ``self.file_dictionaries``
          and the stems are merged into ``self.bag_of_words``.
        """
        cleaner = Cleaner()
        # Build the stemmer once and reuse it for every token instead of
        # constructing a new one per word.
        stemmer = PorterStemmer()
        dump_path = 'wiki/simplewiki-20191120-pages-articles.xml'
        for i, (title, text) in enumerate(iterate(dump_path)):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            # NOTE(review): an earlier revision also ran cleaner.build_links on
            # the raw text here and discarded the result. If link markup is
            # meant to be resolved before tokenising, apply build_links to
            # cleaned_text and use its output - confirm intent.
            self.texts.append(title)

            # Splitting on a single space keeps empty strings between
            # consecutive separators; they survive unless '' is a stop word.
            word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
            stems = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(stems))
            self.bag_of_words = self.bag_of_words.union(stems)
Exemplo n.º 5
0
    def load_files(self, dictionary_size=20000):
        """Read articles from the wiki dump and build the document-frequency vocabulary.

        At most ``self.files_number`` articles are taken from
        ``wiki/simplewiki-20191120-pages-articles.xml``. Each article's title is
        appended to ``self.texts``; its cleaned text is cut to
        ``self.characters_per_file`` characters, lower-cased, split on single
        spaces after replacing ``self.pattern`` matches with spaces, filtered
        against ``self.stop_words`` and stemmed. A ``Counter`` of the stems is
        appended to ``self.file_dictionaries`` and the stems are merged into
        ``self.bag_of_words``.

        Afterwards, for every word in the vocabulary, the number of entries of
        ``self.file_dictionaries`` that contain it is counted.

        Args:
            dictionary_size: Maximum vocabulary size. When the vocabulary is
                larger, only the ``dictionary_size`` words occurring in the
                most articles are kept.

        On return ``self.bag_of_words`` is a list of words and
        ``self.nw_vector`` holds the matching article counts. Note that
        ``self.dictionary`` ends up as a list of ``(word, count)`` pairs when
        truncation happened and as a ``dict`` otherwise.
        """
        cleaner = Cleaner()
        # Build the stemmer once and reuse it for every token instead of
        # constructing a new one per word.
        stemmer = PorterStemmer()
        dump_path = 'wiki/simplewiki-20191120-pages-articles.xml'
        for i, (title, text) in enumerate(iterate(dump_path)):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            # NOTE(review): an earlier revision also ran cleaner.build_links on
            # the raw text here and discarded the result. If link markup is
            # meant to be resolved before tokenising, apply build_links to
            # cleaned_text and use its output - confirm intent.
            self.texts.append(title)

            # Splitting on a single space keeps empty strings between
            # consecutive separators; they survive unless '' is a stop word.
            word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
            stems = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(stems))
            self.bag_of_words = self.bag_of_words.union(stems)

        # Document frequency: walk each article's own words rather than testing
        # the whole vocabulary against every article.
        self.dictionary = {w: 0 for w in self.bag_of_words}
        for file_counts in self.file_dictionaries:
            for word in file_counts:
                # Guard keeps the result identical if file_dictionaries holds
                # entries with words that never reached bag_of_words.
                if word in self.dictionary:
                    self.dictionary[word] += 1

        if len(self.dictionary) > dictionary_size:
            self.dictionary = Counter(
                self.dictionary).most_common(dictionary_size)
            self.bag_of_words = [word for word, _ in self.dictionary]
            # NOTE(review): this branch appends to the existing nw_vector while
            # the branch below replaces it - kept as found; confirm nw_vector
            # is always empty on entry.
            self.nw_vector.extend(num for _, num in self.dictionary)
        else:
            self.bag_of_words = list(self.dictionary.keys())
            self.nw_vector = list(self.dictionary.values())
import string, re, os, sys
from tqdm import tqdm

# Stage 1 helpers: `cleaner` strips wiki markup, `my_cleaner` post-processes
# the resulting lines.
cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []

# Page cap for debugging runs. The early-exit that used it is currently
# disabled, so the whole dump is processed.
brk = 40000
print("Extracting text from xml ...")
for title, text in tqdm(
        iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    text = cleaner.clean_text(text)
    # build_links returns (text, links); only the text is collected here.
    cleaned_text, links = cleaner.build_links(text)
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
# Push the banner out before the progress bar of the next step starts drawing.
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(
    lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)


print("Post-cleaning extracted text ...")
# Substring / prefix / suffix lists, presumably consumed by the filtering code
# that follows this excerpt - verify against the rest of the script.
forbidden_in = [
    "٭", "*", "†", "sp.", " f.", ".org",
    "oraș în", "localitate în", "comună în", "sat în",
    ".com", ".it", "o.o.", "px", ".jpg", ".gif", " n. ", ".bmp",
    "\\", "(n.", "\\left", "\\right", "(d.", "&nbsp;", "::", "[[", "//",
    ", un ora", "este un municipiu", "este o comun", "este un ora",
    "{{", "Period", "from:", "till:", "BackgroundColors", "canvas:",
    "color:", "width:", "align:", "fontsize:", "pos:", "File", "##", "==",
    "image:", "ISBN", "\\over", "\\math", "style", "border", "background",
    "Wikipedia", "id:", "bar:", "ImageSize", "height:", "DateFormat",
    "text:", "orientation:", "format:", "position:", "columns:", "id:",
    "value:", "legend:", "ScaleMajor", "increment:", "ScaleMinor",
    "increment:", "REDIRECT",
]
forbidden_startswith = [
    "redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase",
    "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole",
    "Limbi", ":", "«", ".", ";", "?", "!", "#",
] + list(string.punctuation)
forbidden_endswith = [
    "Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";",
]
# ^word: regex
re1 = re.compile(r"^\w+:", re.UNICODE)
# \d)$ ex: Coreea, statul Koryo: Kojong (Wang Ch'ol) (rege din dinastia Wang, 1214-1259)
Exemplo n.º 7
0
# Dependencies
# pip install wiki-dump-reader
# pip install tqdm

from wiki_dump_reader import Cleaner, iterate
from tqdm import tqdm
import re

cleaner = Cleaner()
# Compiled once; the same pattern is applied to every article.
latin_letters = re.compile(r'[A-Za-z]')

# `with` closes the file even if parsing or cleaning raises part-way through.
# The encoding is pinned to UTF-8 so that writing non-ASCII article text does
# not depend on the platform's default encoding.
with open('bn_wiki.txt', 'w', encoding='utf-8') as output:
    for title, text in tqdm(iterate('bnwiki-latest-pages-articles.xml')):
        text = cleaner.clean_text(text)
        cleaned_text, _ = cleaner.build_links(text)
        # Drop every ASCII letter, then write one article per line block.
        cleaned_text = latin_letters.sub('', cleaned_text)
        output.write(cleaned_text + "\n")