def main():
    pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
    pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
    pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
    pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

    templateCount = 0
    articleCount = 0
    totalCount = 0
    redirectCount = 0

    # Write the header once, then append one row per kept article below.
    with open(pathArticles, 'w') as output_file:
        cw = csv.writer(output_file, delimiter='\t')
        cw.writerow(['Title', 'Text'])

    cleaner = Cleaner()
    for title, text in tqdm(iterate(pathWikiXML)):
        totalCount += 1
        text = cleaner.clean_text(text)
        # cleaned_text, links = cleaner.build_links(text)

        if text.startswith("REDIRECT"):
            redirectCount += 1
        elif text.startswith("TEMPLATE"):
            templateCount += 1
        else:
            articleCount += 1
            with open(pathArticles, 'a') as output_file:
                cw = csv.writer(output_file, delimiter='\t')
                cw.writerow([title, text])

    print("Total pages: {:,}".format(totalCount))
    print("Template pages: {:,}".format(templateCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
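The excerpt above omits its module-level setup. The following is a minimal sketch of a harness that would make main() runnable; the wiki_dump_reader import matches the Cleaner/iterate API used throughout these snippets, and the path constants and file names are hypothetical placeholders, not taken from the original source.

# Minimal harness for main() above -- a sketch only: the constants and file
# names are assumptions, since the original excerpt does not show them.
import csv
import os

from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate

PATH_WIKI_XML = 'wiki'                              # hypothetical input directory
PATH_CSV = 'csv'                                    # hypothetical output directory
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'  # hypothetical dump file name
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'

if __name__ == '__main__':
    main()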
class TestIterate(unittest.TestCase):

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()
        self.current_path = os.path.dirname(os.path.abspath(__file__))
        self.sample_file_path = os.path.join(self.current_path, 'wikis', 'zhwiki-test-pages.xml')

    def read_target(self, name):
        path = os.path.join(self.current_path, 'targets', name + '.txt')
        with codecs.open(path, 'r', 'utf8') as reader:
            target = reader.read()
        return target

    def save_temp(self, name, text):
        path = os.path.join(self.current_path, 'targets', name + '.tmp')
        with codecs.open(path, 'w', 'utf8') as writer:
            writer.write(text)

    def test_broken(self):
        broken_files = ['zhwiki-broken-%d.xml' % i for i in range(1, 5)]
        for broken_file in broken_files:
            path = os.path.join(self.current_path, 'wikis', broken_file)
            # Broken dumps should yield no pages at all.
            for _ in iterate(path):
                self.assertTrue(False)

    def test_clean(self):
        targets = {
            '数学': 'Mathematics',
            '哲学': 'Philosophy',
            '文學': 'Literature',
        }
        for target_title, target in targets.items():
            found = False
            for title, text in iterate(self.sample_file_path):
                if title == target_title:
                    found = True
                    text = self.cleaner.clean_text(text)
                    actual, _ = self.cleaner.build_links(text)
                    expected = self.read_target(target)
                    if actual != expected:
                        self.save_temp(target, actual)
                    self.assertEqual(expected, actual, target)
                else:
                    text = self.cleaner.clean_text(text)
                    self.cleaner.build_links(text)
            self.assertTrue(found)
class TestCleanText(unittest.TestCase):

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()

    def test_case_1(self):
        text = "[[印欧语系|西方语言]]中“數學”({{lang-el|μαθηματικά}})一詞源自於[[古希臘語]]的{{lang|el|μάθημα}}({" \
               "{lang|la|máthēma}}),其有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」," \
               "即使在其語源內。其形容詞{{lang|el|μαθηματικός}}({{lang|la|mathēmatikós}}),意思為''和學習有關的''或" \
               "''用功的'',亦會被用來指''數學的''。其在[[英语]]中表面上的複數形式,及在[[法语]]中的表面複數形式''{{lang|f" \
               "r|les mathématiques}}'',可溯至[[拉丁文]]的中性複數''{{lang|la|mathematica}}'',由[[西塞罗]]譯自希臘" \
               "文複數{{lang|el|τα μαθηματικά}}({{lang|la|ta mathēmatiká}}),此一希臘語被[[亚里士多德]]拿來指「[[萬" \
               "物皆數]]」的概念。"
        expected = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
                   "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
                   "意思為和學習有關的或用功的,亦會被用來指數學的。其在英语中表面上的複數形式,及在法语中的表面複數" \
                   "形式les mathématiques,可溯至拉丁文的中性複數mathematica,由西塞罗譯自希臘文複數τα μαθηματικά(t" \
                   "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_3(self):
        text = "例如,[[全球資訊網]]是在[[歐洲核子研究組織]]由-{A|zh:[[蒂姆·伯纳斯-李]];zh-cn:[[蒂姆·伯纳斯-李]];zh-tw:[[提" \
               "姆·柏納-李]];zh-hk:[[添·柏納-李]];}-創始與發展成功的,原先設計目标為向組織內部和全世界的物理學者提供資訊傳播服務。" \
               "廣受歡迎的[[arXiv]]網站也是在類似狀況下創立的。"
        expected = "例如,全球資訊網是在歐洲核子研究組織由蒂姆·伯纳斯-李創始與發展成功的,原先設計目标為向組織內部和全世界的物理學" \
                   "者提供資訊傳播服務。廣受歡迎的arXiv網站也是在類似狀況下創立的。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_4(self):
        text = "亚里士多德死后,整个哲学界陷入了独立时期,称为{{link-en|希腊化哲学|Hellenistic_philosophy}}时期。因为整个社会" \
               "和政治陷入混乱。这段时期产生了[[斯多葛学派]]和[[伊壁鸠鲁学派]],以及[[皮浪主义|怀疑主义派]]、[[新柏拉图主义|新柏" \
               "拉图派]]和{{le|新毕达哥拉斯主义|Neopythagoreanism}}。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制" \
               "。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有[[宗教]]主义的哲学,并逐渐产" \
               "生融化[[基督教]]和希腊哲学于一体的理论,即为后来的[[基督教哲学]]。"
        expected = "亚里士多德死后,整个哲学界陷入了独立时期,称为希腊化哲学时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学" \
                   "派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和新毕达哥拉斯主义。这些学派的共同特点是伦理化。斯多葛学派主要是顺应" \
                   "自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学," \
                   "并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)
def load_files(self):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1
def load_files(self, dictionary_size=20000):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1

    # Count, for each word, how many files it occurs in.
    self.dictionary = {w: 0 for w in self.bag_of_words}
    for file in self.file_dictionaries:
        for word in self.bag_of_words:
            if word in file.keys():
                self.dictionary[word] += 1

    if len(self.dictionary) > dictionary_size:
        self.dictionary = Counter(
            self.dictionary).most_common(dictionary_size)
        self.bag_of_words = []
        for (word, num) in self.dictionary:
            self.bag_of_words.append(word)
            self.nw_vector.append(num)
    else:
        self.bag_of_words = list(self.dictionary.keys())
        self.nw_vector = list(self.dictionary.values())
from text_cleaner import Cleaner as MyCleaner
# Assumed import: Cleaner and iterate are otherwise undefined in this excerpt
# (the same Cleaner/iterate API used in the snippets above).
from wiki_dump_reader import Cleaner, iterate
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()

lines = []
brk = 40000
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    #if brk<=0:
    #    break
    #brk-=1
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)

print("Post-cleaning extracted text ...")
forbidden_in = ["٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în", "comună în", "sat în", ".com", ".it",
                "o.o.", "px", ".jpg", ".gif", " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.",
                "\u00a0",  # assumed non-breaking space; a plain " " here would match every line
                "::", "[[", "//", ", un ora", "este un municipiu", "este o comun", "este un ora", "{{", "Period",
                "from:", "till:", "BackgroundColors", "canvas:", "color:", "width:", "align:", "fontsize:", "pos:",
                "File", "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border", "background",
                "Wikipedia", "id:", "bar:", "ImageSize", "height:", "DateFormat", "text:", "orientation:",
                "format:", "position:", "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
                "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = ["redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase", "Actori", "Actri",
                        "Localit", "Municipii", "Pagina", "List", "Secole", "Limbi", ":", "«", ".", ";", "?", "!",
                        "#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]

# ^word: regex
re1 = re.compile(r"^\w+:", re.UNICODE)
def run(self):
    """Cleans the text fetched from Wikipedia.

    Returns:
        True if the stage execution succeeded, False otherwise.
    """
    self.logger.info("Starting text cleaning...")

    input_file_path = join(constants.TMP_PATH,
                           "{}.raw.txt".format(self.parent.topic))
    output_file_path = join(constants.TMP_PATH,
                            "{}.clean.txt".format(self.parent.topic))

    cleaner = Cleaner()

    with open(input_file_path, "r") as file:
        text = file.read()

    # Strip non-breaking spaces (assumed: the original pattern appears to have
    # been U+00A0, which a plain space here would not make sense to remove).
    text = re.sub('\xa0', '', text)

    self.logger.info(
        "Cleaning the markup and applying token-wise operations")
    lemmatizer = WordNetLemmatizer()
    articles = text.split("<<article_end>>")
    for i in range(len(articles)):
        article = articles[i]
        # Removing special tokens
        article = re.sub('<<article_start>>', '', article)
        # Removing wikipedia markup
        article = cleaner.clean_text(article)
        # Removing left out >
        article = re.sub(">", '', article)
        # Opening up [[...]]
        article = re.sub('\[{2}(.*?)(\|[\w\s\|]*)?\]{2}', '\\1', article)
        # Removing |
        article = re.sub('\|', ' ', article)

        tokens = word_tokenize(article)
        for j in range(len(tokens)):
            token = tokens[j]
            token = token.lower()
            token = token.encode("ascii", "ignore")
            token = token.decode()
            token = lemmatizer.lemmatize(token)
            tokens[j] = token
        article = " ".join(tokens)

        articles[i] = "<<article_start>> {} <<article_end>>".format(
            article)
    text = " ".join(articles)

    self.logger.info("Changing years to <<year>>")
    text = re.sub(' \d{4}(\-\d+|s)?', ' <<year>>', text)

    self.logger.info("Changing numbers to <<number>>")
    text = re.sub(' \d[\d\.,%]*(st|nd|rd|th| %)?', ' <<number>>', text)
    text = re.sub('<<number>>\-[\d\.,%]+', '<<number>>', text)

    self.logger.info("Section title formatting")
    text = re.sub('==+(.*?)==+',
                  '<<section_title_start>> \\1 <<section_title_end>>', text)

    self.logger.info("Removing extra white-spaces")
    text = re.sub('\s\s+', ' ', text)

    with open(output_file_path, "w") as file:
        file.write(text)

    num_tokens = len(text.split(" "))
    self.logger.info(
        "Saved the cleaned text. Contains ~ {} tokens".format(
            num_tokens))

    return True