def main():
    pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
    pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
    pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
    pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

    templateCount = 0
    articleCount = 0
    totalCount = 0
    redirectCount = 0

    # Write the header row once, truncating any previous output.
    with open(pathArticles, 'w') as output_file:
        cw = csv.writer(output_file, delimiter='\t')
        cw.writerow(['Title', 'Text'])

    cleaner = Cleaner()
    for title, text in tqdm(iterate(pathWikiXML)):
        totalCount += 1
        text = cleaner.clean_text(text)
        # cleaned_text, links = cleaner.build_links(text)

        if text.startswith("REDIRECT"):
            redirectCount += 1
        elif text.startswith("TEMPLATE"):
            templateCount += 1
        else:
            articleCount += 1
            # Reopening in append mode per article is slow, but keeps partial
            # output on disk if the run is interrupted.
            with open(pathArticles, 'a') as output_file:
                cw = csv.writer(output_file, delimiter='\t')
                cw.writerow([title, text])

    print("Total pages: {:,}".format(totalCount))
    print("Template pages: {:,}".format(templateCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
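# main() above relies on module-level imports and path constants defined
# elsewhere in its script. A minimal sketch of that assumed setup follows;
# the directory and file names below are placeholders, not the original values.
import os
import csv
from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate

PATH_WIKI_XML = 'wiki'                              # directory with the dump (assumed)
PATH_CSV = 'csv'                                    # output directory (assumed)
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'  # dump file name (assumed)
FILENAME_ARTICLES = 'articles.csv'                  # article output (assumed)
FILENAME_REDIRECT = 'articles_redirect.csv'         # redirect output (assumed)
FILENAME_TEMPLATE = 'articles_template.csv'         # template output (assumed)

if __name__ == '__main__':
    main()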
def load_files(self):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1
def test_clean(self):
    targets = {
        '数学': 'Mathematics',
        '哲学': 'Philosophy',
        '文學': 'Literature',
    }
    for target_title, target in targets.items():
        found = False
        for title, text in iterate(self.sample_file_path):
            if title == target_title:
                found = True
                text = self.cleaner.clean_text(text)
                actual, _ = self.cleaner.build_links(text)
                expected = self.read_target(target)
                if actual != expected:
                    self.save_temp(target, actual)
                self.assertEqual(expected, actual, target)
            else:
                text = self.cleaner.clean_text(text)
                self.cleaner.build_links(text)
        self.assertTrue(found)
def load_files(self, dictionary_size=20000):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1

    # Document frequency: count in how many files each word occurs.
    self.dictionary = {w: 0 for w in self.bag_of_words}
    for file in self.file_dictionaries:
        for word in self.bag_of_words:
            if word in file.keys():
                self.dictionary[word] += 1

    # Keep only the dictionary_size most frequent words if the vocabulary is too large.
    if len(self.dictionary) > dictionary_size:
        self.dictionary = Counter(
            self.dictionary).most_common(dictionary_size)
        self.bag_of_words = []
        for (word, num) in self.dictionary:
            self.bag_of_words.append(word)
            self.nw_vector.append(num)
    else:
        self.bag_of_words = list(self.dictionary.keys())
        self.nw_vector = list(self.dictionary.values())
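# A possible follow-up, given as an assumption rather than part of the original
# class: nw_vector above holds per-word document frequencies, so inverse
# document frequencies could be derived from it with a helper like this sketch.
import math

def idf_weights(nw_vector, files_number):
    """Sketch: one IDF weight per vocabulary word, from document frequencies."""
    return [math.log(files_number / (1 + nw)) for nw in nw_vector]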
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []
brk = 40000

print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    # if brk <= 0:
    #     break
    # brk -= 1
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)

    # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)

print("Post-cleaning extracted text ...")
forbidden_in = [
    "٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în",
    "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif",
    " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", " ", "::",
    "[[", "//", ", un ora", "este un municipiu", "este o comun",
    "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors",
    "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File",
    "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border",
    "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:",
    "DateFormat", "text:", "orientation:", "format:", "position:",
    "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
    "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = [
    "redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase",
    "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole",
    "Limbi", ":", "«", ".", ";", "?", "!", "#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]
from wiki_dump_reader import Cleaner, iterate
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import re

cleaner = Cleaner()
files_number = 50002
i = 0
titles = []
bag_of_words = set()
file_dictionaries = []
pattern = re.compile('[^a-z0-9]+')
stop_words = set(stopwords.words('english'))

for title, text in iterate('wiki/simplewiki-20191120-pages-articles.xml'):
    if i >= files_number:
        break
    titles.append(title)
    cleaned_text = cleaner.clean_text(text)
    # cleaned_fragment, _ = cleaner.build_links(cleaned_text)
    # f = open(f'wiki/files/{i}.txt', "w")
    # f.write(cleaned_fragment)
    # f.close()
    i += 1
def test_broken(self):
    broken_files = ['zhwiki-broken-%d.xml' % i for i in range(1, 5)]
    for broken_file in broken_files:
        path = os.path.join(self.current_path, 'wikis', broken_file)
        # iterate() should yield no pages from a malformed dump, so reaching
        # the loop body means the test has failed.
        for _ in iterate(path):
            self.assertTrue(False)
import tqdm
import json
import pickle
from wiki_dump_reader import Cleaner, iterate

titles = []
data = {}
counter = 0
identif = 0
cleaner = Cleaner()

print("Parsing data...")
for title, text in tqdm.tqdm(
        iterate("./enwiki-20210301-pages-articles-multistream.xml")):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    titles.append(title)
    data[title] = {"text": cleaned_text, "links": links}
    counter += 1
    if counter != 0 and counter % 65536 == 0:
        print("Writing parsed datapack ", identif)
        with open(f"./enwiki_data/enwiki-data_{identif}.json", "w") as df:
            json.dump({"titles": titles, "data": data}, df)
        counter = 0
        titles = []
        data = {}
        identif += 1
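# A minimal sketch, not part of the original script, of reading back one of the
# JSON datapacks written above; the file name follows the pattern used in the
# loop and the index 0 is only an example.
import json

with open("./enwiki_data/enwiki-data_0.json") as df:
    pack = json.load(df)
titles = pack["titles"]
first_article = pack["data"][titles[0]]
print(titles[0], len(first_article["text"]), len(first_article["links"]))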
    # postprocessing
    # Collapse runs of three or more newlines into exactly two.
    partitioned_text = re.sub("\n\n\n\n*", "\n\n", partitioned_text)

    # Tokenize each passage separately so its sentences can be split apart,
    # one sentence per line, with a blank line between passages.
    for passage in partitioned_text.split("\n\n"):
        for sentence in tokenize.sent_tokenize(passage):
            formatted_text += sentence + "\n"
        formatted_text += "\n"

    # Delete the dots at the ends of the sentences.
    formatted_text = formatted_text.replace(".\n", "\n")
    # Multiple blank lines can reappear after the replacement, so collapse them again.
    formatted_text = re.sub("\n\n\n\n*", "\n\n", formatted_text)
    return formatted_text


if __name__ == "__main__":
    cleaner = Cleaner()
    f = open(path_to_organised_data, "w+", encoding="utf8")
    for title, text in tqdm(iterate(PATH_WIKI_XML)):
        text = cleaner.clean_text(text)
        cleaned_text, links = cleaner.build_links(text)
        if "REDIRECT" not in cleaned_text:
            text_in_format = process_text(cleaned_text)
            f.write(text_in_format)
    f.close()
    print("Everything is processed :)")
import re
import json
from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate
from nltk.tokenize import sent_tokenize

database = []
index = {}
prefix = "enwiki"
cleaner = Cleaner()

# for title, text in tqdm(iterate(f"./source/{prefix}-latest-pages-articles.xml"), total=346229):
for title, text in tqdm(
        iterate(f"./source/{prefix}-latest-pages-articles.xml"),
        total=21181268):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)

    # Keep only the lead section: stop at the first heading line ("=...")
    # or at the first empty line (which raises IndexError on i[0]).
    passage = []
    for i in cleaned_text.split("\n"):
        try:
            if i[0] == "=":
                break
            passage.append(i)
        except IndexError:
            break
    if len(passage) < 4:
        continue
# Dependencies
#   pip install wiki-dump-reader
#   pip install tqdm

from wiki_dump_reader import Cleaner, iterate
from tqdm import tqdm
import re

cleaner = Cleaner()
output = open('bn_wiki.txt', 'w')

for title, text in tqdm(iterate('bnwiki-latest-pages-articles.xml')):
    text = cleaner.clean_text(text)
    cleaned_text, _ = cleaner.build_links(text)
    # Strip Latin letters so only the Bengali text remains.
    cleaned_text = re.sub(r'[A-Za-z]', '', cleaned_text)
    # print(cleaned_text)
    output.write(cleaned_text + "\n")

output.close()
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []
brk = 40000

print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-20200220-pages-articles.xml')):
    # if brk <= 0:
    #     break
    # brk -= 1
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)

    # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)

print("Post-cleaning extracted text ...")
forbidden_in = [
    "٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în",
    "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif",
    " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", " ", "::",
    "[[", "//", ", un ora", "este un municipiu", "este o comun",
    "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors",
    "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File",
    "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border",
    "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:",
    "DateFormat", "text:", "orientation:", "format:", "position:",
    "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
    "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = [
    "redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase",
    "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole",
    "Limbi", ":", "«", ".", ";", "?", "!", "#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]
from wiki_dump_reader import Cleaner, iterate

cleaner = Cleaner()
count = 0
f = open('dumbfuck.txt', 'w')

# Dump the first ten cleaned pages for inspection.
for title, text in iterate('viwiki-latest-pages-articles-multistream.xml'):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    # print(cleaned_text)
    f.write(cleaned_text)
    count += 1
    if count > 10:
        break

f.close()
i = 0
cleaned_text_string = ""

# Clear the output file before appending to it.
f = open(r"..\Cleaned_Corpora\cleaned_lvwiki.txt", "w", encoding="utf-8")
f.write("")
f.close()
print("Cleared file")

f = open(r"..\Cleaned_Corpora\cleaned_lvwiki.txt", "a", encoding="utf-8")
print("starting clean")
cleaner = Cleaner()

for title, text in iterate(r"..\Corpora\lvwiki-latest-pages-articles.xml"):
    text = cleaner.clean_text(text)
    cleaned_text, _ = cleaner.build_links(text)
    cleaned_text_string += cleaned_text
    i += 1
    if i % 1000 == 0:
        # get rid of uppercase abbreviations
        cleaned_text_string = re.sub('([A-ZĀČĒĢĪĶĻŅŠŪŽ]{2})+', '', cleaned_text_string)
        # lowercase the string
        cleaned_text_string = cleaned_text_string.lower()
        # perform all regex checks on it
        for old, new in RE_replacements_new:
            # RE_replacements_simple works pretty much the same speed