def main():
	pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
	pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
	pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
	pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

	templateCount = 0
	articleCount = 0
	totalCount = 0
	redirectCount = 0

	# Write the header once, then stream one row per kept article into the same file.
	with open(pathArticles, 'w', newline='') as output_file:
		cw = csv.writer(output_file, delimiter='\t')
		cw.writerow(['Title', 'Text'])

		cleaner = Cleaner()
		for title, text in tqdm(iterate(pathWikiXML)):
			totalCount += 1
			text = cleaner.clean_text(text)
			#cleaned_text, links = cleaner.build_links(text)

			if text.startswith("REDIRECT"):
				redirectCount += 1
			elif text.startswith("TEMPLATE"):
				templateCount += 1
			else:
				articleCount += 1
				cw.writerow([title, text])

	print("Total pages: {:,}".format(totalCount))
	print("Template pages: {:,}".format(templateCount))
	print("Article pages: {:,}".format(articleCount))
	print("Redirect pages: {:,}".format(redirectCount))
Example #2
    def load_files(self):
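        # Read the first files_number pages of the dump, stem and filter the tokens,
        # and record a per-page Counter plus a global bag of words.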
        cleaner = Cleaner()
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            cleaned_fragment, _ = cleaner.build_links(text)
            self.texts.append(title)

            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                PorterStemmer().stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1
Example #3
    def test_clean(self):
        targets = {
            '数学': 'Mathematics',
            '哲学': 'Philosophy',
            '文學': 'Literature',
        }
        for target_title, target in targets.items():
            found = False
            for title, text in iterate(self.sample_file_path):
                if title == target_title:
                    found = True
                    text = self.cleaner.clean_text(text)
                    actual, _ = self.cleaner.build_links(text)
                    expected = self.read_target(target)
                    if actual != expected:
                        self.save_temp(target, actual)
                    self.assertEqual(expected, actual, target)
                else:
                    text = self.cleaner.clean_text(text)
                    self.cleaner.build_links(text)
            self.assertTrue(found)
Example #4
    def load_files(self, dictionary_size=20000):
        cleaner = Cleaner()
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            cleaned_fragment, _ = cleaner.build_links(text)
            self.texts.append(title)

            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                PorterStemmer().stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1

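        # Document frequency: count how many files each word occurs in, then keep
        # only the dictionary_size most common words when the vocabulary is larger.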
        self.dictionary = {w: 0 for w in self.bag_of_words}
        for file in self.file_dictionaries:
            for word in self.bag_of_words:
                if word in file.keys():
                    self.dictionary[word] += 1

        if len(self.dictionary) > dictionary_size:
            self.dictionary = Counter(
                self.dictionary).most_common(dictionary_size)
            self.bag_of_words = []
            for (word, num) in self.dictionary:
                self.bag_of_words.append(word)
                self.nw_vector.append(num)
        else:
            self.bag_of_words = list(self.dictionary.keys())
            self.nw_vector = list(self.dictionary.values())
Example #5
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []

brk = 40000
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    #if brk<=0:
    #    break
    #brk-=1

    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)  # replace link markup with plain text and collect the links
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)


print("Post-cleaning extracted text ...")
forbidden_in = ["٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în", "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif", " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", "&nbsp;", "::", "[[", "//", ", un ora", "este un municipiu", "este o comun", "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors", "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File", "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border", "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:", "DateFormat", "text:", "orientation:", "format:", "position:", "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:", "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = ["redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase", "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole", "Limbi", ":", "«",".",";","?","!","#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă",")","}","?)","aici",".ro","-lea",";"]
Example #6
from wiki_dump_reader import Cleaner, iterate
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import re

cleaner = Cleaner()
files_number = 50002
i = 0
titles = []
bag_of_words = set()
file_dictionaries = []
pattern = re.compile('[^a-z0-9]+')
stop_words = set(stopwords.words('english'))
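# Iterate over the first files_number pages, recording titles and cleaning each page's text.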
for title, text in iterate('wiki/simplewiki-20191120-pages-articles.xml'):
    if i >= files_number:
        break
    titles.append(title)
    cleaned_text = cleaner.clean_text(text)
    # cleaned_fragment, _ = cleaner.build_links(cleaned_text)
    # f = open(f'wiki/files/{i}.txt', "w")
    # f.write(cleaned_fragment)
    # f.close()

    i += 1
Example #7
    def test_broken(self):
        # Iterating over a broken dump file should yield no pages at all.
        broken_files = ['zhwiki-broken-%d.xml' % i for i in range(1, 5)]
        for broken_file in broken_files:
            path = os.path.join(self.current_path, 'wikis', broken_file)
            for _ in iterate(path):
                self.assertTrue(False)
Example #8
import tqdm
import json
import pickle
from wiki_dump_reader import Cleaner, iterate

titles = []
data = {}
counter = 0
identif = 0

cleaner = Cleaner()

print("Parsing data...")
for title, text in tqdm.tqdm(
        iterate("./enwiki-20210301-pages-articles-multistream.xml")):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    titles.append(title)
    data[title] = {"text": cleaned_text, "links": links}

    counter += 1
    if counter != 0 and counter % 65536 == 0:
        print("Writing parsed datapack ", identif)
        with open(f"./enwiki_data/enwiki-data_{identif}.json", "w") as df:
            json.dump({"titles": titles, "data": data}, df)

        counter = 0
        titles = []
        data = {}
        identif += 1

# Write out any remaining articles that did not fill a complete 65536-article chunk.
if titles:
    print("Writing parsed datapack ", identif)
    with open(f"./enwiki_data/enwiki-data_{identif}.json", "w") as df:
        json.dump({"titles": titles, "data": data}, df)
Example #9
	# postprocessing
	# collapse runs of three or more newlines down to exactly two
	partitioned_text = re.sub("\n\n\n+", "\n\n", partitioned_text)

	for passage in partitioned_text.split("\n\n"):  # tokenize each passage separately so they stay distinct
		for sentence in tokenize.sent_tokenize(passage):  # split each passage into sentences
			formatted_text += sentence + "\n"
		formatted_text += "\n"

	formatted_text = formatted_text.replace(".\n", "\n")  # drop the trailing dot at the end of each sentence

	# blank-line runs can reappear after the edits above, so collapse them again
	formatted_text = re.sub("\n\n\n+", "\n\n", formatted_text)

	return formatted_text

if __name__ == "__main__":
	cleaner = Cleaner()
	f = open(path_to_organised_data, "w+", encoding="utf8")

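	# Clean every page and write the sentence-per-line output for non-redirect pages.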
	for title, text in tqdm(iterate(PATH_WIKI_XML)):
		text = cleaner.clean_text(text)
		cleaned_text, links = cleaner.build_links(text)

		if "REDIRECT" not in cleaned_text:
			text_in_format = process_text(cleaned_text)
			f.write(text_in_format)

	f.close()
	print("Everything is processed :)")
Example #10
import re
import json
from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate

from nltk.tokenize import sent_tokenize

database = []
index = {}

prefix = "enwiki"

cleaner = Cleaner()
for title, text in tqdm(
        iterate(f"./source/{prefix}-latest-pages-articles.xml"),
        total=21181268):
    # for title, text in tqdm(iterate(f"./source/{prefix}-latest-pages-articles.xml"), total=346229):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)

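    # Keep only the lead section: stop at the first heading ("=") or empty line.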
    passage = []
    for i in cleaned_text.split("\n"):
        try:
            if (i[0] == "="):
                break
            passage.append(i)
        except IndexError:
            break

    if (len(passage) < 4):
        continue
Example #11
# Dependencies
# pip install wiki-dump-reader
# pip install tqdm

from wiki_dump_reader import Cleaner, iterate
from tqdm import tqdm
import re

cleaner = Cleaner()
output = open('bn_wiki.txt', 'w', encoding='utf-8')
for title, text in tqdm(iterate('bnwiki-latest-pages-articles.xml')):
    text = cleaner.clean_text(text)
    cleaned_text, _ = cleaner.build_links(text)
    cleaned_text = re.sub(r'[A-Za-z]', '', cleaned_text)  # strip Latin letters, keeping only the Bengali text
    #     print(cleaned_text)
    output.write(cleaned_text + "\n")

output.close()
Example #12
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []

brk = 40000
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-20200220-pages-articles.xml')):
    #if brk<=0:
    #    break
    #brk-=1

    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)  # replace link markup with plain text and collect the links
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)


print("Post-cleaning extracted text ...")
forbidden_in = ["٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în", "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif", " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", "&nbsp;", "::", "[[", "//", ", un ora", "este un municipiu", "este o comun", "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors", "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File", "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border", "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:", "DateFormat", "text:", "orientation:", "format:", "position:", "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:", "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = ["redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase", "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole", "Limbi", ":", "«",".",";","?","!","#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă",")","}","?)","aici",".ro","-lea",";"]
Example #13
from wiki_dump_reader import Cleaner, iterate

cleaner = Cleaner()
count = 0
f = open('dumbfuck.txt', 'w')
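# Write the first few cleaned articles to a file as a quick smoke test.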
for title, text in iterate('viwiki-latest-pages-articles-multistream.xml'):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    #print(cleaned_text)
    f.write(cleaned_text)
    count += 1
    if count > 10:
        break
f.close()
i = 0
cleaned_text_string = ""

f = open(r"..\Cleaned_Corpora\cleaned_lvwiki.txt", "w", encoding="utf-8")
f.write("")
f.close()
print("Cleared file")

f = open(r"..\Cleaned_Corpora\cleaned_lvwiki.txt", "a", encoding="utf-8")

print("starting clean")

cleaner = Cleaner()

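# Accumulate cleaned page text and normalize the buffer every 1000 pages.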
for title, text in iterate(r"..\Corpora\lvwiki-latest-pages-articles.xml"):
    text = cleaner.clean_text(text)
    cleaned_text, _ = cleaner.build_links(text)
    cleaned_text_string += cleaned_text
    i += 1
    if i % 1000 == 0:

        #gets rid of uppercase abbreviations
        cleaned_text_string = re.sub('([A-ZĀČĒĢĪĶĻŅŠŪŽ]{2})+', '',
                                     cleaned_text_string)

        #lowercase the string
        cleaned_text_string = cleaned_text_string.lower()

        #perform all regex checks on it
        for old, new in RE_replacements_new:  #RE_replacements_simple works pretty much the same speed