#!/usr/bin/env python3
from xml.dom import minidom
from html import unescape
import re

from utils import sanitize, line_rules, download

# TODO: ['−', '-']  # replace the Unicode minus sign with hyphen-minus (the minus commonly used on computer keyboards)
# TODO: sentences with different alphabets

DOWNLOAD_PATH = 'https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2'
OUTFILE = "output/wikiquote.txt"
DISCARD_FILE = 'output/discarded/wikiquote.json'

download_me = download.Download()
validate_line = line_rules.LineRules(discard_file=DISCARD_FILE)
clean_me = sanitize.Sanitization()

# Strips wiki markup: file/category links, link syntax, bold/italic quote
# markers, struck-out or commented blocks, templates, HTML tags and headings.
sub_regex = re.compile(
    r"""\[\[(File|Category):[\s\S]+\]\]|
    \[\[[^|^\]]+\||
    \[\[|\]\]|
    \'{2,5}|
    (<s>|<!--)[\s\S]+(</s>|-->)|
    (<s>|<!)[\s\S]+(</s>|>)|
    {{[\s\S\n]+?}}|
    <.*?>|
    ={1,6}""", re.VERBOSE)

# Additional [pattern, replacement] pairs used to normalize the extracted text.
normalize_rules = [
    ['*', u"\n"],
    ['<br />', u"\n"],
    ['<br>', u"\n"],
    [r"\(\d\d\d\d\)", ""],
    [r"[\(\[].*?[\)\]]", ""],
    ['AvvertenzaContattiDonazioni', ''],
#!/usr/bin/env python3
from xml.dom import minidom
import re

from utils import sanitize, line_rules, download

download_me = download.Download()
validate_line = line_rules.LineRules()
clean_me = sanitize.Sanitization()

# Download the itwikiquote dump (if not already present) and decompress it.
xml_path = download_me.if_not_exist(
    'https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2'
).bz2_decompress()

print(' Reading XML file')
mydoc = minidom.parse(xml_path)
items = mydoc.getElementsByTagName('page')

result = open('./output/wikiquote.txt', 'w')

print(' Parsing in progress')
text = ''
for elem in items:
    title = elem.getElementsByTagName("title")[0].firstChild.data
    # Skip pages whose title contains 'wiki' and the main page ('Pagina principale').
    if 'wiki' not in title and title != 'Pagina principale':
        textdom = elem.getElementsByTagName("revision")[0].getElementsByTagName("text")[0]
        if textdom.firstChild is not None:
            text = ''
            raw_text = clean_me.escapehtml(textdom.firstChild.data)
            raw_text = re.compile(r"""\[\[(File|Category):[\s\S]+\]\]|
                \[\[[^|^\]]+\||
                \[\[|\]\]|
                \'{2,5}|
                (<s>|<!--)[\s\S]+(</s>|-->)|
#!/usr/bin/env python3
from utils import sanitize, line_rules, download
from urllib import parse
import time
import os
import re

OUTFILE = "output/wikisource.txt"
PARSING = './parsing/wikisource/'
if not os.path.isdir(PARSING):
    os.mkdir(PARSING)
DISCARD_FILE = 'output/discarded/wikisource.json'
DOWNLOAD_LINK = 'https://wsexport.wmflabs.org/tool/book.php?lang=it&format=txt&page='

validate_line = line_rules.LineRules(DISCARD_FILE)
clean_me = sanitize.Sanitization()
download_me = download.Download()


def process_line(line, out_file):
    """Return early if the line is invalid; otherwise write it to the file."""
    line = re.sub("[eE]'", "è", line)  # normalize the apostrophe spelling (e'/E') to the accented è
    line = clean_me.clean_single_line(line)
    if (validate_line.is_not_valid(line) or
            len(line) <= 12 or
            line == 'creativecommons' or
            validate_line.contain(line, [
                '§', '=', '--', '~', 'wiki', 'licenses', '//', ' pp', ' Ibid', '■', '^'
            ]) or
            # line.find('/') >= 1 or  # commented out because, with the current regex, digits and brackets are always discarded
            validate_line.isbrokenparenthesis(line)