#!/usr/bin/env python3 from xml.dom import minidom import re from utils import sanitize, line_rules, download download_me = download.Download() validate_line = line_rules.LineRules() clean_me = sanitize.Sanitization() xml_path = download_me.if_not_exist('https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2').bz2_decompress() print(' Reading XML file') mydoc = minidom.parse(xml_path) items = mydoc.getElementsByTagName('page') result = open( './output/wikiquote.txt', 'w' ) print(' Parsing in progress') text = '' for elem in items: title = elem.getElementsByTagName("title")[0].firstChild.data if 'wiki' not in title and title != 'Pagina principale': textdom = elem.getElementsByTagName("revision")[0].getElementsByTagName("text")[0] if textdom.firstChild is not None: text = '' raw_text = clean_me.escapehtml(textdom.firstChild.data) raw_text = re.compile(r"""\[\[(File|Category):[\s\S]+\]\]| \[\[[^|^\]]+\|| \[\[|\]\]| \'{2,5}| (<s>|<!--)[\s\S]+(</s>|-->)|
#!/usr/bin/env python3
"""Fetch the parlaritaliano.it archives and unpack the transcription files.

The ITALIANO archive contains a nested ``ITALIANO_TRASCRIZIONI.zip`` that is
extracted into the download folder; one transcript each from the PALERMO and
ROMA archives is then copied next to the extracted files.
"""
import re
import os
import zipfile
from utils import sanitize, download

# Folder (inside the download folder) that receives the extra transcripts.
TRANSCRIPTS_DIRNAME = "ITALIANO_TRASCRIZIONI"


def _extract_member(archive_path, member, destination):
    """Write the zip member ``member`` of ``archive_path`` to ``destination``.

    The member is read before the destination is opened, so a missing member
    (``KeyError`` from ``ZipFile.read``) does not leave an empty file behind.
    The parent directory of ``destination`` is created when it does not exist.
    """
    parent = os.path.dirname(destination)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with zipfile.ZipFile(archive_path) as archive:
        payload = archive.read(member)
    with open(destination, 'wb') as out:
        out.write(payload)


# start downloading ITALIANO.ZIP
# NOTE(review): if_not_exist presumably skips the download when the file is
# already present; its result exposes .file and .folder - confirm in utils.
downloader = download.Download()
downloader = downloader.if_not_exist(
    'http://www.parlaritaliano.it/attachments/article/716/ITALIANO.zip')
with zipfile.ZipFile(downloader.file) as italiano:
    # The transcriptions live in a zip nested inside the outer archive.
    with italiano.open('ITALIANO/ITALIANO_TRASCRIZIONI.zip') as trascrizioni:
        with zipfile.ZipFile(trascrizioni) as trascrizioni_ita:
            trascrizioni_ita.extractall(path=downloader.folder)

downloader = downloader.if_not_exist(
    'http://www.parlaritaliano.it/attachments/article/644/PALERMO.zip')
_extract_member(
    downloader.file,
    'PALERMO/corpusPa/DGmtB03P.txt',
    os.path.join(downloader.folder, TRANSCRIPTS_DIRNAME, "palermo.txt"))

downloader = downloader.if_not_exist(
    'http://www.parlaritaliano.it/attachments/article/644/ROMA.zip')
_extract_member(
    downloader.file,
    'ROMA/corpusRm/DGtdB04R.txt',
    os.path.join(downloader.folder, TRANSCRIPTS_DIRNAME, "roma.txt"))
[re.compile('che\`'), u'ché'], [re.compile('e\`'), u'è'], ] # managing parse directory name parsedir = "parsing/qall/" # managing output pathname + output filename output = "output/qallme.txt" output_file = open(output, "w", encoding='utf-8') print("Qallme Importer") downloader = download.Download().if_not_exist( 'http://qallme.fbk.eu/archive/QB_IT_V1.0_TranscriptionsReferences.zip' ).zip_decompress(parsedir) ### XML ### qallmef = ET.parse( parsedir + "QB_IT_V1.0_Translations/QallmebenchmarkIT_v1.0_final-translation.xml") sentences = qallmef.findall("question/text") # We are looking for sentences, not xml elements! # turning xml elements into real sentences for s in sentences: line = s.text if line is not None: line = sanitizer.maybe_normalize(line, mapping_normalization) output_file.write(line)