def run(self):
    """Parse every page of the XML dump and print each alert that is raised.

    An ``XMLPageParser`` is built over ``self.xml_file`` with ``dePage`` as
    the page class.  Each page is parsed and, for every alert returned by
    ``page.get_all_alerts()``, a ``<pages seen> <alerts seen>`` line is
    printed followed by the alert's message.  The number of pages seen is
    additionally printed on its own every 1000 pages.

    Raises:
        RuntimeError: if ``self.has_run`` is already true.  Under Python 2
            ``RuntimeError`` is a subclass of ``StandardError``, so callers
            catching ``StandardError`` are unaffected; unlike
            ``StandardError`` the name also exists under Python 3.
        Exception: any error raised while handling a page is re-raised
            after the page's title and text have been printed.
    """
    if self.has_run:
        raise RuntimeError('Already Run')
    self.has_run = True

    xml_parser = XMLPageParser(self.xml_file, dePage)
    alert_counter = 0
    for counter, page in enumerate(xml_parser, 1):
        try:
            if counter % 1000 == 0:
                print(counter)
            page.parse()
            all_alerts = page.get_all_alerts()
            if not all_alerts:
                continue
            # all_alerts is keyed by alert class; only the alert lists are
            # needed here.  To restrict the report to a single class (for
            # example NoFTMatchAlert or PatchRemainderAlert) iterate over
            # all_alerts.get(<AlertClass>, []) instead.
            for alerts in all_alerts.values():
                for alert in alerts:
                    alert_counter += 1
                    print('%s %s' % (counter, alert_counter))
                    print(alert.message)
        except Exception:
            # Show which page broke the run before propagating the error.
            # KeyboardInterrupt/SystemExit are deliberately not intercepted.
            print('Failed on page %s' % page.title)
            print(page.text)
            raise
def run(self):
    """Scan the XML dump and print the SubstantivTabelle alerts it raises.

    For every page, the ``deWortartSection`` children of the page's
    ``deLanguageSection`` children are collected.  For each such section
    whose ``'word'`` property is set and is not a name, the section's
    ``SubstantivTabelleAlert`` and ``FixableSubstantivTabelleAlert`` alerts
    are gathered and its ``SubstantivTabelleSection`` grandchildren are
    counted.  Every alert is printed as a line of four running totals
    (pages, alerts, fixable alerts, table sections) followed by the alert's
    message.  The page count is also printed on its own every 1000 pages.

    Raises:
        RuntimeError: if ``self.has_run`` is already true.  Under Python 2
            ``RuntimeError`` is a subclass of ``StandardError``, so callers
            catching ``StandardError`` are unaffected; unlike
            ``StandardError`` the name also exists under Python 3.
        Exception: any error raised while handling a page is re-raised
            after the page's title and text have been printed.
    """
    if self.has_run:
        raise RuntimeError('Already Run')
    self.has_run = True

    xml_parser = XMLPageParser(self.xml_file, dePage)
    alert_counter = 0
    f_alert_counter = 0
    st_counter = 0
    for counter, page in enumerate(xml_parser, 1):
        try:
            if counter % 1000 == 0:
                print(counter)
            page.parse()

            # Get the wortart sections: two levels below the page, under
            # the language sections.
            wortart_sections = []
            for section in page.children:
                if isinstance(section, deLanguageSection):
                    for section2 in section.children:
                        if isinstance(section2, deWortartSection):
                            wortart_sections.append(section2)

            alerts = []
            f_alerts = []
            for section in wortart_sections:
                word = section.get_property('word')
                if word and not word.is_name:
                    alerts.extend(
                        section.get_alerts(SubstantivTabelleAlert))
                    f_alerts.extend(
                        section.get_alerts(FixableSubstantivTabelleAlert))
                    # Check if it has a substantiv-tabelle.
                    # NOTE(review): tables are counted only for sections
                    # that passed the word filter above - confirm that this
                    # is the intended scope of st_counter.
                    for child in section.children:
                        if isinstance(child, deWortartContentSection):
                            for grandchild in child.children:
                                if isinstance(grandchild,
                                              SubstantivTabelleSection):
                                    st_counter += 1

            for alert in alerts:
                alert_counter += 1
                print('%s %s %s %s' % (counter, alert_counter,
                                       f_alert_counter, st_counter))
                print(alert.message)
            for alert in f_alerts:
                f_alert_counter += 1
                print('%s %s %s %s' % (counter, alert_counter,
                                       f_alert_counter, st_counter))
                print(alert.message)
        except Exception:
            # Show which page broke the run before propagating the error.
            # KeyboardInterrupt/SystemExit are deliberately not intercepted.
            print('Failed on page %s' % page.title)
            print(page.text)
            raise
def load_xml():
    """Load pages from the English wiktionary dump into the database.

    Creates the tables described by ``Base.metadata`` on ``engine``, then
    iterates over the dump with an ``XMLPageParser`` using ``enPage`` as the
    page class.  A page is added to the session only when no ``Page`` row
    with the same language and title exists yet.  The session is committed
    every 1000 pages (including before the very first increment, when the
    counter is 0) and once more after the last page.

    The dump file is opened in a ``with`` block so that it is closed even if
    parsing or a database operation raises.
    """
    session = Session()
    Base.metadata.create_all(engine)
    dump_path = (
        '../../wiktionary_data/enwiktionary-20120220-pages-meta-current.xml')
    with open(dump_path) as xml_file:
        xml_parser = XMLPageParser(xml_file, enPage)
        counter = 0
        for page in xml_parser:
            ev = session.query(Page).filter(Page.language == page.language,
                                            Page.title == page.title)
            if ev.count() == 0:
                session.add(page)
            if counter % 1000 == 0:
                # Periodic progress report and commit.
                print(counter)
                session.commit()
                print('committed')
            counter += 1
        # Flush whatever was added since the last periodic commit.
        session.commit()
def scan_xml(self, max_no=None):
    """Scan the XML dump for pages that require approval.

    Every page produced by ``XMLPageParser(self.xml_file, self.page_class)``
    is parsed and passed to ``self.requires_approval``.  Each page for which
    that returns true is counted and reported on stdout.  ``self.memory`` is
    saved every 1000 pages and once more when the scan ends.

    Args:
        max_no: optional upper bound on the number of pages requiring
            approval; the scan stops as soon as that many have been
            counted.  ``None`` (the default) scans the whole dump.

    Raises:
        RuntimeError: if ``self.title`` is already present in
            ``self.log_page.einsatz_sections``.  Under Python 2
            ``RuntimeError`` is a subclass of ``StandardError``, so callers
            catching ``StandardError`` are unaffected; unlike
            ``StandardError`` the name also exists under Python 3.
    """
    # Check title hasn't been used before
    if self.title in self.log_page.einsatz_sections:
        raise RuntimeError(u'title has been used before')

    # Parse xml dump
    xml_parser = XMLPageParser(self.xml_file, self.page_class)
    counter = 0
    for all_counter, page in enumerate(xml_parser, 1):
        page.parse()
        if self.requires_approval(page):
            counter += 1
            print('%d: %d: Fixable alert for page %s'
                  % (counter, all_counter, page.title))
        if max_no is not None and counter >= max_no:
            break
        if all_counter % 1000 == 0:
            # Periodic progress report and checkpoint.
            print(all_counter)
            self.memory.save()
    self.memory.save()
import sys
# Make the local checkout of the parser importable.
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser'
)
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml'
)
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.fr.page import frPage
from wiktionary_parser.languages.fr.parseText import FrParseText

# Titles of the pages to look up in the dump.
french_words = set(['sauter'])

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open(
    '../../../../../datasets/sense_disambiguation_datasets/frwiktionary-20161101-pages-articles-multistream.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, frPage)
    # NOTE(review): each item yielded by from_titles() is used as a page
    # object here (it is read via .text and .title) - confirm against the
    # parser version on sys.path.
    for page in xml_parser.from_titles(french_words):
        parseData = FrParseText(page.text)
        print('Title %s' % page.title)
        # While debugging, page.text, parseData.view_sections() and
        # parseData.synonyms can be inspected here as well.
        print(parseData.verb_translations)
        # Only the first matching page is inspected.
        break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

# Titles of the pages to extract, and the titles actually encountered.
german_words = set([u'Bank', u'Kiefer'])
found_words = set([])

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open(
        '../../wiktionary_data/dewiktionary-20110504-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)
    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        # Print the title of every word on the page, followed by whichever
        # of its meanings, examples and gender are non-empty.
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                print(word.gender)
"""Show the changes the parser's automatic fixes would make to each page.

Every page of the German wiktionary dump is parsed; for each page that
reports itself as fixable, the fix is applied and the changed lines between
the original text and the re-rendered text are printed.
"""
from difflib import Differ
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

counter = 0
show_num = 10

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open('../../data/dewiktionary-20090913-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)
    for page in xml_parser:
        page.parse()
        if not page.fixable():
            continue
        print('%s FIXABLE' % page.title)
        page.fix()
        new_text = page.render()
        if new_text == page.text:
            continue
        # Differ prefixes unchanged lines with two spaces; show only the
        # lines that were added, removed or annotated.
        diff = Differ().compare(page.text.split('\n'), new_text.split('\n'))
        for line in diff:
            if line[0] != ' ':
                print(line)
        print('----------------------------------')
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.
"""
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

# The words we want to extract
wanted_words = set([u'fish'])
found_words = set([])

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open(
        '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)
    for title, page in xml_parser.from_titles(wanted_words):
        page.parse()
        # Print out a summary of each word on the page
        for word in page.words:
            print(word.summary())
        found_words.add(title)
        # Stop reading the dump once every wanted title has been seen.
        if wanted_words == found_words:
            break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.
"""
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

# The words we want to extract
wanted_words = set([u'fish'])
found_words = set([])

# Opening the dump with ``with`` guarantees the file handle is released,
# whether the loop ends normally, breaks early or raises.
with open(
        '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)
    for title, page in xml_parser.from_titles(wanted_words):
        page.parse()
        # Print out a summary of each word on the page
        for word in page.words:
            print(word.summary())
        found_words.add(title)
        if wanted_words == found_words:
            # Every wanted title has been seen; skip the rest of the dump.
            break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

# Titles of the pages to extract, and the titles actually encountered.
german_words = set([u'Bank', u'Kiefer'])
found_words = set([])

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open(
        '../../wiktionary_data/dewiktionary-20110504-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)
    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                # Print the value under its header, as is done for the
                # meanings and examples above.
                print(word.gender)
# -*- coding: utf-8 -*-
"""
This example scans through the simple-english wiktionary file and displays
any alerts that the parser raises.
"""
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage
from wiktionary_parser.languages.simple.alerts import UnknownType

# Reporting of fixable alerts is switched off by default; set this to True
# to print them for every page that has any.
SHOW_FIXABLE_ALERTS = False

page_count = 0
errors = 0

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open(
        '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)
    for page in xml_parser:
        page_count += 1
        page.parse()
        if SHOW_FIXABLE_ALERTS:
            fixables = page.get_fixable_alerts()
            if fixables:
                print('--------')
                print(page.title)
                print('--------')
                for alert in fixables:
                    print(alert.description)
                    print(alert.message)
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""
import sys
# Make the local checkout of the parser importable.
sys.path.insert(0, '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser')
sys.path.insert(0, '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml')
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.nl.page import nlPage
from wiktionary_parser.languages.nl.parseText import NlParseText

# Titles of the pages to look up in the dump.
dutch_words = set(['springen'])

# The dump is read inside a ``with`` block so the file is closed when the
# script finishes or fails.
with open('../../../../../datasets/sense_disambiguation_datasets/nlwiktionary-20161120-pages-articles-multistream.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, nlPage)
    # NOTE(review): each item yielded by from_titles() is used as a page
    # object here (it is read via .text and .title) - confirm against the
    # parser version on sys.path.
    for page in xml_parser.from_titles(dutch_words):
        print(page.text)
        parseData = NlParseText(page.text)
        print('Title %s' % page.title)
        # While debugging, parseData.view_sections() and parseData.synonyms
        # can be inspected here as well.
        print(parseData.verb_translations)
        # Only the first matching page is inspected.
        break