示例#1
0
    def run(self):
        if self.has_run:
            raise StandardError('Already Run')
        self.has_run = True
        xml_parser = XMLPageParser(self.xml_file, dePage)
        counter = 0
        alert_counter = 0
        for page in xml_parser:
            try:
                counter += 1
                if counter % 1000 == 0:
                    print counter
                page.parse()
                all_alerts = page.get_all_alerts()
                if not all_alerts:
                    continue
                for alert_type, alerts in all_alerts.items():
                    for alert in alerts:
                        alert_counter += 1
                        print counter, alert_counter
                        print alert.message
                        #if alert_type is NoFTMatchAlert and alert.section_class == SubstantivTabelleSection:


#                            silben_counter += 1
#                            print silben_counter, counter
#    print alert.text
#                for alert in all_alerts.get(NoFTMatchAlert, []):
#                    print alert.title, alert.text
#                for alert in all_alerts.get(PatchRemainderAlert, []):
#                    print alert.title, alert.patched_text, '--------------', alert.text
            except:
                print 'Failed on page %s' % page.title
                print page.text
                raise
示例#2
0
 def run(self):
     if self.has_run:
         raise StandardError('Already Run')
     self.has_run = True
     xml_parser = XMLPageParser(self.xml_file, dePage)
     counter = 0
     alert_counter = 0
     f_alert_counter = 0
     st_counter = 0
     for page in xml_parser:
         try:
             counter += 1
             if counter % 1000 == 0:
                 print counter
             page.parse()
             # get wortart sections
             wortart_sections = []
             for section in page.children:
                 if isinstance(section, deLanguageSection):
                     for section2 in section.children:
                         if isinstance(section2, deWortartSection):
                             wortart_sections.append(section2)
             alerts = []
             f_alerts = []
             for section in wortart_sections:
                 word = section.get_property('word')
                 if word and not word.is_name:
                     alerts = alerts + section.get_alerts(
                         SubstantivTabelleAlert)
                     f_alerts = f_alerts + section.get_alerts(
                         FixableSubstantivTabelleAlert)
                     # Check if it has a substativ-tabelle
                     for child in section.children:
                         if isinstance(child, deWortartContentSection):
                             for grandchild in child.children:
                                 if isinstance(grandchild,
                                               SubstantivTabelleSection):
                                     st_counter += 1
             for alert in alerts:
                 alert_counter += 1
                 print counter, alert_counter, f_alert_counter, st_counter
                 print alert.message
             for alert in f_alerts:
                 f_alert_counter += 1
                 print counter, alert_counter, f_alert_counter, st_counter
                 print alert.message
         except:
             print 'Failed on page %s' % page.title
             print page.text
             raise
示例#3
0
def load_xml():
    """Load pages from the English Wiktionary XML dump into the database.

    Creates the tables registered on ``Base.metadata``, then iterates the
    dump with ``XMLPageParser`` using ``enPage`` as the page class.  A page
    is added to the session only when no ``Page`` row with the same language
    and title is found.  The session is committed on the first page, on
    every 1000th page after that, and once more when the dump is exhausted.
    """
    session = Session()
    Base.metadata.create_all(engine)
    # The context manager closes the dump even if parsing or a commit fails.
    with open(
        '../../wiktionary_data/enwiktionary-20120220-pages-meta-current.xml'
    ) as xml_file:
        xml_parser = XMLPageParser(xml_file, enPage)
        for counter, page in enumerate(xml_parser):
            existing = session.query(Page).filter(
                Page.language == page.language,
                Page.title == page.title)
            if existing.count() == 0:
                session.add(page)
            # Commit in batches so a failure loses at most 1000 pages of work.
            if counter % 1000 == 0:
                print(counter)
                session.commit()
                print('committed')
        session.commit()
示例#4
0
 def scan_xml(self, max_no=None):
     # Check title hasn't been used before
     if self.title in self.log_page.einsatz_sections:
         raise StandardError(u'title has been used before')
     # Parse xml dump
     xml_parser = XMLPageParser(self.xml_file, self.page_class)
     counter = 0
     all_counter = 0
     for page in xml_parser:
         all_counter += 1
         page.parse()
         if self.requires_approval(page):
             counter += 1
             print '%d: %d: Fixable alert for page %s' % (counter, all_counter, page.title)
             if max_no is not None and counter >= max_no:
                 break
         if all_counter % 1000 == 0:
             print all_counter
             self.memory.save()
     self.memory.save()
import sys
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser'
)
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml'
)
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.fr.page import frPage
from wiktionary_parser.languages.fr.parseText import FrParseText

xml_file = open(
    '../../../../../datasets/sense_disambiguation_datasets/frwiktionary-20161101-pages-articles-multistream.xml'
)
xml_parser = XMLPageParser(xml_file, frPage)

french_words = set(['sauter'])

#for title, page in xml_parser.from_titles(german_words):
#    found_words.add(title)
for page in xml_parser.from_titles(french_words):
    #print page.text
    parseData = FrParseText(page.text)
    print 'Title', page.title
    #parseData.view_sections()
    #print parseData.synonyms
    print parseData.verb_translations
    break
示例#6
0
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

german_words = set([u'Bank', u'Kiefer'])

# Titles yielded so far by the parser.
found_words = set([])

# Open the dump in a context manager so the handle is closed when the loop
# finishes or fails, instead of being left open.
with open('../../wiktionary_data/dewiktionary-20110504-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)
    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        # Print each word on the page, showing only the fields that are set.
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                print(word.gender)
示例#7
0
from difflib import Differ

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

xml_file = open('../../data/dewiktionary-20090913-pages-articles.xml')

xml_parser = XMLPageParser(xml_file, dePage)

counter = 0
show_num = 10
for page in xml_parser:
    page.parse()
    if page.fixable():
        print '%s FIXABLE' % page.title
        page.fix()
    new_text = page.render()
    if new_text != page.text:
        result = list(Differ().compare(page.text.split('\n'),
                                       new_text.split('\n')))
        for line in result:
            if line[0] != ' ':
                print line
        print '----------------------------------'
        #break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

xml_file = open('../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml')
xml_parser = XMLPageParser(xml_file, simplePage)

# The words we want to extract
wanted_words = set([u'fish'])

found_words = set([])

for title, page in xml_parser.from_titles(wanted_words):
    page.parse()
    # Print out a summary of the want
    for word in page.words:
        print word.summary()
    found_words.add(title)
    if wanted_words == found_words:
        break
    
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

xml_file = open(
    '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml')
xml_parser = XMLPageParser(xml_file, simplePage)

# The words we want to extract
wanted_words = set([u'fish'])

found_words = set([])

for title, page in xml_parser.from_titles(wanted_words):
    page.parse()
    # Print out a summary of the want
    for word in page.words:
        print word.summary()
    found_words.add(title)
    if wanted_words == found_words:
        break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

german_words = set([u'Bank', u'Kiefer'])

# Titles yielded so far by the parser.
found_words = set([])

# Open the dump in a context manager so the handle is closed when the loop
# finishes or fails, instead of being left open.
with open(
    '../../wiktionary_data/dewiktionary-20110504-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)
    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        # Print each word on the page, showing only the fields that are set.
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                # The header was previously printed without its value.
                print(word.gender)
# -*- coding: utf-8 -*-
"""
This example scans through the simple-english wiktionary file and displays
any alerts that the parser raises.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

from wiktionary_parser.languages.simple.alerts import UnknownType

page_count = 0
# NOTE(review): errors is never updated by the loop below.
errors = 0

# Open the dump in a context manager so the handle is closed when the loop
# finishes or fails, instead of being left open.
with open(
    '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)
    for page in xml_parser:
        page_count += 1
        page.parse()
        # NOTE(review): this constant guard switches the report off, so the
        # script currently only parses each page. Confirm whether the
        # fixable-alert report should be re-enabled.
        if False:
            fixables = page.get_fixable_alerts()
            if fixables:
                print('--------')
                print(page.title)
                print('--------')
                for alert in fixables:
                    print(alert.description)
                    print(alert.message)
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.
"""

import sys
sys.path.insert(0, '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser')
sys.path.insert(0, '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml')
from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.nl.page import nlPage
from wiktionary_parser.languages.nl.parseText import NlParseText

xml_file = open('../../../../../datasets/sense_disambiguation_datasets/nlwiktionary-20161120-pages-articles-multistream.xml')
xml_parser = XMLPageParser(xml_file, nlPage)

dutch_words = set(['springen'])


#for title, page in xml_parser.from_titles(german_words):
#    found_words.add(title)
for page in xml_parser.from_titles(dutch_words):
    print page.text
    parseData = NlParseText(page.text)
    print 'Title', page.title
    #parseData.view_sections()
    #print parseData.synonyms
    print parseData.verb_translations
    break