Exemplo n.º 1
0
def typographify(input):
    """Convert lightweight markup in *input* to HTML-style tags.

    Run from parent directory.
    >>> import os
    >>> os.chdir('typographify')
    >>> print typographify(open('typographify.txt').read())
    <strong>strong</strong> <em>words</em>
    parsed 17 chars of 17

    https://pypi.python.org/pypi/SimpleParse/
    Version 2.2
    http://www.ibm.com/developerworks/linux/library/l-simple/index.html
    https://books.google.com/books?id=GxKWdn7u4w8C&pg=PA319&lpg=PA319&dq=simpleparse+standard+input&source=bl&ots=M8x58SCzpT&sig=5DOLvoC5-TZyxxlq3_LHD68gbXY&hl=en&sa=X&ved=0ahUKEwjFjOCurKjMAhVMuYMKHaM4ATUQ6AEIMTAD#v=onepage&q=simpleparse%20standard%20input&f=false
    """
    # FIX: close the grammar file deterministically instead of leaking it.
    with open('typographify.def') as grammar_file:
        parser = Parser(grammar_file.read(), 'para')
    taglist = parser.parse(input)
    # Accumulate fragments and join once -- avoids quadratic += string growth.
    pieces = []
    for tag, beg, end, parts in taglist[1]:
        if tag == 'plain':
            pieces.append(input[beg:end])
        elif tag == 'markup':
            markup = parts[0]
            mtag, mbeg, mend = markup[:3]
            # Unknown markup tags degrade to HTML comments rather than failing.
            start, stop = codes.get(mtag, ('<!-- unknown -->','<!-- / -->'))
            # Slice off the single-character markup delimiters around the body.
            pieces.append(start + input[mbeg+1:mend-1] + stop)
    pieces.append('parsed %s chars of %s' % (taglist[-1], len(input)))
    return ''.join(pieces)
Exemplo n.º 2
0
 def parse(self, *args, **kwargs):
     """Parse VRML input, expand Inline nodes, and dump prototype classes.

     Delegates to Parser.parse, then walks every Node of the result
     breadth-first so that Inline nodes get their referenced file parsed
     and grafted in as children.  Finally writes a generated Python module
     of prototype classes to /tmp (debug aid) and returns the parse triple.
     """
     res = Parser.parse(self, *args, **kwargs)
     # Breadth-first collection: l grows while we index through it.
     l = [r for r in res[1] if isinstance(r, Node)]
     count = 0
     while count < len(l):
         l += l[count].children
         count += 1
     for e in l:
         if e.__class__.__name__ == "Inline":
             # Inline URLs are resolved relative to the scene's root path.
             url = os.path.join(self.root_path, e.url)
             # Already populated -- don't re-parse this Inline.
             if e.children[:]:
                 continue
             logger.info("Parse inlined vrml {0}".format(url))
             e.children = Parser.parse(self, open(url).read())[1]
             for child in e.children:
                 child._parent = e
     # Generate a debug module with one class per known prototype.
     code = "from parser import Node\n\n"
     for name, prototype in self.prototypes.items():
         obj = prototype()
         # Public, non-callable attributes only; 'children' is structural.
         attrs = [(key, getattr(obj, key)) for key in dir(obj)
                  if not ( key.startswith("_") or callable(getattr(obj, key))
                           or key == "children")]
         code += "class {0}({1}):\n".format(name, "object")#prototype.__bases__[0].__name__)
         #print obj, dir(obj), "\n---\n", obj._ftypes, "\n---\n",attrs
         code += "    def __init__(self):\n"
         for key, value in attrs:
             code += "        self.{0} = {1} #{2}\n".format(key, repr(value),  prototype.ftype(key))
         code += "\n"
     f = open("/tmp/robotviewer_protos.py",'w')
     f.write(code)
     f.close()
     logger.debug("internally generated foloowing classes:\n{0}".format(code))
     return res[0], res[1], res[2]
Exemplo n.º 3
0
 def testTermSharing( self ):
     """Test that shared terminal productions are using the same parser"""
     grammar = """ a := b,b >b<:= d d:= 'this'"""
     table = Parser( grammar, "a").buildTagger()
     left, right = table
     # Both references to production b must point at one shared tuple.
     assert left is right, """Not sharing the same tuple for b and c instances"""
Exemplo n.º 4
0
 def testBasic(self):
     """Run every production in parseTests against its accept/reject tables."""
     processor = dispatchprocessor.DispatchProcessor()
     setattr(processor, "string", strings.StringInterpreter())
     for production, accepted, rejected in parseTests:
         parser = Parser("x := %s" % production, "x")
         for sample in accepted:
             # Only the 'string' production is run through the interpreter.
             if production == "string":
                 outcome = parser.parse(sample, processor=processor)
             else:
                 outcome = parser.parse(sample)
             success, results, nxt = outcome
             assert success and (nxt == len(sample)), """Did not parse string %s as a %s result=%s""" % (
                 repr(sample),
                 production,
                 (success, results, nxt),
             )
             assert results, """Didn't get any results for string %s as a %s result=%s""" % (
                 repr(sample),
                 production,
                 (success, results, nxt),
             )
             if production == "string":
                 expected = eval(sample, {}, {})
                 assert results[0] == expected, (
                     """Got different interpreted value for data %s, we got %s, expected %s"""
                     % (repr(sample), repr(results[0]), repr(expected))
                 )
         for sample in rejected:
             success, results, nxt = parser.parse(sample)
             assert not success, """Parsed %s of %s as a %s result=%s""" % (
                 repr(sample),
                 production,
                 (success, results, nxt),
             )
Exemplo n.º 5
0
def main():
    oparser = get_parser()
    opts, args = oparser.parse_args()

    parser = Parser(open(opts.grammar).read(),
                    opts.root)
    success, tags, next = parser.parse( open(opts.input).read() )
    print tags
Exemplo n.º 6
0
 def debugparser(self, filename):
     """Parse *filename* with a throwaway parser and pretty-print the raw
     tag tree, then terminate the process.

     Purely an interactive grammar-debugging aid; exit(0) never returns.
     """
     # FIX: 'file' shadowed the builtin and the handle was never closed;
     # the local also shadowed this method's own name.
     with open(filename) as source:
         content = source.read()
     parser = Parser(self.declaration)
     import pprint
     info("started debug parsing")
     pprint.pprint(parser.parse(content))
     info("completed debug parsing")
     exit(0)
 def testTZ( self ):
     """Parse every known timezone name and check each interpreted value."""
     names = list(timezone_names.timezone_mapping.keys())
     names.sort() # tests that the items don't match shorter versions...
     decl = Parser("""this := (timezone_name, ' '?)+""", 'this')
     proc = dispatchprocessor.DispatchProcessor()
     proc.timezone_name = timezone_names.TimeZoneNameInterpreter()
     text = ' '.join(names)
     success, result, next = decl.parse( text, processor = proc )
     # Results must line up 1:1 with the sorted names' mapped values.
     assert success, """Unable to complete parsing the timezone names, stopped parsing at char %s %s"""%(next, text[next:])
     assert result == list(map( timezone_names.timezone_mapping.get, names)), """Got different results for interpretation than expected (expected first, recieved second)\n%s\n%s"""%(list(map( timezone_names.timezone_mapping.get, names)), result)
Exemplo n.º 8
0
	def __init__(self, filename):
		"""Load configuration from *filename* and expose items as attributes.

		The file is parsed with the module-level grammar ``declaration``;
		every key/value produced by ConfigProcessor becomes an instance
		attribute.  Raises Exception if the file does not match the grammar.
		"""
		with open(filename) as f:
			content = f.read()

		parser = Parser(declaration)
		success, tree, nextChar =  parser.parse(content, processor=ConfigProcessor(self))
		if not success:
			# FIX: a bare ``raise Exception`` gave no hint what went wrong.
			raise Exception("%s: config syntax error near char %d" % (filename, nextChar))

		for k, v in tree[0].iteritems():
			setattr(self, k, v)
Exemplo n.º 9
0
 def testBasic( self ):
     """Check each production against its accept and reject sample tables."""
     for production, good, bad in parseTests:
         parser = Parser( "x := %s"%production, 'x')
         for sample in good:
             success, results, nxt = parser.parse( sample)
             assert success and (nxt == len(sample)), """Did not parse comment %s as a %s result=%s"""%( repr(sample), production, (success, results, nxt))
             assert results, """Didn't get any results for comment %s as a %s result=%s"""%( repr(sample), production, (success, results, nxt))
         for sample in bad:
             success, results, nxt = parser.parse( sample)
             assert not success, """Parsed %s of %s as a %s result=%s"""%(
                 nxt, repr(sample), production, results
             )
Exemplo n.º 10
0
    def testTermCompression( self ):
        """Test that unreported productions are compressed

        Term compression is basically an inlining of terminal
        expressions into the calling table.  At the moment
        the terminal expressions are all duplicated, which may
        balloon the size of the grammar, not sure if this will
        be an actual problem.  As written, this optimization
        should provide a significant speed up, but there may
        the even more of a speed up if we allow for sharing
        the terminal tuples as well.

        This:
            a:=b <b>:= -c* c:='this'
        Should eventually compress to this:
            a := -'this'*
        """
        failures = []
        # Each pair is (verbose grammar, expected-equivalent compressed grammar);
        # both are built for root production "a" and must yield identical tables.
        for first, second in [
            ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
            ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
            ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
            ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
            # The following will never work, so eventually may raise
            # an error or at least give a warning!
            ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
            ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
            # This is requiring group-compression, which isn't yet written
            ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
            ("""a    := (table1 / table2 / any_line)*
  <any_line> := ANY*, EOL
  <ANY>      := -EOL
  <EOL>      := '\n'
  table1 := 'a'
  table2 := 'b'
  """, """a    := (table1 / table2 / (-'\n'*, '\n'))*
    table1 := 'a'
  table2 := 'b'
"""),
            ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
            
        ]:
            pFirst = Parser( first, "a")
            pSecond = Parser( second, "a")
            tFirst = pFirst.buildTagger()
            tSecond = pSecond.buildTagger()
            # rcmp compares the two generated tag-tables (helper defined elsewhere).
            if not rcmp( tFirst , tSecond):
                tFirstRepr = pprint.pformat(tFirst)
                tSecondRepr = pprint.pformat(tSecond)
                failures.append( """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""%locals())
        if failures:
            raise ValueError( "\n".join(failures))
Exemplo n.º 11
0
 def __init__(self):
     """Build two parsers over the same arithmetic-expression grammar.

     fun_parser recognises a whole call ``name(expr, ...)``; exp_parser
     recognises a bare expression.  Operator precedence falls out of the
     grammar nesting: +/- (pm_exp) above */ (md_exp), brackets innermost.
     """
     declaration = r'''
     fun             :=  fun_name,'(',exp_list,')'
     fun_name        :=  [a-zA-Z0-9_-]+
     exp_list        :=  exp,(',',exp)*
     exp             :=  pm_exp
     pm_exp          :=  md_exp,('+'/'-',md_exp)*
     md_exp          :=  bracket_exp,('*'/'/',bracket_exp)*
     bracket_exp     :=  ('(',exp,')')/str_var/number
     str_var         :=  [a-zA-Z],[a-zA-Z0-9_-\\.]*
     '''
     self.fun_parser = Parser( declaration, "fun" )
     self.exp_parser = Parser( declaration, "exp" )
Exemplo n.º 12
0
 def testBasic( self ):
     """Run each production's processor over accept/reject sample tables.

     Accepted samples must consume exactly ``length`` characters and the
     interpreted first result must equal ``value``; rejected samples must
     fail outright.
     """
     for production, processor, yestable, notable in _data:
         p = Parser( "x := %s"%production, 'x')
         proc = dispatchprocessor.DispatchProcessor()
         setattr(proc, production, processor())
         for data, length, value in yestable:
             success, results, next = p.parse( data, processor = proc)
             assert next == length, """Did not parse string %s of %s as a %s result=%s"""%( repr(data[:length]), repr(data), production, (success, results, next))
             assert results[0] == value, """Didn't get expected value from processing value %s, expected %s, got %s"""%( data[:length], value, results[0])

         for data in notable:
             success, results, next = p.parse( data)
             # FIX: the failure message previously sliced with ``length`` left
             # over from the *previous* loop (NameError when yestable is empty);
             # report the whole rejected sample instead.
             assert not success, """Parsed %s as a %s result=%s"""%( repr(data), production, (success, results, next))
Exemplo n.º 13
0
 def __init__(self, grammar = None, root_node = "vrmlScene"):
     """Create a VRML parser; defaults to the bundled vrml.sbnf grammar.

     Also pre-parses standard_nodes.wrl (shipped next to this module) so
     the standard prototypes are registered before user scenes are loaded.
     """
     path = os.path.abspath(os.path.dirname(__file__))
     if not grammar:
         grammar_file = os.path.join(path,"vrml.sbnf" )
         # print("Using grammar {0}".format(grammar_file))
         grammar = open(grammar_file).read()
     #logger.info("Grammar: {0}".format(grammar))
     Parser.__init__(self, grammar, root_node)
     logging.debug("created parser instance")
     self.root_path = ""
     self.prototypes = {}
     # Parse the VRML 2.0 standard node definitions to seed self.prototypes.
     spec_data = open(os.path.join(path, 'standard_nodes.wrl')).read()
     self.parse(spec_data)
     logging.debug("Parsed vrml2.0 specs")
Exemplo n.º 14
0
def main():
    """Load TNP airspace data into the freenav database.

    Usage: prog [-h] [-n] airspace_file
    -h prints usage and exits; -n sets the (otherwise unused) nav flag.
    Exits with status 2 on bad arguments, 1 on a TNP syntax error.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Get any options
    navFlag = False
    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit()
        if o == '-n':
            navFlag = True

    # Get the input filename
    if len(args) != 1:
        usage()
        sys.exit(2)
    else:
        filename = args[0]

    # Initialise data base
    db = freenav.freedb.Freedb()
    db.delete_airspace()

    # Initialise parser
    parser = Parser(tnp.TNP_DECL, 'tnp_file')
    p = db.get_projection()
    proj = freenav.projection.Lambert(p['parallel1'], p['parallel2'],
                                      p['latitude'], p['longitude'])
    output_processor = AirProcessor(db, proj)
    tnp_processor = tnp.TnpProcessor(output_processor)

    # Read data and parse.  FIX: close the input file instead of leaking it.
    with open(filename) as airspace_file:
        airdata = airspace_file.read()
    success, parse_result, next_char = parser.parse(airdata,
                                                    processor=tnp_processor)

    # Report any syntax errors
    if not (success and next_char==len(airdata)):
        print "%s: Syntax error at (or near) line %d" % \
            (filename, len(airdata[:next_char].splitlines())+1)
        sys.exit(1)

    # Create indices and tidy up
    db.commit()
    db.vacuum()
Exemplo n.º 15
0
	def _testSet( self, set, singleName, multiName ):
		"""Test multi-line definitions"""
		# Build one grammar with a single-char and a multi-char production
		# over the character class *set*; check membership both ways.
		decl = """single := %s multiple := %s"""%( singleName, multiName )
		p = Parser(decl)
		# notset = complement of *set* (Python 2 string.translate deletion form).
		notset = string.translate( fulltrans, fulltrans, set )
		for char in set:
			success, children, next = p.parse( char, singleName)
			assert success and (next == 1), """Parser for %s couldn't parse %s"""%( singleName, char )
		for char in notset:
			success, children, next = p.parse( char, singleName)
			assert (not success) and (next == 0), """Parser for %s parsed %s"""%( singleName, char )
			success, children, next = p.parse( char, multiName)
			assert (not success) and (next == 0), """Parser for %s parsed %s"""%( multiName, char )
		success, children, next = p.parse( set, multiName)
		assert success and (next == len(set)), """Parser for %s couldn't parse full set of chars, failed at %s"""%( multiName, set[next:] )
Exemplo n.º 16
0
 def testISODate( self ):
     """Test the parsing of ISO date and time formats"""
     # Pairs of (input text, expected mx.DateTime value); the last case
     # subtracts tzOffset to account for the explicit +00:00 offset.
     values = [
         ("2002-02-03", DateTime.DateTime( 2002, 2,3)),
         ("2002-02",DateTime.DateTime( 2002, 2)),
         ("2002",DateTime.DateTime( 2002)),
         ("2002-02-03T04:15", DateTime.DateTime( 2002, 2,3, 4,15)),
         ("2002-02-03T04:15:16", DateTime.DateTime( 2002, 2,3, 4,15, 16)),
         ("2002-02-03T04:15:16+00:00", DateTime.DateTime( 2002, 2,3, 4,15, 16)-tzOffset),
     ]
     p = Parser ("d:= ISO_date_time", "d")
     proc = iso_date.MxInterpreter()
     for to_parse, date in values:
         success, children, next = p.parse( to_parse, processor=proc)
         # Must succeed, consume the whole string, and yield the exact date.
         assert success, """Unable to parse any of the string %s with the ISO date-time parser"""% (to_parse)
         assert next == len(to_parse),"""Did not finish parsing string %s with the ISO date-time parser, remainder was %s, found was %s"""%( to_parse, to_parse [next:],children)
         assert children [0] == date,"""Returned different date for string %s than expected, got %s, expected %s"""% (to_parse,children [0], date)
Exemplo n.º 17
0
def convert(input, definition = 'compilation_unit'):
    """
    Example of converting syntax from ActionScript to C#.

    >>> print(convert('import com.finegamedesign.anagram.Model;', 'import_definition'))
    using /*<com>*/Finegamedesign.Anagram/*<Model>*/;

    Related to grammar unit testing specification (gUnit)
    https://theantlrguy.atlassian.net/wiki/display/ANTLR3/gUnit+-+Grammar+Unit+Testing
    """
    source = cfg['source']
    to = cfg['to']
    parser = Parser(grammars[source], definition)
    # may_import is applied both before parsing and after conversion --
    # presumably import-statement handling; helper defined elsewhere.
    input = may_import(None, input, definition, to)
    taglist = parser.parse(input)
    # Wrap the results as one synthetic root tag spanning the parsed region
    # (taglist[-1] is the number of characters consumed).
    taglist = [(definition, 0, taglist[-1], taglist[1])]
    text = _recurse_tags(taglist, input, source, to)
    text = may_import(taglist, text, definition, to)
    text = may_format(definition, text)
    return text
Exemplo n.º 18
0
Arquivo: calc.py Projeto: maffe/anna
 def parse(self, txt, *args, **kwargs):
     """Parse *txt*; return (True, value) on a full parse, else (False, 0.0)."""
     # Easter egg.
     if txt == "2+2":
         return (True, 5)
     try:
         success, children, next = Parser.parse(self, txt, *args, **kwargs)
     except ParserSyntaxError:
         return (False, 0.0)
     # Only a parse that consumed the whole input counts as success.
     if success and next == len(txt):
         return (True, children[0])
     return (False, 0.0)
Exemplo n.º 19
0
class GeneratorAPI1:
    """Compatibility shim for SimpleParse 1.0 applications.

    The 1.0 generator API exposed a single interesting entry point,
    parserbyname; everything else was internal (and now lives in
    simpleparsegrammar.py).
    """

    def __init__( self, production, prebuilt=() ):
        from simpleparse.parser import Parser

        self.parser = Parser( production, prebuilts=prebuilt )

    def parserbyname( self, name ):
        """Retrieve a tag-table by production name"""
        return self.parser.buildTagger( name )
Exemplo n.º 20
0
class Compiler:
    """Compile a textual command into an executable Python code object.

    The command is parsed with the module-level ``grammar``; the resulting
    syntax tree is translated to Python source by SyntaxTreeProcessor and
    handed to the builtin compile().
    """

    def __init__(self):
        self.parser = Parser(grammar)
        self.translator = SyntaxTreeProcessor()

    def compile(self, command):
        """Strip whitespace from *command*, parse, translate, and compile it."""
        # FIX: '\s' relied on Python passing unknown escapes through, which is
        # deprecated; r'\s' is the correct regex spelling (same behaviour).
        cmd = re.sub(r'\s', '', command)

        (success, children, nextchar) = self.parser.parse(cmd)
        result = self.translator((success, children, nextchar), cmd)
        # The translator's payload is the generated Python source text.
        python_src = result[1][0]

        return compile(python_src, '', 'exec')
Exemplo n.º 21
0
	def __init__(self, *args):
		"""Build the legal-reference parser for the requested reference types.

		*args* selects which reference types are enabled (self.LAGRUM,
		self.KORTALAGRUM, self.FORARBETEN); each one loads its EBNF
		fragment and registers a URI formatter for its productions.
		"""
		scriptDir = os.getcwd()

		self.graph = Graph()
		n3File = Util.relpath(scriptDir + '/etc/sfs-extra.n3')
		self.graph.load(n3File, format='n3')

		self.roots = []
		self.uriFormatter = {}
		self.decl = ''
		self.namedLaws = {}
		self.loadEbnf(scriptDir + '/etc/base.ebnf')
		self.args = args
		
		if self.LAGRUM in args:
			prods = self.loadEbnf(scriptDir + '/etc/lagrum.ebnf')
			for p in prods: 
				self.uriFormatter[p] = self.sfsFormatUri
			self.namedLaws.update(self.getRelationship(RDFS.label))
			self.roots.append('sfsrefs')
			self.roots.append('sfsref')

		if self.KORTALAGRUM in args:
			# TODO: Fix korta lagrum also
			pass

		if self.FORARBETEN in args:
			prods = self.loadEbnf(scriptDir + '/etc/forarbeten.ebnf')
			for p in prods:
				self.uriFormatter[p] = self.forarbeteFormatUri
			self.roots.append('forarbeteref')

		# Root production tries each enabled reference type, falling back to 'plain'.
		self.decl += 'root ::= (%s/plain)+\n' % '/'.join(self.roots)
		self.parser = Parser(self.decl, 'root')
		self.tagger = self.parser.buildTagger('root')
		self.depth 	= 0

		#SFS specific settings
		self.currentLaw 		= None
		self.currentChapter 	= None
		self.currentSection 	= None
		self.currentPiece		= None
		self.lastLaw			= None
		self.currentNamedLaws	= {}
Exemplo n.º 22
0
# Sample filter expressions: a mix of valid queries and deliberately
# malformed ones (unbalanced parens, empty groups) to exercise the grammar.
testdata.append('cn=="Klaus Müller" AND (dob <= dob2) OR (dob == 2000) AND (gender != "M")')
testdata.append('  (  (  (  test  ==  (  test  )  )  )  )  ')
testdata.append('peter == "test"')
testdata.append('cn="Klaus Müller" AND (dob < dob2 OR dob = 2000) AND gender != "M"')
testdata.append('test')
testdata.append('(test)')
testdata.append('((test))')
testdata.append('test==test')
testdata.append('()')
testdata.append('(test')
testdata.append('cn="Klaus Müller" AND (dob < 1975 OR dob = 2000) AND gender != "M"')



from simpleparse.parser import Parser
import pprint

parser = Parser(declaration, "FILTER")
if __name__ =="__main__":
    # A parse only counts as OK when it consumed the whole input string.
    for entry in testdata: 
        res = parser.parse(entry);

        print '-' * 90
        if(res[2] != len(entry)): 
            print "FAILED: ", entry
            pprint.pprint(parser.parse(entry))
            print len(entry), res[2]
        else:
            print "OK: ", entry

Exemplo n.º 23
0
 def testEOFFail( self ):
     """EOF must not match while unconsumed input remains after the literal."""
     parser = Parser( """this := 'a',EOF""", 'this')
     result = parser.parse( 'a ' )
     assert not result[0], """EOF matched before end of string"""
Exemplo n.º 24
0
 def parse(self, definition, parserName, testValue, source):
     """Parse *testValue* against production *parserName* of *definition*,
     dispatching results through the *source* processor."""
     parser = Parser(definition)
     return parser.parse(testValue,
                         production=parserName,
                         processor=source)
Exemplo n.º 25
0
def format_taglist(input, definition):
    """Parse *input* with the configured source grammar and pretty-format the result."""
    grammar = grammars[cfg['source']]
    taglist = Parser(grammar, definition).parse(input)
    return pformat(taglist)
Exemplo n.º 26
0
class EBNFSpill(object):
    """Random-sample generator for SimpleParse EBNF tag-tables.

    Given a compiled tag-table (from Parser.buildTagger), walk() yields
    its nodes and generate() emits random text matching them.  Runaway
    recursion is bounded both per spawned EBNFSpill object and per walk.
    NOTE: Python 2 code (print statements, basestring, has_key, iteritems,
    ``except X, e`` syntax).
    """

    # Upper bounds for random repetition counts and recursion depths.
    DEFAULT_MAX_TIMES_CHAR = 35
    DEFAULT_MAX_TIMES_FUNC = 10
    DEFAULT_MAX_SELF_RECURSION = 25
    DEFAULT_MAX_WALK_RECURSION = 100

    def __init__(self,showTags=False,showTagsRecursive=False,recursionLevel=0):
        # recursionLevel counts nested EBNFSpill objects spawned during eval().
        self._reset()
        self.showTags=showTags
        self.showTagsRecursive=showTagsRecursive
        self.recursionLevelObj=recursionLevel

        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION: raise Exception("a")
        #print "INIT",recursionLevel
        pass

    def __del__(self):
        self.recursionLevelObj-=1
        pass

    def validate(self,data):
        """Parse *data* with the parser from setDeclaration (sanity check)."""
        return self.parser.parse(data)

    def setDeclaration(self,declaration,production):
        """Compile *declaration* and cache the tag-table for *production*."""
        self.parser = Parser(declaration, production)
        self.table =  self.parser.buildTagger(production=production)

    def setTable(self,table,nodes=None):
        """Install an already-built tag-table (used by recursive sub-spills)."""
        self.table = table
        self.nodes=nodes or self.nodes

    def _reset(self):
        # Clear node registry, context stack and walk depth; reseed the RNG.
        self.nodes = {}
        self.ctx = []       # context (infos like recurion for table2)
        #self.recursionLevelObj=0
        self.recursionLevelWalk=0
        random.seed()

    def setDefaults(self,**kwargs):
        """Override DEFAULT_* limits by keyword; rejects unknown names."""
        valid_defaults = [i for i in dir(self) if i.startswith("DEFAULT_")]
        for k,v in kwargs.iteritems():
            if k in valid_defaults:
                setattr(self,k,v)
            else:
                raise Exception("Not allowed to change %s to %s (valid options: %s)"%(k,v,valid_defaults))

    def getTable(self):
        return self.table

    def getTagName(self,node):
        # Optional "<tag>" prefix for generated output when showTags is on.
        if self.showTags and node[0]:
            return "<%s>"%node[0]
        return ""

    def checkTypeIterable(self,l):
        # Iterable but not a plain/unicode string.
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring)
    def checkTypeIterableRecursive(self,l):
        # Shape of a recursive table reference: tuple whose head is (list, int, ...).
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring) and isinstance(l,tuple) and isinstance(l[0],list) and isinstance(l[1],int)

    def checkTypeNodeBase(self,l):
        #checks ( None|str, int, *)
        return self.checkTypeIterable(l) and len(l)>=2 and (l[0]==None or isinstance(l[0],basestring)) and isinstance(l[1],int)
    def checkTypeNodeWithChilds(self,l):
        #print "check_",str(l)[:50]
        try:
            #print "check_metric",checkTypeNodeBase(l),len(l)>=3 , checkTypeIterable(l[2])
            pass
        except:
            pass
        return self.checkTypeNodeBase(l) and len(l)>=3 and self.checkTypeIterable(l[2])

    def next(self):
        # Stub -- does nothing.
        return

    def rndTimesFunc(self,sample_func,args,minlen=0,maxlen=None):
        """Concatenate sample_func(args) a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_FUNC
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample_func(args)
        return out

    def rndTimes(self,sample,minlen=0,maxlen=None):
        """Repeat the literal *sample* a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample
        return out

    def rndSelect(self,haystack,sample_len=1,minlen=0,maxlen=None):
        """Draw random characters from *haystack*, a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out += "".join(random.sample(haystack,sample_len))

        return out

    def eval(self,node):
        """Turn one walked node into a random text fragment.

        Dispatches on the Tdef.MATCH_* opcode in node[1]; table-typed
        opcodes recurse by spawning a child EBNFSpill on the same table.
        """
        # different lenght commandos
        #print node
        #print id(node),node
        #if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION or self.recursionLevelWalk>self.DEFAULT_MAX_WALK_RECURSION:
        #    return "<recursion_exception>"

        if not node:
            return ""

        if len(node)<3:
            raise Exception( "<3 - %s"%repr(node) )          #this is an error!

        elif node[1]==Tdef.MATCH_RECURSION_EXCEPTION:
            return "<<"

        elif node[1]==Tdef.MATCH_RECURSION:
            # create a new EBNFSpill object, and resolv this one?
            #print node[2],self.nodes[node[2]]
            self.recursionLevelObj+=1
            try:
                x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                x.setTable(self.table)
                recr_node=self.nodes[node[2]]
            except:
                return ""

            #
            #print "REKR",node
            #print "REKR2",self.nodes
            #print "<DAMN_RECURSION %s wild=%s>"%(node[2],self.ctx)
            #return "<RECURSION"

            #print "EXCEPT:",node[2],self.nodes
            #return self.rndTimes(x.generate(recr_node['obj']), 0, 3)
            return self.getTagName(node)+x.generate(recr_node)

        # single words/selections
        elif len(node)==3:
            if node[1]==Tdef.MATCH_WORD or node[1]==Tdef.MATCH_IS:
                return self.getTagName(node)+node[2]
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2],minlen=1,maxlen=1)
            elif node[1]==Tdef.MATCH_TABLE:
                # (xyz,MATCH_TABLE, <table>, 1)  == exact 1
                # (xyz,MATCH_TABLE, <table>, 2,1)  == *
                return self.getTagName(node)+""
                #return "<TABLE: %s>"%node[0]

        # mostly recursive ones
        elif len(node)>3:
            # recursions and stuff
            if node[1]==Tdef.MATCH_IS or node[1]==Tdef.MATCH_IS:
                # like (none,"MATCH_IS",'c',1,0) - choose zero or xx times
                return self.getTagName(node)+self.rndTimes(node[2])
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2])

            elif node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                # (xyz,MATCH_TABLE, <table>, 1)  == exact 1
                # (xyz,MATCH_TABLE, <table>, 2,1)  == *
                self.recursionLevelObj+=1
                try:
                    x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                except:
                    return ""
                x.setTable(self.table)
                #print "<TABLE: %s | %s  || %s || nodeid:%s>"%(node[0:1],node[3],self.ctx,id(node[3]))
                #print node[2]
                #return self.getTagName(node)+""
                return self.getTagName(node)+self.rndTimesFunc(x.generate,(node[2]))


        return self.getTagName(node)

    def generate(self,node=None):
        """Walk the table (or *node*) and concatenate eval() of every node."""
        out = ""
        for n in self.walk(node):
            #print n
            #print self.recursionLevelObj,self.recursionLevelWalk

            out+= self.eval(n)

        return out

    def process(self,l):
        # Replace the numeric opcode with its symbolic name (debug helper).
        if self.checkTypeNodeBase(l):
            return (l[0],Tdef().toName()[l[1]])+l[2:]
        return l

    def _checkRecursion(self,node):
        """Raise StopRecursionException if *node* was already visited."""
        # return boolean if boolean=True
        nID = id(node)

        #print "-->",nID, " NODE ",node
        if self.nodes.has_key(nID):
            raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
        #self.nodeIDs.append(nID)
        #print nID,node
        return node

    def _trackNode(self,node,nodeID=None):
        """Remember *node* by id so recursion back into it can be detected."""
        nID = nodeID or id(node)
        #print node
        if self.checkTypeNodeBase(node):
            #print "ISIN1",Tdef.MATCH_CALL,Tdef.MATCH_SUBTABLEINLIST,node[1],node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST

            if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                #print "ISIN2"
                #print "--- add BASE",id(node),node
                self.nodes[nID]=node
        elif self.checkTypeIterable(node):
            #print "--- add LIST",id(node),node
            self.nodes[nID]=node
        return node

    def _pushLevel(self,node):
        # add one level. . to check recursion space
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST  \
          and len(node)>3 and node[3]==2:
            #print "push__"
            self.ctx.append(id(node))
        return node

    def _popLevel(self,node):
        # Inverse of _pushLevel; only pops for the same table-opcode shape.
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST  \
          and len(node)>3 and node[3]==2:
            #print "pop___"
            return self.ctx.pop()
        return node

    def walk(self,table=None):
        """Walk *table* (default: the installed one) and reset state afterwards."""
        table=table or self.table
        if not table: raise Exception("EBNF TagTable not set, please generate [.setDeclaration()] or set one [.setTable()]")       #must not be !NONE!, please .setDeclaration() first!

        retn =  self._walk(table)
        self._reset()
        return retn

    def _walk(self,l):
        """Yield nodes of tag-table *l* depth-first, guarding against recursion."""
        # check if (None|basestring, int, ... ) > 2
        #import time
        #time.sleep(0.8)
        #print "BEGIN",str(l)[:50]
        #recursion check
        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION or self.recursionLevelWalk>self.DEFAULT_MAX_WALK_RECURSION:
            #print self.recursionLevelWalk
            #print self.recursionLevelObj
            #nID=
            #raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
            #print self.nodes
            #yield l
            #print "StopIter",l
            #print self.recursionLevelObj,self.recursionLevelWalk
            # NOTE(review): raising StopIteration inside a generator is
            # Python-2-only behaviour (PEP 479 makes it a RuntimeError).
            raise StopIteration("HMM")
            #yield (None,Tdef.MATCH_RECURSION_EXCEPTION,())
            #raise StopRecursionException(("[RECURSION_EXCEPTION_LEVEL_REACHED]",Tdef.MATCH_RECURSION_EXCEPTION,None))
        self.recursionLevelWalk+=1
        #print id(l),len(l),l


        try:
            if self.checkTypeNodeWithChilds(l):
                #print "Childs"
                self._checkRecursion(l)
                yield self._trackNode(l)
                self._pushLevel(l)
                for e in self._walk(l[2]):
                    yield e
                self._popLevel(l)

            elif self.checkTypeNodeBase(l):
                #print "Base"
                self._checkRecursion(l)
                yield self._trackNode(l)

            elif self.checkTypeIterableRecursive(l):
                #print "xxx",l[0][0]
                nID=id(l[0][0])
                #print "IterReck"
                #print '[RECURSION of Node=%s]'%nID
                #TODO: does not work
                #fixme: does not work - recurses too much
                raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))

            elif self.checkTypeIterable(l):
                #print "list"
                self._checkRecursion(l)
                self._trackNode(l)              # checkTypeIterableRecursive refs one of these nodes :( // damn need to reparse if this doesnt work out
                #self._pushLevel(l)
                for e in l:
                    self._pushLevel(e)
                    for x in self._walk(e):
                        yield x             #do not check recursion here.. this is not what we want
                    self._popLevel(e)
                #self._popLevel(l)
            else:
                self._checkRecursion(l)
                print "Elem? - ",l
                #print self.checkTypeNodeWithChilds(l),self.checkTypeNodeBase(l),self.checkTypeIterable(l)
                yield self._trackNode(l)

        except StopRecursionException, e:
            #print self.nodes[e.getObj()[2]]
            #print "Except:",e.getObj()
            yield e.getObj()


        self.recursionLevelWalk-=1
Exemplo n.º 27
0
seq_group            :=  ts, element_token, (ts, element_token)+, ts

element_token       :=  (optional_element / base_element), repetition?

repetition          := ('*'/'+')
optional_element    := '[',fo_group,']'
>base_element<      := (range/string/group/name)
<fo_indicator>      :=  '|'

name                :=  [a-zA-Z_],[a-zA-Z0-9_]*
<ts>                :=  (
	('\n', ?-name) /
	[ \011]+ /
	comment
)*
comment             :=  '#',-'\n'+,'\n'

range               :=  string, ts, '...', ts, string

"""
from simpleparse.parser import Parser
from simpleparse.common import strings

# Module-level parser built from the grammar declaration above.
parser = Parser( declaration )
if __name__ == "__main__":
    from simpleparse.stt.TextTools import print_tags
    # Read the sample grammar with a context manager so the file handle is
    # closed even if parsing below raises (the original open(...).read()
    # leaked the handle).
    with open("py_grammar.txt") as handle:
        grammar = handle.read()
    success, result, next = parser.parse( grammar, 'declarationset')
    print( 'success', success, next )
    print_tags( grammar, result )
Exemplo n.º 28
0
    '''make the config tuple, and adds them to config'''
    global config, text, lastItem, section_name
    if tag == 'section_name':
        section_name = text[start:end]
    elif tag == 'item':
        lastItem = text[start:end]
    elif tag == 'value':
        config.append((lastItem, text[start:end]))

def travel(root, func):
    """Pre-order walk of a ``(tag, start, end, children)`` tuple tree.

    Calls ``func(tag, start, end)`` on the root, then recurses into each
    child in order.  A ``None`` root or a ``None`` children list is a leaf.
    """
    # Idiom fix: compare against None with `is` / `is not`, not `==` / `!=`.
    if root is None:
        return

    tag, start, end, children = root
    func(tag, start, end)

    if children is not None:
        for item in children:
            travel(item, func)

if __name__ =="__main__":
    # Parse the whole config text (module global `text`) with the grammar's
    # "file" production.
    parser = Parser( declaration, "file" )
    success, resultTrees, nextChar = parser.parse(text)

    # Walk every section tree; config_maker fills the module-level `config`
    # list and sets `section_name` as a side effect of each visit.
    output = {}
    for section in resultTrees:
        config = []
        travel(section, config_maker)
        output[section_name] = config

    pprint.pprint(output)

Exemplo n.º 29
0
        print offset * ' ', '->',
        isFisrtBlock = True
    elif tag == 'block':
        print "%s%s" % ('' if isFisrtBlock else (maxTagLen + 3) * ' ' + '| ', text[start:end])
        isFisrtBlock = False

def travel(root, func):
    """Pre-order traversal of a ``(tag, start, end, children)`` tuple tree,
    invoking ``func(tag, start, end)`` for every node.

    ``None`` (as the root or inside a children list) terminates recursion.
    """
    # Idiom fix: None comparisons use `is` / `is not` instead of `==` / `!=`.
    if root is None:
        return

    tag, start, end, children = root
    func(tag, start, end)

    if children is not None:
        for item in children:
            travel(item, func)

if __name__ =="__main__":
    inFile = open("2.txt")
    text = ""
    for line in inFile.readlines():
        text += line + "\n"

    parser = Parser( declaration, "file" )
    success, resultTrees, nextChar = parser.parse(text)
    #pprint.pprint(resultTrees) 

    for item in resultTrees: travel(item, counter)
    print maxTagLen
    for item in resultTrees: travel(item, printer)
    

Exemplo n.º 30
0
plusset   := '+',(set/atom), (set/atom)
atom      := -[+*]

>interesting< := (example8/example7/example6/example5/example4/example3/example2/example1)
example1     := '*+',(set/atom),(set/atom),'+',(set/atom),(set/atom)
example2     := '**',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example3     := 'fsd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example4     := 'm*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example5     := 'a*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example6     := 's*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example7     := 'bdf*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example8     := 'sd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
'''
import sys
from simpleparse.parser import Parser
parser = Parser(declaration, 'set')


class Emitter:
    """Drive the module-level parser and feed the tree to ``self.emit``.

    Concrete emitters are expected to supply an ``emit`` method; this base
    class only performs parsing and top-level rewrapping.
    """

    def process(self, data):
        """Parse *data*, rewrap the top-level result, and emit it."""
        raw = self.parse(data)
        # Rewrap the tuple because TextTools uses a different format for
        # the top level: (success, children, next) -> (tag, 0, next, children).
        wrapped = ('set', 0, raw[-1], raw[1])
        return self.emit(wrapped)

    def parse(self, data):
        """Remember *data* on the instance and run the shared parser on it."""
        self.data = data
        return parser.parse(data)
Exemplo n.º 31
0
value                := simple_value  / (simple_value, (tb,'#', tb, simple_value)+)
>simple_value<       :=  string / number / name
alpha_name           := [a-zA-Z]+
name                 := []-[a-z_A-Z!$&+./:;<>?^`|'] , []-[a-z_A-Z0-9!$&+./:;<>?^`|']*
number               :=  [0-9]+ / ([[0-9]+, tb, [-]+, tb, [0-9]+)
string               :=  ('\"' , quotes_string?, '\"') / ('{' , braces_string?, '}')
<braces_string>      := (-[{}@]+ / string)+
<quotes_string>      := (-[\"{}]+ / ('{', braces_string,'}'))+
<junk>               := -[ \t\r\n]+
<tb>                 := (comment / ws)*
<ws>                 := [ \t\n\r]
<comment>            := '%' , -[\n]*, '\n'
"""

## instantiate SimpleParse parsers
# `parser` consumes a whole .bib file; `entry_parser` consumes one entry.
parser = Parser(dec, 'bibfile')
entry_parser = Parser(dec, 'entry')


## offer a default parse function
def Parse(src, processor=None):
    '''Parse the bibtex string *src*, optionally dispatching events to
    *processor*; returns the raw SimpleParse result tuple.'''
    result = parser.parse(src, processor=processor)
    return result


## self-test
if __name__ == "__main__":
    import sys, pprint
    if len(sys.argv) > 1:
        # Parse the bibtex file named on the command line.
        src = open(sys.argv[1]).read()
        taglist = Parse(src)
from simpleparse.xml import xml_parser
from simpleparse.parser import Parser
import unittest, string

p = Parser(xml_parser.declaration)


class XMLProductionTests(unittest.TestCase):
    """Tests that XML grammar productions match appropriate values"""
    # Test methods are attached to this class dynamically (one per
    # ProductionTest) by a loop later in this module, so the body is
    # intentionally empty here.
    ### ProductionTests will be added here by loop below...


class ProductionTest:
    def __init__(self, production, should, shouldnot):
        """Record a production name plus its positive (*should*) and
        negative (*shouldnot*) sample inputs."""
        self.production = production
        self.should = should
        self.shouldnot = shouldnot

    def __call__(self):
        """Run the production against every positive and negative sample."""
        for item in self.should:
            success, children, next = p.parse(item, self.production)
            assert success, """Didn't parse %s as a %s, should have""" % (
                repr(item), self.production)
            assert next == len(
                item
            ), """Didn't parse whole of %s as a %s, parsed %s of %s characters, results were:\n%s\nRest was:\n%s""" % (
                repr(item), self.production, next, len(item), children,
                item[next:])
        # BUG FIX: the original iterated the bare name `shouldnot`, which is
        # undefined in this scope (NameError at runtime); the instance
        # attribute self.shouldnot is what was meant.
        for item in self.shouldnot:
            success, children, next = p.parse(item, self.production)
Exemplo n.º 33
0
                    ]),
                ]),
            ]),
        ], 21))

    def testDeclarationSet2(self):
        '''Just tries to parse and sees that everything was parsed, doesn't predict the result'''
        parser = SPGenerator.buildParser("declarationset")
        result = TextTools.tag(declaration, parser)
        # tag() returns (success, taglist, next); the final element must
        # equal the input length for a complete parse.
        assert result[-1] == len(
            declaration
        ), '''Didn't complete parse of the simpleparse declaration, only got %s chars, should have %s''' % (
            result[-1], len(declaration))


recursiveParser = Parser(declaration)


class SimpleParseRecursiveTests(SimpleParseGrammarTests):
    """Test parsing of grammar elements with generated version of simpleparse grammar"""
    def doBasicTest(self, parserName, testValue, expected):
        """Parse *testValue* with the *parserName* production of the
        recursive parser and compare the raw result against *expected*."""
        outcome = recursiveParser.parse(testValue, production=parserName)
        assert outcome == expected, '''\nexpected:%s\n     got:%s\n''' % (
            expected, outcome)

    ("string_triple_single", """
nondelimiter               :=  -"'''"
<delimiter>                :=  "'''"
char_no_quote              :=  -[\\\\']+
string_special_escapes     := [\\\\abfnrtv']
"""),
    ("string_triple_double", '''
nondelimiter               :=  -'"""'
<delimiter>                :=  '"""'
char_no_quote              :=  -[\\\\"]+
string_special_escapes     := [\\\\abfnrtv"]
'''),
]

# Build a library element for each concrete string grammar variant and
# register it under its name in the common-elements table `c`.
for name, partial in _stringTypeData:
    _p = Parser(stringDeclaration + partial)
    c[name] = objectgenerator.LibraryElement(
        generator=_p._generator,
        production="str",
    )
common.share(c)
# Generic "string" production matching any of the four concrete styles.
_p = Parser("""
string :=  string_triple_double/string_triple_single/string_double_quote/string_single_quote
""")
c["string"] = objectgenerator.LibraryElement(
    generator=_p._generator,
    production="string",
)


class StringInterpreter(DispatchProcessor):
Exemplo n.º 35
0
rulespec1		:= match*, ts, target*
match			:= -[\n]*
target			:= '-j'/'--jump', ts, chainname, ts, targetopt*
targetopt		:= -[\n]*

arg					:=  literal / -[\\"\\' \t\n]+
negarg				:= '!', ts, arg
literal             :=  ("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'")  /  ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"')
CHARNOSNGLQUOTE     :=  -[\\']+
CHARNODBLQUOTE      :=  -[\\"]+
ESCAPEDCHAR         :=  '\\',( SPECIALESCAPEDCHAR / OCTALESCAPEDCHAR )
SPECIALESCAPEDCHAR  :=  [\\abfnrtv]
OCTALESCAPEDCHAR    :=  [0-7],[0-7]?,[0-7]?
"""

# Entry production is a whole iptables table section; `fw` accumulates the
# parsed firewall model.
parser = Parser(iptCommandLine, "tablesection")
fw = Firewall()


class IptProcessor(dispatchprocessor.DispatchProcessor):
    def tableline(self, tup, prsbuf):
        """Dispatch the first child node (the table name) and remember it."""
        print "###", tup[3][0]
        self.table = dispatchprocessor.dispatch(self, tup[3][0], prsbuf)

    def tablename(self, tup, prsbuf):
        """Return the matched table name token as a repr'd string."""
        matched = dispatchprocessor.getString(tup, prsbuf)
        return repr(matched)

    def chainline(self, tup, prsbuf):
        """Print the matched chain-definition line (debugging aid only)."""
        print repr(dispatchprocessor.getString(tup, prsbuf))

    def chainname(self, tup, prsbuf):
Exemplo n.º 36
0
}

# Examples:
# https://github.com/blakeembrey/change-case
first_letter_case_tags = {
    # 'as'/'cs' presumably select target languages (ActionScript / C#?) —
    # TODO confirm against the SOURCES definitions elsewhere in this module.
    'as': {
        'function_identifier': 'lower',
        'namespace_identifier': 'lower',
    },
    'cs': {
        'function_identifier': 'upper',
        'namespace_identifier': 'upper',
    }
}

# Parser over the full EBNF declaration, rooted at 'declarationset'.
ebnf_parser = Parser(declaration, 'declarationset')

# Every key from every source mapping, in sorted order.  sorted() over a
# generator replaces the build-list-then-sort() two-step, and iterating the
# mapping directly avoids the redundant .keys() call.
source_keys = sorted(key for source in SOURCES for key in source)
## print(pformat(source_keys))


def reset():
    """Restore the DECLARED_TYPE literals from their saved originals (at the
    top level and per direction) and clear the collected data types."""
    if 'DECLARED_TYPE' in literals and 'DECLARED_TYPE_ORIGINAL' in literals:
        literals['DECLARED_TYPE'] = literals['DECLARED_TYPE_ORIGINAL']
    for direction in ('as', 'cs'):
        table = literals[direction]
        if 'DECLARED_TYPE' in table and 'DECLARED_TYPE_ORIGINAL' in table:
            table['DECLARED_TYPE'] = table['DECLARED_TYPE_ORIGINAL']
    data_types.clear()
Exemplo n.º 37
0
    def __init__(self,*args):
        """Assemble the combined EBNF grammar and TextTools tagger for the
        reference types requested in *args* (LAGRUM, KORTLAGRUM, ...), and
        initialise the per-parse SFS state."""
        if not os.path.sep in __file__:
            scriptdir = os.getcwd()
        else:
            scriptdir = os.path.dirname(__file__)

        #n3file = os.path.sep.join([scriptdir,"etc","sfs-extra.n3"])
        #n3url = "file://" + n3file.replace("\\","/")

        #print "scriptdir: %s" % scriptdir
        #print "n3file: %s" % n3file
        #print "n3url: %s" % n3url

        self.graph = Graph()
        n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
        # print "loading n3file %s" % n3file
        self.graph.load(n3file, format="n3")
        self.roots = []            # grammar root productions, in try-order
        self.uriformatter = {}     # production name -> URI formatting method
        self.decl = ""             # accumulated EBNF declaration text
        self.namedlaws = {}
        self.load_ebnf(scriptdir+"/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            productions = self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # if we haven't already loaded lagrum.ebnf we must do it now,
            # since kortlagrum.ebnf depends on productions defined there
            if not self.LAGRUM in args:
                self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
                
            productions = self.load_ebnf(scriptdir+"/etc/kortlagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            DCT = Namespace("http://purl.org/dc/terms/")
            d = self.get_relations(DCT['alternate'])
            self.namedlaws.update(d)
            lawlist = [x.encode(SP_CHARSET) for x in d.keys()]
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L")
            lawlist.sort(cmp=lambda x,y:len(y)-len(x))
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            # inserted first so short-form refs are tried before other roots
            self.roots.insert(0,"kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            productions = self.load_ebnf(scriptdir+"/etc/eglag.ebnf")
            for p in productions:
                self.uriformatter[p] = self.eglag_format_uri
            self.roots.append("eglagref")
        if self.FORARBETEN in args:
            productions = self.load_ebnf(scriptdir+"/etc/forarbeten.ebnf")
            for p in productions:
                self.uriformatter[p] = self.forarbete_format_uri
            self.roots.append("forarbeteref")
        if self.RATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/rattsfall.ebnf")
            for p in productions:
                self.uriformatter[p] = self.rattsfall_format_uri
            self.roots.append("rattsfallref")
        if self.EGRATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/egratt.ebnf")
            for p in productions:
                self.uriformatter[p] = self.egrattsfall_format_uri
            self.roots.append("ecjcaseref")
            
        # top-level production: any of the requested roots, else plain text
        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)
        # pprint(productions)
        # print self.decl.decode(SP_CHARSET,'ignore')

        self.parser = Parser(self.decl, "root")
        self.tagger = self.parser.buildTagger("root")
        # print "tagger length: %d" % len(repr(self.tagger))
        self.verbose = False
        self.depth = 0

        # SFS-specific state, reset between parses
        self.currentlaw     = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece   = None
        self.lastlaw        = None
        self.currentlynamedlaws = {}
Exemplo n.º 38
0
"""Demonstrates what happens when your declaration is syntactically incorrect

When run as a script, will generate a traceback
telling you that the grammar defined here is
incorrectly formatted.
"""
from simpleparse.common import numbers, strings, comments

declaration = r'''# note use of raw string when embedding in python code...
file           :=  [ \t\n]*, section+
section        :=  '[',identifier,']' ts,'\n', body
body           :=  statement*
statement      :=  (ts,semicolon_comment)/equality/nullline
nullline       :=  ts,'\n'
comment        :=  -'\n'*
equality       :=  ts, identifier,ts,'=',ts,identified,ts,'\n'
identifier     :=  [a-zA-Z], [a-zA-Z0-9_]*
identified     :=  string/number/identifier
ts             :=  [ \t]*
'''

testdata = '''[test1]
    val=23
'''
if __name__ == "__main__":
    from simpleparse.parser import Parser
    parser = Parser(declaration, "file")  # will raise ValueError
Exemplo n.º 39
0
class LegalRef:
    # Maybe these should be 1,2,4,8 etc, so that the caller could ask for
    # LAGRUM | FORESKRIFTER, and so that we could define collections of
    # common combinations (e.g. ALL_LAGSTIFTNING = LAGRUM |
    # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING)
    LAGRUM = 1             # references to statute sections in SFS
    KORTLAGRUM = 2         # short-form SFS references
    FORESKRIFTER = 3       # references to agencies' regulation collections
    EGLAGSTIFTNING = 4     # EC treaties, regulations and directives
    INTLLAGSTIFTNING = 5   # treaties, conventions etc
    FORARBETEN = 6         # bills, committee reports, etc
    RATTSFALL = 7          # case law from Swedish courts
    MYNDIGHETSBESLUT = 8   # decisions by authorities (JO, ARN, DI...)
    EGRATTSFALL = 9        # case law from the EC Court of Justice / Court of First Instance
    INTLRATTSFALL = 10     # the European Court of Human Rights

    
    # splits a base URI into (baseuri, law, chapter, section, piece, item)
    # groups; see the baseuri_attributes dict built in parse() below
    # re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\. |N|)?\d+( s\.\d+|))#?(K(\d+)|)(P(\d+)|)(S(\d+)|)(N(\d+)|)')
    re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    # escape/descape pairs: parse() inserts '_' and '|' markers before
    # parsing and strips them again while normalising the result
    re_escape_compound = re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_escape_named = re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)

    re_descape_compound = re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_descape_named = re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    re_xmlcharref = re.compile("&#\d+;")
    def __init__(self,*args):
        """Assemble the combined EBNF grammar and TextTools tagger for the
        reference types requested in *args* (LAGRUM, KORTLAGRUM, ...), and
        initialise the per-parse SFS state."""
        if not os.path.sep in __file__:
            scriptdir = os.getcwd()
        else:
            scriptdir = os.path.dirname(__file__)

        #n3file = os.path.sep.join([scriptdir,"etc","sfs-extra.n3"])
        #n3url = "file://" + n3file.replace("\\","/")

        #print "scriptdir: %s" % scriptdir
        #print "n3file: %s" % n3file
        #print "n3url: %s" % n3url

        self.graph = Graph()
        n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
        # print "loading n3file %s" % n3file
        self.graph.load(n3file, format="n3")
        self.roots = []            # grammar root productions, in try-order
        self.uriformatter = {}     # production name -> URI formatting method
        self.decl = ""             # accumulated EBNF declaration text
        self.namedlaws = {}
        self.load_ebnf(scriptdir+"/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            productions = self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # if we haven't already loaded lagrum.ebnf we must do it now,
            # since kortlagrum.ebnf depends on productions defined there
            if not self.LAGRUM in args:
                self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
                
            productions = self.load_ebnf(scriptdir+"/etc/kortlagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            DCT = Namespace("http://purl.org/dc/terms/")
            d = self.get_relations(DCT['alternate'])
            self.namedlaws.update(d)
            lawlist = [x.encode(SP_CHARSET) for x in d.keys()]
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L")
            lawlist.sort(cmp=lambda x,y:len(y)-len(x))
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            # inserted first so short-form refs are tried before other roots
            self.roots.insert(0,"kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            productions = self.load_ebnf(scriptdir+"/etc/eglag.ebnf")
            for p in productions:
                self.uriformatter[p] = self.eglag_format_uri
            self.roots.append("eglagref")
        if self.FORARBETEN in args:
            productions = self.load_ebnf(scriptdir+"/etc/forarbeten.ebnf")
            for p in productions:
                self.uriformatter[p] = self.forarbete_format_uri
            self.roots.append("forarbeteref")
        if self.RATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/rattsfall.ebnf")
            for p in productions:
                self.uriformatter[p] = self.rattsfall_format_uri
            self.roots.append("rattsfallref")
        if self.EGRATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/egratt.ebnf")
            for p in productions:
                self.uriformatter[p] = self.egrattsfall_format_uri
            self.roots.append("ecjcaseref")
            
        # top-level production: any of the requested roots, else plain text
        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)
        # pprint(productions)
        # print self.decl.decode(SP_CHARSET,'ignore')

        self.parser = Parser(self.decl, "root")
        self.tagger = self.parser.buildTagger("root")
        # print "tagger length: %d" % len(repr(self.tagger))
        self.verbose = False
        self.depth = 0

        # SFS-specific state, reset between parses
        self.currentlaw     = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece   = None
        self.lastlaw        = None
        self.currentlynamedlaws = {}
        
    def load_ebnf(self,file):
        """Append the productions from *file* to the EBNF declaration in
        use, and return the names of all *Ref and *RefID productions
        found in it."""
        # print "%s: Loading %s" % (id(self), file)
        # Use a context manager so the handle is closed even if read() fails
        # (the original open/read/close left the file open on error).
        with open(file) as f:
            content = f.read()
        self.decl += content
        return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)]

    def get_relations(self, predicate):
        """Build a unicode->unicode dict from the graph's *predicate*
        triples, keyed on the second element of each pair."""
        # NOTE(review): the unpack order (obj, subj) looks swapped relative
        # to rdflib's subject_objects(), which yields (subject, object);
        # preserved exactly as the original had it.
        mapping = {}
        for obj, subj in self.graph.subject_objects(predicate):
            mapping[unicode(subj)] = unicode(obj)
        return mapping


    def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9",predicate=None):
        """Parse *indata* and return a list of plain strings and
        Link/LinkSubject objects; recognised legal references get URIs
        resolved relative to *baseuri*.  Raises ParseError if the grammar
        does not consume the whole input."""
        if indata == "": return indata # this actually triggered a bug...
        # h = hashlib.sha1()
        # h.update(indata)
        # print "Called with %r (%s) (%s)" % (indata, h.hexdigest(), self.verbose)
        self.predicate = predicate
        self.baseuri = baseuri
        if baseuri:
            m = self.re_urisegments.match(baseuri)
            if m:
                self.baseuri_attributes = {'baseuri':m.group(1),
                                           'law':m.group(2),
                                           'chapter':m.group(6),
                                           'section':m.group(8),
                                           'piece':m.group(10),
                                           'item':m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri':baseuri}
        else:
            self.baseuri_attributes = {}
        # It is hard to make the EBNF grammar recognise arbitrary words
        # ending in a given suffix (e.g. 'bokföringslagen' with the
        # suffix 'lagen').  Therefore we preprocess the input string and
        # insert a '|' character before certain suffixes.  We also
        # transform 'Radio- och TV-lagen' into 'Radio-_och_TV-lagen'.
        #
        # FIXME: Obviously, this shouldn't be done in a general class,
        # but rather in a subclas or via proxy/adapter
        # if we don't do the unicode conversion and pass
        # BeautifulSoup.NavigableString, the later .encode call fails
        # (since it's not a real unicode string)
            
        fixedindata = unicode(indata)
        # print "Before: %r" % type(fixedindata)
        
        if self.LAGRUM in self.args:
            fixedindata = self.re_escape_compound.sub(r'\1_\2_\3\4', fixedindata)
            fixedindata = self.re_escape_named.sub(r'|\1', fixedindata)
        # print "After: %r" % type(fixedindata)
        
        # SimpleParse has no support for unicode strings, so we convert
        # the input to a byte string.  Unfortunately I can't get the
        # whole thing to work with UTF8, so we use xml character
        # references instead.
        if isinstance(fixedindata,unicode):
            fixedindata = fixedindata.encode(SP_CHARSET,'xmlcharrefreplace')
            
        # Parse the text with TextTools.tag - not the simplest way to do
        # it, but doing it the way the Simpleparse documentation suggests
        # rebuilds the tagger table for every call to parse()
        if self.verbose: print u"calling tag with '%s'" % (fixedindata.decode(SP_CHARSET))
        # print "tagger length: %d" % len(repr(self.tagger))
        taglist = tag(fixedindata, self.tagger,0,len(fixedindata))
        result = []

        root = NodeTree(taglist,fixedindata)
        for part in root.nodes:
            if part.tag != 'plain' and self.verbose:
                sys.stdout.write(self.prettyprint(part))
            if part.tag in self.roots:
                self.clear_state()
                # self.verbose = False
                result.extend(self.formatter_dispatch(part))
            else:
                assert part.tag == 'plain',"Tag is %s" % part.tag
                result.append(part.text)
                
            # clear state
            if self.currentlaw != None: self.lastlaw = self.currentlaw
            self.currentlaw = None


        if taglist[-1] != len(fixedindata):
            log.error(u'Problem (%d:%d) with %r / %r' % (taglist[-1]-8,taglist[-1]+8,fixedindata,indata))

            raise ParseError, "parsed %s chars of %s (...%s...)" %  (taglist[-1], len(indata),
                                                                               indata[(taglist[-1]-2):taglist[-1]+3])


        # Normalise the result, i.e. concatenate adjacent text nodes and
        # remove any '|' characters that we inserted earlier.
        normres = []
        for i in range(len(result)):
            if not self.re_descape_named.search(result[i]):
                node = result[i]
            else:
                if self.LAGRUM in self.args:
                    text = self.re_descape_named.sub(r'\1',result[i])
                    text = self.re_descape_compound.sub(r'\1 \2 \3\4', text)
                if isinstance(result[i], Link):
                    # Since Link objects are immutable, we must create a
                    # new one and copy its attributes
                    if hasattr(result[i],'predicate'):
                        node = LinkSubject(text, predicate=result[i].predicate,
                                           uri=result[i].uri)
                    else:
                        node = Link(text,uri=result[i].uri)
                else:
                    node = text
            if (len(normres) > 0
                and not isinstance(normres[-1],Link) 
                and not isinstance(node,Link)):
                normres[-1] += node
            else:
                normres.append(node)

        # and finally...
        for i in range(len(normres)):
            if isinstance(normres[i], Link):
                # deal with these later
                pass
            else:
                normres[i] = self.re_xmlcharref.sub(self.unescape_xmlcharref, normres[i])
        return normres

    def unescape_xmlcharref(self, m):
        """Turn a matched XML character reference ('&#NNN;') back into the
        corresponding character."""
        codepoint = int(m.group(0)[2:-1])
        return unichr(codepoint)

    def find_attributes(self,parts,extra={}):
        """recurses through a parse tree and creates a dictionary of
        attributes"""
        # NOTE(review): mutable default argument `extra={}` — harmless here
        # since it is only read, but a None default would be safer.
        d = {}
        
        self.depth += 1
        if self.verbose: print ". "*self.depth+"find_attributes: starting with %s"%d
        if extra:
            d.update(extra)
            
        for part in parts:
            current_part_tag = part.tag.lower()
            if current_part_tag.endswith('refid'):
                # single/last section refids collapse into plain 'sectionrefid'
                if ((current_part_tag == 'singlesectionrefid') or
                    (current_part_tag == 'lastsectionrefid')):
                    current_part_tag = 'sectionrefid'
                # strip the 'refid' suffix to get the attribute name
                d[current_part_tag[:-5]] = part.text.strip()
                if self.verbose: print ". "*self.depth+"find_attributes: d is now %s" % d
                
            if part.nodes:
                d.update(self.find_attributes(part.nodes,d))
        if self.verbose: print ". "*self.depth+"find_attributes: returning %s" % d
        self.depth -= 1

        # fall back to the current parse context for anything not found
        if self.currentlaw     and 'law' not in d    : d['law']     = self.currentlaw
        if self.currentchapter and 'chapter' not in d: d['chapter'] = self.currentchapter
        if self.currentsection and 'section' not in d: d['section'] = self.currentsection
        if self.currentpiece   and 'piece' not in d  : d['piece']   = self.currentpiece

        return d


    def find_node(self,root,nodetag):
        """Returns the first node in the tree that has a tag matching nodetag. The search is depth-first"""
        if root.tag == nodetag: # base case
            return root
        # Idiom fix: compare against None with `is not None` rather than `!=`.
        for node in root.nodes:
            x = self.find_node(node,nodetag)
            if x is not None:
                return x
        return None

    def find_nodes(self,root,nodetag):
        """Depth-first collect every node whose tag equals *nodetag*; a
        matching node is returned without descending into its children."""
        if root.tag == nodetag:
            return [root]
        matches = []
        for child in root.nodes:
            matches.extend(self.find_nodes(child, nodetag))
        return matches
                

    def flatten_tokentree(self,part,suffix):
        """Flatten the token tree into a pre-order list of every node
        (including *part* itself) whose tag ends with *suffix*.

        E.g. for the suffix 'RefID' and the tree
           foo->bar->BlahongaRefID
              ->baz->quux->Blahonga2RefID
                         ->Blahonga3RefID
              ->Blahonga4RefID
        this returns [BlahongaRefID, Blahonga2RefID, Blahonga3RefID, Blahonga4RefID]"""
        found = []
        if part.tag.endswith(suffix):
            found.append(part)
        if part.nodes:
            for child in part.nodes:
                found.extend(self.flatten_tokentree(child, suffix))
        return found

    def formatter_dispatch(self,part):
        """Format *part* with its production-specific format_<tag> method if
        one exists, otherwise with the generic format_tokentree."""
        # print "Verbositiy: %r" % self.verbose
        self.depth += 1
        # Is there a tailor-made formatter for this production?
        if "format_"+part.tag in dir(self): 
            formatter = getattr(self,"format_"+part.tag)
            if self.verbose: print (". "*self.depth)+ "formatter_dispatch: format_%s defined, calling it" % part.tag
            res = formatter(part)
            assert res != None, "Custom formatter for %s didn't return anything" % part.tag
        else:
            if self.verbose: print (". "*self.depth)+ "formatter_dispatch: no format_%s, using format_tokentree" % part.tag
            res = self.format_tokentree(part)

        if res == None: print (". "*self.depth)+ "something wrong with this:\n" + self.prettyprint(part)
        self.depth -= 1
        return res
        
    def format_tokentree(self,part):
        """Default formatter: turn every *RefID/*Ref token into a Link and
        pass plain text through, recursing over child nodes."""
        # This is the default formatter. It converts every token that
        # ends with a RefID into a Link object. For grammar
        # productions like SectionPieceRefs, which contain
        # subproductions that also end in RefID, this is not a good
        # function to use - use a custom formatter instead.

        res = []

        if self.verbose: print (". "*self.depth)+ "format_tokentree: called for %s" % part.tag
        # this is like the bottom case, or something
        if (not part.nodes) and (not part.tag.endswith("RefID")):
            res.append(part.text)
        else:
            if part.tag.endswith("RefID"):
                res.append(self.format_generic_link(part))
            elif part.tag.endswith("Ref"):
                res.append(self.format_generic_link(part))
            else:
                for subpart in part.nodes:
                    if self.verbose and part.tag == 'LawRef':
                        print (". "*self.depth) + "format_tokentree: part '%s' is a %s" % (subpart.text, subpart.tag)
                    res.extend(self.formatter_dispatch(subpart))
        if self.verbose: print (". "*self.depth)+ "format_tokentree: returning '%s' for %s" % (res,part.tag)
        return res
    

    def prettyprint(self,root,indent=0):
        """Render the node tree as an indented "tag: text" listing."""
        res = u"%s'%s': '%s'\n" % ("    "*indent,root.tag,re.sub(r'\s+', ' ',root.text))
        if root.nodes != None:
            for subpart in root.nodes:
                res += self.prettyprint(subpart,indent+1)
            return res
        # NOTE(review): a node with nodes == None returns u"" and drops its
        # own line built above — looks like a bug unless NodeTree always
        # provides a (possibly empty) list; confirm before changing.
        else: return u""


    def format_generic_link(self,part,uriformatter=None):
        """Turn the parse node *part* into a Link/LinkSubject; fall back to
        the plain text whenever no URI can be produced."""
        try:
            uri = self.uriformatter[part.tag](self.find_attributes([part]))
        except KeyError:
            # no registered formatter for this production
            if uriformatter:
                uri = uriformatter(self.find_attributes([part]))
            else:
                uri = self.sfs_format_uri(self.find_attributes([part]))
        except AttributeError:
            # Normal error from eglag_format_uri
            return part.text
        except:
            exc = sys.exc_info()
            # If something else went wrong, just return the plaintext
            log.warning("(unknown): Unable to format link for text %s (production %s)" % (part.text, part.tag))
            return part.text
        
        if self.verbose: print (". "*self.depth)+ "format_generic_link: uri is %s" % uri
        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context
            return part.text
        elif self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        else:
            return Link(part.text, uri=uri)
        
    # FIXME: unify this with format_generic_link
    def format_custom_link(self, attributes, text, production):
        """Create a Link (or LinkSubject) for *text* from pre-computed *attributes*.

        Uses the URI formatter registered for *production*, falling back to
        sfs_format_uri. If no URI can be determined, the plain text is
        returned unlinked.
        """
        try:
            uri = self.uriformatter[production](attributes)
        except KeyError:
            # No custom formatter registered for this production type.
            uri = self.sfs_format_uri(attributes)

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context)
            # BUG FIX: was `return part.text` -- `part` is undefined here
            # (copied from format_generic_link) and raised NameError.
            return text
        elif self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        else:
            return Link(text, uri=uri)


    ################################################################
    # KOD FÖR LAGRUM
    def clear_state(self):
        """Reset the reference-parsing context (law/chapter/section/piece)."""
        for attr in ('currentlaw', 'currentchapter',
                     'currentsection', 'currentpiece'):
            setattr(self, attr, None)

    def normalize_sfsid(self, sfsid):
        """Normalize an SFS id: '1736:0123.2' is rewritten as '1736:0123 2'.

        Sometimes '1736:0123 2' is given as '1736:0123 s. 2' or
        '1736:0123.2'; this repairs the dotted variant.
        """
        # More advanced normalizations (e.g. stripping 's.') may come later.
        return re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsid)
        
    def normalize_lawname(self, lawname):
        """Lowercase *lawname*, drop '|', map '_' to space, strip a trailing 's'."""
        cleaned = lawname.replace('|', '').replace('_', ' ').lower()
        # Trailing 's' is the Swedish genitive suffix ('lagens' -> 'lagen').
        return cleaned[:-1] if cleaned.endswith('s') else cleaned
        
    def namedlaw_to_sfsid(self,text,normalize=True):
        """Look up the SFS id for a law mentioned by name.

        Returns None for common Swedish words that merely *look* like law
        names (the nolaw list), and for names found in neither
        self.currentlynamedlaws nor self.namedlaws.
        """
        if normalize:
            text = self.normalize_lawname(text)
        
        # Words ending in -lagen/-ordningen/-formen etc. that are NOT laws.
        # (A few duplicates are harmless for membership testing.)
        nolaw = [
            u'aktieslagen',
            u'anordningen',
            u'anordningen',
            u'anslagen',
            u'arbetsordningen',
            u'associationsformen',
            u'avfallsslagen',
            u'avslagen',
            u'avvittringsutslagen',
            u'bergslagen',
            u'beskattningsunderlagen',
            u'bolagen',
            u'bolagsordningen',
            u'bolagsordningen',
            u'dagordningen',
            u'djurslagen',
            u'dotterbolagen',
            u'emballagen',
            u'energislagen',
            u'ersättningsformen',
            u'ersättningsslagen',
            u'examensordningen',
            u'finansbolagen',
            u'finansieringsformen',
            u'fissionsvederlagen',
            u'flygbolagen',
            u'fondbolagen',
            u'förbundsordningen',
            u'föreslagen',
            u'företrädesordningen',
            u'förhandlingsordningen',
            u'förlagen',
            u'förmånsrättsordningen',
            u'förmögenhetsordningen',
            u'förordningen',
            u'förslagen',
            u'försäkringsaktiebolagen',
            u'försäkringsbolagen',
            u'gravanordningen',
            u'grundlagen',
            u'handelsplattformen',
            u'handläggningsordningen',
            u'inkomstslagen',
            u'inköpssamordningen',
            u'kapitalunderlagen',
            u'klockslagen',
            u'kopplingsanordningen',
            u'låneformen',
            u'mervärdesskatteordningen',
            u'nummerordningen',
            u'omslagen',
            u'ordalagen',
            u'pensionsordningen',
            u'renhållningsordningen',
            u'representationsreformen',
            u'rättegångordningen',
            u'rättegångsordningen',
            u'rättsordningen',
            u'samordningen',
            u'samordningen',
            u'skatteordningen',
            u'skatteslagen',
            u'skatteunderlagen',
            u'skolformen',
            u'skyddsanordningen',
            u'slagen',
            u'solvärmeanordningen',
            u'storslagen',
            u'studieformen',
            u'stödformen',
            u'stödordningen',
            u'stödordningen',
            u'säkerhetsanordningen',
            u'talarordningen',
            u'tillslagen',
            u'tivolianordningen',
            u'trafikslagen',
            u'transportanordningen',
            u'transportslagen',
            u'trädslagen',
            u'turordningen',
            u'underlagen',
            u'uniformen',
            u'uppställningsformen',
            u'utvecklingsbolagen',
            u'varuslagen',
            u'verksamhetsformen',
            u'vevanordningen',
            u'vårdformen',
            u'ägoanordningen',
            u'ägoslagen',
            u'ärendeslagen',
            u'åtgärdsförslagen',
                 ]
        if text in nolaw:
            return None

        # dict.has_key() is deprecated (and gone in Python 3); use `in`.
        if text in self.currentlynamedlaws:
            return self.currentlynamedlaws[text]
        elif text in self.namedlaws:
            return self.namedlaws[text]
        else:
            if self.verbose:
                # print "(unknown): I don't know the ID of named law [%s]" % text
                log.warning("(unknown): I don't know the ID of named law [%s]" % text)
            return None

    def sfs_format_uri(self,attributes):
        """Build a rinfo URI for an SFS law reference.

        Keys are serialized in a fixed order (attributeorder); values absent
        from *attributes* may be resolved from self.baseuri_attributes until
        the first explicitly given key is encountered.
        """
        piecemappings = {u'första' :'1',
                         u'andra'  :'2',
                         u'tredje' :'3',
                         u'fjärde' :'4',
                         u'femte'  :'5',
                         u'sjätte' :'6',
                         u'sjunde' :'7',
                         u'åttonde':'8',
                         u'nionde' :'9'}
        keymapping = {'lawref'  :'L',
                      'chapter' :'K',
                      'section' :'P',
                      'piece'   :'S',
                      'item'    :'N',
                      'itemnumeric':'N',
                      'element' :'O',
                      'sentence':'M', # is this ever used?
                      }
        attributeorder = ['law', 'lawref', 'chapter', 'section', 'element', 'piece', 'item', 'itemnumeric','sentence']

        if 'law' in attributes:
            if attributes['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'
            
        else:
            if 'baseuri' in self.baseuri_attributes:
                res = self.baseuri_attributes['baseuri']
            else:
                res = ''
        resolvetobase = True
        addfragment = False
        justincase = None
        for key in attributeorder:
            # dict.has_key() is deprecated (and gone in Python 3); use `in`.
            if key in attributes:
                # An explicit attribute stops further fallback to the base URI.
                resolvetobase = False
                val = attributes[key]
            elif (resolvetobase and key in self.baseuri_attributes):
                val = self.baseuri_attributes[key]
            else:
                val = None

            if val:
                if addfragment:
                    res += '#'
                    addfragment = False
                if (key in ['piece', 'itemnumeric', 'sentence'] and val in piecemappings):
                    res += '%s%s' % (keymapping[key],piecemappings[val.lower()])
                else:
                    if key == 'law':
                        val = self.normalize_sfsid(val)
                        val = val.replace(" ", "_")
                        res += val
                        # Everything after the law id goes in the fragment part.
                        addfragment = True
                    else:
                        if justincase:
                            # Emit the implicit "first piece" marker first.
                            res += justincase
                            justincase = None
                        val = val.replace(" ", "")
                        val = val.replace("\n", "")
                        val = val.replace("\r", "")
                        res += '%s%s' % (keymapping[key],val)
            else:
                if key == 'piece':
                    # If an item follows without an explicit piece, assume piece 1.
                    justincase = "S1" 
        return res
        
    def format_ChapterSectionRefs(self, root):
        """Link a ChapterRef and then its SectionRefs, tracking currentchapter."""
        assert root.tag == 'ChapterSectionRefs'
        assert len(root.nodes) == 3  # ChapterRef, wc, SectionRefs

        chapter_part = root.nodes[0]
        self.currentchapter = chapter_part.nodes[0].text.strip()

        # Include the law in the link attributes only when one is in context.
        attrs = {'chapter': self.currentchapter}
        if self.currentlaw:
            attrs['law'] = self.currentlaw
        res = [self.format_custom_link(attrs, chapter_part.text, chapter_part.tag)]

        for trailing in root.nodes[1:]:
            res.extend(self.formatter_dispatch(trailing))
        self.currentchapter = None
        return res

    def format_ChapterSectionPieceRefs(self, root):
        """Dispatch every child of a chapter/section/piece reference.

        currentchapter is taken from the leading ChapterRefID and stays set
        after this call (it is not reset here).
        """
        assert root.nodes[0].nodes[0].tag == 'ChapterRefID'
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        dispatched = (self.formatter_dispatch(node) for node in root.nodes)
        return [item for sub in dispatched for item in sub]

    def format_LastSectionRef(self, root):
        """Format a trailing section reference.

        The last section ref is a bit different, since we want the ending
        double section mark to be part of the link text, so the whole node
        (id + mark) is passed to format_generic_link.
        """
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3) # LastSectionRefID, wc, DoubleSectionMark
        # Removed two locals (sectionrefid/sectionid) that were computed but
        # never used.
        return [self.format_generic_link(root)]


    def format_SectionPieceRefs(self, root):
        """Link '<section> <piece>' as one unit, then dispatch remaining nodes."""
        assert root.tag == 'SectionPieceRefs'
        self.currentsection = root.nodes[0].nodes[0].text.strip()

        section_node, piece_node = root.nodes[0], root.nodes[2]
        link_text = "%s %s" % (section_node.text, piece_node.text)
        res = [self.format_custom_link(self.find_attributes([piece_node]),
                                       link_text, root.tag)]
        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = None
        return res

    def format_SectionPieceItemRefs(self, root):
        """Link '<section> <piece>' as one unit, then dispatch the item refs."""
        assert root.tag == 'SectionPieceItemRefs'
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        self.currentpiece = root.nodes[2].nodes[0].text.strip()

        piece_node = root.nodes[2]
        link_text = "%s %s" % (root.nodes[0].text, piece_node.text)
        res = [self.format_custom_link(self.find_attributes([piece_node]),
                                       link_text, root.tag)]
        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = None
        self.currentpiece = None
        return res
        

    # This is a special case for things like '17-29 och 32 §§ i lagen
    # (2004:575)', which picks out the LawRefID first and stores it in
    # .currentlaw, so that find_attributes finds it
    # automagically. Although now it seems to be branching out and be
    # all things to all people.
    def format_ExternalRefs(self,root):
        """Format references into another law, resolving which law is meant.

        The law id is taken from, in order of preference: an explicit
        LawRefID node, a NamedLaw node looked up via namedlaw_to_sfsid, or
        -- for "samma/nämnda lag" style back-references -- self.lastlaw.
        """
        assert(root.tag == 'ExternalRefs')
        # print "DEBUG: start of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw

        lawrefid_node = self.find_node(root,'LawRefID')
        if lawrefid_node == None:
            # Ok, no explicit LawRefID found, lets see if this is a named law that we have the ID for
            # namedlaw_node = self.find_node(root, 'NamedLawExternalLawRef')
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node == None:
                # As a last chance, this might be a reference back to a previously mentioned law ("...enligt 4 § samma lag")
                samelaw_node = self.find_node(root, 'SameLaw')
                assert(samelaw_node != None)
                if self.lastlaw == None:
                    log.warning(u"(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set")

                self.currentlaw = self.lastlaw
            else:
                # the NamedLaw case
                self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text)
                if self.currentlaw == None:
                    # unknow law name - in this case it's better to
                    # bail out rather than resolving chapter/paragraph
                    # references relative to baseuri (which is almost
                    # certainly wrong)
                    return [root.text]
        else:
            self.currentlaw = lawrefid_node.text
            if self.find_node(root,'NamedLaw'):
                # Remember the name -> id mapping for later named references.
                namedlaw = self.normalize_lawname(self.find_node(root,'NamedLaw').text)
                # print "remember that %s is %s!" % (namedlaw, self.currentlaw)
                self.currentlynamedlaws[namedlaw] = self.currentlaw

        #print "DEBUG: middle of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw
        if self.lastlaw is None:
            #print "DEBUG: format_ExternalRefs: setting self.lastlaw to %s" % self.currentlaw
            self.lastlaw = self.currentlaw

        # if the node tree only contains a single reference, it looks
        # better if the entire expression, not just the
        # chapter/section part, is linked. But not if it's a
        # "anonymous" law ('1 § i lagen (1234:234) om blahonga')
        if (len(self.find_nodes(root,'GenericRefs')) == 1 and
            len(self.find_nodes(root,'SectionRefID')) == 1 and
            len(self.find_nodes(root,'AnonymousExternalLaw')) == 0):
            res = [self.format_generic_link(root)]
        else:
            res = self.format_tokentree(root)

        return res

    def format_SectionItemRefs(self, root):
        """Format section+item refs with currentsection taken from the first id."""
        assert root.nodes[0].nodes[0].tag == 'SectionRefID'
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        #res = self.formatter_dispatch(root.nodes[0]) # was formatter_dispatch(self.root)
        result = self.format_tokentree(root)
        self.currentsection = None
        return result

    def format_PieceItemRefs(self, root):
        """Link '<piece> <first item>' as one unit, then dispatch remaining items."""
        self.currentpiece = root.nodes[0].nodes[0].text.strip()
        item_list = root.nodes[2]
        first_item = item_list.nodes[0]
        link_text = "%s %s" % (root.nodes[0].text, first_item.text)
        res = [self.format_custom_link(self.find_attributes([first_item]),
                                       link_text, root.tag)]
        for remaining in item_list.nodes[1:]:
            res.extend(self.formatter_dispatch(remaining))
        self.currentpiece = None
        return res

    def format_ChapterSectionRef(self,root):
        """Link a chapter+section reference, remembering the chapter context.

        NOTE(review): this method is defined twice in this class with an
        identical body; at class-creation time the later definition wins,
        so this copy is effectively dead (but harmless).
        """
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        return [self.format_generic_link(root)]

    def format_AlternateChapterSectionRefs(self, root):
        """Format refs like '1 kap. 2 eller 3 §' with a temporary chapter context."""
        assert root.nodes[0].nodes[0].tag == 'ChapterRefID'
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        # print "Self.currentchapter is now %s" % self.currentchapter
        formatted = self.format_tokentree(root)
        self.currentchapter = None
        return formatted

        
        
    def format_ExternalLaw(self, root):
        """Dispatch the first child of an ExternalLaw node, clearing the chapter context."""
        self.currentchapter = None
        first_child = root.nodes[0]
        return self.formatter_dispatch(first_child)

    def format_ChangeRef(self,root):
        """Link a change/amendment reference ('lawref') to its law id."""
        # NOTE(review): reads .data here, while sibling formatters read .text
        # from LawRefID nodes -- presumably equivalent on this Node type;
        # confirm against the Node class before changing.
        id = self.find_node(root,'LawRefID').data
        return [self.format_custom_link({'lawref':id},
                                        root.text,
                                        root.tag)]

    def format_SFSNr(self, root):
        """Format an SFS number; seed baseuri_attributes from it when no base URI is set."""
        if self.baseuri is None:
            sfsid = self.find_node(root, 'LawRefID').data
            self.baseuri_attributes = {
                'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/' + sfsid + '#'}
        return self.format_tokentree(root)


    def format_NamedExternalLawRef(self,root):
        """Link a law referenced by name, optionally establishing a base URI.

        If no law is in context, resolves one from an embedded LawRefID or
        via namedlaw_to_sfsid; if no base URI is set, the resolved law also
        becomes the new baseuri_attributes context. Restores currentlaw
        (promoting it to lastlaw) before returning when it set it here.
        """
        resetcurrentlaw = False
        #print "format_NamedExternalLawRef: self.currentlaw is %r"  % self.currentlaw
        if self.currentlaw == None:
            resetcurrentlaw = True
            lawrefid_node = self.find_node(root,'LawRefID')
            if lawrefid_node == None:
                self.currentlaw = self.namedlaw_to_sfsid(root.text)
            else:
                self.currentlaw = lawrefid_node.text
                # Remember the name -> id mapping for later named references.
                namedlaw = self.normalize_lawname(self.find_node(root,'NamedLaw').text)
                # print "remember that %s is %s!" % (namedlaw, self.currentlaw)
                self.currentlynamedlaws[namedlaw] = self.currentlaw
            #print "format_NamedExternalLawRef: self.currentlaw is now %r"  % self.currentlaw

        #print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri
        if self.currentlaw == None: # if we can't find a ID for this law, better not <link> it
            res = [root.text]
        else:
            res = [self.format_generic_link(root)]

        #print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri
        if self.baseuri == None and self.currentlaw != None:
            #print "format_NamedExternalLawRef: setting baseuri_attributes"
            # use this as the new baseuri_attributes
            # NOTE(review): group numbers below depend on self.re_urisegments
            # (defined elsewhere in this class) -- verify against that pattern.
            m = self.re_urisegments.match(self.currentlaw)
            if m:
                self.baseuri_attributes = {'baseuri':m.group(1),
                                           'law':m.group(2),
                                           'chapter':m.group(6),
                                           'section':m.group(8),
                                           'piece':m.group(10),
                                           'item':m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri':'http://rinfo.lagrummet.se/publ/sfs/'+self.currentlaw+'#'}

        if resetcurrentlaw:
            if self.currentlaw != None: self.lastlaw = self.currentlaw
            self.currentlaw = None
        return res

    ################################################################
    # KOD FÖR KORTLAGRUM
    def format_AbbrevLawNormalRef(self, root):
        """Resolve a law abbreviation to an SFS id and link the whole node."""
        abbreviation = self.find_node(root, 'LawAbbreviation').text
        self.currentlaw = self.namedlaw_to_sfsid(abbreviation, normalize=False)
        res = [self.format_generic_link(root)]
        # Remember the resolved law for later "samma lag" style references.
        if self.currentlaw is not None:
            self.lastlaw = self.currentlaw
        self.currentlaw = None
        return res

    def format_AbbrevLawShortRef(self, root):
        """Link an abbreviated law ref such as 'ABL 1:2' (law, chapter, section)."""
        abbrev_node, ref_node = root.nodes[0], root.nodes[2]
        assert abbrev_node.tag == 'LawAbbreviation'
        assert ref_node.tag == 'ShortChapterSectionRef'
        # Establish law/chapter/section context for the URI formatter ...
        self.currentlaw = self.namedlaw_to_sfsid(abbrev_node.text, normalize=False)
        assert ref_node.nodes[0].tag == 'ShortChapterRefID'
        assert ref_node.nodes[2].tag == 'ShortSectionRefID'
        self.currentchapter = ref_node.nodes[0].text
        self.currentsection = ref_node.nodes[2].text

        res = [self.format_generic_link(root)]

        # ... and clear it again afterwards.
        self.currentchapter = None
        self.currentsection = None
        self.currentlaw = None
        return res

    
    ################################################################
    # KOD FÖR FORARBETEN
    def forarbete_format_uri(self,attributes):
        """Build a rinfo URI for preparatory works (prop., bet., rskr., CELEX).

        Unknown attribute keys are ignored; an optional 'sidnr' attribute is
        appended as a page fragment. Removed two locals (resolvetobase,
        addfragment) that were assigned but never used.
        """
        # res = self.baseuri_attributes['baseuri']
        res = 'http://rinfo.lagrummet.se/'
        for key,val in attributes.items():
            if key == 'prop':
                res += "publ/prop/%s" % val
            elif key == 'bet':
                res += "ext/bet/%s" % val
            elif key == 'skrivelse':
                res += "ext/rskr/%s" % val
            elif key == 'celex':
                if len(val) == 8: # incorrectly formatted, uses YY instead of YYYY
                    val = val[0]+'19'+val[1:]
                res += "ext/celex/%s" % val
        if 'sidnr' in attributes:
            res += "#s%s" % attributes['sidnr']

        return res

    def format_ChapterSectionRef(self,root):
        """Link a chapter+section reference, remembering the chapter context.

        NOTE(review): identical re-definition of the method declared earlier
        in this class; this later copy is the one bound at class-creation
        time. Consider deleting one of the two.
        """
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        return [self.format_generic_link(root)]

    ################################################################
    # KOD FÖR EGLAGSTIFTNING
    def eglag_format_uri(self,attributes):
        """Build a CELEX URI for EG legislation (directives and regulations)."""
        res = 'http://rinfo.lagrummet.se/ext/celex/'

        # Derive the act type from marker attributes when not given explicitly.
        if 'akttyp' not in attributes:
            if 'forordning' in attributes:
                attributes['akttyp'] = u'förordning'
            elif 'direktiv' in attributes:
                attributes['akttyp'] = u'direktiv'
        if 'akttyp' not in attributes:
            raise AttributeError("Akttyp saknas")

        # Om hur CELEX-nummer konstrueras
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm
        # Om hur länkning till EURLEX ska se ut:
        # http://eur-lex.europa.eu/sv/tools/help_syntax.htm
        if 'ar' in attributes and 'lopnummer' in attributes:
            # Absolute URI: sector 3 (legislation) + year + form letter + serial.
            form_letter = {u'direktiv': 'L',
                           u'förordning': 'R'}[attributes['akttyp']]
            year = attributes['ar']
            if len(year) == 2:
                year = '19' + year
                attributes['ar'] = year
            res += "3%s%s%04d" % (year, form_letter, int(attributes['lopnummer']))
        elif not self.baseuri_attributes['baseuri'].startswith(res):
            # Relative reference, but the base context is not a celex context.
            # FIXME: should we warn about this?
            return None

        if 'artikel' in attributes:
            res += "#%s" % attributes['artikel']
            if 'underartikel' in attributes:
                res += ".%s" % attributes['underartikel']

        return res


    ################################################################
    # KOD FÖR RATTSFALL
    def rattsfall_format_uri(self,attributes):
        """Build a rinfo URI for a case-law report (NJA, RH, AD, ...).

        Raises AssertionError when the court is missing or unknown.
        """
        # Listan härledd från containers.n3/rattsfallsforteckningar.n3 i
        # rinfoprojektets källkod - en ambitiösare lösning vore att läsa
        # in de faktiska N3-filerna i en rdflib-graf.
        containerid = {u'NJA': '/publ/rattsfall/nja/',
                       u'RH': '/publ/rattsfall/rh/',
                       u'MÖD': '/publ/rattsfall/mod/',
                       u'RÅ': '/publ/rattsfall/ra/',
                       u'HFD': '/publ/rattsfall/hfd/',
                       u'RK': '/publ/rattsfall/rk/',
                       u'MIG': '/publ/rattsfall/mig/',
                       u'AD': '/publ/rattsfall/ad/',
                       u'MD': '/publ/rattsfall/md/',
                       u'FÖD': '/publ/rattsfall/fod/'}

        # res = self.baseuri_attributes['baseuri']
        if 'nja' in attributes:
            attributes['domstol'] = attributes['nja']

        assert 'domstol' in attributes, "No court provided"
        assert attributes['domstol'] in containerid, "%s is an unknown court" % attributes['domstol']
        res = "http://rinfo.lagrummet.se"+containerid[attributes['domstol']]

        # 'lopnr' may arrive as 'YYYY:N'; split it into year and number.
        # BUG FIX: was `lopnr.split(...)` -- `lopnr` was never bound, raising
        # NameError whenever this branch was taken.
        if 'lopnr' in attributes and ":" in attributes['lopnr']:
            (attributes['ar'], attributes['lopnr']) = attributes['lopnr'].split(":", 1)

        if attributes['domstol'] == u'NJA':
            # FIXME: URIs should be based on publikationsordinal, not
            # pagenumber (which this in effect is) - but this requires
            # a big lookup table/database/graph with
            # pagenumber-to-ordinal-mappings
            res += '%ss%s' % (attributes['ar'], attributes['sidnr'])
        else:
            res += '%s:%s' % (attributes['ar'], attributes['lopnr'])

        return res

    ################################################################
    # KOD FÖR EGRÄTTSFALL
    def egrattsfall_format_uri(self, attributes):
        """Build a CELEX-based URI for an EU court decision."""
        # Maps case-number prefix to the CELEX document descriptor.
        descriptor_by_decision = {
            'C': 'J',  # Judgment of the Court
            'T': 'A',  # Judgment of the Court of First Instance
            'F': 'W',  # Judgement of the Civil Service Tribunal
        }
        year = attributes['year']
        # FIXME: Change this before the year 2054 (as ECJ will
        # hopefully have fixed their case numbering by then)
        if len(year) == 2:
            century = "20" if int(year) < 54 else "19"
            year = century + year
        serial = '%04d' % int(attributes['serial'])
        descriptor = descriptor_by_decision[attributes['decision']]
        return "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial)
Exemplo n.º 40
0
# Smoke tests for the dice-roll grammar declared in the rollparse module.
from simpleparse.common import numbers
from simpleparse.parser import Parser

import rollparse

# Build a parser from the shared EBNF declaration.
parser = Parser(rollparse.declaration)

# Expressions that must parse completely as a 'roll' production.
tests_success = [
    "d6", "5d6", "5d6 + d8", "(5d6 + d8)", "6 + (5d6 + d8)", "[5d6 + d8] + 6",
    "{3d20} + 10"
]

prod = "roll"

for test in tests_success:
    # parse() returns (success flag, child tag list, index of first unparsed char).
    success, children, nextcharacter = parser.parse(test, production=prod)
    assert success and nextcharacter == len(
        test
    ), """Wasn't able to parse %s as a %s (%s chars parsed of %s), returned value was %s""" % (
        repr(test), prod, nextcharacter, len(test),
        (success, children, nextcharacter))

# Expressions expected not to parse fully (results are not asserted below).
tests_fail = [
    "{5d6}+{8d8}",
    "5d",
    "3+",
    "8d8"  #this one should actually work
]

for test in tests_fail:
    success, children, nextcharacter = parser.parse(test, production=prod)
def CorpusPTBReader(ptb_data_path):
    """Dump cleaned, lowercased PTB sentences to 'total_ptb.txt'.

    Sentences shorter than 7 tokens are skipped; number spans are replaced
    by a NUM<span-length> placeholder and treebank artifacts (traces,
    e_s/n_s markers) are stripped by the regex chain below.

    NOTE(review): depends on module-level `grammar`, `Parser`,
    `BracketParseCorpusReader` and `re` defined/imported elsewhere.
    """
    ptb_sent_file = open("total_ptb.txt", "w")

    # Match every .mrg treebank file in any subdirectory.
    file_pattern = r".*/.*\.mrg"

    ptb = BracketParseCorpusReader(ptb_data_path, file_pattern)
    #print (ptb.fileids())
    #print ((ptb.sents()))
    #ptb.sents(fileids= 'brown/cf/cf01.mrg')[0]
    count = 0
    for sent in ptb.sents():
        '''sent = ""
        for word in sent:
            if "\\" in word or "e_s" in word or "n_s" in word:
                continue
            else:
                sent += word + " "
        out = sent[:-1]'''
        if len(sent) < 7: continue
        out = ' '.join(sent)
        out = out.lower()
        #        print(len(sent), out)

        # Tag number spans in the sentence using the 'all' production.
        parser = Parser(grammar, 'all')
        temp_result = parser.parse(out)
        sub_sent = []
        start_index = 0
        # Replace each matched number span with a NUM<span-length> token.
        for num_info in temp_result[1]:
            sub_sent.append(out[start_index:num_info[1]])
            sub_sent.append("NUM" + (str(num_info[2] - num_info[1])))
            start_index = num_info[2]
        sub_sent.append(out[start_index:])
        final_out = ''.join(sub_sent)

        # Strip treebank trace/marker artifacts; the order of these
        # substitutions matters (longer patterns are removed first).
        final_out = re.sub(r'\*\-NUM\d ', '', final_out)
        final_out = re.sub(r'e_s ', '', final_out)
        final_out = re.sub(r'n_s ', '', final_out)
        final_out = re.sub(r'e_s', '', final_out)
        final_out = re.sub(r'n_s', '', final_out)
        final_out = re.sub(r'\\. ', '', final_out)
        final_out = re.sub(r'\\.', '', final_out)
        final_out = re.sub(r'\*. ', '', final_out)
        final_out = re.sub(r'\*.', '', final_out)
        final_out = re.sub(r'-. ', '', final_out)
        final_out = re.sub(r'-.', '', final_out)
        #final_out = re.sub(r'\**.\* ', '', final_out)
        #final_out = re.sub(r'\**.\*', '', final_out)
        final_out = re.sub(r'\*{,3}.\*.. ', '', final_out)
        final_out = re.sub(r'\*{,3}.\*. ', '', final_out)
        final_out = re.sub(r'\*.. ', '', final_out)
        final_out = re.sub(r'\*..', '', final_out)
        final_out = re.sub(r'\* ', '', final_out)
        #final_out = re.sub(r'\*', '', final_out)
        final_out = re.sub(r'- ', '', final_out)
        final_out = re.sub(r'-', '', final_out)
        final_out = re.sub(r'; ; ', '; ', final_out)
        # Drop the trailing character (the final period/space of the sentence).
        final_out = final_out[:-1]
        ptb_sent_file.write(final_out)
        ptb_sent_file.write("\n")
        #print(final_out)
        count += 1
        #if count == 10000: break
        #if count > 10: break
    ptb_sent_file.close()
    print(count)
Exemplo n.º 42
0
from simpleparse.common import numbers
from simpleparse.parser import Parser

import rollparse

parser = Parser(rollparse.declaration)

tests_success = [
    "d6",
    "5d6",
    "5d6 + d8",
    "(5d6 + d8)",
    "6 + (5d6 + d8)",
    "[5d6 + d8] + 6",
    "{3d20} + 10"
    ]

prod = "roll"

for test in tests_success:
    success, children, nextcharacter = parser.parse(test, production=prod)
    assert success and nextcharacter==len(test), """Wasn't able to parse %s as a %s (%s chars parsed of %s), returned value was %s"""%( repr(test), prod, nextcharacter, len(test), (success, children, nextcharacter))

tests_fail = [
    "{5d6}+{8d8}",
    "5d",
    "3+",
    "8d8" #this one should actually work
    ]

for test in tests_fail:
Exemplo n.º 43
0
     a job, a column, and another job, from left to right. In the 
     second row there are two jobs, from left to right.

     The column in the first row has two jobs side by side, then
     another one above them.
  """

    try:
        fid = file(fname, 'rt')
    except Exception, detail:
        raise RuntimeError, "Unable to open layout file: %s\n  %s" % (
            fname, str(detail))

    data = fid.read()
    fid.close()
    parser = Parser(declaration, "file")

    # Replace all CR's in data with nothing, to convert DOS line endings
    # to unix format (all LF's).
    data = string.replace(data, '\x0D', '')

    tree = parser.parse(data)

    # Last element of tree is number of characters parsed
    if not tree[0]:
        raise RuntimeError, "Layout file cannot be parsed"

    if tree[2] != len(data):
        raise RuntimeError, "Parse error at character %d in layout file" % tree[
            2]
Exemplo n.º 44
0
def parseInput(input):
    """Parse *input* as a 'fastg' production and return the resulting children."""
    parser = Parser(declaration)
    success, children, nextcharacter = parser.parse(input, production="fastg")
    assert success
    return children
Exemplo n.º 45
0
semesterkuerzel      := ("A-M", [0-9]) / ("IK-M", [0-9]) / ("BWI", [0-9]) / "MINF1" / "BMT5" / "BTI1" / -"-"+
>kuerzel<            := fachKuerzel, nummer?
prakKuerzel          := [A-Z], [A-Z], "P"
verbKuerzel          := [A-Z], [A-Z], "J"
labKuerzel           := ([A-Z], [A-Z], "L") / (fachKuerzel, " L")
gwKuerzel            := [A-Z_-]+, (" ", [a-zA-Z]+)?
oe1                  := "I"
oe2                  := "II"
fachKuerzel          := [a-zA-Z]+
nummer               := int
no                   := int
gruppe               := int
alphanumGruppe       := [A-Z]
'''
VeranstaltungParser = Parser(declaration, root="root")


def tryGetFullName(veranstaltung):
    """Return the full course name for *veranstaltung*, or '' if it cannot be parsed."""
    from veranstaltungenDispatchProcessor import VeranstaltungDispatchProcessor

    success, result, nextcharacter = VeranstaltungParser.parse(
        veranstaltung, processor=VeranstaltungDispatchProcessor())
    return result[0] if success else ""


def test():
    from simpleparse import dispatchprocessor
Exemplo n.º 46
0
 def setDeclaration(self, declaration, production):
     """Rebuild this object's parser and tag table from *declaration*.

     Stores a Parser for *production* on ``self.parser`` and the
     corresponding tagging table on ``self.table``.
     """
     parser = Parser(declaration, production)
     table = parser.buildTagger(production=production)
     self.parser = parser
     self.table = table
Exemplo n.º 47
0
# Registry of reusable comment productions, keyed by production name.
c = {}

eolcomments = r"""
### comment formats where the comment goes
### from a marker to the end of the line

comment   := -'\012'*
<EOL>       := ('\r'?,'\n')/EOF

>hash_comment< := '#', comment, EOL
>semicolon_comment< := ';', comment, EOL
>slashslash_comment< := '//', comment, EOL
"""

# Wrap each end-of-line comment production as a library element so other
# grammars can reference it.
_p = Parser(eolcomments)
c.update(
    (name, objectgenerator.LibraryElement(
        generator=_p._generator,
        production=name,
    ))
    for name in ("hash_comment", "semicolon_comment", "slashslash_comment")
)

ccomments = r"""
### comments in format /* comment */ with no recursion allowed
comment := -"*/"*
>slashbang_comment< := '/*', comment, '*/'
"""
# Parser for C-style block comments; its productions are registered into
# the library-element registry by the loop that follows.
_p = Parser(ccomments)
for name in ["c_comment", "slashbang_comment"]:
    c[name] = objectgenerator.LibraryElement(
        generator=_p._generator,
Exemplo n.º 48
0
     This is a panel with two rows. In the first row there is
     a job, a column, and another job, from left to right. In the 
     second row there are two jobs, from left to right.

     The column in the first row has two jobs side by side, then
     another one above them.
  """

  try:
    fid = file(fname, 'rt')
  except Exception, detail:
    raise RuntimeError, "Unable to open layout file: %s\n  %s" % (fname, str(detail))

  data = fid.read()
  fid.close()
  parser = Parser(declaration, "file")

  # Replace all CR's in data with nothing, to convert DOS line endings
  # to unix format (all LF's).
  data = string.replace(data, '\x0D', '')

  tree = parser.parse(data)

  # Last element of tree is number of characters parsed
  if not tree[0]:
    raise RuntimeError, "Layout file cannot be parsed"

  if tree[2] != len(data):
    raise RuntimeError, "Parse error at character %d in layout file" % tree[2]

  Rows = []
Exemplo n.º 49
0
 def __init__(self, filedef=transit_file_def, verbosity=1):
     """Initialize the base Parser from *filedef* and set up processing.

     filedef -- grammar declaration passed to Parser.__init__
                (defaults to the module-level transit_file_def).
     verbosity -- stored on the instance and forwarded to the
                  TransitFileProcessor created here.
     """
     Parser.__init__(self, filedef)
     self.verbosity = verbosity
     self.tfp = TransitFileProcessor(self.verbosity)