def __init__(self, output, docid, filesize,
             token_regex=r"(\w+)|([\.\?\!])",
             xpaths=[("doc", "./")],
             metadata_xpaths=[],
             suppress_tags=[],
             pseudo_empty_tags=[],
             known_metadata={}):
    self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    self.parallel_type = "page"
    self.output = output
    self.docid = docid
    ## Initialize an OHCOVector stack; operations on this stack produce all parser output.
    self.v = OHCOVector.CompoundStack(self.types, self.parallel_type, docid, output)
    self.filesize = filesize
    self.token_regex = token_regex
    self.xpaths = xpaths[:]
    self.metadata_xpaths = metadata_xpaths[:]
    self.suppress_xpaths = suppress_tags
    self.pseudo_empty_tags = pseudo_empty_tags
    self.known_metadata = known_metadata
    self.buffer_position = 0
    self.buffers = []
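
# Illustrative sketch (not from the original source): how a token_regex of the
# form above splits raw text into word tokens (group 1) and sentence-ending
# punctuation (group 2), using only the standard re module. The sample string
# is invented for demonstration.
import re

token_regex = r"(\w+)|([\.\?\!])"
for m in re.finditer(token_regex, "A word. Another!", re.U):
    if m.group(1):
        print "word %s at %d" % (m.group(1), m.start(1))  # e.g. word A at 0
    if m.group(2):
        print "sent %s at %d" % (m.group(2), m.start(2))  # e.g. sent . at 6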
def __init__(self, known_metadata, docid, format=ARTFLVector, parallel=ARTFLParallels,
             xpaths=None, metadata_xpaths=None, token_regex=Default_Token_Regex,
             non_nesting_tags=[], self_closing_tags=[], pseudo_empty_tags=[],
             output=None):
    self.known_metadata = known_metadata
    self.docid = docid
    self.i = shlaxtree.ShlaxIngestor(target=self)
    self.tree = None  # unnecessary?
    self.root = None
    self.stack = []
    self.map = xpaths or TEI_XPaths
    self.metadata_paths = metadata_xpaths or TEI_MetadataXPaths
    # OHCOVector should take an output file handle.
    self.v = OHCOVector.CompoundStack(format, parallel, docid, output)
    self.extractors = []
    self.file_position = 0
    self.token_regex = token_regex
    self.non_nesting_tags = non_nesting_tags
    self.self_closing_tags = self_closing_tags
    self.pseudo_empty_tags = pseudo_empty_tags
    self.pushed_tags = {}
    self.depth_pushed = {}
def __init__(self, output, docid, filesize,
             token_regex=r"(\w+)|([\.\?\!])",
             xpaths=[("doc", "./")],
             metadata_xpaths=[],
             suppress_tags=[],
             pseudo_empty_tags=[],
             words_to_index=[],
             known_metadata={}):
    self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    self.parallel_type = "page"
    self.output = output
    self.docid = docid
    self.filesize = filesize
    self.v = OHCOVector.CompoundStack(self.types, self.parallel_type, docid, output)
    self.token_regex = token_regex
    self.xpaths = xpaths[:]
    self.metadata_xpaths = metadata_xpaths[:]
    self.suppress_xpaths = suppress_tags
    self.pseudo_empty_tags = pseudo_empty_tags
    self.known_metadata = known_metadata
    self.stack = []
    self.root = None
    self.handlers = {}
    self.buffer_position = 0
    self.buffers = []
import re
import sys

import shlax  # streaming XML tokenizer used by parse(); OHCOVector, context_match,
              # attribute_leaf, and emit_object are assumed to be defined elsewhere
              # in this module.


class AbstractParser:
    def __init__(self, filename, docid):
        # should take object levels, mapping, metapaths as keyword arguments.
        self.reader = None
        self.writer = None
        self.filename = filename
        self.context = []
        self.counts = {}
        self.current_object = {}
        self.meta_memory = {}
        self.metahandler = None
        # the parser should know what its object levels are.
        self.objects = OHCOVector(["doc", "div1", "div2", "div3", "para", "sent", "word"])
        self.objects.v[0] = docid - 1  # -1 because we're about to increment it.
        self.objects_max = self.objects.v[:]  # copy of the current vector; updated with running maxima below.
        self.line_max = 0
        # the raw mapping should be unambiguous, and context-free. we should be
        # able to put bonus items in here; "ln": "line", in particular.
        self.mapping = {"TEI": "doc",
                        "TEI.2": "doc",
                        "front": "div",
                        "div": "div",
                        "div0": "div",
                        "div1": "div",
                        "div2": "div",
                        "div3": "div",
                        "p": "para",
                        "sp": "para",
                        "stage": "para"}
        self.metamap = {"titleStmt/author": "author",
                        "titleStmt/title": "title",
                        "div/head": "head",
                        "div1/head": "head"}
        # metadata paths; order of evaluation is indeterminate, so MAKE SURE
        # that they are unambiguous. attributes should go in here too.
        self.metapaths = {"doc": {"titleStmt/author": "author",
                                  "titleStmt/title": "title"},
                          "div": {"head": "head"}}
        self.context_memory = None
        self.parallel = {"line": 0,  # this should be implicit.
                         "byte": 0}  # this should be automatic.

    def parse_metapaths(self):
        pass

    def match_metapaths(self):
        for obj_type, paths in self.metapaths.items():
            if obj_type in self.meta_memory and self.meta_memory[obj_type]:
                working_context = self.context[len(self.meta_memory[obj_type]):]
                # print self.context
                # print working_context
                # print self.current_object[obj_type]
                # metadata xpaths are ALWAYS relative. I should check for that.
                for path, destination in paths.items():
                    if context_match(working_context, path):
                        leaf = attribute_leaf(path)
                        if leaf:
                            return ("meta_attribute", obj_type, leaf)
                        else:
                            return ("meta_content", obj_type, destination)

    def push_object(self, type):
        self.objects.push(type)
        self.current_object[type] = self.objects.v[:]
        self.meta_memory[type] = self.context[:]
        # should maintain a toms stack here, basically.
        self.objects_max = [max(x, y) for x, y in zip(self.objects.v, self.objects_max)]

    def pull_object(self, type):
        self.objects.pull(type)
        self.current_object[type] = None
        self.meta_memory[type] = None
        # should remove toms from the stack and print them here.
        self.objects_max = [max(x, y) for x, y in zip(self.objects.v, self.objects_max)]

    def parse(self, input, output):
        self.reader = input
        self.writer = output
        p = shlax.parser(self.reader)
        for n in p:
            if n.type == "StartTag":
                self.parallel["byte"] = n.start
                self.context.append(n.name)
                # match metadata after you append: you want to see if you're
                # entering a metadata context.
                for pattern in self.metamap:
                    if context_match(self.context, pattern):
                        self.metahandler = self.metamap[pattern]
                        # store a copy, so the EndTag comparison tests contents,
                        # not identity.
                        self.context_memory = self.context[:]
                # meta_result = self.match_metapaths()
                # if meta_result: print meta_result
                if n.name in self.mapping:
                    type = self.mapping[n.name]
                    self.push_object(type)
                    attlist = ""
                    for k, v in n.attributes.items():
                        attlist += " %s=\"%s\"" % (k, v)
                    try:
                        emit_object(self.writer, type, "<" + n.name + attlist + ">",
                                    self.objects.v, self.parallel["byte"], self.parallel["line"])
                    except UnicodeDecodeError:
                        print >> sys.stderr, "bad encoding at %s byte %s" % (self.filename, n.start)
                    if type == "doc":
                        print >> self.writer, "meta %s %s" % ("filename", self.filename)
                if n.name == "l":
                    if "n" in n.attributes:
                        self.parallel["line"] = int(n.attributes["n"])
                    else:
                        self.parallel["line"] += 1
                    print >> self.writer, "line %d %d" % (self.parallel["byte"], self.parallel["line"])
                    self.line_max = max(self.parallel["line"], self.line_max)
            elif n.type == "EndTag":
                self.parallel["byte"] = n.start
                # match metadata before you pop: you want to see if you're
                # leaving a metadata context.
                if self.context_memory and self.context_memory == self.context:
                    self.metahandler = None
                    self.context_memory = None
                if self.context[-1] == n.name:
                    self.context.pop()
                else:
                    print >> sys.stderr, "mismatched tag at %s byte %s" % (self.filename, n.start)
                if n.name in self.mapping:
                    type = self.mapping[n.name]
                    self.pull_object(type)
                    emit_object(self.writer, type, "</" + n.name + ">",
                                self.objects.v, self.parallel["byte"], self.parallel["line"])
            elif n.type == "text":
                self.parallel["byte"] = n.start
                try:
                    # this tokenizer could go into its own subroutine...
                    text = n.content.decode("UTF-8")
                    tokens = re.finditer(ur"([\w\u2019]+)|([\.;:?!])", text, re.U)
                    offset = self.parallel["byte"]
                    if self.metahandler:
                        cleantext = re.sub("[\n\t]", " ", text)
                        print >> self.writer, "meta %s %s" % (self.metahandler, cleantext)
                    for token in tokens:
                        if token.group(1):
                            self.push_object("word")
                            char_offset = token.start(1)
                            byte_length = len(text[:char_offset].encode("UTF-8"))
                            emit_object(self.writer, "word", token.group(1),
                                        self.objects.v, offset + byte_length, self.parallel["line"])
                            self.counts[token.group(1)] = self.counts.get(token.group(1), 0) + 1
                        if token.group(2):
                            self.push_object("sent")
                            char_offset = token.start(2)  # use group 2's offset for sentence delimiters.
                            byte_length = len(text[:char_offset].encode("UTF-8"))
                            emit_object(self.writer, "sent", token.group(2),
                                        self.objects.v, offset + byte_length, self.parallel["line"])
                except UnicodeDecodeError:
                    print >> sys.stderr, "bad encoding in %s around byte %s" % (self.filename, n.start)
        # Here [after done parsing] I should see if I still have an object
        # stack, and unwind it, if so.
        max_v = self.objects_max[:]
        max_v.extend((self.parallel["byte"], self.line_max))
        return (max_v, self.counts)
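
# Hypothetical usage sketch (not part of the original source): the input file
# "doc1.tei", the output path "doc1.out", and docid 1 are invented. parse()
# streams its records to `writer` and returns the maximum object vector plus
# the word-count dictionary.
if __name__ == "__main__":
    parser = AbstractParser("doc1.tei", 1)
    with open("doc1.tei") as reader, open("doc1.out", "w") as writer:
        max_v, counts = parser.parse(reader, writer)
    print >> sys.stderr, "max vector: %s" % max_v
    print >> sys.stderr, "%d distinct words" % len(counts)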