def __init__(self, filename, docid, format=ARTFLVector, mapping=TEIMapping, paths=TEIPaths):
    """Set up an idle parser for one document.

    filename -- stored as-is on the instance.
    docid    -- written, minus one, into slot 0 of the object vector.
    format   -- handed to OHCOVector to build the object vector.
    mapping  -- stored as ``self.mapping``.
    paths    -- stored as ``self.metapaths``.
    """
    # Caller-supplied configuration.
    self.filename = filename
    self.format = format
    self.mapping = mapping
    self.metapaths = paths

    # Input, output and the event iterator are all unset at construction.
    self.reader = None
    self.writer = None
    self.event_stream = None

    # Bookkeeping that starts out empty or zeroed.
    self.object_stack = []
    self.context = []
    self.counts = {}
    self.parallel = {"line": 0, "byte": 0}
    self.line_max = 0
    self.objects_max = None

    # the parser should know what its object levels are.
    vector = OHCOVector(format)
    vector.v[0] = docid - 1  # - 1 because we're about to increment it.
    self.objects = vector
    self.full_did = vector.v[:]  # snapshot of the vector as first initialised
class AbstractParser:
    """Turn a stream of tag/text events into tab-separated object records.

    The parser walks the events produced by ``shlax.parser`` and keeps a
    stack of open objects.  Tags listed in ``mapping`` open a new object of
    the mapped type; text is split into "word" and "sent" objects unless it
    belongs to a metadata path, in which case it is accumulated on the
    matching open object instead.  Every object is written to the writer as
    ``type<TAB>name<TAB>space-separated vector`` when it is emitted.
    """

    def __init__(self,filename,docid,format=ARTFLVector,mapping=TEIMapping,paths=TEIPaths):
        """Set up an idle parser for one document.

        filename -- stored and attached to the "doc" object as its
                    ``filename`` attribute; also used in error messages.
        docid    -- written, minus one, into slot 0 of the object vector.
        format   -- handed to OHCOVector to build the object vector.
        mapping  -- tag name -> object type; a start tag found here opens a
                    new object of that type.
        paths    -- {object type: {path: destination}}; see get_xpaths().
        """
        self.reader = None
        self.writer = None
        self.filename = filename
        self.object_stack = []  # (name, type, id, context, attributes) per open object
        self.context = []       # names of the start tags currently considered open
        self.counts = {}        # word -> number of times it was emitted
        self.format = format
        self.objects = OHCOVector(self.format) # the parser should know what its object levels are.
        self.objects.v[0] = docid - 1 # - 1 because we're about to increment it.
        self.full_did = self.objects.v[:]
        self.line_max = 0
        self.mapping = mapping
        self.metapaths = paths
        self.parallel = {"line":0, "byte":0}
        self.event_stream = None
        self.objects_max = None

    def parse(self, input, output):
        """Parse ``input`` and write object records to ``output``.

        Each start tag seen at this level becomes a "doc" object.

        Returns a tuple ``(objects_max, counts)``: the element-wise maximum
        of the emitted vectors (None if nothing was emitted) and the word
        frequency dictionary.
        """
        self.reader = input # filtering for bad encoding should be done in the reader
        self.writer = output # sorting or filtering output should be done by piping the writer
        self.event_stream = buffer_stream(shlax.parser(self.reader))
        for n in self.event_stream:
            if n.type == "StartTag":
                self.make_object(n.name,"doc",n,{"filename":self.filename})
                # note that objects are emitted in the order in which they END, due to the recursion.
                # document finishes last. sort as necessary/desired.
        # NOTE(review): this file's indentation was reconstructed; confirm the
        # return belongs after the loop and not inside the StartTag branch.
        return (self.objects_max,self.counts)

    def make_object(self,name,type, first = None, attributes = None):
        """Consume events for one object and emit it once it ends.

        name       -- tag name that opened the object; a matching end tag
                      closes it.
        type       -- object type, as understood by the object vector.
        first      -- the start-tag node that opened the object, if any.
        attributes -- initial attributes; copied, never modified in place.

        Returns the start-tag node that forced this object to close early
        because its mapped type is not deeper than this object's type, so
        the caller can process it; returns None otherwise.
        """
        discard = None
        mydepth = self.objects.typedepth(type)
        # print "pushing %s:%s at depth %d" % (type,name,mydepth)
        self.objects.push(type)
        # Copy, because we don't want the attributes to get modified by inner objects.
        my_attributes = dict(attributes) if attributes is not None else {}
        my_attributes["start"] = first.start if first else self.parallel["byte"]
        # ``first`` is optional, so fall back to "no attributes" instead of
        # dereferencing None.
        start_attributes = getattr(first,"attributes",{})

        # here we check if the start tag has any attributes we want to grab.
        for path_type,path,destination in self.get_xpaths():
            (parsed_path,parsed_attr) = parse_xpath(path)
            if path_type != type:
                continue
            if not parsed_attr:
                continue
            if context_match(self.context,parsed_path):
                # Values already present (e.g. passed in by the caller) win.
                if parsed_attr in start_attributes and destination not in my_attributes:
                    my_attributes[destination] = start_attributes[parsed_attr]

        my_id = self.objects.v[:]
        my_context = self.context[:]
        self.object_stack.append((name,type,my_id,my_context,my_attributes))

        for node in self.event_stream:
            self.parallel["byte"] = node.start
            if node.type == "StartTag":
                self.context.append(node.name)
                if node.name in self.mapping:
                    newtype = self.mapping[node.name]
                    if self.objects.typedepth(newtype) <= mydepth:
                        #print "bouncing up!"
                        # should set node.attributes["end"] here.
                        # NOTE(review): the tag name was already appended to
                        # self.context above and presumably gets appended
                        # again when the caller re-reads this node -- confirm
                        # whether that duplicate entry is intended.
                        discard = node
                        break
                    # should check here whether new type is child of this type, or else set discard and break.
                    inner_result = self.make_object(node.name,newtype,node,{"parent":" ".join(str(i) for i in my_id)})
                    if inner_result:
                        self.event_stream.append(inner_result) # result gets pushed onto the front of the loop.
                elif node.name == "l":
                    #parallel objects are best handled with one-off hacks, for now. maybe a "dispatch" keyword in the mapping?
                    try:
                        self.parallel["line"] = int(node.attributes["n"])
                    except (KeyError,ValueError,TypeError):
                        # No "n" attribute, or one that is not an integer:
                        # keep counting rather than aborting the whole parse.
                        self.parallel["line"] += 1
                    # self.emit("line", node.name,self.full_did,self.parallel["byte"],self.parallel["line"])
                    # need to work out a notation for parallel objects.
                    self.line_max = max(self.parallel["line"],self.line_max)
                else:
                    pass # dispatch child attributes here.
            elif node.type == "text":
                self.tokenize(node.content,node.start)
            elif node.type == "EndTag":
                if self.context and self.context[-1] == node.name: # very conservative--this makes the stack tend to grow.
                    self.context.pop()
                if node.name == name: # we know this is the end because self.mapping is unambiguous, right?
                    my_attributes["end"] = node.start + len(node.content)
                    break

        # The stream ran out, or a sibling/ancestor-level tag bounced us out.
        if "end" not in my_attributes:
            my_attributes["end"] = self.parallel["byte"]
        self.emit(type,name, my_id,my_attributes["start"],self.parallel["line"],**my_attributes)
        self.object_stack.pop()
        self.objects.pull(type)
        return discard

    def tokenize(self,text,offset):
        """Handle one text node that starts at byte ``offset``.

        If the text sits under a text-content metadata path, it is appended
        (newlines and tabs removed) to that attribute of the matching open
        object and is not tokenized.  Otherwise every word is emitted as a
        "word" object and counted, and every sentence-ending punctuation
        mark is emitted as a "sent" object, each with its byte offset.

        A UnicodeDecodeError is reported on stderr and the text is skipped.
        """
        try: # long try blocks are bad, they say...so is python's utf-8 handling.
            text = text.decode("utf-8")
            # Same pattern as ur"([\w\u2019]+)|([\.;:?!])" (U+2019 is the
            # right single quotation mark), spelled so the literal is also
            # valid syntax on Python 3.
            tokens = re.finditer(u"([\\w\u2019]+)|([\\.;:?!])",text,re.U) # todo: name regex subgroups.

            # Must be bound even when no text-content path is tried below.
            parent_match = None
            for path_type,path,destination in self.get_xpaths():
                (parsed_path,parsed_attr) = parse_xpath(path)
                if parsed_attr:
                    continue # attribute paths are handled in make_object
                parent_match = self.match_parents(parsed_path,path_type)
                if parent_match:
                    parent_match[4][destination] = parent_match[4].get(destination,"") + re.sub("[\n\t]","",text.encode("utf-8"))
                    # NOTE(review): indentation was reconstructed; confirm the
                    # break belongs to the successful-match branch.
                    break

            if not parent_match: # don't tokenize if you are in metadata
                # Track the byte position incrementally instead of
                # re-encoding the whole prefix for every token.
                chars_done = 0
                bytes_done = 0
                for token in tokens:
                    # Exactly one group matches, so the match start is the
                    # start of whichever group is set.
                    char_offset = token.start()
                    bytes_done += len(text[chars_done:char_offset].encode("utf-8"))
                    chars_done = char_offset
                    if token.group(1):
                        self.objects.push("word")
                        self.emit("word",token.group(1),self.objects.v,offset + bytes_done,self.parallel["line"])
                        self.counts[token.group(1)] = self.counts.get(token.group(1),0) + 1
                    if token.group(2):
                        self.objects.push("sent")
                        self.emit("sent",token.group(2),self.objects.v,offset + bytes_done,self.parallel["line"])
        except UnicodeDecodeError as err:
            print >> sys.stderr, "%s : %s@%s : %s;%s" % (type(err),self.filename,offset,err,err.args)

    def emit(self,type,name,vector,*parallel,**attr):
        """Write one object record and update the running maxima.

        The record is ``type<TAB>name<TAB>`` followed by the space-separated
        items of ``vector``, then the ``parallel`` values, then the ``attr``
        dictionary (if any) rendered with str().
        """
        v = vector[:]
        if parallel:
            v += parallel
        if attr:
            v.append(attr)
        print >> self.writer, "%s\t%s\t%s" % (type,name," ".join(str(x) for x in v) )
        try:
            self.objects_max = [max(x,y) for x,y in zip(v,self.objects_max)]
        except TypeError:
            # Raised on the first call, while objects_max is still None
            # (zip cannot iterate None): seed the maxima with this record.
            self.objects_max = v

    def match_parents(self,parsed_path,path_type=None):
        """Find the innermost open object that ``parsed_path`` applies to.

        An open object qualifies when its saved context is a prefix of the
        current context, its type has metadata paths (and equals
        ``path_type`` if one is given), and the part of the current context
        below it matches ``parsed_path``.

        Returns that object's stack entry, a tuple
        ``(name, type, id, context, attributes)``, or None if none matches.
        """
        for parent in reversed(self.object_stack):
            (parent_name,parent_type,parent_id,parent_context,parent_attributes) = parent
            if self.context[:len(parent_context)] != parent_context:
                continue
            if parent_type not in self.metapaths:
                continue
            if path_type and parent_type != path_type:
                continue
            working_context = self.context[len(parent_context):]
            if context_match(working_context,parsed_path):
                return parent
        return None

    def get_xpaths(self):
        """Flatten ``self.metapaths`` into a list of (type, path, destination) tuples."""
        return [(this_type,this_path,this_destination)
                for this_type in self.metapaths
                for (this_path,this_destination) in self.metapaths[this_type].items()]