Python OHCOVector примеры использования

Язык программирования: Python

Пространство имен/Пакет: philologic.OHCOVector

Класс/Тип: OHCOVector

Примеров на hotexamples.com: 2

Python OHCOVector - 2 примера найдено. Это лучшие примеры Python кода для philologic.OHCOVector.OHCOVector, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

OHCOVector(1)

pull(1)

push(1)

typedepth(1)

Пример #1

Показать файл

 def __init__(self,filename,docid,format=ARTFLVector,mapping=TEIMapping,paths=TEIPaths): 
     """Initialise the parser state for a single document.

     filename -- stored on the instance as ``self.filename``.
     docid    -- document number; slot 0 of the object vector is seeded
                 with ``docid - 1``.
     format   -- passed to ``OHCOVector`` to build the object vector.
     mapping  -- stored as ``self.mapping``.
     paths    -- stored as ``self.metapaths``.
     """
     # Configuration handed in by the caller.
     self.filename = filename
     self.format = format
     self.mapping = mapping
     self.metapaths = paths

     # Reader, writer and event stream all start out unset.
     self.reader = self.writer = self.event_stream = None

     # Empty bookkeeping containers and counters.
     self.object_stack, self.context = [], []
     self.counts = dict()
     self.parallel = dict(line=0, byte=0)
     self.line_max = 0
     self.objects_max = None

     # The parser should know what its object levels are.
     vector = OHCOVector(self.format)
     # docid - 1 because the slot is about to be incremented.
     vector.v[0] = docid - 1
     self.objects = vector
     self.full_did = vector.v[:]

Пример #2

Показать файл

class AbstractParser:
    """Turn a stream of tag/text events into emitted, hierarchically numbered objects.

    The parser reads events (start tags, end tags, text) from an event
    stream, opens an object for every start tag listed in ``mapping``,
    tokenizes text into "word" and "sent" objects, and writes one
    tab-separated line per object to the writer.

    The helpers ``OHCOVector``, ``shlax``, ``buffer_stream``, ``parse_xpath``
    and ``context_match`` are defined outside this class; comments about them
    below describe only how they are used here.
    """

    def __init__(self,filename,docid,format=ARTFLVector,mapping=TEIMapping,paths=TEIPaths):
        """Initialise the parser state for a single document.

        filename -- stored on the instance; attached to the "doc" object and
                    used in error messages.
        docid    -- document number; slot 0 of the object vector is seeded
                    with ``docid - 1``.
        format   -- passed to ``OHCOVector`` to build the object vector.
        mapping  -- tag name -> object type; consulted for every start tag.
        paths    -- object type -> {path: destination attribute}; used to
                    collect metadata.
        """
        self.reader = None
        self.writer = None
        self.filename = filename
        self.object_stack = []
        self.context = []
        self.counts = {}
        self.format = format
        self.objects = OHCOVector(self.format) # the parser should know what its object levels are.
        self.objects.v[0] = docid - 1 # - 1 because we're about to increment it.
        self.full_did = self.objects.v[:]
        self.line_max = 0
        self.mapping = mapping
        self.metapaths = paths
        self.parallel = {"line":0,
                         "byte":0}
        self.event_stream = None
        self.objects_max = None

    def parse(self, input, output):
        """Parse ``input`` and write every emitted object to ``output``.

        Returns a tuple ``(objects_max, counts)``: the value tracked by
        ``emit`` and the word-frequency dict filled in by ``tokenize``.
        """
        self.reader = input # filtering for bad encoding should be done in the reader
        self.writer = output # sorting or filtering output should be done by piping the writer
        self.event_stream = buffer_stream(shlax.parser(self.reader))
        for n in self.event_stream:
            if n.type == "StartTag":
                # make_object consumes the stream itself until this object ends.
                self.make_object(n.name,"doc",n,{"filename":self.filename})
        # note that objects are emitted in the order in which they END, due to the recursion.
        # document finishes last. sort as necessary/desired.
        return (self.objects_max,self.counts)

    def make_object(self,name,type, first = None, attributes = None):
        """Open an object, consume events until it ends, then emit it.

        name       -- tag name of the object; a matching end tag closes it.
        type       -- object type, pushed onto the object vector.
        first      -- the start-tag node that opened the object, if any.
        attributes -- initial attributes; copied, never modified in place.

        Returns the start-tag node that forced this object to close (its
        mapped type is at the same depth as this object or shallower), or
        None when the object ended normally.  The caller is expected to put
        a returned node back on the event stream.
        """
        discard = None
        mydepth = self.objects.typedepth(type)
#        print "pushing %s:%s at depth %d" % (type,name,mydepth)
        self.objects.push(type)
        # Copy because we don't want the attributes to get modified by inner
        # objects.  A None default replaces the former shared ``{}`` default.
        my_attributes = attributes.copy() if attributes is not None else {}
        my_attributes["start"] = first.start if first else self.parallel["byte"]

        # here we check if the start tag has any attributes we want to grab.
        # Without a start tag there is nothing to inspect; previously this
        # raised AttributeError on ``None.attributes``.
        if first is not None:
            for path_type,path,destination in self.get_xpaths():
                if path_type != type: continue
                (parsed_path,parsed_attr) = parse_xpath(path)
                if not parsed_attr: continue
                if context_match(self.context,parsed_path):
                    if parsed_attr in first.attributes and destination not in my_attributes:
                        my_attributes[destination] = first.attributes[parsed_attr]

        my_id = self.objects.v[:]
        my_context = self.context[:]
        self.object_stack.append((name,type,my_id,my_context,my_attributes))
        for node in self.event_stream:
            self.parallel["byte"] = node.start

            if node.type == "StartTag":
                self.context.append(node.name)
                if node.name in self.mapping:
                    newtype = self.mapping[node.name]
                    if self.objects.typedepth(newtype) <= mydepth:
                        #print "bouncing up!"
                        # should set node.attributes["end"] here.
                        discard = node
                        break
                    # should check here whether new type is child of this type, or else set discard and break.
                    inner_result = self.make_object(node.name,newtype,node,{"parent":" ".join(str(i) for i in my_id)})
                    if inner_result:
                        self.event_stream.append(inner_result) # result gets pushed onto the front of the loop.
                elif node.name == "l": #parallel objects are best handled with one-off hacks, for now.  maybe a "dispatch" keyword in the mapping?
                    if "n" in node.attributes:
                        self.parallel["line"] = int(node.attributes["n"])
                    else:
                        self.parallel["line"] += 1
                #   self.emit("line", node.name,self.full_did,self.parallel["byte"],self.parallel["line"]) # need to work out a notation for parallel objects.
                    self.line_max = max(self.parallel["line"],self.line_max)
                else:
                    pass # dispatch child attributes here.

            elif node.type == "text":
                self.tokenize(node.content,node.start)

            elif node.type == "EndTag":
                if self.context and self.context[-1] == node.name: # very conservative--this makes the stack tend to grow.
                    self.context.pop()
                if node.name == name: # we know this is the end because self.mapping is unambiguous, right?
                    my_attributes["end"] = node.start + len(node.content)
                    break
        # The stream ran out, or a sibling/ancestor start tag closed us:
        # fall back to the last byte offset seen.
        if "end" not in my_attributes:
            my_attributes["end"] = self.parallel["byte"]
        self.emit(type,name, my_id,my_attributes["start"],self.parallel["line"],**my_attributes)
        self.object_stack.pop()
        self.objects.pull(type)
        return discard

    def tokenize(self,text,offset):
        """Handle one text node: store it as metadata or split it into tokens.

        text   -- UTF-8 encoded text of the node.
        offset -- byte offset of the node; token offsets are added to it.

        If a metadata path without an attribute part matches an open parent
        object, the text (newlines and tabs removed) is appended to that
        parent's destination attribute and nothing is tokenized.  Otherwise
        every word is emitted as a "word" object and counted in
        ``self.counts``, and every sentence-ending punctuation mark is
        emitted as a "sent" object.  A UnicodeDecodeError is reported on
        stderr and the text node is skipped.
        """
        # Must exist even when no path is examined below; it used to be
        # unbound (UnboundLocalError) when the loop never assigned it.
        parent_match = None
        try: # long try blocks are bad, they say...so is python's utf-8 handling.
            text = text.decode("utf-8")
            # Group 1: a word (word characters or U+2019, the right single
            # quote); group 2: sentence punctuation.  Same pattern as the
            # former ur"..." literal, spelled so it also parses on Python 3.
            tokens = re.finditer(u"([\\w\u2019]+)|([\\.;:?!])",text,re.U)
            # todo: name regex subgroups.

            for path_type,path,destination in self.get_xpaths():
                (parsed_path,parsed_attr) = parse_xpath(path)
                if parsed_attr: continue
                parent_match = self.match_parents(parsed_path,path_type)
                if parent_match:
                    # index 4 of an object_stack entry is its attribute dict
                    parent_match[4][destination] = parent_match[4].get(destination,"") + re.sub("[\n\t]","",text.encode("utf-8"))
                    break

            if not parent_match: # don't tokenize if you are in metadata
                for token in tokens:
                    if token.group(1):
                        self.objects.push("word")
                        char_offset = token.start(1)
                        # convert the character offset into a byte offset
                        byte_length = len(text[:char_offset].encode("utf-8"))
                        self.emit("word",token.group(1),self.objects.v,offset + byte_length,self.parallel["line"])
                        self.counts[token.group(1)] = self.counts.get(token.group(1),0) + 1
                    if token.group(2):
                        self.objects.push("sent")
                        char_offset = token.start(2)
                        byte_length = len(text[:char_offset].encode("utf-8"))
                        self.emit("sent",token.group(2),self.objects.v,offset + byte_length,self.parallel["line"])

        except UnicodeDecodeError as err:
            print >> sys.stderr, "%s : %s@%s : %s;%s" % (type(err),self.filename,offset,err,err.args)

    def emit(self,type,name,vector,*parallel,**attr):
        """Write one object as a tab-separated line to ``self.writer``.

        The line is ``type``, ``name`` and the space-joined values of a copy
        of ``vector`` extended by the ``parallel`` values and, if any keyword
        attributes were given, the attribute dict itself.  ``objects_max`` is
        updated with the element-wise maximum of that list and its previous
        value.
        """
        v = vector[:]
        if parallel:
            v += parallel
        if attr:
            v.append(attr)
        print >> self.writer, "%s\t%s\t%s" % (type,name," ".join(str(x) for x in v) )
        try:
            self.objects_max = [max(x,y) for x,y in zip(v,self.objects_max)]
        except TypeError:
            # objects_max is still None on the first call, so zip() raises.
            self.objects_max = v

    def match_parents(self,parsed_path,path_type=None):
        """Return the innermost open object whose metadata path matches here.

        Walks ``self.object_stack`` from the most recently opened object
        outwards and returns the first entry (a tuple of name, type, id,
        context, attributes) that is still an ancestor of the current
        context, has a type present in ``self.metapaths``, has type
        ``path_type`` when one is given, and for which ``context_match``
        accepts the context below that object.  Returns None if no entry
        qualifies.
        """
        for parent in reversed(self.object_stack):
            (parent_name,parent_type,parent_id,parent_context,parent_attributes) = parent
            if self.context[:len(parent_context)] != parent_context: continue
            if parent_type not in self.metapaths: continue
            if path_type:
                if parent_type != path_type: continue
            working_context = self.context[len(parent_context):]
            if context_match(working_context,parsed_path):
                return parent
        return None

    def get_xpaths(self):
        """Flatten ``self.metapaths`` into a list of (type, path, destination) tuples."""
        return [(this_type,this_path,this_destination)
                for this_type in self.metapaths
                for (this_path,this_destination) in self.metapaths[this_type].items()]