def __init__(
    self,
    known_metadata,
    docid,
    format=ARTFLVector,
    parallel=ARTFLParallels,
    xpaths=None,
    metadata_xpaths=None,
    token_regex=Default_Token_Regex,
    non_nesting_tags=None,
    self_closing_tags=None,
    pseudo_empty_tags=None,
    output=None,
):
    """Initialise the parser state for one document.

    Args:
        known_metadata: Stored unchanged as ``self.known_metadata``.
        docid: Document identifier; stored and forwarded to
            ``OHCOVector.CompoundStack``.
        format: First argument to ``OHCOVector.CompoundStack``
            (defaults to ``ARTFLVector``).
        parallel: Second argument to ``OHCOVector.CompoundStack``
            (defaults to ``ARTFLParallels``).
        xpaths: Stored as ``self.map``; a falsy value selects
            ``TEI_XPaths``.
        metadata_xpaths: Stored as ``self.metadata_paths``; a falsy value
            selects ``TEI_MetadataXPaths``.
        token_regex: Stored as ``self.token_regex``.
        non_nesting_tags: Stored as ``self.non_nesting_tags``; ``None``
            means a new empty list.
        self_closing_tags: Stored as ``self.self_closing_tags``; ``None``
            means a new empty list.
        pseudo_empty_tags: Stored as ``self.pseudo_empty_tags``; ``None``
            means a new empty list.
        output: Forwarded to ``OHCOVector.CompoundStack``.
    """
    self.known_metadata = known_metadata
    self.docid = docid
    self.i = shlaxtree.ShlaxIngestor(target=self)
    self.tree = None  # unnecessary?
    self.root = None
    self.stack = []
    self.map = xpaths or TEI_XPaths
    self.metadata_paths = metadata_xpaths or TEI_MetadataXPaths
    # OHCOVector should take an output file handle.
    self.v = OHCOVector.CompoundStack(format, parallel, docid, output)
    self.extractors = []
    self.file_position = 0
    self.token_regex = token_regex
    # The tag lists used to default to shared ``[]`` literals, so every
    # instance built with defaults held the *same* list objects.  A None
    # sentinel gives each instance its own list while leaving
    # caller-supplied lists aliased exactly as before.
    self.non_nesting_tags = [] if non_nesting_tags is None else non_nesting_tags
    self.self_closing_tags = [] if self_closing_tags is None else self_closing_tags
    self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
    self.pushed_tags = {}
    self.depth_pushed = {}
def __init__(
    self,
    output,
    docid,
    filesize,
    token_regex=r"(\w+)|([\.\?\!])",
    xpaths=None,
    metadata_xpaths=None,
    suppress_tags=None,
    pseudo_empty_tags=None,
    known_metadata=None,
):
    """Initialise the parser state for one document.

    Args:
        output: Stored as ``self.output`` and forwarded to
            ``OHCOVector.CompoundStack``.
        docid: Document identifier; stored and forwarded to
            ``OHCOVector.CompoundStack``.
        filesize: Stored as ``self.filesize``.
        token_regex: Stored as ``self.token_regex``.
        xpaths: Sequence whose shallow copy (``xpaths[:]``) is stored as
            ``self.xpaths``; ``None`` means ``[("doc", "./")]``.
        metadata_xpaths: Sequence whose shallow copy is stored as
            ``self.metadata_xpaths``; ``None`` means an empty list.
        suppress_tags: Stored as ``self.suppress_xpaths``; ``None`` means
            a new empty list.
        pseudo_empty_tags: Stored as ``self.pseudo_empty_tags``; ``None``
            means a new empty list.
        known_metadata: Stored as ``self.known_metadata``; ``None`` means
            a new empty dict.
    """
    self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    self.parallel_type = "page"
    self.output = output
    self.docid = docid

    # Initialize an OHCOVector stack. Operations on this stack produce all
    # parser output.
    self.v = OHCOVector.CompoundStack(self.types, self.parallel_type, docid, output)

    self.filesize = filesize
    self.token_regex = token_regex

    # Caller-supplied path lists are shallow-copied, as before; the
    # defaults are built fresh per instance.
    self.xpaths = [("doc", "./")] if xpaths is None else xpaths[:]
    self.metadata_xpaths = [] if metadata_xpaths is None else metadata_xpaths[:]

    # These used to default to shared ``[]`` / ``{}`` literals stored
    # without copying, so every default-constructed instance held the same
    # objects.  A None sentinel gives each instance its own container while
    # keeping caller-supplied objects aliased exactly as before.
    self.suppress_xpaths = [] if suppress_tags is None else suppress_tags
    self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
    self.known_metadata = {} if known_metadata is None else known_metadata

    self.buffer_position = 0
    self.buffers = []
def __init__(
    self,
    output,
    docid,
    filesize,
    token_regex=r"(\w+)|([\.\?\!])",
    xpaths=None,
    metadata_xpaths=None,
    suppress_tags=None,
    pseudo_empty_tags=None,
    words_to_index=None,
    known_metadata=None,
):
    """Initialise the parser state for one document.

    Args:
        output: Stored as ``self.output`` and forwarded to
            ``OHCOVector.CompoundStack``.
        docid: Document identifier; stored and forwarded to
            ``OHCOVector.CompoundStack``.
        filesize: Stored as ``self.filesize``.
        token_regex: Stored as ``self.token_regex``.
        xpaths: Sequence whose shallow copy (``xpaths[:]``) is stored as
            ``self.xpaths``; ``None`` means ``[("doc", "./")]``.
        metadata_xpaths: Sequence whose shallow copy is stored as
            ``self.metadata_xpaths``; ``None`` means an empty list.
        suppress_tags: Stored as ``self.suppress_xpaths``; ``None`` means
            a new empty list.
        pseudo_empty_tags: Stored as ``self.pseudo_empty_tags``; ``None``
            means a new empty list.
        words_to_index: Accepted for interface compatibility; this
            constructor does not read or store it.
        known_metadata: Stored as ``self.known_metadata``; ``None`` means
            a new empty dict.
    """
    self.types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    self.parallel_type = "page"
    self.output = output
    self.docid = docid
    self.filesize = filesize
    self.v = OHCOVector.CompoundStack(self.types, self.parallel_type, docid, output)
    self.token_regex = token_regex

    # Caller-supplied path lists are shallow-copied, as before; the
    # defaults are built fresh per instance.
    self.xpaths = [("doc", "./")] if xpaths is None else xpaths[:]
    self.metadata_xpaths = [] if metadata_xpaths is None else metadata_xpaths[:]

    # These used to default to shared ``[]`` / ``{}`` literals stored
    # without copying, so every default-constructed instance held the same
    # objects.  A None sentinel gives each instance its own container while
    # keeping caller-supplied objects aliased exactly as before.
    self.suppress_xpaths = [] if suppress_tags is None else suppress_tags
    self.pseudo_empty_tags = [] if pseudo_empty_tags is None else pseudo_empty_tags
    self.known_metadata = {} if known_metadata is None else known_metadata

    self.stack = []
    self.root = None
    self.handlers = {}
    self.buffer_position = 0
    self.buffers = []