def _open_subcorpus(self, corpname: str, subcname: str, corp: Corpus, spath: str, decode_desc: bool) -> Corpus:
    """
    Open the subcorpus stored in file `spath` and decorate the resulting
    object with identification and publication metadata.

    arguments:
    corpname -- identifier of the parent corpus (stored as str)
    subcname -- name of the subcorpus
    corp -- an already opened parent corpus
    spath -- path to the subcorpus file
    decode_desc -- if True then a found description is passed
                   through k_markdown(), otherwise it is stored as is

    returns:
    a manatee.SubCorpus instance with the following extra attributes set:
    corp, spath, corpname, subcname, subchash, created, is_published,
    orig_spath, orig_subcname, author, author_id, description
    """
    subc = manatee.SubCorpus(corp, spath)
    subc.corp = corp
    subc.spath = spath
    # Create/truncate the marker file whose path is spath with its last
    # four characters replaced by 'used'. This is best-effort only; the
    # handle is closed immediately so no file descriptor is leaked.
    try:
        with open(spath[:-4] + 'used', 'w'):
            pass
    except IOError:
        pass
    subc.corpname = str(corpname)  # never unicode (paths)
    subc.subcname = subcname
    # The hash is an MD5 digest of the raw bytes of the subcorpus file.
    with open(spath, 'rb') as subcinfo:
        subc.subchash = md5(subcinfo.read()).hexdigest()
    subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
    subc.is_published = subcorpus_is_published(spath)
    # Publication info is looked up in a '.name' file next to the subcorpus.
    meta, desc = get_subcorp_pub_info(os.path.splitext(spath)[0] + '.name')
    if meta.subcpath:
        subc.orig_spath = meta.subcpath
        subc.orig_subcname = os.path.splitext(
            os.path.basename(meta.subcpath))[0]
    else:
        subc.orig_spath = None
        subc.orig_subcname = None
    subc.author = meta.author_name
    subc.author_id = meta.author_id
    if desc:
        subc.description = k_markdown(desc) if decode_desc else desc
    else:
        subc.description = None
    return subc
def get_Corpus(self, corpname, subcname=''):
    """
    Open a corpus or, if a subcorpus name is provided, one of its subcorpora.

    arguments:
    corpname -- a corpus identifier; the form 'corpname:subcname' is
                accepted too, in which case the part after the first
                colon overrides the subcname argument
    subcname -- a subcorpus name (without the '.subc' suffix)

    returns:
    a manatee.Corpus if no subcorpus is requested, otherwise
    a manatee.SubCorpus loaded from the first matching '.subc' file

    raises:
    RuntimeError -- if no file for the requested subcorpus exists
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)
    corp = manatee.Corpus(corpname)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.cm = self
    dsubcpath = self.default_subcpath(corp)
    if not subcname:
        return corp

    for sp in self.subcpath + [dsubcpath]:
        # The default directory stores subcorpora directly; the other
        # directories have one sub-directory per corpus.
        if sp == dsubcpath:
            spath = os.path.join(sp, subcname + '.subc')
        else:
            spath = os.path.join(sp, corpname, subcname + '.subc')
        if isinstance(spath, unicode):
            spath = spath.encode("utf-8")
        if not os.path.isfile(spath):
            continue

        subc = manatee.SubCorpus(corp, spath)
        subc.corp = corp
        subc.spath = spath
        # Best-effort creation/truncation of the '<name>.used' marker
        # file; the handle is closed immediately to avoid a leak.
        try:
            with open(spath[:-4] + 'used', 'w'):
                pass
        except (IOError, OSError):
            pass
        subc.corpname = str(corpname)  # never unicode (paths)
        subc.subcname = subcname
        subc.cm = self
        # MD5 digest of the raw file contents (binary mode, handle closed).
        with open(spath, 'rb') as subcinfo:
            subc.subchash = md5(subcinfo.read()).hexdigest()
        subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
        return subc

    raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
def _open_subcorpus(self, corpname, subcname, corp, spath, decode_desc):
    """
    Open the subcorpus stored in file `spath` and decorate the resulting
    object with identification and publication metadata.

    arguments:
    corpname -- identifier of the parent corpus (stored as str)
    subcname -- name of the subcorpus
    corp -- an already opened parent corpus
    spath -- path to the subcorpus file
    decode_desc -- if True then a found description is passed
                   through k_markdown(), otherwise it is stored as is

    returns:
    a manatee.SubCorpus instance with the following extra attributes set:
    corp, spath, corpname, subcname, cm, subchash, created, is_published,
    orig_spath, orig_subcname, author, description
    """
    subc = manatee.SubCorpus(corp, spath)
    subc.corp = corp
    subc.spath = spath
    # Create/truncate the marker file whose path is spath with its last
    # four characters replaced by 'used'. This is best-effort only; the
    # handle is closed immediately so no file descriptor is leaked.
    try:
        with open(spath[:-4] + 'used', 'w'):
            pass
    except IOError:
        pass
    subc.corpname = str(corpname)  # never unicode (paths)
    subc.subcname = subcname
    subc.cm = self
    # MD5 digest of the raw file contents (binary mode, handle closed).
    with open(spath, 'rb') as subcinfo:
        subc.subchash = md5(subcinfo.read()).hexdigest()
    subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
    subc.is_published = subcorpus_is_published(spath)
    # Publication info is looked up in a '.name' file next to the subcorpus.
    orig_path, author, desc = get_subcorp_pub_info(
        os.path.splitext(spath)[0] + '.name')
    if orig_path:
        subc.orig_spath = orig_path
        subc.orig_subcname = os.path.splitext(
            os.path.basename(orig_path))[0]
    else:
        subc.orig_spath = None
        subc.orig_subcname = None
    subc.author = author
    if desc:
        subc.description = k_markdown(desc) if decode_desc else desc
    else:
        subc.description = None
    return subc
def _load_corp(corp_id, subc_path):
    """
    Create a manatee.Corpus and, if a subcorpus path is provided,
    wrap it in a manatee.SubCorpus.

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus (a falsy value means "no subcorpus")

    returns:
    the created object with its 'corpname' attribute set to corp_id
    """
    base_corp = manatee.Corpus(corp_id)
    loaded = manatee.SubCorpus(base_corp, subc_path) if subc_path else base_corp
    loaded.corpname = corp_id
    return loaded
def get_Corpus(self, corpname, corp_variant='', subcname=''):
    """
    Open a corpus or, if a subcorpus name is provided, one of its subcorpora.

    args:
    corpname: a corpus identifier; the form 'corpname:subcname' is
              accepted too, in which case the part after the first
              colon overrides the subcname argument
    corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                  please note that in many cases this can be omitted as only in case user
                  wants to see a continuous text (e.g. kwic context) we must make sure he
                  sees only a 'legal' chunk.
    subcname: a subcorpus name (without the '.subc' suffix)

    returns:
    a manatee.Corpus if no subcorpus is requested, otherwise
    a manatee.SubCorpus loaded from the first matching '.subc' file

    raises:
    RuntimeError: if no file for the requested subcorpus exists
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)

    registry_file = os.path.join(corp_variant, corpname) if corp_variant else corpname
    corp = manatee.Corpus(registry_file)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.cm = self
    dsubcpath = self.default_subcpath(corp)
    if not subcname:
        return corp

    for sp in self.subcpath + [dsubcpath]:
        # The default directory stores subcorpora directly; the other
        # directories have one sub-directory per corpus.
        if sp == dsubcpath:
            spath = os.path.join(sp, subcname + '.subc')
        else:
            spath = os.path.join(sp, corpname, subcname + '.subc')
        if isinstance(spath, unicode):
            spath = spath.encode("utf-8")
        if not os.path.isfile(spath):
            continue

        subc = manatee.SubCorpus(corp, spath)
        subc.corp = corp
        subc.spath = spath
        # Best-effort creation/truncation of the '<name>.used' marker
        # file; the handle is closed immediately to avoid a leak.
        try:
            with open(spath[:-4] + 'used', 'w'):
                pass
        except IOError:
            pass
        subc.corpname = str(corpname)  # never unicode (paths)
        subc.subcname = subcname
        subc.cm = self
        # MD5 digest of the raw file contents (binary mode, handle closed).
        with open(spath, 'rb') as subcinfo:
            subc.subchash = md5(subcinfo.read()).hexdigest()
        subc.created = datetime.fromtimestamp(
            int(os.path.getctime(spath)))
        return subc

    raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
def run(self):
    """
    Validate the query settings, evaluate the query against the corpus
    (or subcorpus) and pass each accepted hit to the processor.

    With a processor derived from Nonprocessor the result stream is not
    walked; only the hit count reported by the query result is stored.

    Attributes set on self:
    hits -- number of processed hits (or the reported count, see above)
    querytime -- GMT timestamp string taken after the query was processed
    duplicates -- number of hits dropped by the bloom filter
    elapsed -- seconds spent evaluating and processing the query

    If self.subcorpus contains no "/", it is rewritten in place to a full
    path below the corpus PATH ("subcorp/<name>.subc").

    Raises:
    QueryError -- if a required setting is missing, the processor has
                  a wrong type or the requested subcorpus does not exist
    """

    # Check whether query is prepared.
    if self.corpus is None:
        raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
        raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
        raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
        raise QueryError('You must specify at least one reference to do a search.')
    if self.container is None and not issubclass(type(self.processor), Nonprocessor):
        raise QueryError('You must specify the container to do a search.')
    # Compare by value; an identity test against a literal is unreliable.
    if self.string is None or self.string == '':
        raise QueryError('You must set the string property to a search string.')

    # Check whether processor of proper type
    if self.processor and not issubclass(type(self.processor), Processor):
        raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit heuristic warning that container might end up being too small.
    # This warns about the behaviour reported 2020 by EP.
    # The container may legitimately be None with a Nonprocessor; there is
    # nothing to warn about then (and no name to build the pattern from).
    if self.container is not None:
        q_pattern = r'.* within *<' + self.container + r'(| [^>]+)/>.*'
        q_string = r'within <' + self.container + r'/>'
        if not re.match(q_pattern, self.string):
            print("WARNING! Your query should probably end in '" + q_string + "' or your match might exceed the exported container.")
            if self.context_left == 0 or self.context_right == 0:
                print(" ... especially because at least one of your contexts is 0!")
            print(" ... Watch out for 'Index anomaly' warnings.")
            # Blank separator line (works under both Python 2 and 3).
            print("")

    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
        self.processor.prepare(self)

    # Set up and run query.
    h_corpus = manatee.Corpus(self.corpus)

    if self.subcorpus is not None:
        # If subcorpus name is given (instead of path), figure out full path to subcorpus .subc file.
        if "/" not in self.subcorpus:
            self.subcorpus = h_corpus.get_conf("PATH") + "subcorp/" + re.sub(r"\.subc$", "", self.subcorpus.strip(" /")) + ".subc"
        if os.path.exists(self.subcorpus):
            h_corpus = manatee.SubCorpus(h_corpus, self.subcorpus)
        else:
            raise QueryError('The requested subcorpus cannot be found.')

    # Region/container/reference handles are only needed when hits are processed.
    if not issubclass(type(self.processor), Nonprocessor):
        h_region = manatee.CorpRegion(h_corpus, ','.join(self.attributes), ','.join(self.structures))
        h_cont = h_corpus.get_struct(self.container)
        h_refs = [h_corpus.get_attr(r) for r in self.references]

    start_time = time.time()
    results = h_corpus.eval_query(self.string)

    # Process results.
    counter = 0
    dup_no = 0

    # In case class is "Noprocessor", we do not process the stream.
    if issubclass(type(self.processor), Nonprocessor):

        # Store the hit count as reported.
        self.hits = results.count_rest()
    else:

        # A negative max_hits means "no limit".
        while not results.end() and (self.max_hits < 0 or counter < self.max_hits):

            # Skip randomly if random subset desired.
            if self.random_subset > 0 and random.random() > self.random_subset:
                results.next()
                continue

            kwic_beg = results.peek_beg()  # Match begin.
            kwic_end = results.peek_end()  # Match end.

            # Both context bounds are counted from the container that
            # holds the match begin.
            match_cont_num = h_cont.num_at_pos(kwic_beg)
            cont_beg_num = match_cont_num - self.context_left   # First exported container.
            cont_end_num = match_cont_num + self.context_right  # Last exported container.

            # If hit not in desired region, drop.
            if cont_beg_num < 0 or cont_end_num < 0:
                results.next()
                continue

            cont_beg_pos = h_cont.beg(cont_beg_num)  # Pos at container begin.
            cont_end_pos = h_cont.end(cont_end_num)  # Pos at container end.

            refs = [h_ref.pos2str(kwic_beg) for h_ref in h_refs]
            region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

            # Deduping.
            if type(self.bloom) is pybloom_live.ScalableBloomFilter:
                # The key is built from every (1 + number of attributes)-th
                # item of the region, stripped and lower-cased.
                dd_region = ''.join([region[i].strip().lower() for i in range(0, len(region), 1+len(self.attributes))])
                if {dd_region : 0} in self.bloom:
                    dup_no += 1
                    results.next()
                    continue
                else:
                    self.bloom.add({dd_region : 0})

            # Call the processor.
            if self.processor:
                self.processor.process(self, region, refs, kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

            # Advance stream/loop.
            results.next()
            counter = counter + 1

        # After loop but inside "if not Nonprocessor", set hit count.
        self.hits = counter

    self.querytime = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates = dup_no
    self.elapsed = time.time()-start_time

    # Allow the processor to finalise its job.
    if self.processor:
        self.processor.finalise(self)
def _load_corp(corp_id, subc_path):
    """
    Instantiate a manatee.Corpus for corp_id; if subc_path is truthy,
    return a manatee.SubCorpus built on top of it instead. In both cases
    the returned object gets a 'corpname' attribute equal to corp_id.
    """
    opened = manatee.Corpus(corp_id)
    if not subc_path:
        opened.corpname = corp_id
        return opened
    subcorp = manatee.SubCorpus(opened, subc_path)
    subcorp.corpname = corp_id
    return subcorp