def word_search(self, qs, *metadata_dicts, **options):
    """Run a word search against the database, caching results on disk.

    Parameters:
        qs -- space-delimited query string; one word per hit position.
        metadata_dicts -- one dict of metadata constraints per object level,
            combined via self.toms.compound_query().
        options -- recognized keys "method", "method_arg", "limit"; the whole
            dict is also forwarded to Query.query().

    Returns a HitList when the result is already cached, [] when the metadata
    constraints match no objects, otherwise the result of Query.query()
    (presumably a HitList as well -- confirm in Query).
    """
    # The sha1 of every query ingredient names the on-disk cache files.
    sha = hashlib.sha1()
    sha.update(self.path)
    sha.update(qs)
    method = options.get("method", "")
    method_arg = options.get("method_arg", "")
    limit = options.get("limit", "")
    sha.update(method)
    # str(): callers may pass ints for method_arg/limit through options;
    # sha1.update() raises TypeError on a non-string argument.
    sha.update(str(method_arg))
    sha.update(str(limit))
    for metadata_level in metadata_dicts:
        for k, v in metadata_level.items():
            sha.update(k)
            sha.update(str(v))
    hex_hash = sha.hexdigest()
    hfile = "/var/lib/philologic/hitlists/" + hex_hash + ".hitlist"
    words_per_hit = len(qs.split(" "))
    if os.path.isfile(hfile):
        # Cache hit: reuse the stored hitlist.
        return HitList.HitList(hfile, words_per_hit)
    corpus_file = None
    corpus_size = self.width
    corpus_count = 0
    if metadata_dicts:
        # Materialize the matching corpus as packed 7-int philo_ids.
        corpus_file = "/var/lib/philologic/hitlists/" + hex_hash + ".corpus"
        corpus_fh = open(corpus_file, "wb")
        try:
            for c_obj in self.toms.compound_query(*metadata_dicts):
                c_id = [int(x) for x in c_obj["philo_id"].split(" ")]
                corpus_fh.write(struct.pack("=7i", *c_id))
                corpus_count += 1
        finally:
            # Don't leak the handle if compound_query or int() raises.
            corpus_fh.close()
        if corpus_count == 0:
            # No object satisfies the metadata constraints.
            return []
    return Query.query(self.path, qs, corpus_file, corpus_size, filename=hfile, **options)
def query(self, qs, method=None, method_arg=0, **metadata):
    """Run a word query with metadata constraints, caching results on disk.

    Parameters:
        qs -- space-delimited query string; one word per hit position.
        method -- search method name, or None.
        method_arg -- method parameter; int or string.
        metadata -- field=value constraints, resolved via self.toms.query().

    Returns a cached HitList when available, [] when the metadata matches no
    objects, otherwise the result of Query.query().
    """
    hashable = (qs, method, method_arg, tuple(metadata.items()))
    # sha1 of all query ingredients names the on-disk cache files.
    sha = hashlib.sha1()
    sha.update(self.path)
    sha.update(qs)
    sha.update(method or "")
    # str(): method_arg may be an int; the previous `method_arg or ""`
    # raised TypeError for any non-zero int since sha1.update() needs a str.
    sha.update(str(method_arg or ""))
    for key, value in metadata.items():
        sha.update(key)
        sha.update(value)
    hex_hash = sha.hexdigest()
    print >> sys.stderr, "%s hashes to %s" % (hashable, hex_hash)
    # check here to see if the query is cached.
    hfile = "/var/lib/philologic/hitlists/" + hex_hash + ".hitlist"
    words_per_hit = len(qs.split(" "))
    if os.path.isfile(hfile):
        print >> sys.stderr, "%s cached already" % (hashable,)
        return HitList.HitList(hfile, words_per_hit)
    corpus_file = None
    corpus_size = self.width
    corpus_count = 0
    print >> sys.stderr, "metadata = %s" % repr(metadata)
    if metadata:
        # Materialize the matching corpus as packed 7-int philo_ids.
        corpus_file = "/var/lib/philologic/hitlists/" + hex_hash + ".corpus"
        corpus_fh = open(corpus_file, "wb")
        try:
            for c_obj in self.toms.query(**metadata):
                c_id = [int(x) for x in c_obj["philo_id"].split(" ")]
                corpus_fh.write(struct.pack("=7i", *c_id))
                corpus_count += 1
        finally:
            # Don't leak the handle if toms.query or int() raises.
            corpus_fh.close()
        if corpus_count == 0:
            # No object satisfies the metadata constraints.
            return []
    print >> sys.stderr, "%d metadata objects" % corpus_count
    return Query.query(self.path, qs, corpus_file, corpus_size, method, method_arg, filename=hfile)
def query(self, qs="", method="", method_arg=0, limit=10000000, **metadata):
    """query the PhiloLogic database"""
    # A sha1 fingerprint of the request names the on-disk cache files for
    # both the metadata corpus and the word-search hitlist.
    digest = hashlib.sha1()
    digest.update(self.path)
    has_metadata = False
    corpus_file = None
    # Normalize every metadata parameter to a list of non-empty strings and
    # fold each surviving field into the fingerprint.
    for field, supplied in metadata.items():
        if isinstance(supplied, str) and supplied != "":
            supplied = [supplied]
            metadata[field] = supplied
        cleaned = [item for item in supplied if item]
        if cleaned:
            has_metadata = True
            digest.update("%s=%s" % (field, "|".join(cleaned)))
    if has_metadata:
        corpus_hash = digest.hexdigest()
        corpus_file = "/var/lib/philologic/hitlists/" + corpus_hash + ".hitlist"
        corpus_width = 7
        if not os.path.isfile(corpus_file):
            # Sort the parameters into one dict per level of the metadata
            # hierarchy before handing them to MetadataQuery.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for name, values in metadata.items():
                for depth, fields in enumerate(self.locals["metadata_hierarchy"]):
                    if values and (name in fields):
                        metadata_dicts[depth][name] = values
                        if name in self.locals["metadata_types"]:
                            obj_type = self.locals["metadata_types"][name]
                            if obj_type == "div":
                                metadata_dicts[depth]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[depth]["philo_type"] = ['"%s"' % self.locals["metadata_types"][name]]
            metadata_dicts = [level_dict for level_dict in metadata_dicts if level_dict]
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts)
        else:
            # The corpus for these constraints is already cached on disk.
            corpus = HitList.HitList(corpus_file, 0, self)
            corpus.finish()
        print >> sys.stderr, "corpus file of length %d" % len(corpus)
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        words_per_hit = len(qs.split(" "))
        # Extend the fingerprint with the word-search ingredients.
        digest.update(qs)
        digest.update(method)
        digest.update(str(method_arg))
        digest.update(str(limit))
        search_hash = digest.hexdigest()
        search_file = "/var/lib/philologic/hitlists/" + search_hash + ".hitlist"
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file)
        return HitList.HitList(search_file, words_per_hit, self)
    if corpus:
        return corpus
    return self.get_all("doc")
def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], **metadata):
    """Query the PhiloLogic database for words and/or metadata.

    Parameters:
        qs -- word query string; empty means metadata-only.
        method -- search method name; defaults to "proxy" when empty.
        method_arg -- int or numeric string; a non-numeric string falls back
            to 6 for "cooc"/"sentence" methods, else 0.
        limit -- int or numeric string; non-numeric falls back to 10000000.
        sort_order -- list of sort fields; ["rowid"] means native order.
            NOTE(review): mutable default -- harmless here because it is
            only reassigned, never mutated.
        metadata -- field=value(s) constraints, strings or lists of strings.

    Returns a HitList (cached or newly started), or the result of
    self.get_all("doc", sort_order) when neither qs nor metadata is given.
    """
    method = method or "proxy"
    if isinstance(method_arg, str):
        try:
            method_arg = int(method_arg)
        # was a bare `except:` -- int() only raises ValueError here, and a
        # bare clause would also swallow KeyboardInterrupt/SystemExit.
        except ValueError:
            if method == "cooc" or method == "sentence":
                method_arg = 6
            else:
                method_arg = 0
    if isinstance(limit, str):
        try:
            limit = int(limit)
        except ValueError:  # was a bare except: -- see above
            limit = 10000000
    # sha1 of all query ingredients names the on-disk cache files.
    hash = hashlib.sha1()
    hash.update(self.path)
    has_metadata = False
    corpus_file = None
    # Normalize each metadata value to a list of non-empty strings and fold
    # surviving fields into the hash.
    for key, value in metadata.items():
        if isinstance(value, str):
            if value == "":
                pass
            else:
                value = [value]
                metadata[key] = value
        value = [v for v in value if v]
        if value:
            has_metadata = True
            hash.update("%s=%s" % (key, "|".join(value)))
    if has_metadata:
        corpus_hash = hash.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        corpus_width = 7
        if not os.path.isfile(corpus_file):
            # Sort the parameters into one dict per level of the metadata
            # hierarchy before querying.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for k, v in metadata.items():
                for i, params in enumerate(self.locals["metadata_hierarchy"]):
                    if v and (k in params):
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
            metadata_dicts = [d for d in metadata_dicts if d]
            if "philo_id" in metadata:
                # A raw object id bypasses the hierarchy: attach it to the
                # innermost level queried, or make it a level of its own.
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
        else:
            # Corpus already cached on disk.
            if sort_order == ["rowid"]:
                sort_order = None
            corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        hash.update(qs)
        hash.update(method)
        hash.update(str(method_arg))
        hash.update(str(limit))
        search_hash = hash.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file, sort_order=sort_order)
        else:
            # Cached search: recompute words-per-hit from the parsed query.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order)
    else:
        if corpus:
            return corpus
        else:
            return self.get_all("doc", sort_order)
def query(self, qs="", method="", method_arg=0, limit=10000000, **metadata):
    """query the PhiloLogic database"""
    # A sha1 fingerprint of the request names the cache files for both the
    # metadata corpus and the word-search hitlist under self.path/hitlists.
    digest = hashlib.sha1()
    digest.update(self.path)
    has_metadata = False
    corpus_file = None
    # Normalize each metadata parameter to a list of non-empty strings and
    # fold surviving fields into the fingerprint.
    for field, supplied in metadata.items():
        if isinstance(supplied, str) and supplied != "":
            supplied = [supplied]
            metadata[field] = supplied
        cleaned = [item for item in supplied if item]
        if cleaned:
            has_metadata = True
            digest.update("%s=%s" % (field, "|".join(cleaned)))
    if has_metadata:
        corpus_hash = digest.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        corpus_width = 7
        if not os.path.isfile(corpus_file):
            # Sort the parameters into one dict per level of the metadata
            # hierarchy before handing them to MetadataQuery.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for name, values in metadata.items():
                for depth, fields in enumerate(self.locals["metadata_hierarchy"]):
                    if values and (name in fields):
                        metadata_dicts[depth][name] = values
                        if name in self.locals["metadata_types"]:
                            obj_type = self.locals["metadata_types"][name]
                            if obj_type == "div":
                                metadata_dicts[depth]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[depth]["philo_type"] = ['"%s"' % self.locals["metadata_types"][name]]
            metadata_dicts = [level_dict for level_dict in metadata_dicts if level_dict]
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts)
        else:
            # Corpus for these constraints is already cached on disk.
            corpus = HitList.HitList(corpus_file, 0, self)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        words_per_hit = len(qs.split(" "))
        # Extend the fingerprint with the word-search ingredients.
        digest.update(qs)
        digest.update(method)
        digest.update(str(method_arg))
        digest.update(str(limit))
        search_hash = digest.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file)
        return HitList.HitList(search_file, words_per_hit, self)
    if corpus:
        return corpus
    return self.get_all("doc")
def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], raw_results=False, **metadata):
    """Query the PhiloLogic database for words and/or metadata.

    Parameters:
        qs -- word query string; empty means metadata-only.
        method -- search method name; defaults to "proxy" when empty.
        method_arg -- int or numeric string; a non-numeric string falls back
            to 6 for "cooc"/"sentence" methods, else 0.
        limit -- int or numeric string; non-numeric falls back to 10000000.
        sort_order -- list of sort fields; ["rowid"] means native order.
            NOTE(review): mutable default -- harmless here because it is
            only reassigned, never mutated.
        raw_results -- forwarded to HitList/Query as `raw`/`raw_results`.
        metadata -- field=value(s) constraints, strings or lists of strings.

    Returns a HitList (cached or newly started), or the result of
    self.get_all(default_object_level, sort_order) when neither qs nor
    metadata is given.
    """
    method = method or "proxy"
    if isinstance(method_arg, str):
        try:
            method_arg = int(method_arg)
        # was a bare `except:` -- int() only raises ValueError here, and a
        # bare clause would also swallow KeyboardInterrupt/SystemExit.
        except ValueError:
            if method == "cooc" or method == "sentence":
                method_arg = 6
            else:
                method_arg = 0
    if isinstance(limit, str):
        try:
            limit = int(limit)
        except ValueError:  # was a bare except: -- see above
            limit = 10000000
    # sha1 of all query ingredients names the on-disk cache files.
    # (renamed from `hash`, which shadowed the builtin)
    sha = hashlib.sha1()
    sha.update(self.path.encode('utf8'))
    has_metadata = False
    corpus_file = None
    # Normalize each metadata value to a list of non-empty strings and fold
    # surviving fields into the hash.
    for key, value in list(metadata.items()):
        if isinstance(value, str):
            if value == "":
                pass
            else:
                value = [value]
                metadata[key] = value
        value = [v for v in value if v]
        if value:
            has_metadata = True
            key_value = "%s=%s" % (key, "|".join(value))
            sha.update(key_value.encode('utf8'))
    if has_metadata:
        corpus_hash = sha.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        if not os.path.isfile(corpus_file):
            # Sort the parameters into one dict per level of the metadata
            # hierarchy before querying.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for k, v in list(metadata.items()):
                for i, params in enumerate(self.locals["metadata_hierarchy"]):
                    if v and (k in params):
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
            metadata_dicts = [d for d in metadata_dicts if d]
            if "philo_id" in metadata:
                # A raw object id bypasses the hierarchy: attach it to the
                # innermost level queried, or make it a level of its own.
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
        else:
            # Corpus already cached on disk.
            if sort_order == ["rowid"]:
                sort_order = None
            corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order, raw=raw_results)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        sha.update(qs.encode('utf8'))
        sha.update(method.encode('utf8'))
        sha.update(str(method_arg).encode('utf8'))
        sha.update(str(limit).encode('utf8'))
        search_hash = sha.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file, sort_order=sort_order, raw_results=raw_results)
        else:
            # Cached search: recompute words-per-hit from the parsed query.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order, raw=raw_results)
    else:
        if corpus:
            return corpus
        else:
            return self.get_all(self.locals["default_object_level"], sort_order)
def query(self, qs="", method="", method_arg=0, limit=10000000, **metadata):
    """query the PhiloLogic database"""
    # A sha1 fingerprint of the request names the cache files for both the
    # metadata corpus and the word-search hitlist under self.path/hitlists.
    digest = hashlib.sha1()
    digest.update(self.path)
    has_metadata = False
    corpus_file = None
    # Normalize each metadata parameter to a list of non-empty strings and
    # fold surviving fields into the fingerprint.
    for field, supplied in metadata.items():
        if isinstance(supplied, str) and supplied != "":
            supplied = [supplied]
            metadata[field] = supplied
        cleaned = [item for item in supplied if item]
        if cleaned:
            has_metadata = True
            digest.update("%s=%s" % (field, "|".join(cleaned)))
    if has_metadata:
        corpus_hash = digest.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        corpus_width = 7
        if not os.path.isfile(corpus_file):
            # Sort the parameters into one dict per level of the metadata
            # hierarchy before handing them to MetadataQuery.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for name, values in metadata.items():
                for depth, fields in enumerate(self.locals["metadata_hierarchy"]):
                    if values and (name in fields):
                        metadata_dicts[depth][name] = values
                        if name in self.locals["metadata_types"]:
                            obj_type = self.locals["metadata_types"][name]
                            if obj_type == "div":
                                metadata_dicts[depth]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[depth]["philo_type"] = ['"%s"' % self.locals["metadata_types"][name]]
            metadata_dicts = [level_dict for level_dict in metadata_dicts if level_dict]
            if "philo_id" in metadata:
                # A raw object id bypasses the hierarchy: attach it to the
                # innermost level queried, or make it a level of its own.
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts)
        else:
            # Corpus for these constraints is already cached on disk.
            corpus = HitList.HitList(corpus_file, 0, self)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        # Extend the fingerprint with the word-search ingredients.
        digest.update(qs)
        digest.update(method)
        digest.update(str(method_arg))
        digest.update(str(limit))
        search_hash = digest.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file)
        # Cached search: recompute words-per-hit from the parsed query.
        terms = QuerySyntax.group_terms(QuerySyntax.parse_query(qs))
        words_per_hit = len(Query.split_terms(terms))
        return HitList.HitList(search_file, words_per_hit, self)
    if corpus:
        return corpus
    return self.get_all("doc")