示例#1
0
    def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], raw_results=False, **metadata):
        """Query the PhiloLogic database, reusing cached hitlist files when they exist.

        Metadata filters are resolved first into a "corpus" hitlist file; the
        word query ``qs`` is then run with that corpus file passed along (or
        with ``None`` when no filter was given). Both results are cached under
        ``<self.path>/hitlists/`` in files named after a SHA-1 digest of the
        request parameters.

        Args:
            qs: Word query string. When empty, no word search is run.
            method: Search method name; falls back to ``"proxy"`` when falsy.
            method_arg: Argument for the search method. A string is converted
                to ``int``; if that fails it becomes 6 for the ``"cooc"`` and
                ``"sentence"`` methods and 0 otherwise.
            limit: Hit limit. A string is converted to ``int``; if that fails
                it becomes 10000000.
            sort_order: Sort keys. The value ``["rowid"]`` is replaced by
                ``None`` before it is handed to ``HitList.HitList`` or
                ``Query.query``. The list default is never mutated here, only
                rebound, so sharing it between calls is harmless.
            raw_results: Forwarded as ``raw`` to ``HitList.HitList`` and as
                ``raw_results`` to ``Query.query``.
            **metadata: Metadata filters, each a string or an iterable of
                strings. Empty values are ignored.

        Returns:
            The object produced by ``Query.query`` or ``HitList.HitList`` when
            ``qs`` is given; otherwise the metadata corpus, or the result of
            ``self.get_all(...)`` when there is no truthy corpus. An empty
            metadata corpus is returned immediately, before any word search.
        """
        method = method or "proxy"
        if isinstance(method_arg, str):
            try:
                method_arg = int(method_arg)
            except ValueError:
                # Non-numeric argument: use the per-method fallback.
                method_arg = 6 if method in ("cooc", "sentence") else 0

        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                limit = 10000000

        # The digest doubles as the cache key for both the corpus and the
        # search hitlist files. NOTE: it depends on the order in which the
        # metadata keywords were passed.
        digest = hashlib.sha1()
        digest.update(self.path.encode('utf8'))
        has_metadata = False
        corpus_file = None

        for key, value in list(metadata.items()):
            if isinstance(value, str) and value != "":
                # Normalise a single string to a one-element list.
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                key_value = "%s=%s" % (key, "|".join(value))
                digest.update(key_value.encode('utf8'))

        if has_metadata:
            corpus_hash = digest.hexdigest()
            corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"

            if not os.path.isfile(corpus_file):
                # Before querying, work out which hierarchy level each filter
                # belongs to and group the filters into one dict per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_types = self.locals["metadata_types"]
                metadata_dicts = [{} for _ in hierarchy]
                for k, v in list(metadata.items()):
                    for i, params in enumerate(hierarchy):
                        if v and (k in params):
                            metadata_dicts[i][k] = v
                            if k in metadata_types:
                                this_type = metadata_types[k]
                                if this_type == "div":
                                    metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                                else:
                                    metadata_dicts[i]["philo_type"] = ['"%s"' % this_type]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    # philo_id is attached to the last non-empty level, or
                    # stands alone when no other filter matched a level.
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append({"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
            else:
                # Cached corpus: load it from disk.
                if sort_order == ["rowid"]:
                    sort_order = None
                corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order, raw=raw_results)
                corpus.finish()
            if len(corpus) == 0:
                return corpus
        else:
            corpus = None

        if qs:
            # Extend the corpus digest with the search parameters to obtain
            # the cache key of the word search.
            digest.update(qs.encode('utf8'))
            digest.update(method.encode('utf8'))
            digest.update(str(method_arg).encode('utf8'))
            digest.update(str(limit).encode('utf8'))
            search_hash = digest.hexdigest()
            search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
            if sort_order == ["rowid"]:
                sort_order = None
            if not os.path.isfile(search_file):
                return Query.query(self,
                                   qs,
                                   corpus_file,
                                   self.width,
                                   method,
                                   method_arg,
                                   limit,
                                   filename=search_file,
                                   sort_order=sort_order,
                                   raw_results=raw_results)
            # Cached search: the hit width is the number of term groups in
            # the parsed query.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order, raw=raw_results)

        if corpus:
            return corpus
        return self.get_all(self.locals["default_object_level"], sort_order)
示例#2
0
    def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], **metadata):
        """Query the PhiloLogic database, reusing cached hitlist files when they exist.

        Metadata filters are resolved first into a "corpus" hitlist file; the
        word query ``qs`` is then run with that corpus file passed along (or
        with ``None`` when no filter was given). Both results are cached under
        ``<self.path>/hitlists/`` in files named after a SHA-1 digest of the
        request parameters.

        Args:
            qs: Word query string. When empty, no word search is run.
            method: Search method name; falls back to ``"proxy"`` when falsy.
            method_arg: Argument for the search method. A string is converted
                to ``int``; if that fails it becomes 6 for the ``"cooc"`` and
                ``"sentence"`` methods and 0 otherwise.
            limit: Hit limit. A string is converted to ``int``; if that fails
                it becomes 10000000.
            sort_order: Sort keys. The value ``["rowid"]`` is replaced by
                ``None`` before it is handed to ``HitList.HitList`` or
                ``Query.query``. The list default is never mutated here, only
                rebound, so sharing it between calls is harmless.
            **metadata: Metadata filters, each a string or an iterable of
                strings. Empty values are ignored.

        Returns:
            The object produced by ``Query.query`` or ``HitList.HitList`` when
            ``qs`` is given; otherwise the metadata corpus, or the result of
            ``self.get_all("doc", sort_order)`` when there is no truthy
            corpus. An empty metadata corpus is returned immediately, before
            any word search.
        """
        method = method or "proxy"
        if isinstance(method_arg, str):
            try:
                method_arg = int(method_arg)
            except ValueError:
                # Non-numeric argument: use the per-method fallback.
                method_arg = 6 if method in ("cooc", "sentence") else 0

        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                limit = 10000000

        # hashlib only accepts bytes, so every text fragment is encoded as
        # UTF-8 before it is fed to the digest. NOTE: the digest depends on
        # the order in which the metadata keywords were passed.
        digest = hashlib.sha1()
        digest.update(self.path.encode('utf8'))
        has_metadata = False
        corpus_file = None

        for key, value in list(metadata.items()):
            if isinstance(value, str) and value != "":
                # Normalise a single string to a one-element list.
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                key_value = "%s=%s" % (key, "|".join(value))
                digest.update(key_value.encode('utf8'))

        if has_metadata:
            corpus_hash = digest.hexdigest()
            corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"

            if not os.path.isfile(corpus_file):
                # Before querying, work out which hierarchy level each filter
                # belongs to and group the filters into one dict per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_types = self.locals["metadata_types"]
                metadata_dicts = [{} for _ in hierarchy]
                for k, v in list(metadata.items()):
                    for i, params in enumerate(hierarchy):
                        if v and (k in params):
                            metadata_dicts[i][k] = v
                            if k in metadata_types:
                                this_type = metadata_types[k]
                                if this_type == "div":
                                    metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                                else:
                                    metadata_dicts[i]["philo_type"] = ['"%s"' % this_type]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    # philo_id is attached to the last non-empty level, or
                    # stands alone when no other filter matched a level.
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append({"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
            else:
                # Cached corpus: load it from disk.
                if sort_order == ["rowid"]:
                    sort_order = None
                corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order)
                corpus.finish()
            if len(corpus) == 0:
                return corpus
        else:
            corpus = None

        if qs:
            # Extend the corpus digest with the search parameters to obtain
            # the cache key of the word search.
            digest.update(qs.encode('utf8'))
            digest.update(method.encode('utf8'))
            digest.update(str(method_arg).encode('utf8'))
            digest.update(str(limit).encode('utf8'))
            search_hash = digest.hexdigest()
            search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
            if sort_order == ["rowid"]:
                sort_order = None
            if not os.path.isfile(search_file):
                return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file, sort_order=sort_order)
            # Cached search: the hit width is the number of term groups in
            # the parsed query.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order)

        if corpus:
            return corpus
        return self.get_all("doc", sort_order)
示例#3
0
    def query(self,
              qs="",
              method="",
              method_arg=0,
              limit=10000000,
              **metadata):
        """Query the PhiloLogic database, reusing cached hitlist files when they exist.

        Metadata filters are resolved first into a "corpus" hitlist file; the
        word query ``qs`` is then run with that corpus file passed along (or
        with ``None`` when no filter was given). Both results are cached under
        ``<self.path>/hitlists/`` in files named after a SHA-1 digest of the
        request parameters.

        Args:
            qs: Word query string. When empty, no word search is run.
            method: Search method name, forwarded unchanged to ``Query.query``.
            method_arg: Argument for the search method, forwarded unchanged.
            limit: Hit limit, forwarded unchanged.
            **metadata: Metadata filters, each a string or an iterable of
                strings. Empty values are ignored.

        Returns:
            The object produced by ``Query.query`` or ``HitList.HitList`` when
            ``qs`` is given; otherwise the metadata corpus, or the result of
            ``self.get_all("doc")`` when there is no truthy corpus. An empty
            metadata corpus is returned immediately, before any word search.
        """
        # hashlib only accepts bytes, so every text fragment is encoded as
        # UTF-8 before it is fed to the digest. NOTE: the digest depends on
        # the order in which the metadata keywords were passed.
        digest = hashlib.sha1()
        digest.update(self.path.encode('utf8'))
        has_metadata = False
        corpus_file = None

        for key, value in list(metadata.items()):
            if isinstance(value, str) and value != "":
                # Normalise a single string to a one-element list.
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                key_value = "%s=%s" % (key, "|".join(value))
                digest.update(key_value.encode('utf8'))

        if has_metadata:
            corpus_hash = digest.hexdigest()
            corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"

            if not os.path.isfile(corpus_file):
                # Before querying, work out which hierarchy level each filter
                # belongs to and group the filters into one dict per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_types = self.locals["metadata_types"]
                metadata_dicts = [{} for _ in hierarchy]
                for k, v in list(metadata.items()):
                    for i, params in enumerate(hierarchy):
                        if v and (k in params):
                            metadata_dicts[i][k] = v
                            if k in metadata_types:
                                this_type = metadata_types[k]
                                if this_type == "div":
                                    metadata_dicts[i]["philo_type"] = [
                                        '"div"|"div1"|"div2"|"div3"'
                                    ]
                                else:
                                    metadata_dicts[i]["philo_type"] = [
                                        '"%s"' % this_type
                                    ]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    # philo_id is attached to the last non-empty level, or
                    # stands alone when no other filter matched a level.
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append(
                            {"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file,
                                                      metadata_dicts)
            else:
                # Cached corpus: load it from disk.
                corpus = HitList.HitList(corpus_file, 0, self)
                corpus.finish()
            if len(corpus) == 0:
                return corpus
        else:
            corpus = None

        if qs:
            # Extend the corpus digest with the search parameters to obtain
            # the cache key of the word search.
            digest.update(qs.encode('utf8'))
            digest.update(method.encode('utf8'))
            digest.update(str(method_arg).encode('utf8'))
            digest.update(str(limit).encode('utf8'))
            search_hash = digest.hexdigest()
            search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
            if not os.path.isfile(search_file):
                return Query.query(self,
                                   qs,
                                   corpus_file,
                                   self.width,
                                   method,
                                   method_arg,
                                   limit,
                                   filename=search_file)
            # Cached search: the hit width is the number of term groups in
            # the parsed query.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self)

        if corpus:
            return corpus
        return self.get_all("doc")