def linkStatQuery(self, linktype, links):
     """Build the statistics-query message for a list of links.

     The message starts with ``Q``, carries one ``T`` record per link (the
     packed ``linktype`` followed by the packed ``link.title``) and ends
     with ``N``.

     Args:
         linktype: Feature type packed into every ``T`` record.
         links: Iterable of objects exposing a ``title`` attribute.

     Returns:
         bytearray: The assembled request message.
     """
     # A bytes literal is required here: bytearray("Q") raises TypeError on
     # Python 3 because a str argument needs an explicit encoding.
     statquery = bytearray(b"Q")
     for link in links:
         statquery.extend(b'T')
         statquery.extend(strusMessage.packString(linktype))
         statquery.extend(strusMessage.packString(link.title))
     statquery.extend(b'N')
     return statquery
 def termStatQuery(self, terms):
     """Assemble the statistics request for a list of query terms.

     Layout: ``Q``, then for every term a ``T`` tag followed by the packed
     term type and the packed term value, then a closing ``N``.

     Args:
         terms: Iterable of objects exposing ``type`` and ``value``.

     Returns:
         bytearray: The assembled request message.
     """
     message = bytearray(b"Q")
     for entry in terms:
         message.extend(b'T')
         for field in (entry.type, entry.value):
             message.extend(strusMessage.packString(field))
     message.extend(b'N')
     return message
 def evaluateQuery(self, scheme, querystruct, firstrank, nofranks,
                   restrictdn, with_debuginfo):
     """Evaluate a query and deliver the merged result.

     Generator-style coroutine body: it yields the statistics request
     (``self.queryStatserver``) and the storage requests
     (``self.issueQueries``) and hands its result back by raising
     ``tornado.gen.Return``.

     Args:
         scheme: Name of the query evaluation scheme, packed into the request.
         querystruct: Object exposing ``terms`` and ``links``.
         firstrank: Index of the first rank wanted by the caller.
         nofranks: Number of ranks wanted by the caller.
         restrictdn: Document number restriction; 0 means none is sent.
         with_debuginfo: When true, the ``B`` flag is added to the request.

     Result (via ``tornado.gen.Return``):
         ``[[], []]`` for a query without terms, the value of
         ``self.mergeQueryResults`` on success, or
         ``([], [error message])`` when anything raised.
     """
     outcome = None
     try:
         # The request below asks for rank 0 .. firstrank + nofranks;
         # firstrank/nofranks themselves are passed to mergeQueryResults.
         wanted = firstrank + nofranks
         queryterms = querystruct.terms
         if queryterms:
             # Fetch the global statistics for the terms first.
             stats = yield self.queryStatserver(
                 self.termStatQuery(queryterms))
             dflist, collectionsize, error = stats
             if error is not None:
                 raise Exception(error)

             # Header section of the request.
             request = bytearray(b"Q")
             request.extend(b"M")
             request.extend(strusMessage.packString(scheme))
             request.extend(b"S" + struct.pack(">q", collectionsize))
             request.extend(b"I" + struct.pack(">H", 0))
             request.extend(b"N" + struct.pack(">H", wanted))
             if with_debuginfo:
                 request.extend(b"B")
             if restrictdn != 0:
                 request.extend(b"D" + struct.pack(">I", restrictdn))

             # One T record per term; a falsy length is sent as 1.
             for idx, term in enumerate(queryterms):
                 request.extend(b"T")
                 request.extend(strusMessage.packString(term.type))
                 request.extend(strusMessage.packString(term.value))
                 request.extend(
                     struct.pack(">Hqd", term.length or 1, dflist[idx], 1.0))

             # One L record per link.
             for link in querystruct.links:
                 request.extend(b"L")
                 request.extend(strusMessage.packString("vectfeat"))
                 request.extend(strusMessage.packString(link.title))
                 request.extend(struct.pack(">d", link.weight))

             # Send the request to the storage servers and merge the answers.
             answers = yield self.issueQueries(storageservers, scheme,
                                               request)
             outcome = self.mergeQueryResults(answers, firstrank, nofranks)
         else:
             # Nothing to search for: empty result, no errors.
             outcome = [[], []]
     except Exception as e:
         outcome = ([], ["error evaluation query: %s" % e])
     raise tornado.gen.Return(outcome)
# Example #4
# 0
def processCommand(message):
    """Evaluate one binary query request and build the binary reply.

    The request must start with ``Q`` followed by tagged parameters:

    * ``I`` + ``>H``: first rank
    * ``N`` + ``>H``: number of ranks
    * ``D`` + ``>I``: document number restriction (0 = none)
    * ``M`` + packed string: evaluation scheme
    * ``S`` + ``>q``: collection size
    * ``T`` + packed type + packed value + ``>Hqd`` (length, df, weight)
    * ``L`` + packed type + packed value + ``>d`` (weight)
    * ``B``: enable debug info

    The reply is delivered by raising ``tornado.gen.Return``. On success it
    is ``Y``, ``Z``, ``>H`` server number, then one record per result
    (``_``, ``D`` docno, ``W`` weight and scheme-specific fields). On any
    failure it is ``E`` followed by the UTF-8 encoded error text.

    ``backend``, ``serverno``, ``debugtrace``, ``isStopWordsOnlyQuery`` and
    ``strusMessage`` are taken from the enclosing module scope.
    """
    rt = bytearray(b"Y")
    trace_enabled = False

    def pack_weighted(tag, pairs):
        # Append one <tag><packed name><double weight> record per pair.
        for name, weight in pairs:
            rt.extend(tag)
            rt.extend(strusMessage.packString(name))
            rt.extend(struct.pack(">d", weight))

    try:
        messagesize = len(message)
        if messagesize < 1:
            raise Exception("empty request string")
        if message[0] != ord('Q'):
            raise Exception("unknown protocol command '%c'" % (message[0]))

        # QUERY:
        Term = collections.namedtuple(
            'Term', ['type', 'value', 'length', 'df', 'weight'])
        nofranks = 20
        restrictdn = 0
        collectionsize = 0
        firstrank = 0
        scheme = "BM25"
        terms = []
        links = []
        with_debuginfo = False

        # Build query to evaluate from the request:
        messageofs = 1
        while messageofs < messagesize:
            tag = message[messageofs]
            if tag == ord('I'):
                (firstrank, ) = struct.unpack_from(">H", message,
                                                   messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('N'):
                (nofranks, ) = struct.unpack_from(">H", message,
                                                  messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('D'):
                (restrictdn, ) = struct.unpack_from(">I", message,
                                                    messageofs + 1)
                messageofs += struct.calcsize(">I") + 1
            elif tag == ord('M'):
                (scheme, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
            elif tag == ord('S'):
                (collectionsize, ) = struct.unpack_from(">q", message,
                                                        messageofs + 1)
                messageofs += struct.calcsize(">q") + 1
            elif tag == ord('T'):
                (termtype, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
                (value, messageofs) = strusMessage.unpackString(
                    message, messageofs)
                (length, df,
                 weight) = struct.unpack_from(">Hqd", message, messageofs)
                messageofs += struct.calcsize(">Hqd")
                terms.append(Term(termtype, value, length, df, weight))
            elif tag == ord('L'):
                (linktype, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
                (value, messageofs) = strusMessage.unpackString(
                    message, messageofs)
                (weight, ) = struct.unpack_from(">d", message, messageofs)
                messageofs += struct.calcsize(">d")
                links.append(Term(linktype, value, 1, 0, weight))
            elif tag == ord('B'):
                messageofs += 1
                with_debuginfo = True
            else:
                # tornado.gen.Return is an Exception subclass, so raising it
                # here would be caught by the handler below and re-encoded
                # as "Eb'Eunknown parameter'". Raise a plain error instead
                # and let the handler produce the "E..." reply.
                raise Exception("unknown parameter")

        if with_debuginfo or debugtrace:
            backend.enableDebugTrace()
            trace_enabled = True

        doTitleSelect = isStopWordsOnlyQuery(terms, collectionsize)
        # ... if we have a query containing only stopwords, we reduce our
        # search space to the documents containing some query terms in the
        # title and the most referenced documents in the collection.

        # Evaluate query (an empty restriction list means "no restriction"):
        restrictset = [restrictdn] if restrictdn != 0 else []
        results = backend.evaluateQuery(scheme, doTitleSelect, terms, links,
                                        collectionsize, firstrank, nofranks,
                                        restrictset, debugtrace,
                                        with_debuginfo)

        # Build the result and pack it into the reply message for the client:
        rt.extend(b'Z')
        rt.extend(struct.pack(">H", serverno))

        is_std_link = scheme == "STDLNK"
        is_link_scheme = is_std_link or scheme in ("NBLNK", "TILNK", "VCLNK")
        for result in results:
            # Common record header: separator, document number, weight.
            rt.extend(b'_')
            rt.extend(b'D')
            rt.extend(struct.pack(">I", result.docno))
            rt.extend(b'W')
            rt.extend(struct.pack(">d", result.weight))
            if is_link_scheme:
                pack_weighted(b'L', result.links)
                if is_std_link:
                    pack_weighted(b'T', result.titles)
                    pack_weighted(b'F', result.features)
            else:
                rt.extend(b'T')
                rt.extend(strusMessage.packString(result.title))
                if result.paratitle:
                    rt.extend(b'P')
                    rt.extend(strusMessage.packString(result.paratitle))
                if result.debuginfo:
                    rt.extend(b'B')
                    rt.extend(strusMessage.packString(result.debuginfo))
                rt.extend(b'A')
                rt.extend(strusMessage.packString(result.abstract))

        if trace_enabled:
            backend.printDebugTrace()
            backend.disableDebugTrace()
            trace_enabled = False
    except Exception as e:
        # Do not leave the debug trace switched on after a failed request.
        if trace_enabled:
            backend.disableDebugTrace()
        raise tornado.gen.Return(bytearray("E%s" % e, 'utf-8'))
    raise tornado.gen.Return(rt)
    def analyzeQuery(self, scheme, querystr, nofranks):
        """Analyze a query string via the query analyze server.

        Generator-style coroutine body: the result is delivered by raising
        ``tornado.gen.Return(QueryStruct(terms, [], relatedterms, errors))``.

        The request is ``Q``, ``X`` + packed query string, ``N`` + ``>H``
        number of ranks. The reply must start with ``Y`` and contains
        ``T`` records (``T`` type, ``V`` value, ``L`` ``>I`` length, closed
        by ``_``) and ``R`` records (``V`` value, ``I`` ``>I`` index,
        ``W`` ``>d`` weight, closed by ``_``).

        If the request or the parsing of the reply fails, the error is
        recorded in ``errors``, everything parsed from the reply is
        discarded, and the terms are produced by the local ``analyzer``
        instead.

        Args:
            scheme: Not used by this method.
            querystr: The query string to analyze.
            nofranks: Number of ranks sent with the ``N`` parameter.
        """
        terms = []
        relatedterms = []
        errors = []
        format_error = "query analyze server result format error"
        try:
            query = bytearray(b"Q")
            query.extend(b'X')
            query.extend(strusMessage.packString(querystr))
            query.extend(b'N')
            query.extend(struct.pack(">H", nofranks))

            ri = qryserver.rindex(':')
            host, port = qryserver[:ri], int(qryserver[ri + 1:])
            conn = yield msgclient.connect(host, port)
            try:
                reply = yield msgclient.issueRequest(conn, query)
            finally:
                # The reply is complete at this point, so the connection is
                # released on every path before parsing starts.
                if conn is not None:
                    conn.close()

            if not reply:
                raise Exception("empty reply from query analyze server")
            if reply[0] == ord('E'):
                # Decode the payload; formatting the raw bytes would put a
                # "b'...'" repr into the message.
                raise Exception("failed to query analyze server: %s" %
                                bytes(reply[1:]).decode('utf-8', 'replace'))
            elif reply[0] != ord('Y'):
                raise Exception("protocol error in query analyze server query")

            # Parse into scratch lists; they are only published once the
            # whole reply has been validated.
            parsed_terms = []
            parsed_related = []
            replyofs = 1
            replylen = len(reply)
            while replyofs < replylen:
                if reply[replyofs] == ord('T'):
                    replyofs += 1
                    termtype = None
                    value = None
                    length = 1
                    while replyofs < replylen:
                        tag = reply[replyofs]
                        if tag == ord('T'):
                            (termtype, replyofs) = strusMessage.unpackString(
                                reply, replyofs + 1)
                        elif tag == ord('V'):
                            (value, replyofs) = strusMessage.unpackString(
                                reply, replyofs + 1)
                        elif tag == ord('L'):
                            (length, ) = struct.unpack_from(
                                ">I", reply, replyofs + 1)
                            replyofs += struct.calcsize(">I") + 1
                        elif tag == ord('_'):
                            replyofs += 1
                            break
                        else:
                            # Without this branch an unknown tag would never
                            # advance replyofs and the loop would spin
                            # forever.
                            raise Exception(format_error)
                    parsed_terms.append(
                        QueryTerm(termtype, value, length, 1.0))
                elif reply[replyofs] == ord('R'):
                    replyofs += 1
                    value = None
                    index = -1
                    weight = 0.0
                    while replyofs < replylen:
                        tag = reply[replyofs]
                        if tag == ord('V'):
                            (value, replyofs) = strusMessage.unpackString(
                                reply, replyofs + 1)
                        elif tag == ord('I'):
                            (index, ) = struct.unpack_from(
                                ">I", reply, replyofs + 1)
                            replyofs += struct.calcsize(">I") + 1
                        elif tag == ord('W'):
                            (weight, ) = struct.unpack_from(
                                ">d", reply, replyofs + 1)
                            replyofs += struct.calcsize(">d") + 1
                        elif tag == ord('_'):
                            replyofs += 1
                            break
                        else:
                            # Same guard against a non-advancing loop.
                            raise Exception(format_error)
                    if value is None:
                        # A related-term record without a value is malformed.
                        raise Exception(format_error)
                    valuestr = value.replace('_', ' ')
                    # Skip a related term that merely repeats the query.
                    if valuestr.lower() != querystr.lower():
                        encvalue = urllib.parse.quote(valuestr)
                        parsed_related.append(
                            RelatedTerm(valuestr, encvalue, index, weight))
                else:
                    break
            if replyofs != replylen:
                raise Exception(format_error)
            terms = parsed_terms
            relatedterms = parsed_related
        except Exception as e:
            errors.append("query analyze server request failed: %s" % e)
            # Fall back to the local analyzer. Nothing parsed from the failed
            # reply is kept, so the fallback terms are not mixed with a
            # partial server result.
            alt_terms = analyzer.analyzeTermExpression(["text", querystr])
            terms = [
                QueryTerm(term.type, term.value, term.length, 1.0)
                for term in alt_terms
            ]
        raise tornado.gen.Return(QueryStruct(terms, [], relatedterms, errors))
def processCommand(message):
    """Analyze one binary query request and build the binary reply.

    The request must start with ``Q`` followed by tagged parameters:

    * ``N`` + ``>H``: number of related terms (ranks) to search for
    * ``X`` + packed string: the query string

    The reply is delivered by raising ``tornado.gen.Return``. On success it
    is ``Y`` followed by one record per term (``T``, ``T`` type, ``V``
    value, optional ``L`` ``>I`` length, ``_``) and one record per related
    term (``R``, ``V`` value, ``I`` ``>I`` index, ``W`` ``>d`` weight,
    ``_``). On any failure it is ``E`` followed by the UTF-8 encoded error
    text.

    ``debugtrace``, ``strusctx``, ``analyzer``, ``vecstorage``,
    ``vecsearcher``, ``AnalyzerTerm``, ``RelatedTerm``, ``dumpDebugTrace``
    and ``strusMessage`` are taken from the enclosing module scope.
    """
    rt = bytearray(b"Y")
    try:
        if debugtrace:
            strusctx.enableDebugTrace("analyzer")

        # Protocol errors are raised as plain exceptions:
        # tornado.gen.Return is itself an Exception subclass, so raising it
        # inside this try block would be caught by the handler below and
        # re-encoded as "Eb'E...'".
        messagesize = len(message)
        if messagesize < 1:
            raise Exception("empty request string")
        if message[0] != ord('Q'):
            raise Exception("unknown protocol command '%c'" % (message[0]))

        # Build query to evaluate from the request:
        messageofs = 1
        while messageofs < messagesize:
            tag = message[messageofs]
            if tag == ord('N'):
                (nofranks, ) = struct.unpack_from(">H", message,
                                                  messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('X'):
                (querystr, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
            else:
                raise Exception("unknown parameter")

        # Analyze query:
        relatedlist = []
        terms = analyzer.analyzeTermExpression([["text", querystr],
                                                ["seltext", querystr]])

        # Extract vectors referenced (values of the form 'F<number>').
        # Slicing instead of indexing keeps an empty value from raising.
        f_indices = [
            int(term.value[1:]) for term in terms if term.value[:1] == 'F'
        ]

        # Build real list of features for retrieval in the searchindex:
        pos2term = {}
        pos = 0
        for term in terms:
            if term.type != "selstem":
                if term.length and term.length > 1:
                    pos2term[pos] = AnalyzerTerm(term.type, term.value,
                                                 term.length)
                    pos += term.length
                elif term.type == "stem":
                    pos2term[pos] = AnalyzerTerm(term.type, term.value, 1)
                    pos += 1
        # Fill the positions not yet occupied with the "selstem" terms.
        pos = 0
        for term in terms:
            if term.type == "selstem":
                if pos not in pos2term:
                    pos2term[pos] = AnalyzerTerm("stem", term.value, 1)
                pos += 1
        # NOTE: the terms are emitted in dict insertion order, not sorted
        # by position.
        terms = list(pos2term.values())

        # Calculate nearest neighbours of vectors extracted:
        if f_indices:
            vec = vecstorage.featureVector(f_indices[0])
            if len(f_indices) > 1:
                # Several features: search around the sum of their vectors.
                for nextidx in f_indices[1:]:
                    vec = [
                        v + i for v, i in zip(
                            vec, vecstorage.featureVector(nextidx))
                    ]
                neighbour_ranklist = vecsearcher.findSimilar(vec, nofranks)
            else:
                # Single feature: restrict the search to the features that
                # share a concept with it.
                neighbour_set = set()
                for concept in vecstorage.featureConcepts("", f_indices[0]):
                    neighbour_set.update(
                        vecstorage.conceptFeatures("", concept))
                neighbour_ranklist = vecsearcher.findSimilarFromSelection(
                    list(neighbour_set), vec, nofranks)

            for neighbour in neighbour_ranklist:
                fname = vecstorage.featureName(neighbour.featidx)
                relatedlist.append(
                    RelatedTerm(fname, neighbour.featidx, neighbour.weight))

        # Build the result and pack it into the reply message for the client:
        for term in terms:
            rt.extend(b'T')
            rt.extend(b'T')
            rt.extend(strusMessage.packString(term.type))
            rt.extend(b'V')
            rt.extend(strusMessage.packString(term.value))
            if term.length:
                rt.extend(b'L')
                rt.extend(struct.pack(">I", term.length))
            rt.extend(b'_')
        for related in relatedlist:
            rt.extend(b'R')
            rt.extend(b'V')
            rt.extend(strusMessage.packString(related.value))
            rt.extend(b'I')
            rt.extend(struct.pack(">I", related.index))
            rt.extend(b'W')
            rt.extend(struct.pack(">d", related.weight))
            rt.extend(b'_')
    except Exception as e:
        # Single place where the trace is switched off on failure.
        if debugtrace:
            strusctx.disableDebugTrace("analyzer")
        raise tornado.gen.Return(bytearray("E%s" % e, 'utf-8'))
    if debugtrace:
        dumpDebugTrace(strusctx.fetchDebugTrace(), "", 5)
        strusctx.disableDebugTrace("analyzer")
    raise tornado.gen.Return(rt)