Пример #1
0
    def __send_search_request_to(self, query_id, TTL, ip, port, search_query, min_score, forwarding_node_count, result_count):
#               signal.alarm(10)
        try:
            connection = httplib.HTTPConnection(ip, port)
            connection.putrequest("POST", "/message")
            connection.endheaders()

            print "send message"

            protocol.sendHeader(connection, constants.SEARCH_REQUEST_COMMAND, query_id, TTL)
            protocol.sendSearchRequest(connection, min_score, forwarding_node_count, result_count, search_query)
            print "get response"
            http_response = connection.getresponse()

            (protocol_version, vendor, node_id, ip, port, bandwidth, counter, command_type, queryID, TTL) = response.readHeader(http_response)

            print "recv pong id = %s %s %s" % (node_id, ip, port)

            last_seen_time = int(time.time())
            nodeInfo = globalvars.maay_core.updateNodeInfo(node_id, ip, port, bandwidth, counter, last_seen_time)
            connection.close()
#               except TimeoutError:
#                       signal.alarm(0)
#                       return communication.Communication.TIMEOUT_ERROR
        except socket.error, (code, message):
            print "Connection problem on node [%s:%s]: %s" % (ip, port, message)
            return communication.Communication.CONNECTION_ERROR
Пример #2
0
    def __send_search_request_to(self, query_id, TTL, ip, port, search_query,
                                 min_score, forwarding_node_count,
                                 result_count):
        #               signal.alarm(10)
        try:
            connection = httplib.HTTPConnection(ip, port)
            connection.putrequest("POST", "/message")
            connection.endheaders()

            print "send message"

            protocol.sendHeader(connection, constants.SEARCH_REQUEST_COMMAND,
                                query_id, TTL)
            protocol.sendSearchRequest(connection, min_score,
                                       forwarding_node_count, result_count,
                                       search_query)
            print "get response"
            http_response = connection.getresponse()

            (protocol_version, vendor, node_id, ip, port, bandwidth, counter,
             command_type, queryID, TTL) = response.readHeader(http_response)

            print "recv pong id = %s %s %s" % (node_id, ip, port)

            last_seen_time = int(time.time())
            nodeInfo = globalvars.maay_core.updateNodeInfo(
                node_id, ip, port, bandwidth, counter, last_seen_time)
            connection.close()
#               except TimeoutError:
#                       signal.alarm(0)
#                       return communication.Communication.TIMEOUT_ERROR
        except socket.error, (code, message):
            print "Connection problem on node [%s:%s]: %s" % (ip, port,
                                                              message)
            return communication.Communication.CONNECTION_ERROR
Пример #3
0
    def recv_download_request(self, httpRequestHandler, query_id, TTL, sender_nodeID, sender_nodeIP, sender_nodePort, document_id, search_query):
        # check in the indexer if we have it
        documentInfo = globalvars.database.getDocumentInfo(document_id=document_id)
        if not documentInfo:
            print "never heard about document %s" % document_id
            # todo: forward request to a node which have a document which a close document id? 
            has_content = 0
            has_description = 0
        else:
            if documentInfo.state == maay.datastructure.documentinfo.KNOWN_STATE:
                print "I do not have the file on my disk, why do you ask me ?"
                # todo: but I can give you some other pointers
                has_content = 0
            else:
                has_content = 1
            has_description = 1

        flags = (has_content * constants.HAS_DOCUMENT_CONTENT_FLAG) | (has_description * constants.HAS_DOCUMENT_DESCRIPTION_FLAG)

        # update documentScore with the download request received and the
        # documentscore received

#               dp = documentProviders[0]
#               nodeInfo = globalvars.database.getNodeInfo(dp.node_id)

        httpRequestHandler.send_response(200)
        httpRequestHandler.end_headers()
        output = tools.file2stream(httpRequestHandler.wfile)
        protocol.sendHeader(output, constants.DOWNLOAD_RESPONSE_COMMAND, self.__generateQueryID(), constants.INIT_TTL)
        protocol.sendDownloadResponse(output, document_id, flags)
        if has_description:
            documentProviders = globalvars.database.getDocumentProviders(documentInfo.db_document_id)

            if has_content and not documentInfo.url:
                fileInfos = globalvars.database.getFileInfos(db_document_id = documentInfo.db_document_id, state = maay.datastructure.documentinfo.PUBLISHED_STATE)
                for fileInfo in fileInfos:
                    pos = fileInfo.file_name.find(globalvars.config.getValue("PublishedDocumentRoot"))
                    print "send url pos = %s" % pos
                    if pos != -1:
                        documentInfo.url = 'http://%s:%s/pub/%s' % (globalvars.ip, globalvars.port, fileInfo.file_name[pos + len(globalvars.config.getValue("PublishedDocumentRoot")) + 1:])

                        documentInfo.url = documentInfo.url.replace("\\", "/")
                        break

            protocol.sendDownloadResponseDocumentDescription(output, documentInfo.title, documentInfo.publication_time, documentInfo.mime_type, documentInfo.size, documentInfo.url or "", len(documentProviders))


            for dp in documentProviders:
                nodeInfo = globalvars.database.getNodeInfo(dp.node_id)
                protocol.sendDownloadResponseProvider(output, dp.node_id, nodeInfo.ip, nodeInfo.port, dp.last_providing_time, nodeInfo.last_seen_time, nodeInfo.bandwidth, nodeInfo.counter)

            if has_content:
                fileInfo = globalvars.database.getFileInfos(db_document_id = documentInfo.db_document_id)[0]
                protocol.sendDownloadResponseDocument(output, fileInfo.file_name, documentInfo.size)
                self.hasDownloaded(sender_nodeID, document_id, search_query, weight=DOWNLOAD_SCORE_WEIGHT)
Пример #4
0
def handleMessage(httpRequestHandler):
    # read message content
    (protocol_version, vendor, node_id, ip, port, bandwidth, counter,
     command_type, queryID,
     TTL) = response.readHeader(httpRequestHandler.rfile)

    last_seen_time = int(time.time())

    nodeInfo = globalvars.maay_core.updateNodeInfo(node_id, ip, port,
                                                   bandwidth, counter,
                                                   last_seen_time)

    # update information on the node
    # we have to forward them back also, and bufferize them before
    # forwarding them back
    # check error before

    if command_type == constants.SEARCH_REQUEST_COMMAND:
        (min_score, forwarding_node_count, result_count,
         search_query) = response.readSearchRequest(httpRequestHandler.rfile)
        globalvars.maay_core.recv_search_request(queryID, TTL, node_id, ip,
                                                 port, search_query, min_score,
                                                 forwarding_node_count,
                                                 result_count,
                                                 constants.MAAY_SEARCH_RANGE)

        globalvars.maay_core.manifest_interest(node_id, search_query)

    elif command_type == constants.SEARCH_RESPONSE_COMMAND:

        # if I receive answers of a unknown query, do nothing
        resultSpool = globalvars.maay_core.getResultSpoolManager(
        ).getResultSpool(queryID)
        if not resultSpool:
            return

        search_query = resultSpool.getQuery()

        globalvars.maay_core.manifest_interest(node_id, search_query)
        hit_count = response.readSearchResponseInfo(httpRequestHandler.rfile)
        for i in range(0, hit_count):
            (document_id, mime_type, url, publication_time, file_size, title,
             score_count, provider_count) = response.readSearchResponseHitInfo(
                 httpRequestHandler.rfile)

            # update information on the document
            #                       print (document_id, mime_type, url, publication_time, file_size, title, score_count, provider_count)
            documentInfo = globalvars.maay_core.updateDocumentInfo(
                document_id, mime_type, title, file_size, publication_time,
                url)

            # todo 0 should be the score/rank of the document
            #                       rank = 0.0

            for j in range(0, score_count):
                (word, relevance, popularity, excerpt, excerpt_position,
                 word_position) = response.readSearchResponseHitScore(
                     httpRequestHandler.rfile)
                #                               print (word, relevance, popularity, excerpt)
                # update information on the word in the document
                print "process document scores"
                ds = globalvars.maay_core.processDocumentScore(
                    documentInfo.db_document_id, word, relevance, popularity,
                    excerpt, excerpt_position, word_position, nodeInfo)

#                               if word in search_query:
#                                       rank += (float(ds.relevance) + 0.0001) * (float(ds.popularity) + 0.0001)

            ranking_score = globalvars.maay_core.compute_ranking_score(
                document_id, search_query)
            globalvars.maay_core.updateDocumentMatching(
                document_id=document_id)

            result = resultspool.MaayResult(document_id, ranking_score, 0, 0,
                                            int(publication_time),
                                            documentInfo.state)
            resultSpool.addResult(result)

            for j in xrange(provider_count):
                (node_id, ip, port, last_storing_time, last_seen_time,
                 bandwidth, counter) = response.readSearchResponseHitProvider(
                     httpRequestHandler.rfile)

                globalvars.maay_core.updateNodeInfo(node_id, ip, port,
                                                    bandwidth, counter,
                                                    last_seen_time)
                globalvars.maay_core.manifest_interest(node_id, search_query)

                # update information on the document provider (node)
                globalvars.maay_core.updateDocumentProvider(
                    documentInfo.db_document_id, node_id, last_storing_time)
    elif command_type == constants.DOWNLOAD_REQUEST_COMMAND:
        (document_id,
         search_query) = response.readDownloadRequest(httpRequestHandler.rfile)
        globalvars.maay_core.manifest_interest(node_id, search_query)
        globalvars.maay_core.recv_download_request(httpRequestHandler, queryID,
                                                   TTL, node_id, ip, port,
                                                   document_id, search_query)

    if command_type != constants.DOWNLOAD_REQUEST_COMMAND:
        httpRequestHandler.send_response(200)
        httpRequestHandler.end_headers()
        protocol.sendHeader(tools.file2stream(httpRequestHandler.wfile),
                            constants.PONG_COMMAND, queryID, 0)
Пример #5
0
class Download:

    FINISHED_STATE = 2

    def __init__(self, document_id, search_query = []):
        self.__active = 0
        self.__providers = []
        self.__providersHT = {}
        self.__document_id = document_id
        self.__search_query = search_query
        self.__transferred = 0
        self.__state = download.NOT_STARTED_STATE
        self.__last_search_time = 0
        # TODO: for the moment unisource is ok
        # divide the file into chunks of 256ko
        # several states : in download, downloaded, not downloaded

    def isActive(self):
        return self.__active

    def setActive(self, active):
        self.__active = active

    def fetch(self):
        if self.__state == download.FINISHED_STATE:
            return 1

        if self.__state == download.NOT_STARTED_STATE:
            self.__state = download.INIT_STATE
        while 1:
            documentInfo = globalvars.database.getDocumentInfo(self.__document_id)
            documentProviders = globalvars.database.getDocumentProviders(documentInfo.db_document_id)
            for dp in documentProviders:
                p = self.__providersHT.get(dp.node_id)
                if p:
                    n = globalvars.database.getNodeInfo(dp.node_id)
                    if p.ip != n.ip or p.port != n.port:
                        p.port = port
                        p.ip = ip
                        p.state = download.Provider.UNTRIED_STATE
                        p.last_try = 0
                else:
                    n = globalvars.database.getNodeInfo(dp.node_id)
                    p = download.Provider(n.node_id, n.ip, n.port, n.bandwidth, dp.last_providing_time)
                    self.__providers.append(p)
                    self.__providersHT[dp.node_id] = p

            provider = None
            for p in self.__providers:
                if p.state in (download.Provider.UNREACHABLE_STATE, download.Provider.NOT_PROVIDING_STATE):
                    continue
                if p.last_try + download.NEXT_RETRY_PERIOD > time.time():
                    continue
                provider = p
                break

            if not provider:
                print "no provider"
                if self.__last_search_time + download.NEXT_SEARCH_PERIOD > time.time() and self.__state == download.SEARCHING_SOURCES_STATE:
                    time.sleep(1)
                    return 0

                globalvars.maay_core.send_search_request(["#%s" % self.__document_id], constants.INIT_TTL, constants.MAAY_SEARCH_RANGE, constants.MIN_SCORE, constants.INIT_FNC, constants.INIT_EHC, query_id = self.__document_id)
                print "search for providers"
                self.__state = download.SEARCHING_SOURCES_STATE
                self.__last_search_time = time.time()
                time.sleep(1)
                return 0

            print "PROVIDER IP =%s [%s]" % (provider.ip, provider.state)

            self.__state = download.CONNECTING_STATE
            provider.state = download.Provider.CONNECTED_STATE
            connection = None
            try:
#                               signal.alarm(5)
                print "essaie sur %s %s" % (provider.ip, provider.port)
                connection = httplib.HTTPConnection(provider.ip, provider.port)
                connection.putrequest("POST", "/message")
                connection.endheaders()
#                               signal.alarm(0)
            except Exception, e:
#                               signal.alarm(0)
                print "Exception: %s" % e
                provider.state = download.Provider.UNREACHABLE_STATE
                continue
            except TimeoutError:
#                               signal.alarm(0)
                provider.state = download.Provider.BUSY_STATE
                provider.last_try = time.time()
                continue
            try:
                provider.state = download.Provider.DOWNLOADING_STATE
                protocol.sendHeader(connection, constants.DOWNLOAD_REQUEST_COMMAND, "12345678901234567890", constants.INIT_TTL)
                protocol.sendDownloadRequest(connection, self.__document_id, self.__search_query)
                print "requete envoye, attente reponse %s" % connection

                r = connection.getresponse()
                print "resp"

                (protocol_version, vendor, node_id, ip, port, bandwidth, counter, command_type, queryID, TTL) =  response.readHeader(r)
                print "resp 2"

                document_id, flags = response.readDownloadResponse(r)
                if not (flags & constants.HAS_DOCUMENT_DESCRIPTION_FLAG):
                    print "the provider do not have the file %s" % len(self.__providers)
                    provider.state = download.Provider.NOT_PROVIDING_STATE
                    globalvars.database.deleteDocumentProvider(db_document_id = documentInfo.db_document_id, node_id = provider.node_id)
                    continue

                (title, publication_time, mime_type, size, url, provider_count) = response.readDownloadResponseDocumentDescription(r)
                print "url received = %s" % url
                globalvars.maay_core.updateDocumentInfo(document_id, mime_type, title, size, publication_time, url)

                
                for j in xrange(provider_count):
                    (node_id, ip, port, last_storing_time, last_seen_time, bandwidth, counter) = response.readSearchResponseHitProvider(r)
                    print "provider in resp = %s" % str((node_id, ip, port, bandwidth, counter, last_seen_time))
                    globalvars.maay_core.updateNodeInfo(node_id, ip, port, bandwidth, counter, last_seen_time)
                    globalvars.maay_core.updateDocumentProvider(documentInfo.db_document_id, node_id, last_storing_time)
                    globalvars.maay_core.manifest_interest(node_id, self.__search_query)

                if not (flags & constants.HAS_DOCUMENT_CONTENT_FLAG):
                    print "the provider do not have the file %s" % len(self.__providers)
                    provider.state = download.Provider.NOT_PROVIDING_STATE
                    globalvars.database.deleteDocumentProvider(db_document_id = documentInfo.db_document_id, node_id = provider.node_id)
                    continue


                content_input = response.readDownloadResponseInput(r)

                self.__state = download.DOWNLOADING_STATE
                print "waiting document content"

                file_name = globalvars.config.getValue("TemporaryDocumentRoot") + os.path.sep + document_id + (mimetypes.guess_extension(mime_type) or ".txt")
                fd = file(file_name, "wb")

                idle = 0
                self.__transferred = 0

                while self.__transferred < size and idle < 20:
                    idle += 1
                    buf = content_input.read(1024)
                    if not buf:
                        continue
                    idle = 0
                    fd.write(buf)
#                                       print "write buf"
                    self.__transferred += len(buf)
                fd.close()
                if idle > 20:
                    raise "idle"

                print "document received completeley"

                if self.__transferred != size:
                    print "Error: file length not match %s %s" % (self.__transferred, size)
                    connection.close()
                    os.remove(file_name)
                    continue

                new_file_name = document_id + (mimetypes.guess_extension(mime_type) or ".txt")
                absolute_new_file_name = "%s%s%s" % (globalvars.config.getValue("CachedDocumentRoot"), os.path.sep, new_file_name)
                if os.path.exists(absolute_new_file_name):
                    os.remove(absolute_new_file_name)

                print "rename %s => %s" % (file_name, absolute_new_file_name)
                os.rename(file_name, absolute_new_file_name)
                print "done => %s" % absolute_new_file_name
#                               file_time = int(os.stat(absolute_new_file_name)[stat.ST_MTIME])
                file_time = 0
                fileInfo = maay.datastructure.fileinfo.FileInfo(absolute_new_file_name, file_time, documentInfo.db_document_id, maay.datastructure.documentinfo.CACHED_STATE, maay.datastructure.fileinfo.CREATED_FILE_STATE)
                print "1 documentInfo.db_document_id = %s" %   fileInfo.db_document_id
                db_fileInfos = globalvars.database.getFileInfos(file_name=absolute_new_file_name)
                if not db_fileInfos:
                    globalvars.database.insertFileInfo(fileInfo)
                else:
                    globalvars.database.updateFileInfo(fileInfo)

                globalvars.indexer.addNewDocumentToIndex(absolute_new_file_name)

                self.__state = download.FINISHED_STATE
                provider.state = download.Provider.FINISHED_STATE
                return 1

            except Exception, e:
                time.sleep(2)

#                       else:
#                       except TimeoutError, e:
                print "Error ex: %s" % e
                provider.state = download.Provider.BUSY_STATE
                provider.last_try = time.time()
Пример #6
0
    def flushResults(self):
        t = int(time.time())
        for rs in self.resultSpoolManager.getResultSpools():

            if rs.getNodeID() == self.__nodeID:
                continue

            if t - rs.getQueryTime() > constants.result_spool_lifetime:
                self.resultSpoolManager.removeResultSpool(rs)
                continue

            if rs.getSentResultCount() >= rs.getExpectedResultCount():
                continue

            # keep this resultspool

            documentIDs = rs.getBestUnsentResults()
            if len(documentIDs) == 0:
                continue

            nodeInfo = globalvars.database.getNodeInfo(rs.getNodeID())
            print "flush results to %s" % rs.getNodeID()
            # todo : if the connection is local, make a shortcut
            #                               c = protocol.Protocol(self, nodeInfo.ip, nodeInfo.port)
            connection = httplib.HTTPConnection(nodeInfo.ip, nodeInfo.port)
            connection.putrequest("POST", "/message")
            connection.endheaders()

            protocol.sendHeader(connection, constants.SEARCH_RESPONSE_COMMAND,
                                rs.getQueryID(), 0)
            protocol.sendSearchResponseInfo(connection, len(documentIDs))

            for document_id in documentIDs:
                documentInfo = globalvars.database.getDocumentInfos(
                    document_id=document_id, get_text=1)[0]
                if not documentInfo.url:
                    fileInfos = globalvars.database.getFileInfos(
                        db_document_id=documentInfo.db_document_id,
                        state=maay.datastructure.documentinfo.PUBLISHED_STATE)
                    for fileInfo in fileInfos:
                        pos = fileInfo.file_name.find(
                            globalvars.config.getValue(
                                "PublishedDocumentRoot"))
                        if pos != -1:
                            documentInfo.url = 'http://%s:%s/pub/%s' % (
                                globalvars.ip, globalvars.port,
                                fileInfo.file_name[pos + len(
                                    globalvars.config.
                                    getValue("PublishedDocumentRoot")) + 1:])

                            documentInfo.url = documentInfo.url.replace(
                                "\\", "/")
                            break

                queryDocumentScores = globalvars.database.getDocumentScores(
                    documentInfo.db_document_id, rs.getQuery())
                relevantDocumentScores = globalvars.database.getBestRelevantDocumentScores(
                    documentInfo.db_document_id,
                    constants.relevant_document_score_count +
                    len(rs.getQuery()))
                documentProviders = globalvars.database.getDocumentProviders(
                    documentInfo.db_document_id)

                documentScores = queryDocumentScores[:]

                for ds in relevantDocumentScores:
                    add = 1
                    for word in rs.getQuery():
                        if ds.word == word:
                            add = 0
                            break
                    if add:
                        documentScores.append(ds)

                url = documentInfo.url
                if not url:
                    url = ""
#                               if url:
#                                       if url.find('/') == 0:
#                                               url = "http://%s:%s%s/pub" % (globalvars.hostname, globalvars.port, url)
#                               else:
#                                       url = ""

                protocol.sendSearchResponseHitInfo(
                    connection, document_id, documentInfo.mime_type, url,
                    documentInfo.publication_time, documentInfo.size,
                    documentInfo.title, len(documentScores),
                    len(documentProviders))

                for ds in documentScores:
                    pos = ds.position
                    text = documentInfo.text
                    if pos >= constants.MAX_TEXT_CONTENT_STORED_SIZE:
                        pos = 0
                    start = max(0, pos - constants.EXCERPT_HALF_SIZE)
                    if start > 0:
                        while start < pos and text[start] != ' ':
                            start += 1
                        start += 1

                    end = min(len(text) - 1, start + constants.EXCERPT_SIZE)

                    if end < len(text) - 1:
                        while end > pos and text[end] != ' ':
                            end -= 1

                    excerpt = documentInfo.text[start:end]
                    print "excerpt = %s (%s,%s)" % (excerpt, start, end)

                    protocol.sendSearchResponseHitScore(
                        connection, ds.word, ds.relevance, ds.popularity,
                        excerpt, start, ds.position)

                for dp in documentProviders:
                    ni = globalvars.database.getNodeInfo(dp.node_id)
                    if ni.node_id == self.__nodeID:
                        ni.last_seen_time == int(time.time())
                    protocol.sendSearchResponseHitProvider(
                        connection, dp.node_id, ni.ip, ni.port,
                        dp.last_providing_time, ni.last_seen_time,
                        ni.bandwidth, ni.counter)

            connection.close()
Пример #7
0
    def recv_download_request(self, httpRequestHandler, query_id, TTL,
                              sender_nodeID, sender_nodeIP, sender_nodePort,
                              document_id, search_query):
        # check in the indexer if we have it
        documentInfo = globalvars.database.getDocumentInfo(
            document_id=document_id)
        if not documentInfo:
            print "never heard about document %s" % document_id
            # todo: forward request to a node which have a document which a close document id?
            has_content = 0
            has_description = 0
        else:
            if documentInfo.state == maay.datastructure.documentinfo.KNOWN_STATE:
                print "I do not have the file on my disk, why do you ask me ?"
                # todo: but I can give you some other pointers
                has_content = 0
            else:
                has_content = 1
            has_description = 1

        flags = (has_content * constants.HAS_DOCUMENT_CONTENT_FLAG) | (
            has_description * constants.HAS_DOCUMENT_DESCRIPTION_FLAG)

        # update documentScore with the download request received and the
        # documentscore received

        #               dp = documentProviders[0]
        #               nodeInfo = globalvars.database.getNodeInfo(dp.node_id)

        httpRequestHandler.send_response(200)
        httpRequestHandler.end_headers()
        output = tools.file2stream(httpRequestHandler.wfile)
        protocol.sendHeader(output, constants.DOWNLOAD_RESPONSE_COMMAND,
                            self.__generateQueryID(), constants.INIT_TTL)
        protocol.sendDownloadResponse(output, document_id, flags)
        if has_description:
            documentProviders = globalvars.database.getDocumentProviders(
                documentInfo.db_document_id)

            if has_content and not documentInfo.url:
                fileInfos = globalvars.database.getFileInfos(
                    db_document_id=documentInfo.db_document_id,
                    state=maay.datastructure.documentinfo.PUBLISHED_STATE)
                for fileInfo in fileInfos:
                    pos = fileInfo.file_name.find(
                        globalvars.config.getValue("PublishedDocumentRoot"))
                    print "send url pos = %s" % pos
                    if pos != -1:
                        documentInfo.url = 'http://%s:%s/pub/%s' % (
                            globalvars.ip, globalvars.port,
                            fileInfo.file_name[pos + len(
                                globalvars.config.
                                getValue("PublishedDocumentRoot")) + 1:])

                        documentInfo.url = documentInfo.url.replace("\\", "/")
                        break

            protocol.sendDownloadResponseDocumentDescription(
                output, documentInfo.title, documentInfo.publication_time,
                documentInfo.mime_type, documentInfo.size, documentInfo.url
                or "", len(documentProviders))

            for dp in documentProviders:
                nodeInfo = globalvars.database.getNodeInfo(dp.node_id)
                protocol.sendDownloadResponseProvider(
                    output, dp.node_id, nodeInfo.ip, nodeInfo.port,
                    dp.last_providing_time, nodeInfo.last_seen_time,
                    nodeInfo.bandwidth, nodeInfo.counter)

            if has_content:
                fileInfo = globalvars.database.getFileInfos(
                    db_document_id=documentInfo.db_document_id)[0]
                protocol.sendDownloadResponseDocument(output,
                                                      fileInfo.file_name,
                                                      documentInfo.size)
                self.hasDownloaded(sender_nodeID,
                                   document_id,
                                   search_query,
                                   weight=DOWNLOAD_SCORE_WEIGHT)
Пример #8
0
    def flushResults(self):
        t = int(time.time())
        for rs in self.resultSpoolManager.getResultSpools():

            if rs.getNodeID() == self.__nodeID:
                continue

            if t - rs.getQueryTime() > constants.result_spool_lifetime:
                self.resultSpoolManager.removeResultSpool(rs)
                continue

            if rs.getSentResultCount() >= rs.getExpectedResultCount():
                continue

            # keep this resultspool

            documentIDs = rs.getBestUnsentResults()
            if len(documentIDs) == 0:
                continue

            nodeInfo = globalvars.database.getNodeInfo(rs.getNodeID())
            print "flush results to %s" % rs.getNodeID()
                # todo : if the connection is local, make a shortcut
#                               c = protocol.Protocol(self, nodeInfo.ip, nodeInfo.port)
            connection = httplib.HTTPConnection(nodeInfo.ip, nodeInfo.port)
            connection.putrequest("POST", "/message")
            connection.endheaders()

            protocol.sendHeader(connection, constants.SEARCH_RESPONSE_COMMAND, rs.getQueryID(), 0)
            protocol.sendSearchResponseInfo(connection, len(documentIDs))

            for document_id in documentIDs:
                documentInfo = globalvars.database.getDocumentInfos(document_id = document_id, get_text = 1)[0]
                if not documentInfo.url:
                    fileInfos = globalvars.database.getFileInfos(db_document_id = documentInfo.db_document_id, state = maay.datastructure.documentinfo.PUBLISHED_STATE)
                    for fileInfo in fileInfos:
                        pos = fileInfo.file_name.find(globalvars.config.getValue("PublishedDocumentRoot"))
                        if pos != -1:
                            documentInfo.url = 'http://%s:%s/pub/%s' % (globalvars.ip, globalvars.port, fileInfo.file_name[pos + len(globalvars.config.getValue("PublishedDocumentRoot")) + 1:])

                            documentInfo.url = documentInfo.url.replace("\\", "/")
                            break

            
                queryDocumentScores = globalvars.database.getDocumentScores(documentInfo.db_document_id, rs.getQuery())
                relevantDocumentScores = globalvars.database.getBestRelevantDocumentScores(documentInfo.db_document_id, constants.relevant_document_score_count + len(rs.getQuery()))
                documentProviders = globalvars.database.getDocumentProviders(documentInfo.db_document_id)

                documentScores = queryDocumentScores[:]

                for ds in relevantDocumentScores:
                    add = 1
                    for word in rs.getQuery():
                        if ds.word == word:
                            add = 0
                            break
                    if add:
                        documentScores.append(ds)

                url = documentInfo.url
                if not url:
                    url = ""
#                               if url:
#                                       if url.find('/') == 0:
#                                               url = "http://%s:%s%s/pub" % (globalvars.hostname, globalvars.port, url)
#                               else:
#                                       url = ""

                protocol.sendSearchResponseHitInfo(connection, document_id, documentInfo.mime_type, url, documentInfo.publication_time, documentInfo.size, documentInfo.title, len(documentScores), len(documentProviders))

                for ds in documentScores:
                    pos = ds.position
                    text = documentInfo.text
                    if pos >= constants.MAX_TEXT_CONTENT_STORED_SIZE:
                        pos = 0
                    start = max(0, pos - constants.EXCERPT_HALF_SIZE)
                    if start > 0:
                        while start < pos and text[start] != ' ':
                            start += 1
                        start += 1

                    end = min(len(text) - 1, start + constants.EXCERPT_SIZE)

                    if end < len(text) - 1:
                        while end > pos and text[end] != ' ':
                            end -= 1

                    excerpt = documentInfo.text[start:end]
                    print "excerpt = %s (%s,%s)" % (excerpt, start, end)

                    protocol.sendSearchResponseHitScore(connection, ds.word, ds.relevance, ds.popularity, excerpt, start, ds.position)

                for dp in documentProviders:
                    ni = globalvars.database.getNodeInfo(dp.node_id)
                    if ni.node_id == self.__nodeID:
                        ni.last_seen_time == int(time.time())
                    protocol.sendSearchResponseHitProvider(connection, dp.node_id, ni.ip, ni.port, dp.last_providing_time, ni.last_seen_time, ni.bandwidth, ni.counter)

            connection.close()
Пример #9
0
def handleMessage(httpRequestHandler):
    """Process one incoming protocol message and acknowledge it.

    The common header is read from ``httpRequestHandler.rfile`` and the
    sending node is recorded through ``maay_core.updateNodeInfo`` with the
    current time as its last-seen time.  The payload is then handled
    according to the command type found in the header:

    * search request  -- the request is read and passed to
      ``maay_core.recv_search_request``;
    * search response -- every hit (document info, scores, providers) is
      read, stored through ``maay_core`` and added to the result spool of
      the query;
    * download request -- the request is read and passed to
      ``maay_core.recv_download_request`` together with the handler.

    Unless the command is a download request, an HTTP 200 followed by a
    PONG header is written to ``httpRequestHandler.wfile``.  A search
    response whose query id has no result spool returns immediately,
    without reading the rest of the payload and without writing any reply.
    """
    rfile = httpRequestHandler.rfile
    core = globalvars.maay_core

    # Common header: identifies the sender and the command being carried.
    (protocol_version, vendor, sender_id, sender_ip, sender_port,
     sender_bandwidth, sender_counter, command_type, queryID,
     TTL) = response.readHeader(rfile)

    now = int(time.time())
    nodeInfo = core.updateNodeInfo(sender_id, sender_ip, sender_port,
                                   sender_bandwidth, sender_counter, now)

    if command_type == constants.SEARCH_REQUEST_COMMAND:
        (min_score, forwarding_node_count, result_count,
         search_query) = response.readSearchRequest(rfile)
        core.recv_search_request(queryID, TTL, sender_id, sender_ip,
                                 sender_port, search_query, min_score,
                                 forwarding_node_count, result_count,
                                 constants.MAAY_SEARCH_RANGE)
        core.manifest_interest(sender_id, search_query)

    elif command_type == constants.SEARCH_RESPONSE_COMMAND:
        # Answers to a query we do not know about are ignored entirely.
        spool = core.getResultSpoolManager().getResultSpool(queryID)
        if not spool:
            return

        search_query = spool.getQuery()
        core.manifest_interest(sender_id, search_query)

        hit_count = response.readSearchResponseInfo(rfile)
        for _hit in range(hit_count):
            (document_id, mime_type, url, publication_time, file_size,
             title, score_count,
             provider_count) = response.readSearchResponseHitInfo(rfile)

            # Record what the sender told us about the document itself.
            documentInfo = core.updateDocumentInfo(document_id, mime_type,
                                                   title, file_size,
                                                   publication_time, url)

            # TODO (kept from the original): 0 should be the score/rank of
            # the document.
            for _score in range(score_count):
                (word, relevance, popularity, excerpt, excerpt_position,
                 word_position) = response.readSearchResponseHitScore(rfile)
                # Record the per-word score of this document.
                print("process document scores")
                core.processDocumentScore(documentInfo.db_document_id, word,
                                          relevance, popularity, excerpt,
                                          excerpt_position, word_position,
                                          nodeInfo)

            ranking_score = core.compute_ranking_score(document_id,
                                                       search_query)
            core.updateDocumentMatching(document_id=document_id)

            spool.addResult(resultspool.MaayResult(document_id,
                                                   ranking_score, 0, 0,
                                                   int(publication_time),
                                                   documentInfo.state))

            for _provider in xrange(provider_count):
                # Provider fields get their own names so that they are not
                # confused with the sender's header fields read above.
                (provider_id, provider_ip, provider_port, last_storing_time,
                 provider_last_seen, provider_bandwidth,
                 provider_counter) = response.readSearchResponseHitProvider(rfile)

                core.updateNodeInfo(provider_id, provider_ip, provider_port,
                                    provider_bandwidth, provider_counter,
                                    provider_last_seen)
                core.manifest_interest(provider_id, search_query)

                # Remember that this node provides the document.
                core.updateDocumentProvider(documentInfo.db_document_id,
                                            provider_id, last_storing_time)

    elif command_type == constants.DOWNLOAD_REQUEST_COMMAND:
        (document_id, search_query) = response.readDownloadRequest(rfile)
        core.manifest_interest(sender_id, search_query)
        core.recv_download_request(httpRequestHandler, queryID, TTL,
                                   sender_id, sender_ip, sender_port,
                                   document_id, search_query)
        # No PONG for downloads; presumably recv_download_request writes
        # its own reply through the handler it is given -- TODO confirm.
        return

    # Every other command is acknowledged with a PONG.
    httpRequestHandler.send_response(200)
    httpRequestHandler.end_headers()
    reply_stream = tools.file2stream(httpRequestHandler.wfile)
    protocol.sendHeader(reply_stream, constants.PONG_COMMAND, queryID, 0)