Пример #1
0
    def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                      content_hash, mime_type, state, file_state):
        """Insert or update information in tables documents,
        file_info, document_score and word.

        The whole update runs as one transaction on ``self._cnx``: it is
        committed when every step succeeds, and rolled back (the exception
        is re-raised unchanged) when any step fails. The cursor is closed
        in both cases.

        :param filename: value used to look up / create the FileInfo row
        :param title: forwarded to ``_createDocument`` when a new Document
                      is created
        :param text: forwarded to ``_createDocument`` and ``_updateScores``
        :param fileSize: forwarded to ``_createDocument``
        :param lastModifiedOn: stored as ``file_time`` on the FileInfo row;
                               also compared with the ``publication_time``
                               of a reused Document
        :param content_hash: compared with ``Document.document_id`` to
                             decide whether the content changed
        :param mime_type: accepted but not used by this method
        :param state: stored on the FileInfo row and on a reused Document
        :param file_state: stored on the FileInfo row
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and pass the values as an
        # argument
        cursor = self._cnx.cursor()
        try:
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor, file_name=filename)
                if fileinfo:
                    # file already known: refresh its metadata
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = lastModifiedOn
                    fileinfo.state = state
                    fileinfo.file_state = file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if not doc or doc[0].document_id != content_hash:
                        # no document was found, or one with another content.
                        # In both cases we create a new Document in database
                        # (we don't want to modify the existing one, because
                        # it can be shared by several files)
                        doc = self._createDocument(cursor, content_hash,
                                                   title, text, fileSize,
                                                   lastModifiedOn, filename,
                                                   state)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]
                    fileinfo.commit(cursor, update=True)
                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(cursor,
                                               document_id=content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = state
                        doc.publication_time = max(doc.publication_time,
                                                   lastModifiedOn)
                        doc.commit(cursor, update=True)
                    else:
                        self._createDocument(cursor, content_hash, title,
                                             text, fileSize, lastModifiedOn,
                                             filename, state)
                        # NOTE(review): the branch above uses the value
                        # returned by _createDocument directly, so this
                        # re-select may be redundant -- confirm before
                        # removing it.
                        doc = Document.selectWhere(
                            cursor, document_id=content_hash)[0]
                    fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                        file_name=filename,
                                        file_time=lastModifiedOn,
                                        state=state,
                                        file_state=file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id, text)
            finally:
                # always release the cursor, also when a step above raised
                cursor.close()
            self._cnx.commit()
        except:
            # deliberate catch-all: undo the partial update whatever went
            # wrong, then let the original exception propagate
            self._cnx.rollback()
            raise
Пример #2
0
 def getFileInformations(self, filename):
     """Return the FileInfo entries selected with ``file_name=filename``.

     :param filename: value passed as the ``file_name`` criterion to
                      ``FileInfo.selectWhere``
     :returns: a new list built from the ``selectWhere`` result
     """
     cursor = self._cnx.cursor()
     try:
         results = FileInfo.selectWhere(cursor, file_name=filename)
     finally:
         # release the cursor even when the query raises
         cursor.close()
     return list(results)
Пример #3
0
 def getIndexedFiles(self):
     """Return the ``file_name`` of every FileInfo entry found.

     ``FileInfo.selectWhere`` is called without any criterion
     (presumably selecting every row -- confirm against selectWhere).

     :returns: list of ``file_name`` values, one per selected entry
     """
     cursor = self._cnx.cursor()
     try:
         results = FileInfo.selectWhere(cursor)
     finally:
         # release the cursor even when the query raises
         cursor.close()
     return [f.file_name for f in results]
Пример #4
0
 def indexDocument(self, nodeId, futureDoc):
     """Inserts or update information in table documents,
     file_info, document_score and word.

     The whole update runs as one transaction on ``self._cnx``: it is
     committed when every step succeeds, and rolled back (the exception
     is re-raised unchanged) when any step fails. The cursor is closed
     in both cases.

     :type nodeId: node_id or None if working locally
     :param futureDoc: object describing the file to index; this method
                       reads its ``filename``, ``title``, ``text``,
                       ``lastModificationTime``, ``state``, ``file_state``
                       and ``content_hash`` attributes, and overwrites
                       ``text`` with ``"<title> <text>"``
     :raises ValueError: if ``nodeId`` is not None and no Node entry
                         matches it
     """
     # XXX Decide if we can compute the content_hash and mime_type
     # ourselves or if the indexer should do it and
     # pass the values as an argument
     cursor = self._cnx.cursor()
     try:
         try:
             # insert or update in table file_info
             fileinfo = FileInfo.selectWhere(cursor,
                                             file_name=futureDoc.filename)
             # insert title into text to be able to find documents
             # according to their title (e.g: searching 'foo' should find
             # 'foo.pdf'). Note that this modifies the caller's object.
             futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
             if fileinfo:
                 # file already known: refresh its metadata
                 fileinfo = fileinfo[0]
                 fileinfo.file_time = futureDoc.lastModificationTime
                 fileinfo.state = futureDoc.state
                 fileinfo.file_state = futureDoc.file_state
                 doc = Document.selectWhere(
                     cursor, db_document_id=fileinfo.db_document_id)
                 if not doc or doc[0].document_id != futureDoc.content_hash:
                     # no document was found, or one with another content.
                     # In both cases we create a new Document in database
                     # (we don't want to modify the existing one, because
                     # it can be shared by several files)
                     doc = self._createDocument(cursor, futureDoc)
                     fileinfo.db_document_id = doc.db_document_id
                 else:
                     # document has not changed
                     doc = doc[0]
                     if doc.state != futureDoc.state:
                         doc.state = futureDoc.state
                         doc.commit(cursor, update=True)
                 fileinfo.commit(cursor, update=True)
             else:
                 # file unknown
                 # try to find a Document with same hash value
                 doc = Document.selectWhere(
                     cursor, document_id=futureDoc.content_hash)
                 if doc:
                     doc = doc[0]
                     doc.state = futureDoc.state
                     doc.publication_time = max(
                         doc.publication_time,
                         futureDoc.lastModificationTime)
                     doc.commit(cursor, update=True)
                 else:
                     self._createDocument(cursor, futureDoc)
                     # NOTE(review): the branch above uses the value
                     # returned by _createDocument directly, so this
                     # re-select may be redundant -- confirm before
                     # removing it.
                     doc = Document.selectWhere(
                         cursor, document_id=futureDoc.content_hash)[0]
                 fileinfo = FileInfo(
                     db_document_id=doc.db_document_id,
                     file_name=futureDoc.filename,
                     file_time=futureDoc.lastModificationTime,
                     state=futureDoc.state,
                     file_state=futureDoc.file_state)
                 fileinfo.commit(cursor, update=False)
             self._updateScores(cursor, doc.db_document_id, futureDoc.text)
             # update last seen time only if not working locally
             if nodeId is not None:
                 provider = DocumentProvider.selectOrInsertWhere(
                     cursor,
                     db_document_id=doc.db_document_id,
                     node_id=nodeId)[0]
                 provider.last_providing_time = int(time.time())
                 provider.commit(cursor, update=True)
                 nodes = Node.selectWhere(cursor, node_id=nodeId)
                 if not nodes:
                     # the handlers below close the cursor and roll back
                     raise ValueError(
                         'provider %s is not registered in our database !'
                         % (nodeId,))
                 node = nodes[0]
                 node.last_seen_time = int(time.time())
                 node.commit(cursor, update=True)
         finally:
             # always release the cursor, also when a step above raised
             cursor.close()
         self._cnx.commit()
     except:
         # deliberate catch-all: undo the partial update whatever went
         # wrong, then let the original exception propagate
         self._cnx.rollback()
         raise
Пример #5
0
 def getIndexedFiles(self):
     """Return the names of the files found in the FileInfo entries.

     No criterion is given to ``FileInfo.selectWhere`` (presumably this
     selects every entry -- confirm against selectWhere).

     :returns: list holding the ``file_name`` of each selected entry
     """
     cursor = self._cnx.cursor()
     try:
         results = FileInfo.selectWhere(cursor)
     finally:
         # the cursor must not stay open when the query fails
         cursor.close()
     return [f.file_name for f in results]
Пример #6
0
 def getFileInformations(self, filename):
     """Return the FileInfo entries found for *filename*.

     :param filename: value given as the ``file_name`` criterion to
                      ``FileInfo.selectWhere``
     :returns: the ``selectWhere`` result copied into a new list
     """
     cursor = self._cnx.cursor()
     try:
         results = FileInfo.selectWhere(cursor, file_name=filename)
     finally:
         # the cursor must not stay open when the query fails
         cursor.close()
     return list(results)
Пример #7
0
    def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                      content_hash, mime_type, state, file_state):
        """Insert or update information in tables documents,
        file_info, document_score and word.

        All writes belong to a single transaction on ``self._cnx``. It is
        committed only if every step succeeds; if a step raises, the
        transaction is rolled back and the exception is re-raised
        unchanged. The cursor is closed either way.

        :param filename: value used to look up / create the FileInfo row
        :param title: handed to ``_createDocument`` for a new Document
        :param text: handed to ``_createDocument`` and ``_updateScores``
        :param fileSize: handed to ``_createDocument``
        :param lastModifiedOn: becomes the FileInfo ``file_time``; a reused
                               Document keeps the max of this value and
                               its current ``publication_time``
        :param content_hash: matched against ``Document.document_id``
        :param mime_type: accepted but not used by this method
        :param state: written to the FileInfo row and to a reused Document
        :param file_state: written to the FileInfo row
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and pass the values as an
        # argument
        cursor = self._cnx.cursor()
        try:
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor, file_name=filename)
                if fileinfo:
                    # file already known: refresh its metadata
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = lastModifiedOn
                    fileinfo.state = state
                    fileinfo.file_state = file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if not doc or doc[0].document_id != content_hash:
                        # no document was found, or one with another content.
                        # In both cases we create a new Document in database
                        # (we don't want to modify the existing one, because
                        # it can be shared by several files)
                        doc = self._createDocument(cursor,
                                                   content_hash,
                                                   title,
                                                   text,
                                                   fileSize,
                                                   lastModifiedOn,
                                                   filename,
                                                   state)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]

                    fileinfo.commit(cursor, update=True)

                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(cursor,
                                               document_id=content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = state
                        doc.publication_time = max(doc.publication_time,
                                                   lastModifiedOn)
                        doc.commit(cursor, update=True)
                    else:
                        self._createDocument(cursor,
                                             content_hash,
                                             title,
                                             text,
                                             fileSize,
                                             lastModifiedOn,
                                             filename,
                                             state)
                        # NOTE(review): the branch above uses the value
                        # returned by _createDocument directly, so this
                        # re-select may be redundant -- confirm before
                        # removing it.
                        doc = Document.selectWhere(
                            cursor, document_id=content_hash)[0]

                    fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                        file_name=filename,
                                        file_time=lastModifiedOn,
                                        state=state,
                                        file_state=file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id, text)
            finally:
                # the cursor is released on success and on failure alike
                cursor.close()
            self._cnx.commit()
        except:
            # deliberate catch-all: discard the partial update whatever
            # went wrong, then let the original exception propagate
            self._cnx.rollback()
            raise
Пример #8
0
    def indexDocument(self, nodeId, futureDoc):
        """Inserts or update information in table documents,
        file_info, document_score and word.

        After the document rows are updated, the DocumentProvider entry
        for (document, ``nodeId``) gets a fresh ``last_providing_time`` and
        the Node entry for ``nodeId`` gets a fresh ``last_seen_time``.

        All writes belong to a single transaction on ``self._cnx``. It is
        committed only if every step succeeds; if a step raises, the
        transaction is rolled back and the exception is re-raised
        unchanged. The cursor is closed either way.

        :param nodeId: value used as ``node_id`` for the DocumentProvider
                       and Node lookups. If the Node query returns nothing,
                       the ``[0]`` access fails (IndexError, assuming
                       selectWhere returns a list -- confirm).
        :param futureDoc: object describing the file to index; this method
                          reads its ``filename``, ``title``, ``text``,
                          ``lastModificationTime``, ``state``,
                          ``file_state`` and ``content_hash`` attributes,
                          and overwrites ``text`` with ``"<title> <text>"``
        """
        # XXX Decide if we can compute the content_hash and mime_type
        # ourselves or if the indexer should do it and
        # pass the values as an argument
        cursor = self._cnx.cursor()
        try:
            try:
                # insert or update in table file_info
                fileinfo = FileInfo.selectWhere(cursor,
                                                file_name=futureDoc.filename)
                # insert title into text to be able to find documents
                # according to their title (e.g: searching 'foo' should
                # find 'foo.pdf'). Note that this modifies the caller's
                # object.
                futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
                if fileinfo:
                    # file already known: refresh its metadata
                    fileinfo = fileinfo[0]
                    fileinfo.file_time = futureDoc.lastModificationTime
                    fileinfo.state = futureDoc.state
                    fileinfo.file_state = futureDoc.file_state
                    doc = Document.selectWhere(
                        cursor, db_document_id=fileinfo.db_document_id)
                    if (not doc
                            or doc[0].document_id != futureDoc.content_hash):
                        # no document was found, or one with another content.
                        # In both cases we create a new Document in database
                        # (we don't want to modify the existing one, because
                        # it can be shared by several files)
                        doc = self._createDocument(cursor, futureDoc)
                        fileinfo.db_document_id = doc.db_document_id
                    else:
                        # document has not changed
                        doc = doc[0]
                        if doc.state != futureDoc.state:
                            doc.state = futureDoc.state
                            doc.commit(cursor, update=True)

                    fileinfo.commit(cursor, update=True)

                else:
                    # file unknown
                    # try to find a Document with same hash value
                    doc = Document.selectWhere(
                        cursor, document_id=futureDoc.content_hash)
                    if doc:
                        doc = doc[0]
                        doc.state = futureDoc.state
                        doc.publication_time = max(
                            doc.publication_time,
                            futureDoc.lastModificationTime)
                        doc.commit(cursor, update=True)
                    else:
                        self._createDocument(cursor, futureDoc)
                        # NOTE(review): the branch above uses the value
                        # returned by _createDocument directly, so this
                        # re-select may be redundant -- confirm before
                        # removing it.
                        doc = Document.selectWhere(
                            cursor, document_id=futureDoc.content_hash)[0]

                    fileinfo = FileInfo(
                        db_document_id=doc.db_document_id,
                        file_name=futureDoc.filename,
                        file_time=futureDoc.lastModificationTime,
                        state=futureDoc.state,
                        file_state=futureDoc.file_state)
                    fileinfo.commit(cursor, update=False)

                self._updateScores(cursor, doc.db_document_id,
                                   futureDoc.text)
                provider = DocumentProvider.selectOrInsertWhere(
                    cursor,
                    db_document_id=doc.db_document_id,
                    node_id=nodeId)[0]
                provider.last_providing_time = int(time.time())
                provider.commit(cursor, update=True)
                # fails on [0] when nodeId matches no Node entry; the
                # handlers below then close the cursor and roll back
                node = Node.selectWhere(cursor, node_id=nodeId)[0]
                node.last_seen_time = int(time.time())
                node.commit(cursor, update=True)
            finally:
                # the cursor is released on success and on failure alike
                cursor.close()
            self._cnx.commit()
        except:
            # deliberate catch-all: discard the partial update whatever
            # went wrong, then let the original exception propagate
            self._cnx.rollback()
            raise