def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                  content_hash, mime_type, state, file_state):
    """Insert or update information in tables documents, file_info,
    document_score and word.

    The whole operation runs in one transaction: it is committed on
    success and rolled back (then the exception re-raised) on any failure.

    :param filename: file name used to look up the FileInfo row
    :param title: forwarded to ``self._createDocument``
    :param text: forwarded to ``self._createDocument`` and
                 ``self._updateScores``
    :param fileSize: forwarded to ``self._createDocument``
    :param lastModifiedOn: stored as the file's ``file_time`` and used to
                           refresh the document's ``publication_time``
    :param content_hash: compared against / stored as ``document_id``
    :param mime_type: accepted for interface compatibility; not used here
    :param state: stored on both the FileInfo and the Document
    :param file_state: stored on the FileInfo
    """
    # XXX Decide if we can compute the content_hash and mime_type
    #     ourselves or if the indexer should do it and pass the values
    #     as an argument
    cursor = self._cnx.cursor()
    try:
        try:
            # insert or update in table file_info
            fileinfo = FileInfo.selectWhere(cursor, file_name=filename)
            if fileinfo:
                fileinfo = fileinfo[0]
                fileinfo.file_time = lastModifiedOn
                fileinfo.state = state
                fileinfo.file_state = file_state
                doc = Document.selectWhere(
                    cursor, db_document_id=fileinfo.db_document_id)
                if not doc or doc[0].document_id != content_hash:
                    # no document was found or a document with another
                    # content: in both cases, we create a new Document in
                    # database (we don't want to modify the existing one,
                    # because it can be shared by several files)
                    doc = self._createDocument(cursor, content_hash, title,
                                               text, fileSize,
                                               lastModifiedOn, filename,
                                               state)
                    fileinfo.db_document_id = doc.db_document_id
                else:
                    # document has not changed
                    doc = doc[0]
                fileinfo.commit(cursor, update=True)
            else:
                # file unknown
                # try to find a Document with same hash value
                doc = Document.selectWhere(cursor, document_id=content_hash)
                if doc:
                    doc = doc[0]
                    doc.state = state
                    doc.publication_time = max(doc.publication_time,
                                               lastModifiedOn)
                    doc.commit(cursor, update=True)
                else:
                    doc = self._createDocument(cursor, content_hash, title,
                                               text, fileSize,
                                               lastModifiedOn, filename,
                                               state)
                    # re-read the freshly created row (presumably to pick
                    # up database-assigned fields -- TODO confirm)
                    doc = Document.selectWhere(
                        cursor, document_id=content_hash)[0]
                fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                    file_name=filename,
                                    file_time=lastModifiedOn,
                                    state=state,
                                    file_state=file_state)
                fileinfo.commit(cursor, update=False)
            self._updateScores(cursor, doc.db_document_id, text)
        finally:
            # always release the cursor, whether or not indexing succeeded
            cursor.close()
        self._cnx.commit()
    except:
        # deliberately catch everything: undo the partial writes, then
        # let the original exception propagate unchanged
        self._cnx.rollback()
        raise
def getFileInformations(self, filename):
    """Return the list of FileInfo entries whose file_name is `filename`.

    :param filename: value matched against the ``file_name`` column
    :returns: a new list built from the ``FileInfo.selectWhere`` results
    """
    cursor = self._cnx.cursor()
    try:
        results = FileInfo.selectWhere(cursor, file_name=filename)
    finally:
        # release the cursor even when the query raises
        cursor.close()
    return list(results)
def getIndexedFiles(self):
    """Return the ``file_name`` of every FileInfo entry.

    :returns: list of file names, one per row returned by an
              unfiltered ``FileInfo.selectWhere``
    """
    cursor = self._cnx.cursor()
    try:
        results = FileInfo.selectWhere(cursor)
    finally:
        # release the cursor even when the query raises
        cursor.close()
    return [f.file_name for f in results]
def indexDocument(self, nodeId, futureDoc):
    """Insert or update information in tables documents, file_info,
    document_score and word.

    The whole operation runs in one transaction: it is committed on
    success and rolled back (then the exception re-raised) on any failure.

    :param nodeId: id of the node providing the document
    :type nodeId: node_id or None if working locally
    :param futureDoc: object describing the document to index; the
                      attributes read here are ``filename``, ``title``,
                      ``text``, ``lastModificationTime``, ``state``,
                      ``file_state`` and ``content_hash``.
                      NOTE: ``futureDoc.text`` is modified in place (the
                      title is prepended to it).
    :raises ValueError: if `nodeId` is not None and no matching Node row
                        exists
    """
    # XXX Decide if we can compute the content_hash and mime_type
    #     ourselves or if the indexer should do it and
    #     pass the values as an argument
    cursor = self._cnx.cursor()
    try:
        try:
            # insert or update in table file_info
            fileinfo = FileInfo.selectWhere(cursor,
                                            file_name=futureDoc.filename)
            # insert title into text to be able to find documents according
            # to their title (e.g: searching 'foo' should find 'foo.pdf')
            # NOTE(review): this mutates the caller's object, so indexing
            # the same futureDoc twice prepends the title twice. Kept as is
            # because _createDocument receives futureDoc and may rely on
            # the combined text -- confirm before changing.
            futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
            if fileinfo:
                fileinfo = fileinfo[0]
                fileinfo.file_time = futureDoc.lastModificationTime
                fileinfo.state = futureDoc.state
                fileinfo.file_state = futureDoc.file_state
                doc = Document.selectWhere(
                    cursor, db_document_id=fileinfo.db_document_id)
                if not doc or doc[0].document_id != futureDoc.content_hash:
                    # no document was found or a document with another
                    # content: in both cases, we create a new Document in
                    # database (we don't want to modify the existing one,
                    # because it can be shared by several files)
                    doc = self._createDocument(cursor, futureDoc)
                    fileinfo.db_document_id = doc.db_document_id
                else:
                    # document has not changed
                    doc = doc[0]
                    if doc.state != futureDoc.state:
                        doc.state = futureDoc.state
                        doc.commit(cursor, update=True)
                fileinfo.commit(cursor, update=True)
            else:
                # file unknown
                # try to find a Document with same hash value
                doc = Document.selectWhere(
                    cursor, document_id=futureDoc.content_hash)
                if doc:
                    doc = doc[0]
                    doc.state = futureDoc.state
                    doc.publication_time = max(
                        doc.publication_time,
                        futureDoc.lastModificationTime)
                    doc.commit(cursor, update=True)
                else:
                    doc = self._createDocument(cursor, futureDoc)
                    # re-read the freshly created row (presumably to pick
                    # up database-assigned fields -- TODO confirm)
                    doc = Document.selectWhere(
                        cursor, document_id=futureDoc.content_hash)[0]
                fileinfo = FileInfo(
                    db_document_id=doc.db_document_id,
                    file_name=futureDoc.filename,
                    file_time=futureDoc.lastModificationTime,
                    state=futureDoc.state,
                    file_state=futureDoc.file_state)
                fileinfo.commit(cursor, update=False)
            self._updateScores(cursor, doc.db_document_id, futureDoc.text)
            # update last seen time only if not working locally
            if nodeId is not None:
                # one timestamp for both rows: they record the same event
                now = int(time.time())
                provider = DocumentProvider.selectOrInsertWhere(
                    cursor,
                    db_document_id=doc.db_document_id,
                    node_id=nodeId)[0]
                provider.last_providing_time = now
                provider.commit(cursor, update=True)
                nodes = Node.selectWhere(cursor, node_id=nodeId)
                if not nodes:
                    # rollback and cursor cleanup are done by the
                    # enclosing handlers
                    raise ValueError(
                        'provider %s is not registered in our database !'
                        % (nodeId,))
                node = nodes[0]
                node.last_seen_time = now
                node.commit(cursor, update=True)
        finally:
            # always release the cursor, whether or not indexing succeeded
            cursor.close()
        self._cnx.commit()
    except:
        # deliberately catch everything: undo the partial writes, then
        # let the original exception propagate unchanged
        self._cnx.rollback()
        raise
def indexDocument(self, filename, title, text, fileSize, lastModifiedOn,
                  content_hash, mime_type, state, file_state):
    """Insert or update information in tables documents, file_info,
    document_score and word.

    All writes are committed together on success; on any failure the
    transaction is rolled back and the exception is re-raised.

    :param filename: file name used to look up the FileInfo row
    :param title: forwarded to ``self._createDocument``
    :param text: forwarded to ``self._createDocument`` and
                 ``self._updateScores``
    :param fileSize: forwarded to ``self._createDocument``
    :param lastModifiedOn: stored as the file's ``file_time`` and used to
                           refresh the document's ``publication_time``
    :param content_hash: compared against / stored as ``document_id``
    :param mime_type: accepted for interface compatibility; not used here
    :param state: stored on both the FileInfo and the Document
    :param file_state: stored on the FileInfo
    """
    # XXX Decide if we can compute the content_hash and mime_type
    #     ourselves or if the indexer should do it and pass the values
    #     as an argument
    cursor = self._cnx.cursor()
    try:
        try:
            # insert or update in table file_info
            known_files = FileInfo.selectWhere(cursor, file_name=filename)
            if known_files:
                fileinfo = known_files[0]
                fileinfo.file_time = lastModifiedOn
                fileinfo.state = state
                fileinfo.file_state = file_state
                docs = Document.selectWhere(
                    cursor, db_document_id=fileinfo.db_document_id)
                if docs and docs[0].document_id == content_hash:
                    # document has not changed
                    doc = docs[0]
                else:
                    # no document was found or a document with another
                    # content: in both cases, we create a new Document in
                    # database (we don't want to modify the existing one,
                    # because it can be shared by several files)
                    doc = self._createDocument(cursor, content_hash, title,
                                               text, fileSize,
                                               lastModifiedOn, filename,
                                               state)
                    fileinfo.db_document_id = doc.db_document_id
                fileinfo.commit(cursor, update=True)
            else:
                # file unknown
                # try to find a Document with same hash value
                docs = Document.selectWhere(cursor, document_id=content_hash)
                if docs:
                    doc = docs[0]
                    doc.state = state
                    doc.publication_time = max(doc.publication_time,
                                               lastModifiedOn)
                    doc.commit(cursor, update=True)
                else:
                    self._createDocument(cursor, content_hash, title, text,
                                         fileSize, lastModifiedOn, filename,
                                         state)
                    # use the row as stored in the database rather than
                    # the object returned by _createDocument (presumably
                    # to pick up database-assigned fields -- TODO confirm)
                    doc = Document.selectWhere(
                        cursor, document_id=content_hash)[0]
                fileinfo = FileInfo(db_document_id=doc.db_document_id,
                                    file_name=filename,
                                    file_time=lastModifiedOn,
                                    state=state,
                                    file_state=file_state)
                fileinfo.commit(cursor, update=False)
            self._updateScores(cursor, doc.db_document_id, text)
        finally:
            # always release the cursor, whether or not indexing succeeded
            cursor.close()
        self._cnx.commit()
    except:
        # deliberately catch everything: undo the partial writes, then
        # let the original exception propagate unchanged
        self._cnx.rollback()
        raise
def indexDocument(self, nodeId, futureDoc):
    """Insert or update information in tables documents, file_info,
    document_score and word.

    Also records `nodeId` as a provider of the document and refreshes the
    node's last-seen time. All writes are committed together on success;
    on any failure the transaction is rolled back and the exception is
    re-raised.

    :param nodeId: id of the node providing the document
    :param futureDoc: object describing the document to index; the
                      attributes read here are ``filename``, ``title``,
                      ``text``, ``lastModificationTime``, ``state``,
                      ``file_state`` and ``content_hash``.
                      NOTE: ``futureDoc.text`` is modified in place (the
                      title is prepended to it).
    :raises IndexError: if no Node row matches `nodeId`
    """
    # XXX Decide if we can compute the content_hash and mime_type
    #     ourselves or if the indexer should do it and
    #     pass the values as an argument
    cursor = self._cnx.cursor()
    try:
        try:
            # insert or update in table file_info
            fileinfo = FileInfo.selectWhere(cursor,
                                            file_name=futureDoc.filename)
            # insert title into text to be able to find documents according
            # to their title (e.g: searching 'foo' should find 'foo.pdf')
            futureDoc.text = '%s %s' % (futureDoc.title, futureDoc.text)
            if fileinfo:
                fileinfo = fileinfo[0]
                fileinfo.file_time = futureDoc.lastModificationTime
                fileinfo.state = futureDoc.state
                fileinfo.file_state = futureDoc.file_state
                doc = Document.selectWhere(
                    cursor, db_document_id=fileinfo.db_document_id)
                if not doc or doc[0].document_id != futureDoc.content_hash:
                    # no document was found or a document with another
                    # content: in both cases, we create a new Document in
                    # database (we don't want to modify the existing one,
                    # because it can be shared by several files)
                    doc = self._createDocument(cursor, futureDoc)
                    fileinfo.db_document_id = doc.db_document_id
                else:
                    # document has not changed
                    doc = doc[0]
                    if doc.state != futureDoc.state:
                        doc.state = futureDoc.state
                        doc.commit(cursor, update=True)
                fileinfo.commit(cursor, update=True)
            else:
                # file unknown
                # try to find a Document with same hash value
                doc = Document.selectWhere(
                    cursor, document_id=futureDoc.content_hash)
                if doc:
                    doc = doc[0]
                    doc.state = futureDoc.state
                    doc.publication_time = max(
                        doc.publication_time,
                        futureDoc.lastModificationTime)
                    doc.commit(cursor, update=True)
                else:
                    doc = self._createDocument(cursor, futureDoc)
                    # re-read the freshly created row (presumably to pick
                    # up database-assigned fields -- TODO confirm)
                    doc = Document.selectWhere(
                        cursor, document_id=futureDoc.content_hash)[0]
                fileinfo = FileInfo(
                    db_document_id=doc.db_document_id,
                    file_name=futureDoc.filename,
                    file_time=futureDoc.lastModificationTime,
                    state=futureDoc.state,
                    file_state=futureDoc.file_state)
                fileinfo.commit(cursor, update=False)
            self._updateScores(cursor, doc.db_document_id, futureDoc.text)
            # one timestamp for both rows: they record the same event
            now = int(time.time())
            provider = DocumentProvider.selectOrInsertWhere(
                cursor,
                db_document_id=doc.db_document_id,
                node_id=nodeId)[0]
            provider.last_providing_time = now
            provider.commit(cursor, update=True)
            # raises IndexError when the node is not registered; the
            # handlers below then roll back everything written above
            node = Node.selectWhere(cursor, node_id=nodeId)[0]
            node.last_seen_time = now
            node.commit(cursor, update=True)
        finally:
            # always release the cursor, whether or not indexing succeeded
            cursor.close()
        self._cnx.commit()
    except:
        # deliberately catch everything: undo the partial writes, then
        # let the original exception propagate unchanged
        self._cnx.rollback()
        raise