示例#1
0
文件: index.py 项目: aeroevan/pyra
 def __init__(self, index_location, base_path, summary_size):
     """Record the indexing configuration and create the text helpers.

     Args:
         index_location: Stored as ``self._index_location``; presumably
             the directory holding the on-disk index (confirm with the
             methods that use it).
         base_path: Stored as ``self._base_path``; presumably the root
             of the tree to be indexed (confirm with the caller).
         summary_size: Stored as ``self._summary_size``; presumably the
             requested size of each generated summary.
     """
     # Helper objects are built fresh for every instance, in the same
     # order as before: summarizer first, then keyword extractor.
     self._fs = FrequencySummarizer()
     self._rake = RakeKeywordExtractor()

     # Caller-supplied settings are kept verbatim, with no validation.
     self._summary_size = summary_size
     self._base_path = base_path
     self._index_location = index_location
示例#2
0
文件: index.py 项目: aeroevan/pyra
class TextIndexer(object):
    """Build and maintain a full-text index of the plain-text files in a tree.

    Every file below ``base_path`` whose detected MIME type is
    ``text/plain`` is indexed with its path, modification time, extracted
    keywords, a generated summary and its full content.

    NOTE(review): ``index``, ``Schema`` and the field types used below are
    assumed to come from Whoosh, and ``magic`` from python-magic; the imports
    are outside this block.
    """

    def __init__(self, index_location, base_path, summary_size):
        """Store the configuration and create the text-analysis helpers.

        Args:
            index_location: Directory passed to ``index.create_in`` /
                ``index.open_dir``.
            base_path: Root directory walked when looking for documents.
            summary_size: Second argument handed to
                ``FrequencySummarizer.summarize`` (presumably the number of
                sentences to keep -- confirm against that helper).
        """
        self._fs = FrequencySummarizer()
        self._rake = RakeKeywordExtractor()
        self._index_location = index_location
        self._base_path = base_path
        self._summary_size = summary_size

    def _my_docs(self):
        """Yield the full path of every ``text/plain`` file under the base path."""
        for root, _dirnames, filenames in os.walk(self._base_path):
            for filename in filenames:
                fullpath = os.path.join(root, filename)
                mime = magic.from_file(fullpath, mime=True)
                # Depending on the python-magic version the MIME type comes
                # back as bytes or as str; normalise so the comparison below
                # works for both instead of silently never matching.
                if isinstance(mime, bytes):
                    mime = mime.decode('ascii', 'replace')
                if mime == 'text/plain':
                    yield fullpath

    def _add_doc(self, writer, path):
        """Read ``path`` and add it to the index through ``writer``.

        Args:
            writer: An open index writer exposing ``add_document``.
            path: Path of the text file to index.
        """
        logger.debug("Indexing %s", path)
        # The context manager closes the file even if reading fails.
        with open(path, "rt") as fileobj:
            content = fileobj.read()
        summary = "\n".join(self._fs.summarize(content, self._summary_size))
        keywords = ','.join(self._rake.extract(content))
        modtime = os.path.getmtime(path)
        writer.add_document(path=path, time=modtime, keywords=keywords,
                            summary=summary, content=content)

    def _get_schema(self):
        """Return the index schema.

        ``path`` is the unique, stored document key; ``time``, ``keywords``
        and ``summary`` are stored; ``content`` is declared as ``TEXT``
        without ``stored=True``.
        """
        return Schema(path=ID(unique=True, stored=True),
                      time=STORED,
                      keywords=KEYWORD(lowercase=True, commas=True,
                                       stored=True),
                      summary=TEXT(stored=True),
                      content=TEXT)

    def clean_index(self):
        """Recreate the index from scratch and index every document found."""
        # Always create the index from scratch
        ix = index.create_in(self._index_location,
                             schema=self._get_schema())

        # Used as a context manager the writer commits on success and
        # cancels on an exception, so a failure while indexing does not
        # leave the write lock held.
        with ix.writer() as writer:
            for path in self._my_docs():
                self._add_doc(writer, path)

    def index(self, clean=False):
        """Update the index.

        Args:
            clean: When true, rebuild the whole index; otherwise only
                re-index files that are new, changed or deleted.
        """
        if clean:
            logger.info('Clearing existing index')
            self.clean_index()
        else:
            logger.info('Incremental index')
            self.incremental_index()

    def incremental_index(self):
        """Bring an existing index up to date with the filesystem.

        Documents whose file can no longer be stat'ed are removed,
        documents whose file is newer than the stored modification time are
        re-indexed, and files not yet present in the index are added.
        """
        ix = index.open_dir(self._index_location)

        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        # The writer is the inner context manager, so it is committed (or
        # cancelled on error) before the searcher is closed.
        with ix.searcher() as searcher, ix.writer() as writer:
            # Loop over the stored fields in the index
            for fields in searcher.all_stored_fields():
                indexed_path = fields['path']
                indexed_paths.add(indexed_path)

                try:
                    # A single stat call avoids the race between an
                    # existence check and reading the modification time.
                    mtime = os.path.getmtime(indexed_path)
                except OSError:
                    # This file was deleted since it was indexed
                    writer.delete_by_term('path', indexed_path)
                    continue

                # Check if this file was changed since it was indexed
                if mtime > fields['time']:
                    # The file has changed, delete it and add it to the
                    # list of files to reindex
                    logger.debug("%s has changed", indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

            # Loop over the files in the filesystem
            for path in self._my_docs():
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self._add_doc(writer, path)