Example No. 1
    def get_schema_init_fields(self) -> Dict[str, "FieldType"]:
        """Returns the arguments to be passed to the whoosh schema
        object instantiation found in the method `get_schema`.
        """
        from whoosh.fields import TEXT, ID, KEYWORD, STORED  # noqa: F401
        # This part is non-negotiable
        fields = {Database.get_id_key(): ID(stored=True, unique=True)}

        # TODO: this is a security risk, find a way to fix it
        user_prototype = eval(
            papis.config.getstring('whoosh-schema-prototype'))  # KeysView[str]
        fields.update(user_prototype)

        fields_list = papis.config.getlist('whoosh-schema-fields')
        for field in fields_list:
            fields.update({field: TEXT(stored=True)})

        return fields
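
A minimal sketch (an assumption here, not papis's confirmed code) of the companion `get_schema` method the docstring refers to, which simply unpacks these fields into a Whoosh Schema:

    def get_schema(self) -> "Schema":
        # Hypothetical companion method: build the schema from the fields above
        from whoosh.fields import Schema
        return Schema(**self.get_schema_init_fields())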
Example No. 2
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True),
                    textdata=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    filepath = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepath:
        print(path)
        fp = open(path, 'r', encoding="UTF-8")
        text = fp.read()
        writer.add_document(title=path.split("\\")[1], path=path, content=text, textdata=text)
        fp.close()
    print("Almost done")
    writer.commit()
    ix.close()
    print("Done")
Example No. 3
def create_update_whoosh_index(video_id):
    container_name = "corpus-container"
    video_id_no_txt_extension = os.path.splitext(video_id)[0]
    block_blob_service = BlockBlobService(storage_account, storage_key)
    video_content = block_blob_service.get_blob_to_text(
        container_name, video_id).content
    if not os.path.exists(corpus_index_dir):
        os.mkdir(corpus_index_dir)
        schema = Schema(title=ID(stored=True, unique=True),
                        content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
        index = create_in(corpus_index_dir, schema)
    else:
        index = open_dir(corpus_index_dir)
    index_writer = index.writer()
    index_writer.add_document(title=video_id_no_txt_extension,
                              content=video_content)
    index_writer.commit()
    extract_and_update_video_keywords(video_id_no_txt_extension, video_content)
Example No. 4
def create_en_index(cursor):
    counter = 0
    tk = TinySegmenterTokenizer(tinysegmenter.TinySegmenter())
    schema = Schema(uuid=ID(stored=True),
                    content=TEXT(analyzer=tk, stored=True))
    if not os.path.exists("jp_index"):
        os.mkdir("jp_index")
    # Creating an index writer to add documents as per the schema
    ix = create_in("jp_index", schema)
    writer = ix.writer()
    cursor.execute("SELECT * FROM jp_wiki")
    result = cursor.fetchall()
    for row in result:
        counter += 1
        if counter % 100 == 0:
            print("Processing data in row %d" % counter)
        writer.add_document(uuid=row[0], content=row[1])
    writer.commit()
Example No. 5
    def __init__(self, toolbox, index_dir=None, index_help=True):
        self.schema = Schema(id=ID(stored=True, unique=True),
                             old_id=ID,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.index_dir = index_dir
        self.toolbox = toolbox
        self.index = self._index_setup()
        # We keep track of how many times the tool index has been rebuilt.
        # We start at -1, so that after the first index the count is at 0,
        # which is the same as the toolbox reload count. This way we can skip
        # reindexing if the index count is equal to the toolbox reload count.
        self.index_count = -1
Example No. 6
    def __init__(self, config):
        from whoosh.fields import TEXT, ID, Schema
        from whoosh.index import create_in, open_dir
        self.editor = config.get('brain', 'editor')
        self.result_limit = int(config.get('brain', 'result-limit'))
        self.delimiter = config.get('brain', 'delimiter')
        self.storage_path = os.path.expanduser(config.get('brain', 'storage'))
        self.index_path = os.path.expanduser(config.get('brain', 'index'))
        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
        if not os.path.exists(self.index_path):
            os.makedirs(self.index_path)
            schema = Schema(entry=ID(stored=True, unique=True),
                            content=TEXT(stored=True))
            self.index = create_in(self.index_path, schema)
            self.index_all()
        else:
            self.index = open_dir(self.index_path)
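
A hedged sketch of how this class might query the index it just opened, honoring the configured result limit; `search` is a hypothetical helper, not part of the original source:

    def search(self, terms):
        # Hypothetical helper: query the "content" field, capped at result_limit
        from whoosh.qparser import QueryParser
        with self.index.searcher() as searcher:
            query = QueryParser('content', self.index.schema).parse(terms)
            return [hit['entry']
                    for hit in searcher.search(query, limit=self.result_limit)]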
Example No. 7
    def index(self):
        schema = Schema(
            Name=TEXT(stored=True),
            Description=TEXT(stored=True),
            Yearofrelease=TEXT(stored=True),
            Rating=TEXT(stored=True),
            Genre=TEXT(stored=True),
            ImdbUrl=ID(stored=True, unique=True),
            Votes=TEXT(stored=True)
        )

        if not os.path.exists("indexdir"):
            os.mkdir("indexdir")

        indexer = create_in("indexdir", schema)
        writer = indexer.writer()

        with open('original.csv', 'rt', encoding='utf-8') as csvfile:
            rows = csv.reader(csvfile, delimiter=',')
            for row in rows:
                # Columns: name, description, year of release, rating, genre,
                # IMDb URL, votes (the eighth column, an image, is unused)
                (Name, Description, Yearofrelease, Rating,
                 Genre, ImdbUrl, Votes) = row[:7]
                writer.add_document(Name=Name, Description=Description,
                                    Yearofrelease=Yearofrelease, Rating=Rating,
                                    Genre=Genre, ImdbUrl=ImdbUrl, Votes=Votes)
        writer.commit()
        self.indexer = indexer
Example No. 8
def createIndex(files_to_index):
    '''
    Takes a directory of files and indexes them so that they may be searched.

    Args:
        files_to_index (string): name of the directory containing the files to index
    '''

    # analyzer object for pre-processing search terms
    stem_analyzer = StemmingAnalyzer()

    # create the search schema
    schema = Schema(title=TEXT(analyzer=stem_analyzer, stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=stem_analyzer, stored=True),
                    textdata=TEXT(stored=True))

    # if the index directory does not exist, create it
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    # Creating an index writer to add documents as per the schema
    ix = create_in("indexdir", schema)
    writer = ix.writer()

    # parse every file in the FAQs folder and extract question and answer text
    filepaths = [
        os.path.join(files_to_index, i) for i in os.listdir(files_to_index)
    ]
    for path in filepaths:
        with open(path, 'r') as f:
            html = BeautifulSoup(f, "lxml")
        # parse question text
        question = html.find("div", {"id": "question"}).getText()
        # parse answer text
        answer = html.find("div", {
            "id": "answer"
        }).getText().replace(u'Â\xa0', u' ')
        writer.add_document(title=question,
                            path=path,
                            content=answer,
                            textdata=answer)
    writer.commit()
Example No. 9
def instantiate(result_path, mapping_complet, corpus_type, reinit_db, name):
    print('whoosh being instantiated')
    index_dir = os.path.join(result_path, 'indexdir' + '_' + name)
    if reinit_db:
        if corpus_type == 'isi':
            schema = MySchema_isi()
        else:
            print('here\n' + corpus_type)
            fields = {}
            print(mapping_complet.keys())
            for key in mapping_complet.keys():
                if key == 'ISIpubdate':
                    fields[str(key)] = NUMERIC(stored=True)
                elif key == 'accessionNo':
                    fields[str(key)] = ID(stored=True, unique=True)
                else:
                    fields[str(key)] = TEXT(stored=True)
            fields['CO'] = TEXT()
            print('whoosh fields', fields.keys())

            schema = Schema(**fields)

        import shutil
        try:
            shutil.rmtree(index_dir)
            print('indexdir directory deleted')
        except OSError:
            pass
        try:
            os.mkdir(index_dir)
            print('created', index_dir)
        except OSError:
            pass
        ix = create_in(index_dir, schema)
    else:
        from whoosh.index import open_dir
        try:
            ix = open_dir(index_dir)
        except Exception:
            # fall back to rebuilding the index from scratch
            ix = instantiate(result_path, mapping_complet, corpus_type, True, name)

    return ix
Example No. 10
    def createSearchableData(self):
        schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
        if not os.path.exists("indexdir"):
            os.mkdir("indexdir")

        ix = index.create_in("indexdir", schema, indexname=self.index)
        writer = ix.writer()

        filepaths = [os.path.join(self.root, i) for i in os.listdir(self.root)]
        for path in filepaths:
            text = self.getPDFText(path)
            text2 = self.getPDFText2(path)

            for i in range(len(text)):
                writer.add_document(title=path.split("\\")[1] + '_Page_' + str(i),
                                    path=path, content=text[i])
                writer.add_document(title=path.split("\\")[1] + '_Page_' + str(i) + '_2',
                                    path=path, content=text2[i])
            print(path.split("\\")[1] + ' has been indexed')
        writer.commit()
Example No. 11
    def index(self):

        schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                        content=TEXT, textdata=TEXT(stored=True))
        if not os.path.exists(self.idx_path):
            os.mkdir(self.idx_path)

        # Creating an index writer to add documents as per the schema
        ix = create_in(self.idx_path, schema)
        writer = ix.writer()

        filepaths = [os.path.join(self.root, i) for i in os.listdir(self.root)]
        for path in filepaths:
            with codecs.open(path, "r", "utf-8") as f:
                content = f.read()
                writer.add_document(title=path.split("/")[1],
                                    path=path.split("/")[0],
                                    content=content, textdata=content)
        writer.commit()
        return "true"
Example No. 12
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT,
                    URL=TEXT(stored=True), textdata=TEXT(stored=True))
    if not os.path.exists(indexoppath):
        os.mkdir(indexoppath)
    ix = create_in(indexoppath, schema)
    writer = ix.writer()
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepaths:
        fp = open(path, 'r')
        text = fp.read()
        doc_title = path.split("/")[-1]
        if doc_title in URLMap:
            url = str(URLMap[doc_title]['URL'])
        else:
            print(doc_title + ' not in url map')
            url = 'default'
        writer.add_document(title=doc_title, path=path, content=text,
                            URL=url, textdata=text)
        fp.close()
    writer.commit()
Example No. 13
def createSearchableData(data_file):
    '''
    Schema definition: video_id, video_title, description
    '''
    stem_analyzer = StemmingAnalyzer()
    schema = Schema(id=ID(stored=True), title=TEXT(stored=True),
                    description=TEXT(analyzer=stem_analyzer, stored=True))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = create_in("indexdir", schema)
    writer = ix.writer()

    with open(data_file) as f:
        youtube_array = json.load(f)
        for item in youtube_array:
            writer.add_document(id=item['id'], title=item['title'],
                                description=item['description'])

    writer.commit()
Example No. 14
def get_whoosh_index():
    from django.conf import settings
    from whoosh.index import create_in, exists_in, open_dir
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED

    schema = Schema(title=TEXT(stored=True),
                    path=ID(unique=True, stored=True),
                    content=TEXT(stored=True),
                    tags=KEYWORD,
                    classname=KEYWORD)

    if not os.path.exists(settings.WHOOSH_ROOT):
        os.mkdir(settings.WHOOSH_ROOT)

    if not exists_in(settings.WHOOSH_ROOT):
        index = create_in(settings.WHOOSH_ROOT, schema)
    else:
        index = open_dir(settings.WHOOSH_ROOT)
    return index
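
A brief usage sketch (not from the original project) showing how the returned index is typically searched; the query string "example" is illustrative:

from whoosh.qparser import QueryParser

ix = get_whoosh_index()
with ix.searcher() as searcher:
    # Parse a query against the "content" field and print the stored fields
    query = QueryParser("content", ix.schema).parse("example")
    for hit in searcher.search(query, limit=10):
        print(hit["title"], hit["path"])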
Example No. 15
    def __init__(self, *args, **kwargs):
        """
        Instantiate the whoosh schema and writer and create/open the index.
        """
        self.schema = kwargs.get(
            'schema',
            Schema(
                content_id=ID(stored=True, unique=True),
                content=TEXT(),
            ))
        self.log = kwargs.get('log', Logging())

        # get the absolute path and create the dir if required
        self.index_path = kwargs.get('index_path', INDEX_PATH)
        if self.create(self.index_path):
            self.log.info("SearchIndex", "__init__", "New index created.")

        # create an index obj and buffered writer
        self.index_obj = open_dir(self.index_path)
Example No. 16
def createSearchableData(web_docs):
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT,
                    textdata=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = create_in("indexdir", schema)
    writer = ix.writer()

    for doc in web_docs:
        text = doc['text']
        title = doc['title']
        writer.add_document(title=title,
                            path=doc['url'],
                            content=text,
                            textdata=text)
    writer.commit()
Example No. 17
    def __init__(self, path, index_name):
        '''
        Create a MyWhoosh index object: if an index already exists at the
        given path it is opened directly, otherwise a new one is created.
        :param path: directory in which the index is stored
        :param index_name: index name (one directory can hold several indexes)
        '''
        self.index_name = index_name
        analyzer = ChineseAnalyzer()  # Chinese word-segmentation analyzer
        schema = fields.Schema(title=TEXT(stored=True),
                               path=ID(stored=True),
                               content=TEXT(stored=True, analyzer=analyzer))
        try:
            self.ix = index.open_dir(path, index_name)
            print("Index files already exist, using old index")
        except Exception:
            # path: where the index is created; indexname: the index name
            self.ix = create_in(path, schema=schema, indexname=index_name)
            print("No existing index found, creating new index")
Example No. 18
class HiveJobListing(SchemaClass):
  '''Class to store the details associated with each Hive job'''

  job_url = ID(stored=True)
  title = TEXT(stored=True, analyzer=QUERY_ANALYZER)
  owner = KEYWORD(stored=True)
  completion_time = DATETIME(stored=True)
  query = TEXT(stored=True, analyzer=QUERY_ANALYZER)

  def __init__(self):
    self.job_url = None
    self.title = None
    self.owner = None
    self.completion_time = None
    self.query = None

  def __str__(self):
    return 'Url: %s, Title: %s, Owner: %s, Time: %s, Query: %s...' % (
      self.job_url, self.title, self.owner, self.completion_time, self.query[0:10])
Example No. 19
    def build_index(self):
        jieba.setLogLevel(logging.INFO)
        analyzer = ChineseAnalyzer()
        # Create the index schema
        schema = Schema(id=NUMERIC(stored=True),
                        title=TEXT(stored=True, analyzer=analyzer),
                        url=ID(stored=True),
                        content=TEXT(stored=True, analyzer=analyzer))
        # Create the index files under [index/]
        path = os.path.dirname(os.path.abspath(__file__)) + "/index"
        if not os.path.exists(path):
            os.mkdir(path)
            index = create_in(path, schema)
        else:
            index = open_dir(path)
        # Build the index, adding the content that needs indexing
        writer = index.writer()
        total_row = self.post.count_documents({})
        false_row = self.post.count_documents({'indexed': False})
        indexed_row = total_row - false_row
        while True:
            row = self.post.find_one({'indexed': False})
            if row is None:
                writer.commit()
                print('\n\tindexed successfully.')
                break
            writer.add_document(id=(total_row + 1) if row['id'] == -1 else row['id'],
                                title=row['title'],
                                url=row['url'],
                                content=row['content'])
            self.post.update_one({'title': row['title']},
                                 {'$set': {'indexed': True}})
            # Commit after every document so progress survives interruption,
            # then open a fresh writer for the next one
            writer.commit()
            writer = index.writer()
            indexed_row += 1
            print(f"\rIndexing: {'▉' * int(indexed_row / total_row * 100 // 2)} "
                  f"{indexed_row / total_row * 100:.2f}%",
                  end='')
Example No. 20
def create_index(docs):
    # We create the index schema.
    schema = Schema(id=ID(stored=True), title=TEXT, content=TEXT)

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    ix = index.create_in(index_dir, schema)

    # The writer() method of the Index object returns an IndexWriter object that lets us add documents to the index.
    writer = ix.writer()
    # Add documents to the index.
    # Notes:
    # - Indexed text fields must be passed as unicode values.
    # - Fields can be left empty, i.e., we don't have to fill in a value for every field.
    for i, doc in enumerate(docs, start=1):
        print("Indexing document: " + str(i))

        try:
            docID = doc['id'].decode()
        except UnicodeDecodeError:
            docID = "empty"

        try:
            name = doc['title'].decode()
        except UnicodeDecodeError:
            name = ""

        try:
            text = doc['content'].decode()
        except UnicodeDecodeError:
            text = ""

        writer.add_document(id=docID, title=name, content=text)

        #writer.add_document(id=doc['id'], title=doc['title'], content=doc['content'])

    # Calling commit() on the IndexWriter saves the added documents to the index.
    writer.commit()

    ix.close()
Example No. 21
def create_indexer(doc_directory, index_directory):
    my_analyzer = RegexTokenizer() | LowercaseFilter()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True, analyzer=my_analyzer),
                    summary=TEXT,
                    article=TEXT(analyzer=my_analyzer),
                    keywords=KEYWORD(stored=True, analyzer=my_analyzer),
                    date=DATETIME(stored=True),
                    path=TEXT(stored=True))

    if not os.path.exists(index_directory):
        os.mkdir(index_directory)
    ix = create_in(index_directory, schema)
    writer = ix.writer()

    nt = 0
    print("==============================")
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    for dirname, subdirs, files in os.walk(doc_directory):
        if (files != []):
            n = 0
            for filename in files:
                filename = os.path.join(dirname, filename)
                obj = load_json(filename)
                writer.add_document(id=obj['id'],
                                    title=obj['title'],
                                    summary=obj['summary'],
                                    article=obj['article'],
                                    keywords=obj['keywords'],
                                    date=obj['date'],
                                    path=filename)
                n += 1
            print("{}: {}".format(dirname, n))
            nt += n
    t2 = time.perf_counter()
    print("==============================")
    print("Docs: {}, Time: {:.2f}s".format(nt, (t2 - t1)))
    print("Writing index...")
    writer.commit()
    t3 = time.perf_counter()
    print("Total time: {:.2f}s".format(t3 - t1))
    print("==============================")
Example No. 22
def open_index(indexdir, incremental=False):
    """
    Opens the index with the given name. If the directory or the index
    does not yet exist, it is created.

    @type  indexdir: str
    @param indexdir: The name of the index directory.
    @type  incremental: bool
    @param incremental: Whether to preserve existing index content.
    @rtype:  whoosh.Index
    @return: An object representing the index.
    """
    if not os.path.exists(indexdir):
        os.makedirs(indexdir)
    if incremental and index.exists_in(indexdir):
        return index.open_dir(indexdir)
    schema = Schema(number=NUMERIC(stored=True),
                    filename=ID(stored=True),
                    line=TEXT(analyzer=SimpleAnalyzer(), stored=True))
    return index.create_in(indexdir, schema)
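
A short usage sketch, assuming an index built by open_index above; the query term "TODO" is illustrative:

from whoosh.qparser import QueryParser

ix = open_index("indexdir", incremental=True)
with ix.searcher() as searcher:
    # Search the indexed lines and report file name and line number
    query = QueryParser("line", ix.schema).parse("TODO")
    for hit in searcher.search(query):
        print(hit["filename"], hit["number"], hit["line"])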
Example No. 23
    def __init__(self, destination=None):
        super(Search, self).__init__()

        self.destination = destination
        self.writer = None
        self.ix = None

        self.schema = Schema(title=TEXT(stored=True),
                             path=TEXT(stored=True),
                             page=TEXT(stored=True),
                             content=TEXT(stored=True),
                             unique=ID(stored=True))

        if not os.path.exists(self.destination):
            os.mkdir(self.destination)

        if not self.exists(self.destination):
            self.create(self.destination)
        else:
            self.previous(self.destination)
Example No. 24
    def __init__(self, questions_file):
        """
        """
        self.questions_file = questions_file

        if os.path.exists(self._CACHED_INDEX):
            self.indx = index.open_dir(self._CACHED_INDEX)
        else:
            os.makedirs(self._CACHED_INDEX)

            schema = Schema(key=ID(stored=True, unique=True), text=TEXT(stored=False))
            ix = index.create_in(self._CACHED_INDEX, schema)
            writer = ix.writer()
            with open(self.questions_file) as f:
                for i, row in tqdm(enumerate(f)):
                    writer.add_document(key=str(i), text=row.strip())
                writer.commit()
            self.indx = ix
        self.query_parser = QueryParser('text', self.indx.schema)
        self.searcher = self.indx.searcher()
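
A hedged sketch of how the parser and searcher built above would typically be combined; `lookup` is a hypothetical helper, not part of the original class:

    def lookup(self, question, limit=5):
        # Hypothetical helper: return the keys of the best-matching questions
        query = self.query_parser.parse(question)
        return [hit['key'] for hit in self.searcher.search(query, limit=limit)]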
Example No. 25
    def create(self, in_memory=False):
        analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
        schema = Schema(source=TEXT(stored=True, analyzer=analyzer),
                        target=TEXT(stored=True, analyzer=analyzer),
                        comment=STORED,
                        context=STORED,
                        softcatala=BOOLEAN,
                        project=ID(stored=True))

        if in_memory:
            st = RamStorage()
            ix = st.create_index(schema)
        else:
            if not os.path.exists(self.dir_name):
                os.mkdir(self.dir_name)

            ix = create_in(self.dir_name, schema)

        self.writer = ix.writer()
        return ix
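
A brief usage note: the in-memory variant is convenient for tests, since nothing touches disk. A hypothetical sketch, assuming `tm` is an instance of the class above:

ix = tm.create(in_memory=True)
tm.writer.add_document(source="Hello", target="Hola", project="example")
tm.writer.commit()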
Example No. 26
    def _create_index_dir(self):
        # Create the index directory
        os.makedirs(self._path)
        # Create the schema
        schema = Schema(id=ID(stored=True, unique=True),
                        tags=KEYWORD(stored=True),
                        named_tags=KEYWORD(stored=True))
        # Create the index and add a few test documents
        index = create_in(self._path, schema)
        index_writer = index.writer()
        index_writer.add_document(id='11111',
                                  tags='test1 test2',
                                  named_tags='test1 test2')
        index_writer.add_document(id='22222',
                                  tags='test1 test3 test4',
                                  named_tags='test1 test3 test4')
        index_writer.add_document(id='33333',
                                  tags='test2 test5 test6',
                                  named_tags='test2 test5 test6')
        index_writer.commit()
Example No. 27
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT, textdata=TEXT(stored=True))
    in_path="./indexdir" # Insert here where you want you save the Index
    if not os.path.exists(in_path): 
        os.mkdir(in_path)

    # Creating an index writer to add document as per schema
    ix = create_in(in_path, schema)
    writer = ix.writer()

    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepaths:
        fp = open(path)
        print(path)
        text = fp.read()
        
        writer.add_document(title=path.split("/")[6], path=path,  content=text, textdata=text)
        
        fp.close()
    writer.commit()
Example No. 28
def build_whoosh_database():
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(whoosh_database, schema)

    writer = ix.writer()

    mpost = MPost()
    recs = mpost.query_all()
    for rec in recs:
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(
            rec.cnt_html))
        print(text2)
        writer.add_document(title=rec.title,
                            type='<span style="color:blue;">[文档]</span>',
                            link='/post/{0}.html'.format(rec.uid),
                            content=text2)
    writer.commit()
Example No. 29
def index(path):
    """Index all of the artifacts in a specified directory.

    Parameters
    ----------
    path : str
        The path to the directory containing the artifacts to be indexed.
    """

    ind = os.path.abspath(os.path.join(path, os.pardir, "whoosh"))
    schema = create_whoosh_schema(SCHEMAS["artifact"]["schema"])
    schema.add("pkg", TEXT(stored=True))
    schema.add("channel", TEXT(stored=True))
    schema.add("arch", TEXT(stored=True))
    schema.add("filename", TEXT(stored=True))
    schema.add("path", ID(stored=True, unique=True))
    ix = get_index(ind, schema=schema)

    unindexed = _unindexed_artifacts(path, ix)
    print(f"TOTAL UNINDEXED ARTIFACTS: {len(unindexed)}")
    unindexed = list(unindexed)[:5000]
    progress = tqdm.tqdm(total=len(unindexed))

    writer = ix.writer()
    with ThreadPoolExecutor(max_workers=1) as pool:
        futures = [
            pool.submit(get_artifact,
                        path,
                        artifact,
                        progress_callback=progress.update)
            for artifact in unindexed
        ]
        for f in as_completed(futures):
            try:
                data = f.result()
            except Exception as e:
                print(e)
            else:
                writer.add_document(**data)
    writer.commit()
Example No. 30
    def index_graph_description(self, index_name='graphs'):
        from whoosh.fields import TEXT, ID, NGRAM, NUMERIC, KEYWORD
        from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
        from whoosh.analysis.filters import LowercaseFilter
        print('Building %s index...' % index_name)

        # build a single schema from the fields exposed by the different search
        # types
        print('\tSchema:')
        fields = {
            'gid': ID(stored=True),
            'description': KEYWORD(lowercase=True, scorable=True)
        }
        #fields = {'gid': ID(stored=True), 'description': TEXT(analyzer=SimpleAnalyzer(r'[.\s]', True))}

        from whoosh.fields import Schema
        schema = Schema(**fields)

        # Create the index schema
        index = self.recreate_index(index_name, schema)

        # Add documents to the index
        print('\tWrite indexes:')
        writer = index.writer()
        c = 0
        from digipal.models import Graph
        for graph in Graph.objects.filter(
                graph_components__isnull=False).prefetch_related(
                    'graph_components', 'graph_components__component',
                    'graph_components__features').distinct():
            c += 1
            doc = {
                'gid': str(graph.id),
                'description': graph.get_serialised_description()
            }
            writer.add_document(**doc)

        print('\t\tIndex %d graphs' % c)

        writer.commit()