-
Notifications
You must be signed in to change notification settings - Fork 0
/
Indexer.py
183 lines (145 loc) · 5.4 KB
/
Indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# -*- coding: utf-8 -*-
import os
import re
import whoosh
import whoosh.index
import whoosh.fields
import whoosh.qparser
from Metadata import Metadata
class Indexer:
    """Maintains a Whoosh full-text index of image metadata below a root
    directory. The index lives in <root>/.index and stores path, folder,
    mtime, title, content and tags per image.
    """
    index = None    # whoosh.index.Index, opened/created in __init__
    root = None     # absolute path of the indexed directory tree
    writer = None   # active IndexWriter while a scan runs, else None
    imgre = None    # compiled regex matching image file extensions
    scan = False    # True while a directory scan is in progress

    def __init__(self, root):
        """Open the index stored in <root>/.index, creating it when missing.

        root - the directory tree whose images are indexed
        """
        # raw string: '\.' in a plain literal is an invalid escape sequence
        self.imgre = re.compile(r'\.jpe?g$', re.IGNORECASE)
        self.root = os.path.abspath(root)
        idxdir = os.path.join(self.root, ".index")
        if whoosh.index.exists_in(idxdir):
            # index exists, load it
            print("loading index in " + idxdir)
            self.index = whoosh.index.open_dir(idxdir)
        else:
            # new index, create it
            print("creating index in " + idxdir)
            if not os.path.exists(idxdir):
                os.mkdir(idxdir)
            self.index = whoosh.index.create_in(idxdir, schema=self.get_schema())

    def close(self):
        """ closes the index and deletes the associated objects, this should
            unlock the index for sure
        """
        print("closing index")
        # BUGFIX: the old code ran `del self.writer` unconditionally, which
        # raises AttributeError when no scan ever set an instance attribute
        # (only the class-level None exists). Cancel any pending writer so
        # its lock is actually released, then drop the references.
        if self.writer:
            self.writer.cancel()
        self.writer = None
        self.index.close()
        self.index = None

    def get_schema(self):
        """ returns the schema definition of the index """
        return whoosh.fields.Schema(
            path = whoosh.fields.ID(unique=True, stored=True),
            folder = whoosh.fields.TEXT,
            time = whoosh.fields.STORED,
            title = whoosh.fields.TEXT(stored=True),
            content = whoosh.fields.TEXT,
            tags = whoosh.fields.KEYWORD(stored=True, lowercase=True,
                                         commas=True, scorable=True),
        )

    def scan_start(self):
        """ Prepare directory scanning. Run this before running the
            scan iterator
        """
        self.writer = self.index.writer()
        self.scan = True

    def scan_stop(self):
        """ Finish (or interrupt) a directory scan """
        # BUGFIX: guard the commit and reset via assignment instead of
        # `del self.writer`, which blew up when no writer was active
        if self.writer:
            self.writer.commit()
            print("index committed")
            self.writer = None
        self.scan = False

    def scan_iterator(self, base, onloop=None, onexit=None):
        """ Iterate over all found images in the given base directory
            and below and add them to the index

            this is designed to be run from gobject.idle_add()

            before starting this, you need to call scan_start()
            scan_stop() is called automatically, but can also be used
            to interupt the scan

            base   - the full path of the directory to scan
            onloop - callback to run on each loop:
                     func(path, isimage)
            onexit - callback when the run finishes or is aborted:
                     func(wasabort)
        """
        for directory, subdirs, files in os.walk(base):
            for fn in files:
                if not self.scan:
                    # scan was interrupted from outside (scan_stop())
                    if callable(onexit):
                        onexit(True)
                    self.scan_stop()
                    yield False
                filepath = os.path.join(directory, fn)
                isimage = self.imgre.search(fn)
                if callable(onloop):
                    onloop(filepath, isimage)
                if isimage:
                    self.update_image(filepath)
                yield True
        # we're through - stop the run
        if callable(onexit):
            onexit(False)
        self.scan_stop()
        yield False

    def update_image(self, filepath):
        """ Adds or updates the given image in the index

            When no scan writer is active, a temporary writer is created
            and the change is committed immediately; during a scan the
            batch writer is reused and scan_stop() commits the batch.
        """
        relpath = os.path.relpath(filepath, self.root)
        folder = os.path.dirname(relpath)
        meta = Metadata(filepath)
        if not self.writer:
            # standalone update: commit right away
            self.writer = self.index.writer()
            commit = True
        else:
            # part of a running batch scan: scan_stop() commits
            commit = False
        # add to index (update_document replaces by the unique `path` field)
        self.writer.update_document(
            path = unicode(relpath),
            folder = unicode(folder),
            time = os.path.getmtime(filepath),
            title = meta.get_title(),
            content = meta.get_content(),
            tags = meta.get_tags()
            #FIXME add more EXIF data here
        )
        if commit:
            self.writer.commit()
            self.writer = None

    def search(self, query):
        """ parse the given query, execute a search and return the results """
        searcher = self.index.searcher()
        mparser = whoosh.qparser.MultifieldParser(
            ["title", "content", "tags"], schema=self.get_schema())
        # NOTE(review): the searcher is intentionally left open - closing it
        # here would invalidate the returned results
        return searcher.search(mparser.parse(query))

    def tagcloud(self):
        """ Return the top 100 keywords from the index with their weighted
            importance (9 = most common, 0 = rarest) as dictionary
        """
        reader = self.index.reader()
        # most_frequent_terms may return a generator; materialize so the
        # first/last elements can be indexed
        tags = list(reader.most_frequent_terms('tags', 100))
        if not tags:
            # empty index: no tags, no cloud
            return {}
        maxi = tags[0][0]   # highest term frequency
        mini = tags[-1][0]  # lowest term frequency
        # calculate geometric thresholds between mini and maxi
        # BUGFIX: the old code used integer division (i/levels == 0 for
        # i < levels) and built len(tags) thresholds instead of `levels`,
        # which produced weights far outside 0..9
        levels = 9
        tresholds = [pow(maxi - mini + 1, i / float(levels)) + mini - 1
                     for i in range(levels)]
        # assign weights: first threshold that covers the count wins
        weighted = {}
        for (cnt, tag) in tags:
            for i, limit in enumerate(tresholds):
                if cnt <= limit:
                    weighted[tag] = i
                    break
            else:
                # BUGFIX: the old code assigned `levels` unconditionally
                # after the loop, clobbering the weight set before `break`;
                # for/else only assigns it when no threshold matched
                weighted[tag] = levels
        return weighted