-
Notifications
You must be signed in to change notification settings - Fork 0
/
index_search.py
60 lines (47 loc) · 2.51 KB
/
index_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from collections import defaultdict # https://docs.python.org/2/library/collections.html
from words import get_text, words
def create_index(files):
    """
    Build an inverted index over the given files.

    files is a list of fully-qualified filenames. A document ID is simply a
    file's position in that list (counting from 0). Each file's text is
    obtained with get_text() and broken into words with words().

    Return a dict object mapping every word seen to the set (not a list) of
    document IDs whose file contains that word.
    """
    postings = defaultdict(set)  # word -> set of doc IDs, created on first use
    for doc_id, path in enumerate(files):
        for token in words(get_text(path)):
            postings[token].add(doc_id)
    # Switch the factory off before handing the index out, so that looking up
    # a word that never occurred raises KeyError instead of silently
    # inserting an empty set.
    postings.default_factory = None
    return postings
def index_search(files, index, terms):
    """
    Return the filenames whose contents include every word in terms.

    Parameters:
        files: list of fully-qualified filenames; a document ID is a position
            in this list.
        index: dict mapping a word to the set of document IDs containing it.
        terms: list of strings to look up as keys of index (they are used
            as given, so they should already be normalized like the keys).

    Only the index is consulted; no file is opened.

    Returns a list holding each matching filename once, in the order the
    names appear in files, so the same query always yields the same ordering.
    If any term is absent from the index the result is []. With an empty
    terms list every file matches.
    """
    candidates = set(files)  # filenames still matching every term seen so far
    for term in terms:
        if term not in index:  # a word no document contains: nothing can match
            return []
        # Translate the term's doc IDs back into filenames and narrow down.
        candidates &= {files[doc_id] for doc_id in index[term]}
        if not candidates:  # already empty; further terms cannot add files
            return []
    # Emit survivors in files order rather than arbitrary set order. Removing
    # each name as it is emitted keeps a repeated filename from appearing twice.
    matches = []
    for fname in files:
        if fname in candidates:
            candidates.discard(fname)
            matches.append(fname)
    return matches
if __name__ == '__main__':
    # Manual smoke run: index three sample articles and report how many
    # distinct words ended up in the index.
    dct = create_index([
        '/Users/seanlin/data/slate/51/ArticleIP_38825.txt',
        '/Users/seanlin/data/slate/50/ArticleIP_27730.txt',
        '/Users/seanlin/data/slate/5/Article247_604.txt',
    ])
    word_count = len(dct)
    print(word_count)