-
Notifications
You must be signed in to change notification settings - Fork 0
/
notebook_recommender.py
162 lines (124 loc) · 4.51 KB
/
notebook_recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import sys
import json
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh import scoring
import os, os.path
#####################################
# Create the schema
#####################################
# One indexed document per notebook cell.  `filename` and `cell_no` are
# declared with stored=True so they can be read back from a search hit;
# `content` is run through a stemming analyzer and is not given stored=True.
schema = Schema(filename=ID(stored=True),
                cell_no=TEXT(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer())
                )
#####################################
# Create the index and initialize a `writer`
#####################################
# index.create_in does not create the directory itself, so make sure it is
# there first; exist_ok keeps re-runs from failing.
os.makedirs("notebooks", exist_ok=True)
# Note, this clears the existing index in the directory
ix = index.create_in("notebooks", schema)
# Get a writer from the created index
writer = ix.writer()
def visibleTextFromNB(filename):
    '''
    This function pulls all the non-output visible cells from
    a Jupyter notebook and returns the text of each one.

    The file is read as UTF-8 JSON.  Cells whose `cell_type` is
    'code', 'markdown' or 'raw' contribute one string each, built by
    concatenating the entries of the cell's `source`; cells of any
    other type are skipped.

    Parameters : filename - path of the notebook file to read
    Returns : a list of strings, one per visible cell, in file order;
              an empty list when the notebook has no 'cells' entry
    Raises : whatever open() / json.load() raise for an unreadable or
             malformed file; KeyError when a cell has no 'cell_type',
             or a visible cell has no 'source'
    '''
    visible_types = ('code', 'markdown', 'raw')
    #####################################
    # Parse file, pull cells
    #####################################
    # The context manager closes the file even if parsing fails.
    with open(filename, encoding="utf-8") as nb_file:
        file_data = json.load(nb_file)
    # File data is now a map, recall a JSON format is a combo of dictionaries and lists
    cells = file_data.get('cells')
    if cells is None:
        return []
    #####################################
    # Append cells into a list of cells
    #####################################
    cell_list = []
    for c in cells:
        if c['cell_type'] in visible_types:
            # `source` is normally a list of lines; join also leaves a
            # plain string unchanged.
            cell_list.append("".join(c['source']))
    # return the list
    return cell_list
#End of function: visibleTextFromNB
def loadFile(writer, fname):
    '''
    Index every visible cell of one notebook.

    The cell texts come from visibleTextFromNB(fname).  Each text is
    added to `writer` as its own document, carrying the notebook path,
    the cell's 1-based position in that list (as a string) and the
    cell text itself.
    '''
    for position, text in enumerate(visibleTextFromNB(fname), start=1):
        writer.add_document(filename=fname, cell_no=str(position), content=text)
# END of function
def walkFolder(writer, folder):
    '''
    Walk `folder` recursively and index each notebook found.

    Sub-folders are pruned (never descended into) when their name
    starts with ".", equals 'share', 'jupyter', 'runtime', 'PSDS2120'
    or 'extracted', or contains "collection" or "grading".
    Every file ending in ".ipynb" is printed and passed to loadFile,
    except files ending in "-checkpoint.ipynb", which are ignored.
    '''
    skipped_names = ('share', 'jupyter', 'runtime', 'PSDS2120', 'extracted')
    skipped_fragments = ('collection', 'grading')

    def keep(name):
        # True when the walk should descend into this sub-folder.
        if name.startswith(".") or name in skipped_names:
            return False
        return not any(fragment in name for fragment in skipped_fragments)

    for root, dirs, files in os.walk(folder):
        # Slice assignment edits the list os.walk itself holds, which is
        # what makes it skip the removed sub-folders.
        dirs[:] = [d for d in dirs if keep(d)]
        #####################################
        # Process Files
        #####################################
        for file in files:
            if file.endswith("-checkpoint.ipynb") or not file.endswith(".ipynb"):
                continue
            filename = os.path.join(root, file)
            print('Found Notebook:', filename)
            loadFile(writer, filename)
############# END for walkFolder
# Index every notebook below the hard-coded root folder.  The path is a raw
# string because "\D" is not a valid escape sequence; the value is unchanged.
walkFolder(writer, r"C:\DSA")
# Commit changes
writer.commit()  # save changes
# Read the query text from the user
qstr = input("Input a query: ")
print("searching for ", qstr)
####################################
# Build query parser and parse query
####################################
# Terms without an explicit field prefix are matched against `content`.
qp = QueryParser("content", schema=ix.schema)
q = qp.parse(qstr)
print(q)
####################################
# Search the content field
####################################
# Hits are read inside the `with` block, while the searcher is still open.
# search() is called without a limit, so the library default applies.
with ix.searcher(weighting=scoring.TF_IDF()) as s:
    results = s.search(q)
    for hit in results:
        print("Cell {} of Notebook '{}'".format(hit['cell_no'], hit['filename']))