def searchPage():
    query = request.GET.get('query', '').strip()
    print query
    if query:
        db = db_lib.sql3()
        results = db.doSearch(query)
    else:
        results = None
    s = template('main', results=results)
    return s
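# A minimal sketch of how searchPage might be exposed over HTTP, assuming the
# Bottle framework (request.GET and template() above are Bottle idioms); the
# route path, host, and port below are illustrative assumptions only.
from bottle import route, run

route('/')(searchPage)   # register the handler at the site root

if __name__ == '__main__':
    run(host='localhost', port=8080, debug=True)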
def setUp(self):
    self.db = db_lib.sql3({'clean': True, 'location': 'test.db'})
def __init__(self, url_file):
    """Initialize the crawler with a connection to the database to populate
    and with the file containing the list of seed URLs to begin indexing."""
    # Store index data directly in the database instead of a memory cache.
    # Reasons:
    #   - Not guaranteed enough memory for very large indexes.
    #   - The cache would have to be rebuilt every time the crawler runs;
    #     the crawler should be updated, not rewritten.
    # End result: slightly slower due to INSERT-if-not-exists SQL commands.
    self._url_queue = []

    # functions to call when entering and exiting specific tags
    self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
    self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

    # add a link to our graph, and indexing info to the related page
    self._enter['a'] = self._visit_a

    # record the currently indexed document's title and increase
    # the font size
    def visit_title(*args, **kargs):
        self._visit_title(*args, **kargs)
        self._increase_font_factor(7)(*args, **kargs)

    # increase the font size when we enter these tags
    self._enter['b'] = self._increase_font_factor(2)
    self._enter['strong'] = self._increase_font_factor(2)
    self._enter['i'] = self._increase_font_factor(1)
    self._enter['em'] = self._increase_font_factor(1)
    self._enter['h1'] = self._increase_font_factor(7)
    self._enter['h2'] = self._increase_font_factor(6)
    self._enter['h3'] = self._increase_font_factor(5)
    self._enter['h4'] = self._increase_font_factor(4)
    self._enter['h5'] = self._increase_font_factor(3)
    self._enter['title'] = visit_title

    # decrease the font size when we exit these tags
    self._exit['b'] = self._increase_font_factor(-2)
    self._exit['strong'] = self._increase_font_factor(-2)
    self._exit['i'] = self._increase_font_factor(-1)
    self._exit['em'] = self._increase_font_factor(-1)
    self._exit['h1'] = self._increase_font_factor(-7)
    self._exit['h2'] = self._increase_font_factor(-6)
    self._exit['h3'] = self._increase_font_factor(-5)
    self._exit['h4'] = self._increase_font_factor(-4)
    self._exit['h5'] = self._increase_font_factor(-3)
    self._exit['title'] = self._increase_font_factor(-7)

    # never go in and parse these tags
    self._ignored_tags = set([
        'meta', 'script', 'link', 'embed', 'iframe', 'frame',
        'noscript', 'object', 'svg', 'canvas', 'applet', 'frameset',
        'textarea', 'style', 'area', 'map', 'base', 'basefont', 'param',
    ])

    # set of words to ignore
    self._ignored_words = set([
        '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'and', 'or',
    ])

    # keep track of some info about the page we are currently parsing
    self._curr_depth = 0
    self._curr_url = ""
    self._font_size = 0

    # get all urls into the queue
    try:
        with open(url_file, 'r') as f:
            for line in f:
                self._url_queue.append((self._fix_url(line.strip(), ""), 0))
    except IOError:
        pass

    self.db = db_lib.sql3({'clean': True, 'location': 'test.db'})
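# A minimal usage sketch, assuming this __init__ belongs to the project's
# crawler class and that 'urls.txt' lists one seed URL per line; the class
# name, file name, and crawl() driver call are assumptions for illustration.
if __name__ == '__main__':
    bot = crawler('urls.txt')
    bot.crawl()   # hypothetical driver that works through self._url_queue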
    incoming_link_sets = defaultdict(set)
    incoming_links = defaultdict(lambda: np.array([]))
    damping_factor = 0.85

    # collect the number of outbound links and the set of all incoming
    # documents for every document
    for (from_id, to_id) in links:
        num_outgoing_links[int(from_id)] += 1.0
        incoming_link_sets[to_id].add(int(from_id))

    # convert each set of incoming links into a numpy array
    for doc_id in incoming_link_sets:
        incoming_links[doc_id] = np.array(
            [from_doc_id for from_doc_id in incoming_link_sets[doc_id]])

    num_documents = float(len(num_outgoing_links))
    lead = (1.0 - damping_factor) / num_documents
    partial_PR = np.vectorize(
        lambda doc_id: page_rank[doc_id] / num_outgoing_links[doc_id])

    for _ in xrange(num_iterations):
        for doc_id in num_outgoing_links:
            tail = 0.0
            if len(incoming_links[doc_id]):
                tail = damping_factor * partial_PR(incoming_links[doc_id]).sum()
            page_rank[doc_id] = lead + tail

    return page_rank


if __name__ == "__main__":
    db = db_lib.sql3()
    # the accessor name below is assumed; page_rank() expects an iterable of
    # (from_id, to_id) document-id pairs pulled from the crawled link graph
    ranks = page_rank(db.get_links())
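    # A hand-checkable sanity test on a made-up three-document graph; the
    # literal doc ids below are illustrative assumptions, not crawled data.
    # Each final score is (1 - 0.85)/N plus 0.85 times the sum of
    # PR(incoming)/outgoing_count(incoming), matching the loop above.
    toy_links = [(1, 2), (1, 3), (2, 3), (3, 1)]
    toy_ranks = page_rank(toy_links)
    for doc_id in sorted(toy_ranks):
        print doc_id, toy_ranks[doc_id]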