def __init__(self, entry_point='http://learnair.media.mit.edu:8000/',
             crawl_delay=1000, filter_keywords=['previous','next']):
    #entry_point = starting URL for the crawl
    #crawl_delay = how long, in ms, to wait before accessing/crawling a new resource
    #filter_keywords = additional link words to filter out while crawling

    self.entry_point = entry_point #entry point URI

    #initialize crawl variables
    self.current_uri = entry_point #keep track of current location
    self.current_uri_type = 'entry_point'
    self.crawl_delay = crawl_delay #in milliseconds
    self.degrees = 0
    self.return_if_found = False
    self.createform_type = None
    self.found_resources = TimeDecaySet(0)

    #initialize filter word list for crawling
    self.filter_keywords = ['edit','create','self','curies','websocket']
    self.filter_keywords.extend(filter_keywords)
    log.debug("filter keywords %s", self.filter_keywords)

    log.info("-----------------------------------------------")
    log.info("Crawler Initialized.")
    log.info("Entry Point: %s", self.entry_point)
    log.info("-----------------------------------------------")

def query_link_array(self, crawl_links):
    '''Takes a crawl_links array (which has links and types of objects)
    and decides which of these links were queried for.

    Returns a list of URIs that match the query and are not already in the
    set of discovered resources.'''

    if self.qry_resource_type is not None:
        log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
        log.info('SEARCH_LIST: looking for plural as item_list: %s',
                 self.qry_resource_plural)
    if self.qry_resource_title is not None:
        log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)

    matching_uris = []

    #(1) if resource name exists, filter items to get only items that
    #match the singular resource name, AND (things that match the plural
    #resource name && are from_item_list)
    #(2) if title exists, filter items remaining for those that match the title
    for link_item in crawl_links:
        log.debug('SEARCH_LIST: checking if %s matches query criteria',
                  link_item['href'])
        this_link_item_matches = True

        #see if it matches resource_type, if queried for
        if self.qry_resource_type is not None:
            if ((any(link_item['type'].lower() in x for x in self.qry_resource_plural)
                    and link_item['from_item_list'])
                    or (link_item['type'].lower() == self.qry_resource_type)):
                #it does!
                #double check for createForms that the parent type is correct
                if ('createform' == link_item['type'].lower()
                        and self.createform_type is not None):
                    if self.current_uri_type.lower() not in self.createform_type:
                        this_link_item_matches = False
                    else:
                        log.info('SEARCH_LIST: matched search_type %s',
                                 link_item['type'])
                else:
                    log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
            else:
                #it doesn't, but we're searching on resource_type
                this_link_item_matches = False

        #see if it matches resource_title, if queried for
        if self.qry_resource_title is not None:
            if link_item['title'].lower() == self.qry_resource_title:
                #it does!
                log.info('SEARCH_LIST: matched search_title %s', link_item['title'])
            else:
                #it doesn't, but we're searching on resource_title
                this_link_item_matches = False

        #if we made it here and this_link_item_matches, it's a match!
        if this_link_item_matches:
            matching_uris.append(link_item['href'])

    #return list of matching uris
    return matching_uris

def query_link_array(self, crawl_links):
    '''Takes a crawl_links array (which has links and types of objects)
    and decides which of these links were queried for.

    Returns a list of URIs that match the query and are not already in the
    set of discovered resources.'''

    if self.qry_resource_type is not None:
        log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
        log.info('SEARCH_LIST: looking for plural as item_list: %s',
                 self.qry_resource_plural)
    if self.qry_resource_title is not None:
        log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)

    matching_uris = []

    #(1) if resource name exists, filter items to get only items that
    #match the singular resource name, AND (things that match the plural
    #resource name && are from_item_list)
    #(2) if title exists, filter items remaining for those that match the title
    for link_item in crawl_links:
        log.debug('SEARCH_LIST: checking if %s matches query criteria',
                  link_item['href'])
        this_link_item_matches = True

        #see if it matches resource_type, if queried for
        if self.qry_resource_type is not None:
            if ((any(link_item['type'].lower() in x for x in self.qry_resource_plural)
                    and link_item['from_item_list'])
                    or (link_item['type'].lower() == self.qry_resource_type)):
                #it does!
                log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
            else:
                #it doesn't, but we're searching on resource_type
                this_link_item_matches = False

        #see if it matches resource_title, if queried for
        if self.qry_resource_title is not None:
            if link_item['title'].lower() == self.qry_resource_title:
                #it does!
                log.info('SEARCH_LIST: matched search_title %s', link_item['title'])
            else:
                #it doesn't, but we're searching on resource_title
                this_link_item_matches = False

        #if we made it here and this_link_item_matches, it's a match!
        if this_link_item_matches:
            matching_uris.append(link_item['href'])

    #return list of matching uris
    return matching_uris

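#Illustrative sketch (not from the original source; all literal values below
#are invented): the shape of the crawl_links entries query_link_array expects
#and the query attributes it reads when matching.
#
#   self.qry_resource_type = 'site'        #singular resource name queried for
#   self.qry_resource_plural = ['sites']   #plural names, matched via item lists
#   self.qry_resource_title = None         #no title filter in this example
#
#   crawl_links = [
#       {'href': 'http://learnair.media.mit.edu:8000/sites/1', 'type': 'site',
#        'title': 'MIT Media Lab', 'in_cache': False, 'from_item_list': True},
#       {'href': 'http://learnair.media.mit.edu:8000/devices/4', 'type': 'device',
#        'title': 'aircasting unit', 'in_cache': False, 'from_item_list': True},
#   ]
#
#   self.query_link_array(crawl_links)
#   #-> ['http://learnair.media.mit.edu:8000/sites/1']
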
def __init__(self, entry_point='http://learnair.media.mit.edu:8000/',
             cache_table_mask_length=8, track_search_depth=5,
             found_set_persistence=720, crawl_delay=1000,
             filter_keywords=['previous','next']):
    #entry_point = starting URL for the crawl
    #cache_table_mask_length = size parameter (mask length) for the crawler cache hash table
    #track_search_depth = how many steps in the path we save to retrace when at a dead end
    #found_set_persistence = how long, in min, to keep a resource URI in memory
    #   before it is allowed to be returned as a new resource again. 720 = 12
    #   hours before the crawler 'forgets' it has seen something and resubmits it
    #   in the queue to be processed
    #crawl_delay = how long, in ms, to wait before accessing/crawling a new resource
    #filter_keywords = additional link words to filter out while crawling

    self.entry_point = entry_point #entry point URI

    #initialize crawl variables
    self.current_uri = entry_point #keep track of current location
    self.current_uri_type = 'entry_point'
    self.current_uri_title = 'entry_point'
    self.crawl_history = LeakyLIFO(track_search_depth) #keep track of past links
    self.crawl_delay = crawl_delay #in milliseconds
    self.found_resources = TimeDecaySet(found_set_persistence) #in seconds

    #initialize cache
    self.cache = CrawlerCacheWithCollisionHistory(cache_table_mask_length)

    #initialize queue/zmq variables
    self.q = None
    self.zmq = None
    self.find_called = False

    #initialize filter word list for crawling
    self.filter_keywords = ['edit','create','self','curies','websocket']
    self.filter_keywords.extend(filter_keywords)
    log.debug("filter keywords %s", self.filter_keywords)

    log.info("-----------------------------------------------")
    log.info("Crawler Initialized.")
    log.info("Entry Point: %s", self.entry_point)
    log.info("-----------------------------------------------")

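#Usage sketch (an assumption, not part of the original listing): the class
#name 'Crawler' is hypothetical; the keyword arguments mirror this __init__
#signature, with non-default values chosen purely for illustration.
#
#   crawler = Crawler(entry_point='http://learnair.media.mit.edu:8000/',
#                     track_search_depth=10,     #remember 10 steps of history
#                     found_set_persistence=60,  #shorter 'forget' window for found URIs
#                     crawl_delay=500,           #wait 500 ms between requests
#                     filter_keywords=['previous', 'next', 'parent'])
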
def apply_hal_curies(json, del_curies=True):
    '''Find and apply CURIES relationship shortcuts (namespace/rel
    definitions) to the other links in the json object.

    I.e., if we have a CURIES "http://learnair.media.mit.edu/rels/{rel}"
    with name "ch", and a link relation further on called 'ch:sites', remove
    the CURIES part of the key and apply it so that 'ch:sites' becomes
    "http://learnair.media.mit.edu/rels/sites". del_curies tells this
    function whether to remove the CURIES section of _links after applying
    it to the document (True), or whether to leave it in (False).'''

    try:
        curies = json['_links']['curies'] #find the curies

        for curie in curies: #compare each curies name...
            #...with each link relationship (copy keys; we mutate the dict below)
            for key in list(json['_links']):
                #if we find a link relation that uses the curies
                if key.startswith(curie['name'] + ':'):
                    #combine the curies & key to make the full resource link
                    newIndex = curie['href']
                    replaceString = key.split(curie['name'] + ':', 1)[1]
                    newIndex = re.sub(r"\{.*\}", replaceString, newIndex)

                    #move the resource to the full resource link
                    json['_links'][newIndex] = json['_links'][key]
                    del json['_links'][key]
                    log.debug('CURIES: %s moved to %s', key, newIndex)

        #delete curies section of json if desired
        if del_curies:
            del json['_links']['curies']
            log.debug('CURIES: CURIES Resource applied fully & removed.')

    #'_links' missing, no 'curies' entry, or '_links' is not a dict
    except (KeyError, TypeError):
        log.warn("CURIES: No CURIES found")
        json['_links'] = {}

    return json

def apply_hal_curies(json, del_curies=True):
    '''Find and apply CURIES relationship shortcuts (namespace/rel
    definitions) to the other links in the json object.

    I.e., if we have a CURIES "http://learnair.media.mit.edu/rels/{rel}"
    with name "ch", and a link relation further on called 'ch:sites', remove
    the CURIES part of the key and apply it so that 'ch:sites' becomes
    "http://learnair.media.mit.edu/rels/sites". del_curies tells this
    function whether to remove the CURIES section of _links after applying
    it to the document (True), or whether to leave it in (False).'''

    try:
        curies = json['_links']['curies'] #find the curies

        for curie in curies: #compare each curies name...
            #...with each link relationship (copy keys; we mutate the dict below)
            for key in list(json['_links']):
                #if we find a link relation that uses the curies
                if key.startswith(curie['name'] + ':'):
                    #combine the curies & key to make the full resource link
                    newIndex = curie['href']
                    replaceString = key.split(curie['name'] + ':', 1)[1]
                    newIndex = re.sub(r"\{.*\}", replaceString, newIndex)

                    #move the resource to the full resource link
                    json['_links'][newIndex] = json['_links'][key]
                    del json['_links'][key]
                    log.debug('CURIES: %s moved to %s', key, newIndex)

        #delete curies section of json if desired
        if del_curies:
            del json['_links']['curies']
            log.debug('CURIES: CURIES Resource applied fully & removed.')

    #'_links' missing or no 'curies' entry to apply
    except KeyError:
        log.warn("CURIES: No CURIES found")

    return json

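#Worked example (illustrative; the values are invented, based on the docstring
#above): what apply_hal_curies does to a HAL '_links' section. Before the call:
#
#   { '_links': {
#       'curies': [{'name': 'ch',
#                   'href': 'http://learnair.media.mit.edu/rels/{rel}',
#                   'templated': True}],
#       'ch:sites': {'href': '/sites'},
#       'self':     {'href': '/'} } }
#
#After apply_hal_curies(json) with del_curies=True, 'ch:sites' is re-keyed to
#the expanded relation and the 'curies' entry is dropped:
#
#   { '_links': {
#       'http://learnair.media.mit.edu/rels/sites': {'href': '/sites'},
#       'self': {'href': '/'} } }
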
def crawl_node(self):
    #put uri in cache now that we're crawling it, make a note of collisions
    if self.cache.put_and_collision(self.current_uri):
        log.info('HASH COLLISION: value overwritten in hash table.')

    #debug: print state of cache after updating
    log.debug('CACHE STATE: %s', self.cache._cache)

    #download the current resource
    try:
        req = requests.get(self.current_uri)
        log.info('%s downloaded.', self.current_uri)

    #downloading the current resource failed
    except requests.exceptions.ConnectionError:
        log.warn('URI "%s" unresponsive, moving back to previous link...',
                 self.current_uri)

        #if we failed to download the entry point, give up
        if self.current_uri == self.entry_point:
            log.error('URI is entry point, no previous link. Try again when'
                      ' the entry point URI is available.')
            return False

        #if it wasn't the entry point, go back in our search history
        try:
            prev = self.crawl_history.pop()
            self.current_uri = prev['href']
            self.current_uri_type = prev['type']
            self.current_uri_title = prev['title']
            return True

        #if we don't have any history left, go back to the entry point
        except:
            log.info('exhausted depth of search history, back to entry point')
            self.current_uri = self.entry_point
            self.current_uri_type = 'entry_point'
            self.current_uri_title = 'entry_point'
            return True
    #end downloading resource

    #put request in JSON form, apply CURIES, get links
    resource_json = req.json()
    log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
    req_links = self.apply_hal_curies(resource_json)['_links']
    crawl_links = self.get_external_links(req_links)

    #crawl_links is a 'flat' list list[:][fields]
    #fields are href, type, title, in_cache, from_item_list
    log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, '
              'self, create/edit, ws, itemlist flattened): %s', crawl_links)

    #find the uris/resources that match the search criteria!
    if self.qry_extra is None:
        #we don't need to actually download the link to see if it matches
        matching_uris = self.query_link_array(crawl_links)
    else:
        #we only have enough information to tell if the current node matches
        matching_uris = self.query_current_node(resource_json)

    #... and send them out!!
    if self.push_uris_to_queue(matching_uris) and self.find_called:
        return False #end crawl if we found one and 'find' was called

    #select next link!!!!
    #get uncached links
    uncached_links = [x for x in crawl_links if not x['in_cache']]
    log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND',
             len(uncached_links), len(crawl_links))

    if len(uncached_links) > 0:
        #we have uncached link(s) to follow! randomly pick one.
        random_index = random.randrange(0, len(uncached_links))
        self.crawl_history.push({'href': self.current_uri,
                                 'type': self.current_uri_type,
                                 'title': self.current_uri_title})
        self.current_uri = uncached_links[random_index]['href']
        self.current_uri_type = uncached_links[random_index]['type']
        self.current_uri_title = uncached_links[random_index]['title']

    else:
        #we don't have any uncached options from this node. Damn.
        log.info('CRAWL: no new links available here, crawling back up history')

        #special case of being at the entry point
        if self.current_uri_type == 'entry_point':
            #double check we have something to crawl
            if len(crawl_links) > 0:
                log.info('CRAWL: no uncached links from entrypoint, resetting cache')
                self.cache.clear() #clear cache

                #randomly select node from crawl_links
                random_index = random.randrange(0, len(crawl_links))
                self.crawl_history.push({'href': self.current_uri,
                                         'type': self.current_uri_type,
                                         'title': self.current_uri_title})
                self.current_uri = crawl_links[random_index]['href']
                self.current_uri_type = crawl_links[random_index]['type']
                self.current_uri_title = crawl_links[random_index]['title']
            else:
                log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
                return False

        else:
            #not at entry point, time to try to move back up in history
            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']
            except:
                #no history left, not at entry point- jump to entry point
                log.info('CRAWL: crawling back up history, but exhausted history. '
                         'Jump to entrypoint.')
                self.current_uri = self.entry_point
                self.current_uri_type = 'entry_point'
                self.current_uri_title = 'entry_point'

    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(),
             self.current_uri)
    log.info('CRAWL: type: %s', self.current_uri_type)
    log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

    #recurse
    return True

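#Driver sketch (an assumption, not part of the original listing; the method
#name 'crawl' is hypothetical): crawl_node() returns True to keep crawling and
#False to stop, so a simple runner could call it in a loop, honoring
#crawl_delay between requests.
#
#   def crawl(self):
#       while self.crawl_node():
#           time.sleep(self.crawl_delay / 1000.0)   #crawl_delay is in ms
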
def bfs(self):
    current_depth = 0
    visited = set()
    link_tree = [[] for k in range(self.degrees)]

    while True:
        time.sleep(self.crawl_delay / 1000.0)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info('%s downloaded.', self.current_uri)

            #put request in JSON form, apply CURIES, get links
            resource_json = req.json()
            log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

        #downloading the current resource failed
        except requests.exceptions.ConnectionError:
            log.warn('URI "%s" unresponsive, ignoring', self.current_uri)
            resource_json = {'_links': []}

            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error('URI is entry point, no previous link. Try again when'
                          ' the entry point URI is available.')
                return
        #end downloading resource

        #get links from this resource
        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.flatten_filter_link_array(req_links)

        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list
        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, '
                  'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria!
        matching_uris = self.query_link_array(crawl_links)

        #... and send them out!!
        if self.push_uris_to_queue(matching_uris) and self.return_if_found:
            return #return if we are using find_first and we found one

        #push all uris that haven't been visited into the proper depth list
        visited.add(self.current_uri)
        if current_depth < self.degrees:
            for x in crawl_links:
                if x['href'] not in visited:
                    link_tree[current_depth].append(x)
        log.debug('BFS Array: %s', link_tree)
        log.debug('VISITED: %s', visited)

        #select next current_uri and current_uri_type by looking through
        #link_tree; if it is empty, we are finished
        finished = True
        for index in range(len(link_tree)):
            if len(link_tree[index]):
                self.current_uri = link_tree[index][0]['href']
                self.current_uri_type = link_tree[index][0]['type']
                del link_tree[index][0]
                current_depth = index + 1
                finished = False
                break
        if finished:
            return

        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: moving to %s', self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.info('CRAWL: depth: %s', current_depth)
        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

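#Traversal illustration (invented example, based on the code above): with
#self.degrees = 2, link_tree holds one frontier list per depth, and the next
#URI is always taken from the shallowest non-empty list, so every depth-1
#resource is crawled before any depth-2 resource (breadth-first order).
#
#   link_tree = [ [<links found on the entry point>],       #depth-1 frontier
#                 [<links found on depth-1 resources>] ]    #depth-2 frontier
#
#Resources at depth self.degrees are still downloaded and queried, but their
#outgoing links are not appended (current_depth < self.degrees fails), which
#bounds the search to the requested number of degrees.
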