Example #1
    def query_link_array(self, crawl_links):
        '''takes a crawl_link array (which has links and types of objects)
        and decides which of these links were queried for. Returns a list of
        URIs for matching resources that are not already in the discovered set'''

        if self.qry_resource_type is not None:
            log.info('SEARCH_LIST: looking for singular: %s',
                     self.qry_resource_type)
            log.info('SEARCH_LIST: looking for plural as item_list: %s',
                     self.qry_resource_plural)
        if self.qry_resource_title is not None:
            log.info('SEARCH_LIST: looking for title: %s',
                     self.qry_resource_title)

        matching_uris = []

        #(1) if resource name exists, filter items to get only items that
        #match the singular resource name, AND (things that match the plural
        #resource name && are from_item_list)
        #(2) if title exists, filter items remaining for those that match the title

        for link_item in crawl_links:

            log.debug('SEARCH_LIST: checking if %s matches query criteria',
                      link_item['href'])
            this_link_item_matches = True

            #see if it matches resource_type, if queried for
            if self.qry_resource_type is not None:
                if ((any(link_item['type'].lower() in x for x in self.qry_resource_plural) and link_item['from_item_list']) \
                        or (link_item['type'].lower() == self.qry_resource_type)):
                    #it does!
                    log.info('SEARCH_LIST: matched search_type %s',
                             link_item['type'])
                else:
                    #it doesn't, but we're searching on resource_type
                    this_link_item_matches = False

            #see if it matches resource_title, if queried for
            if self.qry_resource_title is not None:
                if (link_item['title'].lower() == self.qry_resource_title):
                    #it does!
                    log.info('SEARCH_LIST: matched search_title %s',
                             link_item['title'])
                else:
                    #it doesn't, but we're searching on resource_title
                    this_link_item_matches = False

            #if we made it to here and this_link_item_matches, it's a match!
            if this_link_item_matches:
                matching_uris.append(link_item['href'])

        #return list of matching uris
        return matching_uris
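
A minimal standalone sketch of how the matching rules above combine: a link matches when its type equals the singular query type, or its type appears in one of the plural forms and it came from an item list; a title query is ANDed on top. The link dicts and example.org URIs below are illustrative, not from the crawled API.

qry_resource_type = 'device'
qry_resource_plural = ['devices', 'devicees']   # naive 's'/'es' pluralization
qry_resource_title = None

crawl_links = [
    {'href': 'http://example.org/devices/1', 'type': 'device',
     'title': 'outdoor sensor', 'from_item_list': False},
    {'href': 'http://example.org/devices', 'type': 'devices',
     'title': 'device list', 'from_item_list': True},
    {'href': 'http://example.org/sites/1', 'type': 'site',
     'title': 'rooftop', 'from_item_list': False},
]

matching_uris = []
for link in crawl_links:
    type_ok = (link['type'].lower() == qry_resource_type or
               (any(link['type'].lower() in p for p in qry_resource_plural)
                and link['from_item_list']))
    title_ok = (qry_resource_title is None or
                link['title'].lower() == qry_resource_title)
    if type_ok and title_ok:
        matching_uris.append(link['href'])

print(matching_uris)  # ['http://example.org/devices/1', 'http://example.org/devices']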
Example #2
    def __init__(self, entry_point='http://learnair.media.mit.edu:8000/', \
            crawl_delay=1000, filter_keywords=['previous','next']):
        #entry_point = starting URL for crawl
        #search_depth = how many steps in path we save to retrace when at a dead end
        #found_set_persistence = how long, in min,  to keep a resource URI in memory
        #       before it is allowed to be returned as a new resource again.  720= 12
        #       hours before crawler 'forgets' it has seen something and resubmits it
        #       in the queue to be processed
        #crawl_delay = how long, in ms, before accessing/crawling a new resource

        self.entry_point = entry_point #entry point URI

        #initialize crawl variables
        self.current_uri = entry_point #keep track of current location
        self.current_uri_type = 'entry_point'
        self.crawl_delay = crawl_delay #in milliseconds
        self.degrees = 0
        self.return_if_found = False
        self.createform_type = None

        self.found_resources = TimeDecaySet(0)

        #initialize filter word list for crawling
        self.filter_keywords = ['edit','create','self','curies','websocket']
        self.filter_keywords.extend(filter_keywords)
        log.debug( "filter keywords %s", self.filter_keywords)

        log.info( "-----------------------------------------------" )
        log.info( "Crawler Initialized." )
        log.info( "Entry Point: %s", self.entry_point )
        log.info( "-----------------------------------------------" )
Example #3
    def __init__(self, mask_length=8):
        '''initializes fixed size hash table (2^mask_length entries), preallocates
        using C for speed and size.  Each stored value in the table is a cityHash64
        value (64 bits), so the hash table can support (theoretically) up to 2^64
        entries.  Defaults to 2^8 entries (256 entries).  The assumption is that
        with a uniform probability distribution, hash collisions while crawling a
        local area of the internet are unlikely.  Instead of storing a linked list
        at each hash table index, we will only store the most recent hash value.
        This may cause us to re-crawl websites, but again it should be fine for
        keeping the crawler from local loops or hyper-local crawl behavior.

        Values are stored based on a bitmask over the 64 bit hash.

        ex:  'http://test.com' hashes to '0x1234567887654321', and the cache table
        size is 2^8, or 256, so we apply an 8 bit mask of 0xff (& 255) to the hash.
        This gives us hashtable[0x21] = 0x1234567887654321.'''

        log.info( "-----------------------------------------------" )
        log.info( "---- Setting up cache ----" )

        self._cache_table_mask_length = mask_length
        self._cache_mask = (2**self._cache_table_mask_length) - 1
        self._cache = array.array('L', (0 for i in range(self._cache_mask + 1)))

        if (self._cache.itemsize < 8):
            log.error("Cache Item Size is too small to represent 64 bit CityHash Value")
            raise TypeError("Cache Item Size is too small to represent 64 bit CityHash Value")

        log.info('cache length = %s, size = %s kB, mask = b%s',
                 len(self._cache), (sys.getsizeof(self._cache) / 1000.0),
                 format(self._cache_mask, 'b'))

        log.info( "-----------------------------------------------" )
Example #4
    def query_link_array(self, crawl_links):
        '''takes a crawl_link array (which has links and types of objects)
        and decides which of these links were queried for. Returns a list of
        URIs for matching resources that are not already in the discovered set'''

        if self.qry_resource_type is not None:
            log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
            log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
        if self.qry_resource_title is not None:
            log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)

        matching_uris = []

        #(1) if resource name exists, filter items to get only items that
        #match the singular resource name, AND (things that match the plural
        #resource name && are from_item_list)
        #(2) if title exists, filter items remaining for those that match the title

        for link_item in crawl_links:

            log.debug('SEARCH_LIST: checking if %s matches query criteria', link_item['href'])
            this_link_item_matches = True

            #see if it matches resource_type, if queried for
            if self.qry_resource_type is not None:
                if ((any(link_item['type'].lower() in x for x in self.qry_resource_plural) and link_item['from_item_list']) \
                        or (link_item['type'].lower() == self.qry_resource_type)):
                    #it does!
                    log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
                else:
                    #it doesn't, but we're searching on resource_type
                    this_link_item_matches = False

            #see if it matches resource_title, if queried for
            if self.qry_resource_title is not None:
                if (link_item['title'].lower() == self.qry_resource_title):
                    #it does!
                    log.info('SEARCH_LIST: matched search_title %s', link_item['title'])
                else:
                    #it doesn't, but we're searching on resource_title
                    this_link_item_matches = False

            #if we made it to here and this_link_item_matches, it's a match!
            if this_link_item_matches:
                matching_uris.append(link_item['href'])

        #return list of matching uris
        return matching_uris
Example #5
    def push_uris_to_queue(self, uris):
        '''check uris against found_resources set, and if they're not there,
        get resource and push URI and resource out to queue'''

        found_one = False
        #self.found_resources
        for uri in uris:
            #if 'add' returns true, it's not in our set yet
            if self.found_resources.add(uri):

                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('New Resource Found!  %s', uri)
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

                found_one = True

        return found_one
Example #6
    def __init__(self, entry_point='http://learnair.media.mit.edu:8000/', \
            cache_table_mask_length=8, track_search_depth=5, \
            found_set_persistence=720, crawl_delay=1000, filter_keywords=['previous','next']):
        #entry_point = starting URL for crawl
        #search_depth = how many steps in path we save to retrace when at a dead end
        #found_set_persistence = how long, in min,  to keep a resource URI in memory
        #       before it is allowed to be returned as a new resource again.  720= 12
        #       hours before crawler 'forgets' it has seen something and resubmits it
        #       in the queue to be processed
        #crawl_delay = how long, in ms, before accessing/crawling a new resource

        self.entry_point = entry_point #entry point URI

        #initialize crawl variables
        self.current_uri = entry_point #keep track of current location
        self.current_uri_type = 'entry_point'
        self.current_uri_title = 'entry_point'
        self.crawl_history = LeakyLIFO(track_search_depth) #keep track of past
        self.crawl_delay = crawl_delay #in milliseconds
        self.found_resources = TimeDecaySet(found_set_persistence) #in seconds

        #initialize cache
        self.cache = CrawlerCacheWithCollisionHistory(cache_table_mask_length)

        #initialize queue/zmq variables
        self.q = None
        self.zmq = None

        self.find_called = False

        #initialize filter word list for crawling
        self.filter_keywords = ['edit','create','self','curies','websocket']
        self.filter_keywords.extend(filter_keywords)
        log.debug( "filter keywords %s", self.filter_keywords)

        log.info( "-----------------------------------------------" )
        log.info( "Crawler Initialized." )
        log.info( "Entry Point: %s", self.entry_point )
        log.info( "-----------------------------------------------" )
Example #7
    def __init__(self, mask_length=8):
        '''initializes fixed size hash table (2^mask_length entries), preallocates
        using C for speed and size.  Each stored value in the table is a cityHash64
        value (64 bits), so the hash table can support (theoretically) up to 2^64
        entries.  Defaults to 2^8 entries (256 entries).  The assumption is that
        with a uniform probability distribution, hash collisions while crawling a
        local area of the internet are unlikely.  Instead of storing a linked list
        at each hash table index, we will only store the most recent hash value.
        This may cause us to re-crawl websites, but again it should be fine for
        keeping the crawler from local loops or hyper-local crawl behavior.

        Values are stored based on a bitmask over the 64 bit hash.

        ex:  'http://test.com' hashes to '0x1234567887654321', and the cache table
        size is 2^8, or 256, so we apply an 8 bit mask of 0xff (& 255) to the hash.
        This gives us hashtable[0x21] = 0x1234567887654321.'''

        log.info("-----------------------------------------------")
        log.info("---- Setting up cache ----")

        self._cache_table_mask_length = mask_length
        self._cache_mask = (2**self._cache_table_mask_length) - 1
        self._cache = array.array('L',
                                  (0 for i in range(self._cache_mask + 1)))

        if (self._cache.itemsize < 8):
            log.error(
                "Cache Item Size is too small to represent 64 bit CityHash Value"
            )
            raise TypeError(
                "Cache Item Size is too small to represent 64 bit CityHash Value"
            )

        log.info('cache length = %s, size = %s kB, mask = b%s',
                 len(self._cache), (sys.getsizeof(self._cache) / 1000.0),
                 format(self._cache_mask, 'b'))

        log.info("-----------------------------------------------")
Example #8
    def crawl_node(self):

        #put uri in cache now that we're crawling it, make a note of collisions
        if self.cache.put_and_collision(self.current_uri):
            log.info( 'HASH COLLISION: value overwritten in hash table.' )

        #debug: print state of cache after updating
        log.debug('CACHE STATE: %s', self.cache._cache)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info( '%s downloaded.', self.current_uri )

        #downloading the current resource failed
        except requests.exceptions.ConnectionError:

            log.warn( 'URI "%s" unresponsive, moving back to previous link...',\
                    self.current_uri )

            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error( 'URI is entry point, no previous link.  Try again when' \
                        + ' the entry point URI is available.' )
                return False

            #if it wasn't the entry point, go back in our search history
            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']
                return True

            #if we don't have any history left, go back to the entry point
            except:
                log.info( 'exhausted depth of search history, back to entry point' )
                self.current_uri = self.entry_point
                self.current_uri_type = "entry_point"
                self.current_uri_title = "entry_point"
                return True

        #end downloading resource

        #put request in JSON form, apply CURIES, get links
        resource_json = req.json()
        log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.get_external_links(req_links)

        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list

        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, ' + \
                'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria!
        if self.qry_extra is None:
            #we don't need to actually download the link to see if it matches
            matching_uris = self.query_link_array(crawl_links)
        else:
            #we only have enough information to tell if the current node matches
            matching_uris = self.query_current_node(resource_json)

        #... and send them out!!
        if (self.push_uris_to_queue(matching_uris) and self.find_called):
            return False #end crawl if we found one and 'find' was called

        #select next link!!!!

        #get uncached links
        uncached_links = [x for x in crawl_links if not x['in_cache']]
        log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND', \
                len(uncached_links), len(crawl_links) )

        if (len(uncached_links)>0):
            #we have uncached link(s) to follow! randomly pick one.
            random_index = random.randrange(0,len(uncached_links))

            self.crawl_history.push({'href':self.current_uri, 'type':self.current_uri_type, 'title':self.current_uri_title})
            self.current_uri = uncached_links[random_index]['href']
            self.current_uri_type = uncached_links[random_index]['type']
            self.current_uri_title = uncached_links[random_index]['title']

        else:
            #we don't have any uncached options from this node. Damn.
            log.info('CRAWL: no new links available here, crawling back up history')

            #special case of being at the entry point
            if (self.current_uri_type == 'entry_point'):
                #double check we have something to crawl
                if (len(crawl_links) > 0):

                    log.info('CRAWL: no uncached links from entrypoint, resetting cache')
                    self.cache.clear() # clear cache

                    #randomly select node from crawl_links
                    random_index = random.randrange(0,len(crawl_links))

                    self.crawl_history.push({'href':self.current_uri, 'type':self.current_uri_type, 'title':self.current_uri_title})
                    self.current_uri = crawl_links[random_index]['href']
                    self.current_uri_type = crawl_links[random_index]['type']
                    self.current_uri_title = crawl_links[random_index]['title']

                else:
                    log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
                    return False

            else:
                #not at entry point, time to try and move back up in history
                try:
                    prev = self.crawl_history.pop()
                    self.current_uri = prev['href']
                    self.current_uri_type = prev['type']
                    self.current_uri_title = prev['title']

                except: #no history left, not at entry point- jump to entry point
                    log.info('CRAWL: crawling back up history, but exhausted history.  Jump to entrypoint.')
                    self.current_uri = self.entry_point
                    self.current_uri_type = 'entry_point'
                    self.current_uri_title = 'entry_point'

        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(), self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

        #recurse
        return True
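
A self-contained sketch of the next-link selection above: uncached links are preferred, the current position is pushed onto the history stack, and one uncached link is chosen at random. The plain list stands in for the crawler's LeakyLIFO history, and the link data is illustrative.

import random

crawl_links = [
    {'href': 'http://example.org/devices/1', 'type': 'device',
     'title': 'sensor a', 'in_cache': True},
    {'href': 'http://example.org/devices/2', 'type': 'device',
     'title': 'sensor b', 'in_cache': False},
]
crawl_history = []      # stand-in for the LeakyLIFO used by the crawler
current = {'href': 'http://example.org/', 'type': 'entry_point', 'title': 'entry_point'}

uncached_links = [x for x in crawl_links if not x['in_cache']]
if uncached_links:
    chosen = random.choice(uncached_links)
    crawl_history.append(current)    # remember where we came from
    current = {'href': chosen['href'], 'type': chosen['type'], 'title': chosen['title']}

print(current['href'])  # http://example.org/devices/2 (the only uncached link)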
Example #9
    def crawl(self, namespace="", resource_type=None, \
            plural_resource_type=None, resource_title=None, resource_extra=None):
        '''
        crawl through the chain, pushing each uri/resource that matches the
        passed criteria onto the queue.  If nothing is passed, push all
        resources.

        Can match on resource_type.  If you want a resource list (plural, i.e.
        a list of organization resources rather than a single organization
        resource), you can specify the plural as the resource_type.

        The code assumes the word can be pluralized by adding an 's' or 'es'
        to the end.  If this is not true (e.g. Person -> People), please give
        the plural so the code can recognize when it has found a list of the
        singular resource of interest.

        If looking for a specific resource, this will cross-check against the
        title of the resource.  Title matching is ANDed with the other query
        criteria.
        '''

        #store search criteria in lowercase form, with namespace appended
        #add plural forms +'s', +'es' to list of plural cases to look for

        if resource_type is not None:
            #append namespace
            self.qry_resource_type = namespace + resource_type
            #make all lowercase
            self.qry_resource_type = self.qry_resource_type.lower()
            #'pluralize' resource after adding namespace
            self.qry_resource_plural = self.pluralize_resource_name(self.qry_resource_type)
            #add special pluralization if given by user
            if plural_resource_type is not None:
                self.qry_resource_plural.append(namespace + plural_resource_type)
            #make all plural list items lowercase
            self.qry_resource_plural = [x.lower() for x in self.qry_resource_plural]
        else:
            #not searching on resource_type, just define qry_resource_type as None
            self.qry_resource_type = None

        if resource_title is not None:
            #make all lowercase
            self.qry_resource_title = resource_title.lower()
        else:
            #not searching on title, just define qry_resource_title as None
            self.qry_resource_title = None

        if resource_extra is not None:
            self.qry_extra = resource_extra
        else:
            self.qry_extra = None

        #end initializing query variables

        loop_count=0

        #keep calling crawl_node, unless it returns false, with a pause between
        while(self.crawl_node()):

            #delay for crawl_delay ms between calls
            time.sleep(self.crawl_delay/1000.0)

            #count loop iterations
            loop_count = loop_count + 1
            log.info( "MAIN CRAWL LOOP ITERATION %s -----------------", loop_count )

        log.info( "--- crawling ended, %s pages crawled ---", loop_count )

        return self.found_resources
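
The pluralize_resource_name helper referenced above is not shown in these examples; a minimal sketch consistent with the docstring (just append 's' and 'es') could look like the following. Irregular plurals are covered by passing plural_resource_type explicitly, and the 'ch:' namespace below is illustrative.

def pluralize_resource_name(name):
    '''return the candidate plural forms of a singular resource name'''
    return [name + 's', name + 'es']

print(pluralize_resource_name('ch:device'))   # ['ch:devices', 'ch:devicees']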
Example #10
    def push_uris_to_queue(self, uris):
        '''check uris against found_resources set, and if they're not there,
        get resource and push URI and resource out to queue'''
        #self.found_resources

        found_one = False

        for uri in uris:
            #if 'add' returns true, it's not in our set yet
            if self.found_resources.add(uri):

                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('New Resource Found!  %s', uri)
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

                found_one = True

                #push uri and resource to queue!
                if isinstance(self.q, Queue.Queue):
                    log.info('QUEUE: Pushing to queue')
                    self.q.put(uri)
                elif self.zmq is not None:
                    log.info('QUEUE: Pushing to ZMQ socket')
                    self.zmq.send_string(uri)
                else:
                    log.warn('QUEUE: Queue and ZMQ Socket undefined')

        return found_one
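
A sketch of the two sinks the method above can push to: a plain Queue.Queue, or a ZeroMQ socket exposing send_string. The attribute names q and zmq match the code above; wiring a PUSH/PULL pair over inproc in one process is only for illustration.

import Queue   # `queue` in Python 3; these examples are Python 2
import zmq

# in-process queue sink
q = Queue.Queue()
q.put('http://example.org/devices/1')
print(q.get())

# ZeroMQ sink: bind a PULL end first so send_string on the PUSH end won't block
context = zmq.Context()
pull = context.socket(zmq.PULL)
pull.bind('inproc://found_uris')
push = context.socket(zmq.PUSH)
push.connect('inproc://found_uris')
push.send_string('http://example.org/devices/1')
print(pull.recv_string())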
Example #11
    def query_current_node(self, json):

        matching_uris = []

        if self.qry_resource_type is not None:
            log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
        if self.qry_resource_title is not None:
            log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)
        if self.qry_extra is not None:
            log.info('SEARCH_LIST: looking for %s', self.qry_extra)

        this_link_item_matches = True

        if self.qry_resource_type is not None:
            if (any(self.current_uri_type.lower() in x for x in self.qry_resource_plural) \
                    or self.current_uri_type.lower() == self.qry_resource_type):
                #it does!
                log.info('SEARCH_LIST: matched search_type %s', self.current_uri_type)
            else:
                #it doesn't, but we're searching on resource_type
                this_link_item_matches = False

        #see if it matches resource_title, if queried for
        if self.qry_resource_title is not None:
            if (self.current_uri_title.lower() == self.qry_resource_title):
                #it does!
                log.info('SEARCH_LIST: matched search_title %s', self.current_uri_title)
            else:
                #it doesn't, but we're searching on resource_title
                this_link_item_matches = False

        if self.qry_extra is not None:
            for key, val in self.qry_extra.iteritems():
                try:
                    actual_val = json[key]
                    if actual_val == val:
                        log.info('SEARCH_LIST: matched search_extra %s: %s', key, val)
                    else:
                        this_link_item_matches = False
                except:
                    this_link_item_matches = False

        #if we made it to here and this_link_item_matches, it's a match!
        if this_link_item_matches:
            matching_uris.append(self.current_uri)

        #return list of matching uris
        return matching_uris
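
A small sketch of the qry_extra check above: every queried key/value pair must be present and equal in the resource's JSON body. The dictionaries here are illustrative.

qry_extra = {'status': 'active', 'owner': 'learnair'}
resource_json = {'status': 'active', 'owner': 'learnair', 'name': 'node-7'}

matches = True
for key, val in qry_extra.items():   # .iteritems() in the Python 2 code above
    if resource_json.get(key) != val:
        matches = False
        break

print(matches)   # True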
Example #12
    def bfs(self):

        current_depth = 0
        visited = set()
        link_tree = [[] for k in range(self.degrees)]

        while True:

            time.sleep(self.crawl_delay/1000.0)

            #download the current resource
            try:
                req = requests.get(self.current_uri)
                log.info( '%s downloaded.', self.current_uri )

                #put request in JSON form, apply CURIES, get links
                resource_json = req.json()
                log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

            #downloading the current resource failed
            except requests.exceptions.ConnectionError:

                log.warn( 'URI "%s" unresponsive, ignoring',\
                        self.current_uri )

                resource_json = {'_links':[]}

                #if we failed to download the entry point, give up
                if self.current_uri == self.entry_point:
                    log.error( 'URI is entry point, no previous link.  Try again when' \
                            + ' the entry point URI is available.' )
                    return

            #end downloading resource

            #get links from this resource
            req_links = self.apply_hal_curies(resource_json)['_links']
            crawl_links = self.flatten_filter_link_array(req_links)

            #crawl_links is a 'flat' list list[:][fields]
            #fields are href, type, title, in_cache, from_item_list

            log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, ' + \
                    'self, create/edit, ws, itemlist flattened): %s', crawl_links)

            #find the uris/resources that match search criteria!
            matching_uris = self.query_link_array(crawl_links)
            #... and send them out!!
            if (self.push_uris_to_queue(matching_uris) and self.return_if_found):
                return #return if we are using find_first and we found one

            #push all uris that don't match visited to proper depth list
            visited.add(self.current_uri)

            if current_depth < self.degrees:
                link_tree[current_depth].extend(
                        x for x in crawl_links if x['href'] not in visited)

            log.debug('BFS Array: %s', link_tree)
            log.debug('VISITED: %s', visited)

            #select next current_uri and current_uri_type by looking through
            #link_tree, if empty return

            finished = True

            for index in range(len(link_tree)):
                if len(link_tree[index]):

                    self.current_uri = link_tree[index][0]['href']
                    self.current_uri_type = link_tree[index][0]['type']
                    del link_tree[index][0]

                    current_depth = index + 1
                    finished = False
                    break

            if finished:
                return

            log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            log.info('CRAWL: moving to %s', self.current_uri)
            log.info('CRAWL: type: %s', self.current_uri_type)
            log.info('CRAWL: depth: %s', current_depth)
            log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
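
A minimal sketch of the breadth-first bookkeeping above: discovered links are bucketed by depth in link_tree, and the next node always comes from the shallowest non-empty bucket. The links below are illustrative.

degrees = 2
link_tree = [[] for _ in range(degrees)]

# pretend the depth-0 crawl of the entry point discovered two links
link_tree[0].extend([
    {'href': 'http://example.org/sites', 'type': 'sites'},
    {'href': 'http://example.org/devices', 'type': 'devices'},
])

finished = True
for index in range(len(link_tree)):
    if link_tree[index]:
        next_link = link_tree[index].pop(0)   # same effect as `del link_tree[index][0]`
        current_depth = index + 1
        finished = False
        break

print(next_link['href'], current_depth, finished)   # http://example.org/sites 1 False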
Example #13
    def crawl_node(self):

        #put uri in cache now that we're crawling it, make a note of collisions
        if self.cache.put_and_collision(self.current_uri):
            log.info('HASH COLLISION: value overwritten in hash table.')

        #debug: print state of cache after updating
        log.debug('CACHE STATE: %s', self.cache._cache)

        #download the current resource
        try:
            req = requests.get(self.current_uri)
            log.info('%s downloaded.', self.current_uri)

        #downloading the current resource failed
        except requests.exceptions.ConnectionError:

            log.warn( 'URI "%s" unresponsive, moving back to previous link...',\
                    self.current_uri )

            #if we failed to download the entry point, give up
            if self.current_uri == self.entry_point:
                log.error( 'URI is entry point, no previous link.  Try again when' \
                        + ' the entry point URI is available.' )
                return False

            #if it wasn't the entry point, go back in our search history
            try:
                prev = self.crawl_history.pop()
                self.current_uri = prev['href']
                self.current_uri_type = prev['type']
                self.current_uri_title = prev['title']
                return True

            #if we don't have any history left, go back to the entry point
            except:
                log.info(
                    'exhausted depth of search history, back to entry point')
                self.current_uri = self.entry_point
                self.current_uri_type = "entry_point"
                self.current_uri_title = "entry_point"
                return True

        #end downloading resource

        #put request in JSON form, apply CURIES, get links
        resource_json = req.json()
        log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)

        req_links = self.apply_hal_curies(resource_json)['_links']
        crawl_links = self.get_external_links(req_links)

        #crawl_links is a 'flat' list list[:][fields]
        #fields are href, type, title, in_cache, from_item_list

        log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, ' + \
                'self, create/edit, ws, itemlist flattened): %s', crawl_links)

        #find the uris/resources that match search criteria!
        if self.qry_extra is None:
            #we don't need to actually download the link to see if it matches
            matching_uris = self.query_link_array(crawl_links)
        else:
            #we only have enough information to tell if the current node matches
            matching_uris = self.query_current_node(resource_json)

        #... and send them out!!
        if (self.push_uris_to_queue(matching_uris) and self.find_called):
            return False  #end crawl if we found one and 'find' was called

        #select next link!!!!

        #get uncached links
        uncached_links = [x for x in crawl_links if not x['in_cache']]
        log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND', \
                len(uncached_links), len(crawl_links) )

        if (len(uncached_links) > 0):
            #we have uncached link(s) to follow! randomly pick one.
            random_index = random.randrange(0, len(uncached_links))

            self.crawl_history.push({
                'href': self.current_uri,
                'type': self.current_uri_type,
                'title': self.current_uri_title
            })
            self.current_uri = uncached_links[random_index]['href']
            self.current_uri_type = uncached_links[random_index]['type']
            self.current_uri_title = uncached_links[random_index]['title']

        else:
            #we don't have any uncached options from this node. Damn.
            log.info(
                'CRAWL: no new links available here, crawling back up history')

            #special case of being at the entry point
            if (self.current_uri_type == 'entry_point'):
                #double check we have something to crawl
                if (len(crawl_links) > 0):

                    log.info(
                        'CRAWL: no uncached links from entrypoint, resetting cache'
                    )
                    self.cache.clear()  # clear cache

                    #randomly select node from crawl_links
                    random_index = random.randrange(0, len(crawl_links))

                    self.crawl_history.push({
                        'href': self.current_uri,
                        'type': self.current_uri_type,
                        'title': self.current_uri_title
                    })
                    self.current_uri = crawl_links[random_index]['href']
                    self.current_uri_type = crawl_links[random_index]['type']
                    self.current_uri_title = crawl_links[random_index]['title']

                else:
                    log.error(
                        'CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!'
                    )
                    return False

            else:
                #not at entry point, time to try and move back up in history
                try:
                    prev = self.crawl_history.pop()
                    self.current_uri = prev['href']
                    self.current_uri_type = prev['type']
                    self.current_uri_title = prev['title']

                except:  #no history left, not at entry point- jump to entry point
                    log.info(
                        'CRAWL: crawling back up history, but exhausted history.  Jump to entrypoint.'
                    )
                    self.current_uri = self.entry_point
                    self.current_uri_type = 'entry_point'
                    self.current_uri_title = 'entry_point'

        log.debug(
            '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(),
                 self.current_uri)
        log.info('CRAWL: type: %s', self.current_uri_type)
        log.debug(
            '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

        #recurse
        return True
Example #14
    def crawl(self, namespace="", resource_type=None, \
            plural_resource_type=None, resource_title=None, resource_extra=None):
        '''
        crawl through the chain, pushing each uri/resource that matches the
        passed criteria onto the queue.  If nothing is passed, push all
        resources.

        Can match on resource_type.  If you want a resource list (plural, i.e.
        a list of organization resources rather than a single organization
        resource), you can specify the plural as the resource_type.

        The code assumes the word can be pluralized by adding an 's' or 'es'
        to the end.  If this is not true (e.g. Person -> People), please give
        the plural so the code can recognize when it has found a list of the
        singular resource of interest.

        If looking for a specific resource, this will cross-check against the
        title of the resource.  Title matching is ANDed with the other query
        criteria.
        '''

        #store search criteria in lowercase form, with namespace appended
        #add plural forms +'s', +'es' to list of plural cases to look for

        if resource_type is not None:
            #append namespace
            self.qry_resource_type = namespace + resource_type
            #make all lowercase
            self.qry_resource_type = self.qry_resource_type.lower()
            #'pluralize' resource after adding namespace
            self.qry_resource_plural = self.pluralize_resource_name(
                self.qry_resource_type)
            #add special pluralization if given by user
            if plural_resource_type is not None:
                self.qry_resource_plural.append(namespace +
                                                plural_resource_type)
            #make all plural list items lowercase
            self.qry_resource_plural = [
                x.lower() for x in self.qry_resource_plural
            ]
        else:
            #not searching on resource_type, just define qry_resource_type as None
            self.qry_resource_type = None

        if resource_title is not None:
            #make all lowercase
            self.qry_resource_title = resource_title.lower()
        else:
            #not searching on title, just define qry_resource_title as None
            self.qry_resource_title = None

        if resource_extra is not None:
            self.qry_extra = resource_extra
        else:
            self.qry_extra = None

        #end initializing query variables

        loop_count = 0

        #keep calling crawl_node, unless it returns false, with a pause between
        while (self.crawl_node()):

            #delay for crawl_delay ms between calls
            time.sleep(self.crawl_delay / 1000.0)

            #count loop iterations
            loop_count = loop_count + 1
            log.info("MAIN CRAWL LOOP ITERATION %s -----------------",
                     loop_count)

        log.info("--- crawling ended, %s pages crawled ---", loop_count)

        return self.found_resources
Example #15
    def push_uris_to_queue(self, uris):
        '''check uris against found_resources set, and if they're not there,
        get resource and push URI and resource out to queue'''
        #self.found_resources

        found_one = False

        for uri in uris:
            #if 'add' returns true, it's not in our set yet
            if self.found_resources.add(uri):

                log.info(
                    '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                )
                log.info(
                    '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                )
                log.info('New Resource Found!  %s', uri)
                log.info(
                    '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                )
                log.info(
                    '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                )

                found_one = True

                #push uri and resource to queue!
                if isinstance(self.q, Queue.Queue):
                    log.info('QUEUE: Pushing to queue')
                    self.q.put(uri)
                elif self.zmq is not None:
                    log.info('QUEUE: Pushing to ZMQ socket')
                    self.zmq.send_string(uri)
                else:
                    log.warn('QUEUE: Queue and ZMQ Socket undefined')

        return found_one
Example #16
    def query_current_node(self, json):

        matching_uris = []

        if self.qry_resource_type is not None:
            log.info('SEARCH_LIST: looking for singular: %s',
                     self.qry_resource_type)
        if self.qry_resource_title is not None:
            log.info('SEARCH_LIST: looking for title: %s',
                     self.qry_resource_title)
        if self.qry_extra is not None:
            log.info('SEARCH_LIST: looking for %s', self.qry_extra)

        this_link_item_matches = True

        if self.qry_resource_type is not None:
            if (any(self.current_uri_type.lower() in x for x in self.qry_resource_plural) \
                    or self.current_uri_type.lower() == self.qry_resource_type):
                #it does!
                log.info('SEARCH_LIST: matched search_type %s',
                         self.current_uri_type)
            else:
                #it doesn't, but we're searching on resource_type
                this_link_item_matches = False

        #see if it matches resource_title, if queried for
        if self.qry_resource_title is not None:
            if (self.current_uri_title.lower() == self.qry_resource_title):
                #it does!
                log.info('SEARCH_LIST: matched search_title %s',
                         self.current_uri_title)
            else:
                #it doesn't, but we're searching on resource_title
                this_link_item_matches = False

        if self.qry_extra is not None:
            for key, val in self.qry_extra.iteritems():
                try:
                    actual_val = json[key]
                    if actual_val == val:
                        log.info('SEARCH_LIST: matched search_extra %s: %s',
                                 key, val)
                    else:
                        this_link_item_matches = False
                except:
                    this_link_item_matches = False

        #if we made it to here and this_link_item_matches, it's a match!
        if this_link_item_matches:
            matching_uris.append(self.current_uri)

        #return list of matching uris
        return matching_uris