示例#1
0
    def parse_api(self, response):
        """Yield an ``ApiItem`` for each function signature on the page.

        Signatures are taken from the ``dl.function > dt`` elements of
        ``response``. Each one is stripped of tags, newlines, spaces and
        the ``[source]`` marker, then parsed with ``self.split_def``;
        signatures the pattern does not match are skipped.
        """
        self.logger.info(f'Scraping {response.url}')

        # One selector per function heading found on the page.
        headers = response.css('dl.function > dt')
        if not headers:
            return

        # First pass: reduce every heading to a compact call string,
        # for example foo(arg1,arg2=bar)¶
        signatures = []
        for header in headers:
            signature = remove_tags(header.get())
            for noise in ('\n', ' ', '[source]'):
                signature = signature.replace(noise, '')
            signatures.append(signature)

        # Second pass: parse each call string and emit it as an item.
        for signature in signatures:
            matched = self.split_def.match(signature)
            if matched is None:
                continue

            groups = matched.groups()

            # Keep only the last dotted component as the function name.
            name = groups[0].rsplit('.', 1)[-1]

            # Parameters containing '=' are stored in kwargs as the pieces
            # of a split on '='; all others are kept verbatim in args.
            positional = []
            keyword = []
            for param in groups[1].split(','):
                if '=' in param:
                    keyword.append(param.split('='))
                else:
                    positional.append(param)

            # Scrapy Item holding the structured form of the signature.
            # See https://docs.scrapy.org/en/latest/topics/items.html
            item = ApiItem()
            item['code'] = signature
            item['function_name'] = name
            item['args'] = positional
            item['kwargs'] = keyword
            yield item
示例#2
0
    def parse_api(self, response):
        """Yield an ``ApiItem`` for each call-like ``h3 + .codehilite`` block.

        Each block is stripped of tags, newlines, spaces and ``[source]``.
        Blocks containing neither ``(`` nor ``)`` are dropped, and so are
        blocks that ``self.split_def`` does not match.
        """
        self.logger.info(f'Scraping {response.url}')

        blocks = response.css('h3 + .codehilite')
        if not blocks:
            return

        def compact(markup):
            # Strip tags first, then the characters that are pure noise.
            plain = remove_tags(markup)
            for noise in ('\n', ' ', '[source]'):
                plain = plain.replace(noise, '')
            return plain

        # Keep a block as soon as it shows at least one parenthesis.
        cleaned = (compact(block.get()) for block in blocks)
        candidates = [code for code in cleaned if '(' in code or ')' in code]

        for code in candidates:
            matched = self.split_def.match(code)
            if matched is None:
                continue

            groups = matched.groups()
            name = groups[0].rsplit('.', 1)[-1]

            # Partition the comma-separated parameters on the presence
            # of '='.
            positional = []
            keyword = []
            for param in groups[1].split(','):
                if '=' in param:
                    keyword.append(param.split('='))
                else:
                    positional.append(param)

            item = ApiItem()
            item['code'] = code
            item['function_name'] = name
            item['args'] = positional
            item['kwargs'] = keyword
            yield item
示例#3
0
    def parse_api(self, response):
        """Yield one ``ApiItem`` per distinct function name on the page.

        Each ``dl.function`` element contributes the text of its ``dt``
        header. Headers containing ``torchvision`` and headers that
        ``self.split_def`` does not match are skipped. Results are keyed
        by function name, so a later entry with the same name replaces
        the earlier one.
        """
        self.logger.info(f'Scraping {response.url}')

        # Substrings removed from the tag-stripped header, in this order.
        noise_tokens = ('\n', '\\', '&gt', '&lt', ' ', '[source]')

        by_name = {}
        for entry in response.css('dl.function'):
            signature = remove_tags(entry.css('dt').get())
            for token in noise_tokens:
                signature = signature.replace(token, '')

            if 'torchvision' in signature:
                continue

            matched = self.split_def.match(signature)
            if matched is None:
                continue

            groups = matched.groups()
            name = groups[0].rsplit('.', 1)[-1]

            # Partition the comma-separated parameters on the presence
            # of '='.
            positional = []
            keyword = []
            for param in groups[1].split(','):
                if '=' in param:
                    keyword.append(param.split('='))
                else:
                    positional.append(param)

            by_name[name] = {
                'code': signature,
                'args': positional,
                'kwargs': keyword,
            }

        # Emit the collected entries only after the whole page is parsed.
        for name, info in by_name.items():
            item = ApiItem()
            item['code'] = info['code']
            item['function_name'] = name
            item['args'] = info['args']
            item['kwargs'] = info['kwargs']
            yield item
示例#4
0
    def parse(self, response):
        """Yield an ``ApiItem`` for each function header on the page.

        Headers are read from ``div.function > div.symbol-header``. Each
        one is stripped of tags, newlines, spaces and the words
        ``Source``, ``function`` and ``method`` before being parsed with
        ``self.split_def``. A header the pattern does not match is
        skipped; the remaining headers on the page are still processed.
        """
        fdef = response.css('div.function > div.symbol-header')

        # NOTE(review): the replacements below also remove 'Source',
        # 'function' and 'method' when they occur inside a name or a
        # default value - confirm that this is acceptable.
        defs = []
        for selector in fdef:
            text = remove_tags(selector.get())\
                .replace('\n', '')\
                .replace(' ', '')\
                .replace('Source', '')\
                .replace('function', '')\
                .replace('method', '')
            defs.append(text)

        for text in defs:
            split = self.split_def.match(text)
            if split is None:
                # Skip only this header. Returning here would end the
                # generator and drop every later function on the page.
                continue

            # Last dotted component of the first captured group.
            function_name = split.groups()[0].split('.')[-1]

            # Parameters with '=' go to kwargs (split on '='); the
            # others are kept verbatim in args.
            params = split.groups()[1].split(',')
            args = [p for p in params if '=' not in p]
            kwargs = [p.split('=') for p in params if '=' in p]

            item = ApiItem()
            item['code'] = text
            item['function_name'] = function_name
            item['args'] = args
            item['kwargs'] = kwargs
            yield item
示例#5
0
    def parse(self, response):
        """Yield an ``ApiItem`` for each function header on the page.

        Headers are read from ``div.function > div.symbol-header``,
        reduced to a compact call string and parsed with
        ``self.split_def``. Headers the pattern does not match are
        skipped without affecting the other headers on the page.
        """
        # Crawls the selector to create a list of function headers.
        fdef = response.css('div.function > div.symbol-header')

        defs = []  # Caches the processed call format of all functions.

        # Each item in fdef is the header of one function on the page
        # currently being crawled. The loop strips the markup and the
        # words that are not part of the call itself, then caches the
        # result.
        # NOTE(review): 'Source', 'function' and 'method' are removed
        # wherever they occur, including inside names - confirm.
        for selector in fdef:
            text = remove_tags(selector.get())\
                .replace('\n', '')\
                .replace(' ', '')\
                .replace('Source', '')\
                .replace('function', '')\
                .replace('method', '')
            defs.append(text)

        for text in defs:
            # Applies the compiled regex to the function call.
            split = self.split_def.match(text)
            if split is None:
                # Move on to the next header. A ``return`` here would
                # stop the generator at the first unparseable header and
                # lose all functions that follow it.
                continue

            # Extracts only the function name (last dotted component).
            function_name = split.groups()[0].split('.')[-1]

            # Extracts every function input parameter into a list.
            params = split.groups()[1].split(',')

            # Parameters without '=' (no default value in the text).
            args = [p for p in params if '=' not in p]

            # Parameters with '=', stored as the pieces of a split on
            # '='.
            kwargs = [p.split('=') for p in params if '=' in p]

            # Initializes a Scrapy Item object.
            # See https://docs.scrapy.org/en/latest/topics/items.html
            item = ApiItem()

            item['code'] = text  # The cleaned function call.
            item['function_name'] = function_name  # The function name.
            item['args'] = args  # Parameters without '='.
            item['kwargs'] = kwargs  # Parameters with '='.

            # Yields a structured representation of the function call.
            yield item
示例#6
0
    def parse_api(self, response):
        """Yield at most one ``ApiItem`` built from ``.lang-python``.

        Nothing is yielded when the page has no ``.lang-python``
        element, when ``self.split_def`` does not match the cleaned
        text, or when that text contains ``__`` or ``compat``.
        """
        self.logger.info(f'Scraping {response.url}')

        snippets = response.css('.lang-python')
        if not snippets:
            return

        # Strip the markup, then drop newlines and spaces.
        signature = remove_tags(snippets.get())
        for noise in ('\n', ' '):
            signature = signature.replace(noise, '')

        matched = self.split_def.match(signature)
        if matched is None:
            return

        groups = matched.groups()

        # Last dotted component of the first captured group.
        name = groups[0].rsplit('.', 1)[-1]

        # Parameters with '=' go to kwargs (split on '='); the others
        # are kept verbatim in args.
        positional = []
        keyword = []
        for param in groups[1].split(','):
            if '=' in param:
                keyword.append(param.split('='))
            else:
                positional.append(param)

        # Signatures mentioning '__' or 'compat' are not emitted.
        if any(marker in signature for marker in ('__', 'compat')):
            return

        item = ApiItem()
        item['code'] = signature
        item['function_name'] = name
        item['args'] = positional
        item['kwargs'] = keyword
        yield item
示例#7
0
    def parse_api(self, response):
        """Yield one ``ApiItem`` per distinct function name on the page.

        Each ``dl.function`` element contributes the text of its ``dt``
        header. Headers containing ``torchvision`` and headers that
        ``self.split_def`` does not match are skipped. Results are keyed
        by function name, so a later entry with the same name replaces
        the earlier one.
        """
        self.logger.info(f'Scraping {response.url}')

        # Crawls the selector to create a list of each function doc.
        fdef = response.css('dl.function')

        defs = {}  # Maps function name -> parsed call information.

        # Each item in fdef is the documentation of one function on the
        # page currently being crawled. The loop extracts the call
        # format of every function and caches it in defs.
        for selector in fdef:

            cmd_info = {}  # Parsed form of the current function.

            # The dt tag holds the call format of the current function.
            func_header = selector.css('dt')

            # Strips the markup and the noise substrings, giving e.g.
            # torch.this_is_a_function(obj)¶
            text = (remove_tags(func_header.get()).replace('\n', '').replace(
                '\\', '').replace('&gt', '').replace('&lt', '').replace(
                    ' ', '').replace('[source]', ''))
            if 'torchvision' in text:
                continue

            # Applies the compiled regex to the function call.
            split_cmd = self.split_def.match(text)

            if split_cmd is None:
                continue

            # Extracts only the function name (last dotted component).
            function_name = split_cmd.groups()[0].split('.')[-1]

            cmd_info['code'] = text  # The cleaned function call.

            # Extracts every function input parameter into a list.
            params = split_cmd.groups()[1].split(',')

            # Parameters without '=' (no default value in the text).
            cmd_info['args'] = [p for p in params if '=' not in p]

            # Parameters with '=', stored as the pieces of a split on
            # '='.
            cmd_info['kwargs'] = [p.split('=') for p in params if '=' in p]

            # A repeated name overwrites the previously cached entry.
            defs[function_name] = cmd_info

        # Loops through the cache to yield each function.
        for function_name, cmd_info in defs.items():
            # Diagnostics go through the spider logger rather than a
            # bare print, so they follow the configured log level.
            self.logger.debug('Parsed function %s', function_name)

            # Initializes a Scrapy Item object.
            # See https://docs.scrapy.org/en/latest/topics/items.html
            item = ApiItem()

            item['code'] = cmd_info['code']  # The cleaned function call.
            item['function_name'] = function_name  # The function name.
            item['args'] = cmd_info['args']  # Parameters without '='.
            item['kwargs'] = cmd_info['kwargs']  # Parameters with '='.

            # Yields a structured representation of the function call.
            yield item