def parse_api(self, response):
    """Scrape function-call signatures from a ``dl.function`` doc page.

    Yields one ``ApiItem`` per signature that the ``split_def`` regex
    can parse.
    """
    self.logger.info(f'Scraping {response.url}')
    # Every <dt> under dl.function holds one raw function signature.
    headers = response.css('dl.function > dt')
    if not headers:
        return
    # Strip tags/whitespace, e.g. -> foo(arg1,arg2=bar)
    signatures = [
        remove_tags(sel.get())
        .replace('\n', '')
        .replace(' ', '')
        .replace('[source]', '')
        for sel in headers
    ]
    for signature in signatures:
        match = self.split_def.match(signature)
        if match is None:
            # Signature did not fit the expected call format; skip it.
            continue
        qualified_name = match.group(1)
        params = match.group(2).split(',')
        item = ApiItem()
        item['code'] = signature
        # Keep only the final component of the dotted name.
        item['function_name'] = qualified_name.split('.')[-1]
        # Positional parameters have no '='; keyword ones are split
        # into [name, default] pairs.
        item['args'] = [p for p in params if '=' not in p]
        item['kwargs'] = [p.split('=') for p in params if '=' in p]
        yield item
def parse_api(self, response):
    """Scrape function signatures from ``.codehilite`` blocks that
    directly follow an ``h3`` heading, yielding one ``ApiItem`` each.
    """
    self.logger.info(f'Scraping {response.url}')
    blocks = response.css('h3 + .codehilite')
    if not blocks:
        return
    signatures = []
    for block in blocks:
        cleaned = (remove_tags(block.get())
                   .replace('\n', '')
                   .replace(' ', '')
                   .replace('[source]', ''))
        # Snippets without parentheses are not call signatures; skip.
        if '(' not in cleaned and ')' not in cleaned:
            continue
        signatures.append(cleaned)
    for signature in signatures:
        match = self.split_def.match(signature)
        if match is None:
            # Regex could not decompose the signature; skip it.
            continue
        params = match.group(2).split(',')
        item = ApiItem()
        item['code'] = signature
        # Last dotted component is the bare function name.
        item['function_name'] = match.group(1).split('.')[-1]
        # Positional params have no '='; keyword params become
        # [name, default] pairs.
        item['args'] = [p for p in params if '=' not in p]
        item['kwargs'] = [p.split('=') for p in params if '=' in p]
        yield item
def parse_api(self, response):
    """Scrape ``dl.function`` documentation entries and yield one
    ``ApiItem`` per unique function name (a later duplicate overwrites
    an earlier one, as in the original dict-keyed cache).
    """
    self.logger.info(f'Scraping {response.url}')
    parsed = {}
    for entry in response.css('dl.function'):
        header = entry.css('dt')
        # Normalize the raw <dt> markup into a compact call signature.
        signature = (remove_tags(header.get())
                     .replace('\n', '')
                     .replace('\\', '')
                     .replace('>', '')
                     .replace('<', '')
                     .replace(' ', '')
                     .replace('[source]', ''))
        # torchvision entries are deliberately skipped here.
        if 'torchvision' in signature:
            continue
        match = self.split_def.match(signature)
        if match is None:
            continue
        name = match.group(1).split('.')[-1]
        params = match.group(2).split(',')
        parsed[name] = {
            'code': signature,
            # Positional params have no '='; keyword params are split
            # into [name, default] pairs.
            'args': [p for p in params if '=' not in p],
            'kwargs': [p.split('=') for p in params if '=' in p],
        }
    for name, info in parsed.items():
        item = ApiItem()
        item['code'] = info['code']
        item['function_name'] = name
        item['args'] = info['args']
        item['kwargs'] = info['kwargs']
        yield item
def parse(self, response):
    """Scrape function signatures from ``div.symbol-header`` blocks and
    yield a structured ``ApiItem`` for each one.

    Fix: an unparseable signature previously hit ``return`` inside the
    loop, which terminates the generator and silently drops every
    remaining definition on the page. It is now skipped with
    ``continue``, matching the other parsers in this file.
    """
    fdef = response.css('div.function > div.symbol-header')
    defs = []
    for selector in fdef:
        # Strip markup plus the 'Source'/'function'/'method' labels
        # embedded in the header text.
        text = remove_tags(selector.get())\
            .replace('\n', '')\
            .replace(' ', '')\
            .replace('Source', '')\
            .replace('function', '')\
            .replace('method', '')
        defs.append(text)
    for text in defs:
        split = self.split_def.match(text)
        if split is None:
            # Skip entries the signature regex cannot parse instead of
            # aborting the whole page (was: return).
            continue
        # Bare function name is the last dotted component.
        function_name = split.groups()[0].split('.')[-1]
        params = split.groups()[1].split(',')
        # Positional parameters carry no '='.
        args = [p for p in params if '=' not in p]
        # Keyword parameters become [name, default] pairs.
        kwargs = [p.split('=') for p in params if '=' in p]
        item = ApiItem()
        item['code'] = text
        item['function_name'] = function_name
        item['args'] = args
        item['kwargs'] = kwargs
        yield item
def parse(self, response):
    """Scrape function signatures from ``div.symbol-header`` blocks and
    yield a structured ``ApiItem`` for each one.

    Fix: an unparseable signature previously hit ``return`` inside the
    loop, which terminates the generator and silently drops every
    remaining definition on the page. It is now skipped with
    ``continue``, matching the other parsers in this file.
    """
    # Crawls the selector to create a list of function headers.
    fdef = response.css('div.function > div.symbol-header')
    defs = []  # Caches the processed call format of all functions.
    # Each selector in fdef contains the documentation header of one
    # function in the module currently being crawled.
    for selector in fdef:
        # Preprocesses the header, removing markup and the
        # 'Source'/'function'/'method' labels embedded in the text.
        text = remove_tags(selector.get())\
            .replace('\n', '')\
            .replace(' ', '')\
            .replace('Source', '')\
            .replace('function', '')\
            .replace('method', '')
        defs.append(text)
    for text in defs:
        # Uses the regex rules to decompose the function call.
        split = self.split_def.match(text)
        if split is None:
            # Skip unparseable entries instead of aborting the whole
            # page (was: return).
            continue
        # Extracts only the bare function name (last dotted component).
        function_name = split.groups()[0].split('.')[-1]
        # Extracts every input parameter and stores them in a list.
        params = split.groups()[1].split(',')
        # Positional (non-keyword) parameters carry no '='.
        args = [p for p in params if '=' not in p]
        # Keyword parameters become [name, default] pairs.
        kwargs = [p.split('=') for p in params if '=' in p]
        # Initializes a Scrapy Item object.
        # See https://docs.scrapy.org/en/latest/topics/items.html
        item = ApiItem()
        item['code'] = text  # The full processed call signature.
        item['function_name'] = function_name  # The bare function name.
        item['args'] = args  # The positional parameters.
        item['kwargs'] = kwargs  # The keyword parameters.
        # Yields a structured representation of the function call format.
        yield item
def parse_api(self, response):
    """Parse the first ``.lang-python`` snippet on the page into a
    single ``ApiItem`` (or yield nothing if it cannot be parsed or is
    filtered out).
    """
    self.logger.info(f'Scraping {response.url}')
    snippet = response.css('.lang-python')
    if not snippet:
        return
    # Flatten the snippet into a compact signature string.
    signature = remove_tags(snippet.get())\
        .replace('\n', '')\
        .replace(' ', '')
    # Dunder and compatibility-shim entries are filtered out.
    if '__' in signature or 'compat' in signature:
        return
    match = self.split_def.match(signature)
    if match is None:
        return
    params = match.group(2).split(',')
    item = ApiItem()
    item['code'] = signature
    # Last dotted component is the bare function name.
    item['function_name'] = match.group(1).split('.')[-1]
    # Positional params have no '='; keyword params become
    # [name, default] pairs.
    item['args'] = [p for p in params if '=' not in p]
    item['kwargs'] = [p.split('=') for p in params if '=' in p]
    yield item
def parse_api(self, response):
    """Scrape ``dl.function`` documentation entries and yield one
    ``ApiItem`` per unique function name.

    Fix: removed a leftover debug ``print(function_name)`` from the
    yield loop — the otherwise-identical parser elsewhere in this file
    has no such print; ``self.logger`` is the channel for diagnostics.
    """
    self.logger.info(f'Scraping {response.url}')
    # Each dl.function element documents one function.
    fdef = response.css('dl.function')
    defs = {}  # Maps bare function name -> processed call info.
    for selector in fdef:
        cmd_info = {}  # Processed format of the current function.
        # The <dt> tag contains the raw call signature for the
        # currently crawled function.
        func_header = selector.css('dt')
        # Normalize the markup into a compact signature, e.g.
        # torch.this_is_a_function(obj)
        text = (remove_tags(func_header.get()).replace('\n', '').replace(
            '\\', '').replace('>', '').replace('<', '').replace(
                ' ', '').replace('[source]', ''))
        # torchvision entries are deliberately skipped here.
        if 'torchvision' in text:
            continue
        # Uses the regex rules to decompose the function call.
        split_cmd = self.split_def.match(text)
        if split_cmd is None:
            continue
        # Bare function name is the last dotted component.
        function_name = split_cmd.groups()[0].split('.')[-1]
        cmd_info['code'] = text
        params = split_cmd.groups()[1].split(',')
        # Positional (non-keyword) parameters carry no '='.
        cmd_info['args'] = [p for p in params if '=' not in p]
        # Keyword parameters become [name, default] pairs.
        cmd_info['kwargs'] = [p.split('=') for p in params if '=' in p]
        # Later duplicates of a name overwrite earlier ones.
        defs[function_name] = cmd_info
    # Yield each cached function as a structured Scrapy item.
    # See https://docs.scrapy.org/en/latest/topics/items.html
    for function_name, cmd_info in defs.items():
        item = ApiItem()
        item['code'] = cmd_info['code']
        item['function_name'] = function_name
        item['args'] = cmd_info['args']
        item['kwargs'] = cmd_info['kwargs']
        yield item