def actions(action_sets, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        actions = []
        for row in action_sets:
            driver.get(row[3])  # row[3] is ActionSet_Link (see action_sets() header)
            time.sleep(8)  # wait for the dynamic page to render
            actions_soup = BeautifulSoup(driver.page_source, "lxml")
            bowl = actions_soup.findAll('table', attrs='xisCas-actionstoc')
            for spoon in bowl:
                sip = spoon.findAll('tr')
                for swallow in sip:
                    if swallow.find('a'):
                        temp = swallow.find('td').find_next_sibling('td').text.strip()
                        actions.append([row[0], row[1],
                                        swallow.find('td').text.strip(),
                                        ' '.join(temp.split()),
                                        swallow.find('a').get('href').strip()])
        driver.close()
        # keep the list of links for actions in actions.csv
        header = ["Product", "ActionSet", "Action", "Action_Describe", "Action_Link"]
        mywriter(pathhead, header, actions, 'actions')
        return actions
    else:
        actions = myreader(pathhead, 'actions', header='drop')
        return actions
def action_sets(products, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        action_sets = []
        for row in products:
            driver.get(row[1])
            time.sleep(10)
            action_soup = BeautifulSoup(driver.page_source, "lxml")
            bowl = action_soup.findAll('tr')
            for spoon in bowl:
                sip = spoon.findAll('td')
                if len(sip) == 3:
                    action_sets.append([row[0],
                                        sip[1].text.strip(),
                                        ' '.join(sip[2].text.split()),
                                        sip[0].find('a').get('href').strip(),
                                        ' '.join(sip[0].text.split())])
        driver.close()
        # keep the list of links for actions in action_sets.csv
        header = ["Product", "ActionSet", "ActionSet_Describe",
                  "ActionSet_Link", "ActionSet_LinkText"]
        mywriter(pathhead, header, action_sets, 'action_sets')
        return action_sets
    else:
        action_sets = myreader(pathhead, 'action_sets', header='drop')
        return action_sets
def products(url, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.close()
        #print(soup)

        # Build the product list
        bowl = soup.findAll('div', attrs='xisDoc-toc_1 ng-scope')
        #printlist(bowl)
        products = []
        for spoon in bowl:
            products.append([spoon.text, spoon.find('a').get('href')])
        #printlist(products)

        # keep the list of product links in products.csv
        header = ["Product", "Product_Link"]
        mywriter(pathhead, header, products, 'products')
        return products
    else:
        products = myreader(pathhead, 'products', header='drop')
        return products
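
# Hedged usage sketch (added for illustration, not part of the original source):
# the three crawlers above are assumed to chain products -> action_sets -> actions,
# with each step caching its rows as a CSV under crawlers/actions_by_product/.
# The toc_url below is a hypothetical placeholder for the SAS documentation page
# that lists CAS action sets by product.
def _example_actions_crawl():
    toc_url = 'https://documentation.sas.com/...'  # hypothetical entry page
    product_rows = products(toc_url)        # [Product, Product_Link]
    set_rows = action_sets(product_rows)    # one row per action set
    action_rows = actions(set_rows)         # one row per action
    # on later runs, reload='Yes' skips Selenium and rereads the cached CSVs:
    # action_rows = actions([], reload='Yes')
    return action_rows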
def procs_plus(procs, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        # start with procs and add columns:
        procs_plus = procs

        # helper to check whether a link is already absolute or needs the stump url prepended
        def check_addstump(link, stump):
            link = link.strip()
            if link.startswith('http'):
                return link
            else:
                return stump + link

        # cycle through procedure links, check for overview and contrasted links.
        # Collect = Product | Procedure | Procedure_Short | Procedure_Link | Overview_Link | Compared_Link
        comp_stump = 'https://documentation.sas.com'
        #procs_plus = procs_plus[393:397]  # subset for testing
        #procs_plus = procs_plus[290:296]  # subset for testing
        driver = webdriver.Safari()
        for row in procs_plus:
            driver.get(row[3])
            time.sleep(10)
            proc_soup = BeautifulSoup(driver.page_source, "lxml")
            for proc_link in proc_soup.find_all('a'):
                if ("Overview" in proc_link.text) and proc_link.get('href'):
                    if "overview" in proc_link.get('href'):
                        row.append(check_addstump(proc_link.get('href'), comp_stump))
            if len(row) != 5:  # no overview link found: pad the column
                row.append('')
            for proc_link in proc_soup.find_all('a'):
                comps = ["Contrasted", "Compared"]
                if any(comp in proc_link.text for comp in comps) and proc_link.get('href'):
                    row.append(check_addstump(proc_link.get('href'), comp_stump))
            if len(row) != 6:  # no contrasted/compared link found: pad the column
                row.append('')
        driver.quit()
        # keep the list of links for products and procedures in procs_plus.csv
        header = ["Product", "Procedure", "Procedure_Short", "Procedure_Link",
                  "Overview_Link", "Compared_Link"]
        mywriter(pathhead, header, procs_plus, 'procs_plus')
        return procs_plus
    else:
        procs_plus = myreader(pathhead, 'procs_plus', header='drop')
        return procs_plus
def viya_procs(url, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/viya_procs/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.findAll(['h2', 'p'],
                            attrs={'class': ['xisDoc-title', 'xisDoc-paragraph']})
        products = []
        viya_procs = []
        for spoon in bowl:
            if spoon.name == 'h2' and "SAS Products" not in spoon.text:
                products.append(spoon.text.strip())
            if spoon.name == 'p' and products:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                    # TEMPLATE shows up as "TEMPLATE:" because it has multiple links
                    proc_short = proc.replace(': ', ' ')
                    proc_short = proc_short.split(' ', 1)[0]
                    viya_procs.append([products[-1], proc, proc_short, link.strip()])

        # keep the list of links for products and procedures in viya_procs.csv
        header = ["Product", "Procedure", "Procedure_Short", "Procedure_Link"]
        mywriter(pathhead, header, viya_procs, 'viya_procs')
        return viya_procs
    else:
        viya_procs = myreader(pathhead, 'viya_procs', header='drop')
        return viya_procs
def procs(url, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter

    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.findAll(['h2', 'p'],
                            attrs={'class': ['xisDoc-title', 'xisDoc-paragraph']})
        procs = []
        product = []
        for spoon in bowl:
            #print('line - ', spoon)
            if spoon.name == 'h2' and "SAS Products" not in spoon.text:
                product.append(spoon.text.strip())
            if spoon.name == 'p' and product:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                    # TEMPLATE shows up as "TEMPLATE:" because it has multiple links
                    proc_short = proc.replace(': ', ' ')
                    proc_short = proc_short.split(' ', 1)[0]
                    procs.append([product[-1], proc, proc_short, link.strip()])

        # remove the few cases where a product starts by listing another product (not a proc),
        # as in "includes contents of product ...". Filter with a comprehension rather than
        # del-inside-enumerate, which skips the element that follows each deletion.
        procs = [item for item in procs if item[1] not in product]

        # keep the list of links for products and procedures in procs.csv
        header = ["Product", "Procedure", "Procedure_Short", "Procedure_Link"]
        mywriter(pathhead, header, procs, 'procs')
        return procs
    else:
        procs = myreader(pathhead, 'procs', header='drop')
        return procs
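
# Hedged usage sketch (added for illustration): procs() builds the base
# Product | Procedure | Procedure_Short | Procedure_Link table, and procs_plus()
# then revisits each Procedure_Link to append the Overview_Link and Compared_Link
# columns. The toc_url is a hypothetical placeholder for the "Procedures by
# Product" documentation page.
def _example_procs_crawl():
    toc_url = 'https://documentation.sas.com/...'  # hypothetical entry page
    proc_rows = procs(toc_url)         # crawl once, cached to procs.csv
    enriched = procs_plus(proc_rows)   # slow: one page visit per procedure
    return enriched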
    return inputlist


# process procs_linked - data clean, create node_name for unique occurrences of PROC
procs_linked = myreader('crawlers/procs_by_product/', 'procs_linked')
header = procs_linked.pop(0)

# The "TEMPLATE: <type of template>" rows break the creation of the dot .svg,
# because dot interprets ":" as a port separator
for row in procs_linked:
    if 'TEMPLATE:' in row[1]:
        row[1] = row[1].replace(':', ' -')

# add node_name column for the short proc name in [2]; feed the first three
# columns [:3] Product, Proc, Proc_Short
procs_linked = node_name(procs_linked, header, 'PROC_', 3, 2)
print(header)
mywriter('process/processed_data/', header, procs_linked, 'procs_linked')

# process viya actionsets - remove actionsets with no actions, add node_name
# to actions and action_sets
actions = myreader('crawlers/actions_by_product/', 'actions')
action_sets = myreader('crawlers/actions_by_product/', 'action_sets')
a_header = actions.pop(0)
as_header = action_sets.pop(0)

used_actionsets = []
for actionset in actions:
    if actionset[1] not in used_actionsets:
        used_actionsets.append(actionset[1])

# keep only the action sets that actually have actions; filter with a comprehension
# rather than del-inside-enumerate, which skips the element that follows each deletion
action_sets = [action_set for action_set in action_sets
               if action_set[1] in used_actionsets]
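
# Hedged sketch of the common.commons helpers, whose source is not shown in this
# excerpt. Judging only from the call sites above, mywriter(pathhead, header, rows,
# name) is assumed to write pathhead + name + '.csv' with a header row, and
# myreader(pathhead, name, header=...) to read it back, dropping the header row
# when header='drop'. Filenames, defaults, and encoding here are assumptions.
import csv

def mywriter(pathhead, header, rows, name):
    # write a header row followed by the data rows
    with open(pathhead + name + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

def myreader(pathhead, name, header='keep'):
    # read all rows; optionally drop the header row
    with open(pathhead + name + '.csv', newline='') as f:
        rows = list(csv.reader(f))
    if header == 'drop':
        rows = rows[1:]
    return rows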