import re

import linkGrabber
import pandas as pd


def getprice(product):
    site = "mysmartprice"
    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' + site + '+' + product)
    gb = links.find(limit=30)
    print(gb[25])
    gb1 = str(gb)
    frame = pd.DataFrame()
    gb2 = re.search("http://www.mysmartprice.com/(.+?)(%|&)", gb1)
    #print(gb2.group(1))
    if gb2:
        found2 = gb2.group(1)
        print(found2)
        frame = findprice(found2)  # findprice is defined elsewhere in the project
    else:
        print('Not found')
    return frame
#getprice()

import linkGrabber


def get_links(url):
    try:
        links = linkGrabber.Links(url)
        l = links.find()
    except Exception:
        l = ''
    return l

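# A minimal usage sketch for get_links above; the URL is an illustrative
# assumption. On success the function returns linkGrabber's list of link
# dicts, and on any network or parse error it falls back to the empty
# string '', which the loop below simply skips over.
if __name__ == '__main__':
    for entry in get_links("https://www.example.com/"):
        print(entry.get('href'))
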
import linkGrabber


def findgitlink(url, identifier):
    links = linkGrabber.Links(url)
    print('url: ' + url)
    gb = links.find()
    for item in gb:
        print(item['href'])
        if item['href'].startswith(identifier):
            return item['href']

import re

import linkGrabber
import pandas as pd


def getlink(product):
    #pd.set_option('display.width',1000)
    pd.options.display.max_colwidth = 200
    site = "amazon.in"
    #product ="NEXUS 5"
    start = "'href': '/url?q="
    end = "&sa"
    frame = pd.DataFrame()
    links = linkGrabber.Links(
        'https://www.google.com/search?newwindow=1&biw=1366&bih=659&q=' + site + '+' + product + '+product+reviews')
    gb = links.find(limit=30)
    print("~~~~~~~~~~~~print g2[25]~~~~~~~~~~~~~~~ ")
    print(gb[25])
    gb1 = str(gb)
    print("~~~~~~~~~~~print g2 group(1)~~~~~~~~~~~~~")
    print(str(gb1))
    gb2 = re.search("https://www.amazon.com/(.+?)(%|&)", gb1)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("LINK GRABBED IS: ")
    print(gb2)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    if gb2:
        # group(1) is only safe after the match check, so print it here
        print("print g2 group(1)")
        print(gb2.group(1))
        '''found = gb2.group(1)
        print(found[-10:])
        found1=found+'xyz'
        #time.sleep(4)
        gb3 = re.search('/(.+?)xyz',found1)
        found2=gb3.group(1)
        print(found2)
        time.sleep(500)
        frame =scrap(found2)'''
        found = gb2.group(1)
        print(found[-10:])
        frame = scrap(found[-10:])  # scrap is defined elsewhere in the project
    else:
        print('Not found')
    return frame

import re

import linkGrabber
import pandas as pd


def getlink(product):
    site = "snapdeal"
    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' + site + '+' + product + '+reviews')
    gb = links.find(limit=30)
    gb1 = str(gb)
    print(gb1)
    frame = pd.DataFrame()
    # dump the raw link list to a file for inspection
    with open("link2.txt", "w+") as f:
        print(gb1, file=f)
    gb2 = re.search("www." + site + ".com(.+?)&", gb1)
    if gb2:
        found = gb2.group(1)
        link = "https://www." + site + ".com" + found
        print(link)
        frame = scrap(link)  # scrap is defined elsewhere in the project
    else:
        print('not found')
    return frame

import re

import linkGrabber
import pandas as pd


def getlink(txt):
    site = "flipkart"
    product = txt
    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' + site + '+' + product + '+product+details')
    gb = links.find(limit=100)
    print(gb[25])
    gb1 = str(gb)
    frame = pd.DataFrame()
    gb2 = re.search("http://www.flipkart.com(.+?)(%|&)", gb1)
    if gb2:
        found2 = gb2.group(1)
        print(found2)
        #time.sleep(500)
        frame = scrap(found2)  # scrap is defined elsewhere in the project
    else:
        print('Not found')
    return frame

import re

import linkGrabber

links = linkGrabber.Links(
    'https://www.google.com/search?client=firefox-b-d&q=king')
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)

import linkGrabber

links = linkGrabber.Links(r"http://google.com")
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)

import re

import linkGrabber

links = linkGrabber.Links("https://devx.work")
#gb = links.find(limit=10, duplicates=False, pretty=True)
gb = links.find(pretty=True)
print(gb)

import re

import linkGrabber

links = linkGrabber.Links("https://www.google.com/")
gb = links.find(limit=5, duplicates=False, pretty=True)
print(gb)

import re
import pprint

import linkGrabber

links = linkGrabber.Links('http://www.xkcd.com/')
gb = links.find(limit=4, duplicates=False, pretty=True)
pprint.pprint(gb)
# I think I need some regular expressions under here but I'm still trying to
# learn them... But this works to get the URLs

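# A minimal sketch of the regular-expression filtering the comment above
# hints at: linkGrabber's find() accepts BeautifulSoup-style filters, so a
# compiled pattern on href keeps only matching anchors (the pattern here is
# an illustrative assumption, not from the original snippet).
import re
import linkGrabber

links = linkGrabber.Links('http://www.xkcd.com/')
# keep only anchors whose href matches the pattern
comics = links.find(href=re.compile('xkcd'), duplicates=False)
for link in comics:
    print(link['href'])
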
"www.google.com", "www.airbnb.com", # "www.hotels.com", # "www.amazon.com", "www.cars.com", "www.twitch.tv", "store.steampowered.com", "www.reuters.com", # "imgur.com", # "www.lowes.com", "www.cbssports.com", "www.nfl.com", # "www.expedia.com", "www.walmart.com", "www.wayfair.com", "bing.com", # "reddit.com", ] for u in initial_domains: print(u) links = linkGrabber.Links(f"https://{u}/") l = links.find(limit=100, href=re.compile("//" + u), duplicates=False) _c = lambda x: x if x.startswith("http") else "https:" + x paths.extend([_c(e["href"]) for e in l]) d = DomDataset(DATA_DIR + "/dom-dataset", paths) if download: d.download() d.process()
import re
import ssl

import linkGrabber

gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
#url = "http://mhrd.gov.in"
url = "http://upfireservice.gov.in/"
links = linkGrabber.Links(url)
gb = links.find(duplicates=False)
count = 0
for rec in gb:
    url_text = rec['text']
    url_href = rec['href']
    if '.pdf' in url_href:
        count += 1
        # resolve relative PDF links against the page URL
        if not url_href.startswith('http'):
            url_href = url + url_href
        #print(url_text)
        print(url_href)
print(count)

import linkGrabber

links = linkGrabber.Links(
    'http://allrecipes.com/recipes/276/desserts/cakes/?page=3#2')
# filters are passed as keyword arguments; ** unpacks the reserved word 'class'
gb = links.find(limit=10, duplicates=False, pretty=True,
                **{'class': 'pinterest'})
print(gb)

import os

import linkGrabber
from git import Repo
from github import Github

g = Github()
repo = g.get_repo("mdadams/jasper")
# git_link and git_issue_link are provided by the surrounding code
commit = repo.get_commit(sha=git_link.rsplit('/', 1)[-1])
parent_sha = commit.commit.parents[0].sha
print("sha : " + parent_sha)
directory = "C:\\LabWork\\" + parent_sha
git_issue_download_link = ''
if git_issue_link is None or git_issue_link == '':
    print("The git issue link is not found, can't reproduce the exploit")
else:
    links = linkGrabber.Links(git_issue_link)
    gb = links.find()
    for item in gb:
        if item['href'].endswith('.zip'):
            git_issue_download_link = item['href']
if not os.path.exists(directory):
    os.makedirs(directory)
if len(os.listdir(directory)) == 0:
    repo = Repo.clone_from(url="https://github.com/mdadams/jasper.git",
                           to_path="C:\\LabWork" + "\\" + parent_sha)
    print("Repository cloned")
    print("Checking out to the wanted commit")
    repo.git.checkout(parent_sha)

import re

import linkGrabber

# Explanation
# In this example we grab all the URLs and links inside a page. This is useful
# in web-surfing and crawling scenarios, and we do it without Selenium.
# We use the linkGrabber library, which is not among Python's default
# packages, so install it with pip. Open your cmd and type the command below:
#-----------------------------------------------------------------------------
#
#   pip install linkGrabber
#
#-----------------------------------------------------------------------------
# Parameters:
# * filters (dict): Beautiful Soup's filters as a dictionary
# * limit (int): Limit the number of links in sequential order
# * reverse (bool): Reverses how the list of <a> tags are sorted
# * sort (function): Accepts a function that accepts which key to sort upon
#   within the List class

links = linkGrabber.Links('https://google.com')
gb = links.find(limit=8, duplicates=False, pretty=True)
print(gb)

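# A short sketch exercising the reverse and sort parameters documented above
# (the URL and the sort key are illustrative assumptions): reverse flips the
# order of the matched <a> tags, and sort takes a function applied to each
# link dict returned by find().
import linkGrabber

links = linkGrabber.Links('https://google.com')
gb = links.find(limit=5, duplicates=False, reverse=True,
                sort=lambda link: link['text'])
print(gb)
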
import re

import linkGrabber

links = linkGrabber.Links("https://www.pitchvision.com/#/")
gb = links.find(pretty=True)
print(gb)

# a python script to read all the links from a webpage and print them
import re

import linkGrabber

links = linkGrabber.Links(
    "https://www.indeed.com/jobs?q=devops+engineer&sort=date")
gb = links.find(limit=20, pretty=True)
print(gb)