def checkSnortRule(cve_id: str, good_snort_link: list, pause_sec_list) -> bool:
    '''
    Search Google for "<cve_id> snort rule" and append the hit to
    good_snort_link when the domain matches 'https://www.snort.org'
    or 'https://blog.snort.org'.
    '''
    # [*] Oddly, the results here can differ from a manual Google search: a
    # match here may not appear when googling by hand, but when it matches
    # here it usually matches manually too, so the result is worth considering.
    query = cve_id + ' snort rule'
    agent = googlesearch.get_random_user_agent()
    para = {
        'oq': cve_id + '+',
        'aqs': 'chrome.1.69i59l2.1406j0j1',
        'sourceid': 'chrome',
        'ie': 'UTF-8'
    }  # Unclear whether these extra parameters actually help
    pause_sec = random.choice(pause_sec_list)  # Randomize the pause to avoid being blocked by Google
    for entry in googlesearch.search(query, stop=5, user_agent=agent,
                                     extra_params=para, pause=pause_sec):
        # A match mostly appears in the top five results.
        if entry.startswith('https://www.snort.org') or entry.startswith('https://blog.snort.org'):
            good_snort_link.append(entry)
            return True
    return False
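A minimal usage sketch for checkSnortRule; the CVE id and pause values below are illustrative, not taken from the original code:

good_links = []
# Illustrative CVE id and pause choices (seconds).
if checkSnortRule('CVE-2020-0601', good_links, [5, 10, 15]):
    print('Snort rule reference found:', good_links[-1])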
def searchInGoogle(self, message: dict, maxCountOfLinks: int = 5) -> list:
    """
    Runs a Google search using the criteria received from the client
    (the MAS website).

    Parameters:
    --------
    message: dict
        dictionary with the job-search criteria from the client
    maxCountOfLinks : int, optional
        maximum number of pages to search. The larger this number, the
        longer the request takes to process. For testing purposes it makes
        sense to keep it small (within 10).

    Returns:
    --------
    array
        list of links that could be retrieved from Google
    -------
    """
    user_agent = googlesearch.get_random_user_agent()
    # Query string is Russian for 'Vacancies %s Grodno'.
    response = googlesearch.search('Вакансии %s Гродно' % (message['position']),
                                   country=message['country'],
                                   stop=maxCountOfLinks,
                                   user_agent=user_agent)
    urls = [x for x in response]
    self.log_info("> Parsed %s items. Sending to viewer..." % len(urls))
    return urls
def search_google(search, args):
    '''
    The function where googlesearch from Mario Vilas is called.
    '''
    s = search.split(',')
    search_stop = args.search_stop
    query = 'filetype:pdf'
    # query = 'site:%s filetype:pdf' % search
    urls = []
    try:
        for url in gs.search(query, num=20, domains=s, stop=search_stop,
                             user_agent=gs.get_random_user_agent()):
            urls.append(url)
    except urllib.error.HTTPError as e:
        return False, e
    except urllib.error.URLError as e:
        return False, e
    return True, urls
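A hedged usage sketch for search_google; argparse.Namespace stands in for the parsed command-line arguments, and the comma-separated domain list is made up:

import argparse

args = argparse.Namespace(search_stop=10)  # hypothetical parsed args
ok, value = search_google('example.com,example.org', args)
if ok:
    for url in value:
        print(url)
else:
    print('search failed:', value)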
def __init__(self, seq: str = None):
    self.__isvalid = True
    self.__number = 339960
    if seq is not None:
        seq = str(int(seq))
        if not 0 < int(seq) < self.__number:
            self.__isvalid = False
        self.__seq = 'A' + '0' * (6 - len(seq)) + seq
        self.__isrand = False
    else:
        self.__seq = str(choice(range(1, self.__number)))
        self.__seq = 'A' + '0' * (6 - len(self.__seq)) + self.__seq
        self.__isrand = True
    self.__url = f'https://oeis.org/search?q={self.__seq}'
    self.__html = bs(
        get(self.__url, headers={
            'User-Agent': str(get_random_user_agent())
        }).text, 'lxml')
    self.__elem = self.__html.find(
        'p', {
            'style': 'text-indent: -1em; margin-left: 1em; margin-top: 0; margin-bottom: 0;'
        })
    self.__table = self.__html.findAll('table')[9]
def _search():
    time_since_last_use = 0
    prush("Selecting an engine...")
    engine_name = ""
    while True:
        engine = random.choice(search_engines)()
        engine_name = engine.__class__.__name__
        if engine_name not in engine_times:
            break
        time_since_last_use = (datetime.now() - engine_times[engine_name]).total_seconds()
        if time_since_last_use < ENGINE_COOLDOWN_TIME:
            prush("Engine '{}' used too recently. Trying another...".format(engine_name))
        else:
            break
    engine.set_headers({'User-Agent': get_random_user_agent()})
    # Internally interpreted as sleep(random_uniform(*self._delay)).
    # This value is set low (zero) since we already pause between uses of
    # each engine (above).
    engine._delay = (0, 0)
    subject = random.choice(subjects) + " news"
    prush("Searching for subject '{}'...".format(subject))
    search_results = engine.search(subject, pages=SEARCH_PAGES).links()
    engine_times[engine_name] = datetime.now()
    prush("Found {} results for subject '{}'.".format(len(search_results), subject))
    return search_results
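For context, a minimal sketch (with illustrative values) of the cooldown bookkeeping that _search depends on: engine_times maps an engine class name to the datetime of its last use, and an engine becomes eligible again once ENGINE_COOLDOWN_TIME has elapsed:

from datetime import datetime

ENGINE_COOLDOWN_TIME = 60  # seconds; illustrative value
engine_times = {}          # engine class name -> datetime of last use

def cooled_down(name):
    # Usable if the engine has never run or its cooldown has elapsed.
    last = engine_times.get(name)
    return last is None or (datetime.now() - last).total_seconds() >= ENGINE_COOLDOWN_TIME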
def __scrape(self):
    if self.__html == '':
        self.__html = bs(
            get(self.__url, headers={
                'User-Agent': str(get_random_user_agent())
            }).text, 'lxml')
def search_pdf(search, args):
    '''
    The function where googlesearch from Mario Vilas is called.
    '''
    search_stop = args.search_stop
    query = '%s filetype:pdf' % search
    urls = []
    try:
        for url in gs.search(query, num=20, stop=search_stop,
                             user_agent=gs.get_random_user_agent()):
            # Parse the file name out of the URL.
            filename = find_name(url)
            # Add the file to the download queue.
            process_queue_data(filename, url, 'url')
            urls.append(url)
    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    # Return the collected URLs on success.
    return urls
def google_search_query(self, query, lang="id", maxSearch=10, **kwargs):
    dt = {
        "google_search_query": {
            "urls": [],
            "title": []
        },
    }
    for i in search(query, lang=lang, start=0, stop=maxSearch, tld="com",
                    safe="off", tbs="0", num=10, country=None, pause=1.5,
                    domains=None, tpe="",
                    user_agent=get_random_user_agent()):
        dt["google_search_query"]["urls"].append(i)
        # Fetch each result page and record its <title> text.
        tl = BeautifulSoup(requests.get(i).content, "html.parser")
        for title in tl.findAll("title"):
            dq = title.text.strip().replace("\n", "")
            dt["google_search_query"]["title"].append(dq)
    ggle = json.dumps(dt, indent=2, sort_keys=True)
    return json.loads(ggle)
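A usage sketch for google_search_query, assuming bot is an instance of the class defining the method; the returned dict holds roughly parallel urls and title lists (a page with multiple <title> tags can skew the pairing):

result = bot.google_search_query('python web scraping', lang='en', maxSearch=5)
for url, title in zip(result['google_search_query']['urls'],
                      result['google_search_query']['title']):
    print(title, '->', url)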
def google_search(query, resultsToRetrieve, userAgent, sleepDelay=1):
    """ Perform a Google search and handle exceptions. """
    try:
        # Materialize the generator here: iterating is what actually sends
        # the HTTP requests, so any HTTPError is raised inside this block,
        # and the results remain available to return afterwards.
        search_results = list(search(query, stop=resultsToRetrieve,
                                     user_agent=get_random_user_agent()))
    except urllib.error.HTTPError as e:
        if e.code == 429 or str(e) == "HTTP Error 429: Too Many Requests":
            print(colored('\tRejected, sleeping for ' + str(sleepDelay) +
                          ' min..', 'red'))
            sleep(sleepDelay * 60)
            print('\tRetrying..')
            # Retry with a doubled delay and propagate the result.
            return google_search(query, resultsToRetrieve, userAgent,
                                 sleepDelay * 2)
    else:
        return search_results
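A usage sketch for google_search; on HTTP 429 it sleeps and retries with a doubled delay (1, 2, 4, ... minutes), so a single call can block for a while. The query here is an arbitrary example:

# userAgent is unused inside google_search; a random one is chosen internally.
results = google_search('site:example.com report', resultsToRetrieve=10,
                        userAgent=None, sleepDelay=1)
if results is not None:
    for url in results:
        print(url)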
def findSites(query, qnt=None, site=''):
    site = [site if site else 'gov.br']
    for result in search(query,
                         lang='pt',
                         num=20,
                         start=0,
                         stop=int(qnt) if qnt else None,
                         pause=1,
                         domains=site,
                         user_agent=get_random_user_agent()):
        yield result
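findSites is a generator restricted to gov.br by default, so results stream in as Google returns them; a small consumption sketch with an example query:

for url in findSites('editais de licitação', qnt=5):
    print(url)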
def title_to_paper_link(title):
    try:
        paper_gen = googlesearch.search(title, num=10, stop=1,
                                        domains=PAPER_DOMAINS,
                                        user_agent=googlesearch.get_random_user_agent())
        paper_link = next(itertools.islice(paper_gen, 0, None))
    except Exception:
        # Fall back to an unrestricted search if the domain-limited one fails.
        paper_gen = googlesearch.search(title, num=10, stop=1,
                                        user_agent=googlesearch.get_random_user_agent())
        paper_link = next(itertools.islice(paper_gen, 0, None))
    return paper_link
def title_to_code_link(title):
    try:
        # Note the space: search for the title plus the keyword 'github'.
        code_gen = googlesearch.search(title + ' github', num=10, stop=1,
                                       domains=CODE_DOMAINS,
                                       user_agent=googlesearch.get_random_user_agent())
        code_link = next(itertools.islice(code_gen, 0, None))
    except Exception:
        # Fall back to an unrestricted search if the domain-limited one fails.
        code_gen = googlesearch.search(title + ' github', num=10, stop=1,
                                       user_agent=googlesearch.get_random_user_agent())
        code_link = next(itertools.islice(code_gen, 0, None))
    return code_link
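Both helpers above share a fallback pattern: a domain-restricted search first, then an unrestricted retry. A usage sketch, assuming PAPER_DOMAINS and CODE_DOMAINS are defined elsewhere in the module and using an illustrative title:

title = 'Attention Is All You Need'
print('paper:', title_to_paper_link(title))
print('code: ', title_to_code_link(title))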
def go(self):
    i = 1
    for dork in self.google_dorks:
        try:
            dork = dork.strip()
            self.links = []  # Stores URLs with files; cleared out for each dork.
            # Build the query used to collect links.
            if self.domain:
                query = dork + " site:" + self.domain
            else:
                query = dork
            pause_time = self.delay + random.choice(self.jitter)
            print("[*] Search ( " + str(i) + " / " + str(len(self.google_dorks)) +
                  " ) for Google dork [ " + query + " ] and waiting " +
                  str(pause_time) + " seconds between searches")
            for url in googlesearch.search(
                    query,
                    start=0,
                    stop=self.search_max,
                    num=100,
                    pause=pause_time,
                    extra_params={'filter': '0'},
                    user_agent=googlesearch.get_random_user_agent()):
                self.links.append(url)
            # Since googlesearch.search retrieves URLs in batches of 100,
            # ensure the list only contains the requested amount.
            if len(self.links) > self.search_max:
                self.links = self.links[:self.search_max]
            print("[*] Results: " + str(len(self.links)) +
                  " sites found for Google dork: " + dork)
            for foundDork in self.links:
                print(foundDork)
            self.total_dorks += len(self.links)
            # Only save links with valid results to an output file.
            if self.save_links and self.links:
                f = open(self.log_file, 'a')
                f.write('#: ' + dork + "\n")
                for link in self.links:
                    f.write(link + "\n")
                f.write("=" * 50 + "\n")
                f.close()
        except Exception:
            print("[-] ERROR with dork: " + dork)
        i += 1
    self.fp.close()
    print("[*] Total dorks found: " + str(self.total_dorks))
def get_category_for_archive_element(product_name):
    user_agent = get_random_user_agent()
    response = search("xkom " + product_name, start=0, stop=5, num=5,
                      user_agent=user_agent)
    try:
        result = ''
        # Skip results until we hit an x-kom product page (.html URL).
        while 'x-kom' not in result or '.html' not in result:
            result = next(response)
            print(result)
        category = get_category(result)
    except StopIteration:
        category = "Brak"
    return category
def go_gle(query):
    my_results_list = []
    for i in search(
            query,                               # Search expression
            tld='com',                           # Top-level domain
            lang='en',                           # Language
            num=10,                              # Number of results per page
            start=0,                             # First result to retrieve
            stop=None,                           # Last result to retrieve
            pause=0,                             # Lapse between HTTP requests
            user_agent=get_random_user_agent(),
    ):
        my_results_list.append(i)
    return '\n'.join(my_results_list)
def get_value(companyname):
    accurateset = OrderedSet()
    multipleset = OrderedSet()
    try:
        user_agent_str = googlesearch.get_random_user_agent()
        logger.info('google search for %s', companyname)
        for j in googlesearch.search(companyname, tld="co.in", num=3, stop=3,
                                     pause=1, user_agent=user_agent_str):
            correct = True
            for item in bannedlist:
                if item in j:
                    correct = False
                    break
            if correct:
                try:
                    newstr = re.split("www.", j)[1]
                except IndexError:
                    newstr = re.split("//", j)[1]
                finamdomain = re.split("/", newstr)[0]
                if finamdomain[0] == ".":
                    # str.replace returns a new string, so reassign.
                    finamdomain = finamdomain.replace(".", "", 1)
                multipleset.add(finamdomain)
                pattren_str = re.compile(r'([a-z]+)')
                firstname = re.search(pattren_str, companyname.lower()).group(0)
                if firstname in finamdomain:
                    accurateset.add(finamdomain)
    except urllib.error.HTTPError as httperr:
        # Dump httperr.headers to see if there's more information.
        return {'error': 'captcha'}
    data = {
        'companyname': companyname,
        'accurate': list(accurateset),
        'multiple': list(multipleset)
    }
    return data
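get_value returns either a result dict or {'error': 'captcha'} when Google blocks the request, so callers should check for the error key first; the company name below is an example:

info = get_value('acme corporation')
if info.get('error') == 'captcha':
    print('Blocked by Google; try again later.')
else:
    print('likely domains:', info['accurate'] or info['multiple'])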
def google_search(domain):
    target_url = ''
    kw = '日常生活用具'  # Japanese for "daily living equipment"
    query = "{} {}".format(domain, kw)
    print('domain: ' + domain)
    try:
        for url in search(query, lang='ja', stop=1, pause=3.0,
                          user_agent=get_random_user_agent()):
            print(url)
            target_url = url
    except Exception as e:
        print(e)
        target_url = None
    return target_url
def __init__(self, query):
    self.__query = query
    self.__url = f'https://scholar.google.it/scholar?hl=en&as_sdt=0%2C5&q={parse.quote_plus(query)}&btnG=&oq=we'
    self.__html = bs(
        get(self.__url, headers={
            'User-Agent': str(get_random_user_agent())
        }).text, 'lxml')
    ids = (i['data-cid'] for i in self.__html.findAll(
        'div', {'class': 'gs_r gs_or gs_scl'}))
    gs = self.__html.findAll('div', {'class': 'gs_fl'})
    # Materialize as a list: both generators below iterate over it, and a
    # shared generator would be exhausted by whichever is consumed first.
    a = [i.findAll('a') for i in gs]
    self.__cit = (i[2].text.split()[-1] for i in a if len(i) > 2)
    self.__rel = ('https://scholar.google.it' + i[3]['href'] for i in a
                  if len(i) > 3)
    self.__results = [
        self.__res(i['href'], i.text, next(self.__rel))
        for i in (self.__html.find('a', {'id': i}) for i in ids)
    ]
def search_google(word, stp=5):
    # Run the search query.
    query = str(word)
    query_result = search(query=query, tld='com', lang='en', num=5, start=0,
                          stop=stp)
    results = []
    for res in query_result:
        res = filter_result(res)
        html = get_page(res, get_random_user_agent())
        results.append({'link': res, 'page': html})
    return results
def hits_google(search, args):
    '''
    The function where googlesearch from Mario Vilas is called.
    '''
    s = search.split(',')
    query = 'filetype:pdf'
    try:
        hits = gs.hits(query, domains=s, user_agent=gs.get_random_user_agent())
    except urllib.error.HTTPError as e:
        return False, e
    except urllib.error.URLError as e:
        return False, e
    except IndexError as e:
        return False, e
    return True, hits
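Like search_google, hits_google reports failure through an (ok, value) pair instead of raising; a short sketch with an example domain list (args is unused inside hits_google, so None suffices here):

ok, value = hits_google('example.com,example.org', None)
if ok:
    print('approximate number of PDF hits:', value)
else:
    print('lookup failed:', value)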
def gdork(domain):
    '''
    Run Google dorks against a domain.
    '''
    try:
        dorks = {
            'Login portal': [f'site:{domain} intext:login'],
            'Files': [f'site:{domain} filetype:pdf'],
        }
        result = ''
        for dork_type in dorks.keys():
            user_agent = get_random_user_agent()
            result += f'=================\n{dork_type.upper()}\n=================\n'
            for dork in dorks[dork_type]:
                data = search(dork, user_agent=user_agent)
                for link in data:
                    print(link)
                    result += (link + "\n")
            result += '\n'
        return result
    except (gaierror, URLError):
        return 0
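gdork returns the formatted report string on success and 0 on DNS or URL errors, so callers must check which they got; the domain below is an example:

report = gdork('example.com')
if report == 0:
    print('network error while dorking')
else:
    print(report)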
def get_random_agent():
    return gs.get_random_user_agent()
def get_user_agent():
    return googlesearch.get_random_user_agent()
highSeverity = ['.conf', '.cnf', '.cfg', '.env', '.sql', '.dbf', '.mdb',
                '.log', '.bak', '.htaccess']
mediumSeverity = ['.txt', '.csv', 'admin', 'git', 'svn', 'ini']
testingUrl = str(results.url)
resultsnumber = int(results.resultcount)
verbose = results.verbose
waitTime = results.wait

print("Searching....\n")
try:
    for dork in dorkDict:
        if verbose:
            print("[Testing " + dork + "]")
        results = []
        finishedDork = "site:" + testingUrl + " " + dorkDict[dork]
        for x in googlesearch.search(finishedDork, lang='en',
                                     num=resultsnumber, start=0,
                                     stop=resultsnumber, pause=waitTime,
                                     user_agent=googlesearch.get_random_user_agent()):
            results.append(x)
        if len(results) > 0:
            print(colours.OKGREEN + "[" + dork + "]" + colours.ENDC)
            for x in results:
                # Colour-code each hit by the severity of the matched pattern.
                if any(ele in x for ele in highSeverity):
                    print(colours.FAIL + x + colours.ENDC)
                elif any(ele in x for ele in mediumSeverity):
                    print(colours.WARNING + x + colours.ENDC)
                else:
                    print(x)
except Exception as e:
    print(e)
myLowPause = 5
myHighPause = 15
myDate = date.today()
nbTrials = 0
myTLD = "com"  # Google TLD -> we search in google.com
myHl = "en"    # in English

# This may take a long time.
while myStart < myMaxStart:
    print("PASSAGE NUMBER :" + str(myStart))
    print("Query:" + myKeyword)
    # Change the user agent and the pause to avoid being blocked by Google.
    myPause = random.randint(myLowPause, myHighPause)  # long pause
    print("Pause:" + str(myPause))
    # Change the user agent; optionally embed the local language in it.
    # myUserAgent = getRandomUserAgent(myconfig.userAgentsList, myUserAgentLanguage)
    myUserAgent = googlesearch.get_random_user_agent()
    print("UserAgent:" + str(myUserAgent))
    # myPause = myPause * (nbTrials + 1)  # Increase the pause if a trial gets nothing.
    # print("Pause:" + str(myPause))
    try:
        urls = googlesearch.search(query=myKeyword,
                                   tld=myTLD,
                                   lang=myHl,
                                   safe='off',
                                   num=myNum,
                                   start=myStart,
                                   stop=myStop,
                                   domains=None,
                                   pause=myPause,
                                   tpe='',
                                   user_agent=myUserAgent)
from googlesearch import search, get_random_user_agent
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-v', dest='vuln', action='store',
                    help='The vulnerability that you want to gather h1 reports for',
                    required=True)
parser.add_argument('-n', dest='numberOfReports', action='store', type=int,
                    default=100,
                    help='How many reports you want to gather. DEFAULT: 100')
parser.add_argument('-p', dest='seconds', action='store', type=float,
                    default=2.0,
                    help='How many seconds to wait between HTTP requests; '
                         'setting this is highly recommended so we can avoid '
                         'getting banned by Google. DEFAULT: 2')
args = parser.parse_args()

numberOfReports = args.numberOfReports
vuln = args.vuln
seconds = args.seconds
user_agent = get_random_user_agent()
query = f"site:hackerone.com inurl:/reports/ intext:{vuln}"


def banner():
    print(r"""
  ____ ____ ___ _____ _   _
 / ___|  _ \|_ _|_   _| | | |
| |  _| |_) || |  | | | |_| |
| |_| |  _ < | |  | | |  _  |
 \____|_| \_\___| |_| |_| |_|
        a tool created by 0xShin
""")


def search_links(query):
    reports = []
    links = search(query, stop=numberOfReports, pause=seconds,
                   user_agent=user_agent)
    print('[+] Currently gathering links [+]')
    for link in links:
        reports.append(link)
    return reports


def outputToFile(reports):
def user_agent():
    # Print a randomly chosen user agent.
    print(">> User agent :")
    print(googlesearch.get_random_user_agent())
def main():
    with open('county_list.csv', newline='') as f:
        engine_times = dict()
        r = csv.reader(f, delimiter=',')
        for row in r:
            county, state = row[0], row[1]
            prush("{}, {}...".format(county, state))
            time_since_last_use = 0
            engine_name = ""
            while True:
                # This does basically constitute a busy loop if all engines
                # are in a cooldown period, but since this is single threaded,
                # I'm not too concerned.
                engine = random.choice(search_engines)()
                engine_name = engine.__class__.__name__
                if engine_name not in engine_times:
                    break
                time_since_last_use = (
                    datetime.now() - engine_times[engine_name]).total_seconds()
                if time_since_last_use >= ENGINE_MIN_COOLDOWN_SECS:
                    break
            engine.set_headers({'User-Agent': get_random_user_agent()})
            subject = PREFERRED_SEARCH_TEMPLATE.format(county, state)
            search_results = engine.search(subject, pages=SEARCH_PAGES).links()
            engine_times[engine_name] = datetime.now()
            if len(search_results) == 0:
                subject = ALTERNATE_SEARCH_TEMPLATE.format(county, state)
                # A random-uniform wait between successive calls to the same
                # engine adds some delay and jitter, making it just slightly
                # harder to get rate-limited.
                time.sleep(random.uniform(ALTERNATE_SEARCH_MIN_WAIT_SECS,
                                          ALTERNATE_SEARCH_MAX_WAIT_SECS))
                search_results = engine.search(subject, pages=SEARCH_PAGES).links()
            title = fmt_title(engine_name, subject)
            access_time = fmt_access_time()
            markdown = ""
            with open(state + "/" + county + ".md", "r") as county_file:
                markdown = county_file.read()
            if len(search_results) == 0 or search_results[0] in markdown:
                continue
            uri = select_best_search_result(search_results)
            if len(markdown.strip()) == 0 or NO_TIPS_PLACEHOLDER.lower() in markdown.lower():
                markdown = fmt_page_heading(county, state)
            markdown = markdown + fmt_entry(title, uri, access_time)
            with open(state + "/" + county + ".md", "w") as county_file:
                county_file.write(markdown)
#myTbs= "qdr:m" #recherche sur le dernier mois. pas utilisé. #tbs=myTbs, #pause assez importante pour ne pas bloquer affiner les valeurs si besoin myLowPause = 15 myHighPause = 45 #on boucle (peut durer plusieurs heures - faites cela pendant la nuit :-) !!!) while i < len(myQueries): myQuery = myQueries[i] print("PASSAGE NUMERO :" + str(i)) print("Query:" + myQuery) #on fait varier le user_agent et la pause pour ne pas se faire bloquer myPause = random.randint( myLowPause, myHighPause) #pause assez importante pour ne pas bloquer. print("Pause:" + str(myPause)) myUserAgent = googlesearch.get_random_user_agent( ) #modification du user_agent pour ne pas bloquer print("UserAgent:" + str(myUserAgent)) df = pd.DataFrame(columns=['query', 'page', 'position', 'source']) #dataframe de travail try: urls = googlesearch.search(query=myQuery, tld='fr', lang='fr', safe='off', num=myNum, start=myStart, stop=myStop, domains=None, pause=myPause, only_standard=False, extra_params={},
# -*- coding: UTF-8 -*-
__title__ = 'potato'
__version__ = '1.0.1'
__author__ = '@gyscordia'
__license__ = 'MIT'
__copyright__ = 'Copyright 2020 by Me'

try:
    from googlesearch import search
except ImportError:
    print("dammmm..")

import googlesearch
import os
import sys
from datetime import datetime

agente = googlesearch.get_random_user_agent()
data = str(datetime.today())
horario = data[0:19]
print("getting urls... {}".format(horario))

try:
    for j in search(query=sys.argv[1],
                    tld=sys.argv[2],
                    lang=sys.argv[3],
                    num=int(sys.argv[4]),
                    stop=int(sys.argv[4]),
                    pause=5,
                    user_agent=agente):
        print(j)
except IndexError:
    print('try: ' + __file__ +
          ' search (query you want to use) com(top level domain) language(pt-br, en) num(number of urls, eg. 10)')
    print('example: ' + __file__ + ' noticias com pt-br 50')