def bingSearch(link, limit=4):
    try:
        # Sanitize input
        linkfile = link.replace("^", "|")
        bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
        try:
            result_list, next_uri = bing.search(linkfile, limit, format='json')
        except:
            # Retry without the " news" qualifier if the first query fails
            result_list, next_uri = bing.search(linkfile.replace(" news", ""), limit, format='json')
        returning = []
        for i in xrange(limit):
            try:
                returning.append(result_list[i].url.encode('utf8'))
            except:
                # Bing returned fewer results than requested
                break
        return returning
    except:
        # Last resort: hand back the cleaned query itself as the only "result"
        return [link.replace(" news", "")]
def run(self, keywords=[]):
    if not keywords:
        # Check if file exists
        if not os.path.isfile(self.default_keyword_file):
            return False
        else:
            keywords = []
            fp = open(self.default_keyword_file, "r")
            for line in fp.readlines():
                keywords.append(line.strip())
            fp.close()
    self.keywords = keywords
    print "Using Keywords:{0}".format(self.keywords)
    try:
        # Get the hits for the given keywords
        bing = PyBingSearch(BING_API_KEY)
        for keyword in self.keywords:
            print "KEYWORD:{0}".format(keyword)
            result_list, next_uri = bing.search(keyword, limit=self.maxResuts, format='json')
            for result in result_list:
                url = result.url
                print "Found URL:{0}".format(url)
                self.urls.append(url)
    except:
        print "Something went wrong querying Bing."
    return True
def bingSearch(self, numresult=10):
    bing = PyBingSearch(self.bing_api_key)
    results, next_uri = bing.search(self.query, limit=numresult, format='json')
    # Slice defensively: Bing may return fewer hits than requested
    return [result.url for result in results[:numresult]]
def get_improved_term(query):
    bing = PyBingSearch('')  # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query, limit=3, format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" in wiki_url:
            # Skip disambiguation pages ("X may refer to..." / "X may also refer to...")
            if ("may refer to" not in wiki_desc) and ("may also refer to" not in wiki_desc):
                wiki_corr_term = wiki_url.split("/")[-1]
                try:
                    return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
                except:
                    pass
    return query
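# Hypothetical usage of get_improved_term above, not part of the original
# source: it requires a valid Bing API key to be filled in and network access.
# The idea is that Bing's ranking maps a misspelled term to its Wikipedia
# article slug; the query strings below are made-up examples.
#
#   print get_improved_term("pyton language")  # e.g. "Python_(programming_language)"
#   print get_improved_term("zzzznonsense")    # no usable Wikipedia hit: returns the query unchanged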
def getTopTen():
    global query
    global pagesToBeCrawled
    global fp
    bing = PyBingSearch('mMlCxUd5qmU5uDJ1w1VLbDkobVK905A9cZZhYkfqGHg=')
    query = raw_input("Enter a search query ")
    pagesToBeCrawled = input("Enter the number of pages you would like to be crawled? ")
    fp.write('****************************The query searched for is:' + query +
             ", pages to be crawled: " + str(pagesToBeCrawled) + '\n')
    # Get the results
    urlList, next_uri = bing.search(query, limit=10, format='json')
    for result in urlList:
        #initialUrls.append(result)  # Add the initial lists to the list
        # Stop once the crawl budget is exhausted
        if pages > pagesToBeCrawled:
            print 'We have successfully crawled', pagesToBeCrawled, 'pages'
            break
        checkUrl(result.url)
def bingSearch(linkfile):
    print "\nCalling bingSearch with argument linkfile: {}".format(str(linkfile))
    # Sanitize input
    linkfile = linkfile.replace("^", "|")
    bing = PyBingSearch('XXXXX')
    # Get from bing:
    result_list, next_uri = bing.search(linkfile, limit=5, format='json')
    #result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json')
    # Record only the top hit
    outfile = open('bingResults.txt', 'w')
    outfile.write('"' + result_list[0].url + '" ')
    outfile.close()
    print "\nbingSearch complete"
    return str(result_list[0].url)
def _hits(self, my_query):
    if self.search_engine == "google":
        query = urllib.urlencode({'q': my_query})
        # Random delay to avoid being rate-limited
        time.sleep(randint(0, 4))
        r = requests.get('https://www.google.com/search?' + query)
        searchres_param = "id=\"resultStats\">((About |)[0-9,]+) result(|s)</div>"
        print my_query
        try:
            count = re.search(searchres_param, r.text).group(1)
            if "About " in count:
                count = count.replace("About ", "")
            print "Result found"
            # Add a small constant so the count is never exactly zero
            return int(re.sub(',', '', count)) + 0.01
        except:
            print "No results"
            return 0.01
    elif self.search_engine == "bing":
        bing = PyBingSearch('xAFcyVsidXgkpQxwHYkPcPPPRGpjU2qlNtjBD6ZqGNU')
        result_list, next_url = bing.search(my_query)
        if len(result_list) > 0:
            return len(result_list) + 0.01
        else:
            return 0.01
#pip install py-bing-search
#Blog post (Turkish): http://bit.ly/1iEZHZt
from py_bing_search import PyBingSearch

file = open("siteurl.txt", "w")
bing = PyBingSearch('API-KEY')
result_list, next_uri = bing.search("Query phrase", limit=50, format='json')
for result in result_list:
    file.write(result.url + "\n")
file.close()
# -*- coding: utf-8 -*-
__author__ = 'lufo'

from py_bing_search import PyBingSearch

bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw')
result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json')
for result in result_list:
    print result.url
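# A reusable variant of the pattern repeated throughout these snippets, added
# here as a hedged sketch. It assumes only the PyBingSearch API exercised
# above (PyBingSearch(key).search(query, limit=..., format='json') returning
# a (result_list, next_uri) pair whose items expose .url); the function name
# safe_search_urls is hypothetical and not part of any original snippet.
from py_bing_search import PyBingSearch

def safe_search_urls(api_key, query, limit=10):
    try:
        bing = PyBingSearch(api_key)
        result_list, next_uri = bing.search(query, limit=limit, format='json')
        # Slice defensively in case Bing returns fewer hits than requested
        return [result.url for result in result_list[:limit]]
    except Exception:
        # Any Bing failure (bad key, quota, malformed query) yields an empty list
        return []

# Example (hypothetical key):
#   print safe_search_urls('YOUR-BING-API-KEY', 'Python Software Foundation', limit=5)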
for query_full in input_text:
    index = index + 1
    passphrase = re.sub(r"\'", '', query_full)
    query = query_full.split()
    for i in range(0, len(query)):
        query[i] = query[i].lower()
        print "query[" + str(i) + "] = " + query[i]
    new_list = []
    print passphrase
    # Handle exceptions in case of a bad query
    try:
        result_list, next_uri = bing.search(passphrase, limit=10, format='json')
    except:
        # Gather bad queries and skip them
        badwords_output.write(query_full + '\n')
        continue
    # Initialize the per-query match dictionary
    #matched_list = {"passphrase": passphrase, "cloestMatching": '', "maxMatchNumber" : 0, "Percentage":0}
    matched_list = {"passphrase": [], "title": [], "matchedWords": [],
                    "maxMatchNumber": [], "percentage": [], "uniqueMatchedWords": []}
class WebMd:
    def __init__(self):
        self.APIKEY = open("apikey.txt").read()
        self.bing = PyBingSearch(self.APIKEY)
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                        'Accept-Encoding': 'none',
                        'Accept-Language': 'en-US,en;q=0.8',
                        'Connection': 'keep-alive'}
        self.summarizer = Summarizer()

    def extractUrlStructuredText(self, url):
        """Extracts data from a WebMD url and provides a list of objects
        containing the heading and body."""
        html = self.getUrl(url)
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class': 'hwDefinition_fmt'})  # better condition but doesn't always exist
        if soup is None:
            soup = Soup.find('div', {'id': 'textArea'})  # generally always exists
        blocks = []  # list of objects containing heading and body
        heading = ""
        body = ""
        startNew = False
        skip = False
        for child in soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if skip:
                skip = False
                continue
            if startNew:
                # The previous tag was a heading: this child is its text
                heading = child
                body = ""
                startNew = False
                continue
            if name in ['script', 'style']:
                skip = True
                continue
            if name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b']:
                # A new heading closes out the current block
                blocks.append({'heading': heading, 'body': body})
                startNew = True
            if name is not None:
                pass
            elif not child.isspace():  # leaf node, don't keep bare whitespace
                body = body + " " + child
        if len(blocks) > 1:
            return blocks[1:]
        return []

    def extractUrlText(self, url):
        """Extracts content text from a WebMD url."""
        html = self.getUrl(url)
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class': 'hwDefinition_fmt'})  # better condition but doesn't always exist
        if soup is None:
            soup = Soup.find('div', {'id': 'textArea'})  # generally always exists
        skipNext = False
        body = ""
        for child in soup.recursiveChildGenerator():
            if skipNext:
                skipNext = False
                continue
            name = getattr(child, "name", None)
            if name in ["script", "style"]:
                skipNext = True
            if name is not None:
                pass
            elif not child.isspace():  # leaf node, don't keep bare whitespace
                body = body + child
        return body

    def getUrl(self, url):
        """Fetches the raw page contents (assuming a WebMD url)."""
        req = urllib2.Request(url, headers=self.headers)
        return urllib2.urlopen(req).read()

    def isFirstAidPage(self, url):
        return url.find('/first-aid/') != -1

    def search(self, s, limit=3):
        """Searches the top `limit` Bing results. Returns the
        summarized/unsummarized data and the format code
        (0 = no format, 1 = formatted)."""
        result_list, next_uri = self.bing.search(s + " treatment webmd", limit=limit, format='json')

        # Xiuyan's processing: first-aid pages follow an instruction format
        for result in result_list:
            print(result.url)
            if self.isFirstAidPage(result.url):
                try:
                    page = requests.get(result.url)
                    return (extract_instructions(page), 1)
                except:
                    print("Could not extract first-aid instructions")

        # Rahman's processing: return structured data representing all of the first link
        try:
            blocks = self.extractUrlStructuredText(result_list[0].url)
            return (blocks, 1)
        except:
            print("Unable to structure into headers and body")

        # Rahman's processing for 'other' pages: try to summarize all first three links
        content = ""
        for result in result_list:
            try:
                content = content + self.extractUrlText(result.url)
            except Exception, e:
                print(e)

        if content != "":
            print("Other WebMd Page")
            return (self.summarizer.summarizeText(content), 0)

        # Worst case: summarize the first url
        print("Summarizing first")
        return (self.summarizer.summarizeUrl(result_list[0].url), 0)
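# A minimal usage sketch for the WebMd class above, not part of the original
# source. It assumes apikey.txt holds a valid Bing key and that the
# Summarizer and extract_instructions dependencies the class relies on are
# importable; the query string is a hypothetical example.
if __name__ == "__main__":
    webmd = WebMd()
    data, fmt = webmd.search("snake bite", limit=3)
    if fmt == 1:
        # Formatted result: e.g. a list of {'heading': ..., 'body': ...} blocks
        for block in data:
            print(block['heading'])
            print(block['body'])
    else:
        # Unformatted result: plain summarized text
        print(data)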
def get_results(search):
    '-> _List_ of Bing result objects'
    bing = PyBingSearch(BING_SEARCH_KEY)
    results, next_uri = bing.search(search, limit=NUM_SEARCH_RESULTS, format='json')
    return results
from py_bing_search import PyBingSearch
import sys

# Take the query from the last command-line argument and sanitize it
linkfile = sys.argv[-1]
linkfile = linkfile.replace("^", "|")
bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
result_list, next_uri = bing.search(linkfile, limit=5, format='json')
#result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json')
# Record only the top hit
file = open('bingResults.txt', 'w')
file.write('"' + result_list[0].url + '" ')
file.close()
def bingWikiSearch(self):
    # Search Bing for the first word of the query plus a ":wiki" hint
    query = self.query.split(" ")[0] + " :wiki"
    bing = PyBingSearch(self.bing_api_key)
    results, next_uri = bing.search(query, limit=1, format='json')
    return results[0].url