Exemplo n.º 1
0
    def run(self, keywords=None):
        """Search Bing for each keyword and collect the hit URLs in self.urls.

        keywords: optional list of search terms; when omitted or empty they
        are loaded from self.default_keyword_file (one keyword per line).
        Returns False when no keywords were given and the keyword file is
        missing, True otherwise.
        """
        # Bug fix: the original used a mutable default argument (keywords=[]).
        if not keywords:
            # Fall back to the keyword file; bail out if it does not exist.
            if not os.path.isfile(self.default_keyword_file):
                return False
            # Context manager replaces the manual open/close pair.
            with open(self.default_keyword_file, "r") as fp:
                keywords = [line.strip() for line in fp]

        self.keywords = keywords
        print("Using Keywords:{0}".format(self.keywords))

        try:
            # Get the hits for the given keywords
            bing = PyBingSearch(BING_API_KEY)
            for keyword in self.keywords:
                print("KEYWORD:{0}".format(keyword))
                # NOTE(review): self.maxResuts looks like a typo of
                # "maxResults", but renaming the attribute would break
                # whoever sets it elsewhere in the class.
                result_list, next_uri = bing.search(keyword, limit=self.maxResuts, format='json')
                for result in result_list:
                    print("Found URL:{0}".format(result.url))
                    self.urls.append(result.url)
        except Exception:
            # Best-effort: a Bing failure is reported, not raised.
            print("Something went wrong querying Bing.")

        return True
Exemplo n.º 2
0
    def run(self, keywords=None):
        """Search Bing for every keyword and append the hit URLs to self.urls.

        keywords: optional list of terms; when falsy they are read from
        self.default_keyword_file (one per line).  Returns False only when
        no keywords were supplied and that file is missing.
        """
        # Bug fix: mutable default argument replaced by None sentinel.
        if not keywords:
            # Check if the fallback keyword file exists.
            if not os.path.isfile(self.default_keyword_file):
                return False
            # Context manager guarantees the handle is closed.
            with open(self.default_keyword_file, "r") as fp:
                keywords = [line.strip() for line in fp]

        self.keywords = keywords
        print("Using Keywords:{0}".format(self.keywords))

        try:
            # Get the hits for the given keywords
            bing = PyBingSearch(BING_API_KEY)
            for keyword in self.keywords:
                print("KEYWORD:{0}".format(keyword))
                result_list, next_uri = bing.search(keyword,
                                                    limit=self.maxResuts,
                                                    format='json')
                for result in result_list:
                    print("Found URL:{0}".format(result.url))
                    self.urls.append(result.url)
        except Exception:
            # Best-effort: search failures are reported, not raised.
            print("Something went wrong querying Bing.")

        return True
Exemplo n.º 3
0
def bing_search(query):
    """Return the URLs of the top 10 Bing results for *query*.

    Commas in the query are replaced with spaces before searching.
    """
    bing = PyBingSearch('rLSasvRW9cvlU5fG9hoSGjJG2M1eIjR+Ld27nFC9Pj8=')
    buildquery = query.replace(',', ' ')
    # Bug fix: the sanitized query was built but never used -- the
    # original passed the raw `query` to search_all.
    result_list = bing.search_all(buildquery, limit=10, format='json')
    return [result.url for result in result_list]
Exemplo n.º 4
0
def bing_search(query):
    """Search Bing for *query* (commas converted to spaces) and return the
    top-10 result URLs."""
    bing = PyBingSearch('rLSasvRW9cvlU5fG9hoSGjJG2M1eIjR+Ld27nFC9Pj8=')
    buildquery = query.replace(',', ' ')
    # Bug fix: pass the sanitized query instead of the raw one (the
    # original computed `buildquery` and then ignored it).
    result_list = bing.search_all(buildquery, limit=10, format='json')
    bingurls = [result.url for result in result_list]
    return bingurls
Exemplo n.º 5
0
def GetLinksForQueryBing(query):
    """Return up to 20 non-YouTube result URLs from Bing for *query*.

    Returns an empty list when the search itself fails.
    """
    bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    try:
        result_list = bing.search_all(query, limit=20, format='json')
    except PyBingException:
        return []
    results = [result.url for result in result_list]
    # Slicing already clamps to the list length; min() was redundant.
    results = results[:20]
    # Drop YouTube hits ("not in" instead of .find() == -1).
    return [r for r in results if "youtube" not in r]
Exemplo n.º 6
0
def GetLinksForQueryBing(query):
    """Fetch Bing hits for *query*; give back at most 20 non-YouTube URLs,
    or an empty list if the search raises."""
    searcher = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    try:
        hits = searcher.search_all(query, limit=20, format='json')
    except PyBingException:
        return []
    urls = []
    for hit in hits:
        urls.append(hit.url)
    urls = urls[:min(20, len(urls))]
    kept = []
    for link in urls:
        if link.find("youtube") == -1:
            kept.append(link)
    return kept
 def bingSearch(self, numresult=10):
     """Return up to *numresult* result URLs for self.query via Bing.

     Iterates over the results actually returned instead of a fixed
     range, so a short result list no longer raises IndexError.
     """
     bing = PyBingSearch(self.bing_api_key)
     results, next_uri = bing.search(self.query,
                                     limit=numresult,
                                     format='json')
     # Bug fix: range(numresult) indexed past the end when Bing returned
     # fewer than numresult hits.
     return [hit.url for hit in results[:numresult]]
def bingSearch(link, limit=4):
    """Return up to *limit* UTF-8 encoded Bing result URLs for *link*.

    Best-effort fallbacks preserved from the original: retry without a
    trailing " news" suffix when the first query fails, and return the
    stripped query itself when everything else goes wrong.
    """
    try:
        # Sanitize input: the caller encodes "|" as "^".
        linkfile = link.replace("^", "|")
        bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
        try:
            result_list, next_uri = bing.search(linkfile, limit, format='json')
        except Exception:
            # Retry once with " news" stripped from the query.
            result_list, next_uri = bing.search(linkfile.replace(" news", ""), limit, format='json')
        returning = []
        for i in xrange(limit):
            try:
                returning.append(result_list[i].url.encode('utf8'))
            except Exception:
                # Fewer than `limit` results -- keep what we have.
                break
        return returning
    except Exception:
        # Total failure: fall back to the cleaned query string.
        return [link.replace(" news", "")]
Exemplo n.º 9
0
def get_improved_term(query):
    """Use Bing to map *query* to a canonical Wikipedia article title.

    Searches Bing for "<query> wikipedia" and returns the URL-decoded
    article name of the first en.wikipedia hit that is not a
    disambiguation page; falls back to the original query.
    """
    bing = PyBingSearch('') # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query, limit=3, format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" in wiki_url:
            # Bug fix: the original used "or", which is true whenever at
            # least one phrase is absent -- i.e. even for disambiguation
            # pages.  Both phrases must be absent before trusting the hit.
            if ("may refer to" not in wiki_desc) and ("may also refer to" not in wiki_desc):
                wiki_corr_term = wiki_url.split("/")[-1]
                try:
                    return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
                except UnicodeError:
                    # Undecodable title -- try the next result.
                    pass
    return query
Exemplo n.º 10
0
def getTopTen():
	"""Prompt for a query, fetch the top 10 Bing hits and crawl them.

	Relies on module-level state defined elsewhere in the file:
	``query``, ``pagesToBeCrawled``, ``fb``, plus ``fp``, ``pages`` and
	``checkUrl`` which are not declared here.
	"""
	global query
	global pagesToBeCrawled
	global fb
	bing = PyBingSearch('mMlCxUd5qmU5uDJ1w1VLbDkobVK905A9cZZhYkfqGHg=')
	query = raw_input("Enter a search query ")
	pagesToBeCrawled = input("Enter the number of pages you would like to be crawled? ")
	# NOTE(review): writes to ``fp`` although ``fb`` is the declared global --
	# possibly a typo; confirm which log-file handle is intended.
	fp.write('****************************The query searched for is:' + query + ", pages to be crawled: " + str(pagesToBeCrawled) + '\n')
	urlList, next_uri = bing.search(query, limit=10, format='json') # get the results
	for result in urlList:
		#initialUrls.append(result); # Add the initial lists to the list
		# NOTE(review): ``pages`` is not defined in this function -- assumed to
		# be a module-level crawl counter updated by checkUrl(); verify.
		if (pages > pagesToBeCrawled):
				print 'We have successfully crawled',pagesToBeCrawled,'pages'
				break
		checkUrl(result.url)
Exemplo n.º 11
0
def bingSearch(linkfile):
    """Search Bing for *linkfile*, persist the first result URL to
    bingResults.txt and return that URL as a string.
    """
    print("\nCalling bingSearch with arguments linkfile: {}:".format(str(linkfile)))
    # Sanitize input: the caller encodes "|" as "^".
    linkfile = linkfile.replace("^", "|")

    bing = PyBingSearch('XXXXX')
    # Get from bing:
    result_list, next_uri = bing.search(linkfile, limit=5, format='json')
    # Only the first hit is persisted (the original loop broke after one
    # iteration).  A context manager closes the handle, and renaming the
    # variable stops shadowing the ``file`` builtin.
    with open('bingResults.txt', 'w') as out:
        out.write('"' + result_list[0].url + '" ')
    print("\nbingSearch complete")
    return str(result_list[0].url)
Exemplo n.º 12
0
def GetLinksForQueryBing(query):
    """Return up to 20 Bing result URLs for *query*, or None on failure.

    (A dead, commented-out Google Custom Search implementation was
    removed from the body.)
    """
    try:
        bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
        result_list = bing.search_all(query, limit=20, format='json')
        results = [result.url for result in result_list]
    except Exception:
        # Any search failure (quota, network, bad key) maps to None.
        return None
    # Slicing already clamps to the list length; min() was redundant.
    return results[:20]
Exemplo n.º 13
0
def GetLinksForQueryBing(query):
    """Ask Bing for *query* and hand back at most 20 hit URLs; None when
    the search fails for any reason."""
    try:
        engine = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
        hits = engine.search_all(query, limit=20, format='json')
        collected = []
        for hit in hits:
            collected.append(hit.url)
    except:
        return None
    return collected[:min(20, len(collected))]
Exemplo n.º 14
0
def get_improved_term(query):
    """Resolve *query* to a Wikipedia article title via a Bing search.

    Returns the URL-decoded article name of the first en.wikipedia hit
    whose description does not look like a disambiguation page, or the
    original query when no suitable hit is found.
    """
    bing = PyBingSearch('')  # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query,
                                        limit=3,
                                        format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" in wiki_url:
            # Bug fix: "or" accepted almost every page (it is true whenever
            # either phrase is missing); "and" rejects disambiguation pages
            # containing either phrase.
            if ("may refer to" not in wiki_desc) and ("may also refer to"
                                                      not in wiki_desc):
                wiki_corr_term = wiki_url.split("/")[-1]
                try:
                    return str(
                        urllib.unquote(wiki_corr_term).decode('utf-8'))
                except UnicodeError:
                    # Undecodable title -- try the next result.
                    pass
    return query
Exemplo n.º 15
0
 def __init__(self):
     """Load the Bing API key from apikey.txt and prepare the searcher,
     browser-like HTTP headers and the summarizer."""
     # Context manager fixes the leaked file handle of the original
     # open("apikey.txt").read().
     with open("apikey.txt") as keyfile:
         self.APIKEY = keyfile.read()
     self.bing = PyBingSearch(self.APIKEY)
     # Browser-like headers to avoid being rejected as a bot.
     self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                     'Accept-Encoding': 'none',
                     'Accept-Language': 'en-US,en;q=0.8',
                     'Connection': 'keep-alive'}
     self.summarizer = Summarizer()
Exemplo n.º 16
0
def main(argv):
	"""Search Bing for argv[1] and write every hit URL to ``output_file``.

	Relies on module-level names defined elsewhere in the file: ``EXACT``
	(quote the query verbatim when true), ``output_file``, and a no-key
	``PyBingSearch`` variant whose ``search_all`` accumulates hits on
	``bing.list`` -- assumed from usage; confirm against that class.
	"""
	query = argv[1]

	if(EXACT):
		# Exact-match search: wrap the whole query in quotes.
		query = '"' + query + '"'
	print("Query:", query)
	
	bing = PyBingSearch()
	
	# Fetch up to 1000 results; they are stored on the searcher itself.
	bing.search_all(query, 1000)
	
	print("-----------------------")
	print("hits:", len(bing.list))
	print("writing results to:", output_file)
	output = open(output_file, 'w')
	
	for url in bing.list:
		output.write(url + "\n")
		
	print("writing finished")

	sys.exit()
Exemplo n.º 17
0
 def _hits(self,my_query):
     """Return an approximate hit count for *my_query* (float >= 0.01).

     "google" mode scrapes the result-count string out of the search HTML;
     "bing" mode counts the results returned by the API.  0.01 is added /
     returned so callers can take ratios or logs without dividing by zero.
     """
     if self.search_engine == "google":
         query = urllib.urlencode({'q' : my_query})
         # Random delay so the scraper looks less like a bot.
         time.sleep(randint(0,4))
         r = requests.get('https://www.google.com/search?' + query)
         # Regex over the raw HTML; group(1) is e.g. "About 1,234".
         searchres_param = "id=\"resultStats\">((About |)[0-9,]+) result(|s)</div>"
         print my_query
         try:
             count = re.search(searchres_param,r.text).group(1)
             if "About " in count:
                 # NOTE(review): str.strip removes a character SET, not the
                 # prefix -- it works here only because digits and commas are
                 # not in "About "; a .replace would be safer.
                 count = count.strip("About ")
             print "Result found"
             return (int(str(re.sub(',','',count))) + 0.01)
         except:
             # No resultStats marker (layout change / blocked request).
             print "No results"
             return 0.01
     elif self.search_engine == "bing":
         bing = PyBingSearch('xAFcyVsidXgkpQxwHYkPcPPPRGpjU2qlNtjBD6ZqGNU')
         result_list,next_url = bing.search(my_query)
         if len(result_list) > 0:
             return len(result_list) + 0.01
         else:
             return 0.01
Exemplo n.º 18
0
#pip install py-bing-search
#Blog Yazisi : http://bit.ly/1iEZHZt

from py_bing_search import PyBingSearch

bing = PyBingSearch('API-KEY')
result_list, next_uri = bing.search("Sorgu Cümleciği", limit=50, format='json')

# Write every result URL, one per line.  Open in text mode (the original
# opened "wb" but wrote str), use a context manager so the handle is
# always closed, and stop shadowing the ``file`` builtin.
with open("siteurl.txt", "w") as url_file:
    for result in result_list:
        url_file.write(result.url + "\n")
Exemplo n.º 19
0
# -*- coding: utf-8 -*-
__author__ = 'lufo'

from py_bing_search import PyBingSearch

# Demo: print the URL of each of the top 50 Bing hits for a fixed query
# (Python 2 print statement).
bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw')
result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json')

for result in result_list:
    print result.url
from py_bing_search import PyBingSearch
import os
import fileinput
import sys
import time
import re

# Force UTF-8 as the default codec (Python 2 idiom).
reload(sys)
sys.setdefaultencoding('utf8')
filename = sys.argv[1]
bing = PyBingSearch('UvG/iELD97We0KffqjrVFHwUrEHbe0ZCbeVfraImZRg')

# Derive the output base name from the input file name (strip ".txt").
outputfile = filename.replace('.txt', "")
input_text = []
output_text = []

# Load every line of the input file, stripping trailing whitespace.
text = open(filename, 'r')
for line in text.readlines():
	input_text.append(line.rstrip());

all_text_length = len(input_text)

badwords = outputfile + "-badwords.txt"
total = outputfile + "-result.txt"

# NOTE(review): these handles are left open on purpose -- code beyond
# this excerpt appears to write to them.
badwords_output = open(badwords, 'w')
total_output = open(total, 'w')

output = open(outputfile + ".json", 'w')

# Open the JSON document; entries are presumably appended later in the
# script (outside this excerpt).
output.write('{"result":[')
Exemplo n.º 21
0
class WebMd:
    """Finds WebMD treatment pages via Bing and extracts/summarizes them.

    Collaborators (defined elsewhere in the file): ``PyBingSearch``,
    ``BeautifulSoup``, ``Summarizer``, ``extract_instructions``,
    ``urllib2`` and ``requests``.
    """

    def __init__(self):
        # NOTE(review): the apikey.txt handle is never closed.
        self.APIKEY = open("apikey.txt").read()
        self.bing = PyBingSearch(self.APIKEY)
        # Browser-like headers to avoid being rejected as a bot.
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
        self.summarizer = Summarizer()

    def extractUrlStructuredText(self, url):
        """Extracts data from webmd url and provides a list of objects containing the heading and body
        """
        html = self.getUrl(url)    
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class':'hwDefinition_fmt'}) # better condition but doesn't always exist
        if soup == None:
            soup = Soup.find('div', {'id':'textArea'}) # generally always exists
        body = ""
        blocks = [] # list of objects containing heading and body
        heading = ""
        body = ""
        startNew = False
        skip = False
        # Walk every node: heading tags close the current block and start a
        # new one; script/style subtrees are skipped; text leaves accumulate
        # into the current body.
        for child in soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if skip:
                skip = False
                continue
            if startNew:
                # First node after a heading tag is the heading text itself.
                heading = child
                body = ""
                startNew = False
                continue
            if name in ['script', 'style']:
                skip = True
                continue
            if name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b']:
                blocks.append({'heading':heading, 'body':body})
                startNew = True
            if name is not None:
                pass
            elif not child.isspace(): # leaf node, don't print spaces
                body = body + " " + child
        # Drop the first block: it is the text before the first heading.
        if len(blocks)>1:
            return blocks[1::]
        return []
        
    def extractUrlText(self, url):
        """Extracts content text from webmd url
        """
        html = self.getUrl(url)    
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class':'hwDefinition_fmt'}) # better condition but doesn't always exist
        if soup == None:
            soup = Soup.find('div', {'id':'textArea'}) # generally always exists
        skipNext = False
        body = ""
        # Flat text extraction: skip script/style content, concatenate all
        # non-whitespace text leaves.
        for child in soup.recursiveChildGenerator():
            if skipNext:
                skipNext = False
                continue
            name = getattr(child, "name", None)
            if name in ["script", "style"]:
                skipNext = True
            if name is not None:
                pass
            elif not child.isspace(): # leaf node, don't print spaces
                body = body + child
        return body
                   
    def getUrl(self, url):
        """Attempts to summarize webpage contents (assuming webmd url) 
        """
        # Same browser-like headers as __init__ (duplicated locally).
        hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'none',
               'Accept-Language': 'en-US,en;q=0.8',
               'Connection': 'keep-alive'}
        req = urllib2.Request(url, headers=hdr)
        response = urllib2.urlopen(req).read()
        #response = requests.get(test_url)
        #response = urllib2.urlopen(test_url).read()        
        return response

    def isFirstAidPage(self, url):
        # True when the URL path contains the WebMD first-aid section.
        if url.find('/first-aid/') == -1:
            return False
        else:
            return True
        
    def search(self, s, limit=3):
        """Searches top limit number of bing searches.
           Returns the summarized/unsummarized data and the format code (0=no format, 1=formatted)
        """
        result_list, next_uri = self.bing.search(s + " treatment webmd", limit=limit, format='json')
        
        ########### Xiuyan's processing. First Aid type instruction format ##########
        for result in result_list:
            print(result.url)
            if self.isFirstAidPage(result.url):
                
                try:
                    page = requests.get(result.url)
                    print("piece of shit")
                    return (extract_instructions(page), 1)
                except:
                    print("entered Xiuyan's except")
                
        ########## Rahman's processing. Returns structured data representing all of first link #############
        try:
            blocks = self.extractUrlStructuredText(result_list[0].url)
            return (blocks, 1)
        except:
            # NOTE(review): this message prints on FAILURE despite reading
            # like a success message.
            print("Able to structure into headers and body")

        ########### Rahman's processing for 'other' pages. Attempts to summarize all first three links ###########  
        content = ""      
        for result in result_list:
            try:
                content = content + self.extractUrlText(result.url)
            except Exception, e:
                print(e)
                pass
        # NOTE(review): this condition looks inverted -- it summarizes an
        # EMPTY string when nothing was extracted and falls through to the
        # URL summarizer when text WAS extracted; likely meant `!= ""`.
        if content == "":
            print("Other WebMd Page")
            return (self.summarizer.summarizeText(content), 0)
        
        ########### Worst case: summarize first url ################
        print("Summarizing first")
        return (self.summarizer.summarizeUrl(result_list[0].url), 0)
def get_results(search):
    """Return the list of Bing result objects for *search*.

    Uses the module-level BING_SEARCH_KEY and NUM_SEARCH_RESULTS settings.
    """
    engine = PyBingSearch(BING_SEARCH_KEY)
    result_list, next_uri = engine.search(search,
                                          limit=NUM_SEARCH_RESULTS,
                                          format='json')
    return result_list
Exemplo n.º 23
0
from py_bing_search import PyBingSearch

# Fetch up to 50 hits for a fixed boolean keyword expression.
bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
result_list = bing.search_all(
    "(yawn) AND (other OR early) AND (people) AND (contagious OR catching) AND (room)",
    limit=50,
    format='json')

# Python 2 print statement: the [:10] slice binds to the list expression,
# so only the first 10 URLs are printed.
print[result.url for result in result_list][:10]
Exemplo n.º 24
0
from py_bing_search import PyBingSearch
import sys
import os

# The caller encodes "|" as "^" on the command line; undo that here.
linkfile = sys.argv[-1]
linkfile = linkfile.replace("^", "|");
bing=PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
result_list, next_uri = bing.search(linkfile, limit=5, format='json')
#result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json')
result_list[0].description

# NOTE(review): only the first URL is ever written (the loop breaks on its
# first iteration), the handle is never closed, and the variable shadows
# the ``file`` builtin.
file = open( 'bingResults.txt', 'w')
for res in result_list:
	file.write('"' + res.url + '" ')
	break
 def bingWikiSearch(self):
     """Return the URL of the top Bing hit for '<first query word> :wiki'."""
     first_word = self.query.split(" ")[0]
     engine = PyBingSearch(self.bing_api_key)
     hits, next_uri = engine.search(first_word + " :wiki", limit=1, format='json')
     return hits[0].url
Exemplo n.º 26
0
def GetLinksForQuery(query):
    """Collect up to 20 result URLs from Bing for *query*."""
    engine = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    urls = []
    for hit in engine.search_all(query, limit=20, format='json'):
        urls.append(hit.url)
    return urls[:20]
Exemplo n.º 27
0
from py_bing_search import PyBingSearch
import urllib
import urllib2
import json
import os
import socket

# Limit all network operations so a hung request cannot stall the script.
socket.setdefaulttimeout(5)

key = '4axpjG94pE8x9yUZqveY2LObcgNLVfX5oTW6+s5JbR0'
bing = PyBingSearch('4axpjG94pE8x9yUZqveY2LObcgNLVfX5oTW6+s5JbR0')

# Azure Datamarket uses HTTP Basic auth with an empty user name and the
# API key as the password.
credentialBing = 'Basic ' + (':%s' % key).encode(
    'base64')[:-1]  # the "-1" is to remove the trailing "\n" which encode adds

# Directory where the downloaded images will be stored.
photo_directory = 'bingBad'
if not os.path.exists(photo_directory):
    os.makedirs(photo_directory)
for offset in range(0, 50000, 50):
    bing_search_url = "https://api.datamarket.azure.com/Bing/Search/v1/Image?Query=%27bad%20photography%27&$format=json&$top=200&$skip=" + str(
        offset)

    request = urllib2.Request(bing_search_url)
    request.add_header('Authorization', credentialBing)
    requestOpener = urllib2.build_opener()
    response = requestOpener.open(request)

    results = json.load(response)

    for i, image in enumerate(results['d']['results']):
Exemplo n.º 28
0
# -*- coding: utf-8 -*-
__author__ = 'lufo'

from py_bing_search import PyBingSearch

# Demo: print the URL of each of the top 50 Bing hits for a fixed query
# (Python 2 print statement).
bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw')
result_list, next_uri = bing.search("Python Software Foundation",
                                    limit=50,
                                    format='json')

for result in result_list:
    print result.url
Exemplo n.º 29
0
from py_bing_search import PyBingSearch

# Fetch up to 50 hits for a fixed boolean query and print the first 10
# URLs (Python 2: the [:10] slice binds to the list being printed).
bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
result_list = bing.search_all("(yawn) AND (other OR early) AND (people) AND (contagious OR catching) AND (room)", limit=50, format='json')

print [result.url for result in result_list][:10]
Exemplo n.º 30
0
import json
import os
import requests
import time
from collections import defaultdict
from functools import reduce
from py_bing_search import PyBingSearch
import sys

# Edmunds vehicle API configuration: base URL plus the common query suffix.
apikey = 'wtprucmwrgk6bd92rq7tun97'
edmund_url = 'http://api.edmunds.com/api/vehicle/v2/'
end_url = '?fmt=json&view=full&api_key=' + apikey
bing = PyBingSearch('Np5rmrL6fIPP3jpDqVi+Li/rJ1Joih4Q6wP69HrjQro=')

# Auto-incrementing ids for the records built later in the script.
model_id = 1
make_id = 1
engine_id = 1

# Accumulators filled in by the functions defined below / outside this view.
models_list = []
makes_list = []
engines_list = []
make_ids = {}

# make -> [models]; fetch the full makes listing from the Edmunds API.
makes_models_dict = defaultdict(list)
makes_json = requests.get(edmund_url + 'makes' + end_url).json()


def add_engines(engine, model_id):
    global engine_id
    global engines_list
    global models_list
Exemplo n.º 31
0
    soup = BeautifulSoup(r)
    body = soup.find('body').text
    body = unicodedata.normalize('NFKD', body).encode('ascii', 'ignore')
    body = body.splitlines()
    body = [i for i in body if i != '']
    body = [x for x in body if len(x) > 70]
    body = map(cut, body)
    if len(body) < 5:
        indexes = range(0, len(body))
    else:
        indexes = [randint(0, len(body) - 1) for i in range(0, 5)]
    return ['"' + body[i] + '"' for i in indexes]


#Now the request
bing = PyBingSearch('1lQ7z/Ye5Qo/vuWoEuznwGUDQX841pfEkLC77SBTNCs')


#Function
def request_urls(url):
    """Bing-search several random statements taken from *url* and return the
    result URLs ranked by how often they recur across those searches.
    """
    gathered = []
    for sentence in rand_statements(url):
        result_list, next_uri = bing.search(sentence, limit=50, format='json')
        for item in result_list:
            ascii_url = unicodedata.normalize('NFKD', item.url).encode(
                'ascii', 'ignore')
            gathered.append(ascii_url)
    # Frequency count: URLs duplicated across statements rank first.
    return Counter(gathered).most_common()
Exemplo n.º 32
0
def GetLinksForQueryBing(query):
    """Return the URLs of (at most) the first 20 Bing hits for *query*."""
    client = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    hits = client.search_all(query, limit=20, format='json')
    collected = [h.url for h in hits]
    return collected[:20]