Example #1
import operator


def printResults(newRankList, uniqueList, googleResults, bingResults,
                 googleURLs, bingURLs, ul, filename):
    # Build one result record per unique URL, taking the title and
    # description from whichever engine returned that URL.
    for i in range(0, len(uniqueList)):
        result = struc()  # struc: simple record class, defined elsewhere
        result.url = uniqueList[i]
        result.rank = ul[i]
        if uniqueList[i] in googleURLs:
            ind = googleURLs.index(uniqueList[i])
            result.title = googleResults[ind].title
            result.description = googleResults[ind].description
        else:
            ind = bingURLs.index(uniqueList[i])
            result.title = bingResults[ind].title
            result.description = bingResults[ind].description
        newRankList.append(result)

    # Sort the merged results by their aggregated rank.
    newRankList.sort(key=operator.attrgetter('rank'))

    with open(filename, "w") as outfile:
        count = 0
        for result in newRankList:
            count = count + 1
            outfile.write(
                str(count) + '    ' + str(result.title) + '    ' +
                str(result.description) + '\n')
    return
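All of the examples below also rely on a few module-level names that are not shown in the snippets: a struc record class, a numresults setting, and a union helper. A minimal sketch of what they might look like; the exact originals are not part of this listing, so treat these as assumptions:

class struc:
    # Simple mutable record for one search result (assumed definition).
    def __init__(self):
        self.rank = 0
        self.title = ''
        self.description = ''
        self.url = ''

numresults = 10  # how many results to request; the real value is not shown

def union(a, b):
    # Order-preserving union of two URL lists (assumed behavior).
    return a + [x for x in b if x not in a]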
Example #2
import urllib
from urllib2 import Request, urlopen
from bs4 import BeautifulSoup


def bing_scrape(query):
    bingResults = []
    # quote_plus keeps multi-word queries URL-safe (as google_scrape does).
    address = "https://www.bing.com/search?q=" + urllib.quote_plus(
        query) + "&count=" + str(numresults)
    request = Request(
        address, None, {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page, 'html5lib')

    titles = []
    descriptions = []
    urls = []

    # Each organic Bing result is an <li class="b_algo"> element.
    headers = soup.findAll('li', attrs={'class': 'b_algo'})
    for header in headers:
        if header is not None:
            anchor = header.find('a')
            desc = header.find('p')
            if anchor is not None and desc is not None:
                t = anchor.contents[0].encode("utf-8")
                u = anchor['href']
                urls.append(u)
                titles.append(t)
                d = desc.text.encode("utf-8")
                descriptions.append(d)

    # Titles, URLs, and descriptions are appended together above, so the
    # three lists stay aligned; size just caps the loop.
    size = len(titles)

    filename = query + '_Bing.txt'
    with open(filename, "w") as outfile:
        for i in range(0, size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(
                str(i + 1) + '    ' + str(titles[i]) + ' ' +
                str(descriptions[i]) + '\n')
            bingResults.append(result)

    return bingResults
Example #3
import urllib
import urllib2
from bs4 import BeautifulSoup


def yahoo_scrape(query):
    yahooResults = []
    address = "https://search.yahoo.com/search?p=" + urllib.quote_plus(
        query) + "&n=" + str(numresults)
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page, 'html5lib')  # explicit parser, as in bing_scrape

    titles = []
    descriptions = []
    urls = []

    # Titles/URLs and descriptions come from separate element lists here,
    # so the code relies on Yahoo emitting them in matching order.
    headers = soup.findAll('div', 'compTitle')
    for header in headers:
        t = header.a.text.encode('utf-8')
        u = header.a.get('href')
        urls.append(u)
        titles.append(t)

    desclist = soup.findAll('div', 'compText aAbs')
    for desc in desclist:
        d = desc.text.encode('utf-8')
        descriptions.append(d)

    # Guard against Yahoo returning fewer descriptions than titles.
    size = min(len(titles), len(descriptions))

    filename = query + '_Yahoo.txt'
    with open(filename, "w") as outfile:
        for i in range(0, size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(
                str(i + 1) + '    ' + titles[i] + ' ' + descriptions[i] + '\n')
            yahooResults.append(result)

    return yahooResults
Example #4
def aggregate(googleResults, yahooResults):
    # Collect the URL list from each engine once; these lists double as the
    # lookup tables used when writing the aggregated file below.
    googleURLs = []
    yahooURLs = []
    for result in googleResults:
        googleURLs.append(result.url)

    for result in yahooResults:
        yahooURLs.append(result.url)

    # union (defined elsewhere) merges the two URL lists without duplicates.
    uniqueList = union(googleURLs, yahooURLs)

    # Common Results
    #print(intersect(googleURLs, yahooURLs))
    print str(len(googleURLs)) + ' Google results, ' + str(
        len(yahooURLs)) + ' Yahoo results, ' + str(
            len(uniqueList)) + ' unique results'

    # Writing Aggregated Documents: prefer Google's copy of a result when
    # both engines returned the same URL.
    filename = 'UniqueDocuments.txt'
    with open(filename, "w") as outfile:
        count = 0
        for link in uniqueList:
            if link in googleURLs:
                count = count + 1
                i = googleURLs.index(link)
                outfile.write(
                    str(count) + '    ' + googleResults[i].title + ' ' +
                    googleResults[i].description + '\n')
            elif link in yahooURLs:
                count = count + 1
                i = yahooURLs.index(link)
                outfile.write(
                    str(count) + '    ' + yahooResults[i].title + ' ' +
                    yahooResults[i].description + '\n')
    return uniqueList
Example #5
import urllib
import urllib2
from bs4 import BeautifulSoup


def google_scrape(query):
    googleResults = []
    address = "http://www.google.com/search?q=" + urllib.quote_plus(
        query) + "&num=" + str(numresults + 2) + "&hl=en&start=0"
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page, 'html5lib')  # explicit parser, as in bing_scrape

    titles = []
    descriptions = []
    urls = []

    # Each organic Google result sits in a <div class="rc"> element;
    # descriptions live separately in <span class="st"> elements.
    headers = soup.findAll('div', 'rc')
    for header in headers:
        t = header.a.text.encode('utf-8')  # .text handles nested markup where .string is None
        u = header.a.get('href')
        urls.append(u)
        titles.append(t)

    desclist = soup.findAll('span', 'st')
    for desc in desclist:
        d = desc.text.encode('utf-8')
        descriptions.append(d)

    # Guard against Google returning fewer descriptions than titles.
    size = min(len(titles), len(descriptions))
    filename = query + '_Google.txt'
    with open(filename, "w") as outfile:
        for i in range(0, size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(
                str(i + 1) + '    ' + titles[i] + ' ' + descriptions[i] + '\n')
            googleResults.append(result)
    return googleResults
Example #6
import urllib
import urllib2
from bs4 import BeautifulSoup


def yahoo_scrape(query):
    yahooResults = []
    address = "https://search.yahoo.com/search?p=" + urllib.quote_plus(query) + "&n=" + str(numresults)
    request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page, 'html5lib')  # explicit parser

    titles = []
    descriptions = []
    urls = []

    # Titles/URLs and descriptions come from separate element lists here,
    # so the code relies on Yahoo emitting them in matching order.
    headers = soup.findAll('div', 'compTitle')
    for header in headers:
        t = header.a.text.encode('utf-8')
        u = header.a.get('href')
        urls.append(u)
        titles.append(t)

    desclist = soup.findAll('div', 'compText aAbs')
    for desc in desclist:
        d = desc.text.encode('utf-8')
        descriptions.append(d)

    # Guard against Yahoo returning fewer descriptions than titles.
    size = min(len(titles), len(descriptions))
    
    filename = query + '_Yahoo.txt'
    with open(filename, "w") as outfile:
        for i in range(0, size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(str(i + 1) + '    ' + titles[i] + ' ' + descriptions[i] + '\n')
            yahooResults.append(result)

    return yahooResults
Example #7
def aggregate(googleResults, yahooResults):
    # Collect the URL list from each engine once; these lists double as the
    # lookup tables used when writing the aggregated file below.
    googleURLs = []
    yahooURLs = []
    for result in googleResults:
        googleURLs.append(result.url)

    for result in yahooResults:
        yahooURLs.append(result.url)

    # union (defined elsewhere) merges the two URL lists without duplicates.
    uniqueList = union(googleURLs, yahooURLs)

    # Common Results
    #print(intersect(googleURLs, yahooURLs))
    print str(len(googleURLs)) + ' Google results, ' + str(len(yahooURLs)) + ' Yahoo results, ' + str(len(uniqueList)) + ' unique results'

    # Writing Aggregated Documents: prefer Google's copy of a result when
    # both engines returned the same URL.
    filename = 'UniqueDocuments.txt'
    with open(filename, "w") as outfile:
        count = 0
        for link in uniqueList:
            if link in googleURLs:
                count = count + 1
                i = googleURLs.index(link)
                outfile.write(str(count) + '    ' + googleResults[i].title + ' ' + googleResults[i].description + '\n')
            elif link in yahooURLs:
                count = count + 1
                i = yahooURLs.index(link)
                outfile.write(str(count) + '    ' + yahooResults[i].title + ' ' + yahooResults[i].description + '\n')
    return uniqueList
Example #8
import operator


def printResults(newRankList, uniqueList, googleResults, yahooResults, googleURLs, yahooURLs, ul, filename):
    # Build one result record per unique URL, taking the title and
    # description from whichever engine returned that URL.
    for i in range(0, len(uniqueList)):
        result = struc()
        result.url = uniqueList[i]
        result.rank = ul[i]
        if uniqueList[i] in googleURLs:
            ind = googleURLs.index(uniqueList[i])
            result.title = googleResults[ind].title
            result.description = googleResults[ind].description
        else:
            ind = yahooURLs.index(uniqueList[i])
            result.title = yahooResults[ind].title
            result.description = yahooResults[ind].description
        newRankList.append(result)

    # Sort the merged results by their aggregated rank.
    newRankList.sort(key=operator.attrgetter('rank'))

    with open(filename, "w") as outfile:
        count = 0
        for result in newRankList:
            count = count + 1
            outfile.write(str(count) + '    ' + result.title + '    ' + result.description + '\n')
    return
import urllib
import urllib2
from bs4 import BeautifulSoup


def google_scrape(query):
    googleResults = []
    address = "http://www.google.com/search?q=" + urllib.quote_plus(query) + "&num=" + str(numresults + 2) + "&hl=en&start=0"
    request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
    urlfile = urllib2.urlopen(request)
    page = urlfile.read()
    soup = BeautifulSoup(page, 'html5lib')  # explicit parser

    titles = []
    descriptions = []
    urls = []

    headers = soup.findAll('div', 'rc')
    for header in headers:
        t = header.a.text.encode('utf-8')  # .text handles nested markup where .string is None
        u = header.a.get('href')
        urls.append(u)
        titles.append(t)

    desclist = soup.findAll('span', 'st')
    for desc in desclist:
        d = desc.text.encode('utf-8')
        descriptions.append(d)

    # Guard against Google returning fewer descriptions than titles.
    size = min(len(titles), len(descriptions))
    filename = query + '_Google.txt'
    with open(filename, "w") as outfile:
        for i in range(0, size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(str(i + 1) + '    ' + titles[i] + ' ' + descriptions[i] + '\n')
            googleResults.append(result)
    return googleResults
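Taken together, the examples form a small metasearch pipeline: scrape each engine, merge the URL lists, then write a re-ranked file. A hedged sketch of how they might be wired up; the ul rank list is produced by aggregation code not shown in this listing, so a naive 1..N ranking stands in for it:

if __name__ == '__main__':
    query = 'python'
    googleResults = google_scrape(query)
    yahooResults = yahoo_scrape(query)
    uniqueList = aggregate(googleResults, yahooResults)

    googleURLs = [r.url for r in googleResults]
    yahooURLs = [r.url for r in yahooResults]
    ul = range(1, len(uniqueList) + 1)  # placeholder ranks; the real ranking code is not shown

    printResults([], uniqueList, googleResults, yahooResults,
                 googleURLs, yahooURLs, ul, 'Aggregated.txt')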