Python getPage示例，utils.getPage Python示例

示例#1

0

显示文件

文件： msdn_blog_crawler.py 项目： fanwangg/python-crawler

def main():
    """Walk the blog listing pages from START_PAGE and print every entry title.

    For each listing page the text of every ``<h2 class="entry-title">``
    element is printed, followed by the value returned by
    ``getNextPageLink`` for that page.  Crawling stops as soon as that
    value is ``None``.
    """
    url = START_PAGE
    while url is not None:
        soup = utils.getPage(url)
        for title in soup.findAll('h2', {"class": "entry-title"}):
            print(title.text)
            # Article fetching is currently disabled:
            # getArticle(title['href'])

        # Resolve the next-page link once and reuse it, instead of
        # computing it a second time just to print it.
        url = getNextPageLink(soup)
        print(url)

    print("Crawling finished!")

示例#2

0

显示文件

文件： networkFoodCrawler.py 项目： katz12/justinthyme

def netCrawler(recipe_id, n):
    """Crawl one Food Network search-result page and store its recipes.

    Requests the search results with the ``No=`` query offset set to
    ``12 * n``, follows the first link of every ``.result-item.recipe``
    element, scrapes each linked page and stores it through
    ``utils.insert``: one ``recipe`` row per page and, unless that insert
    returned ``-1``, one ``recipe_ingredient`` row per ingredient line.

    Args:
        recipe_id: id given to the first recipe handled by this call.
        n: index of the search-result page to crawl.

    Returns:
        The next unused recipe id.  The id advances by one for every page
        that is stored or reported as a duplicate; pages without a recipe
        name are skipped and do not consume an id.
    """
    offset = 12 * n
    urln = ('http://www.foodnetwork.com/search/delegate.do'
            '?Ntk=site_search&Nr=Record%20Type:Result&N=501&No=' + str(offset))
    time.sleep(1)  # pause between requests
    S = utils.getPage(urln)

    urlist = ['http://www.foodnetwork.com' + S(u).find('a').attr('href')
              for u in S('.result-item.recipe')]
    print('net list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # pause between requests
        p = utils.getPage(nextUrl)

        RecpName = [p(this).text() for this in p('.fn_name')]
        print(RecpName)
        if not RecpName:
            # Without a '.fn_name' element there is nothing to key the
            # recipe on; skip the page instead of failing on RecpName[0].
            print('no recipe name found, skipping ' + nextUrl)
            continue

        name = RecpName[0]
        if utils.isDuplicate(name):
            recipe_id += 1
            print('duplicate')
            continue

        # Stays '' when the page has no image, which happens often.
        ImgURL = ''
        for this in p('img#recipe-player-th'):
            ImgURL = p(this).attr('src')

        c = p('dd.clrfix')
        unparsedCtime = ''
        unparsedPtime = ''
        for this in c('meta'):
            itemprop = p(this).attr('itemprop')
            if itemprop == 'cookTime':
                unparsedCtime = p(this).attr('content')
            elif itemprop == 'prepTime':
                unparsedPtime = p(this).attr('content')

        # The difficulty is read from the last 'dd.clrfix' element; fall
        # back to '' when the page has none instead of failing on c[-1].
        if len(c) > 0:
            difficulty = p(c[-1]).text()
        else:
            difficulty = ''

        Ctime = (utils.FN_minutesParser(unparsedCtime)
                 + 60 * utils.FN_hourParser(unparsedCtime))
        Ptime = (utils.FN_minutesParser(unparsedPtime)
                 + 60 * utils.FN_hourParser(unparsedPtime))

        servings = [p(this).text() for this in c('span')]
        if not servings:
            servings = ['']

        # Each <li> holds the ingredient together with its amount (for
        # example "1 large tomato, diced in 1/2-inch pieces"); the two are
        # not separated here, so the whole line is stored as the name.
        IngrName = [p(this).text()
                    for this in p('.kv-ingred-list1')('li')]

        directions = []
        for this in p('.fn_instructions')('p'):
            step = p(this).text()
            if step not in ('', '\n'):
                directions.append(step)

        decript = utils.descriptionize(directions)
        RecipeDict = {
            'id': recipe_id,
            'name': name,
            'description': decript,
            'servings': servings[0],
            'url': nextUrl,
            'img_url': ImgURL,
            'difficulty': difficulty,
            'cook_time': Ctime,
            'prep_time': Ptime,
        }
        k = utils.insert('recipe', **RecipeDict)

        # NOTE(review): -1 is presumably utils.insert's failure value --
        # confirm against utils.
        if k != -1:
            for ingredient in IngrName:
                # No amount is parsed out of the ingredient line, so the
                # quantity is always stored as an empty string.
                utils.insert('recipe_ingredient',
                             recipe_id=recipe_id,
                             ingredient_name=ingredient,
                             quantity='')
        recipe_id += 1

    return recipe_id

示例#3

0

显示文件

文件： article.py 项目： fanwangg/python-crawler

def getArticle(url):
    """Fetch the page at ``url`` and pass the parsed result to ``getTags``.

    Always returns ``None``; whatever ``getTags`` returns is discarded.
    """
    page = utils.getPage(url)
    getTags(page)

示例#4

0

显示文件

文件： allrecipiesCrawler.py 项目： katz12/justinthyme

def allCrawler(recipe_id, n):
    """Crawl one allrecipes.com listing page and store its recipes.

    Requests listing page ``17 * n``, follows the first link of every
    ``.rectitlediv`` element, scrapes each linked page and stores it
    through ``utils.insert``: one ``recipe`` row per page and, unless that
    insert returned ``-1``, one ``recipe_ingredient`` row per ingredient.

    Args:
        recipe_id: id given to the first recipe handled by this call.
        n: listing index; the page actually requested is ``17 * n``.

    Returns:
        The next unused recipe id.  The id advances by one for every
        linked page, whether it was stored or reported as a duplicate.
    """

    def texts(page, selector):
        """Return the text of every element of ``page`` matching ``selector``."""
        return [page(el).text() for el in page(selector)]

    # Only every 17th listing page is fetched, to get a wide spread of results.
    page_no = n * 17
    urln = 'http://allrecipes.com/recipes/ViewAll.aspx?Page=' + str(page_no)
    time.sleep(1)  # pause between requests
    S = utils.getPage(urln)

    urlist = [S(u).find('a').attr('href') for u in S('.rectitlediv')]
    print('all list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # pause between requests
        p = utils.getPage(nextUrl)

        RecpName = texts(p, '#itemTitle')
        print(RecpName)

        # Apply the 'null' fallback before the duplicate check, so a page
        # without a title no longer fails on RecpName[0].
        if RecpName:
            Rname = RecpName[0]
        else:
            Rname = 'null'

        if utils.isDuplicate(Rname):
            print('duplicate')
            recipe_id += 1
            continue

        # Keep only non-empty elements carrying the 'ingredient-name' class.
        IngrName = []
        for this in p('#lblIngName'):
            element = p(this)
            label = element.text()
            if label != '' and element.attr('class') == 'ingredient-name':
                IngrName.append(label)

        IngrAmnt = texts(p, '#lblIngAmount')
        Description = texts(p, 'span.plaincharacterwrap.break')
        ImageURL = [p(this).attr('src') for this in p('#imgPhoto')]

        Cookpmin = texts(p, '#prepMinsSpan')
        Cooktmin = texts(p, '#totalMinsSpan')
        Cookphour = texts(p, '#prepHoursSpan')
        Cookthour = texts(p, '#totalHoursSpan')
        Servings = texts(p, '#lblYield')

        # Preparation time (hours, minutes).
        HpTime = utils.makeDecimalTime(Cookphour)
        MpTime = utils.makeDecimalTime(Cookpmin)

        # NOTE(review): cook_time is derived from the *total* time spans;
        # the '#cookMinsSpan' / '#cookHoursSpan' elements are not read.
        # Confirm which of the two is intended.
        HTime = utils.makeDecimalTime(Cookthour)
        MTime = utils.makeDecimalTime(Cooktmin)

        # Convert everything to minutes.
        Ctime = HTime * 60 + MTime
        Ptime = HpTime * 60 + MpTime

        # Fallbacks for pages that lack an image or a yield label.
        if ImageURL:
            ImgURL = ImageURL[0]
        else:
            ImgURL = 'null'
        if not Servings:
            Servings = ['null']

        decript = utils.descriptionize(Description)

        RecipeDict = {
            'id': recipe_id,
            'name': Rname,
            'description': decript,
            'servings': Servings[0],
            'url': nextUrl,
            'img_url': ImgURL,
            'cook_time': Ctime,
            'prep_time': Ptime,
        }
        k = utils.insert('recipe', **RecipeDict)

        # NOTE(review): -1 is presumably utils.insert's failure value --
        # confirm against utils.
        if k != -1:
            # Ingredients without a matching amount (for example "season
            # to preference") get an empty quantity.
            missing = len(IngrName) - len(IngrAmnt)
            amounts = IngrAmnt + [''] * missing
            for ingredient, quantity in zip(IngrName, amounts):
                utils.insert('recipe_ingredient',
                             recipe_id=recipe_id,
                             ingredient_name=ingredient,
                             quantity=quantity)

        recipe_id += 1

    return recipe_id

示例#5

0

显示文件

文件： fooddotcomCrawler.py 项目： katz12/justinthyme

import time
import utils

# recipe_id = utils.latest_recipe_id
# Candidate values for the 'letter' query parameter; only the subset in
# ``a`` is crawled.
abc = ['123'] + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
a = ['A']
for n in a:

    urln = 'http://www.food.com/browse/allrecipes/index.zsp?letter=' + str(n)
    time.sleep(1)  # pause between requests
    S = utils.getPage(urln)
    templst = [S(u) for u in S('.list')]
    # Only templst[0] and templst[1] have the results in them.
    urlist = [u.find('a').attr('href') for u in templst]
    print('list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # pause between requests
        p = utils.getPage(nextUrl)

        # Keep the text of every '#rz-lead' element whose class is 'fn'.
        RecpName = [p(this).text()
                    for this in p('#rz-lead')
                    if p(this).attr('class') == 'fn']
        print(RecpName)