Пример #1
0
def getmodel():
    """Store every model listed on the page of each car in ``db.temp``.

    For each document in ``db.temp`` the page at ``baselink + url`` is
    fetched through ``crawler.readlink``.  Every ``<td class="list_version">``
    cell on that page yields one record, combining the car's identity with
    the model's link text and href, which is passed to
    ``mongo.insert_model``.
    """
    for car in db.temp.find():
        # Read the identity fields up front so a malformed document fails
        # before any network request is made.
        url = car['url']
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']

        model_page = crawler.readlink(baselink + url)

        for cell in model_page.findAll('td', {'class': 'list_version'}):
            # The cell's <a> carries both the model label and its link.
            anchor = cell.a
            mongo.insert_model({
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': anchor.string,
                'model_url': anchor['href'],
            })
Пример #2
0
def getmodel():
    """Collect the models of every car in ``db.temp``.

    Each car's page (``baselink`` plus the stored ``url``) is read with
    ``crawler.readlink``; one record per ``<td class="list_version">`` cell
    is then handed to ``mongo.insert_model``.  A record repeats the car's
    ``brand``, ``sub_brand``, ``name`` and ``url`` and adds the model's link
    text (``model``) and link target (``model_url``).
    """
    for car in db.temp.find():
        url, brand, sub_brand, name = (
            car['url'], car['brand'], car['sub_brand'], car['name'])

        page = crawler.readlink(baselink + url)
        version_cells = page.findAll('td', {'class': 'list_version'})

        for version_cell in version_cells:
            model_link = version_cell.a
            record = {
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': model_link.string,
                'model_url': model_link['href'],
            }
            mongo.insert_model(record)
Пример #3
0
def getAllPages():
    """Start the page walk for every entry in ``categories``.

    The first listing page of a category is
    ``music_server + user_name + '/' + category``; it is handed to
    ``getNextPage``.
    """
    for category in categories:
        starting_page = music_server + user_name + '/' + category
        # getNextPage downloads the page it is given, so fetching it here as
        # well (into a variable nobody read) only doubled the request.
        # NOTE(review): nothing in this function adds starting_page itself to
        # page_cache -- confirm the first page is not meant to be cached too.
        getNextPage(starting_page)
Пример #4
0
def getNextPage(link):
    """Follow the "next page" links starting from *link*.

    Each visited page is fetched with ``crawler.readlink`` and its URL is
    printed.  The href of the ``<a>`` inside ``<span class="next">`` is
    appended to ``page_cache`` and becomes the next page to visit.  The walk
    ends, printing a notice, at the first page without such a link.

    Args:
        link: URL of the page to start from.  It is fetched and printed but
            not itself appended to ``page_cache``.
    """
    # Iterate instead of recursing: one stack frame per page would hit the
    # interpreter's recursion limit on long listings.
    while True:
        content = crawler.readlink(link)
        print(link)

        # The last page may have no "next" span at all, or a span without a
        # link inside it; both mean there is nowhere further to go.
        next_span = content.find('span', attrs={'class': 'next'})
        anchor = next_span.find('a') if next_span is not None else None
        if anchor is None:
            print('Reach to the last page')
            return

        link = anchor['href']
        page_cache.append(link)
Пример #5
0
def getMusic():
    """Store one record per item found on every page in ``page_cache``.

    Each page is fetched with ``crawler.readlink``.  Inside its
    ``<div class="grid-view">`` every ``<div class="item">`` contributes a
    record built from its ``pic`` block (link, title, cover image) and its
    ``info`` block (intro text, date, optional tags), which is passed to
    ``mongo.insert_cd``.
    """
    for pages_link in page_cache:
        page = crawler.readlink(pages_link)
        grid = page.find('div', attrs={'class': 'grid-view'})

        for item in grid.findAll('div', attrs={'class': 'item'}):
            # "pic" block: link, title and cover image.
            pic = item.find('div', attrs={'class': 'pic'})
            anchor = pic.find('a')
            cd_link = anchor['href']
            cd_title = anchor['title']
            cd_cover = pic.find('img')['src']

            # "info" block: intro line, date added and the optional tags.
            info = item.find('div', attrs={'class': 'info'})
            cd_intro = info.find('li', attrs={'class': 'intro'}).text
            cd_addtime = info.find('span', attrs={'class': 'date'}).text

            tags_span = info.find('span', attrs={'class': 'tags'})
            try:
                # The first three characters of the tag text are dropped.
                cd_mannual_tags = tags_span.text[3:]
            except AttributeError:
                # No tags span on this item.
                cd_mannual_tags = 'No Comment'

            mongo.insert_cd({
                'title': cd_title,
                'link': cd_link,
                'cover': cd_cover,
                'intro': cd_intro,
                'add_date': cd_addtime,
                'tag_by_user': cd_mannual_tags
            })
Пример #6
0
def main():
    """Print and store every car link found on the main page.

    The page at ``mainpage`` is read with ``crawler.readlink``.  Each
    ``<div class="column_content">`` supplies a brand name (text of its
    ``<p>``) and a set of ``<a title=...>`` links; for every link a
    ``brand|name|url`` line is printed and a record is passed to
    ``mongo.insert_url``.
    """
    page = crawler.readlink(mainpage)
    columns = page.findAll('div', attrs={'class': 'column_content'})

    for column in columns:
        brand_label = column.find('p').text

        # Only anchors that carry a title attribute are car links.
        for anchor in column.findAll('a', title=True):
            car_name = anchor.text
            href = anchor['href']
            print(brand_label + '|' + car_name + '|' + href)

            mongo.insert_url({
                'Brand': brand_label,
                'Name': car_name,
                'url': href
            })
Пример #7
0
def main():
    """Walk the main page and record each car's brand, name and URL.

    Every ``<div class="column_content">`` on the page returned by
    ``crawler.readlink(mainpage)`` is treated as one brand: its ``<p>`` text
    is the brand name and each ``<a>`` with a ``title`` attribute is a car.
    Each car is echoed as ``brand|name|url`` and passed to
    ``mongo.insert_url``.
    """
    soup = crawler.readlink(mainpage)

    for block in soup.findAll('div', attrs={'class': 'column_content'}):
        brand_text = block.find('p').text

        for link in block.findAll('a', title=True):
            record = {
                'Brand': brand_text,
                'Name': link.text,
                'url': link['href']
            }
            print(record['Brand'] + '|' + record['Name'] + '|' + record['url'])
            mongo.insert_url(record)
Пример #8
0
def getmodel():
    """Fetch and store models only for cars that have none in ``db.model``.

    The urls found in ``db.temp`` are compared with the urls already present
    in ``db.model``.  For every url that is still missing, the car's page
    (``baselink + url``) is read with ``crawler.readlink`` and one record
    per ``<td class="list_version">`` cell is passed to
    ``mongo.insert_model``.  Two progress lines are printed: the number of
    urls in ``db.temp`` and the number left to fetch.
    """
    cars = db.temp
    check = db.model

    all_urls = [car['url'] for car in cars.find()]
    print('%d, %s' % (len(all_urls), 'in the brand list'))

    # Filter through a set rather than list.remove(): remove() raises
    # ValueError when a url occurs in db.model more often than in db.temp
    # (this function stores one record per model, all sharing the car's url;
    # presumably insert_model writes them to db.model) or is not in db.temp
    # at all.
    fetched_urls = set(done['url'] for done in check.find())
    pending_urls = [url for url in all_urls if url not in fetched_urls]
    print('%d, %s' % (len(pending_urls), 'left to fetch'))

    for url in pending_urls:
        car = cars.find_one({'url': url})
        model_page = crawler.readlink(baselink + url)

        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']

        for cell in model_page.findAll('td', {'class': 'list_version'}):
            anchor = cell.a
            mongo.insert_model({
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': anchor.string,
                'model_url': anchor['href'],
            })
Пример #9
0
def getmodel():
    """Resume the model crawl for cars not yet present in ``db.model``.

    Urls are collected from ``db.temp``; those that already appear in
    ``db.model`` are skipped.  For each remaining url the page at
    ``baselink + url`` is read with ``crawler.readlink`` and every
    ``<td class="list_version">`` cell becomes a record passed to
    ``mongo.insert_model``.  The size of the full list and of the remaining
    list are printed before fetching starts.
    """
    cars = db.temp
    check = db.model

    brand_urls = [car['url'] for car in cars.find()]
    print('%d, %s' % (len(brand_urls), 'in the brand list'))

    # A set lookup replaces the former templist.remove(...) calls, which
    # raised ValueError as soon as db.model held the same url twice (one
    # record is stored per model, each with the car's url) or held a url
    # that db.temp does not contain.
    already_fetched = set(stored['url'] for stored in check.find())
    remaining = [url for url in brand_urls if url not in already_fetched]
    print('%d, %s' % (len(remaining), 'left to fetch'))

    for url in remaining:
        car = cars.find_one({'url': url})
        model_page = crawler.readlink(baselink + url)

        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']

        for version_cell in model_page.findAll('td', {'class': 'list_version'}):
            model_link = version_cell.a
            car_model = {
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': model_link.string,
                'model_url': model_link['href'],
            }
            mongo.insert_model(car_model)
Пример #10
0
def getbrand():
    """Scrape the brand / sub-brand / car hierarchy from the main page.

    The page read from ``mainpage`` is searched for
    ``<div class="container">`` blocks (one per brand initial, according to
    the original author's notes).  Each ``<tr>`` inside a container is one
    brand: its first ``<td>`` holds the brand name and its second ``<td>``
    holds ``<div class="column_content">`` blocks, each made of a ``<p>``
    (sub-brand name) and ``<li>`` entries (the cars).  Every car is passed
    to ``mongo.insert_brand``.
    """
    page = crawler.readlink(mainpage)

    for container in page.findAll('div', attrs={'class': 'container'}):
        for row in container.findAll('tr'):
            # First cell: brand name.  Second cell: the sub-brand columns.
            brand_name = row.findAll('td')[0].text
            columns = row.findAll('td')[1].findAll(
                'div', attrs={'class': 'column_content'})

            for column in columns:
                sub_brand_name = column.p.text

                for entry in column.findAll('li'):
                    mongo.insert_brand({
                        'brand': brand_name,
                        'sub_brand': sub_brand_name,
                        'name': entry.text,
                        'url': entry.a['href']
                    })
Пример #11
0
def getOfficialDetail():
    """Add the score and vote count to records that lack them.

    Iterates over the records returned by ``mongo.get_cd_urls()``.  A record
    that already has a ``douban_score`` key is skipped.  Otherwise its
    ``link`` is fetched with ``crawler.readlink``, the score and the number
    of votes are read from the page, and both are written back with
    ``mongo.update(new_entities, cd_id)``.
    """
    for cd in mongo.get_cd_urls():
        cd_id = cd['_id']
        link = cd['link']
        print(link)

        # Check before downloading: records that already carry a score do
        # not need their page fetched at all.
        if 'douban_score' in cd:
            print('already in the database')
            continue

        content = crawler.readlink(link)

        try:
            douban_score = content.find('strong',
                                        attrs={
                                            'class': 'll rating_num'
                                        }).text
        except AttributeError:
            # Element missing on the page.
            # NOTE(review): 'unkonwn' is misspelled and differs from the
            # 'unknown' placeholder below; left as is because existing
            # records may already contain this exact value.
            douban_score = 'unkonwn'

        try:
            people_voted = content.find('span',
                                        attrs={
                                            'property': 'v:votes'
                                        }).text
        except AttributeError:
            people_voted = 'unknown'

        mongo.update(
            {
                'douban_score': douban_score,
                'people_voted': people_voted
            },
            cd_id)
Пример #12
0
def getbrand():
    """Extract every car with its brand and sub-brand from the main page.

    Layout relied on: ``<div class="container">`` blocks contain ``<tr>``
    rows, one per brand.  A row's first ``<td>`` gives the brand name; its
    second ``<td>`` contains ``<div class="column_content">`` blocks whose
    ``<p>`` names the sub-brand and whose ``<li>`` items are the individual
    cars (text = name, inner ``<a href>`` = url).  Each car is passed to
    ``mongo.insert_brand``.
    """
    soup = crawler.readlink(mainpage)
    containers = soup.findAll('div', attrs={'class': 'container'})

    for container in containers:
        # One <tr> per brand.
        for brand_row in container.findAll('tr'):
            brand_name = brand_row.findAll('td')[0].text
            sub_brand_cell = brand_row.findAll('td')[1]

            for sub_brand_block in sub_brand_cell.findAll(
                    'div', attrs={'class': 'column_content'}):
                sub_brand_name = sub_brand_block.p.text

                for item in sub_brand_block.findAll('li'):
                    car = {
                        'brand': brand_name,
                        'sub_brand': sub_brand_name,
                        'name': item.text,
                        'url': item.a['href']
                    }
                    mongo.insert_brand(car)
Пример #13
0
# Base URL; the relative model URLs stored in the database are appended to it.
baselink = "http://newcar.xcar.com.cn"

# Cursor over every document of the cars_model collection.
cars = db.cars_model.find()

# For each stored model, fetch its spec page and read the individual figures.

for car in cars:
    brand = car['Brand']
    name = car['Name']
    model = car['Model']
    url = car['Model_url']
    car_id = car['_id']

    specpage = crawler.readlink(baselink + url)

    # The presence of a displacement label (排量 = engine displacement, in L)
    # decides whether the page has a full spec table.
    if len(specpage.findAll('em', text=u'排量(L):')) != 0:
        # Price is the text of the first <b> on the page, scaled by 10.
        # NOTE(review): the unit of the scraped figure is not visible here.
        price = float(specpage.b.text) * 10
        # Each value sits in the <td> that follows its <em> label:
        # 最大功率 = max power, 进气形式 = intake type,
        # 最大扭矩 = max torque, 驱动方式 = drive type.
        horsepower = specpage.find('em',
                                   text=u'最大功率(kW/rpm):').findNext('td').text
        liter = specpage.find('em', text=u'排量(L):').findNext('td').text
        engine_type = specpage.find('em', text=u'进气形式:').findNext('td').text
        tourque = specpage.find('em',
                                text=u'最大扭矩(Nm/rpm):').findNext('td').text
        drive = specpage.find('em', text=u'驱动方式:').findNext('td').text
    else:
        # No spec table: take the price from <div class="price"> instead.
        price = float(specpage.find('div', attrs={
            'class': 'price'
        }).b.text) * 10
        # NOTE(review): only horsepower is reset in the visible part of this
        # branch; liter, engine_type, tourque and drive are not assigned
        # here, and the excerpt appears to end at this point.
        horsepower = 0
Пример #14
0
def getspec():
    """Fill in the technical specs of every model in ``db.model``.

    Models that already have a ``price`` are reported and skipped.  For the
    others the page at ``baselink + model_url`` is read with
    ``crawler.readlink``.  When the page shows a displacement label
    (``排量(L):``) the spec values are taken from the ``<td>`` following each
    ``<em>`` label; otherwise only the price is read (from
    ``<div class="price">``) and the remaining fields are stored as ``0``.
    The document is then updated in place and a summary line is printed.
    """
    check = db.model

    def spec_value(page, label):
        # A value sits in the <td> that follows its <em> label.
        return page.find('em', text=label).findNext('td').text

    for car in check.find():
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        url = car['url']
        model = car['model']
        model_url = car['model_url']
        _id = car['_id']

        # The previous guard, type(car.get('price')) != dict, was always
        # true because price is stored as a float, so every model was
        # fetched again on each run and the branch below was unreachable.
        if car.get('price') is not None:
            print('Spec already there')
            continue

        specpage = crawler.readlink(baselink + model_url)

        if specpage.findAll('em', text=u'排量(L):'):
            price = float(specpage.b.text) * 10
            horsepower = spec_value(specpage, u'最大功率(kW/rpm):')
            liter = spec_value(specpage, u'排量(L):')
            engine_type = spec_value(specpage, u'进气形式:')
            tourque = spec_value(specpage, u'最大扭矩(Nm/rpm):')
            drive = spec_value(specpage, u'驱动方式:')
        else:
            # No spec table on the page: only the price is available.
            price = float(
                specpage.find('div', attrs={'class': 'price'}).b.text) * 10
            horsepower = liter = engine_type = tourque = drive = 0

        fields = {
            'url': url,
            'brand': brand,
            'sub_brand': sub_brand,
            'name': name,
            'model': model,
            'model_url': model_url,
            'price': price,
            'engine_size': liter,
            'engine_type': engine_type,
            'horsepower': horsepower,
            'tourque': tourque,
            'drive': drive
        }

        # NOTE(review): Collection.update is the legacy PyMongo call; kept
        # because its return value is printed below.
        s = check.update({'_id': _id}, {'$set': fields})

        print('%s %s %s %s %s %s' % (s, brand, sub_brand, name, model, price))
Пример #15
0
def getspec():
    """Add price and engine specs to the models stored in ``db.model``.

    A model that already has a ``price`` is reported and skipped.  For any
    other model the page at ``baselink + model_url`` is read with
    ``crawler.readlink``.  If the page has a displacement label
    (``排量(L):``), each spec is taken from the ``<td>`` after its ``<em>``
    label; if not, only the price is read from ``<div class="price">`` and
    the other specs are stored as ``0``.  Only the spec fields are written
    back (``$set``), and a summary line is printed.
    """
    check = db.model

    def read_spec(page, label):
        # The value is the text of the <td> following the <em> label.
        return page.find('em', text=label).findNext('td').text

    for car in check.find():
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        url = car['url']
        model = car['model']
        model_url = car['model_url']
        _id = car['_id']

        # price is written as a float, so the former test
        # type(car.get('price')) != dict could never be false: nothing was
        # ever skipped.  Test for the presence of a price instead.
        if car.get('price') is not None:
            print('Spec already there')
            continue

        specpage = crawler.readlink(baselink + model_url)

        if specpage.findAll('em', text=u'排量(L):'):
            price = float(specpage.b.text) * 10
            horsepower = read_spec(specpage, u'最大功率(kW/rpm):')
            liter = read_spec(specpage, u'排量(L):')
            engine_type = read_spec(specpage, u'进气形式:')
            tourque = read_spec(specpage, u'最大扭矩(Nm/rpm):')
            drive = read_spec(specpage, u'驱动方式:')
        else:
            # Page without a spec table: keep the price, zero the rest.
            price = float(
                specpage.find('div', attrs={'class': 'price'}).b.text) * 10
            horsepower = liter = engine_type = tourque = drive = 0

        # NOTE(review): Collection.update is the legacy PyMongo call; kept
        # because its return value is part of the printed line.
        s = check.update(
            {'_id': _id},
            {'$set': {
                'price': price,
                'engine_size': liter,
                'engine_type': engine_type,
                'horsepower': horsepower,
                'tourque': tourque,
                'drive': drive
            }}
        )

        print('%s %s %s %s %s %s %s' % (
            _id, brand, sub_brand, name, model, price, s))
Пример #16
0
# select DB collection
cars = db.cars_url.find()

check = db.cars_model

# loop in collection
for car in cars:
    brand = car['Brand']
    name = car['Name']
    url = car['url']
    car_id = car['_id']
    
    check_length = check.find_one({'url':url})
    if type(check_length) != dict:
        allspecpage = crawler.readlink(baselink + url)
    
        for each in allspecpage.findAll('td',{'class':'list_version'}):
            model = each.a.string
            model_url = each.a['href']

            car_model = {
                'Brand': brand,
                'Name': name,
                'url': url,
                'Model': model,
                'Model_url': model_url
            }
            print brand + '|' + name + '|' + model
            mongo.insert_model(car_model)
    else: