def getmodel():
    """Store one document per model listed on each car's listing page.

    Every record in ``db.temp`` supplies ``url``, ``brand``, ``sub_brand``
    and ``name``. The page at ``baselink + url`` is downloaded through
    ``crawler.readlink`` and each ``<td class="list_version">`` cell on it
    becomes a document passed to ``mongo.insert_model``.
    """
    for car in db.temp.find():
        url = car['url']
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        model_page = crawler.readlink(baselink + url)
        for each_model in model_page.findAll('td', {'class': 'list_version'}):
            anchor = each_model.a
            if anchor is None:
                # A cell without a link has neither a model name nor a model
                # URL to record; skip it instead of aborting the whole crawl
                # with an AttributeError.
                continue
            car_model = {
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': anchor.string,
                'model_url': anchor['href'],
            }
            mongo.insert_model(car_model)
def getmodel():
    """Crawl the listing page of every car in ``db.temp`` and save its models.

    For each record the page at ``baselink + url`` is fetched with
    ``crawler.readlink``; every ``<td class="list_version">`` cell found on
    it yields one document (brand, sub_brand, name, url, model, model_url)
    that is handed to ``mongo.insert_model``.
    """
    counter = 0
    for record in db.temp.find():
        listing_url = record['url']
        brand_name = record['brand']
        sub_brand_name = record['sub_brand']
        car_name = record['name']
        counter = counter + 1
        listing_page = crawler.readlink(baselink + listing_url)
        version_cells = listing_page.findAll('td', {'class': 'list_version'})
        for cell in version_cells:
            # The cell's first <a> holds both the model label and its link.
            link = cell.a
            model_name = link.string
            model_href = link['href']
            mongo.insert_model(dict(
                brand=brand_name,
                sub_brand=sub_brand_name,
                name=car_name,
                url=listing_url,
                model=model_name,
                model_url=model_href,
            ))
def getAllPages():
    """Start the pagination walk for every category of the configured user.

    For each entry in ``categories`` the first listing URL is built as
    ``music_server + user_name + '/' + category`` and handed to
    ``getNextPage``, which follows the "next" links from there.
    """
    for category in categories:
        starting_page = music_server + user_name + '/' + category
        # getNextPage downloads the link it is given, so fetching
        # starting_page here as well only issued a duplicate request whose
        # result was never used.
        # NOTE(review): getNextPage appends only the pages *after* this one
        # to page_cache; confirm whether the first page of each category is
        # meant to be registered somewhere else.
        getNextPage(starting_page)
def getNextPage(link):
    """Follow the "next page" links starting at ``link``.

    Each visited URL is printed. The ``href`` of the ``<a>`` inside
    ``<span class="next">`` is appended to ``page_cache`` and visited in
    turn, until a page offers no such link.

    Args:
        link: URL of the first page to visit.
    """
    # Iterate instead of recursing once per page so a long listing cannot
    # exhaust the interpreter's recursion limit.
    while True:
        content = crawler.readlink(link)
        print(link)
        try:
            next_page = content.find('span', attrs={
                'class': 'next'
            }).find('a')['href']
        except (AttributeError, TypeError):
            # TypeError: the span exists but holds no <a> (find returned None
            # and was subscripted). AttributeError: there is no
            # <span class="next"> at all, e.g. a listing with a single page.
            print('Reach to the last page')
            return
        page_cache.append(next_page)
        link = next_page
def getMusic():
    """Extract every CD entry from the pages in ``page_cache`` and store it.

    Each page is fetched with ``crawler.readlink``. Inside its
    ``<div class="grid-view">`` every ``<div class="item">`` is read for the
    link, title and cover (from the ``pic`` div) and for the intro, add date
    and user tags (from the ``info`` div); the resulting document is passed
    to ``mongo.insert_cd``.
    """
    for page_url in page_cache:
        page = crawler.readlink(page_url)
        grid = page.find('div', attrs={'class': 'grid-view'})
        for entry in grid.findAll('div', attrs={'class': 'item'}):
            # Picture column: link, title and cover image.
            pic = entry.find('div', attrs={'class': 'pic'})
            link = pic.find('a')['href']
            title = pic.find('a')['title']
            cover = pic.find('img')['src']

            # Info column: intro line, date added and optional user tags.
            info = entry.find('div', attrs={'class': 'info'})
            intro = info.find('li', attrs={'class': 'intro'}).text
            added_on = info.find('span', attrs={'class': 'date'}).text
            tag_span = info.find('span', attrs={'class': 'tags'})
            try:
                # The first three characters of the tag text are dropped.
                user_tags = tag_span.text[3:]
            except AttributeError:
                # No tags span on this item.
                user_tags = 'No Comment'

            mongo.insert_cd({
                'title': title,
                'link': link,
                'cover': cover,
                'intro': intro,
                'add_date': added_on,
                'tag_by_user': user_tags,
            })
def main():
    """Collect the brand, name and URL of every car linked on the main page.

    The page at ``mainpage`` is fetched with ``crawler.readlink``. Within
    each ``<div class="column_content">`` the text of the first ``<p>`` is
    the brand, and every ``<a>`` carrying a ``title`` attribute is one car.
    Each car is printed as ``brand|name|url`` and stored through
    ``mongo.insert_url``.
    """
    front_page = crawler.readlink(mainpage)
    columns = front_page.findAll('div', attrs={'class': 'column_content'})
    for column in columns:
        brand_label = column.find('p').text
        for anchor in column.findAll('a', title=True):
            car_name = anchor.text
            href = anchor['href']
            print('|'.join((brand_label, car_name, href)))
            mongo.insert_url({
                'Brand': brand_label,
                'Name': car_name,
                'url': href,
            })
def main():
    """Scrape the main page and save one URL record per listed car.

    Every ``<div class="column_content">`` on the page returned by
    ``crawler.readlink(mainpage)`` contributes a brand (text of its first
    ``<p>``) and a set of cars (its ``<a>`` tags that have a ``title``).
    Each car is echoed as ``brand|name|url`` and passed to
    ``mongo.insert_url``.
    """
    for section in crawler.readlink(mainpage).findAll(
            'div', attrs={'class': 'column_content'}):
        brand_text = section.find('p').text
        car_links = section.findAll('a', title=True)
        for car_link in car_links:
            label = car_link.text
            target = car_link['href']
            print(brand_text + '|' + label + '|' + target)
            entry = dict(Brand=brand_text, Name=label, url=target)
            mongo.insert_url(entry)
def getmodel():
    """Fetch and store models for the cars that have no model record yet.

    The URLs in ``db.temp`` are compared with the URLs already present in
    ``db.model``; only the ones still missing are crawled. For each of
    those, the page at ``baselink + url`` is downloaded and every
    ``<td class="list_version">`` cell becomes a document passed to
    ``mongo.insert_model``.
    """
    cars = db.temp
    check = db.model

    templist = [car['url'] for car in cars.find()]
    # Format inside the call: ``print(...) % (...)`` only works as a Python 2
    # print statement and fails when print is a function.
    print('%d, %s' % (len(templist), 'in the brand list'))

    # This function stores one document per model under the same car URL
    # (presumably into db.model via mongo.insert_model), so db.model can hold
    # many documents for one URL. Removing from the list once per document
    # raised ValueError on the second document of a car; filtering against a
    # set of the stored URLs does not.
    fetched = set(doc['url'] for doc in check.find())
    templist = [url for url in templist if url not in fetched]
    print('%d, %s' % (len(templist), 'left to fetch'))

    for url in templist:
        car = cars.find_one({'url': url})
        model_page = crawler.readlink(baselink + url)
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        for each_model in model_page.findAll('td', {'class': 'list_version'}):
            car_model = {
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': each_model.a.string,
                'model_url': each_model.a['href'],
            }
            mongo.insert_model(car_model)
def getmodel():
    """Crawl model lists only for cars not yet present in ``db.model``.

    URLs are read from ``db.temp``; any URL that already appears in a
    ``db.model`` document is dropped. Each remaining URL has its page
    (``baselink + url``) fetched, and one document per
    ``<td class="list_version">`` cell is passed to ``mongo.insert_model``.
    """
    cars = db.temp
    check = db.model

    templist = [car['url'] for car in cars.find()]
    # The format operator must be applied to the string, not to the result
    # of print(); the original form relied on the Python 2 print statement.
    print('%d, %s' % (len(templist), 'in the brand list'))

    # Several model documents can share one car URL (this function writes one
    # per model), so ``templist.remove(url)`` per document raised ValueError
    # as soon as a URL was seen a second time. Use set membership instead.
    already_fetched = set(doc['url'] for doc in check.find())
    templist = [url for url in templist if url not in already_fetched]
    print('%d, %s' % (len(templist), 'left to fetch'))

    for url in templist:
        car = cars.find_one({'url': url})
        model_page = crawler.readlink(baselink + url)
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        for each_model in model_page.findAll('td', {'class': 'list_version'}):
            model = each_model.a.string
            model_url = each_model.a['href']
            mongo.insert_model({
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'url': url,
                'model': model,
                'model_url': model_url,
            })
def getbrand():
    """Scrape the main page and store every car with its brand and sub-brand.

    Page layout as walked by this function:

    * ``<div class="container">`` blocks (per the original author's note,
      one per brand initial);
    * inside each, one ``<tr>`` per brand, whose first ``<td>`` holds the
      brand name and whose second ``<td>`` holds the listings;
    * inside the second cell, ``<div class="column_content">`` blocks whose
      ``<p>`` is the sub-brand name and whose ``<li>`` items are the cars.

    Each car is passed to ``mongo.insert_brand`` as a document with the keys
    ``brand``, ``sub_brand``, ``name`` and ``url``.
    """
    page = crawler.readlink(mainpage)
    containers = page.findAll('div', attrs={'class': 'container'})
    for container in containers:
        for row in container.findAll('tr'):
            cells = row.findAll('td')
            brand_name = cells[0].text
            columns = cells[1].findAll(
                'div', attrs={'class': 'column_content'})
            for column in columns:
                sub_brand_name = column.p.text
                for item in column.findAll('li'):
                    mongo.insert_brand({
                        'brand': brand_name,
                        'sub_brand': sub_brand_name,
                        'name': item.text,
                        'url': item.a['href'],
                    })
def getOfficialDetail():
    """Add the site score and vote count to every stored CD that lacks them.

    Iterates the records returned by ``mongo.get_cd_urls()``. Each record's
    link is printed. Records that already carry a ``douban_score`` key are
    reported and skipped; for the others the page is fetched and the score
    (``<strong class="ll rating_num">``) and the vote count
    (``<span property="v:votes">``) are written back through
    ``mongo.update``.
    """
    for cd in mongo.get_cd_urls():
        cd_id = cd['_id']
        link = cd['link']
        print(link)

        # Check before downloading: the page used to be fetched even for
        # records that were then skipped, wasting one request per stored CD.
        if 'douban_score' in cd:
            print('already in the database')
            continue

        content = crawler.readlink(link)

        score_tag = content.find('strong', attrs={'class': 'll rating_num'})
        # NOTE: 'unkonwn' is misspelled, but it is the value this code has
        # always stored; it is kept as-is so new records match existing ones.
        douban_score = score_tag.text if score_tag is not None else 'unkonwn'

        votes_tag = content.find('span', attrs={'property': 'v:votes'})
        people_voted = votes_tag.text if votes_tag is not None else 'unknown'

        new_entities = {
            'douban_score': douban_score,
            'people_voted': people_voted,
        }
        mongo.update(new_entities, cd_id)
def getbrand():
    """Walk the main page's brand tables and save one record per car.

    The page from ``crawler.readlink(mainpage)`` is searched for
    ``<div class="container">`` blocks (the original author notes there is
    one per brand initial). Every ``<tr>`` in a container is one brand: its
    first ``<td>`` is the brand name, its second ``<td>`` contains
    ``<div class="column_content">`` groups. In each group the ``<p>`` is
    the sub-brand and each ``<li>`` is a car whose text is the name and
    whose ``<a href>`` is the URL. Records go to ``mongo.insert_brand``.
    """
    front_page = crawler.readlink(mainpage)
    for initial_block in front_page.findAll(
            'div', attrs={'class': 'container'}):
        brand_rows = initial_block.findAll('tr')
        for brand_row in brand_rows:
            row_cells = brand_row.findAll('td')
            brand_label = row_cells[0].text
            listing_cell = row_cells[1]
            for group in listing_cell.findAll(
                    'div', attrs={'class': 'column_content'}):
                sub_brand_label = group.p.text
                for listing in group.findAll('li'):
                    car_name = listing.text
                    car_href = listing.a['href']
                    record = dict(
                        brand=brand_label,
                        sub_brand=sub_brand_label,
                        name=car_name,
                        url=car_href,
                    )
                    mongo.insert_brand(record)
# define base link baselink = "http://newcar.xcar.com.cn" # select DB collection cars = db.cars_model.find() # loop in collection and get spec for car in cars: brand = car['Brand'] name = car['Name'] model = car['Model'] url = car['Model_url'] car_id = car['_id'] specpage = crawler.readlink(baselink + url) if len(specpage.findAll('em', text=u'排量(L):')) != 0: price = float(specpage.b.text) * 10 horsepower = specpage.find('em', text=u'最大功率(kW/rpm):').findNext('td').text liter = specpage.find('em', text=u'排量(L):').findNext('td').text engine_type = specpage.find('em', text=u'进气形式:').findNext('td').text tourque = specpage.find('em', text=u'最大扭矩(Nm/rpm):').findNext('td').text drive = specpage.find('em', text=u'驱动方式:').findNext('td').text else: price = float(specpage.find('div', attrs={ 'class': 'price' }).b.text) * 10 horsepower = 0
def getspec():
    """Fill in price and engine specs for every model in ``db.model``.

    Models that already have a ``price`` are reported and skipped. For the
    others the page at ``baselink + model_url`` is downloaded. When the page
    has a displacement label the price, power, displacement, intake type,
    torque and drive type are read from it; otherwise only the price is read
    (from ``<div class="price">``) and the remaining fields are stored as 0.
    The document is then updated in place with ``$set``.
    """
    check = db.model
    for car in check.find():
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        url = car['url']
        model = car['model']
        model_url = car['model_url']
        _id = car['_id']

        # The stored price is a float, so the former test
        # ``type(car.get('price')) != dict`` was always true: nothing was
        # ever skipped and every model was downloaded again on each run.
        if car.get('price') is not None:
            print('Spec already there')
            continue

        specpage = crawler.readlink(baselink + model_url)
        if specpage.findAll('em', text=u'排量(L):'):
            # Full spec table: each value is the <td> following its label.
            price = float(specpage.b.text) * 10
            horsepower = specpage.find(
                'em', text=u'最大功率(kW/rpm):').findNext('td').text
            liter = specpage.find('em', text=u'排量(L):').findNext('td').text
            engine_type = specpage.find(
                'em', text=u'进气形式:').findNext('td').text
            tourque = specpage.find(
                'em', text=u'最大扭矩(Nm/rpm):').findNext('td').text
            drive = specpage.find('em', text=u'驱动方式:').findNext('td').text
        else:
            # No spec table on the page: only the price is available.
            price = float(
                specpage.find('div', attrs={'class': 'price'}).b.text) * 10
            horsepower = 0
            liter = 0
            engine_type = 0
            tourque = 0
            drive = 0

        # NOTE(review): Collection.update is deprecated in newer PyMongo
        # releases; it is kept because its return value is printed below.
        s = check.update({'_id': _id}, {
            '$set': {
                'url': url,
                'brand': brand,
                'sub_brand': sub_brand,
                'name': name,
                'model': model,
                'model_url': model_url,
                'price': price,
                'engine_size': liter,
                'engine_type': engine_type,
                'horsepower': horsepower,
                'tourque': tourque,
                'drive': drive,
            }
        })
        # Single formatted argument so the line prints the same whether
        # print is a statement or a function.
        print('%s %s %s %s %s %s' % (s, brand, sub_brand, name, model, price))
def getspec():
    """Add price and engine specs to the models in ``db.model`` that lack them.

    A model whose document already has a ``price`` is reported and skipped.
    Otherwise the page at ``baselink + model_url`` is fetched. If it shows a
    displacement label, price, power, displacement, intake type, torque and
    drive type are read from it. If not, the price comes from
    ``<div class="price">`` and the other fields are stored as 0. Only the
    spec fields are written back, via ``$set``.
    """
    check = db.model
    for car in check.find():
        brand = car['brand']
        sub_brand = car['sub_brand']
        name = car['name']
        model = car['model']
        model_url = car['model_url']
        _id = car['_id']

        # ``price`` is stored as a float, never a dict, so the old check
        # ``type(car.get('price')) != dict`` could not detect finished models
        # and the "already there" branch was unreachable.
        if car.get('price') is not None:
            print('Spec already there')
            continue

        specpage = crawler.readlink(baselink + model_url)
        if specpage.findAll('em', text=u'排量(L):'):
            # Full spec table: each value is the <td> following its label.
            price = float(specpage.b.text) * 10
            horsepower = specpage.find(
                'em', text=u'最大功率(kW/rpm):').findNext('td').text
            liter = specpage.find('em', text=u'排量(L):').findNext('td').text
            engine_type = specpage.find(
                'em', text=u'进气形式:').findNext('td').text
            tourque = specpage.find(
                'em', text=u'最大扭矩(Nm/rpm):').findNext('td').text
            drive = specpage.find('em', text=u'驱动方式:').findNext('td').text
        else:
            # No spec table on the page: only the price is available.
            price = float(
                specpage.find('div', attrs={'class': 'price'}).b.text) * 10
            horsepower = 0
            liter = 0
            engine_type = 0
            tourque = 0
            drive = 0

        # NOTE(review): Collection.update is deprecated in newer PyMongo
        # releases; it is kept because its return value is printed below.
        s = check.update(
            {'_id': _id},
            {'$set': {
                'price': price,
                'engine_size': liter,
                'engine_type': engine_type,
                'horsepower': horsepower,
                'tourque': tourque,
                'drive': drive,
            }},
        )
        # Single formatted argument so the line prints the same whether
        # print is a statement or a function.
        print('%s %s %s %s %s %s %s' % (
            _id, brand, sub_brand, name, model, price, s))
# select DB collection cars = db.cars_url.find() check = db.cars_model # loop in collection for car in cars: brand = car['Brand'] name = car['Name'] url = car['url'] car_id = car['_id'] check_length = check.find_one({'url':url}) if type(check_length) != dict: allspecpage = crawler.readlink(baselink + url) for each in allspecpage.findAll('td',{'class':'list_version'}): model = each.a.string model_url = each.a['href'] car_model = { 'Brand': brand, 'Name': name, 'url': url, 'Model': model, 'Model_url': model_url } print brand + '|' + name + '|' + model mongo.insert_model(car_model) else: