def get_save_selling_num(url):
    """Scrape one monthly sales table from *url* and persist the results.

    The function performs four steps:

    1. Downloads the page and reads every ``<td>`` of the first table whose
       ``border`` attribute is ``'1'`` (falling back to ``'3'``).  Cell text
       is stripped, lower-cased and has newlines replaced by spaces.
    2. Derives the month key from the third cell of the first row (text of
       the form ``"<month name> <year>"``) as ``<year> + month_dict[<name>]``
       and merges it into the stored ``'month_list'`` entity, which is kept
       ordered from the numerically largest month key to the smallest.
    3. Saves one entity per remaining row through ``ds.save_entity`` with the
       keys ``brand``, ``model``, ``month``, ``car_type``, ``selling_no`` and
       ``selling_no_ytd``.
    4. Re-reads all entities of that month and saves the summed
       ``selling_no`` per brand in ``TREND_TOP_V_BRAND`` and per type in
       ``TREND_TOP_V_TYPE``.

    Args:
        url: Address of the page containing the sales table.

    Returns:
        None.  All results are written through ``ds``.

    Raises:
        AttributeError: if neither table variant is present on the page.
        IndexError: if the table has no rows or a row has too few cells.
        KeyError: if the month name is not a key of ``month_dict``.
        ValueError: if a sales cell is not an integer (after removing commas).
    """
    printable_chars = set(string.printable)

    # Months already known to the datastore; ``or []`` guards a stored None.
    months_to_show = []
    month_list_entity = ds.get_saved_months('month_list')
    if month_list_entity is not None:
        months_to_show = month_list_entity.month_list or []

    # Always release the HTTP response, even if reading it fails.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'border': '1'})
    if table is None:
        table = soup.find('table', attrs={'border': '3'})
    table_body = table.find('tbody')
    if table_body is None:
        # html.parser does not synthesize a <tbody>; the rows then hang
        # directly off the <table> element.
        table_body = table

    all_rows = []
    for row in table_body.find_all('tr'):
        all_rows.append([
            cell.text.strip().lower().replace('\n', ' ')
            for cell in row.find_all('td')
        ])

    # Header cell -> month key.  The month name is normalized ONCE (trailing
    # '.' of abbreviations removed) and the resulting month number is reused
    # below, so both lookups agree on the key.
    month_str = all_rows[0][2].split()
    month_no = month_dict[month_str[0].replace('.', '')]
    month = month_str[1] + month_no
    month_as_int = int(month)

    if not months_to_show:
        months_to_show.append(month)
        ds.save_updated_months('month_list', months_to_show)
    elif month not in months_to_show:
        # Insert in front of the first smaller month; if there is none the
        # new month is the smallest and goes to the end.
        for position, known_month in enumerate(months_to_show):
            if month_as_int > int(known_month):
                months_to_show.insert(position, month)
                break
        else:
            months_to_show.append(month)
        ds.save_updated_months('month_list', months_to_show)

    # Tables before 201310 carry the year-to-date figure in column 4,
    # later ones in column 5.
    ytd_in_column_4 = month_as_int < 201310

    # Running totals used to fold the individual prius variants into one
    # 'prius' model.
    prius_num = 0
    prius_ytd = 0

    for cells in all_rows[1:]:
        name_parts = cells[1].replace('*', '').strip().split()
        brand = name_parts[0].strip()
        if 'prius' in name_parts:
            brand = 'toyota'
            if name_parts[0] == 'prius':
                # Row name starts with 'prius' (no brand word in front).
                model = 'prius x'
            else:
                model = ' '.join(name_parts[1:])
        elif 'ram' in name_parts:
            brand = 'ram'
            model = 'p/u'
        else:
            # Drop a trailing token that contains non-printable characters.
            if not set(name_parts[-1]).issubset(printable_chars):
                del name_parts[-1]
            model = ' '.join(name_parts[1:])
        if brand == 'bmw':
            model = "3&4 series"

        selling_no = int(cells[2].replace(',', ''))
        if len(cells) < 5:
            # No year-to-date column: only in month '01' does the monthly
            # figure equal the year-to-date figure.
            selling_no_ytd = selling_no if month_no == '01' else 0
        elif ytd_in_column_4:
            if cells[4] == 'n/a':
                selling_no_ytd = 0
            else:
                selling_no_ytd = int(cells[4].replace(',', ''))
        else:
            selling_no_ytd = int(cells[5].replace(',', ''))

        if model == 'prius':
            # The plain prius row (re)starts the running total.
            prius_num = selling_no
            prius_ytd = selling_no_ytd
        elif 'prius' in model:
            # Any other prius variant is added to the running total and
            # saved under the common 'prius' model.
            prius_num += selling_no
            prius_ytd += selling_no_ytd
            selling_no = prius_num
            selling_no_ytd = prius_ytd
            model = 'prius'
        elif model == 'corolla/matrix':
            model = 'corolla'
        elif model in ('ram p/u', 'ram', 'p/u'):
            model = 'p/u'

        if model in TOP_30_TRUCKS:
            car_type = 'truck'
        elif model in TOP_30_CARS:
            car_type = 'car'
        elif model in TOP_30_SUVS:
            car_type = 'suv'
        elif model in TOP_VANS:
            car_type = 'van'
        else:
            car_type = 'car'
            print("##########model type not found for %s" % model)

        # A fresh dict per row so no saved record shares state with the next.
        entity = {
            'brand': brand,
            'model': model,
            'month': month,
            'car_type': car_type,
            'selling_no': selling_no,
            'selling_no_ytd': selling_no_ytd,
        }
        entry_key = str(brand) + str(model) + str(month)
        ds.save_entity(entry_key, entity)

    data_entities = ds.get_all_entities_bymonth(month)

    monthly_total_perbrand = {'month': month}
    for brand_name in TREND_TOP_V_BRAND:
        monthly_total_perbrand['brand'] = brand_name
        monthly_total_perbrand['monthly_total_no'] = sum(
            entity.selling_no
            for entity in data_entities
            if brand_name == entity.brand
        )
        ds.save_monthly_total_perbrand(brand_name + month,
                                       monthly_total_perbrand)

    monthly_total_pertype = {'month': month}
    for type_name in TREND_TOP_V_TYPE:
        monthly_total_pertype['car_type'] = type_name
        # The pseudo type "all" sums every entity of the month.
        monthly_total_pertype['monthly_total_no'] = sum(
            entity.selling_no
            for entity in data_entities
            if type_name == "all" or type_name == entity.car_type
        )
        ds.save_monthly_total_pertype(type_name + month,
                                      monthly_total_pertype)

def _locate_tweet(tweet):
    """Return ``{'coordinate': ..., 'place': ...}`` for *tweet*, or None.

    Three sources are tried in order:

    1. ``tweet.place`` with country code "US": the coordinate is the average
       of the points of the first bounding-box ring, the place is the
       place's ``full_name``.
    2. ``tweet.coordinates``: accepted only if ``is_coordinate_in_us`` is
       truthy and ``get_place_reverse_geocode`` yields a place.
    3. ``tweet.user.location``: geocoded with ``get_coordinate_geocode`` and
       accepted only if the result lies in the US.

    None means the tweet could not be located and must be skipped.
    """
    place = tweet.place
    if place is not None and place['country_code'] == "US":
        ring = place['bounding_box']['coordinates'][0]
        if not ring:
            # Nothing to average; avoids a ZeroDivisionError.
            return None
        point_count = len(ring)
        # Each point is indexed [0] then [1]; presumably (lng, lat) order.
        center_lng = float(sum(point[0] for point in ring)) / point_count
        center_lat = float(sum(point[1] for point in ring)) / point_count
        return {
            'coordinate': [center_lng, center_lat],
            'place': place['full_name'],
        }

    if tweet.coordinates is not None:
        coordinate = tweet.coordinates['coordinates']
        if not is_coordinate_in_us(coordinate):
            return None
        place_name = get_place_reverse_geocode(coordinate)
        if place_name is None:
            return None
        return {'coordinate': coordinate, 'place': place_name}

    if tweet.user is not None:
        # The profile location is free text and may be missing (None).
        location = (tweet.user.location or u"").strip()
        location = location.encode('utf-8', 'ignore')
        if not location:
            return None
        coordinate = get_coordinate_geocode(location)
        if coordinate is None or not is_coordinate_in_us(coordinate):
            return None
        return {'coordinate': coordinate, 'place': location}

    return None


def get_save_tweet(debug=True):
    """Search recent English tweets per vehicle and save the US-located ones.

    For every query string (``TESTING_V`` when *debug* is truthy, otherwise
    ``ALL_TOP_V_SEARCH``) up to 50 recent tweets newer than the stored tweet
    id are requested through ``api.GetSearch``.  Truncated tweets and tweets
    that cannot be located in the US are skipped.  Every remaining tweet is
    saved through ``ds.save_entity`` with the keys ``coordinate``, ``place``,
    ``model``, ``month`` (``YYYYMM`` of the creation time), ``text`` and
    ``tag`` (always ``'UNKNOWN'``).

    Finally the search cursor is stored with ``ds.save_tweet_id``.  The value
    stored is the largest tweet id seen in this run, and never less than the
    id the run started from, so an empty run does not reset the cursor and
    the next run does not fetch the same tweets again.

    Args:
        debug: When truthy, search the ``TESTING_V`` query list instead of
            ``ALL_TOP_V_SEARCH``.

    Returns:
        None.  All results are written through ``ds``.
    """
    search_tweet_id = 0
    tweet_id_entity = ds.get_tweet_id('tweet_id')
    if tweet_id_entity is not None:
        search_tweet_id = tweet_id_entity.saved_id
        print("####Got saved tweet id %d" % search_tweet_id)

    # The requested count is the same in debug and normal mode.
    target_count = 50
    target_vehicles = ALL_TOP_V_SEARCH
    if debug:
        target_vehicles = TESTING_V

    newest_tweet_id = search_tweet_id
    for query_str in target_vehicles:
        query_words = query_str.split()
        # The second word of the query starts the entity key; a one-word
        # query falls back to the whole query string.
        key_word = query_words[1] if len(query_words) > 1 else query_str

        tweets_list = api.GetSearch(term=query_str, lang="en",
                                    count=target_count, result_type='recent',
                                    since_id=search_tweet_id)
        for tweet in tweets_list:
            # Advance the cursor over every tweet returned, saved or not.
            if tweet.id > newest_tweet_id:
                newest_tweet_id = tweet.id

            if tweet.truncated:
                print("####got truncated tweet")
                continue

            ret_dict = _locate_tweet(tweet)
            if ret_dict is None or ret_dict['coordinate'] == []:
                continue

            ret_dict['model'] = query_str
            # created_at is parsed with a literal "+0000" offset, i.e. UTC.
            tweet_date = datetime.strptime(
                tweet.created_at, '%a %b %d %H:%M:%S +0000 %Y'
            ).replace(tzinfo=pytz.UTC)
            ret_dict['month'] = tweet_date.strftime("%Y%m")
            ret_dict['text'] = tweet.text.strip().encode("utf-8", 'ignore')
            ret_dict['tag'] = 'UNKNOWN'

            # Key = query word + first five letters of the text + coordinate.
            additional_str = "".join(re.findall("[a-zA-Z]+", ret_dict['text']))
            entry_key = key_word + additional_str[:5] \
                + str(ret_dict['coordinate'][0]) \
                + str(ret_dict['coordinate'][1])
            print(ret_dict)
            ds.save_entity(entry_key, ret_dict)

    if newest_tweet_id != search_tweet_id:
        print("####save the last tweet id to %d" % newest_tweet_id)
    ds.save_tweet_id('tweet_id', newest_tweet_id)