def search(self):
    """Crawl cars.com for the GUI-selected maker/model and show price stats.

    Reads search parameters from the bound widgets, crawls the listing
    pages into a CSV under ``../data/``, analyzes prices, and pushes the
    min / mean / max into the label StringVars.
    """
    maker = self.maker_box.get()
    model = self.model_box.get()
    zipcode = int(self.zip_entry.get())
    radius = int(self.radius_var.get())
    condition = self.condition_var.get()
    car_json_file = "cars_com_make_model.json"
    directory = "../data/"
    # Bug fix: create the output directory if missing (read_and_crawl
    # already does this); otherwise writing the CSV fails.
    os.makedirs(directory, exist_ok=True)
    page_num = 1
    num_per_page = 100
    start_url = generate_url(maker, model, zipcode, radius, car_json_file,
                             condition, page_num, num_per_page)
    csv_name = "{}-{}-{:d}-{:d}-{:s}.csv".format(maker, model, zipcode,
                                                 radius, condition)
    csv_name = os.path.join(directory, csv_name)
    print("crawling {} {} {}...".format(condition, maker, model))
    craw_from_url(start_url, csv_name)
    print("finish crawling...")
    df = load_csvfile(csv_name)
    price_info = analyze_price(df)
    self.mean = int(price_info['mean'])
    self.min = int(price_info['min'])
    self.max = int(price_info['max'])
    self.min_label_text.set(self.min)
    self.mean_label_text.set(self.mean)
    self.max_label_text.set(self.max)
def read_and_crawl():
    """Crawl multiple maker/model pairs and compare their prices.

    Command-line driven: reads a ``maker:model`` file (argv[1]), crawls
    each pair from cars.com into a CSV under the output directory, runs
    the price analysis per model, and plots a comparison at the end.
    """
    if len(sys.argv) != 7:
        print("Usage: >> python {} <maker_model_file> <zip> <radius> <used or new> <json or keyfile> <output_dir>".format(sys.argv[0]))
        print("e.g. python {} <maker_model_file> 53715 25 used <json or keyfile> ./data/".format(sys.argv[0]))
        sys.exit(1)
    # Bug fix: the old `tuple(line.split(":"))` crashed on blank or
    # colon-free lines and mis-split model names containing ':'.  Parse
    # with maxsplit=1, strip whitespace here, and skip malformed lines.
    with open(sys.argv[1], 'r') as mmfile:
        maker_models = [
            tuple(part.strip() for part in line.split(":", 1))
            for line in mmfile
            if ":" in line
        ]
    zipcode = int(sys.argv[2])
    radius = int(sys.argv[3])
    condition = sys.argv[4]
    car_json_file = sys.argv[5]
    output_dir = sys.argv[6]
    # if the output_dir does not exist, create it
    os.makedirs(output_dir, exist_ok=True)
    car_infos = []
    price_infos = []
    for maker, model in maker_models:
        page_num = 1
        num_per_page = 100
        start_url = generate_url(maker, model, zipcode, radius, car_json_file,
                                 condition, page_num, num_per_page)
        csv_name = "{}-{}-{:d}-{:d}-{:s}.csv".format(maker, model, zipcode,
                                                     radius, condition)
        csv_name = os.path.join(output_dir, csv_name)
        print("crawling {} {} {}...".format(condition, maker, model))
        craw_from_url(start_url, csv_name)
        print("finish crawling...")
        df = load_csvfile(csv_name)
        car_infos.append(extract_info_from_csvfilename(csv_name))
        price_infos.append(analyze_price(df))
    plot_price_info(car_infos, price_infos)
def pipeline_carscom():
    """Crawling pipeline for cars.com.

    Gets search parameters interactively via ``user_input()``, crawls the
    matching listings into a CSV file in the requested directory, then
    prints a price analysis.
    """
    maker, model, zipcode, radius, condition, car_json_file, directory = user_input(
    )
    page_num = 1
    num_per_page = 100
    start_url = generate_url(maker, model, zipcode, radius, car_json_file,
                             condition, page_num, num_per_page)
    csv_name = "{}-{}-{:d}-{:d}-{:s}.csv".format(maker, model, zipcode,
                                                 radius, condition)
    # Bug fix: the directory chosen by the user was previously discarded
    # and replaced with the script's own directory.  Honor the user's
    # choice and make sure it exists.
    os.makedirs(directory, exist_ok=True)
    csv_name = os.path.join(directory, csv_name)
    print("crawling {} {} {}...".format(condition, maker, model))
    craw_from_url(start_url, csv_name)
    print("finish crawling...")
    df = load_csvfile(csv_name)
    car_info = extract_info_from_csvfilename(csv_name)
    price_info = analyze_price(df)
    print_price_info(price_info, car_info)
def pipeline_carscom(directory='./'):
    """Crawling pipeline for cars.com.

    Args:
        directory: output directory for the CSV file.
            NOTE(review): this parameter is currently dead — it is
            immediately shadowed by the directory returned from
            ``user_input()``; confirm which one should win.
    """
    maker, model, zipcode, radius, condition, car_json_file, directory = user_input(
    )
    page_num = 1
    num_per_page = 100
    start_url = generate_url(maker, model, zipcode, radius, car_json_file,
                             condition, page_num, num_per_page)
    csv_name = "{}-{}-{:d}-{:d}-{:s}.csv".format(maker, model, zipcode,
                                                 radius, condition)
    csv_name = os.path.join(directory, csv_name)
    print("crawling...")
    craw_from_url(start_url, csv_name)
    print("finish crawling...")
    df = load_csvfile(csv_name)
    # Bug fix: recover maker/model from the file name with
    # os.path.basename instead of manual '/' slicing, which breaks on
    # Windows path separators.
    base = os.path.basename(csv_name)
    maker, model = base.split('-')[:2]
    maker = maker.upper()
    model = model.upper()
    analyze_price(df, maker, model)
def pipeline_market_check(directory='./'):
    """The whole crawling pipeline for market check.

    Queries the marketcheck API page by page, collects the listings into
    CSV rows, writes them out, and runs a price analysis.

    Args:
        directory: output directory.
            NOTE(review): shadowed by the directory returned from
            ``user_input()`` — confirm which one should win.
    """
    maker, model, zipcode, radius, condition, market_key_file, directory = user_input(
    )
    # convert zipcode to latitude and longitude
    zipSearch = ZipcodeSearchEngine()
    zipinfo = zipSearch.by_zipcode(str(zipcode))
    latitude, longtitude = str(zipinfo["Latitude"]), str(zipinfo["Longitude"])
    # read access key for the api
    with open(market_key_file, "r") as key:
        api_key = key.read()
    car_market_url = "http://api.marketcheck.com/v1/search"
    # start with index = 0, rows = 50 (max cars per request)
    max_rows_per_request = 50
    querystring = {
        "api_key": api_key,
        "make": maker,
        "latitude": latitude,
        "longitude": longtitude,
        "radius": str(radius),
        "car_type": condition,
        "seller_type": "dealer",
        "start": "0",
        "rows": str(max_rows_per_request),
    }
    if model != "all":
        querystring["model"] = model
    headers = {'Host': 'marketcheck-prod.apigee.net'}
    response = requests.request("GET", car_market_url, headers=headers,
                                params=querystring)
    cars_json = json.loads(response.text)
    count = cars_json["num_found"]
    # ceiling division: how many pages we need to fetch everything
    num_of_requests = (count + max_rows_per_request - 1) // max_rows_per_request
    print("Total number of request is {:d}".format(num_of_requests))
    short_csv_header = [
        "name", "VIN", "price", "miles", "Exterior Color", "Interior Color"
    ]
    # names from API response, parallel to short_csv_header
    dict_header = [
        "heading", "vin", "price", "miles", "exterior_color", "interior_color"
    ]
    # car attributes stored in csv table
    long_csv_header = [
        "name", "VIN", "make", "model", "year", "price", "miles",
        "Exterior Color", "Interior Color", "Seller Name", "Seller Phone",
        "Transmission", "Drivetrain"
    ]
    csv_rows = []  # stores each crawled car as a dictionary
    for ite in range(num_of_requests):
        print("Sending the {:d}th request".format(ite))
        # Bug fix: the first page was previously fetched twice — once
        # before the loop and again at ite == 0, discarding the initial
        # response.  Reuse the initial response for the first iteration.
        if ite != 0:
            querystring["start"] = str(ite * max_rows_per_request)
            response = requests.request("GET", car_market_url,
                                        headers=headers, params=querystring)
            cars_json = json.loads(response.text)
        for car in cars_json["listings"]:
            # car is a dictionary from the API response
            car_dict = {
                ch: car.get(dh, None)
                for ch, dh in zip(short_csv_header, dict_header)
            }
            if "dealer" in car:
                dealer = car["dealer"]
                car_dict["Seller Phone"] = dealer.get("phone", None)
                car_dict["Seller Name"] = dealer.get("name", None)
            if "build" in car:
                build = car["build"]
                car_dict["Transmission"] = build.get("transmission", None)
                car_dict["Drivetrain"] = build.get("drivetrain", None)
                car_dict["year"] = build.get("year", None)
                car_dict["make"] = build.get("make", None)
                car_dict["model"] = build.get("model", None)
            csv_rows.append(dict(car_dict))
    # write data to csv file
    csv_name = "{}-{}-{:d}-{:d}-{:s}.csv".format(maker, model, zipcode,
                                                 radius, condition)
    csv_name = os.path.join(directory, csv_name)
    write_cars_to_csv(csv_name, long_csv_header, csv_rows)
    # do some price analysis there
    df = load_csvfile(csv_name)
    maker, model = maker.upper(), model.upper()
    analyze_price(df, maker, model)