def scrape_DIYothers(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "roundy"})
    items = {}
    for tr in table[2].find_all("tr")[1:]:
        name = tr.find_all("td")[0].a.text
        item = {
            "name": name,
            "imageLink": tr.find_all("a")[1]['href'],
            "materials": separate_by_br(tr.find_all("td")[2]).lstrip().strip("\n").split(","),
            "materialsImageLink": get_image_links(tr.find_all("td")[2].find_all("img")),
            "sizeImageLink": tr.find_all("td")[3].img.get("data-src"),
            "obtainedFrom": tr.find_all("td")[4].text.strip().strip("\n").splitlines(),
            "price": parse_price(tr.find_all("td")[5].text)
        }
        if item["obtainedFrom"] == ["Nook Stop (1,000 )"]:  # TODO: rewrite this lazy code
            item["obtainedFrom"] = ["Nook Stop (1,000 Nook Miles)"]
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
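# The helpers called above (parse_price, dump_data, separate_by_br, get_image_links)
# are defined elsewhere in the project and are not shown here. A minimal sketch of
# what parse_price and dump_data might look like, judging only from how they are
# called in these scrapers; the real implementations may differ.
import json
import os
import re


def parse_price(text):
    # Hypothetical: pull the first integer out of a price cell such as "1,200 Bells".
    match = re.search(r"\d[\d,]*", text)
    return int(match.group(0).replace(",", "")) if match else None


def dump_data(items, relative_path):
    # Hypothetical: write the scraped data to e.g. data/crafting/<key>.json.
    path = os.path.join("data", relative_path + ".json")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=2, ensure_ascii=False)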
def scrape_fossils(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})
    items = {}

    # Stand-alone fossils
    items["stand_alone"] = {}
    for tr in table[0]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tr("td")[2].text),
        }
        items["stand_alone"][name] = item

    # Multi-part fossils
    items["multi_part"] = {}
    for tr in table[1]("tr")[1:]:
        tds = tr("td")
        if not tds:
            # header rows carry the fossil category name
            currentCategory = tr("a")[0].text
            continue
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tr("td")[2].text),
            "category": currentCategory
        }
        items["multi_part"][name] = item

    dump_data(items, "museum/" + key)
    return items
def scrape_equipments(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "sortable"})
    items = {}
    for tr in table[0].find_all("tr")[1:]:
        name = tr.find_all("td")[0].a.text
        item = {
            "name": name,
            "imageLink": tr.find_all("a")[1]['href'],
            "materials": separate_by_br(tr.find_all("td")[2]).lstrip().strip("\n").split(","),
            "materialsImageLink": get_image_links(tr.find_all("td")[2].find_all("img")),
            "sizeImageLink": tr.find_all("td")[3].img.get("data-src"),
            "obtainedFrom": tr.find_all("td")[4].text.strip().strip("\n").splitlines(),
            "price": parse_price(tr.find_all("td")[5].text)
        }
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
def scrape_furniture_housewares(key):
    url = URLS["furniture"][key]
    response = requests.get(url, timeout=5)
    # html.parser does not scrape all html contents
    soup = BeautifulSoup(response.content, "html5lib")
    tables = soup("table", {"class": "roundy"})
    items = {}
    for table_number in range(3, 29):  # one table per letter, a - z
        if len(tables[table_number]("tr")) > 3:  # some tables are empty
            for tr in tables[table_number]("tr")[2:]:
                name = tr("td")[1].text.strip()
                item = {
                    "image_url": parse_image_url(tr("td")[0]),
                    "price": {
                        "buy": parse_price(tr("td")[2].text),
                        "sell": parse_price(tr("td")[3].text)
                    },
                    "source": parse_source(tr("td")[4]),
                    "variations": parse_furniture_variations(tr("td")[5]),
                    "customization": parse_customization(tr("td")[6]),
                    "size_image_url": parse_image_img_url(tr("td")[7]),
                }
                items[name] = item
    dump_data(items, "furniture/" + key)
    return items
def scrape_shoes(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "roundy"})
    items = {}
    for tableNumber in range(2, 8):
        for tr in table[tableNumber].find_all("tr")[2:]:
            name = tr.find_all("td")[0].text.strip()
            item = {
                "name": name,
                "priceBuy": parse_price(tr.find_all("td")[2].text),
                "priceSell": parse_price(tr.find_all("td")[3].text),
                "source": parse_source(tr.find_all("td")[4]),
                "variations": parse_variations(tr.find_all("td")[5]),
                "variationImageLinks": get_image_links(tr.find_all("td")[5].find_all("img"))
            }
            # not every row has an image link
            if tr.find_all("td")[1].find_all("a"):
                item["imageLink"] = tr.find_all("td")[1].find_all("a")[0]["href"]
            items[name] = item
    dump_data(items, "clothing/" + key)
    return items
def scrape_wallpapers(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "sortable"})
    items = {}
    for tr in table[0].find_all("tr")[1:]:
        name = tr.find_all("td")[0].a.text
        item = {
            "name": name,
        }
        if tr.find_all("a")[1]['href']:
            item["imageLink"] = tr.find_all("a")[1]['href']
        if tr.find_all("td")[2]:
            item["materials"] = separate_by_br(tr.find_all("td")[2]).strip("\n").split(",")
            item["materialsImageLink"] = get_image_links(tr.find_all("td")[2].find_all("img"))
        if tr.find_all("td")[3].find_all("a"):
            item["sizeLink"] = tr.find_all("td")[3].find_all("a")[0]['href']
        if tr.find_all("td")[4].text:
            obtained_from = tr.find_all("td")[4].text.strip('\n').splitlines()
            if obtained_from:
                item["obtainedFrom"] = obtained_from
        if tr.find_all("td")[5].text.strip().replace(",", ""):
            item["price"] = int(tr.find_all("td")[5].text.strip().replace(",", ""))
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
def scrape_fish(url_key):
    items = {}
    url = URLS["museum"][url_key]
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})[0]
    item_id = 1
    for tr in table("tr")[1:]:
        name = tr("td")[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        # pull catch/museum phrases for this fish from the ACNH API
        acnhapi_url = URLS["api"] + "/fish/" + str(item_id)
        acnhapi_response = urllib.request.urlopen(acnhapi_url)
        acnhapi_data = json.loads(acnhapi_response.read())
        item = {
            "name": name,
            "id": item_id,
            "wiki_url": "https://animalcrossing.fandom.com" + tr("td")[0].find("a")["href"],
            "icon_url": tr("a")[1]['href'],
            "image_url": URLS["api"] + "/images/fish/" + str(item_id),
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "shadow_size": tr("td")[4].text.strip(),
            "time": tr("small")[0].text.split(" & "),
            "months": {
                # columns 6-17 are January through December (northern hemisphere);
                # the southern hemisphere is the same list rotated by six months
                "northern": parse_months([tr("td")[i] for i in range(6, 18)]),
                "southern": parse_months([tr("td")[i] for i in list(range(12, 18)) + list(range(6, 12))]),
            },
            "catch_phrase": acnhapi_data["catch-phrase"],
            "museum_phrase": acnhapi_data["museum-phrase"],
        }
        item_id += 1
        items[item_key] = item
    dump_data(items, "museum/fish")
def scrape_bugs(url_key):
    # contains all bugs
    items = {}
    # get response from url and create soup
    url = URLS["museum"][url_key]
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    # find the table to scrape from
    table = soup("table", {"class": "sortable"})[0]
    item_id = 1
    for tr in table("tr")[1:]:
        name = tr("td")[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        acnhapi_url = URLS["api"] + "/bugs/" + str(item_id)
        acnhapi_response = urllib.request.urlopen(acnhapi_url)
        acnhapi_data = json.loads(acnhapi_response.read())
        item = {
            "name": name,
            "id": item_id,
            "wiki_url": URLS["wiki"] + tr("td")[0].find("a")["href"],
            "icon_url": tr("a")[1]['href'],
            "image_url": URLS["api"] + "/images/bugs/" + str(item_id),
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "time": tr("small")[0].text.split(" & "),
            "months": {
                "northern": parse_months([tr("td")[i] for i in range(5, 17)]),
                "southern": parse_months([tr("td")[i] for i in list(range(11, 17)) + list(range(5, 11))]),
            },
            "catch_phrase": acnhapi_data["catch-phrase"],
            "museum_phrase": acnhapi_data["museum-phrase"],
        }
        item_id += 1
        items[item_key] = item
    dump_data(items, "museum/bugs")
def scrape_villagers(key):
    # get list of villager urls
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})
    villagers_urls = []
    for tr in table[0]("tr")[1:]:
        villagers_urls.append("https://animalcrossing.fandom.com" + tr("td")[0].a.get("href"))

    # scrape each villager page
    villagers_info = {}
    for vu in villagers_urls:
        response = requests.get(vu, timeout=5)
        soup = BeautifulSoup(response.content, "html.parser")
        asides = soup("aside")
        name = asides[0]("h2")[0].text
        item = {}
        item["image_url"] = asides[0]("img")[0].get("src").replace("/scale-to-width-down/350", "")
        if len(asides[0]("figcaption")) > 0:
            item["caption"] = asides[0]("figcaption")[0].text
        else:
            item["caption"] = None
        for div in asides[0]("div", {"class": "pi-item"}):
            if div.find("div").text == "Unknown":
                item[div("h3")[0].text.lower().replace(" ", "_")] = None
            else:
                item[div("h3")[0].text.lower().replace(" ", "_")] = div.find("div").text
        villagers_info[name] = item
    dump_data(villagers_info, "characters/villagers")
def scrape_crafting_others(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "roundy"})
    items = {}
    for tr in tables[2]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "materials": parse_materials(tr("td")[2]),
            # TODO: add Nook Miles, e.g. .replace(")", "Nook Miles)")
            "obtained_from": parse_obtained_from(tr("td")[4]),
            "price": parse_price(tr("td")[5].text)
        }
        # the size image may be lazy-loaded ("data-src"), inline ("src"), or missing
        size_img = tr("td")[3].img
        if size_img and size_img.get("data-src"):
            item["size_image_url"] = size_img.get("data-src")
        elif size_img:
            item["size_image_url"] = size_img.get("src")
        else:
            item["size_image_url"] = None
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
def scrape_bugs(key):
    # take url and return object containing bugs data
    url = URLS.get(key)
    # create soup object
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    # find the target table
    table = soup("table", {"class": "sortable"})
    items = {}
    # go through each tr in the table, ignoring the table header
    for tr in table[0]("tr")[1:]:
        # scrape each item
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "time": tr("small")[0].text,
            "months": {
                "northern": parse_months([tr("td")[i] for i in range(5, 17)]),
                "southern": parse_months([tr("td")[i] for i in list(range(11, 17)) + list(range(5, 11))]),
            }
        }
        items[name] = item
    # dump data in a json
    dump_data(items, "museum/" + key)
    # return for debugging
    return items
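# parse_months is always called with twelve <td> cells, January through December,
# and the southern-hemisphere call simply rotates that list by six months. A
# plausible sketch (not necessarily this project's implementation) that turns the
# cells into a per-month availability map:


def parse_months(month_cells):
    month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    # Hypothetical: assume the wiki marks unavailable months with "-",
    # and anything else (e.g. a check mark) means available.
    return {name: cell.text.strip() != "-" for name, cell in zip(month_names, month_cells)}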
def parse_g1mg_subchunk_0x10001(schunk_data):
    dump_chunk = True
    get = get_getter(schunk_data, "<")
    schunk_type, schunk_size = get(0x0, "2I")
    entry_count = get(0x8, "I")
    reserved = get(0xc, "I")
    assert reserved == 0x100
    assert len(schunk_data) == 0x10 + entry_count * 0x40
    log("entry_count=%d" % entry_count, lv=0)
    # entry size == 0x40, but not matrix
    if dump_chunk:
        dump_data("g1mg_0x10001.bin", schunk_data)
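# get_getter(data, byte_order) is used throughout these G1M parsers as a small
# struct-unpacking closure: get(offset, fmt) reads fmt at offset with the given
# endianness and returns a scalar for single-value formats or a tuple otherwise
# (with an optional force_tuple override, as used in parse_g1mg_subchunk_0x10003).
# A sketch under those assumptions; the original helper may differ:
import struct


def get_getter(data, byte_order):
    def get(offset, fmt, force_tuple=False):
        values = struct.unpack_from(byte_order + fmt, data, offset)
        if len(values) == 1 and not force_tuple:
            return values[0]
        return values
    return get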
def scrape_fish(key):
    # same logic as scrape_bugs
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})
    items = {}
    for tr in table[0]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "shadow_size": tr("td")[4].text.strip(),
            "time": tr("small")[0].text,
            "months": {
                "northern": parse_months([tr("td")[i] for i in range(6, 18)]),
                "southern": parse_months([tr("td")[i] for i in list(range(12, 18)) + list(range(6, 12))]),
            }
        }
        items[name] = item
    dump_data(items, "museum/" + key)
    return items
def parse_g1mg_subchunk_0x10006(schunk_data):
    dump_chunk = False
    get = get_getter(schunk_data, "<")
    schunk_type, schunk_size = get(0x0, "2I")
    entry_count = get(0x8, "I")
    off = 0xc
    log("entry_count=%d" % entry_count, lv=0)
    for entry_idx in xrange(entry_count):
        item_count = get(off, "I")
        mat_ref_idx, unk0, unk1, unk2, joint_map_idx = get(off + 0x4, "IHHHH")
        assert unk0 == 0x8000 and unk2 == 0x8000
        # count(locals(), "unk1")
        off += 0x4 + item_count * 0xc
    assert off == len(schunk_data)
    if dump_chunk:
        dump_data("g1mg_0x10006.bin", schunk_data)
def scrape_villagers(url_key):
    url = URLS["character"][url_key]
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "sortable"})
    items = {}
    # these headers must be scraped from their individual wiki page
    headers = ["initial_clothes", "caption", "home_request", "skill", "goal",
               "coffee", "style", "favorite_song", "appearances"]
    for tr in tables[0]("tr")[1:]:
        name = tr("td")[0].text.strip()
        item = {
            "wiki_url": "https://animalcrossing.fandom.com" + tr("td")[0].a.get("href"),
            # fix data:images
            "image_url": tr("td")[1]("a")[0]("img")[-1]["src"].replace("scale-to-width-down/100", ""),
            "gender": parse_gender(tr("td")[2]),
            "personality": parse_personality(tr("td")[2]),
            "species": tr("td")[3].text.strip(),
            "birthday": tr("td")[4].text.strip(),
            "initial_phrase": tr("td")[5].text.strip().replace("\"", ""),
            "hobbies": tr("td")[6].text.strip(),
        }

        # scrape additional information from the character's page
        for header in headers:
            item[header] = None
        villager_response = requests.get(item["wiki_url"], timeout=5)
        villager_soup = BeautifulSoup(villager_response.content, "html.parser")
        aside = villager_soup("aside")[0]
        if len(aside("figcaption")) > 0:
            item["caption"] = aside("figcaption")[0].text.replace("“", "").replace("”", "")
        for div in aside("div", {"class": "pi-item"}):
            if not div.find("div").text == "Unknown":
                if div("h3")[0].text.lower().replace(" ", "_") in headers:
                    item[div("h3")[0].text.lower().replace(" ", "_")] = div.find("div").text

        # format unformatted text
        if item["coffee"] is not None:
            coffee = item["coffee"].split(",")
            item["coffee"] = {
                "type": coffee[0],
                "milk": coffee[1],
                "sugar": coffee[2]
            }
        if item["appearances"] is not None:
            item["appearances"] = item["appearances"].split(", ")
        if item["favorite_song"] is not None:
            item["favorite_song"] = item["favorite_song"].replace("[[", "").replace("]]", "")

        items[name] = item
    dump_data(items, "character/villagers")
def parse_g1mg_subchunk_0x10007(schunk_data):
    print "index buffer block"
    get = get_getter(schunk_data, "<")
    schunk_type, schunk_size = get(0x0, "2I")
    ib_count = get(0x8, "I")
    off = 0xc
    index_buffer_list = []
    for j in xrange(ib_count):
        index_count, b, c = get(off, "3I")
        try:
            index_buffer_list.append(get(off + 0xc, "%dH" % index_count))
        except struct.error, e:
            # dump the raw sub-chunk for inspection before re-raising
            dump_data("g1mg_0x10007.bin", schunk_data)
            raise e
        off += 0xc + index_count * 2
        print "%d => index_count: %d, 0x%x" % (j, index_count, b)
def scrape_tools(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "sortable"})
    items = {}
    for tr in tables[0]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "materials": parse_materials(tr("td")[2]),
            "size_image_url": tr("td")[3].img.get("data-src"),
            "obtained_from": parse_obtained_from(tr("td")[4]),
            "price": parse_price(tr("td")[5].text)
        }
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
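# parse_materials and parse_obtained_from receive whole <td> cells. Judging from
# the string-based variants used elsewhere (separate_by_br(...).split(",")), they
# most likely flatten the cell's line breaks into a list of strings. A rough
# sketch under that assumption; the project's actual helpers may do more:


def parse_materials(td):
    # Hypothetical: one material per line, e.g. ["5x wood", "1x iron nugget"].
    return [line.strip() for line in td.get_text(separator="\n").splitlines() if line.strip()]


def parse_obtained_from(td):
    # Hypothetical: sources are listed one per line in the "Obtained from" column.
    return [line.strip() for line in td.get_text(separator="\n").splitlines() if line.strip()]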
def scrape_umbrellas(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "roundy"})
    items = {}
    for tr in table[2].find_all("tr")[2:]:
        name = tr.find_all("td")[0].text.strip()
        item = {
            "name": name,
            "imageLink": tr.find_all("td")[1].find_all("a")[0]["href"],
            "source": parse_source(tr.find_all("td")[2]),
            "priceBuy": parse_price(tr.find_all("td")[3].text),
            "priceSell": parse_price(tr.find_all("td")[4].text),
        }
        items[name] = item
    dump_data(items, "clothing/" + key)
    return items
def scrape_furniture_wallpapers(key):
    url = URLS["furniture"][key]
    response = requests.get(url, timeout=5)
    # html.parser does not scrape all html contents
    soup = BeautifulSoup(response.content, "html5lib")
    tables = soup("table", {"class": "roundy"})
    items = {}
    for tr in tables[3]("tr")[2:]:
        name = tr("td")[1].text.strip()
        item = {
            "image_url": parse_image_url(tr("td")[0]),
            "price": {
                "buy": parse_price(tr("td")[2].text),
                "sell": parse_price(tr("td")[3].text)
            },
            "source": parse_source(tr("td")[4]),
        }
        items[name] = item
    dump_data(items, "furniture/" + key)
    return items
def scrape_music(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "article-table"})
    items = {}
    for tr in tables[0]("tr")[1:]:
        name = tr("td")[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        item = {
            "name": name,
            "image_url": parse_image_url(tr.find_all("td")[1]),
            "priceBuy": parse_price(tr.find_all("td")[2].text),
            "priceSell": parse_price(tr.find_all("td")[3].text),
            "source": parse_source(tr.find_all("td")[4])
        }
        items[item_key] = item
    dump_data(items, "music/" + key)
    return items
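# parse_image_url and parse_source also take whole <td> cells. A guess at their
# behaviour, based on how image links are read elsewhere in these scrapers
# (anchor hrefs and lazy-loaded "data-src" attributes) and on how the source
# column is used; the real helpers may handle more edge cases:


def parse_image_url(td):
    # Hypothetical: prefer a direct anchor href, fall back to the lazy-loaded image URL.
    if td.a and td.a.get("href"):
        return td.a["href"]
    if td.img:
        return td.img.get("data-src") or td.img.get("src")
    return None


def parse_source(td):
    # Hypothetical: return each source (Nook's Cranny, crafting, etc.) as a list entry.
    return [line.strip() for line in td.get_text(separator="\n").splitlines() if line.strip()]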
def extract(data, path):
    g1m_data = []
    get = get_getter(data, ">")
    fourcc = get(0x0, "8s")
    assert fourcc == "G1H_0020", "invalid fourcc"
    file_size = get(0x8, "I")
    assert len(data) == file_size, "file size does not match!"
    unk0, g1hp_chunk_count = get(0xc, "2H")
    assert unk0 == 0x10, "hey, guess wrong, this value has special meaning"
    g1hp_chunk_offset_list = get(0x10, "%dI" % g1hp_chunk_count)
    for i in xrange(g1hp_chunk_count):
        off = g1hp_chunk_offset_list[i]
        chunk_size = get(off + 0x8, "I")
        g1m_data = extract_g1hp(data[off: off + chunk_size])
        for j in xrange(len(g1m_data)):
            dump_data(path.replace(".g1h", "_%d_%d.g1m" % (i, j)), g1m_data[j])
def parse_g1mg_subchunk_0x10003(schunk_data):
    log("================", lv=0)
    log("uniforms", lv=0)
    log("================", lv=0)
    dump_chunk = False
    get = get_getter(schunk_data, "<")
    schunk_type, schunk_size = get(0x0, "2I")
    uniform_blk_cnt = get(0x8, "I")
    offset = 0xc
    for uniform_blk_idx in xrange(uniform_blk_cnt):
        uniform_cnt = get(offset + 0x0, "I")
        log("\nuniform block %d: uniform_num=%d" % (uniform_blk_idx, uniform_cnt), lv=0)
        offset += 0x4
        for uniform_idx in xrange(uniform_cnt):
            tot_len, name_len = get(offset, "2I")
            reserved0, datatype, reserved1 = get(offset + 0x8, "I2H")
            assert reserved0 == 0 and reserved1 == 1
            name = get(offset + 0x10, "%ds" % name_len).rstrip("\x00")
            rem_size = tot_len - 0x10 - name_len
            if 1 <= datatype <= 4:
                # datatypes 1-4 are float vectors of that length
                vec_size = datatype
                assert rem_size == vec_size * 0x4
                values = get(offset + 0x10 + name_len, "%df" % vec_size, force_tuple=True)
                values_string = ",".join(["%.4f" % v for v in values])
            elif datatype == 5:
                # datatype 5 is four unsigned bytes
                assert rem_size == 4
                values = get(offset + 0x10 + name_len, "4B", force_tuple=True)
                values_string = ",".join(["%d" % v for v in values])
            else:
                assert False
            log("\tuniform: %s, values=%s, datatype=%d" % (name, values_string, datatype), lv=0)
            offset += tot_len
    if dump_chunk:
        dump_data("g1mg_0x10003.bin", schunk_data)
def main(cat, cat_name):
    data = cat['dat']
    r, rbins = c.make_r_scale(.1, 20, 25)
    pair_proxies = ['c%.2f' % _ for _ in r]
    names = ['rhillmass', 'dm5e12', 's5', 'd1', 'Pall']
    proxy_list = [['rhillmass'], ['d5e12', 'm5e12'], ['s5'], ['d1'], pair_proxies]
    predicted_ssfrs = []
    for proxies, name in zip(proxy_list, names):
        data = util.load_proxies(data, 'data/' + cat_name + '/', proxies, proxies)
        features = proxies + ['mstar']
        dtrain, dtest, regressor = model.trainRegressor(data, features)
        predicted_ssfrs.append(dtest['pred'])
    log_dir = util.get_logging_dir(cat_name)
    for proxies, pred, name in zip(proxy_list, predicted_ssfrs, names):
        dtest['pred'] = pred
        util.train_and_dump_rwp(data, proxies + ['mstar'], name + '.dat', '',
                                cat['box_size'], cat['red_cut'], logging=False)
        util.train_and_dump_rwp_bins(data, proxies + ['mstar'], name + '.dat', '',
                                     cat['box_size'], num_splits=1,
                                     red_cut=cat['red_cut'], logging=False)
        xcf = c.cross_correlation_function(dtest, cat['red_cut'], box_size=cat['box_size'])
        util.dump_data(xcf, name + '_xcf.dat', log_dir)
        mcf = c.jackknife_mcf(dtest, box_size=cat['box_size'])
        util.dump_data(mcf, name + '_mcf.dat', log_dir)
        mlims = [(9.9, 10.1), (10.1, 10.3), (10.5, 10.7)]  # change for illustris
        fnames = [''.join([name, '_conformity_', str(num), '.dat']) for num in [10.0, 10.2, 10.6]]
        for mlim, fname in zip(mlims, fnames):
            res = c.radial_conformity_wrapper(dtest, cat['box_size'], mlim[0], mlim[1])
            util.dump_data(res, fname, log_dir)
def scrape_artworks(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "wikitable"})
    items = {}

    # paintings
    items["paintings"] = {}
    for tr in table[0]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "description": tr("td")[3].text.strip(),
        }
        item["fake_image_url"] = tr("td")[1].a['href'] if tr("td")[1].a else None
        item["real_image_url"] = tr("td")[2].a['href'] if tr("td")[2].a else None
        items["paintings"][name] = item

    # sculptures
    items["sculptures"] = {}
    for tr in table[1]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "description": tr("td")[3].text.strip(),
        }
        item["fake_image_url"] = tr("td")[1].a['href'] if tr("td")[1].a else None
        item["real_image_url"] = tr("td")[2].a['href'] if tr("td")[2].a else None
        items["sculptures"][name] = item

    dump_data(items, "museum/" + key)
    return items
def parse_g1mg_subchunk_0x10009(schunk_data):
    get = get_getter(schunk_data, "<")
    schunk_type, schunk_size = get(0x0, "2I")
    unk0 = get(0x8, "I")
    assert unk0 == 1 or unk0 == 2
    reserved = get(0xc, "3I")
    assert not any(reserved)
    shader_count1, shader_count2 = get(0x18, "II")
    dump_data("g1mg_0x10009.bin", schunk_data)
    off = 0x28
    print "1st block shader:"
    for shader_idx in xrange(shader_count1):
        unk1 = get(off, "I")
        # assert unk1 == 0xFFFFFFFF
        unk_cnt = get(off + 0x4, "I")
        off += 0x8 + unk_cnt * 0x4
        shader_name = get(off, "16s").rstrip("\x00")
        off += 0x10
        unk_2, unk3 = get(off, "2H")
        off += 0x4 + unk_2 * 0x2
        print "\tshader:", shader_name
def main(cat_name, proxy):
    cat = util.get_catalog(cat_name)
    df = cat['dat']
    print "num galaxies: ", len(df)
    data_dir = cat['dir']
    fname = data_dir + proxy + '.csv'
    print 'reading from: ', fname
    density = pd.read_csv(fname, header=None)
    print "num entries: ", len(density)
    df[proxy] = density.values
    df_train, df_test, m = model.trainRegressor(df, cat['box_size'], ['mstar', proxy])
    r_value = pearsonr(df_test['pred'], df_test['ssfr'])[0]
    print "Correlation Coefficient is: ", r_value
    # TODO: Add a lock here
    try:
        stat_dict = util.load_data('statistics.pckl', data_dir)
    except:
        stat_dict = defaultdict(dict)
    if not stat_dict['pearsonr'].has_key(proxy):
        stat_dict['pearsonr'][proxy] = []
    stat_dict['pearsonr'][proxy].append(r_value)
    util.dump_data(stat_dict, 'statistics.pckl', data_dir)
def scrape_furniture_housewares(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", {"class": "roundy"})
    items = {}
    for tr in table[3]("tr")[2:]:
        name = tr.find_all("td")[1].text.strip()
        item = {
            "name": name,
            # "imageLink": tr.find_all("td")[1].find_all("a")[0]["href"],
            "priceBuy": parse_price(tr.find_all("td")[2].text),
            "priceSell": parse_price(tr.find_all("td")[3].text),
            "source": parse_source(tr.find_all("td")[4]),
            "variations": parse_variations(tr.find_all("td")[5]),
            "customization": False,
            "sizeLink": tr.find_all("td")[6].img.get("data-src")
        }
        if tr.find_all("td")[1].find_all("a"):
            item["imageLink"] = tr.find_all("td")[0].find_all("a")[0]["href"]
        items[name] = item
    dump_data(items, "furniture/" + key)
    return items
def train():
    t1 = time.time()
    tf.reset_default_graph()
    with tf.variable_scope(name_or_scope='train', reuse=tf.AUTO_REUSE):
        cls, (x, y), (w0, w1, w2, w3, w4) = gm.result()
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=cls, name='loss')
        loss_mean = tf.reduce_mean(loss, name='loss_mean')
        global_step = tf.Variable(0, name='global_step')
        learning_rate = tf.train.exponential_decay(constant.LEARNING_RATE, global_step,
                                                   1000, 0.96, staircase=True, name='learning_rate')
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name='optimizer')
        train_op = optimizer.minimize(loss_mean, global_step=global_step, name='train_op')
    data_train = util.load_data(constant.DATA_TRAIN)
    data_test = util.load_data(constant.DATA_TEST)
    graph = tf.get_default_graph()
    # var_list = [i for i in tf.global_variables() if i.name.split('/')[1] == 'result']
    # saver = tf.train.Saver(var_list=var_list, max_to_keep=5)
    # [print(i) for i in tf.global_variables()]
    # [print(i.name) for i in graph.get_operations()]
    saver = tf.train.Saver(max_to_keep=5)
    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        idx_train = np.linspace(0, constant.TRAIN_TOTAL_SIZE - 1, constant.TRAIN_TOTAL_SIZE,
                                dtype=np.int32)
        step = 0
        accuracies_train = []
        accuracies_test = []
        losses = []
        ws = (w0, w1, w2, w3, w4)
        wa = WeightAdjust()
        wa.init(len(ws))
        for i in range(constant.EPOCH):
            np.random.shuffle(idx_train)
            for j in range(constant.TRAIN_TIMES_FOR_EPOCH):
                idx_j = np.linspace(j * constant.BATCH_SIZE, (j + 1) * constant.BATCH_SIZE - 1,
                                    constant.BATCH_SIZE, dtype=np.int32)
                idx_train_batch = idx_train[idx_j]
                _, labels_train, _, images_train = util.get_batch(data_train, idx_train_batch)
                feed_dict_train = {x: images_train, y: labels_train}
                cls_train, _loss, _ = sess.run([cls, loss_mean, train_op], feed_dict=feed_dict_train)
                arg_idx_train = np.argmax(cls_train, axis=1)
                accuracy_train = sum(labels_train == arg_idx_train) / constant.BATCH_SIZE
                # test
                idx_test_batch = np.random.randint(0, constant.TEST_TOTAL_SIZE, [constant.BATCH_SIZE])
                _, labels_test, _, images_test = util.get_batch(data_test, idx_test_batch)
                feed_dict_test = {x: images_test, y: labels_test}
                cls_test = sess.run(cls, feed_dict=feed_dict_test)
                arg_idx_test = np.argmax(cls_test, axis=1)
                accuracy_test = sum(labels_test == arg_idx_test) / constant.BATCH_SIZE
                step += 1
                if step % constant.PRINT_EVERY_TIMES == 0:
                    print('time:{},epoch:{},loss:{},accuracy_train:{:.2%},accuracy_test:{:.2%}'
                          .format(util.cur_time(), step, _loss, accuracy_train, accuracy_test))
                    accuracies_train.append(accuracy_train)
                    accuracies_test.append(accuracy_test)
                    losses.append(_loss)
            times = int(constant.TRAIN_TIMES_FOR_EPOCH / constant.PRINT_EVERY_TIMES)
            train_mean = util.mean(accuracies_train[-times:])
            test_mean = util.mean(accuracies_test[-times:])
            print('save model,step: {},train_mean:{},test_mean:{}'.format(step, train_mean, test_mean))
            saver.save(sess, save_path='./model/resnet/cifar-resnet.ckpt', global_step=step)
            wa.adjust(train_mean, test_mean, step)
            print(wa.action)
            if wa.action == 'adjust':
                print('weights adjusted for this epoch: {}'.format(wa.weights))
                assigns = gm.assign_weight(wa, ws)
                sess.run(assigns)
            elif wa.action == 'stop':
                break
            else:
                pass
        accuracy_map = {
            'accuracies_train': accuracies_train,
            'accuracies_test': accuracies_test,
            'losses': losses,
            'weights': wa
        }
        util.dump_data(accuracy_map, './accuracy_map.pkl')
    t2 = time.time()
    print('elapsed time: {}'.format(util.str_time(t2 - t1)))
import util

stats_to_delete = ['chi2_blue', 'chi2_red',
                   'chi2_red_00_0', 'chi2_red_00_1', 'chi2_red_00_2',
                   'chi2_red_25_0', 'chi2_red_25_1', 'chi2_red_25_2',
                   'chi2_red_50_0', 'chi2_red_50_1', 'chi2_red_50_2',
                   'chi2_red_75_0', 'chi2_red_75_1', 'chi2_red_75_2']

fname = 'statistics.pckl'
for name in ['Becker', 'EAGLE', 'Henriques', 'HW', 'Illustris', 'Lu', 'MB-II']:
    dir = 'data/' + name + '/'
    s = util.load_data(fname, dir)
    for stat in stats_to_delete:
        del s[stat]
    util.dump_data(s, fname, dir)
# TODO: make a log flag so that m5e12 gets log scaled as well
dat = util.load_proxies(dat, 'data/HW/', proxies, proxies)
features = proxies + ['mstar']
d_train, d_test = util.split_test_train(dat)
Xtr, ytr, xtrsc, ytrsc = util.select_features(features, d_train, scaled=True)
Xts, yts, xtssc, ytssc = util.select_features(features, d_test, scaled=True)

#poly = preprocessing.PolynomialFeatures(degree=2)
#Xtr_new = poly.fit_transform(Xtr)
#Xts_new = poly.fit_transform(Xts)

gp = GaussianProcess()
gp.fit(Xtr, ytr)
y_hat = gp.predict(Xts)

#clf = Lasso(alpha=0.2)
#clf.fit(Xtr_new, ytr)
#y_hat = clf.predict(Xts_new)

y_pred = ytssc.inverse_transform(y_hat)
y_test = ytssc.inverse_transform(yts)
p.sns.kdeplot(y_pred)
p.sns.kdeplot(y_test)
d_test = util.add_column(d_test, 'pred', y_pred)
results = c.wprp_split(d_test, red_split=-11, box_size=cat['box_size'])
util.dump_data(results, 'gp.dat', util.get_logging_dir('HW'))
def scrape_flowers(key):
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table")

    # availability
    items = {}
    for tr in tables[0]("tr")[2:10]:
        name = tr("td")[0].text.strip()
        item = {
            "color_image_urls": parse_image_URLs(tr("td")[1]),
            "months": {
                "northern": parse_months([tr("td")[i] for i in range(2, 14)]),
                "southern": parse_months([tr("td")[i] for i in list(range(8, 14)) + list(range(2, 8))])
            }
        }
        items[name] = item
    dump_data(items, "flower/availability")

    # genetics_rose
    items = []
    for tr in tables[5]("tr")[2:29]:
        item = {
            "genotype": {
                "red": int(tr("td")[0].text.strip()),
                "yellow": int(tr("td")[1].text.strip()),
                "white": int(tr("td")[2].text.strip())
            },
            "phenotypes_image_url": parse_rose_image_URLs(
                [tr("td")[3].img, tr("td")[4].img, tr("td")[5].img])
        }
        items.append(item)
    dump_data(items, "flower/genetics_rose")

    # genetics_others
    items = []
    species_columns = ["tulips", "pansies", "cosmos", "lilies",
                       "hyacinths", "windflowers", "mums"]
    for tr in tables[6]("tr")[3:30]:
        item = {
            "genotype": {
                "red": int(tr("td")[0].text.strip()),
                "yellow": int(tr("td")[1].text.strip()),
                "white": int(tr("td")[2].text.strip())
            },
            # columns 3-9 hold one phenotype image per species, in the order above
            "phenotypes_image_url": {
                species: tr("td")[3 + idx].img.get("data-src").replace("/scale-to-width-down/50", "")
                for idx, species in enumerate(species_columns)
            }
        }
        items.append(item)
    dump_data(items, "flower/genetics_others")

    # hybridization_simple
    items = {}
    for table_number in range(7, 15):
        temp = []
        species = tables[table_number]("tr")[0].text.strip()
        for tr in tables[table_number]("tr")[2:8]:
            if len(tr("abbr")) > 0:  # some trs do not contain hybridization data
                item = {
                    "parent_a": {
                        "gene": tr("abbr")[0].get("title"),
                        "image_url": tr("abbr")[0]("img")[0].get("data-src").replace("/scale-to-width-down/50", "")
                    },
                    "parent_b": {
                        "gene": tr("abbr")[1].get("title"),
                        "image_url": tr("abbr")[1]("img")[0].get("data-src").replace("/scale-to-width-down/50", "")
                    },
                    "children": parse_hybridization_children(tr("td")[2])
                }
                temp.append(item)
        items[species] = temp
    dump_data(items, "flower/hybridization_simple")

    # hybridization_advanced
    items = {}
    for table_number in range(15, 17):
        temp = []
        species = tables[table_number]("tr")[0].text.strip()
        for tr in tables[table_number]("tr")[2:6]:
            if len(tr("abbr")) > 0:
                item = {
                    "parent_a": {
                        "gene": tr("abbr")[0].get("title"),
                        "image_url": tr("abbr")[0]("img")[0].get("data-src").replace("/scale-to-width-down/50", "")
                    },
                    "parent_b": {
                        "gene": tr("abbr")[1].get("title"),
                        "image_url": tr("abbr")[1]("img")[0].get("data-src").replace("/scale-to-width-down/50", "")
                    },
                    "children": parse_hybridization_children(tr("td")[2])
                }
                temp.append(item)
        items[species] = temp
    dump_data(items, "flower/hybridization_advanced")