def bayes_select_feature(model_name, word_matrix, labels_list, file_name):
    if model_name == "Gaussian":
        model = naive_bayes_Gaussian
    elif model_name == "Multinomial":
        model = naive_bayes_Multinomial
    else:
        raise ValueError("Unknown model name: " + model_name)
    score_dict = {}
    for num in trange(300, 900):
        # Select features from the word-frequency matrix with a chi-squared test
        new_feature = tools.select_feature(word_matrix.toarray(), labels_list,
                                           num)
        # Naive Bayes classification
        score = model(new_feature, labels_list)
        # Record the classification accuracy for this number of selected features
        score_dict[str(num) + " features"] = score
        if score > 0.9:
            break
    max_item = max(score_dict.items(), key=lambda x: x[1])
    print("The best is ", max_item)
    score_dict["Best"] = [max_item[0], max_item[1]]
    tools.write_json(
        "E:/Program Files/workspace/report_sheng/" + file_name + ".json",
        score_dict)
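# `tools.select_feature` is not defined in this snippet. A minimal sketch of
# what it might look like, assuming it wraps sklearn's SelectKBest with the
# chi2 score function (the name and signature follow the call sites above):
from sklearn.feature_selection import SelectKBest, chi2

def select_feature(feature_matrix, labels, k):
    # Keep the k features with the highest chi-squared scores against the labels.
    selector = SelectKBest(chi2, k=k)
    return selector.fit_transform(feature_matrix, labels)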
def token_dict2json(path, token_dict_path, label2i_path):
    """Build the word-to-index dictionary for Chinese tokens and the
    label-to-index dictionary, and write both to JSON."""
    train_data = pd.read_csv(path)
    labels = train_data['y']
    label2i = {str(i): index for index, i in enumerate(set(labels))}
    i2label = {str(value): key for key, value in label2i.items()}
    print(i2label)
    write_json(label2i_path, label2i)
    train_data_x = [jieba.lcut(i) for i in train_data['x']]
    char_set = set(word for sen in train_data_x for word in sen)
    char_dic = {str(j): i + 1 for i, j in enumerate(char_set)}
    char_dic["unk"] = 0
    print(len(char_dic))
    max_features = len(char_dic)
    new_sequential = [[char_dic.get(word) for word in sen]
                      for sen in train_data_x]
    ngram_range = 2
    print('Adding {}-gram features'.format(ngram_range))
    ngram_set = set()
    for input_list in new_sequential:
        set_of_ngram = create_ngram_set(input_list, ngram_value=ngram_range)
        ngram_set.update(set_of_ngram)
    start_index = max_features
    token_indice = {str(v): k + start_index for k, v in enumerate(ngram_set)}
    token_dict = {**token_indice, **char_dic}
    write_json(token_dict_path, token_dict)
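# `create_ngram_set` is not shown above; this is the usual implementation from
# the Keras fastText example, which the call site appears to follow:
def create_ngram_set(input_list, ngram_value=2):
    # Extract the set of n-gram tuples from a list of token indices,
    # e.g. [1, 4, 9, 4] with ngram_value=2 -> {(1, 4), (4, 9), (9, 4)}.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))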
def get_skeleton(kps, json_file_name, aux_i, aux_j):
    '''Get the skeleton closest to (aux_i, aux_j) if they are not -1;
    if they are -1, get the most confident values.'''
    kps = np.array(kps)
    possibles_i = []
    possibles_j = []
    lista = list(range(len(kps)))
    for i in range(len(kps)):
        # Mean x and y over the keypoints above the confidence threshold
        n = np.count_nonzero(kps[i][:, 0][kps[i][:, 0] > 0.1])
        possibles_i.append(np.sum(kps[i][:, 0][kps[i][:, 0] > 0.1]) / n)
        n = np.count_nonzero(kps[i][:, 1][kps[i][:, 1] > 0.1])
        possibles_j.append(np.sum(kps[i][:, 1][kps[i][:, 1] > 0.1]) / n)
    possibles_i = np.array(possibles_i)
    possibles_j = np.array(possibles_j)
    idx, distance = find_nearest(possibles_i, possibles_j, aux_i, aux_j)
    n = np.count_nonzero(kps[idx][:, 0][kps[idx][:, 0] > 0.1])
    aux_i = np.sum(kps[idx][:, 0][kps[idx][:, 0] > 0.1]) / n
    n = np.count_nonzero(kps[idx][:, 1][kps[idx][:, 1] > 0.1])
    aux_j = np.sum(kps[idx][:, 1][kps[idx][:, 1] > 0.1]) / n
    # Move the selected skeleton to the front of the index list
    lista[0], lista[idx] = lista[idx], lista[0]
    write_json(kps, json_file_name, lista)
    return aux_i, aux_j, idx
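# `find_nearest` is not included in this snippet. A plausible sketch, assuming
# it returns the index of the skeleton whose mean (x, y) is closest to the
# previous position, plus that distance (the -1 fallback is an assumption):
def find_nearest(possibles_i, possibles_j, aux_i, aux_j):
    if aux_i == -1 or aux_j == -1:
        return 0, 0.0
    distances = np.sqrt((possibles_i - aux_i) ** 2 + (possibles_j - aux_j) ** 2)
    idx = int(np.argmin(distances))
    return idx, float(distances[idx])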
def Multinomial_select_alpha(word_matrix, labels_list, feature_num):
    new_feature = tools.select_feature(word_matrix.toarray(), labels_list,
                                       feature_num)
    # Randomly split the samples into a training set and a test set
    train_data, train_label, test_data, test_label = tools.part_features(
        new_feature, labels_list)
    score_dict = {}
    for al in np.arange(0.0, 0.12, 0.0001):
        mnb = MultinomialNB(alpha=al, fit_prior=True)
        mnb.fit(train_data, train_label)
        print('Training score : %.2f' % mnb.score(train_data, train_label))
        test_score = mnb.score(test_data, test_label)
        print('Testing score : %.2f' % test_score)
        score_dict["alpha = " + str(al)] = test_score
    max_item = max(score_dict.items(), key=lambda x: x[1])
    print("The best is ", max_item)
    score_dict["Best"] = [max_item[0], max_item[1]]
    tools.write_json(
        "E:/Program Files/workspace/report_sheng/Multinomial_alpha_score_dict.json",
        score_dict)
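# `tools.part_features` is not shown here. A minimal sketch, assuming it wraps
# sklearn's train_test_split and returns the four arrays in the order the
# caller unpacks them (the test_size default is an assumption):
from sklearn.model_selection import train_test_split

def part_features(features, labels, test_size=0.25):
    train_data, test_data, train_label, test_label = train_test_split(
        features, labels, test_size=test_size)
    return train_data, train_label, test_data, test_label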
def writing_data_and_price_json_csv():
    cars_data = []
    cars_price = []
    for i in range(len(page_saving.brands)):
        for j in range(20):
            content = tools.file_content(
                f'captured_data/{page_saving.brands[i]}/{page_saving.brands[i]}_page_{j+1}.html'
            )
            # Append the car data captured by each match
            for match_data in re.finditer(pattern_data, content):
                cars_data.append(match_data.groupdict())
            # Append the car prices captured by each match
            for match_price in re.finditer(pattern_price, content):
                cars_price.append(match_price.groupdict())
    tools.write_csv(cars_price, ['price'], 'cars_price.csv')
    tools.write_json(cars_price, 'cars_price.json')
    tools.write_json(cars_data, 'cars_data.json')
    tools.write_csv(cars_data, [
        'brand', 'model', 'first_registration', 'kilometers', 'engine',
        'transmission'
    ], 'cars_data.csv')
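# `pattern_data` and `pattern_price` are defined elsewhere. A hypothetical
# example of named-group patterns whose groupdict() keys match the CSV field
# lists above; the HTML markup they match against is an assumption:
import re

pattern_price = re.compile(r'<span class="price">(?P<price>[\d.,]+)</span>')
pattern_data = re.compile(
    r'(?P<brand>\w+)\s+(?P<model>\w+).*?'
    r'(?P<first_registration>\d{2}/\d{4}),\s*(?P<kilometers>[\d.,]+)\s*km,\s*'
    r'(?P<engine>[^,]+),\s*(?P<transmission>\w+)', re.DOTALL)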
def get_dic(path, t2i_path, l2i_path):
    sentences = read_jsonline(path)
    words = set(word for i in sentences for word in i['word'])
    token_dic = {str(v): i + 1 for i, v in enumerate(words)}
    token_dic["unk"] = 0
    labels = set(word for i in sentences for word in i['tag'])
    label2id = {str(v): i for i, v in enumerate(labels)}
    write_json(t2i_path, token_dic)
    write_json(l2i_path, label2id)
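# `read_jsonline` is assumed to read one JSON object per line, as in the usual
# JSON Lines format; a minimal sketch:
import json

def read_jsonline(path):
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]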
def extrator(site, id):
    mileage = ""
    price = ""
    exterior_color = ""
    transmission = ""
    title = ""
    fuel = ""
    page = requests.get(site)
    print(page.status_code)
    if (page.status_code == 200):
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.title.get_text().split('|')[0]
            except:
                print("Title not extracted")
            try:
                price = soup.find(class_="finance-info-price").get_text()
            except:
                print("Price not extracted")
            try:
                tabela = soup.find(class_="about-vehicle-wrapper")
                for item in tabela.descendants:
                    try:
                        if (item.get_text().split(":")[0] == "Gearbox"):
                            transmission = item.get_text().split(
                                ":")[-1].strip()
                        if (item.get_text().split(":")[0] == "Colour"):
                            exterior_color = item.get_text().split(
                                ":")[-1].strip()
                        if (item.get_text().split(":")[0] == "Fuel"):
                            fuel = item.get_text().split(":")[-1].strip()
                        if (item.get_text().split(":")[0] == "Mileage"):
                            mileage = item.get_text().split(":")[-1].strip()
                    except:
                        pass
            except:
                print("Table with transmission, exterior color, fuel and "
                      "mileage not found")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
def test_update_json1(self):
    data1 = {'a': 1, 'b': 2}
    data2 = {'c': 3, 'd': 4}
    data3 = {'a': 1, 'c': 3, 'b': 2, 'd': 4}
    output_file = os.path.join(fixture_dir,
                               'foo_{0}.json'.format(t.timestamp()))
    t.write_json(object=data1, output_file=output_file)
    t.update_json(data=data2, input_file=output_file)
    data4 = t.load_json(input_file=output_file)
    self.assertTrue(
        data3 == data4,
        'Data read from JSON file does not match expected output')
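# `t.update_json` is exercised but not shown. A minimal sketch of the behavior
# the test expects (load the file, merge in the new keys, write it back); the
# keyword names mirror the test's call sites:
def update_json(data, input_file):
    existing = load_json(input_file=input_file)
    existing.update(data)
    write_json(object=existing, output_file=input_file)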
def save_latest(self, directory, model_and_loss, stats_dict,
                store_as_best=False):
    # -----------------------------------------------------------------------------------------
    # Make sure directory exists
    # -----------------------------------------------------------------------------------------
    tools.ensure_dir(directory)

    # Check the previous latest file version to decide which slot to write
    latest_statistics_filename = os.path.join(
        directory, self._prefix + self._latest_postfix + ".json")
    if os.path.isfile(latest_statistics_filename):
        statistics = tools.read_json(latest_statistics_filename)
        shadow_is_latest = statistics['shadow']
    else:
        shadow_is_latest = True
    stats_dict['shadow'] = not shadow_is_latest

    # -----------------------------------------------------------------------------------------
    # Save
    # -----------------------------------------------------------------------------------------
    save_dict = dict(stats_dict)
    save_dict[self._model_key] = model_and_loss.state_dict()
    if shadow_is_latest:
        latest_checkpoint_filename = os.path.join(
            directory, self._prefix + self._latest_postfix + self._extension)
    else:
        latest_checkpoint_filename = os.path.join(
            directory,
            self._prefix + self._latest_postfix + '_shadow' + self._extension)
    torch.save(save_dict, latest_checkpoint_filename)
    tools.write_json(data_dict=stats_dict,
                     filename=latest_statistics_filename)

    # -----------------------------------------------------------------------------------------
    # Possibly store as best
    # -----------------------------------------------------------------------------------------
    if store_as_best:
        best_checkpoint_filename = os.path.join(
            directory, self._prefix + self._best_postfix + self._extension)
        best_statistics_filename = os.path.join(
            directory, self._prefix + self._best_postfix + ".json")
        logging.info("Saved checkpoint as best model..")
        shutil.copyfile(latest_checkpoint_filename, best_checkpoint_filename)
        shutil.copyfile(latest_statistics_filename, best_statistics_filename)
def save_latest(self, directory, model_and_loss, stats_dict,
                store_as_best=False):
    # -----------------------------------------------------------------------------------------
    # Make sure directory exists
    # -----------------------------------------------------------------------------------------
    tools.ensure_dir(directory)

    # -----------------------------------------------------------------------------------------
    # Save
    # -----------------------------------------------------------------------------------------
    save_dict = dict(stats_dict)
    save_dict[self._model_key] = model_and_loss.state_dict()
    latest_checkpoint_filename = os.path.join(
        directory, self._prefix + self._latest_postfix + self._extension)
    latest_statistics_filename = os.path.join(
        directory, self._prefix + self._latest_postfix + ".json")
    torch.save(save_dict, latest_checkpoint_filename)
    tools.write_json(data_dict=stats_dict,
                     filename=latest_statistics_filename)

    # -----------------------------------------------------------------------------------------
    # Possibly store as best
    # -----------------------------------------------------------------------------------------
    if store_as_best:
        best_checkpoint_filename = os.path.join(
            directory, self._prefix + self._best_postfix + self._extension)
        best_statistics_filename = os.path.join(
            directory, self._prefix + self._best_postfix + ".json")
        logging.info("Saved checkpoint as best model..")
        shutil.copyfile(latest_checkpoint_filename, best_checkpoint_filename)
        shutil.copyfile(latest_statistics_filename, best_statistics_filename)
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        mileage = ""
        price = ""
        exterior_color = ""
        interior_color = ""
        transmission = ""
        engine = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.title.get_text()
            except:
                print("Title not extracted")
            try:
                tabela = soup.find(class_="ucc-table ucc-table--summary")
                for child in tabela:
                    try:
                        if (child.get_text().split("$")[0].strip().replace(
                                '\n', '') == "Price"):
                            price = child.get_text().split(
                                "Price")[-1].strip().replace('\n', '')
                        if (child.get_text().split(" Color")
                            [0].strip().replace('\n', '') == "Exterior"):
                            exterior_color = child.get_text().split(
                                "Color")[-1].strip().replace('\n', '')
                        if (child.get_text().split(" Color")
                            [0].strip().replace('\n', '') == "Interior"):
                            interior_color = child.get_text().split(
                                "Color")[-1].strip().replace('\n', '')
                        if (child.get_text().split("gine")[0].strip().replace(
                                '\n', '') == "En"):
                            engine = child.get_text().split(
                                "Engine")[-1].strip().replace('\n', '')
                        if (child.get_text().split("mission")
                            [0].strip().replace('\n', '') == "Trans"):
                            transmission = child.get_text().split(
                                "Transmission")[-1].strip().replace('\n', '')
                        if (child.get_text().split("age")[0].strip().replace(
                                '\n', '') == "Mile"):
                            mileage = child.get_text().split(
                                "Mileage")[-1].strip().replace('\n', '')
                    except:
                        pass
            except:
                print("Table not extracted")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
"""
Generates the playlist of the soundtrack (file names and titles displayed in
the game). Phonon sometimes cannot read metadata under Windows, see:
http://stackoverflow.com/questions/23288557/phonon-cant-get-meta-data-of-audio-files-in-python
"""

import os

os.chdir('..')

# create the playlist, a list of (filename, title)
playlist = [('01 Imperialism Theme.ogg', 'Imperialism Theme'),
            ('02 Silent Ashes.ogg', 'Silent Ashes')]

# write
import constants as c, tools as t

print('write to {}'.format(c.Soundtrack_Playlist))
t.write_json(c.Soundtrack_Playlist, playlist)
simi_path = "E:/Program Files/workspace/report_sheng/report_similarity/"
features, labels, words, feature_names = tools.get_feature(simi_path)
labels_list = []
for i in range(len(labels)):
    labels_list.append(lable_dict[labels[i]])
word_matrix, word_names = tools.get_count_vect(words)
score_dict = {}
for num in trange(10, 1137):
    # Select features from the word-frequency matrix with a chi-squared test
    new_feature = tools.select_feature(word_matrix.toarray(), labels_list, num)
    for neighbor in range(3, 32):
        # K-nearest-neighbor classification
        score = KNeighbor(new_feature, labels_list, neighbor)
        # Record the accuracy for this feature count and neighbor count
        score_dict[str(num) + " features and " + str(neighbor) +
                   " neighbors"] = score
max_item = max(score_dict.items(), key=lambda x: x[1])
print("The best is ", max_item)
score_dict["Best"] = [max_item[0], max_item[1]]
tools.write_json(
    "E:/Program Files/workspace/report_sheng/KNeighbor_score_dict.json",
    score_dict)
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        fuel = ""
        mileage = ""
        price = ""
        exterior_color = ""
        transmission = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                price = soup.find(class_="currentPrice-441857624").get_text()
            except:
                print("Price not extracted")
            try:
                title = soup.title.get_text().split("|")[0]
            except:
                print("Title not extracted")
            try:
                tabela = soup.find_all(class_="itemAttribute-983037059")
                for item in tabela:
                    if (item.get_text().count("Fuel") > 0):
                        fuel = item.get_text().split(
                            "Type")[-1].strip().replace('\n', '')
                    elif (item.get_text().count("Transmission") > 0):
                        transmission = item.get_text().split(
                            "Transmission")[-1].strip().replace('\n', '')
                    elif (item.get_text().count("Colour") > 0):
                        exterior_color = item.get_text().split(
                            "Colour")[-1].strip().replace('\n', '')
                    elif (item.get_text().count("Kilometers") > 0):
                        mileage = item.get_text().split(
                            "Kilometers")[-1].strip().replace('\n', '')
            except:
                print("Table not extracted")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        fuel = ""
        mileage = ""
        price = ""
        exterior_color = ""
        interior_color = ""
        transmission = ""
        engine = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.title.get_text().split('|')[0]
            except:
                print("Title not extracted")
            try:
                price = soup.find(class_="price").get_text().strip().replace(
                    '\n', '')
            except:
                print("Price not extracted")
            try:
                tabela_class = soup.find_all(
                    class_="details-list")[0].get_text()
                tabela = tabela_class.split("\n")
                tabela = filter(None, tabela)
                for item in tabela:
                    if (item.split()[0] == "Fuel"):
                        fuel = item.split(":")[-1].strip().replace('\n', '')
                    elif (item.split()[0] == "Exterior"):
                        exterior_color = item.split(":")[-1].strip().replace(
                            '\n', '')
                    elif (item.split()[0] == "Interior"):
                        interior_color = item.split(":")[-1].strip().replace(
                            '\n', '')
                    elif (item.split()[0] == "Engine:"):
                        engine = item.split(":")[-1].strip().replace('\n', '')
                    elif (item.split()[0] == "Transmission:"):
                        transmission = item.split(":")[-1].strip().replace(
                            '\n', '')
                    elif (item.split()[0] == "Mileage:"):
                        mileage = item.split(":")[-1].strip().replace('\n', '')
            except:
                print("Table not extracted")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        mileage = ""
        price = ""
        exterior_color = ""
        interior_color = ""
        transmission = ""
        engine = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.find_all(
                    class_="inventory-main-line")[0].get_text().strip(
                    ).replace('\n', '')
            except:
                print("Title not extracted")
            try:
                tabela = soup.find(id="vehicle-info-wrapper")
                for child in tabela.descendants:
                    try:
                        if (child.get_text().split()[0] == "Exterior:"
                                and child.get_text().split(":")[-1] != " "):
                            exterior_color = child.get_text().split(
                                ":")[-1].strip()
                        if (child.get_text().split()[0] == "Interior:"
                                and child.get_text().split(":")[-1] != " "):
                            interior_color = child.get_text().split(
                                ":")[-1].strip()
                        if (child.get_text().split()[0] == "Engine:"
                                and child.get_text().split(":")[-1] != " "):
                            engine = child.get_text().split(":")[-1].strip()
                        if (child.get_text().split()[0] == "Transmission:"
                                and child.get_text().split(":")[-1] != " "):
                            transmission = child.get_text().split(
                                ":")[-1].strip()
                        if (child.get_text().split()[0] == "Odometer:"
                                and child.get_text().split(":")[-1] != " "):
                            mileage = child.get_text().split(":")[-1].strip()
                    except AttributeError:
                        pass
            except:
                print("Table not extracted")
            try:
                price_tree = soup.find(class_="inventory-price")
                for child in price_tree.descendants:
                    try:
                        if (child.get_text().split()[0] == "Price:"
                                and child.get_text().split(":")[-1] != " "):
                            price = child.get_text().split(":")[-1].strip()
                    except:
                        pass
            except:
                print("Price not extracted")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        fuel = ""
        mileage = ""
        price = ""
        exterior_color = ""
        interior_color = ""
        transmission = ""
        engine = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.title.get_text().split("|")[0]
            except:
                print("Title not extracted")
            try:
                price_class = soup.find_all(
                    class_=
                    "vehicle-info__price-display vehicle-info__price-display--dealer cui-heading-2"
                )
                price = price_class[0].get_text()
            except:
                print("Price not extracted")
            try:
                tabela = soup.find_all(class_="vdp-details-basics__item")
                for item in tabela:
                    if (item.get_text().split()[0] == "Fuel"):
                        fuel = item.get_text().split(":")[-1]
                        fuel = fuel.strip().replace('\n', '')
                    elif (item.get_text().split()[0] == "Exterior"):
                        exterior_color = item.get_text().split(":")[-1]
                        exterior_color = exterior_color.strip().replace(
                            '\n', '')
                    elif (item.get_text().split()[0] == "Interior"):
                        interior_color = item.get_text().split(":")[-1]
                        interior_color = interior_color.strip().replace(
                            '\n', '')
                    elif (item.get_text().split()[0] == "Engine:"):
                        engine = item.get_text().split(":")[-1]
                        engine = engine.strip().replace('\n', '')
                    elif (item.get_text().split()[0] == "Transmission:"):
                        transmission = item.get_text().split(":")[-1]
                        transmission = transmission.strip().replace('\n', '')
                    elif (item.get_text().split()[0] == "Mileage:"):
                        mileage = item.get_text().split(":")[-1]
                        mileage = mileage.strip().replace('\n', '')
            except:
                print("Table not extracted")
        except:
            print("Page not downloaded")
        data = {
            'Title': title,
            'Price': price,
            'Exterior Color': exterior_color,
            'Mileage': mileage,
            'Transmission': transmission
        }
        tools.write_json("extract", id, data)
    else:
        print("Page request error")
def extrator(url, id):
    page = requests.get(url)
    print(page.status_code)
    if (page.status_code == 200):
        mileage = ""
        price = ""
        exterior_color = ""
        transmission = ""
        engine = ""
        title = ""
        try:
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                title = soup.title.get_text()
            except:
                print("Title not extracted")
            try:
                tabela = soup.find_all(class_="ar_vehspec")
                for item in tabela:
                    if (item.get_text().split(" :")[0] == "Exterior"):
                        exterior_color = item.get_text().split(
                            " :")[-1].strip().replace('\n', '')
                    elif (item.get_text().split(" :")[0].strip() ==
                          "Sale Price"):
                        price = item.get_text().split(
                            " :")[-1].strip().replace('\n', '')
                    elif (item.get_text().split(" :")[0].strip() ==
                          "Transmission"):
                        transmission = item.get_text().split(
                            " :")[-1].strip().replace('\n', '')
                    elif (item.get_text().split(" :")[0].strip() == "Engine"):
                        engine = item.get_text().split(
                            " :")[-1].strip().replace('\n', '')
                    elif (item.get_text().split(" :")[0].strip() == "Mileage"):
                        mileage = item.get_text().split(
                            " :")[-1].strip().replace('\n', '')
            except:
                print("Table not extracted")
            data = {
                'Title': title,
                'Price': price,
                'Exterior Color': exterior_color,
                'Mileage': mileage,
                'Transmission': transmission
            }
            tools.write_json("extract", id, data)
        except:
            print("Page not downloaded")
    else:
        print("Page request error")
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
"""
Generates the default options.
"""

import os

os.chdir('..')

import constants as c

# options are stored as a dictionary
options = {
    c.O_VERSION: 'v0.2.0 (2014-xx-xx)',  # to be displayed on the start screen
    c.O_OPTIONS_VERSION: 1,  # version of the options
    # we start full screen (can be unset by the program for some Linux desktop
    # environments)
    c.OG_MW_FULLSCREEN: True,
    c.OG_FULLSCREEN_SUPPORTED: True,  # is full screen supported
    c.OM_PHONON_SUPPORTED: True,
    c.OM_BG_MUTE: False
}

# save
import tools as t

print('write to {}'.format(c.Options_Default_File))
t.write_json(c.Options_Default_File, options)
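# `tools.write_json` in the two generator scripts above is called as
# write_json(path, data). A minimal sketch, assuming it simply serializes the
# object with the json module (the project's real helper may differ):
import json

def write_json(path, data):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)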