def main():
    raw_list = csv_to_list(csv_file)[:100]
    total_len = len(raw_list)
    counter = 0
    result_dict = dict()
    print "Commencing Web Scraping..."
    start_time = time.time()
    for raw_link in raw_list:
        try:
            raw_link = raw_link[0]
            whois_link = "http://www.whois.com/whois/" + raw_link
            ipaddress_link = "http://" + raw_link + ".ipaddress.com/"
            whois_soup = link_to_lxmlsoup(whois_link)
            ipaddress_soup = link_to_lxmlsoup(ipaddress_link)
            result_dict.setdefault('Raw Link', []).append(str(raw_link))
            result_dict = whois_parser(whois_soup, result_dict)
            result_dict = ipaddress_parser(ipaddress_soup, result_dict)
            counter, total_len = print_counter(counter, total_len)
            if counter % 400 == 0:
                print "Commencing 30 Second Sleep after 400 iterations"
                time.sleep(30)
            time_elapsed = time.time() - start_time
            print_progress(time_elapsed, counter, total_len)
        except:
            dict_to_json(result_dict, 'output.json')
            dict_to_csv(result_dict, 'output.csv')
            print "Unexpected Error", sys.exc_info()[0]
            raise
    dict_to_json(result_dict, 'output.json')
    dict_to_csv(result_dict, 'output.csv')
def search_all():
    """ Get all books in all categories of the book.toscrape site """
    list_category = get_categories()
    for category in list_category:
        result = search_products_by_category(category["url"], category["label"])
        dict_to_csv(result, category["label"])
def write_to_csv_file_for_DataTransfer(inst, dics):
    """
    Writes/Overwrites CSV files with data supplied in dictionaries

    Note: Dictionary keys will be changed to work with DataTransfer extension

    Args:
        inst: Instance of the class (Legislator, Committee..) to indicate which
            file to write to and what's the template name
        dics: Dictionaries which will be written into a CSV file
    """
    modified_dics = [
        modify_dict_for_DataTransfer(dic, inst.template_name) for dic in dics
    ]
    utils.dict_to_csv(modified_dics, inst.file_path)
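# A hypothetical usage sketch (not from the original repo): any object exposing
# the `template_name` and `file_path` attributes read above can stand in for the
# Legislator/Committee instance; the names below are made up for illustration.
from collections import namedtuple

FakeInst = namedtuple("FakeInst", ["template_name", "file_path"])

legislators = FakeInst(template_name="Legislator",
                       file_path="output/legislators.csv")
rows = [
    {"name": "Jane Doe", "district": "12"},
    {"name": "John Roe", "district": "7"},
]
write_to_csv_file_for_DataTransfer(legislators, rows)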
def record(self, fold):
    # save plots
    save_plot(self.val_record, 'loss', self.args.n_eval, 'tmp/val_loss.png')
    save_plot(self.val_record, 'f1', self.args.n_eval, 'tmp/val_f1.png')
    save_plot(self.norm_record, 'grad_norm', self.args.n_eval, 'tmp/grad_norm.png')
    if self.args.test:
        save_plots([self.val_record, self.test_record], ['loss', 'f1'],
                   ['val', 'test'], self.args.n_eval)

    # create subdir for this experiment
    os.makedirs(self.record_dir, exist_ok=True)
    subdir = os.path.join(self.models_dir, str_date_time())
    if self.args.mode == 'test':
        subdir += '_test'
    os.mkdir(subdir)

    # write model params and results to csv
    csvlog = os.path.join(subdir, 'info.csv')
    param_dict = {}
    for arg in vars(self.args):
        param_dict[arg] = str(getattr(self.args, arg))
    info = torch.load(self.best_info_path)
    hash = get_hash() if self.args.machine == 'dt' else 'no_hash'
    passed_args = ' '.join(sys.argv[1:])
    param_dict = {
        'hash': hash,
        'subdir': subdir,
        **param_dict,
        **info,
        'args': passed_args
    }
    dict_to_csv(param_dict, csvlog, 'w', 'index', reverse=False)
    header = True if fold == 0 else False
    dict_to_csv(param_dict, self.record_path, 'a', 'columns',
                reverse=True, header=header)

    # copy all records to subdir
    png_files = ['val_loss.png', 'val_f1.png'] if not self.args.test else ['loss.png', 'f1.png']
    csv_files = [
        'val_probs*.csv', 'train_steps.csv', 'submission.csv', 'test_probs.csv'
    ]
    copy_files([*png_files, 'models/*.info', *csv_files], 'tmp', subdir)
    return subdir
def save_step(self, step_info, message=False):
    if self.new:
        header = True
        mode = 'w'
        self.new = False
    else:
        header = False
        mode = 'a'
    dict_to_csv(step_info, 'tmp/train_steps.csv', mode, orient='columns', header=header)
    if message:
        print('epoch {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} - f1 {:.4f}'
              .format(*list(step_info.values())))
def get_player_data():
    player_filename = 'game_summary_by_player.csv'
    data_dir = os.getcwd() + '/data/external'
    player_path = os.path.join(data_dir, player_filename)
    output_path = os.path.join(data_dir, 'player_metadata.csv')

    player_game_df = pd.read_csv(player_path)
    players_df = player_game_df[["player_link", "player_name"]].drop_duplicates()

    # resume from an existing metadata file if one has already been written
    if os.path.isfile(output_path):
        addtl_rows_flg = True
        player_metadata_df = pd.read_csv(output_path)
        player_metadata_df = player_metadata_df.drop_duplicates()
    else:
        addtl_rows_flg = False
        player_metadata_df = pd.DataFrame()

    if addtl_rows_flg:
        existing_players = list(player_metadata_df["player_link"].unique())
        players_df = players_df[~players_df["player_link"].isin(existing_players)]
        print("number of players already scraped: {} / length of remaining df: {}"
              .format(len(existing_players), len(players_df)))

    output_rows = []
    for idx, row in players_df.iterrows():
        name = row['player_name']
        link = row['player_link']
        print("{}: {} - {}".format(idx, name, link))
        player = Player(name, link)
        # print("{}: {} - {}".format(idx, player.name, player.full_player_url))
        player_dict = player.get_player_data()
        dict_to_csv(player_dict, data_dir, 'player_metadata')
        output_rows.append(player_dict)

    # output_df = pd.DataFrame(output_rows)
    # output_path = os.path.join(data_dir, 'player_metadata.csv')
    # output_df = output_df[["player_link", "player_name", "position", "draft_pick", "height", "weight", "height", "birthdate", "forty_yd", "combine_bench", "combine_broad_jump", "combine_cone", "combine_shuttle", "combine_vert", "combine_year"]]
    # output_df.to_csv(output_path, index=False)
    return
def record(self, max_f1, tresh, method):
    ens_info = format_info({'max_f1': max_f1, 'tresh': tresh})
    ens_info = {'method': method, **ens_info}
    model_infos = []  # partial model descriptions
    # copy partial models descriptions
    info_paths = [os.path.join(pp, 'info.csv') for pp in self.pred_dirs]
    for ip in info_paths:
        info = self.read_model_info(ip)
        model_infos.append(info)
    model_infos = [o for l in model_infos for o in l]
    with open(self.ens_record_path, 'a') as f:
        writer = csv.writer(f)
        writer.writerows(model_infos)
    dict_to_csv(ens_info, self.ens_record_path, 'a', 'columns',
                reverse=False, header=True)
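# A minimal sketch of the dict_to_csv helper assumed by the record/save_step
# snippets above (signature and pandas backing are assumptions, not the repos'
# actual implementation): orient='columns' writes the dict as one row with keys
# as column names, orient='index' writes one key/value pair per row, `reverse`
# flips the key order, and `mode`/`header` are passed through to pandas.
import pandas as pd

def dict_to_csv(d, path, mode='w', orient='columns', reverse=False, header=True):
    keys = list(d.keys())
    if reverse:
        keys = keys[::-1]
    ordered = {k: d[k] for k in keys}
    if orient == 'columns':
        df = pd.DataFrame([ordered])  # single row, keys become columns
        df.to_csv(path, mode=mode, header=header, index=False)
    else:  # orient == 'index'
        s = pd.Series(ordered)  # keys become the index, one value per row
        s.to_csv(path, mode=mode, header=False)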
# copy jpgs from jpg folder to xmls_classify folder ###############################################################
print(colorama.Fore.GREEN + "[INFO] copy jpgs from jpg folder to xmls_classify folder" + colorama.Fore.WHITE)
xmls = scan_files(classify_xml_path, postfix=".xml")
for xml in xmls:
    jpg = os.path.join(image_path, os.path.basename(os.path.splitext(xml)[0]) + ".jpg")
    shutil.copy(jpg, classify_xml_path)

# write auto_labeling info into csv ###############################################################################
print(colorama.Fore.GREEN + "[INFO] write auto_labeling info into csv" + colorama.Fore.WHITE)
csv_file_s = os.path.join(output_tif_608s, tif_name + "_s.csv")
dict_to_csv(dict_pic_info, csv_file_s)
csv_file_c = os.path.join(output_tif_608s, tif_name + "_c.csv")
predictions_to_csv(dict_pic_info_all, classes_list, classes_all, csv_file_c)

# generate confusion matrix #######################################################################################
print(colorama.Fore.GREEN + "[INFO] generate confusion matrix" + colorama.Fore.WHITE)
matrix = confusion_matrix(classes_all, cell_numpy_index, predictions)
xlsx = os.path.join(output_tif_608s, tif_name + ".xlsx")
generate_xlsx(classes_all, matrix, xlsx)

# generate asap_xml from labelimg_xmls
print(colorama.Fore.GREEN + "[INFO] generate asap xml from labelimg xmls" + colorama.Fore.WHITE)
xml_asap_segment = os.path.join(output_tif_608s, tif_name + "_segment.xml")
gen_asap_xml(xml_asap_segment, segment_xml_path)
    # if u"*" in text_lines or u"\u2217" in text_lines:
    asterisk_idx = [True if l[0] in asterisks else False for l in first_lines]
    if any(asterisk_idx):
        info_location = treat_asteriks(first_lines, asterisk_idx)
        # print info_location
        # print
        # print text_lines
        # print "***"*80
        # with_stars+=1
    else:
        # print "---"*80
        info_location = treat_no_asterisks(first_lines, authors)
    locations_dict[page_idx].extend(info_location)
    dict_to_csv("../../input/location_1pageRB.csv", locations_dict,
                columns=["id", "location"])
    # print "From {0} 1page pdf, {1} have stars in first lines".format(len(one_pages), with_stars)
    return locations_dict


def clean_list(noisy_list):
    noise = [u"@"]
    noisy_list = [n.strip().lstrip().lower() for n in noisy_list if n]
    noisy_elements = np.array([False if l in n else True
                               for l in noise for n in noisy_list],
                              dtype=np.bool)
    noisy_list = np.array(noisy_list)
    return noisy_list[noisy_elements]


def similarity_locations(locations_dict):
    list_categories = get_categories()
    for index, category in enumerate(list_categories):
        print(f"[{index}] {category['label']}")
    choice = input("Choose the number of the category you want to search: ")
    try:
        choice = int(choice)
        categorie_choice = list_categories[choice]
        with Loader(desc=f"Web scraping of category {categorie_choice['label']} in progress ... "):
            result = search_products_by_category(categorie_choice['url'],
                                                 categorie_choice['label'])
            dict_to_csv(result, f"{categorie_choice['label']}_books")
            if settings.zip_option:
                zip_files(f"{categorie_choice['label']}_results")
    except IndexError:
        print("The number entered does not exist")
    except ValueError:
        print("You did not enter a number")
elif args.product:
    if args.product != "":
        if re.search(
                r"^(https:[/]{2}books.toscrape.com[/]catalogue[/])[\w\W]*(index.html)$",
                args.product,
        ):
            with Loader("Web scraping of your product in progress ... "):
                product = [search_product(args.product)]
                product_title = product[0]["title"][:10]