Example #1
def main():
    raw_list = csv_to_list(csv_file)[:100]
    total_len = len(raw_list)
    counter = 0
    result_dict = dict()
    print "Commencing Web Scraping..."
    start_time = time.time()
    for raw_link in raw_list:
        try:
            raw_link = raw_link[0]
            whois_link = "http://www.whois.com/whois/" + raw_link
            ipaddress_link = "http://" + raw_link + ".ipaddress.com/"
            whois_soup = link_to_lxmlsoup(whois_link)
            ipaddress_soup = link_to_lxmlsoup(ipaddress_link)
            result_dict.setdefault('Raw Link', []).append(str(raw_link))
            result_dict = whois_parser(whois_soup, result_dict)
            result_dict = ipaddress_parser(ipaddress_soup, result_dict)
            counter, total_len = print_counter(counter, total_len)
            if counter % 400 == 0:
                print "Commencing 30 Second Sleep after 400 iterations"
                time.sleep(30)
            time_elapsed = time.time() - start_time
            print_progress(time_elapsed, counter, total_len)
        except:
            dict_to_json(result_dict, 'output.json')
            dict_to_csv(result_dict, 'output.csv')
            print "Unexpected Error", sys.exc_info()[0]
            raise
    dict_to_json(result_dict, 'output.json')
    dict_to_csv(result_dict, 'output.csv')
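The dict_to_csv and dict_to_json helpers called above are not shown in this example. A minimal sketch of what they might look like, assuming result_dict maps column names to lists of values (names and behavior are inferred, not taken from the original project):

import csv
import json

def dict_to_json(result_dict, json_path):
    # Dump the whole dict as a single JSON object.
    with open(json_path, 'w') as f:
        json.dump(result_dict, f, indent=2)

def dict_to_csv(result_dict, csv_path):
    # Treat each key as a column header and each value as that column's list of cells.
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_dict.keys())
        writer.writerows(zip(*result_dict.values()))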
Example #2
def search_all():
    """
    Get all books in all categories of the books.toscrape site
    """
    list_category = get_categories()
    for category in list_category:
        result = search_products_by_category(category["url"], category["label"])
        dict_to_csv(result, category["label"])
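Examples #2 and #10 pass dict_to_csv a list of product dicts and a file label rather than a path. A minimal sketch of that variant, assuming every dict in result shares the same keys (the real helper in that project may differ):

import csv

def dict_to_csv(rows, label):
    # Write a list of dicts with identical keys to "<label>.csv".
    with open(f"{label}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)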
Example #3
def write_to_csv_file_for_DataTransfer(inst, dics):
    """
    Writes/Overwrites CSV files with data supplied in dictionaries
    Note: Dictionary keys will be changed to work with DataTransfer extension
    Args:
        inst: Instance of a class (Legislator, Committee, ...) indicating which file to write to and which template name to use
        dics: Dictionaries which will be written into a CSV file
    """
    modified_dics = [
        modify_dict_for_DataTransfer(dic, inst.template_name) for dic in dics
    ]
    utils.dict_to_csv(modified_dics, inst.file_path)
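modify_dict_for_DataTransfer is defined elsewhere in that project. The MediaWiki DataTransfer extension imports CSV columns named in the form Template[Field], so a plausible sketch of the key rewrite (an assumption, not the project's actual code) is:

def modify_dict_for_DataTransfer(dic, template_name):
    # Rewrite each key as "Template[Field]" so DataTransfer can map the column
    # onto the right template parameter.
    return {f"{template_name}[{key}]": value for key, value in dic.items()}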
Example #4
File: learner.py  Project: nascarr/quora
    def record(self, fold):
        # save plots
        save_plot(self.val_record, 'loss', self.args.n_eval,
                  'tmp/val_loss.png')
        save_plot(self.val_record, 'f1', self.args.n_eval, 'tmp/val_f1.png')
        save_plot(self.norm_record, 'grad_norm', self.args.n_eval,
                  'tmp/grad_norm.png')
        if self.args.test:
            save_plots([self.val_record, self.test_record], ['loss', 'f1'],
                       ['val', 'test'], self.args.n_eval)

        # create subdir for this experiment
        os.makedirs(self.record_dir, exist_ok=True)
        subdir = os.path.join(self.models_dir, str_date_time())
        if self.args.mode == 'test':
            subdir += '_test'
        os.mkdir(subdir)

        # write model params and results to csv
        csvlog = os.path.join(subdir, 'info.csv')
        param_dict = {}
        for arg in vars(self.args):
            param_dict[arg] = str(getattr(self.args, arg))
        info = torch.load(self.best_info_path)
        hash = get_hash() if self.args.machine == 'dt' else 'no_hash'
        passed_args = ' '.join(sys.argv[1:])
        param_dict = {
            'hash': hash,
            'subdir': subdir,
            **param_dict,
            **info, 'args': passed_args
        }
        dict_to_csv(param_dict, csvlog, 'w', 'index', reverse=False)
        header = (fold == 0)
        dict_to_csv(param_dict,
                    self.record_path,
                    'a',
                    'columns',
                    reverse=True,
                    header=header)

        # copy all records to subdir
        png_files = (['val_loss.png', 'val_f1.png']
                     if not self.args.test else ['loss.png', 'f1.png'])
        csv_files = [
            'val_probs*.csv', 'train_steps.csv', 'submission.csv',
            'test_probs.csv'
        ]
        copy_files([*png_files, 'models/*.info', *csv_files], 'tmp', subdir)
        return subdir
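Examples #4, #5, and #7 all call a dict_to_csv(d, path, mode, orient, reverse, header) helper that is not included in the snippets. A pandas-based sketch consistent with those calls (the signature and defaults are inferred from usage, not copied from the repository):

import pandas as pd

def dict_to_csv(d, path, mode='w', orient='index', reverse=False, header=False):
    if orient == 'index':
        # One row per key: the key in the first column, its value in the second.
        df = pd.DataFrame({'key': list(d.keys()), 'value': list(d.values())})
    else:
        # One column per key, with a single row of values.
        df = pd.DataFrame([d])
    if reverse:
        df = df[df.columns[::-1]]
    df.to_csv(path, mode=mode, header=header, index=False)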
Example #5
File: learner.py  Project: nascarr/quora
    def save_step(self, step_info, message=False):
        if self.new:
            header = True
            mode = 'w'
            self.new = False
        else:
            header = False
            mode = 'a'
        dict_to_csv(step_info,
                    'tmp/train_steps.csv',
                    mode,
                    orient='columns',
                    header=header)
        if message:
            print(
                'epoch {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} - f1 {:.4f}'
                .format(*list(step_info.values())))
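The trailing print assumes the values in step_info are ordered as epoch, step, train_loss, val_loss, f1. A hedged usage example (the field names and the learner instance are illustrative, not from the repository):

step_info = {'epoch': 3, 'step': 1200, 'train_loss': 0.4312,
             'val_loss': 0.4518, 'f1': 0.6723}
learner.save_step(step_info, message=True)
# -> epoch 03 - step 001200 - train_loss 0.4312 - val_loss 0.4518 - f1 0.6723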
Example #6
def get_player_data():

    player_filename = 'game_summary_by_player.csv'
    data_dir = os.getcwd() + '/data/external'
    player_path = os.path.join(data_dir, player_filename)
    output_path = os.path.join(data_dir, 'player_metadata.csv')

    player_game_df = pd.read_csv(player_path)
    players_df = player_game_df[["player_link",
                                 "player_name"]].drop_duplicates()

    # the flag tracks whether player metadata has already been scraped
    if os.path.isfile(output_path):
        addtl_rows_flg = True
        player_metadata_df = pd.read_csv(output_path)
        player_metadata_df = player_metadata_df.drop_duplicates()
    else:
        addtl_rows_flg = False
        player_metadata_df = pd.DataFrame()

    if addtl_rows_flg:
        existing_players = list(player_metadata_df["player_link"].unique())

        players_df = players_df[
            ~players_df["player_link"].isin(existing_players)]
        print(
            "number of players already scraped: {} / length of remaining df: {}"
            .format(len(existing_players), len(players_df)))
    output_rows = []
    for idx, row in players_df.iterrows():
        name = row['player_name']
        link = row['player_link']
        print("{}: {} - {}".format(idx, name, link))
        player = Player(name, link)

        #print("{}: {} - {}".format(idx, player.name, player.full_player_url))
        player_dict = player.get_player_data()
        dict_to_csv(player_dict, data_dir, 'player_metadata')
        output_rows.append(player_dict)

    #output_df = pd.DataFrame(output_rows)
    #output_path = os.path.join(data_dir, 'player_metadata.csv' )

    #output_df = output_df[["player_link", "player_name", "position", "draft_pick" ,"height", "weight", "height", "birthdate", "forty_yd", "combine_bench", "combine_broad_jump", "combine_cone", "combine_shuttle", "combine_vert", "combine_year"]]

    #output_df.to_csv(output_path, index = False)
    return
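Example #6 uses yet another calling convention, dict_to_csv(player_dict, data_dir, 'player_metadata'), which appends one player's row to a CSV inside data_dir. A sketch of that variant, assuming player_dict maps column names to scalar values (not the project's actual helper):

import csv
import os

def dict_to_csv(row, data_dir, name):
    # Append a single row to "<data_dir>/<name>.csv", writing the header only
    # when the file does not exist yet.
    path = os.path.join(data_dir, name + '.csv')
    write_header = not os.path.isfile(path)
    with open(path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=row.keys())
        if write_header:
            writer.writeheader()
        writer.writerow(row)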
Example #7
File: ensemble.py  Project: nascarr/quora
    def record(self, max_f1, tresh, method):
        ens_info = format_info({'max_f1': max_f1, 'tresh': tresh})
        ens_info = {'method': method, **ens_info}
        model_infos = []  # partial model descriptions
        # copy partial models descriptions
        info_paths = [os.path.join(pp, 'info.csv') for pp in self.pred_dirs]
        for ip in info_paths:
            info = self.read_model_info(ip)
            model_infos.append(info)
        model_infos = [o for l in model_infos for o in l]

        with open(self.ens_record_path, 'a') as f:
            writer = csv.writer(f)
            writer.writerows(model_infos)
        dict_to_csv(ens_info,
                    self.ens_record_path,
                    'a',
                    'columns',
                    reverse=False,
                    header=True)
Example #8
# copy jpgs from jpg folder to xmls_classify folder ###############################################################
print(colorama.Fore.GREEN +
      "[INFO] copy jpgs from jpg folder to xmls_classify folder" +
      colorama.Fore.WHITE)
xmls = scan_files(classify_xml_path, postfix=".xml")
for xml in xmls:
    jpg = os.path.join(image_path,
                       os.path.basename(os.path.splitext(xml)[0]) + ".jpg")
    shutil.copy(jpg, classify_xml_path)

# write auto_labeling info into csv ###############################################################################
print(colorama.Fore.GREEN + "[INFO] write auto_labeling info into csv" +
      colorama.Fore.WHITE)
csv_file_s = os.path.join(output_tif_608s, tif_name + "_s.csv")
dict_to_csv(dict_pic_info, csv_file_s)
csv_file_c = os.path.join(output_tif_608s, tif_name + "_c.csv")
predictions_to_csv(dict_pic_info_all, classes_list, classes_all, csv_file_c)

# generate confusion matrix #######################################################################################
print(colorama.Fore.GREEN + "[INFO] generate confusion matrix" +
      colorama.Fore.WHITE)
matrix = confusion_matrix(classes_all, cell_numpy_index, predictions)
xlsx = os.path.join(output_tif_608s, tif_name + ".xlsx")
generate_xlsx(classes_all, matrix, xlsx)

# generate asap_xml from labelimg_xmls
print(colorama.Fore.GREEN + "[INFO] generate asap xml from labelimg xmls" +
      colorama.Fore.WHITE)
xml_asap_segment = os.path.join(output_tif_608s, tif_name + "_segment.xml")
gen_asap_xml(xml_asap_segment, segment_xml_path)
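Example #8 passes a flat dict and a target path, dict_to_csv(dict_pic_info, csv_file_s). A minimal sketch of that form, assuming one key/value pair per row (the project's real helper may write a different layout):

import csv

def dict_to_csv(d, csv_path):
    # Write each key/value pair as its own row.
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for key, value in d.items():
            writer.writerow([key, value])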
        # if u"*" in text_lines or u"\u2217" in text_lines:
        asterisk_idx = [True if l[0] in asterisks else False for l in first_lines]
        if any(asterisk_idx):
            info_location = treat_asteriks(first_lines, asterisk_idx)
            # print info_location
            # print
            # print text_lines
            # print "***"*80
            # with_stars+=1
        else:
            # print "---"*80
            info_location = treat_no_asterisks(first_lines, authors)

        locations_dict[page_idx].extend(info_location)

    dict_to_csv("../../input/location_1pageRB.csv", locations_dict, columns=["id", "location"])
    # print "From {0} 1page pdf, {1} have stars in first lines".format(len(one_pages), with_stars)

    return locations_dict


def clean_list(noisy_list):
    noise = [u"@"]
    noisy_list = [n.strip().lower() for n in noisy_list if n]
    noisy_elements = np.array([l not in n for l in noise for n in noisy_list], dtype=bool)
    noisy_list = np.array(noisy_list)

    return noisy_list[noisy_elements]


def similarity_locations(locations_dict):
Example #10
     list_categories = get_categories()
     for index, category in enumerate(list_categories):
         print(f"[{index}] {category['label']}")
     choice = input(
         "Choisissez le numero de la categorie que vous voulez chercher : "
     )
     try:
         choice = int(choice)
         categorie_choice = list_categories[choice]
     with Loader(
             desc=f"Web Scraping de la categorie {categorie_choice['label']} en cours ... "
     ):
             result = search_products_by_category(categorie_choice['url'],
                                                  categorie_choice['label'])
             dict_to_csv(result, f"{categorie_choice['label']}_books")
             if settings.zip_option:
                 zip_files(f"{categorie_choice['label']}_results")
     except IndexError:
         print("Le chiffre entrée n'existe pas")
     except ValueError:
         print("Vous n'avez pas entré un chiffre")
 elif args.product:
     if args.product != "":
         if re.search(
                 r"^(https:[/]{2}books.toscrape.com[/]catalogue[/])[\w\W]*(index.html)$",
                 args.product,
         ):
             with Loader("Web Scraping de votre produit en cours ... "):
                 product = [search_product(args.product)]
                 product_title = product[0]["title"][:10]
Example #11
        ]
        if any(asterisk_idx):
            info_location = treat_asteriks(first_lines, asterisk_idx)
            # print info_location
            # print
            # print text_lines
            # print "***"*80
            # with_stars+=1
        else:
            # print "---"*80
            info_location = treat_no_asterisks(first_lines, authors)

        locations_dict[page_idx].extend(info_location)

    dict_to_csv("../../input/location_1pageRB.csv",
                locations_dict,
                columns=["id", "location"])
    # print "From {0} 1page pdf, {1} have stars in first lines".format(len(one_pages), with_stars)

    return locations_dict


def clean_list(noisy_list):
    noise = [u"@"]
    noisy_list = [n.strip().lower() for n in noisy_list if n]
    noisy_elements = np.array(
        [l not in n for l in noise for n in noisy_list],
        dtype=bool)
    noisy_list = np.array(noisy_list)

    return noisy_list[noisy_elements]