def load_font_page(font):
    """Scrape the preview page for *font* and collect its preview-image URLs.

    Arguments:
        font {str} -- query-string fragment identifying the font, appended to URL_BASE

    Returns:
        list -- protocol-relative image URLs extracted from the preview divs;
        empty list when the request fails (previously this path raised
        NameError because image_links was only bound inside the 200 branch).
    """
    url = f"{URL_BASE}{font}&text=the+quick+brown+fox+jumped+the+lazy+dog"
    # Fake a browser user agent so the site serves the normal HTML page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    LOGGER.info("loading search page")
    image_links = []  # bound up front so a failed request returns [] instead of raising
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        font_preview = soup.findAll("div", attrs={"class": "preview"})
        bg_img_links = [div.attrs["style"] for div in font_preview]
        LOGGER.info("{} font images found".format(len(bg_img_links)))
        # The style attribute embeds a protocol-relative url:
        # background-image:url(//host/path) -- capture everything after "//".
        regex = r"\(\/\/(.*)\)"
        for link in bg_img_links:
            for match in re.finditer(regex, link, re.MULTILINE):
                # extend with all capture groups (there is exactly one here)
                image_links.extend(match.groups())
    return image_links
def generate_font_images(font, ttf_path):
    """Render sample texts of every .ttf variant of *font* to JPEG images.

    Arguments:
        font {str} -- font directory name under *ttf_path*
        ttf_path {str} -- root directory containing one sub-directory per font

    Relies on module-level globals: dataset_path, imgSize, RAND_TEXTS, LOGGER.
    """
    # Sample index -> point size; one rendered image per (variant, size).
    font_sizes = {0: 300, 1: 150, 2: 80, 3: 50}

    search_pattern = os.path.join(ttf_path, font, '*.ttf')
    # Log the pattern actually globbed (previously logged ttf_path/*.ttf,
    # which is not what glob.glob received).
    LOGGER.info(search_pattern)
    font_files = glob.glob(search_pattern)
    LOGGER.info(font_files)

    dst_img_path = os.path.join(dataset_path, font)
    if not os.path.exists(dst_img_path):
        os.makedirs(dst_img_path)
        LOGGER.info(f'creating destination folder for image {font}')

    for font_var in font_files:
        # os.path.splitext keeps dotted names intact ("My.Font.ttf" -> "My.Font"),
        # unlike the previous split on the first '.'.
        font_stem = os.path.splitext(ntpath.basename(font_var))[0]
        for idx, ch in enumerate(RAND_TEXTS):
            # Fall back to the smallest size if RAND_TEXTS ever grows past 4
            # entries (previously a KeyError).
            size = font_sizes.get(idx, 50)
            font_ttf = ImageFont.truetype(font_var, size)
            image = Image.new("RGB", imgSize, (255, 255, 255))
            draw = ImageDraw.Draw(image)
            LOGGER.info(font)
            # Draw black text at the top-left corner of a white canvas.
            draw.text((0, 0), ch, (0, 0, 0), font=font_ttf)
            file_name = os.path.join(dataset_path, font, f'{font_stem}_{size}.jpg')
            LOGGER.info(file_name)
            image.save(file_name, quality=95, dpi=(600, 600))
def process_font_images(font_dir, img_dst, number_of_random_crops):
    """Random-crop every rendered image of one font into its destination folder.

    Arguments:
        font_dir {str} -- directory holding the full-size images of one font
        img_dst {str} -- root output directory; a per-font sub-folder is created
        number_of_random_crops {int} -- crops generated per source image

    Relies on module-level globals: width, height, LOGGER.
    """
    if not os.path.isdir(font_dir):
        return
    # os.path.basename is portable; splitting on "/" breaks on Windows paths
    # (rstrip guards against a trailing separator yielding an empty name).
    font_name = os.path.basename(font_dir.rstrip("/"))
    # Create the destination folder once, before the loop (previously the
    # existence check ran on every iteration).
    dst_font_folder = os.path.join(img_dst, font_name)
    if not os.path.exists(dst_font_folder):
        LOGGER.info('Destination folder for font missing, will create it, dont worry')
        os.makedirs(dst_font_folder)
    for entry in os.listdir(font_dir):
        font_img = os.path.join(font_dir, entry)
        random_crop(font_img, font_name, dst_font_folder, width, height,
                    number_of_random_crops=number_of_random_crops)
def fetch_font_variants(url, font_name, output_dir):
    """Fetch the specimen images of the font variant at *url* and save them as PNGs.

    Arguments:
        url {string} -- url for the font variant image page
        font_name {string} -- font name, used for the output folder and filenames
        output_dir {string} -- root directory the per-font folder is created in
    """
    res = requests.get(url)
    if res.status_code == 200:
        font_output_dir = os.path.join(output_dir, font_name)
        if not os.path.exists(font_output_dir):
            os.makedirs(font_output_dir)
        for image in get_specimen_images(res.text):
            # Timestamp formatted without spaces/colons so the name is valid on
            # every filesystem (str(dt.now()) contains both); microseconds keep
            # names unique within one run.
            stamp = dt.now().strftime('%Y%m%d-%H%M%S-%f')
            image_file_name = os.path.join(
                font_output_dir, 'img-{}-{}.png'.format(font_name, stamp))
            LOGGER.info("Writing to file {}".format(image_file_name))
            # NOTE(review): .raw suggests get_specimen_images yields streamed
            # HTTP responses -- confirm they were requested with stream=True.
            with open(image_file_name, 'wb') as out_file:
                shutil.copyfileobj(image.raw, out_file)
def random_crop(image_path, font_name, dst_path, width=256, height=256, number_of_random_crops=5):
    '''Save several random resized crops of one font image.

    Arguments:
        image_path {str} -- source image to crop
        font_name {str} -- font the image belongs to (kept for API compatibility)
        dst_path {str} -- directory the crop files are written to
        width {int} -- crop width in pixels (default 256)
        height {int} -- crop height in pixels (default 256)
        number_of_random_crops {int} -- how many crops to generate (default 5)
    '''
    LOGGER.info("Processing font img {} and cropping it into size {}".format(
        image_path, (width, height)))
    # Smaller source font sizes get a smaller crop scale so each crop still
    # contains a comparable amount of glyph detail.
    if image_path.find('_50') > 0 or image_path.find('_80') > 0:
        # TODO move to config
        scale = (0.01, 0.1)
    else:
        scale = (0.1, 0.4)
    # Bug fix: honour the width/height parameters instead of a hard-coded 256
    # (RandomResizedCrop takes size as (h, w)).
    tfm = Compose([RandomResizedCrop(size=(height, width), scale=scale)])
    # Bug fix: derive the output stem from the source file -- the old code
    # computed it but then used a constant name, so crops from different
    # source images overwrote each other.
    stem = os.path.splitext(os.path.basename(image_path))[0]
    for i in range(number_of_random_crops):
        im = Image.open(image_path)
        random_crop_image = tfm(im)
        out_file = os.path.join(dst_path, f'{stem}_rand_crop_{i}.jpg')
        LOGGER.info(f'saving file {out_file}')
        random_crop_image.save(out_file, quality=95, dpi=(300, 300))
        # sleep for more diverse random crops
        # TODO really this works??
        time.sleep(0.2)
def gather_data(font_list, font_main_base_url, font_variant_base_url, output_dir, pool_size):
    '''
    Parallel fetches the images for different fonts provided in the list

    Arguments:
        font_list {list} -- List containing all the font names
        font_main_base_url {str} -- Main base url for fetching the font images
        font_variant_base_url {str} -- Base url for fetching the variants of the fonts
        output_dir {str} -- Path to the directory where the images are stored
        pool_size {int} -- Number of parallel processes
    '''
    try:
        # Context manager guarantees the worker processes are terminated even
        # on error (the pool was previously never closed/joined).
        with mp.Pool(processes=pool_size) as pool:
            pending = [
                pool.apply_async(load_font_main,
                                 args=(font, font_main_base_url,
                                       font_variant_base_url, output_dir))
                for font in font_list
            ]
            # .get() re-raises any exception raised inside a worker here.
            for task in pending:
                task.get()
    except Exception:
        # Narrowed from a bare except; logger.exception already records the
        # active traceback, so format_exc() was redundant.
        LOGGER.exception("gather_data failed")
def load_font_main(font_name, font_main_base_url, font_variant_base_url, output_dir):
    """Load the main page for a font on font squirrel and fetch all its variants.

    Arguments:
        font_name {str} -- roboto / open-sans etc
        font_main_base_url {str} -- base url the font name is appended to
        font_variant_base_url {str} -- base url used to resolve variant pages
        output_dir {str} -- root directory the variant images are written to
    """
    # The original wrapped everything in `try: ... except: raise`, which is a
    # no-op (and a bare except) -- removed.
    LOGGER.info("start {}".format(font_name))
    request_url = font_main_base_url + font_name
    # Log the full url actually requested (previously only the base url).
    LOGGER.info("Fetching data for font {} from url : {}".format(
        font_name, request_url))
    res = requests.get(request_url)
    if res.status_code == 200:
        # process html: resolve every specimen page and fetch its images
        urls = get_specimen_urls(res.text, font_variant_base_url)
        for url in urls:
            fetch_font_variants(url, font_name, output_dir)
        LOGGER.info("Finished fetching data for font {}".format(font_name))
    else:
        LOGGER.error("Fetching data for font {} failed".format(font_name))
def load_search_page():
    """Load the font search/listing page and return the per-font page links.

    Returns:
        list -- href of the first anchor in each preview div; empty list when
        the request fails (previously the failure path returned None).
    """
    url = f"{URL_BASE}mtheme.php?id=5&fpp=50"
    # fake user agent to mock browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    LOGGER.info("loading search page")
    anchors = []
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        # Removed leftover debug prints: print(res.text) dumped the whole page,
        # and print(font_preview[0]...) raised IndexError on an empty result.
        soup = BeautifulSoup(res.text, 'html.parser')
        font_preview = soup.findAll("div", attrs={"class": "preview"})
        LOGGER.info(font_preview)
        anchors = [div.findAll("a")[0].attrs["href"] for div in font_preview]
        LOGGER.info("{} fonts found from search".format(len(anchors)))
    return anchors
def process_font(font_dir):
    """Run the random-crop pipeline for a single font directory.

    Arguments:
        font_dir {str} -- font folder name, resolved against the module-level
        font_base_dir; crops go to the module-level img_dst.
    """
    LOGGER.info("Processing font directory {}".format(font_dir))
    resolved_dir = os.path.join(font_base_dir, font_dir)
    LOGGER.info(resolved_dir)
    process_font_images(resolved_dir, img_dst, number_of_random_crops)
def process_font(font_dir):
    # NOTE(review): exact duplicate of the process_font defined above -- this
    # later definition is the one that wins at import time; consider deleting one.
    LOGGER.info("Processing font directory {}".format(font_dir))
    font_dir = os.path.join(font_base_dir, font_dir)
    LOGGER.info(font_dir)
    process_font_images(font_dir, img_dst, number_of_random_crops)


if __name__ == '__main__':
    # Entry point: read the TRAINING_DATA section of a config file and bind the
    # module-level globals (font_base_dir, number_of_random_crops, width, height)
    # consumed by process_font / process_font_images.
    try:
        arg_parser = argparse.ArgumentParser(
            description='Argument for generating random crops')
        arg_parser.add_argument('--config', type=str,
                                help='Path to configuration file')
        # Bail out early when no arguments were supplied at all.
        if not len(sys.argv) > 1:
            LOGGER.info(
                "Please pass the required command line arguments, use python module -h for help")
            exit()
        arguments = arg_parser.parse_args()
        config_file_path = arguments.config
        LOGGER.info("Configuration path is {}".format(config_file_path, ))
        config = configparser.ConfigParser()
        config.read(config_file_path)
        config_section = config['TRAINING_DATA']
        font_base_dir = config_section['base_dir']
        # ast.literal_eval turns the string config values into Python ints.
        number_of_random_crops = ast.literal_eval(
            config_section['number_of_random_crops'])
        width = ast.literal_eval(config_section['width'])
        height = ast.literal_eval(config_section['height'])
        # NOTE(review): this chunk is cut off here -- the matching except clause
        # of the try above lies outside this view.
for groupNum in range(0, len(match.groups())): groupNum = groupNum + 1 # print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum))) group = match.group(groupNum) image_links.append(group) return image_links # def download_font_images(font, font_urls) if __name__ == '__main__': ''' Setting command lines argument parser for reading path to configuration file ''' fonts = load_search_page() font_url_map = {} for font in fonts: LOGGER.info(f"start scraping {font}") font_image_urls = load_font_page(font) LOGGER.info(f"found {len(font_image_urls)} for {font}") font_url_map[font] = font_image_urls for font in font_url_map.keys(): print(font) print(len(font_url_map[font]))
def _collect_font_files(data_path):
    """Return [{'font_dir': ..., 'filename': ...}] for every .jpg under each font dir."""
    fonts_files = []
    for font_dir in os.listdir(data_path):
        LOGGER.info("Processing font directories {}".format(font_dir))
        font_path = os.path.join(data_path, font_dir)
        if not os.path.isdir(font_path):
            # ignore regular files (LOGGER.warn is deprecated -> warning)
            LOGGER.warning('no dir')
            continue
        LOGGER.info(font_path)
        for filename in os.listdir(font_path):
            # skip other files
            if not filename.endswith(".jpg"):
                continue
            LOGGER.info(filename)
            fonts_files.append({'font_dir': font_dir, 'filename': filename})
    return fonts_files


def generate_test_train_data(data_path, test_size, stratify, experiments_path, experiment_uuid):
    '''Build train/test CSV manifests for one experiment.

    Walks *data_path* (one sub-directory per font, each holding .jpg crops),
    splits the file list with train_test_split and writes train.csv / test.csv
    under experiments_path/experiment_uuid.

    Arguments:
        data_path {str} -- root directory of the cropped font images
        test_size {float} -- fraction of the data held out for testing
        stratify {bool} -- whether to stratify the split by font class
        experiments_path {str} -- root directory for experiment artifacts
        experiment_uuid {str} -- identifier of this experiment run
    '''
    LOGGER.debug("Creating dataset for experiment {}".format(experiment_uuid))
    if not os.path.exists(data_path):
        LOGGER.error("Data path {} does not exist".format(data_path), exc_info=True)
        exit()

    LOGGER.info("Data path exists, processing font directories")
    fonts_files_df = pd.DataFrame(_collect_font_files(data_path))
    LOGGER.info(fonts_files_df.head(20))
    LOGGER.info(
        "Fonts files dataframe with columns {} and shape {}".format(
            fonts_files_df.columns, fonts_files_df.shape))

    # The font directory name doubles as the class label.
    LOGGER.info("Generating class name based on the font type")
    fonts_files_df['class'] = fonts_files_df.font_dir
    LOGGER.debug(fonts_files_df.head())
    X = fonts_files_df['filename'].tolist()
    y = fonts_files_df['class'].tolist()

    LOGGER.info("Data Stratification required {}".format(stratify))
    LOGGER.info("Shape of data {}".format(fonts_files_df.shape))
    # Collapsed the duplicated if/else branches: stratify=y keeps the per-font
    # class balance in both splits, None disables stratification.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y if stratify else None)
    LOGGER.info("Size of training data {}".format(len(X_train)))
    LOGGER.info("Size of test data {}".format(len(X_test)))

    # Dict construction keeps columns typed per-column (the old
    # DataFrame([X, y]).T transpose produced object dtype throughout).
    train_df = pd.DataFrame({'filename': X_train, 'class': y_train})
    test_df = pd.DataFrame({'filename': X_test, 'class': y_test})
    LOGGER.debug("Train dataframe shape {}".format(train_df.shape))
    LOGGER.debug("Train dataframe unique shape {}".format(
        train_df.filename.nunique()))
    LOGGER.debug("Test dataframe shape {}".format(test_df.shape))
    LOGGER.debug("Test dataframe unique shape {}".format(
        test_df.filename.nunique()))
    LOGGER.debug(train_df.head())
    LOGGER.debug(test_df.head())

    artifacts_path = os.path.join(experiments_path, experiment_uuid)
    if not os.path.exists(artifacts_path):
        LOGGER.info(
            "Experiment folder does not exist, creating folder with path {}"
            .format(artifacts_path))
        os.makedirs(artifacts_path)
    train_csv = os.path.join(artifacts_path, 'train.csv')
    test_csv = os.path.join(artifacts_path, 'test.csv')
    LOGGER.info(
        "Writing train dataframe to {} and test dataframe to {}".format(
            train_csv, test_csv))
    train_df.to_csv(train_csv, index=None)
    test_df.to_csv(test_csv, index=None)
            # NOTE(review): this chunk starts mid-function -- the lines below
            # are the tail of generate_font_images' rendering loop; indentation
            # reconstructed, confirm against the original file.
            LOGGER.info(font)
            draw.text(position, ch, (0, 0, 0), font=font_ttf)
            file_name = font_file + '_' + str(font_size[idx]) + '.jpg'
            file_name = os.path.join(dataset_path, font, file_name)
            LOGGER.info(file_name)
            image.save(file_name, quality=95, dpi=(600, 600))


if __name__ == '__main__':
    # TODO better logging
    # bootstrapping stuff
    # TODO cleanup later!
    # Source .ttf files live under data/src/fonts; generated images go to data/dst.
    ttf_path = os.path.join(os.getcwd(), 'data', 'src', 'fonts')
    LOGGER.info(ttf_path)
    dataset_path = os.path.join(os.getcwd(), 'data', 'dst')
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    if not os.path.exists(ttf_path):
        LOGGER.info('No fonts files found!')
        exit(-1)
    fonts_list = os.listdir(ttf_path)
    total_fonts = len(fonts_list)
    # '.DS_Store' is macOS Finder metadata, not a font directory -- drop it.
    all_fonts = os.listdir(os.path.join(os.getcwd(), 'data', 'src', 'fonts'))
    all_fonts = [font for font in all_fonts if font != '.DS_Store']
    LOGGER.info(all_fonts)
        # NOTE(review): this chunk starts mid-function -- the line below is the
        # except-handler tail of gather_data; indentation reconstructed.
        LOGGER.exception(traceback.format_exc())


if __name__ == '__main__':
    '''
    Setting command lines argument parser for reading path to configuration file
    '''
    arg_parser = argparse.ArgumentParser(
        description='Arguments for the data scraper')
    arg_parser.add_argument('--config', type=str,
                            help='Path to configuration file')
    arguments = arg_parser.parse_args()
    config_file_path = arguments.config
    LOGGER.info("Configuration path is {}".format(config_file_path, ))
    config = configparser.ConfigParser()
    config.read(config_file_path)
    # FONTS section: which fonts to scrape and the base urls to scrape from.
    font_section = config['FONTS']
    # literal_eval turns the config string into a Python list.
    required_font_list = ast.literal_eval(font_section.get('required_fonts'))
    font_main_base_url = font_section['font_main_base']
    font_variant_base_url = font_section['font_variant_base']
    LOGGER.debug(" Font list {} is of type {}".format(
        required_font_list, type(required_font_list)))
    # CONFIG section: output location and multiprocessing pool size.
    config_section = config['CONFIG']
    output_dir = config_section['output_dir']
    pool_size = ast.literal_eval(config_section['multiprocessing_pool_size'])