Example No. 1
def load_font_page(font):
    url = f"{URL_BASE}{font}&text=the+quick+brown+fox+jumped+the+lazy+dog"

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    LOGGER.info("loading search page")
    request_url = url
    res = requests.get(request_url, headers=headers)

    image_links = []  # stays empty if the request fails
    if res.status_code == 200:
        # process html
        soup = BeautifulSoup(res.text, 'html.parser')
        font_preview = soup.findAll("div", attrs={"class": "preview"})
        bg_img_links = [div.attrs["style"] for div in font_preview]

        LOGGER.info("{} font images found".format(len(bg_img_links)))

        regex = r"\(\/\/(.*)\)"

        for link in bg_img_links:
            for match in re.finditer(regex, link, re.MULTILINE):
                # the single capture group holds the protocol-relative image url
                image_links.append(match.group(1))

    return image_links
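
The regex above pulls the protocol-relative image url out of each preview div's inline background-image style. A minimal standalone sketch of that extraction (the style string below is made up):

import re

# made-up style attribute of the kind the preview divs carry
style = "background-image: url(//media.example.com/fonts/roboto/preview.png);"

match = re.search(r"\(\/\/(.*)\)", style)
if match:
    # group(1) is everything between "(//" and the closing ")"
    print(match.group(1))  # media.example.com/fonts/roboto/preview.png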
Example No. 2
def generate_font_images(font, ttf_path):
    LOGGER.info(os.path.join(ttf_path, '*.ttf'))
    font_files = glob.glob(os.path.join(ttf_path, font, '*.ttf'))
    LOGGER.info(font_files)
    #LOGGER.info(f'found {len(font_files)} font file for {font}')

    dst_img_path = os.path.join(dataset_path, font)
    if not os.path.exists(dst_img_path):
        os.makedirs(dst_img_path)
        LOGGER.info(f'creating destination folder for image {font}')

    for font_var in font_files:
        font_file = ntpath.basename(font_var)
        font_file = font_file.rsplit('.')
        font_file = font_file[0]

        font_size = {0: 300, 1: 150, 2: 80, 3: 50}

        for idx, ch in enumerate(RAND_TEXTS):
            font_ttf = ImageFont.truetype(font_var, font_size[idx])
            image = Image.new("RGB", imgSize, (255, 255, 255))
            draw = ImageDraw.Draw(image)
            pos_x = 0
            pos_y = 0

            position = (pos_x, pos_y)
            LOGGER.info(font)
            draw.text(position, ch, (0, 0, 0), font=font_ttf)

            file_name = font_file + '_' + str(font_size[idx]) + '.jpg'
            file_name = os.path.join(dataset_path, font, file_name)
            LOGGER.info(file_name)
            image.save(file_name, quality=95, dpi=(600, 600))
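
generate_font_images leans on module globals that are not shown in this snippet (imgSize, RAND_TEXTS, dataset_path, LOGGER). A self-contained sketch of the same PIL rendering step, with assumed canvas size, sample text, and font path:

from PIL import Image, ImageDraw, ImageFont

canvas_size = (1200, 400)             # assumed; the real value comes from imgSize
sample_text = "the quick brown fox"   # assumed; the real samples come from RAND_TEXTS

font_ttf = ImageFont.truetype("Roboto-Regular.ttf", 150)  # hypothetical ttf file
image = Image.new("RGB", canvas_size, (255, 255, 255))    # white canvas
draw = ImageDraw.Draw(image)
draw.text((0, 0), sample_text, (0, 0, 0), font=font_ttf)  # black text at the origin
image.save("Roboto-Regular_150.jpg", quality=95, dpi=(600, 600))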
Example No. 3
def process_font_images(font_dir, img_dst, number_of_random_crops):
    try:
        if os.path.isdir(font_dir):
            for font_img in os.listdir(font_dir):
                font_img = os.path.join(font_dir, font_img)

                # check if dst folder exists
                font_name = font_dir.split("/")[-1]
                dst_font_folder = os.path.join(img_dst, font_name)
                if not os.path.exists(dst_font_folder):
                    LOGGER.info("Destination folder for font missing, will create it, don't worry")
                    os.makedirs(dst_font_folder)
                # LOGGER.info(font_img)
                random_crop(font_img, font_name, dst_font_folder, width, height,
                            number_of_random_crops=number_of_random_crops)

    except Exception as e:
        raise
Example No. 4
def fetch_font_variants(url, font_name, output_dir):
    """Fetch the specimen image of the font variant using the url & save it as png

    Arguments:
        url {string} -- url for the font variant image
    """
    res = requests.get(url)

    if res.status_code == 200:
        font_output_dir = os.path.join(output_dir, font_name)
        if not os.path.exists(font_output_dir):
            os.makedirs(font_output_dir)
        for image in get_specimen_images(res.text):
            image_file_name = os.path.join(
                font_output_dir, 'img-{}-{}.png'.format(font_name, dt.now()))
            LOGGER.info("Writing to file {}".format(image_file_name, ))
            with open(image_file_name, 'wb') as out_file:
                shutil.copyfileobj(image.raw, out_file)
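
Whether image.raw can be streamed here depends on what get_specimen_images returns, which is not shown in this snippet. For reference, a sketch of streaming a single image to disk with requests and shutil.copyfileobj (the url is a placeholder):

import shutil
import requests

img_url = "https://example.com/specimen.png"  # placeholder url
res = requests.get(img_url, stream=True)      # stream=True keeps the body un-decoded
if res.status_code == 200:
    res.raw.decode_content = True             # transparently handle gzip/deflate
    with open("specimen.png", "wb") as out_file:
        shutil.copyfileobj(res.raw, out_file)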
Example No. 5
def random_crop(image_path, font_name, dst_path, width=256, height=256, number_of_random_crops=5):
    '''Generate random crops of a font specimen image and save them as JPEGs.

    Arguments:
        image_path {str} -- path to the source font image
        font_name {str} -- name of the font the image belongs to
        dst_path {str} -- destination directory for the cropped images
        width {int} -- crop width in pixels
        height {int} -- crop height in pixels
        number_of_random_crops {int} -- number of crops to generate per image
    '''
    try:
        LOGGER.info("Processing font img {} and cropping it into size {}".format(
            image_path, (width, height)))

        # change scale based on the font size
        # pick smaller scale for smaller font size
        # pick large scale for bigger font size
        if image_path.find('_50') > 0 or image_path.find('_80') > 0:
            #TODO move to config
            scale = (0.01, 0.1)
        else:
            scale = (0.1, 0.4)

        # use the requested crop size rather than a hard-coded 256
        tfm = Compose([RandomResizedCrop(size=(height, width), scale=scale)])
        for i in range(number_of_random_crops):
            im = Image.open(image_path)
            random_crop_image = tfm(im)

            #TODO fix & simplify paths & configs
            src_file = image_path.split("/")[-1]
            filename = src_file.split(".")[0]
            out_file = f'{filename}_rand_crop_{i}.jpg'
            out_file = os.path.join(dst_path, out_file)

            LOGGER.info(f'saving file {out_file}')
            random_crop_image.save(out_file, quality=95, dpi=(300, 300))

            # sleep for more diverse random crops
            # TODO really this works??
            time.sleep(0.2)
    except Exception as e:
        raise
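
Compose and RandomResizedCrop appear to come from torchvision.transforms, although the imports are not part of this snippet. A usage sketch with illustrative paths (not values from the original project):

random_crop(
    image_path='data/dst/roboto/Roboto-Regular_150.jpg',
    font_name='roboto',
    dst_path='data/crops/roboto',
    width=256,
    height=256,
    number_of_random_crops=5)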
Example No. 6
def gather_data(font_list, font_main_base_url, font_variant_base_url,
                output_dir, pool_size):
    ''' Fetch the images for the different fonts in the list, in parallel
    Arguments:
        font_list {list} -- List containing all the font names
        font_main_base_url {str} --  Main base url for fetching the font images
        font_variant_base_url {str} -- Base url for fetching the variants of the fonts
        output_dir {str} --  Path to the directory where the images are stored
        pool_size {int} -- Number of parallel processes 
    '''

    try:
        pool = mp.Pool(processes=pool_size)
        pool_processing = [
            pool.apply_async(load_font_main,
                             args=(font, font_main_base_url,
                                   font_variant_base_url, output_dir))
            for font in font_list
        ]
        results = [process.get() for process in pool_processing]

    except Exception:
        # LOGGER.exception already records the traceback
        LOGGER.exception("Parallel font fetching failed")
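
A minimal invocation sketch; the font names and urls below are placeholders rather than the project's real configuration, which is read from the config file in the __main__ block further down:

gather_data(
    font_list=['roboto', 'open-sans'],                            # placeholder fonts
    font_main_base_url='https://www.fontsquirrel.com/fonts/',     # placeholder base url
    font_variant_base_url='https://www.fontsquirrel.com/fonts/',  # placeholder base url
    output_dir='data/raw',
    pool_size=4)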
Example No. 7
def load_font_main(font_name, font_main_base_url, font_variant_base_url,
                   output_dir):
    """load the main page for a font on font squirel

    Arguments:
        font_name -- roboto / open-sans etc
    """
    try:
        LOGGER.info("start {}".format(font_name))
        request_url = font_main_base_url + font_name
        LOGGER.info("Fetching data for font {} from url : {}".format(
            font_name, font_main_base_url))
        res = requests.get(request_url)
        if res.status_code == 200:
            # process html
            urls = get_specimen_urls(res.text, font_variant_base_url)
            for url in urls:
                fetch_font_variants(url, font_name, output_dir)
            LOGGER.info("Finished fetching data for font {}".format(font_name))
        else:
            LOGGER.error("Fetching data for font {} failed".format(
                font_name, ))
    except:
        raise
Example No. 8
def load_search_page():
    url = f"{URL_BASE}mtheme.php?id=5&fpp=50"
    # fake user agent to mock browser request
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    LOGGER.info("loading search page")
    request_url = url
    res = requests.get(request_url, headers=headers)
    if res.status_code == 200:
        # process html
        print(res.text)
        soup = BeautifulSoup(res.text, 'html.parser')
        font_preview = soup.findAll("div", attrs={"class": "preview"})
        LOGGER.info(font_preview)
        print(font_preview[0].findAll("a")[0].attrs["href"])
        anchors = [div.findAll("a")[0].attrs["href"] for div in font_preview]

        LOGGER.info("{} fonts found from search".format(len(anchors)))

        return anchors

    LOGGER.error("Search page request failed with status {}".format(res.status_code))
    return []
Example No. 9
def process_font(font_dir):
    LOGGER.info("Processing font directory {}".format(font_dir))
    font_dir = os.path.join(font_base_dir, font_dir)
    LOGGER.info(font_dir)
    process_font_images(font_dir, img_dst, number_of_random_crops)
Example No. 10
def process_font(font_dir):
    LOGGER.info("Processing font directory {}".format(font_dir))
    font_dir = os.path.join(font_base_dir, font_dir)
    LOGGER.info(font_dir)
    process_font_images(font_dir, img_dst, number_of_random_crops)


if __name__ == '__main__':
    try:
        arg_parser = argparse.ArgumentParser(
            description='Argument for generating random crops')
        arg_parser.add_argument('--config', type=str,
                                help='Path to configuration file')

        if not len(sys.argv) > 1:
            LOGGER.info(
                "Please pass the required command line arguments; run the module with -h for help")
            exit()

        arguments = arg_parser.parse_args()
        config_file_path = arguments.config
        LOGGER.info("Configuration path is {}".format(config_file_path, ))

        config = configparser.ConfigParser()
        config.read(config_file_path)

        config_section = config['TRAINING_DATA']
        font_base_dir = config_section['base_dir']
        number_of_random_crops = ast.literal_eval(
            config_section['number_of_random_crops'])
        width = ast.literal_eval(config_section['width'])
        height = ast.literal_eval(config_section['height'])
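
The keys read above imply a configuration file section roughly like the following; the values are illustrative only:

[TRAINING_DATA]
base_dir = data/crops
number_of_random_crops = 5
width = 256
height = 256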
Example No. 11
                # the single capture group holds the protocol-relative image url
                image_links.append(match.group(1))

    return image_links


# def download_font_images(font, font_urls)

if __name__ == '__main__':
    '''
    Scrape the font search page and collect the specimen image urls for each font
    '''

    fonts = load_search_page()
    font_url_map = {}
    for font in fonts:
        LOGGER.info(f"start scraping {font}")
        font_image_urls = load_font_page(font)
        LOGGER.info(f"found {len(font_image_urls)} for {font}")

        font_url_map[font] = font_image_urls

    for font in font_url_map.keys():
        print(font)
        print(len(font_url_map[font]))
Example No. 12
def generate_test_train_data(data_path, test_size, stratify, experiments_path,
                             experiment_uuid):
    '''Split the font image dataset into train and test sets and write them to CSVs.

    Arguments:
        data_path {str} -- path to the directory with one sub-folder per font
        test_size {float} -- fraction of the data held out for the test set
        stratify {bool} -- whether to stratify the split by font class
        experiments_path {str} -- base directory for experiment artifacts
        experiment_uuid {str} -- experiment identifier, used as the artifact sub-folder
    '''
    LOGGER.debug("Creating dataset for experiment {}".format(experiment_uuid))
    fonts_files = []
    if os.path.exists(data_path):
        LOGGER.info("Data path exists, processing font directories")
        for font_dir in os.listdir(data_path):
            LOGGER.info("Processing font directories {}".format(font_dir))
            if os.path.isdir(os.path.join(data_path, font_dir)):
                font_files = os.listdir(os.path.join(data_path, font_dir))
                LOGGER.info(os.path.join(data_path, font_dir))
                for files in font_files:
                    font_files_dict = {}
                    # skip other files
                    if not files.endswith(".jpg"):
                        continue

                    LOGGER.info(files)
                    font_files_dict['font_dir'] = font_dir
                    font_files_dict['filename'] = files
                    fonts_files.append(font_files_dict)
            else:
                # skip regular files
                LOGGER.warning("Skipping non-directory entry {}".format(font_dir))
                continue

        fonts_files_df = pd.DataFrame(fonts_files)

        LOGGER.info(fonts_files_df.head(20))
        LOGGER.debug(fonts_files_df.head())
        LOGGER.info(
            "Fonts files dataframe with columns {} and shape {}".format(
                fonts_files_df.columns, fonts_files_df.shape))
        LOGGER.info("Generating class name based on the font type")

        fonts_files_df['class'] = fonts_files_df.font_dir
        LOGGER.debug(fonts_files_df.head())

        X = fonts_files_df['filename'].tolist()
        y = fonts_files_df['class'].tolist()
        LOGGER.info("Data Stratification required {}".format(stratify))

        LOGGER.info("Shape of data {}".format(fonts_files_df.shape))
        # stratify on the font class only when requested; None disables stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y if stratify else None)
        LOGGER.info("Size of training data {}".format(len(X_train)))
        LOGGER.info("Size of test data {}".format(len(X_test)))

        train_df = pd.DataFrame([X_train, y_train]).T
        train_df.columns = ['filename', 'class']
        test_df = pd.DataFrame([X_test, y_test]).T
        test_df.columns = ['filename', 'class']

        LOGGER.debug("Train dataframe shape {}".format(train_df.shape))
        LOGGER.debug("Train dataframe unique shape {}".format(
            train_df.filename.nunique()))
        LOGGER.debug("Test dataframe shape {}".format(test_df.shape))
        LOGGER.debug("Test dataframe unique shape {}".format(
            test_df.filename.nunique()))
        LOGGER.debug(train_df.head())
        LOGGER.debug(test_df.head())

        artifacts_path = os.path.join(experiments_path, experiment_uuid)
        if not os.path.exists(artifacts_path):
            LOGGER.info(
                "Experiment folder does not exist, creating folder with path {}"
                .format(artifacts_path))
            os.makedirs(artifacts_path)

        train_csv = os.path.join(artifacts_path, 'train.csv')
        test_csv = os.path.join(artifacts_path, 'test.csv')
        LOGGER.info(
            "Writing train dataframe to {} and test dataframe to {}".format(
                train_csv, test_csv))
        train_df.to_csv(train_csv, index=None)
        test_df.to_csv(test_csv, index=None)

    else:
        LOGGER.error("Data path {} does not exist".format(data_path),
                     exc_info=True)
        exit()
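
A minimal usage sketch; the paths and the experiment id are illustrative assumptions, not values from the original project:

import uuid

generate_test_train_data(
    data_path='data/crops',             # one sub-folder per font, holding .jpg crops
    test_size=0.2,                      # hold out 20% of the images for the test split
    stratify=True,                      # keep the per-font class balance in both splits
    experiments_path='experiments',     # train.csv / test.csv land under this folder
    experiment_uuid=str(uuid.uuid4()))  # fresh experiment identifier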
Example No. 13
            LOGGER.info(font)
            draw.text(position, ch, (0, 0, 0), font=font_ttf)

            file_name = font_file + '_' + str(font_size[idx]) + '.jpg'
            file_name = os.path.join(dataset_path, font, file_name)
            LOGGER.info(file_name)
            image.save(file_name, quality=95, dpi=(600, 600))


if __name__ == '__main__':

    # TODO better logging
    # bootstrapping stuff
    # TODO cleanup later!
    ttf_path = os.path.join(os.getcwd(), 'data', 'src', 'fonts')
    LOGGER.info(ttf_path)
    dataset_path = os.path.join(os.getcwd(), 'data', 'dst')
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    if not os.path.exists(ttf_path):
        LOGGER.info('No font files found!')
        exit(-1)

    fonts_list = os.listdir(ttf_path)

    total_fonts = len(fonts_list)
    # reuse the directory listing and drop macOS metadata entries
    all_fonts = [font for font in fonts_list if font != '.DS_Store']
    LOGGER.info(all_fonts)
Example No. 14
        LOGGER.exception(traceback.format_exc())


if __name__ == '__main__':
    '''
    Set up the command line argument parser for reading the path to the configuration file
    '''

    arg_parser = argparse.ArgumentParser(
        description='Arguments for the data scraper')
    arg_parser.add_argument('--config',
                            type=str,
                            help='Path to configuration file')
    arguments = arg_parser.parse_args()
    config_file_path = arguments.config
    LOGGER.info("Configuration path is {}".format(config_file_path, ))

    config = configparser.ConfigParser()
    config.read(config_file_path)

    font_section = config['FONTS']
    required_font_list = ast.literal_eval(font_section.get('required_fonts'))
    font_main_base_url = font_section['font_main_base']
    font_variant_base_url = font_section['font_variant_base']
    LOGGER.debug(" Font list {} is of type {}".format(
        required_font_list, type(required_font_list)))

    config_section = config['CONFIG']
    output_dir = config_section['output_dir']
    pool_size = ast.literal_eval(config_section['multiprocessing_pool_size'])
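
Putting the sections together, the scraper expects a configuration file roughly like the one below; the font list and urls are placeholders (required_fonts and multiprocessing_pool_size are parsed with ast.literal_eval, so they must be valid Python literals):

[FONTS]
required_fonts = ['roboto', 'open-sans', 'lato']
font_main_base = https://www.fontsquirrel.com/fonts/
font_variant_base = https://www.fontsquirrel.com/fonts/

[CONFIG]
output_dir = data/raw
multiprocessing_pool_size = 4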