示例#1
0
def read_input_file():
    """Build the image URL list and the cleaned item table from the raw input.

    Steps, in order:

    1. Read the tab-separated file at ``cnt.INPUT_FILE_PATH``.
    2. Drop every row that contains a missing value.
    3. Keep only ``product_type`` groups that have at least 50 rows.
    4. Drop rows with a repeated ``item_id`` (first occurrence is kept).
    5. Pass each remaining row through ``url_type_fn``.
    6. Add an ``image_path`` column: the file name derived from each
       ``image_urls`` value, placed under ``cnt.DOWNLOADED_IMAGES_PATH``.
    7. Write the URLs, one per line, to ``cnt.URLS_LIST_PATH``.
    8. Write the resulting table as CSV to ``cnt.OUTPUT_FILE_PATH``.

    Returns:
        None. All results are written to disk.
    """
    # With the python engine a multi-character separator is treated as a
    # regular expression, so the two-character string '\\t' matches a tab.
    df = pd.read_csv(cnt.INPUT_FILE_PATH, sep='\\t', engine='python')
    df = df.dropna()

    # Group sizes are counted before de-duplication, exactly as rows appear
    # in the input after the missing-value drop.
    df = df.groupby('product_type').filter(lambda group: len(group) >= 50)
    df = df.drop_duplicates(subset=['item_id'])

    # NOTE(review): url_type_fn is defined elsewhere; it is presumably
    # expected to return the (possibly modified) row -- confirm.
    df = df.apply(url_type_fn, axis=1)

    urls = list(df.image_urls)
    df['image_path'] = [
        os.path.join(cnt.DOWNLOADED_IMAGES_PATH, shutils.url_to_filename(url))
        for url in urls
    ]

    # Explicit UTF-8 keeps the URL list independent of the platform default
    # encoding and consistent with the CSV written below. The context
    # manager closes the file, so no separate close() call is needed.
    with open(cnt.URLS_LIST_PATH, 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in urls)

    df.to_csv(cnt.OUTPUT_FILE_PATH, sep=",", encoding='utf-8')
示例#2
0
def read_input_file():
    """Build the image URL list and the cleaned item table from the raw input.

    Steps, in order:

    1. Read the CSV file at ``cnt.INPUT_FILE_PATH``.
    2. Drop every row that contains a missing value.
    3. Normalise the ``color_category`` value ``"Multi-color"`` to
       ``"Multicolor"``.
    4. Drop rows with a repeated ``item_id`` (first occurrence is kept).
    5. Pass each remaining row through ``url_type_fn``.
    6. Add an ``image_path`` column: the file name derived from each
       ``image_urls`` value, placed under ``cnt.DOWNLOADED_IMAGES_PATH``.
    7. Print the shape of the resulting table.
    8. Write the URLs, one per line, to ``cnt.URLS_LIST_PATH``.
    9. Write the resulting table as CSV to ``cnt.OUTPUT_FILE_PATH``.

    Returns:
        None. All results are written to disk (plus the shape on stdout).
    """
    df = pd.read_csv(cnt.INPUT_FILE_PATH)
    df = df.dropna()

    # NOTE: a minimum-rows-per-color_category filter (>= 100) was sketched
    # here in an earlier revision but is not applied.

    # Assign the replaced column back instead of calling an in-place method
    # on ``df.color_category``: the chained in-place form operates on a
    # temporary Series and does not update ``df`` under pandas Copy-on-Write.
    df['color_category'] = df['color_category'].replace(
        to_replace="Multi-color",
        value="Multicolor",
    )
    df = df.drop_duplicates(subset=['item_id'])

    # NOTE(review): url_type_fn is defined elsewhere; it is presumably
    # expected to return the (possibly modified) row -- confirm.
    df = df.apply(url_type_fn, axis=1)

    urls = list(df.image_urls)
    df['image_path'] = [
        os.path.join(cnt.DOWNLOADED_IMAGES_PATH, shutils.url_to_filename(url))
        for url in urls
    ]
    print(df.shape)

    # Explicit UTF-8 keeps the URL list independent of the platform default
    # encoding and consistent with the CSV written below. The context
    # manager closes the file, so no separate close() call is needed.
    with open(cnt.URLS_LIST_PATH, 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in urls)

    df.to_csv(cnt.OUTPUT_FILE_PATH, sep=",", encoding='utf-8')