import os

import pandas as pd

# Project-local modules: cnt holds path constants, shutils provides url_to_filename().
import cnt
import shutils


def read_input_file():
    # Read the tab-separated input file and drop incomplete rows.
    df = pd.read_csv(cnt.INPUT_FILE_PATH, sep='\\t', engine='python')
    df = df.dropna()

    # Keep only product types with at least 50 items, then de-duplicate items.
    df = df.groupby('product_type').filter(lambda x: len(x) >= 50)
    df.drop_duplicates(subset=['item_id'], inplace=True)

    # Normalize the image URL field on each row (url_type_fn is defined elsewhere).
    df = df.apply(lambda row: url_type_fn(row), axis=1)

    # Map each URL to the local path the image will be downloaded to.
    urls = list(df.image_urls)
    filenames = [os.path.join(cnt.DOWNLOADED_IMAGES_PATH, shutils.url_to_filename(url))
                 for url in urls]
    df['image_path'] = filenames

    # Persist the URL list (one per line) and the processed dataframe.
    with open(cnt.URLS_LIST_PATH, 'w') as f:
        for url in urls:
            f.write(url + '\n')

    df.to_csv(cnt.OUTPUT_FILE_PATH, sep=",", encoding='utf-8')
def read_input_file():
    # Read the comma-separated input file and drop incomplete rows.
    df = pd.read_csv(cnt.INPUT_FILE_PATH)
    df = df.dropna()
    # df = df.groupby('color_category').filter(lambda x: len(x) >= 100)

    # Normalize the color label and de-duplicate items.
    df.color_category.replace(to_replace="Multi-color", value="Multicolor", inplace=True)
    df.drop_duplicates(subset=['item_id'], inplace=True)

    # Normalize the image URL field on each row (url_type_fn is defined elsewhere).
    df = df.apply(lambda row: url_type_fn(row), axis=1)

    # Map each URL to the local path the image will be downloaded to.
    urls = list(df.image_urls)
    filenames = [os.path.join(cnt.DOWNLOADED_IMAGES_PATH, shutils.url_to_filename(url))
                 for url in urls]
    df['image_path'] = filenames
    print(df.shape)

    # Persist the URL list (one per line) and the processed dataframe.
    with open(cnt.URLS_LIST_PATH, 'w') as f:
        for url in urls:
            f.write(url + '\n')

    df.to_csv(cnt.OUTPUT_FILE_PATH, sep=",", encoding='utf-8')
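
# The functions above rely on two helpers whose implementations are not shown here:
# url_type_fn (applied row-wise) and shutils.url_to_filename. The sketches below are
# assumptions for illustration only; the real project code may differ.
from urllib.parse import urlparse


def url_type_fn(row):
    # Assumption: cleans up the image URL stored on the row and returns the row,
    # which is what the row-wise df.apply(..., axis=1) call above expects.
    row.image_urls = row.image_urls.strip()
    return row


def url_to_filename(url):
    # Assumption: maps a URL to a flat, filesystem-safe file name, as
    # shutils.url_to_filename is used above.
    return os.path.basename(urlparse(url).path)


if __name__ == '__main__':
    # Example usage: build the processed CSV and the URL list in one pass.
    read_input_file()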