def update_d9(self, pre_valid):
    # NOTE: `pre_valid` was previously read from an undefined name; it is now
    # an explicit parameter. `query_path` is assumed to be a module-level
    # constant pointing at the output query file.
    filter_df = self.original_file
    filter_df = filter_df[(
        (filter_df['pre_valid'] == pre_valid)
        # & (~filter_df['content type'].str.contains('REJECT'))
        # & (filter_df['track_id'] != 'not found')
    )].reset_index()
    gsheet_info = list(set(filter_df.gsheet_info.tolist()))[0]
    gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                 key='gsheet_name')
    sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                key='sheet_name')
    PIC_taskdetail = f"{gsheet_name}_{sheet_name}_{pre_valid}"
    # Build one contribution-update query per row.
    filter_df['crawling_task'] = filter_df.apply(
        lambda x: update_contribution(
            content_type=x['content type'],
            track_id=x['track_id'],
            concert_live_name=x['live_concert_name_place'],
            artist_name=x['artist_name'],
            year=x['year'],
            pic=PIC_taskdetail,
            youtube_url=x['contribution_link'],
            other_official_version=x['official_music_video_2'],
            pointlogsid=x['pointlogsid']),
        axis=1)
    # Write one query per line. The context manager closes the file, so the
    # redundant f.close() from the original was dropped.
    with open(query_path, "w") as f:
        for i in filter_df.index:
            line = filter_df['crawling_task'].loc[i]
            f.write(f"{line}\n")
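# Usage sketch (hypothetical): with pre_valid now an explicit parameter, a
# caller filters one batch and dumps its queries to query_path. The handler
# class name below is illustrative, not from this repo.
#
#     handler = GsheetHandler(original_file=df)
#     handler.update_d9(pre_valid='2023-01-01')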
def pre_valid_(self):
    original_df = self.original_file
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        original_df_split = original_df[original_df['url'] == url]
        pointlogids = original_df_split[
            original_df_split['pointlogsid'] != '']['pointlogsid'].tolist()
        pointlogids_prevalid = get_df_from_query(
            get_pointlogsid_valid(pointlogids=pointlogids))
        data_merge = pd.merge(original_df_split,
                              pointlogids_prevalid,
                              how='left',
                              left_on='pointlogsid',
                              right_on='id',
                              validate='m:1').fillna(value='None')
        data_merge = data_merge[data_merge['id'] != 'None']
        # Stamp today's date next to each row with a valid pointlogsid.
        # Row i of the merged frame maps to sheet row i + 2 (one for the
        # header row, one because sheet rows are 1-based).
        row_index = data_merge.index
        for i in row_index:
            range_to_update = f"Youtube collect_experiment!A{i + 2}"
            current_date = f"{date.today()}"
            list_result = [[current_date]]
            update_value(list_result=list_result,
                         grid_range_to_update=range_to_update,
                         gsheet_id=get_gsheet_id_from_url(url))
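# Illustrative sketch only: the real get_key_value_from_gsheet_info is
# defined elsewhere in this repo. Assuming gsheet_info is a JSON-encoded
# dict of sheet metadata (an assumption, not confirmed by this module), a
# minimal equivalent would look like this:
import json


def get_key_value_from_gsheet_info_sketch(gsheet_info: str, key: str):
    """Return one field (e.g. 'url', 'gsheet_name') from serialized metadata."""
    return json.loads(gsheet_info).get(key)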
def crawl_image_datalake(self, when_exists: str = WhenExist.REPLACE):
    df = self.image_filter()
    if df.empty:
        print(Fore.LIGHTYELLOW_EX + "Image file is empty" + Style.RESET_ALL)
    else:
        df['query'] = df.apply(
            lambda x: crawl_image(
                object_type=get_key_value_from_gsheet_info(
                    gsheet_info=x['gsheet_info'], key='object_type'),
                url=x['url_to_add'],
                objectid=x['uuid'],
                when_exists=when_exists,
                pic=(f"{get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='gsheet_name')}"
                     f"_{get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='sheet_name')}"),
                priority=get_key_value_from_gsheet_info(
                    gsheet_info=x['gsheet_info'], key='page_priority')),
            axis=1)
        query_pandas_to_csv(df=df, column='query')
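# Hedged sketch of the query_pandas_to_csv contract assumed above: one
# generated query string per row, written with no header or index. The real
# helper lives elsewhere in this repo; the path argument is hypothetical.
def query_pandas_to_csv_sketch(df, column: str, path: str = 'queries.csv'):
    df[column].to_csv(path, index=False, header=False)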
def upload_image_cant_crawl(checking_accuracy_result: object, sheet_name: str):
    gsheet_infos = list(set(checking_accuracy_result.gsheet_info.tolist()))
    df_incomplete = checking_accuracy_result[(
        checking_accuracy_result['status'] == 'incomplete'
    )].reset_index().copy()
    df_incomplete['url'] = df_incomplete['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    df_incomplete['url_to_add'] = ''
    if sheet_name == SheetNames.ARTIST_IMAGE:
        df_incomplete['name'] = df_incomplete['uuid'].apply(
            lambda x: artist.get_one_by_id(artist_uuid=x).name)
        columns = ['uuid', 'name', 'status', 'crawlingtask_id', 'url', 'memo',
                   'url_to_add']
    else:
        # Album sheets: look the record up through the album repository.
        # (The original called artist.get_one_by_id here, which cannot yield
        # an album title; album.get_one_by_id is assumed to be intended.)
        df_incomplete['title'] = df_incomplete['uuid'].apply(
            lambda x: album.get_one_by_id(album_uuid=x).title)
        df_incomplete['artist'] = df_incomplete['uuid'].apply(
            lambda x: album.get_one_by_id(album_uuid=x).artist)
        # The original selected a fixed column list including 'name', which
        # would raise a KeyError on this branch; select per branch instead.
        columns = ['uuid', 'title', 'artist', 'status', 'crawlingtask_id',
                   'url', 'memo', 'url_to_add']
    df_incomplete = df_incomplete[columns]
    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        df_incomplete_to_upload = df_incomplete[
            df_incomplete['url'] == url].reset_index()
        # Row count for this sheet (the original referenced a garbled
        # attribute here; the length of the index is assumed intended).
        count_incomplete = len(df_incomplete_to_upload.index)
        all_uploaded = df_incomplete_to_upload['status'].tolist() == []
        if all_uploaded:
            raw_df_to_upload = {
                'status': ['Upload 100% successful, team ^ - ^']
            }
            df_to_upload = pd.DataFrame(data=raw_df_to_upload)
        else:
            df_to_upload = df_incomplete_to_upload.drop(['url', 'index'],
                                                        axis=1)
        new_sheet_name = f"{sheet_name} cant upload"
        print(df_to_upload)
        creat_new_sheet_and_update_data_from_df(df_to_upload,
                                                get_gsheet_id_from_url(url),
                                                new_sheet_name)
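# Usage sketch (the result frame is hypothetical; it must carry the
# gsheet_info, status, uuid, crawlingtask_id, and memo columns consumed
# above):
#
#     upload_image_cant_crawl(checking_accuracy_result=result_df,
#                             sheet_name=SheetNames.ARTIST_IMAGE)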
def similarity(self):
    df = self.original_file
    df['similarity'] = ''
    df['note'] = ''
    if self.sheet_name == SheetNames.MP3_SHEET_NAME:
        format_id = DataSourceFormatMaster.FORMAT_ID_MP3_FULL
    elif self.sheet_name == SheetNames.MP4_SHEET_NAME:
        format_id = DataSourceFormatMaster.FORMAT_ID_MP4_FULL
    else:
        # Bail out early: without a format_id the loop below would raise a
        # NameError (the original fell through with a bare `pass`).
        print("format_id not supported")
        return
    gsheet_info = list(set(df.gsheet_info.tolist()))[0]
    sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                key='sheet_name')
    url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='url')
    for i in df.index:
        if df['memo'].loc[i] == 'added':
            trackid = df['track_id'].loc[i]
            youtube_url = df['url_to_add'].loc[i]
            db_track = get_one_track_by_id(track_id=trackid)
            if db_track:
                track_title = db_track.title
                track_duration = db_track.duration_ms
                # Calls the module-level similarity() helper, not this method.
                track_similarity = similarity(
                    track_title=track_title,
                    youtube_url=youtube_url,
                    formatid=format_id,
                    duration=track_duration).get('similarity')
            else:
                track_similarity = 'not found'
            df.loc[i, 'similarity'] = track_similarity
    update_value_at_last_column(df_to_update=df[['similarity', 'note']],
                                gsheet_id=get_gsheet_id_from_url(url=url),
                                sheet_name=sheet_name)
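# Illustrative stand-in only: the real similarity() helper (title vs.
# YouTube candidate, plus duration) is defined elsewhere in this repo. A
# naive title comparison with difflib shows the shape of the returned dict:
from difflib import SequenceMatcher


def similarity_sketch(track_title: str, candidate_title: str) -> dict:
    # Ratio in [0, 1]; 1.0 means the titles match exactly (case-insensitive).
    ratio = SequenceMatcher(None, track_title.lower(),
                            candidate_title.lower()).ratio()
    return {'similarity': round(ratio, 3)}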
def crawl_c11_datalake(self):
    df = self.c11_filter()
    # NewClassic pages are flagged as new releases.
    is_new_release = self.page_type.name == "NewClassic"
    if df.empty:
        print(Fore.LIGHTYELLOW_EX + "c11 file is empty" + Style.RESET_ALL)
    else:
        df['query'] = df.apply(
            lambda x: crawl_itunes_album(
                ituneid=x['itune_id'],
                priority=get_key_value_from_gsheet_info(
                    gsheet_info=x['gsheet_info'], key='page_priority'),
                is_new_release=is_new_release,
                pic=(f"{get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='gsheet_name')}"
                     f"_{get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='sheet_name')}"
                     f"_{x['pre_valid']}"),
                region=x['region']),
            axis=1)
        query_pandas_to_csv(df=df, column='query')
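# Usage sketch (hypothetical handler object): queries are appended to the
# crawl CSV only when the C11 filter returns rows.
#
#     handler.crawl_c11_datalake()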