def pre_valid_(self):
    # Mark rows whose pointlogsid has already validated by writing today's
    # date into column A of the "Youtube collect_experiment" sheet, row by row.
    original_df = self.original_file
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='url')
        original_df_split = original_df[original_df['url'] == url]
        pointlogids = original_df_split[
            original_df_split['pointlogsid'] != '']['pointlogsid'].tolist()
        pointlogids_prevalid = get_df_from_query(
            get_pointlogsid_valid(pointlogids=pointlogids))
        data_merge = pd.merge(original_df_split, pointlogids_prevalid,
                              how='left', left_on='pointlogsid',
                              right_on='id', validate='m:1').fillna(value='None')
        # Keep only rows that matched a validated pointlogsid.
        data_merge = data_merge[data_merge['id'] != 'None']
        for i in data_merge.index:
            # +2 offset: gsheet rows are 1-based and row 1 is the header.
            range_to_update = f"Youtube collect_experiment!A{i + 2}"
            current_date = f"{date.today()}"
            list_result = [[current_date]]
            update_value(list_result=list_result,
                         grid_range_to_update=range_to_update,
                         gsheet_id=get_gsheet_id_from_url(url))
def update_similarity(urls: list, sheet_name: str, start_row: int, stop_row: int):
    # Compute a similarity score for each row of the sheet and write the
    # results to column N, batching 25 rows per gsheet update.
    url = urls[0]
    gsheet_id = get_gsheet_id_from_url(url=url)
    df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    df["DurationMs"] = df["DurationMs"].replace({"": "0"})
    df = df.loc[start_row:stop_row]
    row_index = df.index
    start, stop = row_index[0], row_index[-1] + 1
    step = 25
    for i in range(start, stop, step):
        stop_range = min(i + step, stop)
        batch = []
        for j in range(i, stop_range):
            score = similarity(track_title=df.track_title.loc[j],
                               youtube_url=df.SourceURI.loc[j],
                               formatid=df.FormatID.loc[j],
                               duration=df.DurationMs.loc[j]).get('similarity')
            batch.append([score])
        # +2 offset: gsheet rows are 1-based and row 1 is the header.
        range_to_update = f"{sheet_name}!N{i + 2}"
        update_value(list_result=batch,
                     grid_range_to_update=range_to_update,
                     gsheet_id=gsheet_id)
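# update_similarity (and the script further below) chunk gsheet writes into
# fixed-size row ranges. A minimal, self-contained sketch of that pattern;
# chunk_ranges is illustrative and not part of the original module:
def chunk_ranges(start: int, stop: int, step: int = 25):
    """Yield (lo, hi) half-open ranges covering [start, stop) in `step`-sized chunks."""
    for lo in range(start, stop, step):
        yield lo, min(lo + step, stop)

# e.g. list(chunk_ranges(0, 60, 25)) -> [(0, 25), (25, 50), (50, 60)]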
def creat_new_sheet_and_update_data_from_df(df: object, gsheet_id: str,
                                            new_sheet_name: str):
    '''
    :param df: dataframe. Columns must not be datetime, and NaNs must be
        filled before updating values to gsheet, e.g.
        df.fillna(value='None').astype({"created_at": 'str'})
    :param gsheet_id:
    :param new_sheet_name:
    :return:
    '''
    # If the sheet already exists, delete it first so it is recreated fresh.
    list_of_sheet_title = get_list_of_sheet_title(gsheet_id)
    if new_sheet_name in list_of_sheet_title:
        delete_sheet(gsheet_id, new_sheet_name)
    column_name = df.columns.values.tolist()
    list_result = df.values.tolist()  # transfer data_frame to 2D list
    list_result.insert(0, column_name)  # prepend the header row
    add_sheet(gsheet_id, new_sheet_name)
    range_to_update = f"{new_sheet_name}!A1"
    update_value(
        list_result, range_to_update, gsheet_id
    )  # value type: object, int, category... NOT DATETIME
    # return print("\n complete create new sheet and update data")
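# Minimal usage sketch for creat_new_sheet_and_update_data_from_df. The
# gsheet id and sheet name are placeholders, and `report_df` with a
# `created_at` column is assumed, following the docstring's example:
def _export_report_example(report_df):
    prepared = report_df.fillna(value='None').astype({"created_at": 'str'})
    creat_new_sheet_and_update_data_from_df(
        df=prepared, gsheet_id='<gsheet_id>', new_sheet_name='report')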
def update_c11_check_box(original_df: object, pre_valid: str):
    original_df['url'] = original_df.apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'], key='url')
        if x['pre_valid'] == pre_valid else 'None', axis=1)
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['itune_id'], axis=1)
    original_df['region'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[1]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['region'], axis=1)
    original_df['checking_validate_itune'] = original_df.apply(
        lambda x: check_validate_itune(itune_album_id=x['itune_id'],
                                       itune_region=x['region'])
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['checking_validate_itune'], axis=1)
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='sheet_name')
    url = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='url')
    grid_range_to_update = f"{sheet_name}!AJ2"
    list_result = original_df[
        ['itune_id', 'region', 'checking_validate_itune']
    ].values.tolist()  # transfer data_frame to 2D list
    update_value(list_result=list_result,
                 grid_range_to_update=grid_range_to_update,
                 gsheet_id=get_gsheet_id_from_url(url=url))
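# The repeated apply(...) calls above each mean "recompute a column only on
# rows where pre_valid matches, keep the old value elsewhere". An equivalent
# sketch of one of them using a boolean mask, which avoids the row-wise
# inline condition (illustrative; column names as in update_c11_check_box):
def _update_itune_id_vectorized(original_df, pre_valid: str):
    mask = ((original_df['pre_valid'] == pre_valid)
            & (original_df['itune_album_url'] != ''))
    # Rows outside the mask are left untouched by the .loc assignment.
    original_df.loc[mask, 'itune_id'] = original_df.loc[mask, 'itune_album_url'].apply(
        lambda u: get_itune_id_region_from_itune_url(url=u)[0])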
def check_box_S_11_validate(gsheet_id: str):
    '''
    S_11 = {"sheet_name": "S_11",
            "column_name": ["release_date", "album_title", "album_artist",
                            "itune_album_url", "sportify_album_url"]}
    '''
    sheet_info = sheet_type.S_11
    sheet_name = sheet_info.get('sheet_name')
    column_name = sheet_info.get('column_name')
    S_11_df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    # Step 1: normalize the sheet's column names.
    S_11_df.columns = S_11_df.columns.str.replace('Release_date', 'release_date')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumTitle', 'album_title')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumArtist', 'album_artist')
    S_11_df.columns = S_11_df.columns.str.replace('Itunes_Album_URL', 'itune_album_url')
    S_11_df.columns = S_11_df.columns.str.replace('AlbumURL', 'sportify_album_url')
    S_11_df = S_11_df[column_name].head(10)
    # Step 2: check validate format
    check_format_album_wiki = S_11_df[~(
        (S_11_df['itune_album_url'] == 'not found')
        | (S_11_df['itune_album_url'].str[:32] == 'https://music.apple.com/us/album'))]
    S_11_format_validate = check_format_album_wiki.album_title.str.upper().to_numpy().tolist()
    if S_11_format_validate:
        print(check_format_album_wiki)
        return S_11_format_validate
    # Step 3: check validate itune_url
    else:
        S_11_df['itune_id'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x != 'not found' else 'None')
        S_11_df['region'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[1] if x != 'not found' else 'None')
        S_11_df['checking_validate_itune'] = S_11_df['itune_id'].apply(
            lambda x: check_validate_itune(x) if x != 'None' else 'None')
        S_11_df['token_set_ratio'] = S_11_df.apply(
            lambda x: get_max_ratio(itune_album_id=x['itune_id'],
                                    input_album_title=x['album_title'])
            if x['itune_id'] != 'None' else 'None', axis=1)
        # Step 4: update value
        column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
        updated_df = S_11_df[column_name]
        list_result = updated_df.values.tolist()  # transfer data_frame to 2D list
        list_result.insert(0, column_name)  # prepend the header row
        range_to_update = f"{sheet_name}!M1"
        update_value(list_result, range_to_update,
                     gsheet_id)  # value type: object, int, category... NOT DATETIME
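# get_itune_id_region_from_itune_url is expected to return (itune_id, region)
# from URLs of the form https://music.apple.com/<region>/album/<slug>/<id>,
# which is the format Step 2 validates. A self-contained sketch of that
# parsing, assuming the real helper lives elsewhere in this repo:
def _parse_itune_album_url_example(url: str):
    from urllib.parse import urlparse
    # Path looks like: /<region>/album/<slug>/<id>
    parts = urlparse(url).path.strip('/').split('/')
    region, album_id = parts[0], parts[-1]
    return album_id, region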
# Script form of update_similarity for the 'mp_3_3' sheet; gsheet_id is
# assumed to be defined earlier in the script.
start_time = time.time()
sheet_name = 'mp_3_3'
df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
df["DurationMs"] = df["DurationMs"].replace({"": "0"})
df = df.loc[8148:9000]
row_index = df.index
start, stop = row_index[0], row_index[-1] + 1
step = 25
for i in range(start, stop, step):
    stop_range = min(i + step, stop)
    batch = []
    for j in range(i, stop_range):
        score = similarity(track_title=df.track_title.loc[j],
                           youtube_url=df.SourceURI.loc[j],
                           formatid=df.FormatID.loc[j],
                           duration=df.DurationMs.loc[j]).get('similarity')
        batch.append([score])
    range_to_update = f"{sheet_name}!N{i + 2}"
    update_value(list_result=batch, grid_range_to_update=range_to_update,
                 gsheet_id=gsheet_id)
print("--- %s seconds ---" % (time.time() - start_time))
def checking_c11_crawler_status(original_df: object, pre_valid: str = None):
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if x['itune_album_url'] not in ('None', '', 'not found', 'non', 'nan', 'Itunes_Album_Link')
        else x['itune_id'], axis=1)
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='gsheet_name')
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='sheet_name')
        PIC_taskdetail = f"{gsheet_name}_{sheet_name}_{pre_valid}"
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info, key='url')
        original_df_split = original_df[original_df['url'] == url].reset_index()
        # Poll the crawler status: at most 300 tries, 10 s apart.
        count = 0
        while count < 300:
            checking_accuracy_result = get_df_from_query(
                get_s11_crawlingtask_info(pic=PIC_taskdetail))
            checking_accuracy_result['itune_album_id'] = checking_accuracy_result[
                'itune_album_id'].apply(lambda x: x.strip('"'))
            # Rows still in-flight: everything except complete/complete,
            # 06 incomplete, and complete/incomplete.
            result = checking_accuracy_result[~(
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['E5_status'] == 'complete'))
                | (checking_accuracy_result['06_status'] == 'incomplete')
                | ((checking_accuracy_result['06_status'] == 'complete')
                   & (checking_accuracy_result['E5_status'] == 'incomplete')))]
            if result.empty:
                print(Fore.LIGHTYELLOW_EX
                      + f"File: {gsheet_name}, sheet_name: {sheet_name} has been crawled complete already"
                      + Style.RESET_ALL)
                data_merge = pd.merge(original_df_split, checking_accuracy_result,
                                      how='left', left_on='itune_id',
                                      right_on='itune_album_id',
                                      validate='m:1').fillna(value='None')
                data_merge['06_id_x'] = data_merge.apply(
                    lambda x: x['06_id_y'] if x['pre_valid'] == pre_valid else x['06_id_x'], axis=1)
                data_merge['06_status_x'] = data_merge.apply(
                    lambda x: x['06_status_y'] if x['pre_valid'] == pre_valid else x['06_status_x'], axis=1)
                data_merge['e5_id'] = data_merge.apply(
                    lambda x: x['E5_id'] if x['pre_valid'] == pre_valid else x['e5_id'], axis=1)
                data_merge['e5_status'] = data_merge.apply(
                    lambda x: x['E5_status'] if x['pre_valid'] == pre_valid else x['e5_status'], axis=1)
                data_merge.columns = data_merge.columns.str.replace('06_id_x', '06_id')
                data_merge.columns = data_merge.columns.str.replace('06_status_x', '06_status')
                data_merge = data_merge[original_df_split.columns]
                # update data report:
                data_report = data_merge[data_merge['pre_valid'] == pre_valid]
                data_report = data_report[~(
                    ((data_report['itune_album_url'].isin(['not found', '']))
                     & (data_report['06_status'] == 'None')
                     & (data_report['e5_status'] == 'None'))
                    | ((~data_report['itune_album_url'].isin(['not found', '']))
                       & (data_report['06_status'] == 'complete')
                       & (data_report['e5_status'] == 'complete')))]
                if data_report.empty:
                    print(Fore.LIGHTYELLOW_EX + "Accuracy: ok\nStatus: ok" + Style.RESET_ALL)
                    for i in data_merge.index:
                        if data_merge['pre_valid'].loc[i] == pre_valid:
                            itune_album_id = data_merge['itune_id'].loc[i]
                            seq = data_merge['track_title/track_num'].loc[i]
                            format_id = get_format_id_from_content_type(
                                content_type=data_merge['content type'].loc[i])
                            youtube_url = data_merge['contribution_link'].loc[i]
                            db_track = get_track_title_track_artist_by_ituneid_and_seq(
                                itune_album_id=itune_album_id, seq=seq)
                            if db_track:
                                track_title = db_track.title
                                track_id = db_track.id
                                track_duration = db_track.duration_ms
                                track_similarity = similarity(
                                    track_title=track_title,
                                    youtube_url=youtube_url,
                                    formatid=format_id,
                                    duration=track_duration).get('similarity')
                            else:
                                track_title = 'not found'
                                track_id = 'not found'
                                track_similarity = 'not found'
                            data_merge.loc[i, 'track_title'] = track_title
                            data_merge.loc[i, 'track_id'] = track_id
                            data_merge.loc[i, 'similarity'] = track_similarity
                    updated_columns = ['06_id', '06_status', 'e5_id', 'e5_status',
                                       'track_title', 'track_id', 'similarity']
                    print(data_merge[updated_columns])
                else:
                    print(Fore.LIGHTYELLOW_EX + "Accuracy: not ok\nStatus: not ok" + Style.RESET_ALL)
                    updated_columns = ['06_id', '06_status', 'e5_id', 'e5_status']
                # update data to gsheet
                data_updated = data_merge[updated_columns]
                grid_range_to_update = f"{sheet_name}!AM2"
                list_result = data_updated.values.tolist()  # transfer data_frame to 2D list
                update_value(list_result=list_result,
                             grid_range_to_update=grid_range_to_update,
                             gsheet_id=get_gsheet_id_from_url(url=url))
                break
            else:
                count += 1
                print(Fore.LIGHTYELLOW_EX
                      + f"File: {gsheet_name}, sheet_name: {sheet_name} hasn't been crawled complete"
                      + Style.RESET_ALL)
                time.sleep(10)
                print(count, "-----", result)
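# The status loop above polls up to 300 times with a 10 s sleep between
# tries. The same control flow as a reusable sketch; poll_until is
# illustrative and not part of the original module:
def poll_until(check, attempts: int = 300, delay_s: int = 10) -> bool:
    """Call `check()` until it returns truthy or `attempts` runs out."""
    for _ in range(attempts):
        if check():
            return True
        time.sleep(delay_s)
    return False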