Example #1
def pre_valid_(self):
    # Requires pandas as pd and datetime.date, plus the repo's gsheet helpers.
    original_df = self.original_file
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        original_df_split = original_df[original_df['url'] == url]
        pointlogids = original_df_split[original_df_split['pointlogsid'] !=
                                        '']['pointlogsid'].tolist()
        pointlogids_prevalid = get_df_from_query(
            get_pointlogsid_valid(pointlogids=pointlogids))
        data_merge = pd.merge(original_df_split,
                              pointlogids_prevalid,
                              how='left',
                              left_on='pointlogsid',
                              right_on='id',
                              validate='m:1').fillna(value='None')
        # Keep only rows whose pointlogsid matched a valid record.
        data_merge = data_merge[data_merge['id'] != 'None']
        gsheet_id = get_gsheet_id_from_url(url)
        for i in data_merge.index:
            # Sheet names containing spaces should be single-quoted in A1 notation.
            range_to_update = f"'Youtube collect_experiment'!A{i + 2}"
            current_date = f"{date.today()}"
            list_result = [[current_date]]
            update_value(list_result=list_result,
                         grid_range_to_update=range_to_update,
                         gsheet_id=gsheet_id)
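
This and the later examples lean on get_key_value_from_gsheet_info to pull fields such as url or sheet_name out of a gsheet_info value. Its implementation isn't shown in this listing; a minimal sketch, assuming gsheet_info is a JSON-encoded dict, could be:

import json

def get_key_value_from_gsheet_info(gsheet_info: str, key: str):
    # Hypothetical sketch: assumes gsheet_info is a JSON-encoded dict like
    # '{"url": "...", "sheet_name": "...", "gsheet_name": "..."}'.
    return json.loads(gsheet_info).get(key)
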
Example #2
def update_similarity(urls: list, sheet_name: str, start_row: int, stop_row: int):
    url = urls[0]
    gsheet_id = get_gsheet_id_from_url(url=url)
    df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    df["DurationMs"] = df["DurationMs"].replace({"": "0"})
    df = df.loc[start_row:stop_row]
    row_index = df.index
    start = row_index[0]      # first label of the selected slice (assumes a contiguous int index)
    stop = row_index[-1] + 1  # one past the last label
    step = 25                 # rows per batched update_value call
    for i in range(start, stop, step):
        stop_range = min(i + step, stop)
        list_result = []
        for j in range(i, stop_range):
            track_title = df.track_title.loc[j]
            SourceURI = df.SourceURI.loc[j]
            FormatID = df.FormatID.loc[j]
            DurationMs = df.DurationMs.loc[j]
            sim = similarity(track_title=track_title, youtube_url=SourceURI,
                             formatid=FormatID, duration=DurationMs).get('similarity')
            list_result.append([sim])
        # Write one block of up to 25 rows per API call instead of one call per row.
        range_to_update = f"{sheet_name}!N{i + 2}"
        update_value(list_result=list_result, grid_range_to_update=range_to_update,
                     gsheet_id=gsheet_id)
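
update_similarity writes its scores in 25-row blocks, one update_value call per block rather than one per cell, which keeps the Sheets API write count low. A call might look like this (the URL and row bounds are illustrative):

update_similarity(
    urls=["https://docs.google.com/spreadsheets/d/<gsheet_id>/edit"],  # illustrative
    sheet_name="mp_3_3",
    start_row=0,
    stop_row=500,
)
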
Example #3
def creat_new_sheet_and_update_data_from_df(df: object, gsheet_id: str,
                                            new_sheet_name: str):
    '''
    :param df: dataframe with no datetime columns; fillna before updating
        values to the gsheet, e.g. df.fillna(value='None').astype({"created_at": 'str'})
    :param gsheet_id: id of the target spreadsheet
    :param new_sheet_name: title of the sheet to (re)create
    :return: None
    '''
    # Recreate the sheet from scratch if it already exists.
    list_of_sheet_title = get_list_of_sheet_title(gsheet_id)
    if new_sheet_name in list_of_sheet_title:
        delete_sheet(gsheet_id, new_sheet_name)

    column_name = df.columns.values.tolist()
    list_result = df.values.tolist()  # transfer data_frame to 2D list
    list_result.insert(0, column_name)  # prepend the header row

    add_sheet(gsheet_id, new_sheet_name)
    range_to_update = f"{new_sheet_name}!A1"
    update_value(
        list_result, range_to_update, gsheet_id
    )  # value types: object, int, category... NOT DATETIME
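
As the docstring warns, datetime columns have to be stringified and NaNs filled before the values reach the Sheets API, presumably because update_value only handles JSON-serializable cells. A typical call, following the docstring's own example (the sheet name is illustrative):

export_df = df.fillna(value='None').astype({"created_at": 'str'})
creat_new_sheet_and_update_data_from_df(df=export_df,
                                        gsheet_id=gsheet_id,
                                        new_sheet_name='report')  # illustrative name
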
def update_c11_check_box(original_df: object, pre_valid: str):
    original_df['url'] = original_df.apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x['gsheet_info'],
                                                 key='url')
        if x['pre_valid'] == pre_valid else 'None',
        axis=1)
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['itune_id'],
        axis=1)
    original_df['region'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[1]
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['region'],
        axis=1)

    original_df['checking_validate_itune'] = original_df.apply(
        lambda x: check_validate_itune(itune_album_id=x['itune_id'],
                                       itune_region=x['region'])
        if (x['itune_album_url'] != '' and x['pre_valid'] == pre_valid)
        else x['checking_validate_itune'],
        axis=1)

    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='sheet_name')
    url = get_key_value_from_gsheet_info(gsheet_info=gsheet_infos[0], key='url')
    grid_range_to_update = f"{sheet_name}!AJ2"
    list_result = original_df[
        ['itune_id', 'region', 'checking_validate_itune']].values.tolist()  # transfer data_frame to 2D list
    update_value(list_result=list_result, grid_range_to_update=grid_range_to_update,
                 gsheet_id=get_gsheet_id_from_url(url=url))
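
Note that grid_range_to_update only pins the top-left cell (AJ2); the Sheets API extends the write to match the shape of list_result, so the three columns land in AJ:AL. A hedged usage sketch, assuming original_df already carries the gsheet_info and pre_valid columns the repo's loaders attach:

original_df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name='S_11')  # illustrative source
update_c11_check_box(original_df=original_df, pre_valid='pre_valid')  # illustrative flag value
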
Example #5
def check_box_S_11_validate(gsheet_id: str):
    '''
    S_11 = {"sheet_name": "S_11",
            "column_name": ["release_date", "album_title", "album_artist", "itune_album_url", "sportify_album_url"]}
    '''

    sheet_info = sheet_type.S_11
    sheet_name = sheet_info.get('sheet_name')
    column_name = sheet_info.get('column_name')
    S_11_df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    # Normalize the sheet's header names to the repo's column names.
    S_11_df = S_11_df.rename(columns={'Release_date': 'release_date',
                                      'AlbumTitle': 'album_title',
                                      'AlbumArtist': 'album_artist',
                                      'Itunes_Album_URL': 'itune_album_url',
                                      'AlbumURL': 'sportify_album_url'})
    S_11_df = S_11_df[column_name].head(10)

    # Step 2: validate the iTunes URL format.

    check_format_album_wiki = S_11_df[~((S_11_df['itune_album_url'] == 'not found')
                                        | (S_11_df['itune_album_url'].str[:32] == 'https://music.apple.com/us/album'))]
    S_11_format_validate = check_format_album_wiki.album_title.str.upper().tolist()
    if S_11_format_validate:
        print(check_format_album_wiki)
        return S_11_format_validate
    # Step 3: validate the itune_url contents.
    else:
        S_11_df['itune_id'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[0] if x != 'not found' else 'None')
        S_11_df['region'] = S_11_df['itune_album_url'].apply(
            lambda x: get_itune_id_region_from_itune_url(url=x)[1] if x != 'not found' else 'None')
        S_11_df['checking_validate_itune'] = S_11_df['itune_id'].apply(
            lambda x: check_validate_itune(x) if x != 'None' else 'None')
        S_11_df['token_set_ratio'] = S_11_df.apply(
            lambda x: get_max_ratio(itune_album_id=x['itune_id'],
                                    input_album_title=x['album_title'])
            if x['itune_id'] != 'None' else 'None', axis=1)

        # Step 4: update values.
        column_name = ['itune_id', 'region', 'checking_validate_itune', 'token_set_ratio']
        updated_df = S_11_df[column_name]

        list_result = updated_df.values.tolist()  # transfer data_frame to 2D list
        list_result.insert(0, column_name)  # prepend the header row
        range_to_update = f"{sheet_name}!M1"
        update_value(list_result, range_to_update,
                     gsheet_id)  # value types: object, int, category... NOT DATETIME
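
check_box_S_11_validate returns the offending album titles when the Step-2 format check fails, and implicitly returns None after writing results when every URL passes, so a caller can branch on the result:

bad_titles = check_box_S_11_validate(gsheet_id="<gsheet_id>")  # illustrative id
if bad_titles:
    print(f"{len(bad_titles)} album(s) have malformed iTunes URLs:", bad_titles)
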
Example #6
    # Snippet from a timing run; assumes gsheet_id and start_time = time.time()
    # are defined earlier in the enclosing function.
    sheet_name = 'mp_3_3'
    df = get_df_from_speadsheet(gsheet_id=gsheet_id, sheet_name=sheet_name)
    df["DurationMs"] = df["DurationMs"].replace({"": "0"})
    df = df.loc[8148:9000]
    row_index = df.index
    start = row_index[0]      # first label of the selected slice
    stop = row_index[-1] + 1  # one past the last label
    step = 25
    for i in range(start, stop, step):
        stop_range = min(i + step, stop)
        list_result = []
        for j in range(i, stop_range):
            track_title = df.track_title.loc[j]
            SourceURI = df.SourceURI.loc[j]
            FormatID = df.FormatID.loc[j]
            DurationMs = df.DurationMs.loc[j]
            sim = similarity(track_title=track_title,
                             youtube_url=SourceURI,
                             formatid=FormatID,
                             duration=DurationMs).get('similarity')
            list_result.append([sim])
        range_to_update = f"{sheet_name}!N{i + 2}"
        update_value(list_result=list_result,
                     grid_range_to_update=range_to_update,
                     gsheet_id=gsheet_id)
    print("--- %s seconds ---" % (time.time() - start_time))
def checking_c11_crawler_status(original_df: object, pre_valid: str = None):
    # Placeholder strings that mean "no usable iTunes album URL".
    invalid_urls = ('None', '', 'not found', 'non', 'nan', 'Itunes_Album_Link')
    original_df['itune_id'] = original_df.apply(
        lambda x: get_itune_id_region_from_itune_url(url=x['itune_album_url'])[0]
        if x['itune_album_url'] not in invalid_urls else x['itune_id'],
        axis=1)
    original_df['url'] = original_df['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    gsheet_infos = list(set(original_df.gsheet_info.tolist()))
    for gsheet_info in gsheet_infos:
        gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                     key='gsheet_name')
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                    key='sheet_name')
        PIC_taskdetail = f"{gsheet_name}_{sheet_name}_{pre_valid}"
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        original_df_split = original_df[original_df['url'] == url].reset_index()
        count = 0
        while count < 300:  # poll at most 300 times, 10 s apart (~50 min)
            checking_accuracy_result = get_df_from_query(
                get_s11_crawlingtask_info(pic=PIC_taskdetail))
            checking_accuracy_result['itune_album_id'] = (
                checking_accuracy_result['itune_album_id'].apply(
                    lambda x: x.strip('"')))
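            # Rows whose crawl hasn't reached a terminal state yet; the
            # sheet counts as done when this frame is empty.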
            result = checking_accuracy_result[~(
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['E5_status'] == 'complete')) |
                (checking_accuracy_result['06_status'] == 'incomplete') |
                ((checking_accuracy_result['06_status'] == 'complete')
                 & (checking_accuracy_result['E5_status'] == 'incomplete')))]
            if result.empty:
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} has already been crawled completely"
                    + Style.RESET_ALL)

                data_merge = pd.merge(original_df_split,
                                      checking_accuracy_result,
                                      how='left',
                                      left_on='itune_id',
                                      right_on='itune_album_id',
                                      validate='m:1').fillna(value='None')
                data_merge['06_id_x'] = data_merge.apply(
                    lambda x: x['06_id_y']
                    if x['pre_valid'] == pre_valid else x['06_id_x'],
                    axis=1)
                data_merge['06_status_x'] = data_merge.apply(
                    lambda x: x['06_status_y']
                    if x['pre_valid'] == pre_valid else x['06_status_x'],
                    axis=1)
                data_merge['e5_id'] = data_merge.apply(
                    lambda x: x['E5_id']
                    if x['pre_valid'] == pre_valid else x['e5_id'],
                    axis=1)
                data_merge['e5_status'] = data_merge.apply(
                    lambda x: x['E5_status']
                    if x['pre_valid'] == pre_valid else x['e5_status'],
                    axis=1)
                data_merge.columns = data_merge.columns.str.replace(
                    '06_id_x', '06_id')
                data_merge.columns = data_merge.columns.str.replace(
                    '06_status_x', '06_status')
                data_merge = data_merge[original_df_split.columns]

                # update data report:
                data_report = data_merge[data_merge['pre_valid'] == pre_valid]

                data_report = data_report[~(
                    ((data_report['itune_album_url'].isin(['not found', '']))
                     & (data_report['06_status'] == 'None')
                     & (data_report['e5_status'] == 'None'))
                    |
                    ((~data_report['itune_album_url'].isin(['not found', '']))
                     & (data_report['06_status'] == 'complete')
                     & (data_report['e5_status'] == 'complete')))]
                if data_report.empty:
                    print(Fore.LIGHTYELLOW_EX + "Accuracy: ok\nStatus: ok" +
                          Style.RESET_ALL)
                    row_num = data_merge.index
                    for i in row_num:
                        if data_merge['pre_valid'].loc[i] == pre_valid:
                            itune_album_id = data_merge['itune_id'].loc[i]
                            seq = data_merge['track_title/track_num'].loc[i]
                            format_id = get_format_id_from_content_type(
                                content_type=data_merge['content type'].loc[i])
                            youtube_url = data_merge['contribution_link'].loc[i]
                            db_track = get_track_title_track_artist_by_ituneid_and_seq(
                                itune_album_id=itune_album_id, seq=seq)
                            if db_track:
                                track_title = db_track.title
                                track_id = db_track.id
                                track_duration = db_track.duration_ms
                                track_similarity = similarity(
                                    track_title=track_title,
                                    youtube_url=youtube_url,
                                    formatid=format_id,
                                    duration=track_duration).get('similarity')
                            else:
                                track_title = 'not found'
                                track_id = 'not found'
                                track_similarity = 'not found'
                            data_merge.loc[i, 'track_title'] = track_title
                            data_merge.loc[i, 'track_id'] = track_id
                            data_merge.loc[i, 'similarity'] = track_similarity
                    updated_columns = [
                        '06_id', '06_status', 'e5_id', 'e5_status',
                        'track_title', 'track_id', 'similarity'
                    ]
                    print(data_merge[updated_columns])
                else:
                    print(Fore.LIGHTYELLOW_EX +
                          "Accuracy: not ok\nStatus: not ok" +
                          Style.RESET_ALL)
                    updated_columns = [
                        '06_id', '06_status', 'e5_id', 'e5_status'
                    ]
                # update data to gsheet
                data_updated = data_merge[updated_columns]
                grid_range_to_update = f"{sheet_name}!AM2"
                list_result = data_updated.values.tolist()  # transfer data_frame to 2D list
                update_value(list_result=list_result,
                             grid_range_to_update=grid_range_to_update,
                             gsheet_id=get_gsheet_id_from_url(url=url))
                break
            else:
                count += 1
                print(
                    Fore.LIGHTYELLOW_EX +
                    f"File: {gsheet_name}, sheet_name: {sheet_name} hasn't been crawled completely yet"
                    + Style.RESET_ALL)
                time.sleep(10)
                print(count, "-----", result)
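
The completion check in checking_c11_crawler_status is a bounded poll: query, test result.empty, sleep 10 s, and retry up to 300 times (roughly 50 minutes per spreadsheet). The same pattern, factored into a reusable helper, might look like this sketch, where is_done stands for any zero-argument predicate:

import time

def wait_until(is_done, max_attempts: int = 300, interval_s: float = 10.0) -> bool:
    """Poll is_done() until it returns True or the attempts run out."""
    for attempt in range(1, max_attempts + 1):
        if is_done():
            return True
        time.sleep(interval_s)
        print(f"attempt {attempt}/{max_attempts}: not done yet")
    return False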