Example #1
    def update_d9(self, pre_valid, query_path):
        # pre_valid and query_path are taken as explicit parameters here;
        # the original referenced them as undefined free variables.
        filter_df = self.original_file
        filter_df = filter_df[
            (filter_df['pre_valid'] == pre_valid)
            # & (~filter_df['content type'].str.contains('REJECT'))
            # & (filter_df['track_id'] != 'not found')
        ].reset_index()
        gsheet_info = list(set(filter_df.gsheet_info.tolist()))[0]
        gsheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                     key='gsheet_name')
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                    key='sheet_name')
        pic_taskdetail = f"{gsheet_name}_{sheet_name}_{pre_valid}"
        filter_df['crawling_task'] = filter_df.apply(
            lambda x: update_contribution(
                content_type=x['content type'],
                track_id=x['track_id'],
                concert_live_name=x['live_concert_name_place'],
                artist_name=x['artist_name'],
                year=x['year'],
                pic=pic_taskdetail,
                youtube_url=x['contribution_link'],
                other_official_version=x['official_music_video_2'],
                pointlogsid=x['pointlogsid']),
            axis=1)

        # Write one crawling task per line; the with-block closes the file,
        # so no explicit close() is needed.
        with open(query_path, "w") as f:
            for line in filter_df['crawling_task']:
                f.write(f"{line}\n")
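Every example here leans on get_key_value_from_gsheet_info, which is not shown. A minimal sketch of what it might do, assuming gsheet_info is a JSON-encoded dict; the project's real helper may parse a different structure:

import json

def get_key_value_from_gsheet_info(gsheet_info, key):
    # Hypothetical sketch: treats gsheet_info as a JSON-encoded dict like
    # '{"gsheet_name": "...", "sheet_name": "...", "url": "..."}'.
    return json.loads(gsheet_info).get(key)

info = '{"gsheet_name": "crawl", "sheet_name": "mp3", "url": "https://..."}'
print(get_key_value_from_gsheet_info(gsheet_info=info, key='sheet_name'))  # mp3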
Example #2
    def pre_valid_(self):
        original_df = self.original_file
        original_df['url'] = original_df['gsheet_info'].apply(
            lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
        gsheet_infos = list(set(original_df.gsheet_info.tolist()))
        for gsheet_info in gsheet_infos:
            url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                 key='url')
            original_df_split = original_df[original_df['url'] == url]
            pointlogids = original_df_split[
                original_df_split['pointlogsid'] != '']['pointlogsid'].tolist()
            pointlogids_prevalid = get_df_from_query(
                get_pointlogsid_valid(pointlogids=pointlogids))
            data_merge = pd.merge(original_df_split,
                                  pointlogids_prevalid,
                                  how='left',
                                  left_on='pointlogsid',
                                  right_on='id',
                                  validate='m:1').fillna(value='None')
            data_merge = data_merge[data_merge['id'] != 'None']
            # The merge yields a fresh RangeIndex, so i + 2 maps row i of the
            # split frame to its sheet row (row 1 is the header row).
            for i in data_merge.index:
                range_to_update = f"Youtube collect_experiment!A{i + 2}"
                current_date = f"{date.today()}"
                list_result = [[current_date]]
                update_value(list_result=list_result,
                             grid_range_to_update=range_to_update,
                             gsheet_id=get_gsheet_id_from_url(url))
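The validate='m:1' flag in that merge is worth pausing on: pandas raises MergeError if the right-hand keys are not unique, which guards against silent row duplication during the join. A self-contained illustration:

import pandas as pd

left = pd.DataFrame({'pointlogsid': ['a', 'a', 'b'], 'row': [1, 2, 3]})
right = pd.DataFrame({'id': ['a', 'b'], 'valid': [True, False]})

# Raises pandas.errors.MergeError if 'id' held duplicates; otherwise a
# plain left join, with unmatched left rows becoming NaN.
merged = pd.merge(left, right, how='left',
                  left_on='pointlogsid', right_on='id', validate='m:1')
print(merged)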
Example #3
    def crawl_image_datalake(self, when_exists: str = WhenExist.REPLACE):
        df = self.image_filter()
        if df.empty:
            print(Fore.LIGHTYELLOW_EX + "Image file is empty" +
                  Style.RESET_ALL)
        else:
            def build_query(x):
                info = x['gsheet_info']
                gsheet_name = get_key_value_from_gsheet_info(
                    gsheet_info=info, key='gsheet_name')
                sheet_name = get_key_value_from_gsheet_info(
                    gsheet_info=info, key='sheet_name')
                return crawl_image(
                    object_type=get_key_value_from_gsheet_info(
                        gsheet_info=info, key='object_type'),
                    url=x['url_to_add'],
                    objectid=x['uuid'],
                    when_exists=when_exists,
                    pic=f"{gsheet_name}_{sheet_name}",
                    priority=get_key_value_from_gsheet_info(
                        gsheet_info=info, key='page_priority'))

            df['query'] = df.apply(build_query, axis=1)
            query_pandas_to_csv(df=df, column='query')
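query_pandas_to_csv is also external to these snippets. A plausible minimal sketch, assuming it dumps one query per line with no header or index; the real helper may differ:

def query_pandas_to_csv(df, column, path='queries.csv'):
    # Hypothetical sketch: writes each generated query on its own line.
    df[column].to_csv(path, index=False, header=False)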
Example #4
def upload_image_cant_crawl(checking_accuracy_result: pd.DataFrame,
                            sheet_name: str):
    gsheet_infos = list(set(checking_accuracy_result.gsheet_info.tolist()))
    df_incomplete = checking_accuracy_result[
        checking_accuracy_result['status'] == 'incomplete'
    ].reset_index().copy()

    df_incomplete['url'] = df_incomplete['gsheet_info'].apply(
        lambda x: get_key_value_from_gsheet_info(gsheet_info=x, key='url'))
    df_incomplete['url_to_add'] = ''
    if sheet_name == SheetNames.ARTIST_IMAGE:
        df_incomplete['name'] = df_incomplete['uuid'].apply(
            lambda x: artist.get_one_by_id(artist_uuid=x).name)
        name_columns = ['name']
    else:
        # NOTE: assumed fix — the album branch should read the title from
        # the album record; the original called artist.get_one_by_id here.
        df_incomplete['title'] = df_incomplete['uuid'].apply(
            lambda x: album.get_one_by_id(album_uuid=x).title)
        df_incomplete['artist'] = df_incomplete['uuid'].apply(
            lambda x: album.get_one_by_id(album_uuid=x).artist)
        name_columns = ['title', 'artist']

    # The column set depends on the sheet type; hard-coding 'name' would
    # raise KeyError on the album branch.
    df_incomplete = df_incomplete[
        ['uuid'] + name_columns +
        ['status', 'crawlingtask_id', 'url', 'memo', 'url_to_add']]

    for gsheet_info in gsheet_infos:
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        df_incomplete_to_upload = df_incomplete[df_incomplete['url'] ==
                                                url].reset_index()
        if df_incomplete_to_upload.empty:
            df_to_upload = pd.DataFrame(
                data={'status': ['Upload was 100% successful ^ - ^']})
        else:
            df_to_upload = df_incomplete_to_upload.drop(['url', 'index'],
                                                        axis=1)
        new_sheet_name = f"{sheet_name} cant upload"
        print(df_to_upload)
        creat_new_sheet_and_update_data_from_df(df_to_upload,
                                                get_gsheet_id_from_url(url),
                                                new_sheet_name)
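After reset_index() the per-url frame carries a RangeIndex, so the emptiness check and row count used above reduce to standard pandas idioms:

import pandas as pd

df = pd.DataFrame({'status': ['incomplete', 'incomplete']}).reset_index()
print(len(df.index))   # 2 -- row count
print(df.index.stop)   # 2 -- RangeIndex end, same value here
print(df.empty)        # False -- the check that picks the success message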
Example #5
    def similarity(self):
        df = self.original_file
        df['similarity'] = ''
        df['note'] = ''
        if self.sheet_name == SheetNames.MP3_SHEET_NAME:
            format_id = DataSourceFormatMaster.FORMAT_ID_MP3_FULL
        elif self.sheet_name == SheetNames.MP4_SHEET_NAME:
            format_id = DataSourceFormatMaster.FORMAT_ID_MP4_FULL
        else:
            # Bail out early: format_id would be undefined below.
            print("format_id not supported")
            return
        gsheet_info = list(set(df.gsheet_info.tolist()))[0]
        sheet_name = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                                    key='sheet_name')
        url = get_key_value_from_gsheet_info(gsheet_info=gsheet_info,
                                             key='url')
        for i in df.index:
            if df['memo'].loc[i] == 'added':
                trackid = df['track_id'].loc[i]
                youtube_url = df['url_to_add'].loc[i]
                db_track = get_one_track_by_id(track_id=trackid)
                if db_track:
                    # Calls the module-level similarity() helper, not this
                    # method of the same name.
                    track_similarity = similarity(
                        track_title=db_track.title,
                        youtube_url=youtube_url,
                        formatid=format_id,
                        duration=db_track.duration_ms).get('similarity')
                else:
                    track_similarity = 'not found'
                df.loc[i, 'similarity'] = track_similarity

        update_value_at_last_column(df_to_update=df[['similarity', 'note']],
                                    gsheet_id=get_gsheet_id_from_url(url=url),
                                    sheet_name=sheet_name)
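The similarity() helper called inside the loop is external to this snippet. As an illustrative stand-in only (the real helper also weighs duration and format, per its keyword arguments), a pure title comparison could look like:

from difflib import SequenceMatcher

def title_similarity(track_title, candidate_title):
    # Illustrative stand-in: compares titles only, ignoring the duration
    # and format signals the real helper receives.
    return SequenceMatcher(None, track_title.lower(),
                           candidate_title.lower()).ratio()

print(title_similarity('Bohemian Rhapsody', 'Bohemian Rhapsody (Live)'))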
Example #6
    def crawl_c11_datalake(self):
        df = self.c11_filter()
        is_new_release = getattr(self.page_type, "name") == "NewClassic"
        if df.empty:
            print(Fore.LIGHTYELLOW_EX + "c11 file is empty" +
                  Style.RESET_ALL)
        else:
            def build_query(x):
                info = x['gsheet_info']
                gsheet_name = get_key_value_from_gsheet_info(
                    gsheet_info=info, key='gsheet_name')
                sheet_name = get_key_value_from_gsheet_info(
                    gsheet_info=info, key='sheet_name')
                return crawl_itunes_album(
                    ituneid=x['itune_id'],
                    priority=get_key_value_from_gsheet_info(
                        gsheet_info=info, key='page_priority'),
                    is_new_release=is_new_release,
                    pic=f"{gsheet_name}_{sheet_name}_{x['pre_valid']}",
                    region=x['region'])

            df['query'] = df.apply(build_query, axis=1)
            # Only write queries when the frame is non-empty.
            query_pandas_to_csv(df=df, column='query')