# Normalise the MARC leader: records typed 'naa' (monographic component part)
# are re-typed 'nab' (serial component part) at leader positions 5-7.
bn_articles['LDR'] = bn_articles['LDR'].apply(
    lambda x: x.replace('naa', 'nab') if x[5:8] == 'naa' else x)

# Google Sheets with manually verified PBL <-> BN person mappings.
pbl_viaf_links = [
    '1cEz73dGN2r2-TTc702yne9tKfH9PQ6UyAJ2zBSV6Jb0',
    '1_Bhwzo0xu4yTn8tF0ZNAZq9iIAqIxfcrjeLVCm_mggM',
    '1L-7Zv9EyLr5FeCIY_s90rT5Hz6DjAScCx6NxfuHvoEQ'
]
pbl_viaf = pd.DataFrame()
for elem in pbl_viaf_links:
    df = gsheet_to_df(elem, 'pbl_bn').drop_duplicates()
    # keep only pairs not rejected by the annotators ('czy_ten_sam' = "is it the same")
    df = df[df['czy_ten_sam'] != 'nie'][['pbl_id', 'BN_id', 'BN_name']]
    df['BN_name'] = (df['BN_name']
                     .str.replace(r'\|\(', ' (', regex=True)
                     .str.replace(r';\|', '; ', regex=True)
                     .str.replace(r'\|$', '', regex=True))
    df['index'] = df.index + 1
    # one row per alternative name form (name variants are '|'-delimited)
    df = cSplit(df, 'index', 'BN_name', r'\|').drop(columns='index')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    pbl_viaf = pd.concat([pbl_viaf, df])
pbl_viaf = pbl_viaf.drop_duplicates()

if mode == "with people":
    # doesn't work for articles (still good for books)
    tworca_i_dzial = """select tw.tw_tworca_id "pbl_id",
           dz.dz_dzial_id||'|'||dz.dz_nazwa "osoba_pbl_dzial_id_name"
    from pbl_tworcy tw
    full join pbl_dzialy dz on dz.dz_dzial_id=tw.tw_dz_dzial_id"""
    tworca_i_dzial = pd.read_sql(tworca_i_dzial, con=connection).fillna(value=np.nan)
    # render the numeric ids as fixed-width strings without a decimal part
    tworca_i_dzial['pbl_id'] = tworca_i_dzial['pbl_id'].apply(
        lambda x: '{:4.0f}'.format(x))
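# `cSplit` and `gsheet_to_df` are project helpers defined elsewhere, not in
# this file. Judging only by the call sites, a minimal sketch of what cSplit
# appears to do (an assumption, not the project's actual implementation):
# explode a delimited string column into one row per value ('long', the
# default) or into numbered columns ('wide', used at the end of this script).
def cSplit_sketch(df, id_col, split_col, delimiter, mode='long'):
    # id_col is kept only for signature compatibility; its values simply
    # repeat on every exploded row.
    if mode == 'long':
        out = df.copy()
        # multi-character patterns (e.g. r'\|') are treated as regexes by str.split
        out[split_col] = out[split_col].str.split(delimiter)
        return out.explode(split_col).reset_index(drop=True)
    # 'wide': split into columns named '<split_col>_0', '<split_col>_1', ...
    wide = df[split_col].str.split(delimiter, expand=True)
    wide.columns = [f'{split_col}_{i}' for i in wide.columns]
    return pd.concat([df.drop(columns=split_col), wide], axis=1)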
wsh.set_basic_filter()

# Articles that carry at least one keyword ('❦' is the internal multi-value
# separator; 'tytuł czasopisma' = journal title).
df_articles_with_key_words = df.copy()[df['subject❦pl'].notnull()][[
    'creator', 'title❦pl', 'subject❦pl', 'tytuł czasopisma'
]].reset_index(drop=True)
s_key_words.df_to_sheet(df_articles_with_key_words,
                        sheet='słowa kluczowe dla artykułów', index=0)
worksheet = key_word_sheet.worksheet('słowa kluczowe dla artykułów')
worksheet.freeze(rows=1)
worksheet.set_basic_filter()
# "articles with keywords"
print(f'artykuły ze słowami kluczowymi = {len(df_articles_with_key_words)}')
# artykuły ze słowami kluczowymi = 8270

# Explode multi-valued keywords into one row per keyword, then count frequencies.
df_articles_with_key_words['indeks'] = df_articles_with_key_words.index + 1
df_articles_with_key_words = cSplit(df_articles_with_key_words, 'indeks',
                                    'subject❦pl', '❦')
# NB: in pandas >= 2.0 value_counts().reset_index() yields the columns
# ['subject❦pl', 'count'], so the rename below would need adjusting there.
df_key_words = df_articles_with_key_words['subject❦pl'].str.lower(
).value_counts().reset_index().rename(columns={
    'index': 'słowo kluczowe',   # "keyword"
    'subject❦pl': 'frekwencja'   # "frequency"
})
s_key_words.df_to_sheet(df_key_words, sheet='słowa kluczowe - statystyki', index=0)
worksheet = key_word_sheet.worksheet('słowa kluczowe - statystyki')
worksheet.freeze(rows=1)
worksheet.set_basic_filter()
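# Note on the two client objects used above (both created earlier in the
# script): `s_key_words` behaves like a gspread_pandas Spread (df_to_sheet),
# while `key_word_sheet` behaves like a plain gspread Spreadsheet (worksheet,
# freeze, set_basic_filter). gspread_pandas wraps gspread, so one spreadsheet
# can be addressed through both; this is an observation from the call sites,
# not a documented design decision. The frequency table itself is plain
# pandas, e.g. on made-up data:
#   pd.Series(['proza', 'Poezja', 'proza']).str.lower().value_counts()
#   -> proza 2, poezja 1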
    'X245', 'X650', 'X655'
]]
# Parse the 100 field (main author); $d holds the author's dates, and the
# first number is taken as the year. Records whose author year is 1700 or
# earlier are excluded from the removal candidates.
X100_field = marc_parser_1_field(to_remove, 'id', 'X100', r'\$')
X100_field['year'] = X100_field['$d'].apply(
    lambda x: re.findall(r'\d+', x)[0] if x != '' else np.nan)
X100_field = X100_field[X100_field['year'].notnull()]
X100_field = X100_field[X100_field['year'].astype(int) <= 1700]
to_remove = to_remove[~to_remove['id'].isin(X100_field['id'])]
bn_books = bn_books[~bn_books['id'].isin(to_remove['id'])]

pbl_enrichment = bn_books[[
    'id', 'dziedzina_PBL', 'rodzaj_ksiazki', 'DZ_NAZWA', 'X650', 'X655'
]]
pbl_enrichment['DZ_NAZWA'] = pbl_enrichment['DZ_NAZWA'].str.replace(
    r' - .*?$', '', regex=True)
pbl_enrichment = cSplit(pbl_enrichment, 'id', 'X655', '❦')
# 655 values that contain a $x subfield actually belong in 650
# ('jest x' = "has x"): move them over and merge with the existing 650.
pbl_enrichment['jest x'] = pbl_enrichment['X655'].str.contains(r'\$x')
pbl_enrichment['nowe650'] = pbl_enrichment.apply(
    lambda x: x['X655'] if x['jest x'] == True else np.nan, axis=1)
pbl_enrichment['X655'] = pbl_enrichment.apply(
    lambda x: x['X655'] if x['jest x'] == False else np.nan, axis=1)
pbl_enrichment['X650'] = pbl_enrichment[['X650', 'nowe650']].apply(
    lambda x: '❦'.join(x.dropna().astype(str)), axis=1)
pbl_enrichment = pbl_enrichment.drop(['jest x', 'nowe650'], axis=1)

# Non-equi join via pandasql: pair every record whose 655/650 field contains
# one of the PBL genre names. A toy illustration with made-up values:
#   X655 = '$aPowieść polska' and gatunek = 'powieść'
#   -> lower(X655) LIKE '%powieść%' is true, so the record joins that genre;
#      a record matching several genre names joins once per genre.
query = "select * from pbl_enrichment a join gatunki_pbl b on lower(a.X655) like '%'||b.gatunek||'%'"
gatunki1 = pandasql.sqldf(query)
query = "select * from pbl_enrichment a join gatunki_pbl b on lower(a.X650) like '%'||b.gatunek||'%'"
gatunki2 = pandasql.sqldf(query)
gatunki = pd.concat([gatunki1, gatunki2]).drop_duplicates()
gatunki['gatunek'] = gatunki['gatunek'].apply(
]
mapowanie_osob_df = pd.DataFrame()
for file in tqdm(mapowanie_osob):
    sheet = gc.open_by_key(file)
    df_osoby = get_as_dataframe(
        sheet.worksheet('pbl_bn'),
        evaluate_formulas=True).dropna(how='all').dropna(
            how='all', axis=1).drop_duplicates()
    # keep only pairs not rejected by the annotators
    df_osoby = df_osoby[df_osoby['czy_ten_sam'] != 'nie'][[
        'pbl_id', 'BN_id', 'BN_name'
    ]]
    df_osoby['BN_name'] = (df_osoby['BN_name']
                           .str.replace(r'\|\(', ' (', regex=True)
                           .str.replace(r';\|', '; ', regex=True)
                           .str.replace(r'\|$', '', regex=True))
    df_osoby['index'] = df_osoby.index + 1
    df_osoby = cSplit(df_osoby, 'index', 'BN_name', r'\|').drop(columns='index')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    mapowanie_osob_df = pd.concat([mapowanie_osob_df, df_osoby])
mapowanie_osob_df = mapowanie_osob_df.drop_duplicates().reset_index(drop=True)


def rok_zgonu(x):
    """Extract the year of death ('rok zgonu') from a BN name string such as
    'Surname, Name (1800-1870)', '(ca 1500-ca 1570)' or 'po 1600'."""
    try:
        # The lookbehind alternatives have different lengths, which the stdlib
        # `re` module rejects; the third-party `regex` module allows it.
        return int(
            regex.search(r'(?<=\- ca |\-ca |\-ok\. |\-|po )(\d+)', x).group(0))
    except (TypeError, AttributeError):
        return None


mapowanie_osob_df['rok zgonu'] = mapowanie_osob_df['BN_name'].apply(rok_zgonu)
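# Hypothetical spot-checks of rok_zgonu (made-up names in the BN format):
#   rok_zgonu('Kowalski, Jan (1810-1885)')       -> 1885  (matches '-')
#   rok_zgonu('Nowak, Piotr (ca 1500-ca 1570)')  -> 1570  (matches '-ca ')
#   rok_zgonu(np.nan)                            -> None  (TypeError caught)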
# Clean the Bar catalogue: '%' marks subfield codes in this export and '❦'
# separates repeated fields. Strip stray '%x' codes (plus the character
# before them) from control field 008 and map '+'/'!' to '-'.
bar_catalog['008'] = bar_catalog['008'].str.strip().str.replace(
    r'(^|.)(\%.)', '', regex=True).str.replace(r'([\+\!])', '-', regex=True)
# Field 100 keeps only the first author; any further names move to 700.
bar_catalog['do 100'] = bar_catalog['100'].str.replace(r'(^.+?)(❦)(.+?$)',
                                                       r'\3', regex=True)
bar_catalog['100'] = bar_catalog['100'].str.replace(r'(^.+?)(❦)(.+?$)',
                                                    r'\1', regex=True)
bar_catalog['700'] = bar_catalog[['do 100', '700']].apply(
    lambda x: ''.join(x.dropna().astype(str)), axis=1)
del bar_catalog['do 100']
# Unwrap the parenthesised dates after %d and strip a trailing period, e.g.
# (made-up heading) '%aMickiewicz, Adam%d(1798-1855).' -> '%aMickiewicz, Adam%d1798-1855'.
bar_catalog['100'] = bar_catalog['100'].str.replace(
    r'(?<=\%d)(\()(.+?)(\)\.{0,1})', r'\2',
    regex=True).str.replace(r'(?<=[a-zà-ž][a-zà-ž])\.$', '', regex=True)
# One row per 600 (subject person) entry, cleaned the same way.
bar_catalog = cSplit(bar_catalog, '001', '600', '❦')
bar_catalog['600'] = bar_catalog['600'].str.replace(
    r'(?<=\%d)(\()(.+?)(\)\.{0,1})', r'\2',
    regex=True).str.replace(r'(?<=[a-zà-ž][a-zà-ž])\.$', '', regex=True)
# Build 787 (related work) from 600 entries that carry a %t title subfield:
# drop the %d dates and overwrite the two leading indicator characters with '08'.
bar_catalog['787'] = bar_catalog['600'].str.replace(r'(\%d.+?)(?=\%)', '',
                                                    regex=True).str.replace(
                                                        r'(?<=^)(..)', r'08',
                                                        regex=True)
bar_catalog['787'] = bar_catalog['787'].apply(
    lambda x: x if pd.notnull(x) and '%t' in x else np.nan)
# Remove the title subfield from 600 itself, then regroup per record.
bar_catalog['600'] = bar_catalog['600'].str.replace(r'(\%t.+?)(?=\%|$)', '',
                                                    regex=True).str.strip()
bar_catalog['600'] = bar_catalog.groupby('001')['600'].transform(
try:
    data_person = tree.select_one('h2').text
except AttributeError:
    # select_one returns None when the element is missing
    data_person = "brak danych (CR)"  # "no data (CR)"
try:
    role = tree.select_one('h3').text
except AttributeError:
    role = "brak danych (CR)"
try:
    description = tree.select_one('.indent').text
except AttributeError:
    description = "brak danych (CR)"
data.append([data_person, role, description])
end_time = time.time()
print(end_time - start_time)  # elapsed scraping time in seconds

df = pd.DataFrame(data, columns=['data_person', 'role', 'description'])
# Year after the comma in 'Name, 1234-...' entries, if a four-digit year occurs.
df['person_year'] = df['data_person'].apply(lambda x: re.sub(
    r'(.+)(, )(\d.+)', r'\3', x) if re.findall(r'\d{4}', x) else np.nan)
# Insert the '❦' separator before every 'YYYY <uppercase>' prize line,
# then explode into one row per prize.
df['single_prize'] = df['description'].apply(
    lambda x: regex.sub(r'(\n)(\d{4} \p{Lu})', r'❦\2', x))
df['index'] = df.index + 1
df = cSplit(df, 'index', 'single_prize', '❦')
df['single_prize'] = df['single_prize'].str.replace('\n', '❦')
df = df[df['single_prize'].notnull()]
df['prize_year'] = df['single_prize'].str.replace(r'(^\d{4})(.+)', r'\1',
                                                  regex=True)
df['book_title_reason'] = df['single_prize'].apply(lambda x: re.sub(
    r'(^\d{4} )(.+)(❦)(.+)', r'\4', x) if re.findall('❦', x) else np.nan)
df.iloc[9, 2]  # spot-check one description
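# How the single_prize split above behaves on a made-up description:
#   'Laureat:\n1999 Za powieść X\n2003 Za esej Y'
#   regex.sub(...) -> 'Laureat:❦1999 Za powieść X❦2003 Za esej Y'
#   cSplit(...)    -> one row per segment; 'prize_year' then keeps the
#   leading four digits of segments that start with a year ('1999', '2003')
#   and leaves segments without one unchanged.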
# Simple counts over the merged Synat catalogue:
#   liczba_tworcow    - records whose notes ('uwagi') mention 'biograf'
#   liczba_instytucji - records with a 710 field that are not publishers
#   liczba_wydawnictw - records whose 516 field mentions 'wydawnictw' ("publisher")
liczba_tworcow = len(
    full_synat.copy()[full_synat['uwagi'].str.lower().str.contains('biograf')])
liczba_instytucji = len(full_synat.copy()[(full_synat['710'] != '') & (
    ~full_synat['516'].str.lower().str.contains('wydawnictw'))])
liczba_wydawnictw = len(full_synat.copy()[
    full_synat['516'].str.lower().str.contains('wydawnictw')])

# tidying up web services and journals
serwisy_portale = gsheet_to_df('1EWzb9mCsTVxYDqj_CzKW5EcqJy4z3a_-GyNGAbAjGH0',
                               'Serwisy, portale (finalny)')
s_p_adres = serwisy_portale.copy()[['id', 'adres']]
s_p_adres = cSplit(s_p_adres, 'id', 'adres', '❦')
s_p_adres = s_p_adres[s_p_adres['adres'].str.contains('http')]
# Strip any prefix before the URL, keep only the first token, and drop 'www.'.
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: re.sub('(.+?u |^u |^3 )(h.+$)', r'\2', x))
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: re.sub('(.+?)( .+)', r'\1', x))
s_p_adres['len'] = s_p_adres['adres'].str.len()
s_p_adres['adres'] = s_p_adres['adres'].str.replace('http://www.', 'http://',
                                                    regex=False)
s_p_adres['id'] = s_p_adres['id'].astype(int)
# For each id keep the shortest (most canonical) address.
s_p_adres = s_p_adres.sort_values(['id', 'len']).drop_duplicates()
test = s_p_adres.groupby('id').head(1).reset_index(drop=True).drop(
    columns='len')
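# The address clean-up above, traced on a made-up cell value:
#   'u http://www.example.com/page tekst'
#   '(.+?u |^u |^3 )(h.+$)' keeps \2 -> 'http://www.example.com/page tekst'
#   '(.+?)( .+)' keeps \1            -> 'http://www.example.com/page'
#   'http://www.' -> 'http://'       -> 'http://example.com/page'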
pbl_ids = bn_stats.copy()['pbl_id'].drop_duplicates().astype(int).tolist()

# Per-journal, per-year record counts on the PBL side.
pbl_query = """select z.za_zapis_id "record_id",
       zr.zr_zrodlo_id "pbl_id",
       z.za_ro_rok "year"
from pbl_zapisy z
join IBL_OWNER.pbl_zrodla zr on zr.zr_zrodlo_id=z.za_zr_zrodlo_id"""
pbl_stats = pd.read_sql(pbl_query, connection)
pbl_stats = pbl_stats[pbl_stats['pbl_id'].isin(pbl_ids)]
pbl_stats = pbl_stats.groupby(['pbl_id', 'year']).count()
pbl_stats = pbl_stats.reset_index(level=['pbl_id', 'year']).rename(
    columns={'record_id': 'liczba PBL'})  # "PBL count"

# Merge the BN and PBL statistics on a composite 'pbl_id|year' key.
bn_stats['pbl_id'] = bn_stats['pbl_id'].astype(np.int64)
bn_stats['help'] = bn_stats['pbl_id'].astype(str) + '|' + bn_stats['year']
pbl_stats['help'] = pbl_stats['pbl_id'].astype(
    str) + '|' + pbl_stats['year'].astype(str)
stats = pd.merge(bn_stats, pbl_stats, how='left', on='help')
stats = stats[[
    'help', 'pbl_magazine', 'liczba BN', 'liczba BN ok',
    'procent literacki BN', 'liczba PBL'
]]
# Split the composite key back into its two columns ('wide' mode).
stats['index'] = stats.index + 1
stats = cSplit(stats, 'index', 'help', '|', 'wide')
stats = stats.rename(columns={
    'help_0': 'pbl_id',
    'help_1': 'year'
}).drop(columns=['index']).sort_values(['pbl_magazine', 'year'])
stats.to_excel('statystyki_czasopism_bn_pbl.xlsx', index=False)
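# Under the cSplit sketch's assumptions, the 'wide' split above is equivalent to:
#   stats[['pbl_id', 'year']] = stats['help'].str.split('|', expand=True)
# i.e. the composite 'pbl_id|year' key is broken back into two plain columns.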