def mis_inv(ref_ai, cb_ai, inv):
    """Report how many US targets have Chinese and/or unknown-country investors.

    Side effects:
      * adds an ``inv_country_missing`` column (1.0 / 0.0) to ``inv`` in place;
      * writes ``data/china_us_val.csv``;
      * emits three messages through ``add_acc_message`` (defined elsewhere).

    Args:
        ref_ai: frame with a ``COMPANY_ID`` column (only its unique count is used).
        cb_ai: frame with a ``Crunchbase_ID`` column (only its unique count is used).
        inv: transaction-level frame; the columns read here are
            ``investor_country``, ``Target_Region``, ``China_inv``,
            ``investment_value``, ``investment_value_median``, ``MA`` and
            ``Crunchbase_ID_target``.

    Returns:
        None.
    """
    # NOTE(review): "mis_ref" appears literally in this message; it looks like a
    # placeholder for a count that is not available in this function - confirm.
    add_acc_message(f'We dropped mis_ref becuase we could not match them to Refinitiv out of '
                    f'{len(ref_ai["COMPANY_ID"].unique())} companies in Refinitiv. In Crunchbase there '
                    f'{len(cb_ai["Crunchbase_ID"].unique())}')

    # Flag transactions whose investor country is unknown (float 1.0 / 0.0).
    inv['inv_country_missing'] = inv['investor_country'].isna().astype('float64')

    # Aggregate the US transactions per target company.  The string "sum" is
    # used instead of np.sum: pandas deprecates passing the NumPy callable and
    # recommends the string to keep the current behaviour.
    table_miss = pd.pivot_table(inv.loc[inv['Target_Region'] == 'USA'],
                                values=['inv_country_missing', 'China_inv', 'investment_value'],
                                index=['Crunchbase_ID_target'],
                                aggfunc='sum')

    # Company-level indicators: at least one Chinese transaction / at least one
    # transaction from an investor with unknown country.
    has_china = table_miss['China_inv'] > 0
    has_unknown = table_miss['inv_country_missing'] > 0
    table_miss['China_dum'] = has_china.astype('int64')
    table_miss['Miss_inv_country_dum'] = has_unknown.astype('int64')
    # Both kinds of investors / neither kind of investor (kept as floats, as before).
    table_miss['Both'] = (has_unknown & has_china).astype('float64')
    table_miss['Neither'] = (~has_unknown & ~has_china).astype('float64')
    # Counter column so that the sum below also yields the number of companies.
    table_miss['all'] = 1

    sum_miss = table_miss[['China_dum', 'Miss_inv_country_dum', 'Both', 'Neither', 'all']].sum()
    add_acc_message(f'US companies at least one transaction involving Chinese investors or Investors from missing countries'
                    f' {sum_miss}')

    # Export investment values of US targets split by the China/unknown indicators.
    agg_inv('sum', table_miss, ['China_dum', 'Miss_inv_country_dum', 'all'], ['Both', 'Neither'],
            ['investment_value']).to_csv('data/china_us_val.csv')

    def _mean_imputed(region, deal_type):
        # Mean of the median-imputed value over deals whose disclosed value is missing.
        rows = ((inv['Target_Region'] == region)
                & inv['investment_value'].isna()
                & (inv['MA'] == deal_type))
        return inv.loc[rows, 'investment_value_median'].mean()

    av_us_vc = _mean_imputed('USA', 'VC')
    av_ch_vc = _mean_imputed('China', 'VC')
    av_us_ma = _mean_imputed('USA', 'MA')
    av_ch_ma = _mean_imputed('China', 'MA')

    add_acc_message(f'Averages deal sizes with imputed missing values: US VC deal {av_us_vc}, average China VC deal '
                    f'{av_ch_vc} average US MA deal {av_us_ma}, average China MA deal {av_ch_ma}')
    return
def average_deal(inv):
    """Export and print counts of US deals/companies by investor origin.

    Writes ``data/comp.csv`` and ``data/round_data.csv`` (counts of target
    companies and of rounds for US targets, split by ``MA``/``China_inv`` and
    ``inv_country_missing``) and prints four headline numbers.

    Args:
        inv: transaction-level frame; the columns read here are
            ``Target_Region``, ``MA``, ``China_inv``, ``inv_country_missing``,
            ``Crunchbase_ID_target``, ``round_id``, ``investor_country``,
            ``investment_value`` and ``year``.

    Returns:
        None.
    """
    is_us = inv['Target_Region'] == 'USA'

    # Companies and rounds where there are investors with missing or Chinese investors
    comp_data = agg_inv('count', inv.loc[is_us], ['MA', 'China_inv'], ['inv_country_missing'],
                        ['Crunchbase_ID_target'])
    round_data = agg_inv('count', inv.loc[is_us], ['MA', 'China_inv'], ['inv_country_missing'],
                         ['round_id'])
    # Save the results
    comp_data.to_csv('data/comp.csv')
    round_data.to_csv('data/round_data.csv')

    # Number of US deals
    us_inv_rnd = inv.loc[is_us, 'round_id'].nunique()
    print(f"Number of unique US deals {us_inv_rnd}")

    # Number of US companies that received investment
    c = inv.loc[is_us, 'Crunchbase_ID_target'].nunique()
    print(f'Over the period {c} American companies received private equity investment')

    # The comparison must be parenthesised: `&` binds tighter than `==`, so the
    # unparenthesised form evaluates `(is_us & China_inv) == 1` instead.
    ch = inv.loc[is_us & (inv['China_inv'] == 1), 'Crunchbase_ID_target'].nunique()
    print(f'Over the period {ch} American companies received private equity investment from China')

    undisc = inv.loc[is_us & inv['investor_country'].isna(), 'Crunchbase_ID_target'].nunique()
    # A separating space is needed between the two concatenated literals.
    print(f'Over the period {undisc} American companies received private equity investment from at least one investor with '
          f'undisclosed country.')

    # NOTE(review): the resulting table is discarded; the call is kept in case
    # agg_inv has side effects - confirm whether it should be saved or removed.
    agg_inv('sum', inv.loc[is_us & inv['investment_value'].notna()], ['Target_Region', 'MA'], ['year'],
            ['investment_value'])
    return
def test_agg_inv_sum(self):
    """Check the 'sum' aggregation of agg_inv on the ``inv_for_agg`` fixture.

    The frame is aggregated with ("Target_Region", "CB") as rows and "year" as
    columns over "investment_value"; the result, converted with ``to_dict()``,
    must equal the mapping below (column key -> row key -> summed value).
    """
    expected_by_year = {
        ('investment_value', 2013): {
            ('CHN', '1'): 10,
            ('CHN', '2'): 20,
            ('USA', '3'): 0,
            ('USA', '4'): 0,
        },
        ('investment_value', 2014): {
            ('CHN', '1'): 0,
            ('CHN', '2'): 50,
            ('USA', '3'): 0,
            ('USA', '4'): 0,
        },
        ('investment_value', 2015): {
            ('CHN', '1'): 0,
            ('CHN', '2'): 0,
            ('USA', '3'): 30,
            ('USA', '4'): 40,
        },
    }

    summed = agg_inv(
        "sum",
        self.inv_for_agg,
        ["Target_Region", "CB"],
        ["year"],
        ["investment_value"],
    )
    actual_by_year = summed.to_dict()

    self.assertEqual(actual_by_year, expected_by_year)
def add_tests(inv):
    """Build the additional robustness tables and write them to ``data/add_test.xlsx``.

    Tables produced (one Excel sheet each): average deal value, share and count
    of deals with missing value, China->US and Unknown->US VC flows by
    investment stage (count, median-imputed value, disclosed value), and the
    same three measures for eight other target countries.

    Side effects:
      * adds ``missing_inv_values`` and ``inv_unknown`` columns (1.0 / 0.0) to
        the caller's ``inv`` in place;
      * emits several messages through ``add_acc_message`` (defined elsewhere);
      * writes ``data/add_test.xlsx``.

    Args:
        inv: transaction-level frame; the columns read here are
            ``investment_value``, ``investment_value_median``, ``round_id``,
            ``Target_Region``, ``target_country``, ``MA``, ``year``,
            ``investment_type``, ``China_inv`` and ``investor_country``.

    Returns:
        None.
    """
    # Masks reused throughout; they depend only on columns that are never
    # modified in this function.
    value_known = inv['investment_value'].notna()
    value_missing = inv['investment_value'].isna()
    is_vc = inv['MA'] == 'VC'
    is_ma = inv['MA'] == 'MA'
    is_us_vc = is_vc & (inv['Target_Region'] == 'USA')

    # Sum and count of disclosed investments by target region / deal type and year
    sum_t = agg_inv('sum', inv.loc[value_known], ['Target_Region', 'MA'], ['year'], ['investment_value'])
    count_t = agg_inv('count', inv.loc[value_known], ['Target_Region', 'MA'], ['year'], ['round_id'])
    # Replace zero counts with 1 to avoid division by zero
    count_t = count_t.replace(0, 1)
    # Average deal value.  The two tables have different column labels, so the
    # division is done on the raw arrays and written back into the sum table.
    sum_t[:] = sum_t.values / count_t.values
    sum_t_av = sum_t

    # Indicator for deals with missing value
    inv['missing_inv_values'] = value_missing.astype('float64')

    # Number of rows with missing value and number of all rows, per region/type and year
    tot_miss = agg_inv('count', inv.loc[value_missing], ['Target_Region', 'MA'], ['year'], ['round_id'])
    tot_all = agg_inv('count', inv, ['Target_Region', 'MA'], ['year'], ['round_id'])
    # Share of rows with missing value
    av_miss = tot_miss / tot_all
    add_acc_message(f"Average share of missing investment values {av_miss}")

    # MA transactions separately
    miss_ma_inv = inv.loc[(inv['missing_inv_values'] == 1) & is_ma, 'missing_inv_values'].sum()
    tot_ma = len(inv.loc[is_ma])
    add_acc_message(f"Average share of missing investment values in MA transactsion is {miss_ma_inv/tot_ma}, the number"
                    f" of missing is {miss_ma_inv} out of {tot_ma}")

    # Coverage messages
    n_deals = inv['round_id'].count()
    n_deals_missing = inv.loc[value_missing, 'round_id'].count()
    add_acc_message(
        f"We cover {n_deals} deals, out which {n_deals_missing}"
        f" miss investment value")

    n_vc_rounds = inv.loc[is_vc, 'round_id'].nunique()
    n_vc_rounds_missing = inv.loc[value_missing & is_vc, 'round_id'].nunique()
    add_acc_message(f"We cover {n_vc_rounds} VC rounds, out which"
                    f" {n_vc_rounds_missing}"
                    f" miss investment value")

    n_vc_rows = inv.loc[is_vc, 'round_id'].count()
    n_vc_rows_missing = inv.loc[value_missing & is_vc, 'round_id'].count()
    add_acc_message(f"We cover {n_vc_rows} VC rounds/Company combinations, out which"
                    f" {n_vc_rows_missing}"
                    f" miss investment value")

    n_ma_rows = inv.loc[is_ma, 'round_id'].count()
    n_ma_rows_missing = inv.loc[value_missing & is_ma, 'round_id'].count()
    add_acc_message(f"We cover {n_ma_rows} MA deals, out which"
                    f" {n_ma_rows_missing}"
                    f" miss investment value")

    # Chinese->US VC flows by investment stage: count, median-imputed value, disclosed value
    count_stage_CH_US = agg_inv('count', inv.loc[is_us_vc], ['investment_type', 'China_inv'], ['year'],
                                ['round_id'])
    sum_stage_CH_US_med = agg_inv('sum', inv.loc[is_us_vc], ['investment_type', 'China_inv'], ['year'],
                                  ['investment_value_median'])
    sum_stage_CH_US = agg_inv('sum', inv.loc[is_us_vc], ['investment_type', 'China_inv'], ['year'],
                              ['investment_value'])

    # Indicator for investors with undisclosed country
    inv['inv_unknown'] = inv['investor_country'].isna().astype('float64')

    # Unknown->US VC flows by investment stage: count, median-imputed value, disclosed value
    count_stage_UKN_US = agg_inv('count', inv.loc[is_us_vc], ['investment_type', 'inv_unknown'], ['year'],
                                 ['round_id'])
    sum_stage_UKN_US_med = agg_inv('sum', inv.loc[is_us_vc], ['investment_type', 'inv_unknown'], ['year'],
                                   ['investment_value_median'])
    sum_stage_UKN_US = agg_inv('sum', inv.loc[is_us_vc], ['investment_type', 'inv_unknown'], ['year'],
                               ['investment_value'])

    # Harmonise country codes so that more target countries than US, China and
    # ROW can be reported.  One dict-based replace does the same substitutions
    # as eight successive calls (no replacement value is itself a key) while
    # copying the frame only once.  `inv` is rebound to the copy, so the
    # caller's frame is not affected by this step.
    # NOTE(review): the replacement applies to every column, not only
    # `target_country` - confirm that this is intended.
    country_codes = {
        'GBR': 'UK',
        'CA': 'CAN',
        'IS': 'ISR',
        'IN': 'IND',
        'DE': 'DEU',
        'SG': 'SGP',
        'JP': 'JPN',
        'FR': 'FRA',
    }
    inv = inv.replace(country_codes)

    # Data for the eight other major target countries
    others_countries = inv.loc[inv['target_country'].isin(['UK', 'CAN', 'ISR', 'IND', 'SGP', 'JPN', 'FRA', 'DEU'])]
    # Other target countries by deal type: count, median-imputed value, disclosed value
    count_other_majors = agg_inv('count', others_countries, ['target_country', 'MA'], ['year'], ['round_id'])
    other_majors_med = agg_inv('sum', others_countries, ['target_country', 'MA'], ['year'],
                               ['investment_value_median'])
    other_majors_val = agg_inv('sum', others_countries, ['target_country', 'MA'], ['year'],
                               ['investment_value'])

    with pd.ExcelWriter('data/add_test.xlsx') as writer:
        sum_t_av.to_excel(writer, sheet_name='Average_deal_value')
        av_miss.to_excel(writer, sheet_name='Share_of_missing_values')
        tot_miss.to_excel(writer, sheet_name='Total_missing_value_count')
        count_stage_CH_US.to_excel(writer, sheet_name='Count_CH_to_US')
        sum_stage_CH_US_med.to_excel(writer, sheet_name='MedVal_CH_to_US')
        sum_stage_CH_US.to_excel(writer, sheet_name='DiscVal_CH_to_US')
        count_stage_UKN_US.to_excel(writer, sheet_name='Count_UKN_to_US')
        sum_stage_UKN_US_med.to_excel(writer, sheet_name='MedVal_UKN_to_US')
        sum_stage_UKN_US.to_excel(writer, sheet_name='DiscVal_UKN_to_US')
        count_other_majors.to_excel(writer, sheet_name='Others_Count')
        other_majors_med.to_excel(writer, sheet_name='Others_Med')
        other_majors_val.to_excel(writer, sheet_name='Others_Val')
    return