import nltk
# Diversity metrics (assuming scikit-bio's implementations; the project may define its own)
from skbio.diversity.alpha import (simpson_e, fisher_alpha, brillouin_d,
                                   berger_parker_d)


def sentence_start(text):
    # Count variables: part-of-speech tallies plus slots for the diversity metrics
    ratio_dict = {
        "nouns": 0,
        "pronouns": 0,
        "verbs": 0,
        "adjectives": 0,
        "adverbs": 0,
        "conjunctions": 0,
        "particles": 0,
        "prepositions": 0,
        "others": 0,
        "simpson": 0,
        "fisher": 0,
        "brillouin": 0,
        "berger_parker": 0
    }

    # Tokenize into sentences
    sentences = nltk.tokenize.sent_tokenize(text)
    problem_sentences = []

    # Loop through sentences and tally which part of speech each sentence starts with
    for sentence in sentences:
        tags = identify_speech(sentence)
        ratio_dict[tags[0]] = ratio_dict[tags[0]] + 1
        if tags[0] == "nouns" or tags[0] == "pronouns":
            problem_sentences.append(sentence)

    # Calculate diversity over the nine part-of-speech counts
    pos_counts = list(ratio_dict.values())[0:9]
    simpson = simpson_e(pos_counts)
    fisher = fisher_alpha(pos_counts)
    brillouin = brillouin_d(pos_counts)
    berger_parker = berger_parker_d(pos_counts)

    # Convert to percentage
    #ratio_dict = {k: "".join([str(round(v / len(sentences), 4) * 100), "%"]) for k, v in ratio_dict.items()}

    # Update diversity metrics
    ratio_dict['simpson'] = simpson
    ratio_dict['fisher'] = fisher
    ratio_dict['brillouin'] = brillouin
    ratio_dict['berger_parker'] = berger_parker

    return (ratio_dict, problem_sentences)
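# NOTE (illustration): identify_speech() is defined elsewhere in the project and
# is not shown here. The stand-in below is only a guess at its behaviour: it maps
# the first token's Penn Treebank tag to one of the category keys used above and
# returns it as a single-element list, matching the tags[0] access in
# sentence_start(). It assumes the NLTK 'punkt' and 'averaged_perceptron_tagger'
# data packages are installed.
import nltk


def identify_speech(sentence):
    """Hypothetical sketch: name the part-of-speech category a sentence starts with."""
    tag_map = {
        "NN": "nouns", "PR": "pronouns", "WP": "pronouns", "VB": "verbs",
        "JJ": "adjectives", "RB": "adverbs", "CC": "conjunctions",
        "RP": "particles", "IN": "prepositions",
    }
    tokens = nltk.word_tokenize(sentence)
    if not tokens:
        return ["others"]
    # pos_tag returns (token, Penn Treebank tag) pairs; use the first token's tag
    first_tag = nltk.pos_tag(tokens)[0][1]
    return [tag_map.get(first_tag[:2], "others")]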
def test_berger_parker_d(self):
    self.assertEqual(berger_parker_d(np.array([5])), 1)
    self.assertEqual(berger_parker_d(np.array([5, 5])), 0.5)
    self.assertEqual(berger_parker_d(np.array([1, 1, 1, 1, 0])), 0.25)
    self.assertEqual(berger_parker_d(self.counts), 5 / 22)
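# NOTE (illustration): Berger-Parker dominance is simply the relative abundance of
# the most common category, d = max(counts) / sum(counts). For example, [5, 5]
# gives 5 / 10 = 0.5, matching the test above. A minimal reference sketch of the
# same quantity (the function name below is ours, not the library's):
import numpy as np


def berger_parker_reference(counts):
    counts = np.asarray(counts)
    return counts.max() / counts.sum()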
dissolved = joined.dissolve(by='id', aggfunc=lambda x: list(x))

# Get output length
dis_len = len(dissolved)

# Calculate language dominance, Menhinick diversity, Simpson index and the other metrics
print('[INFO] - Calculating variables..')
for i, row in dissolved.iterrows():
    print("[INFO] - Calculating grid cell {}/{}...".format(i, dis_len))
    lang_counts = list(Counter(row[args['language']]).values())  # occurrence counts
    lang_counts = np.asarray(lang_counts)  # cast as numpy array for skbio
    dissolved.at[i, 'dominance'] = sk.dominance(lang_counts)
    dissolved.at[i, 'menhinick'] = sk.menhinick(lang_counts)
    dissolved.at[i, 'simpson'] = sk.simpson(lang_counts)
    dissolved.at[i, 'berger'] = sk.berger_parker_d(lang_counts)
    dissolved.at[i, 'singles'] = sk.singles(lang_counts)
    dissolved.at[i, 'shannon'] = np.exp(sk.shannon(lang_counts, base=np.e))
    dissolved.at[i, 'unique'] = sk.observed_otus(lang_counts)

# Select columns for output
cols = [
    'geometry', 'dominance', 'menhinick', 'simpson', 'berger', 'singles',
    'shannon', 'unique'
]
output = dissolved[cols]

# Save the output to a shapefile
print('[INFO] - Saving to shapefile')
output.to_file(args['output'], encoding='utf-8')
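# NOTE (illustration): np.exp(sk.shannon(counts, base=np.e)) converts Shannon
# entropy (natural log) into the Hill number of order 1, i.e. the "effective
# number" of equally common languages, which is easier to interpret than raw
# entropy. A small standalone example (assuming sk is skbio.diversity.alpha,
# as in the script above):
import numpy as np
from skbio.diversity import alpha as sk

even = np.array([10, 10, 10, 10])      # four equally common languages
print(np.exp(sk.shannon(even, base=np.e)))      # -> 4.0

skewed = np.array([37, 1, 1, 1])       # one language dominates
print(np.exp(sk.shannon(skewed, base=np.e)))    # -> about 1.4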
# rename home detection column
data = data.rename(columns={'home_unique_weeks': 'home_country'})

# filter only users who most likely live in Finland
data = data[data['home_country'].str.contains('Finland')]

# count language use without singletons
print('[INFO] - Calculating language diversities...')
data['ulangs'] = data['langs'].apply(lambda x: langcount(x)[0])
data['counts'] = data['langs'].apply(lambda x: langcount(x)[1])
data = data[data['counts'].map(lambda d: len(d)) > 0]  # drop empties if any exist

# calculate diversity metrics
data['dominance'] = data['counts'].apply(lambda x: sk.dominance(x))
data['berger'] = data['counts'].apply(lambda x: sk.berger_parker_d(x))
data['menhinick'] = data['counts'].apply(sk.menhinick)
data['simpson'] = data['counts'].apply(sk.simpson)
data['singles'] = data['counts'].apply(sk.singles)
data['shannon'] = data['counts'].apply(
    lambda x: np.exp(sk.shannon(x, base=np.e)))
data['unique'] = data['counts'].apply(sk.observed_otus)

# language counts to dictionary
data['langdict'] = data.apply(lambda x: dict(zip(x['ulangs'], x['counts'])),
                              axis=1)

# calculate Ellis et al. diversity metrics
data['divs'] = data['langdict'].apply(lang_entropy)

# calculate number of sentences per user
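# NOTE (illustration): langcount() is defined elsewhere in the project. Judging
# by how it is used above, it takes one user's list of detected sentence
# languages and returns a tuple of (unique languages, matching counts). A
# hypothetical sketch with collections.Counter, shown only to make the pipeline
# readable (the real helper may also drop singleton languages, per the comment
# above):
from collections import Counter


def langcount_sketch(langs):
    counted = Counter(langs)           # e.g. ['fi', 'fi', 'en'] -> {'fi': 2, 'en': 1}
    ulangs = list(counted.keys())      # unique languages
    counts = list(counted.values())    # counts aligned with ulangs
    return ulangs, counts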
areas.at[i, colname4] = (int(lposts) / int(lpostsum)) * 100

# get dominant language from selected columns
areas['propmax'] = areas[['fi_prop', 'en_prop', 'et_prop', 'ru_prop', 'sv_prop',
                          'es_prop', 'ja_prop', 'fr_prop', 'pt_prop',
                          'de_prop']].idxmax(axis=1)
areas['mean_propmax'] = areas[['fi_mean_prop', 'en_mean_prop', 'et_mean_prop',
                               'ru_mean_prop', 'sv_mean_prop', 'es_mean_prop',
                               'ja_mean_prop', 'fr_mean_prop', 'pt_mean_prop',
                               'de_mean_prop']].idxmax(axis=1)
areas['sum_propmax'] = areas[['fi_sum_prop', 'en_sum_prop', 'et_sum_prop',
                              'ru_sum_prop', 'sv_sum_prop', 'es_sum_prop',
                              'ja_sum_prop', 'fr_sum_prop', 'pt_sum_prop',
                              'de_sum_prop']].idxmax(axis=1)

# get all language column names
cols = list(areas[langlist].columns)

# loop over areas
print('[INFO] - Calculating diversity metrics per area..')
for i, row in areas.iterrows():
    # get counts of languages
    otus = list(row[cols])
    # drop zero counts
    otus = [count for count in otus if count != 0]
    # calculate diversity metrics
    areas.at[i, 'dominance'] = sk.dominance(otus)
    areas.at[i, 'berger'] = sk.berger_parker_d(otus)
    areas.at[i, 'menhinick'] = sk.menhinick(otus)
    areas.at[i, 'singletons'] = sk.singles(otus)
    areas.at[i, 'shannon'] = np.exp(sk.shannon(otus, base=np.e))
    areas.at[i, 'unique'] = sk.observed_otus(otus)

# save to file
print('[INFO] - Saving output geopackage...')
areas.to_file(args['output'], driver='GPKG')
print('[INFO] - ... done!')
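# NOTE (illustration): the 'propmax' columns built above rely on
# DataFrame.idxmax(axis=1), which returns the *column name* holding the largest
# value on each row, so 'propmax' stores labels such as 'fi_prop' rather than
# the proportion itself. A small standalone example:
import pandas as pd

df = pd.DataFrame({'fi_prop': [60.0, 10.0], 'en_prop': [30.0, 70.0]})
print(df[['fi_prop', 'en_prop']].idxmax(axis=1))   # row 0 -> 'fi_prop', row 1 -> 'en_prop'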