def make_collocation_graph(target, top=15, urns=None, cutoff=0, cut_val=2,
                           before=4, after=4, limit=1000):
    """Make a cascaded network from collocations.

    Finds the first-order collocates of *target*, then the collocates of
    those collocates, and links them into an undirected graph.

    Args:
        target: a word, or a space-separated string / list of words.
        top: number of top-scoring collocates to expand per word.
        urns: list of URN records (URN id in position 0); defaults to [].
        cutoff: kept for backward compatibility (not applied here).
        cut_val: minimum corpus frequency for words used in normalisation.
        before/after: collocation window sizes.
        limit: maximum number of concordances fetched for the target.

    Returns:
        networkx.Graph with words as nodes and collocation links as edges.
    """
    # Fresh list per call — avoids the shared mutable-default pitfall.
    if urns is None:
        urns = []

    # Aggregate corpus-wide frequencies used to normalise collocation scores.
    antall = Counter()
    for urn in urns:
        antall += get_freq(urn[0], top=0, cutoff=0)
    korpus_totalen = frame(antall, 'total')
    Total = korpus_totalen[korpus_totalen > cut_val]

    if isinstance(target, str):
        target = target.split()

    # First-order collocates of the target, scored against corpus totals.
    I = urn_coll_words(target, urns=urns, before=before, after=after,
                       limit=limit)
    toppis = frame(I[0]**1.2 / Total['total'], target[0]).sort_values(
        by=target[0], ascending=False)

    # Second-order: collocates of each top collocate (alphabetic words only).
    isgraf = dict()
    for word in toppis[:top].index:
        if word.isalpha():
            isgraf[word] = urn_coll(word, urns=urns, before=before, after=after)
    isframe = dict()
    for w in isgraf:
        isframe[w] = frame(isgraf[w], w)

    # Collect scored frames: the target's own, then one per expanded collocate.
    tops = dict()
    if len(target) == 1:
        tops[target[0]] = toppis
    else:
        tops['_'.join(target[:2])] = toppis
    for w in isframe:
        tops[w] = frame(isframe[w][w]**1.2 / Total['total'], w).sort_values(
            by=w, ascending=False)

    # Connect each word to its top collocates.
    edges = []
    for w in tops:
        edges += [(w, coll) for coll in tops[w][:top].index if coll.isalpha()]

    Ice = nx.Graph()
    Ice.add_edges_from(edges)
    return Ice
def ngavis(word, period):
    """Newspaper n-gram frequencies for *word* over *period*.

    Multi-word input is treated as a bigram (first two tokens); otherwise a
    unigram lookup is made. Returns an empty DataFrame when the lookup fails.
    """
    try:
        if " " in word:
            bigram = word.split()[:2]
            res = nb.frame(
                nb.bigram(first=bigram[0], second=bigram[1], period=period,
                          media='avis'),
                word)
        else:
            res = nb.frame(nb.unigram(word, period=period, media='avis'), word)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate; any API or parsing failure degrades to an empty frame.
        res = pd.DataFrame()
    return res
def unigram(word, period=(1950, 2020), media='bok', ddk=None, topic=None,
            gender=None, publisher=None, lang=None, trans=None, name=None):
    """Fetch unigram counts for *word* from the NB ngram API as a frame."""
    query = {
        'word': word,
        'ddk': ddk,
        'topic': topic,
        'gender': gender,
        'publisher': publisher,
        'lang': lang,
        'trans': trans,
        'period0': period[0],
        'period1': period[1],
        'media': media,
        'name': name,
    }
    response = requests.get("https://api.nb.no/ngram/unigrams", params=query)
    return nb.frame(dict(response.json()))
def coll_avis(word, title='%', before=5, after=5, datefrom="1800-01-01",
              dateto="2000-01-01", limit=1000):
    """Newspaper collocations for *word*, reshaped into a single-column frame."""
    raw = coll_newspaper(word, title=title, before=before, after=after,
                         datefrom=datefrom, dateto=dateto, limit=limit)
    first_row = nb.frame(raw).loc[0]
    return nb.frame(first_row.transpose())
def phrase_plots_level(phrase_set, title='*', period=(20100101, 20301231), media='aviser'):
    """One frequency column per phrase group, concatenated side by side."""
    columns = []
    for group in phrase_set:
        label = ', '.join(group)
        columns.append(nb.frame(get_df_level(group, title=title,
                                             period=period, media=media),
                                label))
    return pd.concat(columns, sort=False, axis=1)
def phrase_plots_anno(phrase_sets, title='aftenposten', fra=1960, til=2020,
                      rot=0, colours=('r', 'b', 'g')):
    """Plot yearly frequencies for each phrase group as a bar chart.

    Args:
        phrase_sets: iterable of phrase groups; each group becomes one series
            labelled with the comma-joined phrases.
        title: publication title forwarded to get_df.
        fra, til: year range — kept for backward compatibility; not applied
            in the current body (the binning code was disabled).
        rot: x-axis label rotation.
        colours: series colours; tuple default avoids the shared
            mutable-default-argument pitfall.

    Returns:
        None; the chart is drawn as a side effect.
    """
    df_all = []
    for f in phrase_sets:
        df_all.append(nb.frame(get_df(f, title=title), ', '.join(f)))
    df = pd.concat(df_all, sort=False)
    # Years arrive as strings; cast so sorting is numeric.
    df.index = df.index.astype(int)
    df = df.sort_index()
    df.plot(kind='bar', figsize=(15, 5), rot=rot, color=list(colours))
    return
def make_dtm(texts):
    """Build a document-term matrix from *texts*.

    Args:
        texts: mapping from document name to an iterable of token sequences
            (e.g. paragraphs as lists of words).

    Returns:
        pandas.DataFrame with one frequency column per document; empty frame
        for empty input (the original raised ValueError on pd.concat([])).
    """
    freqs = dict()
    for text in texts:
        print(text)  # progress indicator for long corpora
        c = Counter()
        for p in texts[text]:
            c.update(p)
        freqs[text] = nb.frame(c, text)
    if not freqs:
        return pd.DataFrame()
    return pd.concat(list(freqs.values()), axis=1, sort=False)
def ngbok(word, period, ddk=None, lang='nob'):
    """Book n-gram frequencies for *word* over *period*.

    Multi-word input uses the bigram endpoint (first two tokens), otherwise
    the unigram endpoint. Returns an empty DataFrame on any lookup failure.
    """
    try:
        if " " in word:
            bigram = word.split()[:2]
            res = nb.frame(
                nb.bigram(first=bigram[0], second=bigram[1], ddk=ddk,
                          period=period, media='bok', lang=lang),
                word)
        else:
            res = nb.frame(
                nb.unigram(word, period=period, ddk=ddk, media='bok',
                           lang=lang),
                word)
    except Exception:
        # Narrowed from a bare except: interrupts still propagate; API
        # failures degrade to an empty frame.
        res = pd.DataFrame()
    return res
def create_frame(coll, expected):
    """Build a freq/doc/dist frame from *coll* and score it against *expected*."""
    columns = 'freq doc dist'.split()
    df = nb.frame(frame(coll).transpose(), columns)
    df['score'] = dist(df['dist'], expected, df['freq'])
    return df
# try again - things may have loaded on the server... print('prøver en gang til for: ', (year, year + step)) try: colls[(year, year + step)] = collocation(word, yearfrom=year, yearto=year + step, corpus='avis', before=before, after=after) except: print('klarte ikke: ', (year, year + step)) colls_df = colls2df(colls, calculate_midpoint(before, after)) return colls_df, score_df(colls_df) score_df = lambda df: nb.frame({x: df[x]['score'] for x in df}).transpose() display_vals = lambda kr_df, word, clip=0: kr_df[kr_df >= clip].loc[word] def show_frame(df, colnum=0, clip=0, fillval=10, cmap='Blues', up=True, axis=0, first_row=0, number_of_rows=20): if up == True: cmap = cmap + '_r' dfc = df[df >= clip]
def ngbok(x, period, ddk=None):
    """Book unigram frequencies for *x* over *period*; empty frame on failure."""
    try:
        r = nb.frame(nb.unigram(x, period, media='bok', ddk=ddk), x)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate; API errors degrade to an empty frame.
        r = pd.DataFrame()
    return r
def frm(x, y):
    """Label frame *x* as *y* via nb.frame; empty frames pass through untouched."""
    if x.empty:
        return x
    return nb.frame(x, y)
#st.line_chart(tot) #if st.button('Sjekk fordeling i bøker'): if antall > 0: wordlist = allword urns = { w: nb.book_urn(words=[w], ddk=ddk, period=(period_slider[0], period_slider[1]), limit=antall) for w in wordlist } #data = {w: nb.aggregate_urns(urns[w]) for w in wordlist} #st.write([(w,urns[w]) for w in wordlist]) urner = lambda w: [x[0] for x in urns[w]] #st.write(urner(wordlist[0])) data = {'bok ' + w: nb.word_freq(urner(w), wordlist) for w in wordlist} st.markdown( "### Bøker som inneholder en av _{ws}_ i kolonnene, ordfrekvens i radene" .format(ws=', '.join(wordlist))) st.write('En diagonal indikerer at ordene gjensidig utelukker hverandre') st.write(nb.frame(data).transpose().fillna(0)) #st.write(df.loc[wordlist].fillna(0))
# Streamlit controls for sampling books by DDK class and period.
# Normalise the DDK input: empty string means "no filter"; the backend
# expects a trailing SQL-style wildcard on a concrete class.
if ddk == "":
    ddk = None
if ddk is not None and not ddk.endswith("%"):
    ddk = ddk + "%"
antall = st.number_input(
    'Antall bøker - jo fler jo lenger ventetid, forskjellige søk vil vanligvis gi nye bøker (trykk på +/- for starte nye søk',
    10)
# BUGFIX: the label previously said "mellom 1900 og 2014" although the
# slider's actual maximum is 2020 — the text now matches the widget.
period_slider = st.slider('Angi periode - år mellom 1900 og 2020', 1900, 2020,
                          (1950, 2010))
if words != "":
    # One URN sample per word, then aggregate word frequencies per sample.
    urns = {
        w: nb.book_urn(words=[w],
                       ddk=ddk,
                       period=(period_slider[0], period_slider[1]),
                       limit=antall)
        for w in wordlist
    }
    data = {w: nb.aggregate_urns(urns[w]) for w in wordlist}
    df = pd.concat([nb.frame(data[w], 'bøker ' + w) for w in wordlist], axis=1)
    st.markdown(
        "### Bøker som inneholder en av _{ws}_ i kolonnene, ordfrekvens i radene"
        .format(ws=', '.join(wordlist)))
    st.write('En diagonal indikerer at ordene gjensidig utelukker hverandre')
    st.write(df.loc[wordlist].fillna(0))
def collocations_from_nb(word, corpus, func=get_konkordanser):
    """Get a concordance, and count the words in it.

    Assumes konks reside in a dataframe with columns 'after' and 'before'.
    """
    concordance = nb.frame(func(word, corpus))
    # Elementwise concat of the two context columns, joined into one text.
    context = ' '.join(concordance['after'].values + concordance['before'].values)
    counts = Counter(tokenize(context))
    return nb.frame_sort(nb.frame(counts, word))
def count_from_conc(concordance):
    """From a concordance, count the words in it.

    Assumes konks reside in a dataframe with columns 'after' and 'before';
    the target word is read from the first row of the 'word' column.
    """
    word = concordance['word'][0]
    # Elementwise concat of the two context columns, joined into one text.
    context = ' '.join(concordance['after'].values + concordance['before'].values)
    counts = Counter(tokenize(context))
    return nb.frame_sort(nb.frame(counts, word))
def ngavis(x, period):
    """Newspaper unigram frequencies for *x* over *period*; empty frame on failure."""
    try:
        r = nb.frame(nb.unigram(x, period, media='avis'), x)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate; API errors degrade to an empty frame.
        r = pd.DataFrame()
    return r