def cwi2(sentences, words, zipf_freq):
    # Attach a Zipf frequency to each word tuple.
    word_freqs = [[[x[0], x[1], x[2], zipf_frequency(x[0], 'en')] for x in y]
                  for y in words]
    for i, wf_sort in enumerate(word_freqs):
        for ambg_word in wf_sort:
            # Only replace words rarer than the given Zipf threshold.
            if ambg_word[3] < zipf_freq:
                substitute = lesk(sentences[i], ambg_word)
                if substitute == 0:
                    continue
                if not substitute[0][0] > 0:
                    continue
                synset = substitute[0][1]
                subWord = synset.lemma_names()
                subWord = [[zipf_frequency(x, 'en'), x] for x in subWord]
                subWord = sorted(subWord, reverse=True)
                # Substitute the rare word with its most frequent synonym.
                sentences[i] = re.sub(ambg_word[0], subWord[0][1], sentences[i])
    return sentences
def canUse(candidate, past):
    """ Check whether a candidate is OK to use. """
    candidateFrequency = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
    candidateRootFrequency = max(
        candidateFrequency,
        wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large"))

    # Reject words that are too infrequent or too frequent (like "a" or "the")
    if candidateFrequency < 2.3 or candidateFrequency > 6:
        return False

    # Mostly, this rejects '#'-containing words
    if not candidate.isalpha():
        return False

    # Is it a bad word?
    if candidate in bad_words:
        return False

    # Now, we check if we've used a related word before.
    if any(map(lambda w: lexicallyRelated(candidate, w), past)):
        return False

    # If we have a relatively infrequent word that is too related to
    # our past three rounds of words, we should reject it.
    if sum(similarityScore(x, candidate) for x in past[-6:]) > 2 and candidateRootFrequency < 3.2:
        return False

    # otherwise, we're ok!
    return True
def rarest_word():
    f = open(file_name, "r", encoding='utf-8')
    d = enchant.Dict("en_US")
    lowest_freq_m = [8, ""]
    lowest_freq_n = [8, ""]
    for text in f:
        words = text.split(" ")
        for word in words:
            # Check that the word is a valid English word and that we have a frequency.
            if (len(word) > 0 and d.check(word)
                    and wordfreq.zipf_frequency(word, 'en') != 0):
                freq = wordfreq.zipf_frequency(word, 'en')
                text = text[text.find("-") + 2:len(text) - 1]
                if len(text) > 0:
                    if text[0] == "E" and freq < lowest_freq_m[0]:
                        lowest_freq_m[0] = freq
                        lowest_freq_m[1] = word
                    elif text[0] == "L" and freq < lowest_freq_n[0]:
                        lowest_freq_n[0] = freq
                        lowest_freq_n[1] = word
    f.close()
    return (lowest_freq_m, lowest_freq_n)
def cwi(sentences, words):
    # Attach a Zipf frequency to each word tuple.
    word_freqs = [[[x[0], x[1], x[2], zipf_frequency(x[0], 'en')] for x in y]
                  for y in words]
    # Sort each sentence's words from least to most frequent.
    word_freqs_sorted = [
        sorted(y, key=lambda x: x[3], reverse=False) for y in word_freqs
    ]
    for i, wf_sort in enumerate(word_freqs_sorted):
        substitute = lesk(sentences[i], wf_sort[0])
        if substitute == 0:
            continue
        if not substitute[0][0] > 0:
            continue
        synset = substitute[0][1]
        subWord = synset.lemma_names()
        subWord = [[zipf_frequency(x, 'en'), x] for x in subWord]
        subWord = sorted(subWord, reverse=True)
        # Substitute the rarest word with its most frequent synonym.
        sentences[i] = re.sub(wf_sort[0][0], subWord[0][1], sentences[i])
    return sentences
def sim(self, w1, w2):
    if w1 not in self.word_to_idx or w2 not in self.word_to_idx:
        return 0
    word_freq_factor = zipf_frequency(w1, "en") * zipf_frequency(w2, "en")
    return word_freq_factor * np.dot(self.vectors[self.word_to_idx[w1]],
                                     self.vectors[self.word_to_idx[w2]])
def add_entry(db, title, word):
    lemma = LEMMATIZER.lookup('en', word)[0]
    title = title.lower().split(" (")[0]
    # Skip extremely common words (Zipf frequency of 6 or above).
    if wordfreq.zipf_frequency(lemma, 'en') < 6 and wordfreq.zipf_frequency(
            word, 'en') < 6:
        db.execute(
            "INSERT OR IGNORE INTO words (page, word, lemma) VALUES (?, ?, ?)",
            (title, word, lemma))
def convert_text_to_keywords(text, high_freq, src_lang):
    clean = re.sub(r"[,.'`’'|—;:@#?¿!¡<>_\-\"”“&$\[\]\)\(\\\/]+\ *", " ", text)
    lowerString = clean.lower()
    words = lowerString.split(sep=None)
    print('words c', words)
    keywords = []
    for word in words:
        print('zipf_frequency(word, get_lang_code(src_lang))',
              zipf_frequency(word, get_lang_code(src_lang)))
        print('high_freq', high_freq)
        # Keep words that are not digits or paths, are longer than one character,
        # and are no more frequent than the given threshold.
        if (not word.isdigit() and "/" not in word and "\\" not in word
                and len(word) > 1
                and zipf_frequency(word, get_lang_code(src_lang)) <= high_freq):
            keywords.append(word)
    return keywords
def parse_readings(filename, words_traditional, words_simplified):
    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kCantonese", "kMandarin", "kDefinition"):
            continue

        character = chr(int(codepoint[2:], 16))
        entry_added = False

        if character in words_traditional:
            freq = zipf_frequency(character, "zh")
            for entry in words_traditional[character]:
                entry.add_freq(freq)
                if fieldname == "kCantonese":
                    entry.add_jyutping(content)
                elif fieldname == "kMandarin":
                    pin = convert_pinyin_to_tone_numbers(content, character)
                    entry.add_pinyin(pin)
                elif fieldname == "kDefinition":
                    entry.add_defs([("", x.strip()) for x in content.split(";")])
            entry_added = True

        if character in words_simplified:
            # Ignore simplified characters
            entry_added = True

        if not entry_added:
            trad = simp = character
            freq = zipf_frequency(trad, "zh")
            jyut = content if fieldname == "kCantonese" else ""
            pin = (convert_pinyin_to_tone_numbers(content, trad)
                   if fieldname == "kMandarin" else "")
            defs = ([("", x.strip()) for x in content.split(";")]
                    if fieldname == "kDefinition" else [])
            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words_traditional[trad].append(entry)
            words_simplified[simp].append(entry)
def parse_same_word_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 2 or line[0] == "詞彙":
            continue

        trad = line[0]
        simp = HanziConv.toSimplified(trad)
        pin = lazy_pinyin(
            trad,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pin = " ".join(pin).lower()
        pin = pin.strip().replace("v", "u:")
        jyut = pinyin_jyutping_sentence.jyutping(trad, tone_numbers=True, spaces=True)
        freq = zipf_frequency(trad, "zh")
        defs = [
            objects.DefinitionTuple("".join(jieba.cut(line[1])), "臺陸用法和差異", [])
        ]
        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.add(entry)
def wordfreqs(text):
    freqs = []
    for tok in wordfreq.tokenize(text, 'en'):
        freq = wordfreq.zipf_frequency(tok, 'en')
        if freq != 0:
            freqs.append(freq)
    return np.array(freqs)
def show_frequencies():
    show_info_text()
    if text_type.get() == 'youtube':
        createTextFileFromYoutube()
    source_text = get_source_text()
    if source_text:
        source_text = source_text.lower()
        excluded_words = []
        word_frequency_dictionary = dict()
        clean = re.sub(r"[,.'`’'|—;:@#?¿!¡<>_\-\"”“&$\[\]\)\(\\\/]+\ *", " ", source_text)
        words = clean.split()
        for word in words:
            if word_excluded(word):
                if word not in excluded_words:
                    excluded_words.append(word)
            elif not word.isdigit():
                word_frequency_dictionary[word] = round(
                    zipf_frequency(word, get_lang_code(str(src_language.get()))), 1)
        # Print the words grouped by Zipf frequency, from most frequent to least.
        for x in np.arange(0, 10, 0.1):
            x = 10 - x
            # Compare against the rounded value so it matches the rounded dictionary entries.
            clean_x = round(x, 1)
            words_with_x_freq = []
            for word, freq in word_frequency_dictionary.items():
                if freq == clean_x:
                    words_with_x_freq.append(word)
            if words_with_x_freq:
                printtk(str(clean_x))
                printtk(str(words_with_x_freq))
        if excluded_words:
            printtk('EXCLUDED:')
            printtk(str(excluded_words))
def get_bert_candidates(input_text, list_cwi_predictions, numb_predictions_displayed=10):
    list_candidates_bert = []
    for word, pred in zip(input_text.split(), list_cwi_predictions):
        if (pred and pos_tag([word])[0][1] in ['NNS', 'NN', 'VBP', 'RB', 'VBG', 'VBD']) \
                or zipf_frequency(word, 'en') < 3.1:
            replace_word_mask = input_text.replace(word, '[MASK]')
            text = f'[CLS]{replace_word_mask} [SEP] {input_text} [SEP] '
            tokenized_text = tokenizer.tokenize(text)
            masked_index = [
                i for i, x in enumerate(tokenized_text) if x == '[MASK]'
            ][0]
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            segments_ids = [0] * len(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            # Predict all tokens
            with torch.no_grad():
                outputs = model(tokens_tensor, token_type_ids=segments_tensors)
                predictions = outputs[0][0][masked_index]
            predicted_ids = torch.argsort(
                predictions, descending=True)[:numb_predictions_displayed]
            predicted_tokens = tokenizer.convert_ids_to_tokens(list(predicted_ids))
            list_candidates_bert.append((word, predicted_tokens))
    return list_candidates_bert
def add_zipf_frequency(word_count):
    words_stats = []
    for w, c in word_count.items():
        freq = zipf_frequency(w, "en")
        words_stats.append(WordStats(w, c, freq))
    return words_stats
def get_tag_TF(tag_num, tag2id, id2tag):
    ret = [0.0] * tag_num
    for key in tag2id:
        ret[tag2id[key]] = zipf_frequency(key, "en")
    print("get tag tf over!")
    return ret
def choose_word(words: list):
    toReplace = {}  # key = translated word, value = Zipf frequency
    current_min = 10
    chosen_index = 0
    translator = Translator()
    for word in words:
        translated = translator.translate(word).text
        if zipf_frequency(translated, 'en') == 0:
            # Handle invalid words by treating them as maximally frequent.
            toReplace[translated] = 10
        else:
            toReplace[translated] = zipf_frequency(translated, 'en')
        if toReplace[translated] < current_min:
            current_min = toReplace[translated]
            chosen_index = words.index(word)
    chosen_word = min(toReplace, key=toReplace.get)
    return {chosen_index: chosen_word}
def examine(s, szf, do_implicits=False, do_explicits=True):
    explicits = []
    if do_explicits:
        explicits = re.findall(r"#\w+", s)
        explicits = [a.replace("#", "").lower() for a in explicits]

    implicits = []
    if do_implicits:
        ss = "\n".join(
            list(
                filter(
                    lambda x: "#" not in x and "_" not in x and not re.match("'.+'", x),
                    s.split("\n")
                )
            )
        )
        spl = list(
            filter(
                lambda x: re.match(r"\w+", x)
                and x not in CatIndex.en_stops
                and x not in CatIndex.ru_stops
                and len(x) > 2,
                re.split(r"\s", ss)
            )
        )
        for a in spl:
            ru = CatIndex.ru.parse(a)
            for b in ru:
                for c in CatIndex.ru_linkable:
                    if str(b.tag).startswith(c):
                        zf = wordfreq.zipf_frequency(a, 'ru')
                        if zf <= szf:
                            implicits.append(a)
            try:
                en = nltk.pos_tag([a])
            except:
                pass
            else:
                for b in en:
                    for c in CatIndex.en_linkable:
                        if b[1].startswith(c):
                            zf = wordfreq.zipf_frequency(a, 'en')
                            if zf <= szf:
                                implicits.append(a)

    r = {
        "explicits": explicits,
        "implicits": implicits
    }
    return r
def make_dataframe(filename):
    # Read a pandas DataFrame from the csv data
    init = pd.read_csv(filename)

    # Detect terminal width for dataframe printing
    pd.options.display.width = 0

    # Keep only these three fields... Be careful (names will change after experiment)
    filtered = init[['positive', 'antonym', 'response']]

    # Strip leading/trailing whitespace and lowercase all stimuli and responses
    filtered['response'] = filtered['response'].apply(
        lambda word: word.lower().strip())
    filtered['positive'] = filtered['positive'].apply(
        lambda word: word.lower().strip())

    # Sort by stimuli and then response
    sorted_df = filtered.sort_values(['positive', 'response'])

    # Add a column for response count
    sorted_df = sorted_df.assign(ant_count=sorted_df.groupby(
        ['positive', 'response']).response.transform('count'))

    # Remove duplicates (because we have a count)
    sorted_df = sorted_df.drop_duplicates()

    # Calculate transition probability: specific_antonym.count() / all_antonyms.count()
    sorted_df = sorted_df.assign(
        trans_prob=sorted_df.groupby('positive').transform(lambda x: x / x.sum()))
    sorted_df = sorted_df.reset_index(drop=True)

    # Add bool column for morphological antonyms
    sorted_df = sorted_df.assign(is_morph=sorted_df['response'] == sorted_df['antonym'])

    # Add word frequencies for response words
    sorted_df = sorted_df.assign(freq_absolute=sorted_df['response'].apply(
        lambda word: zipf_frequency(word, 'en')))

    # Add relative word frequencies: response_freq - stimuli_freq
    sorted_df = sorted_df.assign(
        freq_relative=sorted_df['freq_absolute'] -
        sorted_df['positive'].apply(lambda word: zipf_frequency(word, 'en')))

    return sorted_df
def filter_by_freq(word_list, lower_freq_bound, upper_freq_bound):
    """Filters words to those whose Zipf frequency falls within the given bounds."""
    filtered_word_list = []
    for word in word_list:
        freq = wordfreq.zipf_frequency(word[0], 'zh', wordlist='large', minimum=0.0)
        if lower_freq_bound <= freq <= upper_freq_bound:
            filtered_word_list.append(word)
    return filtered_word_list
def identify_emerging_concepts(self, sent: Span, section: Section,
                               graph: ConceptGraph, rule_based=True):
    """Identify concepts in a given sentence that are likely to be emerging concepts.

    :param sent: A spaCy span representing a sentence in a document.
    :param section: The section the sentence appears in.
    :param graph: The concept graph to record the emerging concepts in.
    :param rule_based: Flag indicating whether or not to use the rule-based classifier.
    """
    if not rule_based:
        return

    for token in filter(lambda token: token.dep_ == 'ROOT', sent):
        concept_tokens = []

        if token.lemma_ == 'be':
            if len(list(filter(lambda right: right.dep_ == 'attr', token.rights))) > 0:
                concept_tokens = filter(lambda left: left.dep_.endswith('subj'),
                                        token.lefts)
        elif token.lemma_ == 'define':
            try:
                concept_tokens = list(
                    filter(lambda left: left.dep_.endswith('subjpass'), token.lefts))
            except StopIteration:
                concept_tokens = list(
                    filter(lambda right: right.dep_ == 'dobj', token.rights))
        elif token.lemma_ == 'call':
            concept_tokens = list(
                filter(lambda right: right.dep_ == 'oprd', token.rights))

        tokens = []
        for token in concept_tokens:
            tokens += token.subtree

        if len(tokens) > 0:
            if tokens[0].tag_ == 'DT':
                tokens = tokens[1:]

            tokens = filter(lambda token: len(token.text.strip()) > 0, tokens)
            node = Node(' '.join(map(lambda token: token.text, tokens)))

            if node != '' and zipf_frequency(node, 'en') < self.emerging_concept_frequency_cutoff:
                if node not in graph.nodes:
                    graph.add_node(node, section)
                graph.emerging_concepts.add(node)
def get_words(word, pos, hyper_hypo, recursive=False, depth=None):
    """Return a dict mapping str.lower() to Zipf frequency.

    If depth is None, depth defaults to 1.
    """
    ans = {}  # a dictionary
    # Find the relevant synsets for the word.
    for ss in wordnet.synsets(word, pos):
        if hyper_hypo == 'hyper':
            for x in ss.hypernyms(recursive, depth):
                ans.update({syn.replace('_', ' '): zipf_frequency(syn.replace('_', ' '), 'en')
                            for syn in x.synonyms})
        elif hyper_hypo == 'hypo':
            for x in ss.hyponyms(recursive, depth):
                ans.update({syn.replace('_', ' '): zipf_frequency(syn.replace('_', ' '), 'en')
                            for syn in x.synonyms})
    return ans
def add_freq(word_list, add_freq_to_output):
    """ Adds frequencies to output """
    if not add_freq_to_output:
        return word_list
    for word in word_list:
        freq = wordfreq.zipf_frequency(word[0], 'zh', wordlist='large', minimum=0.0)
        word.append(freq)
    return word_list
def predict_next_word(self, prompt, swype):
    """
    Parameters
    ----------
    prompt: str
        previous text
    swype: nuvox.swype.Swype
        swype object containing the key_trace needed for prediction; the entire
        object is passed in so that attributes can be updated for analytics.

    Returns
    -------
    ranked_suggestions: list[str]
        ranked list of suggested words
    """
    swype.key_trace = self.remove_blacklisted_keys(swype.key_trace)
    key_trace = copy.copy(swype.key_trace)

    if not key_trace:
        return []

    # Phase 0) Check if a punctuation key was selected.
    intended_punctuation = self.get_intended_punctuation(key_trace)
    if intended_punctuation:
        swype.word_to_trace_prob = swype.word_to_language_prob = swype.word_to_joint_prob = {intended_punctuation: 1.0}
        return [intended_punctuation]

    # Phase 1) Get dict mapping word --> prob(word | trace) for all possibly intended words using the trace algorithm.
    word_to_trace_prob = self.trace_algorithm.get_possible_word_to_trace_prob(key_id_sequence=key_trace)
    swype.word_to_trace_prob = word_to_trace_prob  # store in swype obj for analytics
    candidate_words = list(word_to_trace_prob)

    # Phase 2) Filter the list of candidates based on their frequency in the English language.
    candidate_words = list(sorted(candidate_words,
                                  key=lambda word: zipf_frequency(word, 'en'),
                                  reverse=True))[:self.config.MAX_SUGGESTIONS]

    # Phase 3) Get dict mapping word --> prob(word | prompt) for the candidate words using the language model.
    word_to_language_prob = self.language_model.get_candidate_word_probs(prompt,
                                                                         candidate_words=candidate_words,
                                                                         normalize=True)
    swype.word_to_language_prob = word_to_language_prob  # store in swype obj for analytics

    # Phase 4) Combine prob(word | trace) and prob(word | prompt) into a single weighted score.
    # TODO - need some sort of scaling factor to control the influence of each model
    w = 0.75  # relative weight on the trace probability vs the language model probability
    word_to_joint_prob = {word: ((w * word_to_trace_prob[word]) +
                                 ((1 - w) * word_to_language_prob[word]))
                          for word in candidate_words}
    swype.word_to_joint_prob = word_to_joint_prob  # store in swype obj for analytics

    ranked_suggestions = sorted(word_to_joint_prob.keys(),
                                key=lambda k: word_to_joint_prob.get(k, 0),
                                reverse=True)

    if self.need_to_capitalize(prompt):
        ranked_suggestions = [word.capitalize() for word in ranked_suggestions]

    return ranked_suggestions
def cal_gram_med(SLD, n):
    """
    Compute the median Zipf frequency of the n-grams of a string.
    :param SLD: the input string
    :param n: n-gram length
    :return: the median frequency
    """
    grams = [SLD[i:i + n] for i in range(len(SLD) - n + 1)]
    fre = list()
    for s in grams:
        fre.append(wordfreq.zipf_frequency(s, 'en'))
    return np.median(fre)
def get_features(token):
    token = token.replace('<EOS>', '')
    return pd.Series({
        'length': len(token),
        'logfreq': wordfreq.zipf_frequency(token, 'en'),
        'has_upper': 0 if token.lower() == token else 1,
        'has_punct': 1 if any(j in string.punctuation for j in token) else 0,
    })
def get_src_words_and_phrases(str, low_freq, high_freq, src_langcode, splitters):
    src_list = dict()
    rejected_words = []
    current_episode = 'ep1'
    episode_count = 0

    clean = re.sub(r"[,.;@#?¿!¡\-\"&$\[\]]+\ *", " ", str)
    lowerString = clean.lower()
    words = lowerString.split()

    for word in words:
        word = word.strip()
        if word in splitters:
            current_episode = word
            episode_count += 1
        else:
            if not word.isdigit() and word not in rejected_words:
                if word in src_list:
                    if current_episode not in src_list[word]:
                        src_list[word].append(current_episode)
                else:
                    word_src_freq = zipf_frequency(word, src_langcode)
                    if word_src_freq < high_freq:
                        if word_src_freq > low_freq:
                            src_list[word] = [current_episode]
                            if print_added:
                                print('word added', word)
                        else:
                            rejected_words.append(word)
                            if print_rejected_words:
                                print(' word too rare', word)
                    else:
                        rejected_words.append(word)
                        if print_rejected_words:
                            print(' word too common', word)

    current_episode = 'ep1'
    clean_phrases = re.sub(r"[;@#\"&$\[\]]+\ *", " ", str)
    clean_phrases_punct = re.sub(r"[,.;@#?¿!¡\-\"&$]+\ *", ".", clean_phrases)
    phrases = clean_phrases_punct.split(".")

    for phrase in phrases:
        if phrase in splitters:
            current_episode = phrase
        else:
            if len(phrase.split()) > 1 and phrase:
                if phrase in src_list:
                    if current_episode not in src_list[phrase]:
                        src_list[phrase].append(current_episode)
                else:
                    if print_added:
                        print('phrase added', phrase)
                    src_list[phrase] = [current_episode]

    sortedDict = sorted(src_list.items(), key=lambda x: x[1])
    return episode_count, sortedDict
def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
            exists = True
            break
    if not exists:
        # Part four: choose the token based on frequency, or search and give it a
        # score based on frequency.
        freq = zipf_frequency(parsed_token, 'fa')
        if freq < 6:
            result_tokens.append(parsed_token)
        else:
            less_accurate_tokens.append(parsed_token)
def disambiguate(sentence, algorithm=simple_lesk, context_is_lemmatized=False,
                 similarity_option='path', keepLemmas=False, prefersNone=True, zipf=5):
    tagged_sentence = []

    # Pre-lemmatize the sentence before WSD.
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: misses out on POS specification, how to resolve?

    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            if zipf_frequency(lemma, 'en') < zipf:
                try:
                    wn.synsets(lemma)[0]
                    if algorithm == original_lesk:
                        # Note: the original Lesk doesn't care about lemmas.
                        synset = algorithm(lemma_sentence, lemma)
                    elif algorithm == max_similarity:
                        synset = algorithm(lemma_sentence, lemma, pos=pos,
                                           option=similarity_option)
                    else:
                        synset = algorithm(lemma_sentence, lemma, pos=pos,
                                           context_is_lemmatized=True)
                except:
                    # In case the content word is not in WordNet.
                    synset = '#NOT_IN_WN#'
            else:
                synset = '#under_zipf#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'

        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))

    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') else (word, tag)
                           for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') else (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
def get_translation(src_text, dest_langcode, src_langcode):
    dest_text = translate(src_text, dest_langcode, src_langcode)
    print('dest_text', dest_text)

    word_src_freq = 0
    word_dest_freq = 0
    should_make_note = True

    if src_text == dest_text or dest_text == '':
        translation_attempt = 1  # look into using turkey vpn instead of this sleep
        word_src_freq = zipf_frequency(src_text, src_langcode)
        word_dest_freq = zipf_frequency(src_text, dest_langcode)
        if word_src_freq >= word_dest_freq:
            # Retry the translation a few times, waiting longer each time.
            while translation_attempt < 5:
                time.sleep(translation_attempt)
                dest_text = translate(src_text, dest_langcode, src_langcode)
                if src_text == dest_text:
                    translation_attempt += translation_attempt
                else:
                    translation_attempt = 5
        else:
            should_make_note = False
            print('rejected because more common in dest:', src_text,
                  word_src_freq, word_dest_freq)

    if dest_text != '':
        add_translation_to_local_dictionary(src_text, dest_text, dest_langcode, src_langcode)

    if not should_make_note:
        dest_text = ''

    return dest_text
def _categorise_nodes(self, graph_based=True):
    """Categorise nodes in the graph into 'a priori' and 'emerging' concepts.

    Nodes that are only referenced from one section represent 'a priori references',
    all other nodes represent 'emerging concepts'.

    :param graph_based: Flag indicating whether or not to use the graph-based
        classification method.
    """
    if graph_based:
        for section in self.sections:
            for node in self.section_listings[section]:
                # Skip concepts that have already been categorised during parsing.
                if node in self.emerging_concepts:
                    continue

                referencing_sections = set()
                for tail in self.adjacency_index[node]:
                    tail_section = self.section_index[tail]
                    referencing_sections.add(tail_section)

                if len(referencing_sections) > 1 and zipf_frequency(node, 'en') < self.emerging_concept_frequency_cutoff:
                    self.emerging_concepts.add(node)
                else:
                    self.a_priori_concepts.add(node)

    # Enforce transitivity of concept labels such that all concepts that contain an emerging concept as a
    # constituent are also labelled as emerging concepts.
    for node in self.nodes:
        found = False
        for token in node.split(' '):
            for other in self.emerging_concepts:
                if token in other.split(' '):
                    self.a_priori_concepts.discard(node)
                    self.emerging_concepts.add(node)
                    found = True
                if found:
                    break
            if found:
                break

        # Need to add a priori concepts here if we are not using the graph-based classifier
        # (i.e. we are using the rule-based classifier)
        if not graph_based and not found:
            self.a_priori_concepts.add(node)
def generate_wordlist(n: int) -> List[Tuple[str, str]]:
    wiki = WiktionarySearcher()
    words = []
    while len(words) < n:
        try:
            word = wiki.generate_word()
        except Exception:
            continue
        word, meanings = word.text, word.meanings
        if not meanings:
            continue
        freq = zipf_frequency(word, 'ru')
        # Keep only rare words (Zipf frequency of 1 or below).
        if freq <= 1:
            words.append((word, random.choice(meanings)))
    return words
import csv
import html
import sys

import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

pairs = {}
with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]
        minfreq = min(freqs)
        avgfreq = sum(freqs) / float(len(freqs))
        pairs[row[0]] = (minfreq, avgfreq, row[1])

pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))