def run(unigram_counter=None, bigram_counter=None, trigram_counter=None,
        max_word_types=1000, n_neighbors=9, n_eigenvectors=11,
        min_context_count=3):
    word_freq_pairs = double_sorted(unigram_counter.items(),
                                    key=lambda x: x[1], reverse=True)

    if len(word_freq_pairs) > max_word_types:
        wordlist = [word for word, _ in word_freq_pairs[:max_word_types]]
    else:
        wordlist = [word for word, _ in word_freq_pairs]

    n_words = len(wordlist)

    # compute the context array,
    # plus the words_to_contexts and contexts_to_words dicts
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_counter, trigram_counter, min_context_count)

    # compute the shared context master matrix
    shared_context_matrix = context_array.dot(context_array.T).todense()
    del context_array

    # compute the diameter
    diameter = normalize(n_words, shared_context_matrix)

    # compute the incidence graph
    incidence_graph = compute_incidence_graph(n_words, diameter,
                                              shared_context_matrix)
    del shared_context_matrix

    # compute the Laplacian matrix
    laplacian_matrix = compute_laplacian(diameter, incidence_graph)
    del diameter
    del incidence_graph

    # compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = compute_eigenvectors(laplacian_matrix)
    del laplacian_matrix

    # compute distances between words:
    # take the first n_eigenvectors columns of the eigenvector matrix
    coordinates = eigenvectors[:, :n_eigenvectors]
    word_distances = compute_words_distance(coordinates)
    del coordinates
    del eigenvalues

    # compute the nearest neighbors
    nearest_neighbors = compute_closest_neighbors(word_distances, n_neighbors)

    words_to_neighbors = dict()

    for i in range(len(wordlist)):
        line = nearest_neighbors[i]
        word_idx, neighbors_idx = line[0], line[1:]
        word = wordlist[word_idx]
        neighbors = [wordlist[idx] for idx in neighbors_idx]
        words_to_neighbors[word] = neighbors

    return words_to_neighbors, words_to_contexts, contexts_to_words

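# A minimal usage sketch for run() above. This demo is an addition for
# illustration, not part of the original module: it assumes the package-level
# reader linguistica.read_corpus() and the word_*gram_counter() methods that
# appear elsewhere in this file.
def _demo_run(corpus_path):
    import linguistica as lxa
    lxa_object = lxa.read_corpus(corpus_path)
    words_to_neighbors, words_to_contexts, contexts_to_words = run(
        unigram_counter=lxa_object.word_unigram_counter(),
        bigram_counter=lxa_object.word_bigram_counter(),
        trigram_counter=lxa_object.word_trigram_counter(),
        max_word_types=1000, n_neighbors=9,
        n_eigenvectors=11, min_context_count=3)
    # e.g. the nine nearest syntactic-semantic neighbors of 'the':
    return words_to_neighbors.get('the')
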
def _make_wordlist(self):
    """
    Construct the wordlist, sorted by word frequency in descending order.
    (So "the" will most likely be the first word for written English.)
    """
    word_counter = self.word_unigram_counter()
    word_counter_sorted = double_sorted(word_counter.items(),
                                        key=lambda x: x[1], reverse=True)
    self._wordlist = [word for word, _ in word_counter_sorted]

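# "double_sorted" is imported from the package's utilities and is not defined
# in this file. A minimal sketch of its presumed behavior (an assumption, not
# the actual implementation): sort by the given key, breaking ties
# deterministically by first sorting the items themselves. Python's sort is
# stable, so equal-frequency words stay in alphabetical order even with
# reverse=True.
def _double_sorted_sketch(items, key=lambda x: x, reverse=False):
    return sorted(sorted(items), key=key, reverse=reverse)
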
def create_major_display_table(input_iterable, key=lambda x: x,
                               reverse=False, headers=None,
                               row_cell_functions=None, cutoff=0,
                               set_text_alignment=None):
    """
    This is a general function for creating a tabular display
    for the major display.
    """
    if not input_iterable:
        print('Warning: input is empty', flush=True)
        return

    if not hasattr(input_iterable, '__iter__'):
        print('Warning: input is not an iterable', flush=True)
        return

    number_of_headers = len(headers)
    number_of_columns = len(row_cell_functions)
    if number_of_headers != number_of_columns:
        print("headers and cell functions don't match", flush=True)
        return

    len_input = len(input_iterable)

    table_widget = QTableWidget()
    table_widget.clear()
    table_widget.setSortingEnabled(False)

    # set up row count
    if cutoff and cutoff < len_input:
        actual_cutoff = cutoff
    else:
        actual_cutoff = len_input
    table_widget.setRowCount(actual_cutoff)

    # set up column count and table headers
    table_widget.setColumnCount(number_of_headers)
    table_widget.setHorizontalHeaderLabels(headers)

    # fill in the table
    # (check the cutoff before filling a row; the original checked after,
    # which attempted to write one row past the end of the table)
    for row, x in enumerate(double_sorted(input_iterable, key=key,
                                          reverse=reverse)):
        if row >= actual_cutoff:
            break

        for col, fn in enumerate(row_cell_functions):
            cell = fn(x)

            if isinstance(cell, (int, float)):
                # cell is numeric
                item = QTableWidgetItem()
                item.setData(Qt.EditRole, cell)
            else:
                # cell is not numeric
                item = QTableWidgetItem(cell)

            if set_text_alignment:
                for align_col, alignment in set_text_alignment:
                    if col == align_col:
                        item.setTextAlignment(alignment)

            table_widget.setItem(row, col, item)

    table_widget.setSortingEnabled(True)
    table_widget.resizeColumnsToContents()
    return table_widget

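# Hypothetical GUI usage of create_major_display_table(); an illustration,
# not part of the original module. It assumes PyQt5 is installed and uses
# made-up word-count data.
def _demo_major_display_table():
    import sys
    from PyQt5.QtWidgets import QApplication
    app = QApplication(sys.argv)
    word_counts = [('the', 410), ('of', 209), ('and', 177)]
    table = create_major_display_table(
        word_counts, key=lambda x: x[1], reverse=True,
        headers=['Word', 'Count'],
        row_cell_functions=[lambda x: x[0], lambda x: x[1]],
        cutoff=0)
    table.show()
    sys.exit(app.exec_())
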
def get_array(wordlist, bigram_to_freq, trigram_to_freq, min_context_count):
    # enumerate() avoids the O(n^2) cost of calling wordlist.index()
    # once per word
    worddict = {word: idx for idx, word in enumerate(wordlist)}

    # convert the bigram and trigram counter dicts into lists and sort them;
    # throw away bi/trigrams whose frequency is below min_context_count
    bigram_to_freq_sorted = [(bigram, freq) for bigram, freq in
                             double_sorted(bigram_to_freq.items(),
                                           key=lambda x: x[1], reverse=True)
                             if freq >= min_context_count]
    trigram_to_freq_sorted = [(trigram, freq) for trigram, freq in
                              double_sorted(trigram_to_freq.items(),
                                            key=lambda x: x[1], reverse=True)
                              if freq >= min_context_count]

    # This is necessary so we can reference variables from inner functions
    class Namespace:
        pass

    ns = Namespace()
    ns.n_contexts = 0
    # We use "n_contexts" to keep track of how many unique contexts there
    # are. Conveniently, n_contexts also serves to provide a unique context
    # index whenever the program encounters a new context. The dummy class
    # Namespace makes it possible to refer to and update n_contexts within
    # the inner functions "contexts_increment" and "add_word" inside this
    # "get_array" function.

    def contexts_increment():
        tmp = ns.n_contexts
        ns.n_contexts += 1
        return tmp

    contextdict = defaultdict(contexts_increment)
    # key: context (e.g., the tuple ('of', '_', 'cat') as a 3-gram context
    #      for 'the')
    # value: context index (int)
    # This dict is analogous to worddict, where each key is a word (str)
    # and each value is a word index (int).

    # entries for the sparse matrix
    rows = []  # row numbers are word indices
    cols = []  # column numbers are context indices
    values = []

    words_to_contexts = dict()
    contexts_to_words = dict()

    for word in worddict.keys():
        words_to_contexts[word] = dict()

    def add_word(current_word, current_context, occurrence_count):
        word_no = worddict[current_word]
        context_no = contextdict[current_context]
        rows.append(word_no)
        cols.append(context_no)

        # if we use 1, we assume "type" counts.
        # What if we use occurrence_count (--> "token" counts)?
        values.append(1)

        # update words_to_contexts and contexts_to_words
        if current_context not in words_to_contexts[current_word]:
            words_to_contexts[current_word][current_context] = 0
        if current_context not in contexts_to_words:
            contexts_to_words[current_context] = dict()
        if current_word not in contexts_to_words[current_context]:
            contexts_to_words[current_context][current_word] = 0

        words_to_contexts[current_word][current_context] += occurrence_count
        contexts_to_words[current_context][current_word] += occurrence_count

    for trigram, freq in trigram_to_freq_sorted:
        word1, word2, word3 = trigram
        context1 = ('_', word2, word3)
        context2 = (word1, '_', word3)
        context3 = (word1, word2, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)
        if word3 in words_to_contexts:
            add_word(word3, context3, freq)

    for bigram, freq in bigram_to_freq_sorted:
        word1, word2 = bigram
        context1 = ('_', word2)
        context2 = (word1, '_')

        if word1 in words_to_contexts:
            add_word(word1, context1, freq)
        if word2 in words_to_contexts:
            add_word(word2, context2, freq)

    # csr_matrix is scipy.sparse's compressed sparse row matrix
    context_array = sparse.csr_matrix((values, (rows, cols)),
                                      shape=(len(worddict),
                                             ns.n_contexts + 1),
                                      dtype=np.int64)

    return context_array, words_to_contexts, contexts_to_words

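# Toy demonstration of get_array() above; an illustration, not part of the
# original module. The counters are made up, and min_context_count=1 so
# nothing is filtered out.
def _demo_get_array():
    wordlist = ['the', 'cat', 'sat']
    bigram_to_freq = {('the', 'cat'): 2, ('cat', 'sat'): 1}
    trigram_to_freq = {('the', 'cat', 'sat'): 1}
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_to_freq, trigram_to_freq, min_context_count=1)
    # context_array is a words-by-contexts scipy CSR matrix
    print(context_array.shape)
    # e.g. 'cat' occurs in the bigram context ('the', '_') with count 2
    # and in the trigram context ('the', '_', 'sat') with count 1
    print(words_to_contexts['cat'])
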
def __init__(self, file_path=None, wordlist_file=False, corpus_object=None,
             wordlist_object=None, encoding=ENCODING, **kwargs):
    self.file_abspath = self._check_file_path(file_path)

    if self.file_abspath is None:
        self.directory = None
    else:
        self.directory = os.path.dirname(self.file_abspath)

    self.file_is_wordlist = wordlist_file
    self.encoding = encoding
    self.corpus_object = corpus_object
    self.wordlist_object = wordlist_object
    self.parameters_ = self._determine_parameters(**kwargs)

    # number of word types and tokens
    self._number_of_word_types = None
    self._number_of_word_tokens = None

    # word ngrams
    self._word_unigram_counter = None
    self._word_bigram_counter = None
    self._word_trigram_counter = None

    # wordlist
    self._wordlist = None

    if self.wordlist_object is not None:
        # self.wordlist_object is
        # either an iterable or a dict of word-count pairs

        if type(self.wordlist_object) is dict:
            word_count_dict = dict()

            if self.parameters_['keep_case']:
                word_count_dict = self.wordlist_object
            else:
                # .items() is required here; iterating over the dict
                # itself yields only the keys
                for word, count in self.wordlist_object.items():
                    word = word.lower()
                    if word not in word_count_dict:
                        word_count_dict[word] = 0
                    word_count_dict[word] += count

            self._wordlist = [word for word, _ in
                              double_sorted(word_count_dict.items(),
                                            key=lambda x: x[1],
                                            reverse=True)]
            self._word_unigram_counter = word_count_dict

        elif hasattr(self.wordlist_object, '__iter__'):
            if self.parameters_['keep_case']:
                self._wordlist = sorted(set(self.wordlist_object))
            else:
                self._wordlist = sorted(
                    set(w.lower() for w in self.wordlist_object))
            self._word_unigram_counter = {w: 1 for w in self._wordlist}

        else:
            raise TypeError('wordlist object must be a dict of word-count '
                            'pairs or an iterable of words')

    # corpus file object
    if self.corpus_object is not None:
        # self.corpus_object is either a list of strings or a long str
        if type(self.corpus_object) is list:
            corpus_str = fix_punctuations(' '.join(self.corpus_object))
        elif type(self.corpus_object) is six.text_type:
            corpus_str = fix_punctuations(self.corpus_object)
        else:
            raise TypeError('corpus object must be either a str '
                            'or a list of strings')
        self.corpus_file_object = StringIO(corpus_str)
    elif self.file_abspath and not self.file_is_wordlist:
        self.corpus_file_object = open(self.file_abspath,
                                       encoding=self.encoding)
    else:
        self.corpus_file_object = None

    # wordlist file object
    if self.file_is_wordlist:
        self.wordlist_file_object = open(self.file_abspath,
                                         encoding=self.encoding)
    else:
        self.wordlist_file_object = StringIO()

    # manifold-related objects
    self._words_to_neighbors = None
    self._words_to_contexts = None
    self._contexts_to_words = None
    self._neighbor_graph = None

    # phon objects
    self._phone_unigram_counter = None
    self._phone_bigram_counter = None
    self._phone_trigram_counter = None
    self._phone_dict = None
    self._biphone_dict = None
    self._word_dict = None
    self._words_to_phones = None

    # trie objects
    self._broken_words_left_to_right = None
    self._broken_words_right_to_left = None
    self._successors = None
    self._predecessors = None

    Lexicon_BiSig.__init__(self, self.wordlist(),
                           self.parameters_['min_stem_length'],
                           self.parameters_['max_affix_length'],
                           self.parameters_['min_sig_count'],
                           self.parameters_['suffixing'])

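# Hypothetical instantiation of the lexicon class this __init__ belongs to.
# An illustration, not part of the original module: the class name "Lexicon"
# and the word counts are assumptions.
def _demo_lexicon():
    lexicon = Lexicon(wordlist_object={'the': 410, 'Of': 3, 'of': 209})
    # with keep_case off, 'Of' and 'of' merge into a single type with
    # count 212, so the wordlist starts ['the', 'of', ...]
    return lexicon._wordlist
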
def output_all_results(self, directory=None, verbose=False, test=False):
    """
    Output all Linguistica results to *directory*.

    :param directory: output directory. If not specified, it defaults
        to the current directory given by ``os.getcwd()``.
    """
    if not directory:
        output_dir = os.getcwd()
    else:
        output_dir = os.path.abspath(directory)

    # ----------------------------------------------------------------------
    if self.corpus_file_object:
        vprint(verbose, 'ngram objects')

        fname = 'word_bigrams.txt'
        obj = double_sorted(self.word_bigram_counter().items(),
                            key=lambda x: x[1], reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Word bigrams',
                     headers=['Word bigram', 'Count'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1]],
                     column_widths=[50, 10],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

        fname = 'word_trigrams.txt'
        obj = double_sorted(self.word_trigram_counter().items(),
                            key=lambda x: x[1], reverse=True)
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Word trigrams',
                     headers=['Word trigram', 'Count'],
                     row_functions=[lambda x: ' '.join(x[0]),
                                    lambda x: x[1]],
                     column_widths=[75, 10],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

    # ----------------------------------------------------------------------
    vprint(verbose, 'morphological signature objects')

    fname = 'stems_to_words.txt'
    obj = double_sorted(self.stems_to_words().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Stems to words '
                       '(descending order of word count)',
                 headers=['Stem', 'Word count', 'Words'],
                 row_functions=[lambda x: x[0],
                                lambda x: len(x[1]),
                                lambda x: ', '.join(sorted(x[1]))],
                 column_widths=[15, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'stems_to_words.txt'
    obj = double_sorted(self.stems_to_words().items(),
                        key=lambda x: x[0], reverse=False)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Stems to words '
                       '(alphabetical order of stems)',
                 headers=['Stem', 'Word count', '1st 10 words'],
                 # truncated to 10 words to match the header
                 row_functions=[lambda x: x[0],
                                lambda x: len(x[1]),
                                lambda x: ', '.join(sorted(x[1])[:10])],
                 column_widths=[15, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'signatures_to_stems.txt'
    obj = double_sorted(self.signatures_to_stems().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Signatures to stems',
                 headers=['Signature', 'Stem count', 'Stems'],
                 row_functions=[lambda x: SEP_SIG.join(x[0]),
                                lambda x: len(x[1]),
                                lambda x: ', '.join(sorted(x[1]))],
                 column_widths=[30, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'signatures_to_stems_truncated.txt'
    obj = double_sorted(self.signatures_to_stems().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Signatures to stems '
                       '(first 10 stems for each sig)',
                 headers=['Signature', 'Stem count', '1st 10 stems'],
                 row_functions=[lambda x: SEP_SIG.join(x[0]),
                                lambda x: len(x[1]),
                                lambda x: ' '.join(sorted(x[1])[:10])],
                 column_widths=[30, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'stems_to_signatures.txt'
    obj = double_sorted(self.stems_to_signatures().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Stems to signatures',
                 headers=['Stems', 'Signatures'],
                 row_functions=[lambda x: x[0],
                                lambda x: ', '.join(SEP_SIG.join(sig)
                                                    for sig in sorted(x[1]))],
                 column_widths=[15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'words_to_signatures.txt'
    obj = double_sorted(self.words_to_signatures().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Words to signatures',
                 headers=['Word', 'Sig count', 'Signatures'],
                 row_functions=[lambda x: x[0],
                                lambda x: len(x[1]),
                                lambda x: ', '.join(SEP_SIG.join(sig)
                                                    for sig in sorted(x[1]))],
                 column_widths=[25, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'signatures_to_words.txt'
    obj = double_sorted(self.signatures_to_words().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Signatures to words',
                 headers=['Signature', 'Word count', 'Words'],
                 row_functions=[lambda x: SEP_SIG.join(x[0]),
                                lambda x: len(x[1]),
                                lambda x: ', '.join(sorted(x[1]))],
                 column_widths=[20, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'signatures_to_words_truncated.txt'
    obj = double_sorted(self.signatures_to_words().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Signatures to words '
                       '(first 10 words for each sig)',
                 headers=['Signature', 'Word count', '1st 10 words'],
                 row_functions=[lambda x: SEP_SIG.join(x[0]),
                                lambda x: len(x[1]),
                                lambda x: ', '.join(sorted(x[1])[:10])],
                 column_widths=[20, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'words_to_sigtransforms.txt'
    obj = double_sorted(self.words_to_sigtransforms().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Words to sigtransforms',
                 headers=['Word', 'Signature transforms'],
                 row_functions=[lambda x: x[0],
                                lambda x: ', '.join(
                                    SEP_SIG.join(sig) + SEP_SIGTRANSFORM + affix
                                    for sig, affix in sorted(x[1]))],
                 column_widths=[20, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'affixes_to_signatures.txt'
    obj = double_sorted(self.affixes_to_signatures().items(),
                        key=lambda x: len(x[1]), reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Affixes to signatures',
                 headers=['Affix', 'Sig count', 'Signatures'],
                 row_functions=[lambda x: x[0],
                                lambda x: len(x[1]),
                                lambda x: ', '.join(SEP_SIG.join(sig)
                                                    for sig in sorted(x[1]))],
                 column_widths=[15, 15, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    # ----------------------------------------------------------------------
    if self.corpus_file_object:
        vprint(verbose, 'manifold objects')

        fname = 'words_to_neighbors.txt'
        obj = list()  # list of (word, list of neighbor words) tuples
        for word in self.wordlist()[:self.parameters()['max_word_types']]:
            obj.append((word, self.words_to_neighbors()[word]))
        f_path = os.path.join(output_dir, fname)
        output_latex(obj, f_path,
                     title='Words to neighbors',
                     headers=['Word', 'Neighbors'],
                     row_functions=[lambda x: x[0],
                                    lambda x: ' '.join(x[1])],
                     column_widths=[25, 0],
                     lxa_parameters=self.parameters(),
                     test=test, encoding=self.encoding,
                     number_of_word_types=self.number_of_word_types(),
                     number_of_word_tokens=self.number_of_word_tokens(),
                     input_file_path=self.file_abspath)
        vprint(verbose, '\t' + fname)

    # ----------------------------------------------------------------------
    vprint(verbose, 'phon objects')

    def output_latex_for_phon_words(obj_, f_path_, title_, lxa_parameters_,
                                    test_, encoding_, number_of_word_types_,
                                    number_of_word_tokens_, input_file_path_):
        output_latex(obj_, f_path_,
                     title=title_,
                     headers=['Word', 'Count', 'Frequency', 'Phones',
                              'Unigram plog', 'Avg unigram plog',
                              'Bigram plog', 'Avg bigram plog'],
                     row_functions=[lambda x: x[0],
                                    lambda x: x[1].count,
                                    lambda x: '%.6f' % x[1].frequency,
                                    lambda x: ' '.join(x[1].phones),
                                    lambda x: '%8.3f' % x[1].unigram_plog,
                                    lambda x: '%8.3f' % x[1].avg_unigram_plog,
                                    lambda x: '%8.3f' % x[1].bigram_plog,
                                    lambda x: '%8.3f' % x[1].avg_bigram_plog],
                     column_widths=[35, 10, 15, 60, 15, 15, 15, 15],
                     lxa_parameters=lxa_parameters_,
                     test=test_, encoding=encoding_,
                     number_of_word_types=number_of_word_types_,
                     number_of_word_tokens=number_of_word_tokens_,
                     input_file_path=input_file_path_)

    fname = 'wordlist.txt'
    obj_word_phon = list()  # list of (word, phonology info) tuples
    for word in self.wordlist():
        obj_word_phon.append((word, self.word_phonology_dict()[word]))
    f_path = os.path.join(output_dir, fname)
    output_latex_for_phon_words(obj_word_phon, f_path,
                                'Wordlist sorted by word count',
                                self.parameters(), test, self.encoding,
                                self.number_of_word_types(),
                                self.number_of_word_tokens(),
                                self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'wordlist_by_avg_unigram_plog.txt'
    obj_unigram_plog = double_sorted(obj_word_phon,
                                     key=lambda x: x[1].avg_unigram_plog,
                                     reverse=False)
    f_path = os.path.join(output_dir, fname)
    output_latex_for_phon_words(obj_unigram_plog, f_path,
                                'Wordlist sorted by avg unigram plog',
                                self.parameters(), test, self.encoding,
                                self.number_of_word_types(),
                                self.number_of_word_tokens(),
                                self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'wordlist_by_avg_bigram_plog.txt'
    obj_bigram_plog = double_sorted(obj_word_phon,
                                    key=lambda x: x[1].avg_bigram_plog,
                                    reverse=False)
    f_path = os.path.join(output_dir, fname)
    output_latex_for_phon_words(obj_bigram_plog, f_path,
                                'Wordlist sorted by avg bigram plog',
                                self.parameters(), test, self.encoding,
                                self.number_of_word_types(),
                                self.number_of_word_tokens(),
                                self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'phones.txt'
    obj = double_sorted(self.phone_dict().items(),
                        key=lambda x: x[1].count, reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Phones',
                 headers=['Phone', 'Count', 'Frequency', 'Plog'],
                 row_functions=[lambda x: x[0],
                                lambda x: x[1].count,
                                lambda x: '%.6f' % x[1].frequency,
                                lambda x: '%8.3f' % x[1].plog],
                 column_widths=[10, 10, 15, 15],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'biphones.txt'
    obj = double_sorted(self.biphone_dict().items(),
                        key=lambda x: x[1].count, reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Biphones',
                 headers=['Biphone', 'Count', 'Frequency',
                          'MI', 'Weighted MI'],
                 row_functions=[lambda x: ' '.join(x[0]),
                                lambda x: x[1].count,
                                lambda x: '%.6f' % x[1].frequency,
                                lambda x: '%8.3f' % x[1].MI,
                                lambda x: '%8.3f' % x[1].weighted_MI],
                 column_widths=[10, 10, 15, 15, 15],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'triphones.txt'
    obj = double_sorted(self.phone_trigram_counter().items(),
                        key=lambda x: x[1], reverse=True)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Triphones',
                 headers=['Triphone', 'Count'],
                 row_functions=[lambda x: ' '.join(x[0]),
                                lambda x: x[1]],
                 column_widths=[15, 10],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    # ----------------------------------------------------------------------
    vprint(verbose, 'trie objects')

    fname = 'words_as_tries.txt'
    obj = list()
    for word in self.wordlist():
        obj.append((word,
                    self.broken_words_left_to_right()[word],
                    self.broken_words_right_to_left()[word]))
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Words as tries',
                 headers=['Word', 'Left-to-right trie',
                          'Right-to-left trie'],
                 row_functions=[lambda x: x[0],
                                lambda x: ' '.join(x[1]),
                                lambda x: ' '.join(x[2])],
                 column_widths=[35, 50, 50],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'successors.txt'
    obj = double_sorted(self.successors().items(),
                        key=lambda x: len(x[1]), reverse=False)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Successors',
                 headers=['String', 'Successors'],
                 row_functions=[lambda x: x[0],
                                lambda x: ' '.join(sorted(x[1]))],
                 column_widths=[35, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

    fname = 'predecessors.txt'
    obj = double_sorted(self.predecessors().items(),
                        key=lambda x: len(x[1]), reverse=False)
    f_path = os.path.join(output_dir, fname)
    output_latex(obj, f_path,
                 title='Predecessors',
                 headers=['String', 'Predecessors'],
                 row_functions=[lambda x: x[0],
                                lambda x: ' '.join(sorted(x[1]))],
                 column_widths=[35, 0],
                 lxa_parameters=self.parameters(),
                 test=test, encoding=self.encoding,
                 number_of_word_types=self.number_of_word_types(),
                 number_of_word_tokens=self.number_of_word_tokens(),
                 input_file_path=self.file_abspath)
    vprint(verbose, '\t' + fname)

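# Hypothetical end-to-end usage of output_all_results() above; an
# illustration, not part of the original module. It assumes the
# package-level reader linguistica.read_corpus(); the paths are made up.
def _demo_output_all_results(corpus_path, out_dir):
    import linguistica as lxa
    lxa_object = lxa.read_corpus(corpus_path)
    # writes word_bigrams.txt, stems_to_words.txt, wordlist.txt, etc.
    # into out_dir, printing each file name as it goes
    lxa_object.output_all_results(directory=out_dir, verbose=True)
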
def _initialize(self):
    # number of word types and tokens
    self._number_of_word_types = None
    self._number_of_word_tokens = None

    # word ngrams
    self._word_unigram_counter = None
    self._word_bigram_counter = None
    self._word_trigram_counter = None

    # wordlist
    self._wordlist = None

    if self.wordlist_object is not None:
        # self.wordlist_object is
        # either an iterable or a dict of word-count pairs

        if type(self.wordlist_object) is dict:
            word_count_dict = dict()

            if self.parameters_['keep_case']:
                word_count_dict = self.wordlist_object
            else:
                # .items() is required here; iterating over the dict
                # itself yields only the keys
                for word, count in self.wordlist_object.items():
                    word = word.lower()
                    if word not in word_count_dict:
                        word_count_dict[word] = 0
                    word_count_dict[word] += count

            self._wordlist = [word for word, _ in
                              double_sorted(word_count_dict.items(),
                                            key=lambda x: x[1],
                                            reverse=True)]
            self._word_unigram_counter = word_count_dict

        elif hasattr(self.wordlist_object, '__iter__'):
            if self.parameters_['keep_case']:
                self._wordlist = sorted(set(self.wordlist_object))
            else:
                self._wordlist = sorted(
                    set(w.lower() for w in self.wordlist_object))
            self._word_unigram_counter = {w: 1 for w in self._wordlist}

        else:
            raise TypeError('wordlist object must be a dict of word-count '
                            'pairs or an iterable of words')

    # signature-related objects
    self._stems_to_words = None
    self._signatures_to_stems = None
    self._stems_to_signatures = None
    self._words_to_signatures = None
    self._signatures_to_words = None
    self._words_to_sigtransforms = None
    self._signatures = None
    self._affixes_to_signatures = None
    self._words_in_signatures = None
    self._affixes = None
    self._stems = None

    # corpus file object
    if self.corpus_object is not None:
        # self.corpus_object is either a list of strings or a long str
        if type(self.corpus_object) is list:
            corpus_str = fix_punctuations(' '.join(self.corpus_object))
        elif type(self.corpus_object) is str:
            corpus_str = fix_punctuations(self.corpus_object)
        else:
            raise TypeError('corpus object must be either a str or a list')
        self.corpus_file_object = StringIO(corpus_str)
    elif self.file_abspath and not self.file_is_wordlist:
        self.corpus_file_object = open(self.file_abspath,
                                       encoding=self.encoding)
    else:
        self.corpus_file_object = None

    # wordlist file object
    if self.file_is_wordlist:
        self.wordlist_file_object = open(self.file_abspath,
                                         encoding=self.encoding)
    else:
        self.wordlist_file_object = StringIO()

    # manifold-related objects
    self._words_to_neighbors = None
    self._words_to_contexts = None
    self._contexts_to_words = None
    self._neighbor_graph = None

    # phon objects
    self._phone_unigram_counter = None
    self._phone_bigram_counter = None
    self._phone_trigram_counter = None
    self._phone_dict = None
    self._biphone_dict = None
    self._word_dict = None
    self._words_to_phones = None

    # trie objects
    self._broken_words_left_to_right = None
    self._broken_words_right_to_left = None
    self._successors = None
    self._predecessors = None

def run(unigram_counter=None, bigram_counter=None, trigram_counter=None,
        max_word_types=1000, n_neighbors=9, n_eigenvectors=11,
        min_context_count=3):
    word_freq_pairs = double_sorted(unigram_counter.items(),
                                    key=lambda x: x[1], reverse=True)

    if len(word_freq_pairs) > max_word_types:
        freqlist = [freq for _, freq in word_freq_pairs[:max_word_types]]
        wordlist = [word for word, _ in word_freq_pairs[:max_word_types]]
    else:
        freqlist = [freq for _, freq in word_freq_pairs]
        wordlist = [word for word, _ in word_freq_pairs]

    n_words = len(wordlist)

    # compute the context array,
    # plus the words_to_contexts and contexts_to_words dicts
    context_array, words_to_contexts, contexts_to_words = get_array(
        wordlist, bigram_counter, trigram_counter, min_context_count)

    # compute the shared context master matrix
    shared_context_matrix = context_array.dot(context_array.T).todense()

    # normalize each cell by the frequencies of its word pair
    # (the large constant preserves precision, since the quotient is
    # stored back into an integer matrix)
    for row in range(n_words):
        for col in range(n_words):
            shared_context_matrix[row, col] = (
                10000000000 * shared_context_matrix[row, col]
                / (freqlist[col] * freqlist[row]))

    # compute the sum of contexts shared with other words
    total_contexts = normalize(n_words, shared_context_matrix)

    # adjusted shared contexts, with each [x, x] diagonal entry replaced
    # by that word's sum of shared contexts
    incidence = compute_incidence_graph(n_words, total_contexts,
                                        shared_context_matrix)

    # compute the closest neighbors to each word
    result = find_closest(incidence, n_words, n_neighbors)

    del context_array
    del shared_context_matrix

    # map word indices back to words to get each word's neighbors
    words_to_neighbors = dict()

    for i in range(len(wordlist)):
        line = result[i].astype(np.int64)
        word_idx, neighbors_idx = line[0], line[1:]
        word = wordlist[word_idx]
        neighbors = [wordlist[idx] for idx in neighbors_idx]
        words_to_neighbors[word] = neighbors

    # compute the graph of neighbors
    compute_graph(words_to_neighbors)

    return words_to_neighbors, words_to_contexts, contexts_to_words

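# A vectorized equivalent of the frequency-normalization loop in run()
# above; a sketch for illustration (assuming numpy as np), not part of the
# original module. It performs the same arithmetic without the nested
# Python loops, but in floating point rather than truncated integers.
def _normalize_by_frequency_sketch(shared_context_matrix, freqlist):
    freqs = np.array(freqlist, dtype=np.float64)
    # divide every cell [row, col] by freqlist[row] * freqlist[col]
    return 1e10 * np.asarray(shared_context_matrix) / np.outer(freqs, freqs)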