def _build_lrgraph(self, sents, min_count, pruning_min_count=2):
    lset = {}
    rset = {}
    for n_sent, sent in enumerate(sents):
        for eojeol in sent.split():
            for e in range(1, min(len(eojeol), self.lmax) + 1):
                l = eojeol[:e]
                r = eojeol[e:]
                lset[l] = lset.get(l, 0) + 1
                rset[r] = rset.get(r, 0) + 1
        if n_sent % 1000 == 999:
            args = (n_sent + 1, len(lset), len(rset), get_process_memory())
            sys.stdout.write(
                '\rscanning vocabulary ... %d sents #(l= %d, r= %d), mem= %.3f Gb' % args)
        if n_sent % 500000 == 499999:
            lset = {l: f for l, f in lset.items() if f >= pruning_min_count}
            rset = {r: f for r, f in rset.items() if f >= pruning_min_count}
    lset = {l: f for l, f in lset.items() if f >= min_count}
    rset = {r: f for r, f in rset.items() if f >= min_count}
    n_sents = n_sent + 1

    lrgraph = {}
    for n_sent, sent in enumerate(sents):
        for eojeol in sent.split():
            for e in range(1, min(len(eojeol), self.lmax) + 1):
                l = eojeol[:e]
                r = eojeol[e:]
                if (l not in lset) or (r not in rset):
                    continue
                rdict = lrgraph.get(l, {})
                rdict[r] = rdict.get(r, 0) + 1
                lrgraph[l] = rdict
        if n_sent % 1000 == 999:
            args = (100 * (n_sent + 1) / n_sents, '%', n_sent + 1,
                    n_sents, get_process_memory())
            sys.stdout.write(
                '\rbuilding lrgraph ... (%.3f %s, %d in %d), mem= %.3f Gb' % args)
    args = (len(lset), len(rset),
            sum(len(rdict) for rdict in lrgraph.values()),
            get_process_memory())
    print('\rlrgraph has been built. (#L= %d, #R= %d, #E= %d), mem= %.3f Gb' % args)
    return lrgraph, lset, rset
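# A minimal standalone sketch (not part of the class above) of the L-R
# decomposition that _build_lrgraph counts: every eojeol (space-separated
# token) splits into a left substring `l` and the remaining suffix `r`.
# `lmax` mirrors the `self.lmax` attribute assumed above.
def lr_splits(eojeol, lmax=10):
    """Yield every (l, r) split of an eojeol, with len(l) capped at lmax."""
    for e in range(1, min(len(eojeol), lmax) + 1):
        yield eojeol[:e], eojeol[e:]

# list(lr_splits('아이오아이는')) includes ('아이오아이', '는') and
# ('아이오아이는', ''), the pairs counted into lset/rset above.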
def _build_graph(self, sents, wordset_l, wordset_r):
    self.wordset_l = wordset_l
    self.wordset_r = wordset_r
    self.wordset_r.add('')
    _ckpt = max(1, int(len(sents) / 40))
    lrgraph = defaultdict(lambda: defaultdict(int))
    rlgraph = defaultdict(lambda: defaultdict(int))
    for i, sent in enumerate(sents):
        for token in sent.split():
            if not token:
                continue
            token_len = len(token)
            # inner index is `e`, so the sentence index `i` used for
            # progress reporting below is not shadowed
            for e in range(1, min(self.left_max_length, token_len) + 1):
                l = token[:e]
                r = token[e:]
                if (l not in wordset_l) or (r not in wordset_r):
                    continue
                lrgraph[l][r] += 1
                rlgraph[r][l] += 1
        if self.verbose and (i % _ckpt == 0):
            args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                    100.0 * i / len(sents), '%', get_process_memory())
            sys.stdout.write('\rbuilding lr-graph: %s%s (%.3f %s), memory = %.3f Gb' % args)
    if self.verbose:
        sys.stdout.write('\rbuilding lr-graph completed. memory = %.3f Gb'
                         % get_process_memory())
    lrgraph = {l: dict(rdict) for l, rdict in lrgraph.items()}
    rlgraph = {r: dict(ldict) for r, ldict in rlgraph.items()}
    return lrgraph, rlgraph
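# Illustrative shape of the returned graphs (a sketch, not real output):
# lrgraph maps each left substring to its right-side counts, e.g.
#   {'아이오아이': {'는': 3, '의': 2, '': 1}, ...}
# rlgraph is the transposed view, keyed by the right substring:
#   {'는': {'아이오아이': 3, ...}, ...}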
def _check_covered_eojeols(self, nouns):
    self.lrgraph.reset_lrgraph()
    noun_candidates = self._noun_candidates_from_positive_features()
    n = len(noun_candidates)

    for i, (word, _) in enumerate(noun_candidates):
        if self.verbose and i % 1000 == 999:
            percentage = '%.3f' % (100 * (i + 1) / n)
            print('\r[Noun Extractor] flushing ... {} %'.format(percentage),
                  flush=True, end='')
        if word not in nouns:
            continue
        for r, count in self.lrgraph.get_r(word, -1):
            if (r == '' or (r in self._pos_features)
                    or (r in self._common_features)):
                self.lrgraph.remove_eojeol(word + r, count)
                self._num_of_covered_eojeols += count

    if self.verbose:
        print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format(
            '%.3f' % get_process_memory(), ' ' * 20), flush=True)
        coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                             / self._num_of_eojeols)
        print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
              flush=True)
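# Note: `lrgraph.get_r(word, -1)` above is assumed to yield (r, count) pairs
# for every right-side substring observed after `word`; the -1 appears to
# request all pairs rather than a top-k cut. This reading is inferred from
# the usage here, not from the LRGraph implementation itself.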
def train(self, sents, num_for_pruning=0, cumulate=True):
    def prune_extreme_case():
        self.L = defaultdict(int, {w: f for w, f in self.L.items() if f >= self.min_count})
        self.R = defaultdict(int, {w: f for w, f in self.R.items() if f >= self.min_count})

    def prune_extreme_case_a():
        self._aL = defaultdict(int, {w: f for w, f in self._aL.items() if f > 1})
        self._aR = defaultdict(int, {w: f for w, f in self._aR.items() if f > 1})

    if cumulate:
        self.L = defaultdict(int, self.L)
        self.R = defaultdict(int, self.R)
        self._aL = defaultdict(int, self._aL)
        self._aR = defaultdict(int, self._aR)
    else:
        self.L = defaultdict(int)
        self.R = defaultdict(int)
        self._aL = defaultdict(int)
        self._aR = defaultdict(int)

    for num_sent, sent in enumerate(sents):
        if sys.version_info.major == 2:
            # materialize as a list; the neighbor pairing below slices it
            words = [unicode(w) for w in sent.strip().split()]
        else:
            words = sent.split()
        for word in words:
            if (not word) or (len(word) <= 1):
                continue
            word_len = len(word)
            for i in range(1, min(self.left_max_length + 1, word_len) + 1):
                self.L[word[:i]] += 1
            for i in range(1, min(self.right_max_length + 1, word_len)):
                self.R[word[-i:]] += 1
        if len(words) <= 1:
            continue
        # circular zip: pair every word with its left and right neighbors
        for left_word, word, right_word in zip(
                [words[-1]] + words[:-1], words, words[1:] + [words[0]]):
            self._aL['%s %s' % (word, right_word[0])] += 1
            self._aR['%s %s' % (left_word[-1], word)] += 1
            word_len = len(word)
            for i in range(1, min(self.right_max_length + 1, word_len)):
                self._aL['%s %s' % (word[-i:], right_word[0])] += 1
            for i in range(1, min(self.left_max_length + 1, word_len)):
                self._aR['%s %s' % (left_word[-1], word[:i])] += 1
        if (num_for_pruning > 0) and (num_sent % num_for_pruning == 0):
            prune_extreme_case()
        if (self.verbose > 0) and (num_sent % self.verbose == 0):
            sys.stdout.write('\rtraining ... (%d in %d sents) use memory %.3f Gb'
                             % (num_sent, len(sents), get_process_memory()))
    prune_extreme_case()
    prune_extreme_case_a()

    if self.verbose > 0:
        print('\rtraining was done. used memory %.3f Gb' % get_process_memory())

    self.L = dict(self.L)
    self.R = dict(self.R)
    self._aL = dict(self._aL)
    self._aR = dict(self._aR)
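# A minimal standalone sketch of the circular zip used in train() above:
# each word is paired with its left and right neighbors, and the first
# word's left neighbor wraps around to the last word (and vice versa),
# so every word yields exactly one (left, word, right) triple.
words = ['아이오아이', '가', '정말', '좋다']
triples = list(zip([words[-1]] + words[:-1], words, words[1:] + [words[0]]))
# triples[0] == ('좋다', '아이오아이', '가')
# triples[1] == ('아이오아이', '가', '정말')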
def _scan_vocabulary(self, sents):
    """
    Parameters
    ----------
    sents: list-like iterable object which has string

    It computes subtoken frequencies first, then builds an lr-graph
    with the sub-tokens that appeared at least min count times.
    """
    _ckpt = max(1, int(len(sents) / 40))
    wordset_l = defaultdict(int)
    wordset_r = defaultdict(int)
    for i, sent in enumerate(sents):
        for token in sent.split(' '):
            if not token:
                continue
            token_len = len(token)
            # inner index is `e`, so the sentence index `i` is not shadowed
            for e in range(1, min(self.max_left_length, token_len) + 1):
                wordset_l[token[:e]] += 1
            for e in range(1, min(self.max_right_length, token_len)):
                wordset_r[token[-e:]] += 1
        if self.verbose and (i % _ckpt == 0):
            args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                    100.0 * i / len(sents), '%', get_process_memory())
            sys.stdout.write('\rscanning: %s%s (%.3f %s) %.3f Gb' % args)
    wordset_l = {w for w, f in wordset_l.items() if f >= self.min_frequency}
    wordset_r = {w for w, f in wordset_r.items() if f >= self.min_frequency}
    if self.verbose:
        print('\rscanning completed')
        print('(L,R) has (%d, %d) tokens. memory = %.3f Gb'
              % (len(wordset_l), len(wordset_r), get_process_memory()))
    return wordset_l, wordset_r
def _train_with_lrgraph(self, lrgraph, num_of_eojeols=-1):
    self.lrgraph = lrgraph
    self._num_of_covered_eojeols = 0

    if num_of_eojeols == -1:
        num_of_eojeols = lrgraph.to_EojeolCounter()._count_sum
    self._num_of_eojeols = num_of_eojeols

    if self.verbose:
        print('[Noun Extractor] has been trained. #eojeols={}, mem={} Gb'.format(
            num_of_eojeols, '%.3f' % get_process_memory()))
def extract(self, minimum_noun_score=0.3, min_count=1, reset_lrgraph=True):
    # reset covered eojeol count
    self._num_of_covered_eojeols = 0

    # base prediction
    noun_candidates = self._noun_candidates_from_positive_features()
    prediction_scores = self._batch_prediction_order_by_word_length(
        noun_candidates, minimum_noun_score)

    # E = N*J+ or N*Posi+
    if self.extract_compound:
        candidates = {l: sum(rdict.values())
                      for l, rdict in self.lrgraph._lr.items() if len(l) >= 4}
        compounds = self.extract_compounds(
            candidates, prediction_scores, minimum_noun_score)
    else:
        compounds = {}

    # combine single nouns and compounds
    nouns = {noun: score for noun, score in prediction_scores.items()
             if score[0] >= minimum_noun_score}
    nouns.update(compounds)

    # frequency filtering
    nouns = {noun: score for noun, score in nouns.items()
             if score[1] >= min_count}

    nouns = self._post_processing(nouns, prediction_scores, compounds)

    if self.verbose:
        print('[Noun Extractor] {} nouns ({} compounds) with min count={}'.format(
            len(nouns), len(compounds), min_count), flush=True)
        coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                             / self._num_of_eojeols)
        print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
              flush=True)

    if self.verbose:
        print('[Noun Extractor] flushing ... ', flush=True, end='')

    self._nouns = nouns
    if reset_lrgraph:
        # when extracting predicates, do not reset lrgraph;
        # the remaining lrgraph is a predicate (root - ending) graph
        self.lrgraph.reset_lrgraph()

    if self.verbose:
        print('done. mem={} Gb'.format('%.3f' % get_process_memory()))

    nouns_ = {noun: NounScore(score[1], score[0])
              for noun, score in nouns.items()}
    return nouns_
def pmi(x, min_pmi=0, alpha=0.0001, verbose=False):
    # convert x to a probability matrix & marginal probabilities
    px = (x.sum(axis=1) / x.sum()).reshape(-1)
    py = (x.sum(axis=0) / x.sum()).reshape(-1)
    pxy = x / x.sum()

    # transform px and py to diagonal matrices
    # using scipy.sparse.diags
    px_diag = diags(px.tolist()[0])
    py_diag = diags(py.tolist()[0])

    # pmi_alpha(x, y) = p(x, y) / (p(x) * (p(y) + alpha))
    px_diag.data[0] = np.asarray([0 if v == 0 else 1 / v for v in px_diag.data[0]])
    py_diag.data[0] = np.asarray([0 if v == 0 else 1 / (v + alpha) for v in py_diag.data[0]])
    exp_pmi = px_diag.dot(pxy).dot(py_diag)

    # PPMI using threshold
    min_exp_pmi = 1 if min_pmi == 0 else np.exp(min_pmi)

    # exp_pmi is a sparse matrix, so exp_pmi.data is a numpy.ndarray
    indices = np.where(exp_pmi.data > min_exp_pmi)[0]
    pmi_dok = dok_matrix(exp_pmi.shape)

    # prepare data (rows, cols, data)
    rows, cols = exp_pmi.nonzero()
    data = exp_pmi.data

    for _n_idx, idx in enumerate(indices):
        # print current status
        if verbose and _n_idx % 10000 == 0:
            print('\rcomputing pmi {:.3} % mem={} Gb    '.format(
                100 * _n_idx / indices.shape[0],
                '%.3f' % get_process_memory()), flush=True, end='')
        # apply logarithm
        pmi_dok[rows[idx], cols[idx]] = np.log(data[idx])

    if verbose:
        print('\rcomputing pmi was done{}'.format(' ' * 30), flush=True)

    return pmi_dok
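# A minimal usage sketch for pmi() above. The toy co-occurrence counts are
# illustrative only; numpy and scipy may already be imported elsewhere in
# this module (pmi() itself relies on np, diags, and dok_matrix).
import numpy as np
from scipy.sparse import csr_matrix

toy_counts = csr_matrix(np.array([
    [10, 0, 2],
    [0, 8, 1],
    [2, 1, 6],
]))
toy_pmi = pmi(toy_counts, min_pmi=0, alpha=0.0001)
# toy_pmi is a dok_matrix that keeps only the cells with pmi > min_pmi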
def _train_with_eojeol_counter(self, eojeol_counter, min_eojeol_frequency=2):
    eojeol_counter._counter = {
        eojeol: count for eojeol, count in eojeol_counter._counter.items()
        if count >= min_eojeol_frequency}
    eojeol_counter._set_count_sum()

    self._num_of_eojeols = len(eojeol_counter)
    self._num_of_covered_eojeols = 0
    self._count_of_eojeols = eojeol_counter._count_sum
    self._count_of_covered_eojeols = 0
    self.eojeol_counter = eojeol_counter

    if self.verbose:
        mem = '%.3f' % get_process_memory()
        message = '#eojeols={}, mem={} Gb'.format(self._num_of_eojeols, mem)
        self._print(message, replace=True, newline=True)
def train(self, sentences, min_eojeol_count=1):
    if self.verbose:
        print('[Noun Extractor] counting eojeols')

    eojeol_counter = EojeolCounter(sentences, min_eojeol_count,
        max_length=self.l_max_length + self.r_max_length,
        filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
        verbose=self.verbose)
    self._num_of_eojeols = eojeol_counter._count_sum
    self._num_of_covered_eojeols = 0

    if self.verbose:
        print('[Noun Extractor] complete eojeol counter -> lr graph')

    self.lrgraph = eojeol_counter.to_lrgraph(
        self.l_max_length, self.r_max_length)

    if self.verbose:
        print('[Noun Extractor] has been trained. mem={} Gb'.format(
            '%.3f' % get_process_memory()))
def _check_covered_eojeols(self, nouns):
    self.lrgraph.reset_lrgraph()
    noun_candidates = self._noun_candidates_from_positive_features()
    n = len(noun_candidates)

    # flush longer words first so compound eojeols are consumed
    # before their shorter sub-nouns
    for i, (word, _) in enumerate(
            sorted(noun_candidates, key=lambda x: -len(x[0]))):
        if self.verbose and i % 1000 == 999:
            percentage = '%.3f' % (100 * (i + 1) / n)
            print('\r[Noun Extractor] flushing ... {} %'.format(percentage),
                  flush=True, end='')

        if word not in nouns:
            continue

        if len(word) > 1:
            for r, count in self.lrgraph.get_r(word, -1):
                # remove all eojeols that include the word at the left side;
                # we have to assume that pos, neg features are incomplete
                self.lrgraph.remove_eojeol(word + r, count)
                self._num_of_covered_eojeols += count
        else:
            # a single-syllable noun is an exception;
            # remove only the N + positive-feature eojeols
            for r, count in self.lrgraph.get_r(word, -1):
                if (r == '' or (r in self._pos_features)
                        or (r in self._common_features)):
                    self.lrgraph.remove_eojeol(word + r, count)
                    self._num_of_covered_eojeols += count

    if self.verbose:
        print('\r[Noun Extractor] flushing was done. mem={} Gb{}'.format(
            '%.3f' % get_process_memory(), ' ' * 20), flush=True)
        coverage = '%.2f' % (100 * self._num_of_covered_eojeols
                             / self._num_of_eojeols)
        print('[Noun Extractor] {} % eojeols are covered'.format(coverage),
              flush=True)
def train(self, sentences):
    if self.verbose:
        print('[Noun Extractor] counting eojeols')

    eojeol_counter = EojeolCounter(sentences, self.min_eojeol_count,
        max_length=self.l_max_length + self.r_max_length,
        filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
        verbose=self.verbose)
    self._num_of_eojeols = eojeol_counter._count_sum
    self._num_of_covered_eojeols = 0

    if self.verbose:
        print('[Noun Extractor] complete eojeol counter -> lr graph')

    self.lrgraph = eojeol_counter.to_lrgraph(
        self.l_max_length, self.r_max_length)

    if self.verbose:
        print('[Noun Extractor] has been trained. mem={} Gb'.format(
            '%.3f' % get_process_memory()))
def _scan_vocabulary(self, sents):
    """
    Parameters
    ----------
    sents: list-like iterable object which has string

    It computes subtoken frequencies first, then builds an lr-graph
    with the sub-tokens that appeared at least min count times.
    """
    _ckpt = max(1, int(len(sents) / 40))
    wordset_l = defaultdict(int)
    wordset_r = defaultdict(int)
    for i, sent in enumerate(sents):
        for token in sent.split(' '):
            if not token:
                continue
            token_len = len(token)
            # inner index is `e`, so the sentence index `i` is not shadowed
            for e in range(1, min(self.left_max_length, token_len) + 1):
                wordset_l[token[:e]] += 1
            for e in range(1, min(self.right_max_length, token_len)):
                wordset_r[token[-e:]] += 1
        if self.verbose and (i % _ckpt == 0):
            args = ('#' * int(i / _ckpt), '-' * (40 - int(i / _ckpt)),
                    100.0 * i / len(sents), '%', get_process_memory())
            sys.stdout.write('\rscanning: %s%s (%.3f %s) %.3f Gb' % args)
    wordset_l = {w for w, f in wordset_l.items() if f >= self.min_count}
    wordset_r = {w for w, f in wordset_r.items() if f >= self.min_count}
    if self.verbose:
        print('\rscanning completed')
        print('(L,R) has (%d, %d) tokens. memory = %.3f Gb'
              % (len(wordset_l), len(wordset_r), get_process_memory()))
    return wordset_l, wordset_r
def sent_to_word_context_matrix(sents, windows=3, min_tf=10,
                                tokenizer=lambda x: x.split(), verbose=True):
    # count word frequency first
    word_counter = defaultdict(int)
    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rcounting word frequency from {} sents, mem={} Gb'.format(
                i_sent, '%.3f' % get_process_memory()), flush=True, end='')
        words = tokenizer(sent)
        for word in words:
            word_counter[word] += 1
    if verbose:
        print('\rcounting word frequency from {} sents was done. mem={} Gb'.format(
            i_sent + 1, '%.3f' % get_process_memory()), flush=True, end='')

    # filtering with min_tf
    vocabulary = {word for word, count in word_counter.items() if count >= min_tf}
    vocabulary = {word: idx for idx, word in enumerate(
        sorted(vocabulary, key=lambda w: -word_counter[w]))}
    idx2vocab = [word for word, _ in sorted(vocabulary.items(), key=lambda w: w[1])]
    del word_counter

    # scanning (word, context) pairs
    base2contexts = defaultdict(lambda: defaultdict(int))
    for i_sent, sent in enumerate(sents):
        if verbose and i_sent % 1000 == 0:
            print('\rscanning (word, context) pairs from {} sents, mem={} Gb'.format(
                i_sent, '%.3f' % get_process_memory()), flush=True, end='')
        words = tokenizer(sent)
        if not words:
            continue
        n = len(words)
        for i, base in enumerate(words):
            if base not in vocabulary:
                continue
            # left contexts
            for context in words[max(0, i - windows):i]:
                if context not in vocabulary:
                    continue
                base2contexts[base][context] += 1
            # right contexts
            for context in words[min(i + 1, n):min(i + windows, n) + 1]:
                if context not in vocabulary:
                    continue
                base2contexts[base][context] += 1
    if verbose:
        print('\rscanning (word, context) pairs from {} sents was done. mem={} Gb'.format(
            i_sent + 1, '%.3f' % get_process_memory()), flush=True, end='')

    # encoding dicts to a sparse matrix
    rows = []
    cols = []
    data = []
    for base, contexts in base2contexts.items():
        base_idx = vocabulary[base]
        for context, cooccurrence in contexts.items():
            context_idx = vocabulary[context]
            rows.append(base_idx)
            cols.append(context_idx)
            data.append(cooccurrence)
    x = csr_matrix((data, (rows, cols)))

    if verbose:
        print('\r(word, context) matrix was constructed. shape = {}{}'.format(
            x.shape, ' ' * 20))
    return x, idx2vocab
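# A minimal end-to-end sketch: build a (word, context) co-occurrence matrix
# from toy sentences, then weight it with the pmi() function defined above.
# The sentences are illustrative only.
toy_sents = [
    '아이오아이 는 너무 너무 너무 를 불렀다',
    '너무 너무 너무 는 아이오아이 의 노래 이다',
]
x, idx2vocab = sent_to_word_context_matrix(
    toy_sents, windows=2, min_tf=1, verbose=False)
x_pmi = pmi(x, min_pmi=0)  # PPMI-weighted (word, context) matrix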
def _print_status(message, i_sent, new_line=False):
    print('\r{} from {} sents, mem={} Gb'.format(
        message, i_sent, '%.3f' % get_process_memory()),
        flush=True, end='\n' if new_line else '')
def pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0.0, beta=1.0, verbose=False):
    """
    :param X: scipy.sparse.csr_matrix
        (word, contexts) sparse matrix
    :param py: numpy.ndarray
        (1, word) shape, probability of context words
    :param min_pmi: float
        Minimum value of pmi. All values smaller than min_pmi
        are reset to zero. Default is zero.
    :param alpha: float
        Smoothing factor. pmi(x, y; alpha) = p_xy / (p_x * (p_y + alpha)).
        Default is 0.0
    :param beta: float
        Smoothing factor. pmi(x, y) = log( p_xy / (p_x * p_y^beta) ).
        Default is 1.0
    :param verbose: Boolean
        If True, verbose mode on

    Returns
    -------
    pmi : scipy.sparse.dok_matrix
        (word, contexts) pmi value sparse matrix
    px : numpy.ndarray
        Probability of rows (items)
    py : numpy.ndarray
        Probability of columns (features)

    Usage
    -----
        >>> pmi, px, py = pmi_memory_friendly(X, py=None, min_pmi=0, alpha=0, beta=1.0)
    """
    assert 0 < beta <= 1

    # convert X to a probability matrix & marginal probabilities
    px = (X.sum(axis=1) / X.sum()).reshape(-1)
    if py is None:
        py = (X.sum(axis=0) / X.sum()).reshape(-1)
    pxy = X / X.sum()
    assert py.shape[0] == pxy.shape[1]

    if beta < 1:
        py = py ** beta
        py /= py.sum()

    # transform px and py to diagonal matrices
    # using scipy.sparse.diags
    px_diag = diags(px.tolist()[0])
    py_diag = diags(py.tolist()[0])

    # pmi_alpha(x, y) = p(x, y) / (p(x) * (p(y) + alpha))
    px_diag.data[0] = np.asarray([0 if v == 0 else 1 / v for v in px_diag.data[0]])
    py_diag.data[0] = np.asarray([0 if v == 0 else 1 / (v + alpha) for v in py_diag.data[0]])
    exp_pmi = px_diag.dot(pxy).dot(py_diag)

    # PPMI using threshold
    min_exp_pmi = 1 if min_pmi == 0 else np.exp(min_pmi)

    # exp_pmi is a sparse matrix, so exp_pmi.data is a numpy.ndarray
    indices = np.where(exp_pmi.data > min_exp_pmi)[0]
    pmi_dok = dok_matrix(exp_pmi.shape)

    # prepare data (rows, cols, data)
    rows, cols = exp_pmi.nonzero()
    data = exp_pmi.data

    for _n_idx, idx in enumerate(indices):
        # print current status
        if verbose and _n_idx % 10000 == 0:
            print('\rcomputing pmi {:.3} % mem={} Gb    '.format(
                100 * _n_idx / indices.shape[0],
                '%.3f' % get_process_memory()), flush=True, end='')
        # apply logarithm
        pmi_dok[rows[idx], cols[idx]] = np.log(data[idx])

    if verbose:
        print('\rcomputing pmi was done{}'.format(' ' * 30), flush=True)

    return pmi_dok, px, py
def _train_with_sentences(self, sentences, min_eojeol_count=2,
                          filtering_checkpoint=100000):
    check = filtering_checkpoint > 0

    if self.verbose:
        message = 'counting eojeols'
        self._print(message, replace=False, newline=False)

    # eojeol counting
    counter = {}

    def contains_noun(eojeol, n):
        for e in range(2, n + 1):
            if eojeol[:e] in self.nouns:
                return True
        return False

    for i_sent, sent in enumerate(sentences):
        if check and i_sent > 0 and i_sent % filtering_checkpoint == 0:
            counter = {eojeol: count for eojeol, count in counter.items()
                       if count >= min_eojeol_count}
        if self.verbose and i_sent % 100000 == 99999:
            message = 'n eojeol = {} from {} sents. mem={} Gb{}'.format(
                len(counter), i_sent + 1, '%.3f' % get_process_memory(), ' ' * 20)
            self._print(message, replace=True, newline=False)
        for eojeol in sent.split():
            n = len(eojeol)
            if n <= 1 or contains_noun(eojeol, n):
                continue
            counter[eojeol] = counter.get(eojeol, 0) + 1

    if self.verbose:
        message = 'counting eojeols was done. {} eojeols, mem={} Gb{}'.format(
            len(counter), '%.3f' % get_process_memory(), ' ' * 20)
        self._print(message, replace=True, newline=True)

    counter = {eojeol: count for eojeol, count in counter.items()
               if count >= min_eojeol_count}

    self._num_of_eojeols = sum(counter.values())
    self._num_of_covered_eojeols = 0

    if self.verbose:
        message = 'complete eojeol counter -> lr graph'
        self._print(message, replace=False, newline=True)

    self.lrgraph = EojeolCounter()._to_lrgraph(
        counter, l_max_length=10, r_max_length=9)

    if self.verbose:
        message = 'has been trained. mem={} Gb'.format(
            '%.3f' % get_process_memory())
        self._print(message, replace=False, newline=True)