def _compute_indices(self):  # type: () -> None
    # Build one case-insensitive concordance index per document.
    if self.corpus is None:
        self.indices = None
        return
    self.indices = [
        ConcordanceIndex(doc, key=lambda x: x.lower())
        for doc in self.tokens
    ]
def _compute_indices(self):  # type: () -> None
    if self.corpus is None:
        self.indices = None
        return
    # Tokenize on demand if the corpus has not been tokenized yet.
    if self.corpus and not self.corpus.has_tokens():
        preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
        preprocessor(self.corpus)
    self.indices = [
        ConcordanceIndex(doc, key=lambda x: x.lower())
        for doc in self.corpus.tokens
    ]
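# --- Usage sketch (not from the original sources) ---
# A minimal standalone demo of what the per-document indices built by the two
# variants above provide: the `key` callable is applied both when indexing and
# when looking up, so offsets() is case-insensitive. The toy documents are
# made up for illustration.
from nltk.text import ConcordanceIndex

docs = [["The", "cat", "sat"], ["A", "cat", "and", "a", "Cat"]]
indices = [ConcordanceIndex(doc, key=lambda x: x.lower()) for doc in docs]
for i, index in enumerate(indices):
    print(i, index.offsets("CAT"))  # key() normalizes the query too
# 0 [1]
# 1 [1, 4]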
from nltk.text import ConcordanceIndex, Text


def concordance_2_txt(nome_p, tokens, left_margin=2, right_margin=4):
    """Return the concordance lines for ``nome_p`` as plain strings."""
    text = Text(tokens)
    c = ConcordanceIndex(text.tokens)
    # Keep left_margin/right_margin tokens around each occurrence,
    # clamping the left edge at the start of the text. (The original
    # hardcoded an offset of 5 here despite guarding on left_margin.)
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(nome_p)
    ]
    return [' '.join(con_sub) for con_sub in concordance_txt]
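# --- Usage sketch (not from the original source) ---
# Toy call against the function above; the token list and query word are
# invented for illustration.
tokens = "o gato subiu no telhado e o gato dormiu".split()
for line in concordance_2_txt("gato", tokens):
    print(line)
# o gato subiu no telhado
# e o gato dormiu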
def __init__(self, text):
    self.corpus = text.lower()
    self.pos_tags = pos_tag(text, True)
    self.word_count = len(self.pos_tags)
    self.c_values = []   # form: [(c-value, ngram)]
    self.nc_values = []  # form: [(ngram, nc-value)]
    self.candidate_cache = []
    self.context_words = defaultdict(lambda: [0, 0])
    # concordance over (token, pos-tag) pairs
    self.conc_index = ConcordanceIndex(self.pos_tags)
    # maps from ("token", "pos-tag") to
    # (freq. as context word, no. of ngrams it appears with):
    self.weights = defaultdict(int)
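# --- Usage sketch (not from the original source) ---
# Because the index above is built over (token, tag) pairs, offsets() must be
# queried with the same pair form. This demo uses NLTK's own pos_tag and
# word_tokenize; the snippet's pos_tag(text, True) appears to be a
# project-specific helper.
from nltk import pos_tag, word_tokenize
from nltk.text import ConcordanceIndex

tagged = pos_tag(word_tokenize("The cat sat on the mat ."))
index = ConcordanceIndex(tagged)
print(index.offsets(("cat", "NN")))  # e.g. [1]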
def get_cache(self, language):
    if language not in self._byLanguage:
        self._byLanguage[language] = dict()
        self._byLanguage[language]['texts'] = dict()
        self._byLanguage[language]['indices'] = dict()
        with self.get_lock():
            if exists(f'cache/{language}.ready'):
                self._load_cache(language)
            else:
                corpus_names = self.app_config['phraseExamples'][language]
                for corpus_name in corpus_names:
                    text = self._byLanguage[language]['texts'][corpus_name] = \
                        nltk.Text(getattr(nltk.corpus, corpus_name).words())
                    self._byLanguage[language]['indices'][corpus_name] = \
                        ConcordanceIndex(text.tokens, key=self.key_func)
                self._save_cache(language)
    texts, indices = (self._byLanguage[language]['texts'],
                      self._byLanguage[language]['indices'])
    return texts, indices
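# --- Usage sketch (not from the original source) ---
# Hypothetical consumer of get_cache(): `examples` stands in for an instance
# of the class above, "english" for a key under app_config['phraseExamples'],
# and "whale" is an arbitrary query word.
texts, indices = examples.get_cache("english")
for corpus_name, index in indices.items():
    print(corpus_name, len(index.offsets("whale")))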
def train_fir_order(corpus, ambigous_words):
    # relies on module-level helpers (cleanse_corpus, sized_context,
    # word_vector_from_context, svd_reduced_eigenvectors) and config
    # values (dim_num, cluster_num, window_radius, svd_dim_num)
    logging.info("Start train first order co-occurrence")
    stemmer = PorterStemmer()

    # containers
    sense_vectors = {}  # maps ambiguous words to ndarray of sense vectors
    estimators = {}

    # remove stop words and signs
    logging.info(" Start stemming and cleansing corpus")
    filtered = cleanse_corpus(corpus)
    logging.info(" {} different words after cleansing".format(
        len(set(filtered))))

    # find dimensions
    logging.info(" Start finding dimensions")
    # FreqDist.keys() is no longer frequency-sorted and sliceable in
    # NLTK 3, so take the dim_num most frequent words explicitly.
    dimensions = [w for w, _ in FreqDist(filtered).most_common(dim_num)]

    offset_index = ConcordanceIndex(filtered, key=lambda s: s.lower())

    for word in ambigous_words:
        logging.info(" Start train: {}".format(word))
        # recent sklearn requires keyword arguments here
        estimator = KMeans(n_clusters=cluster_num, init="k-means++",
                           n_init=20)

        # create context vectors for ambiguous words
        logging.info(" Start creating sense vectors")
        vectors = []
        offsets = offset_index.offsets(stemmer.stem(word))
        for offset in offsets:
            context = sized_context(offset, window_radius, filtered)
            vectors.append(word_vector_from_context(context, dimensions))

        # perform svd and dimension reduction
        logging.info(" Start svd reduction")
        context_matrix = vstack(vectors)
        svd_matrix = svd_reduced_eigenvectors(context_matrix, svd_dim_num)

        # create sense vectors for ambiguous context vectors
        logging.info(" Start clustering")
        # +++++++++ SVD switch here +++++++++++
        # estimator.fit(context_matrix)
        estimator.fit(svd_matrix)
        labels = estimator.labels_
        estimators[word] = estimator

        # labels tell which context belongs to which cluster in svd
        # space; compute centroids in word space according to that
        logging.info(" Start centroid computation")
        centroids = []
        for i in range(cluster_num):
            cluster_i = [vector for vector, label in
                         zip(vectors, labels) if label == i]
            try:
                centroids.append(npsum(vstack(cluster_i), 0))
            except ValueError:
                logging.warning("CRITICAL: Empty sense vector")
                centroids.append(zeros(dim_num))
        sense_vectors[word] = centroids

        # draw_word_senses(svd_centroids, svd_matrix, labels)
        # draw_word_senses(vstack(centroids), context_matrix, labels)

    logging.info(" sense vectors:{}".format(len(sense_vectors['line'])))
    logging.info("end train")
    return sense_vectors, dimensions, estimators
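# --- Helper sketch (not from the original source) ---
# train_fir_order() assumes a sized_context() helper that was not included in
# the snippet; one plausible implementation is a flat token window around the
# given offset, clamped at the ends of the token list.
def sized_context(offset, window_radius, tokens):
    """Return up to window_radius tokens on each side of tokens[offset]."""
    start = max(offset - window_radius, 0)
    return tokens[start:offset] + tokens[offset + 1:offset + window_radius + 1]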