from nltk.text import ConcordanceIndex, Text


def concordance_2_txt(nome_p, tokens, left_margin=2, right_margin=4):
    text = Text(tokens)
    c = ConcordanceIndex(text.tokens)
    # collect a window of tokens around every occurrence of nome_p
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(nome_p)
    ]
    return [' '.join(con_sub) for con_sub in concordance_txt]
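A minimal usage sketch for the helper above; the sample sentence and the target word "fox" are invented, and the tokens are produced with a plain split() so the example has no tokenizer dependency:

tokens = ("the quick brown fox jumps over the lazy dog . "
          "the fox was not amused .").split()
# one line of context per occurrence of "fox"
for line in concordance_2_txt("fox", tokens):
    print(line)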
def _compute_indices(self):
    # type: () -> None
    if self.corpus is None:
        self.indices = None
        return
    # one case-insensitive concordance index per tokenised document
    self.indices = [
        ConcordanceIndex(doc, key=lambda x: x.lower())
        for doc in self.tokens
    ]
def _compute_indices(self):
    # type: () -> None
    if self.corpus is None:
        self.indices = None
        return
    # tokenise lazily if the corpus has not been tokenised yet
    if self.corpus and not self.corpus.has_tokens():
        preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
        preprocessor(self.corpus)
    self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
                    for doc in self.corpus.tokens]
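Both _compute_indices variants build one ConcordanceIndex per tokenised document with a lower-casing key, which makes offsets() lookups case-insensitive. A small self-contained sketch of that pattern (the two sample documents are invented):

from nltk.text import ConcordanceIndex
from nltk.tokenize import WordPunctTokenizer

docs = ["The Cat sat on the mat.", "A CAT and a dog."]
tokenized = [WordPunctTokenizer().tokenize(d) for d in docs]
indices = [ConcordanceIndex(doc, key=lambda x: x.lower()) for doc in tokenized]
# query with the same key form (lower case) that the index was built with
print([idx.offsets("cat") for idx in indices])   # e.g. [[1], [1]]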
def get_cache(self, language):
    if language not in self._byLanguage:
        self._byLanguage[language] = dict()
        self._byLanguage[language]['texts'] = dict()
        self._byLanguage[language]['indices'] = dict()
        with self.get_lock():
            if exists(f'cache/{language}.ready'):
                self._load_cache(language)
            else:
                corpus_names = self.app_config['phraseExamples'][language]
                for corpus_name in corpus_names:
                    corpus = getattr(nltk.corpus, corpus_name)
                    text = self._byLanguage[language]['texts'][corpus_name] = \
                        nltk.Text(corpus.words())
                    self._byLanguage[language]['indices'][corpus_name] = \
                        ConcordanceIndex(text.tokens, key=self.key_func)
                self._save_cache(language)
    texts, indices = (self._byLanguage[language]['texts'],
                      self._byLanguage[language]['indices'])
    return texts, indices
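Stripped of the caching and locking, the inner loop above just builds an nltk.Text and a ConcordanceIndex for each corpus name. A minimal sketch of that step, assuming the NLTK gutenberg corpus data is available locally and using a lower-casing key in place of the project-specific self.key_func:

import nltk
from nltk.text import ConcordanceIndex

corpus = getattr(nltk.corpus, "gutenberg")     # same dynamic lookup as above
text = nltk.Text(corpus.words())
index = ConcordanceIndex(text.tokens, key=lambda w: w.lower())
print(index.offsets("whale")[:5])              # first few positions of "whale"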
import operator
from collections import defaultdict
from functools import reduce
from math import ceil, log

from nltk.text import ConcordanceIndex
from nltk.util import ngrams

# TERM_PERCENTAGE, CONTEXT_TYPES, MULTI_TERM_PARSER, Term and pos_tag
# are defined elsewhere in the module this class was taken from.


class C_NC_TermExtractor(object):

    def __init__(self, text):
        self.corpus = text.lower()
        self.pos_tags = pos_tag(text, True)
        self.word_count = len(self.pos_tags)
        self.c_values = []   # form: [(ngram, c-value)]
        self.nc_values = []  # form: [(ngram, nc-value)]
        self.candidate_cache = []
        self.context_words = defaultdict(lambda: [0, 0])
        self.conc_index = ConcordanceIndex(self.pos_tags)
        # maps from ("token", "pos-tag") to
        # (freq. as context word, no. of ngrams it appears with):
        self.weights = defaultdict(int)

    def compute_cnc(self):
        # linguistic filter:
        # find all n-grams of the form (Noun|Adjective)*(Noun)
        # (according to Frantzi_97)
        candidates = self.find_multi_word_terms()
        self.candidate_cache = [self.text_from_tagged_ngram(candidate)
                                for candidate in candidates]

        # compute c_value for each candidate
        max_len = max([len(ngram) for ngram in candidates])
        for ngram in candidates:
            self._compute_c_value(ngram, max_len)
        self.c_values.sort(key=lambda x: x[1], reverse=True)

        # compute weight
        # get all ngrams with maximal c_value
        max_ngrams, max_value = self.c_values[0]
        max_ngrams = [max_ngrams]
        for i in range(1, len(self.c_values)):
            if self.c_values[i][1] < max_value:
                break
            max_ngrams.append(self.c_values[i][0])

        # compute context of max_ngrams
        for ngram in max_ngrams:
            context = self.extract_context(ngram)
            for word, count in context.items():
                # increment frequency as context word
                self.context_words[word][0] += count
            # increment number of ngrams the context appeared in
            for token in context.keys():
                self.context_words[token][1] += 1

        # compute weights
        no_terms = float(len(max_ngrams))
        for token, counts in self.context_words.items():
            corpus_count = self.corpus.count(
                self.text_from_tagged_ngram(token))
            self.weights[token] = 0.5 * (counts[1] / no_terms +
                                         counts[0] / corpus_count)

        # compute nc_value for each candidate
        for ngram, c in self.c_values:
            # accumulate weight for ngram using its context
            wei = 0
            for word in self.extract_context(ngram).keys():
                wei += (self.weights[word] + 1)
            nc = 1 / log(self.word_count) * c * wei
            self.nc_values.append((ngram, nc))
        self.nc_values.sort(key=lambda x: x[1], reverse=True)
        return [Term(word) for word, nc in
                self.nc_values[:self.term_number()]]

    def term_number(self):
        return int(ceil(TERM_PERCENTAGE * len(self.nc_values)))

    def extract_context(self, ngram):
        """
        Takes an ngram and retrieves the context for all of its
        occurrences. The context is a window of size 1. Only nouns,
        verbs and adjectives (CONTEXT_TYPES) are kept as context,
        others are ignored. For each context word the number of
        occurrences is computed and returned in the context dict.

        Param:
            ngram: list of tuples of form ("token", "pos-tag")
        Returns:
            context: dict of form ("token", "pos-tag") -> int mapping
            context words to the count of their occurrences

        TODO: Maybe take context as first CONTEXT_TYPE to the left and
        right, instead of just a window of size 1?
        """
        # for each term create a list with its offsets,
        # find sequences of consecutive offsets in all these lists
        len_ngram = len(ngram)
        list_of_offset_lists = []
        for token in ngram:
            list_of_offset_lists.append(self.conc_index.offsets(token))
        offsets = self.flatten_list(list_of_offset_lists)
        offsets.sort()
        subsequences = self.conseq_sequences(offsets, len_ngram)

        # check that offset-order is the same as word order in ngram
        offsets = []
        for seq in subsequences:
            ok = True
            for i in range(len_ngram):
                if not seq[i] in list_of_offset_lists[i]:
                    ok = False
                    break
            if ok:
                offsets.append(seq)

        # find nouns, verbs and adjectives in the context of ngram.
        # offsets has the form [[1,2,3], [5,6], ...], each sub-list
        # holding the offsets of one occurrence of ngram.
        # The n-th entry of a sub-list is the offset of the n-th word
        # of the ngram.
        context = defaultdict(lambda: 0)
        for occurrence in offsets:
            if (occurrence[0] - 1 >= 0 and
                    occurrence[-1] + 1 < self.word_count):
                pre = self.pos_tags[occurrence[0] - 1]
                post = self.pos_tags[occurrence[-1] + 1]
                for token in [pre, post]:
                    if token[1] in CONTEXT_TYPES:
                        context[token] += 1
        return context

    def conseq_sequences(self, li, length):
        """
        Takes a list and a length. Returns all sub-sequences in li
        that are consecutive (e.g. [1,2,3] or [5,6,7,8]) and of the
        right length.
        E.g.
        >>> conseq_sequences([1,6,7,8,9,8,9], length=3)
        [[6,7,8], [7,8,9]]
        """
        return [li[n:n + length] for n in range(len(li) - length + 1)
                if li[n:n + length] == range(li[n], li[n] + length)]

    def flatten_list(self, l):
        """ Takes a list of lists and returns the flattened version """
        return reduce(operator.add, l)

    def find_multi_word_terms(self):

        def _candidate_words(pos_tags):
            tree = MULTI_TERM_PARSER.parse(pos_tags)
            candidates = []
            for subtree in tree.subtrees():
                if subtree.node == "CHUNK":
                    cand = subtree.leaves()
                    if not cand in candidates:
                        candidates.append(cand)
            return candidates

        # find all maximal multi word terms
        candidates = _candidate_words(self.pos_tags)

        # check whether candidates contain sub words
        sub_words = []
        for mult_word in candidates:
            # create all ngrams with size n-1..1
            for n in range(len(mult_word) - 1, 0, -1):
                # extract candidates from each ngram
                for ngram in ngrams(mult_word, n):
                    sub_words += _candidate_words(list(ngram))
        for word in sub_words:
            if not word in candidates:
                candidates.append(word)
        return candidates

    def text_from_tagged_ngram(self, ngram):
        """
        Returns the text of a pos-tagged ngram.
        Param:
            ngram: list of tuples of form (word, pos-tag)
        Return:
            a string containing all words from the ngram separated by spaces
        """
        # zip(*ngram)[0] returns a tuple with words from a (word, tag) list
        if type(ngram) == tuple:
            return ngram[0]
        return " ".join(zip(*ngram)[0])

    def _compute_c_value(self, ngram, max_n):
        ngram_text = self.text_from_tagged_ngram(ngram)
        len_ngram = len(ngram)
        c_value = log(len_ngram, 2) * self.corpus.count(ngram_text)
        containing_ngrams = [candidate for candidate in self.candidate_cache
                             if ngram_text in candidate and
                             not ngram_text == candidate]
        if containing_ngrams:
            dependency_score = 0
            # find all candidates that contain the current one
            for container in containing_ngrams:
                dependency_score += self.corpus.count(container)
            c_value = (c_value -
                       float(1) / len(containing_ngrams) * dependency_score)
        self.c_values.append((ngram, c_value))
import logging

from nltk import FreqDist
from nltk.stem import PorterStemmer
from nltk.text import ConcordanceIndex
from numpy import sum as npsum  # "npsum" is presumably numpy.sum here
from numpy import vstack, zeros
from sklearn.cluster import KMeans

# cleanse_corpus, sized_context, word_vector_from_context,
# svd_reduced_eigenvectors and the dim_num / cluster_num / window_radius /
# svd_dim_num settings are defined elsewhere in the original module.


def train_fir_order(corpus, ambigous_words):
    logging.info("Start train first order co-occurence")
    stemmer = PorterStemmer()

    # containers
    sense_vectors = {}  # maps ambiguous words to ndarray of sense vectors
    estimators = {}

    # remove stop words and signs
    logging.info(" Start stemming and cleansing corpus")
    filtered = cleanse_corpus(corpus)
    logging.info(" {} different words after cleansing".format(
        len(set(filtered))))

    # find dimensions
    # (in older NLTK, FreqDist.keys() is sorted by decreasing frequency)
    logging.info(" Start finding dimensions")
    words_desc = FreqDist(filtered).keys()
    dimensions = words_desc[:dim_num]

    offset_index = ConcordanceIndex(filtered, key=lambda s: s.lower())

    for word in ambigous_words:
        logging.info(" Start train: {}".format(word))
        estimator = KMeans(cluster_num, "k-means++", n_init=20)

        # create context vectors for ambiguous words
        logging.info(" Start creating sense vectors")
        vectors = []
        offsets = offset_index.offsets(stemmer.stem(word))
        for offset in offsets:
            context = sized_context(offset, window_radius, filtered)
            vectors.append(word_vector_from_context(context, dimensions))

        # perform svd and dimension reduction
        logging.info(" Start svd reduction")
        context_matrix = vstack(vectors)
        svd_matrix = svd_reduced_eigenvectors(context_matrix, svd_dim_num)

        # create sense vectors for ambiguous context vectors
        logging.info(" Start clustering")
        # +++++++++ SVD switch here +++++++++++
        # estimator.fit(context_matrix)
        estimator.fit(svd_matrix)
        labels = estimator.labels_
        estimators[word] = estimator

        # labels tell which context belongs to which cluster in svd
        # space. Compute centroids in word space according to that
        logging.info(" Start centroid computation")
        centroids = []
        for i in range(cluster_num):
            cluster_i = [vector for vector, label in
                         zip(vectors, labels) if label == i]
            try:
                centroids.append(npsum(vstack(cluster_i), 0))
            except ValueError:
                logging.warning("CRITICAL: Empty sense vector")
                centroids.append(zeros(dim_num))
        sense_vectors[word] = centroids

        # draw_word_senses(svd_centroids, svd_matrix, labels)
        # draw_word_senses(vstack(centroids), context_matrix, labels)

    logging.info(" sense vectors:{}".format(len(sense_vectors['line'])))
    logging.info("end train")
    return sense_vectors, dimensions, estimators
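sized_context, word_vector_from_context and the various settings above are project-local, so the sketch below only illustrates the ConcordanceIndex-driven part of the loop: offsets() supplies every position of the (stemmed) target word, and a hypothetical window helper stands in for sized_context; the token list and radius are invented.

from nltk.text import ConcordanceIndex


def window(tokens, offset, radius):
    # hypothetical stand-in for sized_context: tokens around one offset
    return (tokens[max(0, offset - radius):offset] +
            tokens[offset + 1:offset + 1 + radius])


filtered = ["bank", "river", "bank", "money", "deposit", "bank"]
index = ConcordanceIndex(filtered, key=lambda s: s.lower())
contexts = [window(filtered, off, 2) for off in index.offsets("bank")]
print(contexts)   # one context list per occurrence of "bank"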