Example #1
 def _compute_indices(self):  # type: () -> None
     if self.corpus is None:
         self.indices = None
         return
     # one case-insensitive concordance index per tokenised document
     self.indices = [
         ConcordanceIndex(doc, key=lambda x: x.lower())
         for doc in self.tokens
     ]
Example #2
 def _compute_indices(self):  # type: () -> None
     if self.corpus is None:
         self.indices = None
         return
     if self.corpus and not self.corpus.has_tokens():
         preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
         preprocessor(self.corpus)
     self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
                     for doc in self.corpus.tokens]
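Both versions build a case-insensitive ConcordanceIndex per document. A minimal, self-contained sketch of what such an index provides (the token list below is made up, not taken from the original project):

from nltk.text import ConcordanceIndex

tokens = ["The", "whale", "swam", ";", "the", "Whale", "dived", "."]
index = ConcordanceIndex(tokens, key=lambda x: x.lower())

print(index.offsets("whale"))   # [1, 5]  -- matches "whale" and "Whale"
print(index.offsets("THE"))     # [0, 4]  -- the key is applied to queries too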
Example #3
from nltk.text import ConcordanceIndex, Text


def concordance_2_txt(nome_p, tokens, left_margin=2, right_margin=4):
    text = Text(tokens)
    c = ConcordanceIndex(text.tokens)

    # For each occurrence of nome_p, take left_margin tokens before the hit
    # and right_margin tokens starting at the hit (clamped at the text start).
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(nome_p)
    ]

    return [''.join(x + ' ' for x in con_sub) for con_sub in concordance_txt]
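A hypothetical call to the function above (the sentence, search word, and margins are made up for illustration):

tokens = "the cat sat on the mat while the cat slept".split()
for line in concordance_2_txt("cat", tokens, left_margin=2, right_margin=3):
    print(line)
# -> "the cat sat on "
# -> "while the cat slept "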
Example #4
    def __init__(self, text):
        self.corpus = text.lower()
        self.pos_tags = pos_tag(text, True)
        self.word_count = len(self.pos_tags)
        self.c_values = []  # form [(c-value, ngram)]
        self.nc_values = []  # form: [(ngram, nc-value)]
        self.candidate_cache = []
        self.context_words = defaultdict(lambda: [0, 0])
        self.conc_index = ConcordanceIndex(self.pos_tags)

        # maps from ("token", "pos-tag") to
        # (freq. as context word, no. of ngrams it appears with):
        self.weights = defaultdict(int)
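This example indexes (token, POS-tag) pairs instead of plain strings. Since ConcordanceIndex accepts any sequence of hashable tokens, the index can then be queried with the same kind of pair. A small sketch with made-up data (not the project's actual tags):

from nltk.text import ConcordanceIndex

tagged = [("the", "DT"), ("cell", "NN"), ("divides", "VBZ"),
          ("the", "DT"), ("cell", "NN"), ("cycle", "NN")]
index = ConcordanceIndex(tagged)

print(index.offsets(("cell", "NN")))   # [1, 4]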
Example #5
 def get_cache(self, language):
     if language not in self._byLanguage:
         self._byLanguage[language] = dict()
         self._byLanguage[language]['texts'] = dict()
         self._byLanguage[language]['indices'] = dict()
         with self.get_lock():
             if exists(f'cache/{language}.ready'):
                 self._load_cache(language)
             else:
                 corpus_names = self.app_config['phraseExamples'][language]
                 for corpus_name in corpus_names:
                     corpus = getattr(nltk.corpus, corpus_name)
                     text = nltk.Text(corpus.words())
                     index = ConcordanceIndex(text.tokens, key=self.key_func)
                     self._byLanguage[language]['texts'][corpus_name] = text
                     self._byLanguage[language]['indices'][corpus_name] = index
                 self._save_cache(language)
     texts = self._byLanguage[language]['texts']
     indices = self._byLanguage[language]['indices']
     return texts, indices
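A hypothetical use of the returned cache; the object name `cache`, the language 'english', and the corpus name 'gutenberg' are assumptions, not taken from the snippet:

texts, indices = cache.get_cache('english')        # hypothetical call site
offsets = indices['gutenberg'].offsets('whale')    # token positions, may be empty
if offsets:
    print(texts['gutenberg'].tokens[offsets[0]])   # token at the first hit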
Example #6
File: worsed.py  Project: cartisan/worsed
def train_fir_order(corpus, ambigous_words):
    logging.info("Start train first order co-occurrence")
    stemmer = PorterStemmer()

    # containers
    sense_vectors = {}  # maps ambiguous words to ndarray of sense vectors
    estimators = {}

    # remove stop words and signs
    logging.info("  Start stemming and cleansing corpus")
    filtered = cleanse_corpus(corpus)
    logging.info("  {} different words after cleansing".format(
        len(set(filtered))))

    # find dimensions
    logging.info("  Start finding dimensions")
    # take the dim_num most frequent words as the vector-space dimensions
    # (FreqDist.keys() is no longer frequency-sorted in recent NLTK versions)
    dimensions = [w for w, _ in FreqDist(filtered).most_common(dim_num)]
    offset_index = ConcordanceIndex(filtered, key=lambda s: s.lower())

    for word in ambigous_words:
        logging.info("  Start train: {}".format(word))
        estimator = KMeans(n_clusters=cluster_num, init="k-means++", n_init=20)

        # create context vectors for the ambiguous word
        logging.info("    Start creating sense vectors")
        vectors = []
        offsets = offset_index.offsets(stemmer.stem(word))
        for offset in offsets:
            context = sized_context(offset, window_radius, filtered)
            vectors.append(word_vector_from_context(context, dimensions))

        # perform svd and dimension reduction
        logging.info("    Start svd reduction")
        context_matrix = vstack(vectors)
        svd_matrix = svd_reduced_eigenvectors(context_matrix, svd_dim_num)

        # cluster the ambiguous word's context vectors into senses
        logging.info("    Start clustering")

        # +++++++++ SVD switch here +++++++++++
        #estimator.fit(context_matrix)
        estimator.fit(svd_matrix)

        labels = estimator.labels_
        estimators[word] = estimator

        # labels tell which context belongs to which cluster in svd
        # space. Compute centroids in word space according to that
        logging.info("    Start centroid computation")
        centroids = []
        for i in range(cluster_num):
            cluster_i = [vector for vector, label in
                         zip(vectors, labels) if label == i]
            try:
                centroids.append(npsum(vstack(cluster_i), 0))
            except ValueError:
                logging.warning("CRITICAL: Empty sense vector")
                centroids.append(zeros(dim_num))

        sense_vectors[word] = centroids

        #draw_word_senses(svd_centroids, svd_matrix, labels)
        #draw_word_senses(vstack(centroids), context_matrix, labels)

    logging.info("  sense vectors:{}".format(len(sense_vectors['line'])))
    logging.info("end train")
    return sense_vectors, dimensions, estimators
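cleanse_corpus, sized_context, and word_vector_from_context are project helpers defined elsewhere in worsed.py and are not shown here. As a rough, unconfirmed guess at the windowing step that the ConcordanceIndex offsets drive, such a helper might look like this sketch (not the project's actual implementation):

def sized_context_sketch(offset, radius, tokens):
    # tokens within `radius` positions of the hit, excluding the hit itself
    lo = max(offset - radius, 0)
    return tokens[lo:offset] + tokens[offset + 1:offset + radius + 1]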