Exemplo n.º 1
0
def train_language(language, training_path):
    """Build and store an n-gram frequency model for one language.

    The training tokens gathered by ``filter_words`` are concatenated into a
    single string (prefixed with one space) and handed to the bigram, trigram
    and quadgram collocation finders.  Because the finders iterate over a
    string, the n-grams are sequences of characters.

    The stored model is a list of ``(ngram, count)`` pairs: all bigrams, then
    all trigrams, then all quadgrams, each group sorted by descending count.
    It is saved to ``MODELS_PATH + language + '.npy'``.

    Args:
        language: Language name; also used as the model file stem.
        training_path: Passed unchanged to ``filter_words``.
    """
    words = []
    # ``filter_words`` is expected to append to ``words`` in place; its return
    # value is not used.
    filter_words(training_path, words)
    seq = ' ' + ''.join(words)

    final_model = []
    for finder_cls in (BigramCollocationFinder,
                       TrigramCollocationFinder,
                       QuadgramCollocationFinder):
        finder = finder_cls.from_words(seq)
        finder.apply_freq_filter(FREQ_FILTER)
        final_model += sorted(finder.ngram_fd.items(),
                              key=lambda item: item[1],
                              reverse=True)

    model_path = MODELS_PATH + language + '.npy'
    # The pairs are ragged (n-gram tuples of length 2, 3 and 4 next to integer
    # counts).  NumPy >= 1.24 refuses to build an array from ragged nested
    # sequences unless dtype=object is requested explicitly.
    np.save(model_path, np.array(final_model, dtype=object))
    print("Language model for {} stored at {}".format(language, model_path))
Exemplo n.º 2
0
    def get_quadgrams(self, size):
        """Write the ``size`` most frequent quadgrams of ``self.word_set`` to CSV.

        The output file is ``<self.disease_type>-quadgram-freq-<size>.csv``.
        Each row holds one quadgram (its tokens UTF-8 encoded) followed by its
        count.  Rows are ordered by descending count, ties broken by the
        quadgram itself.

        Args:
            size: Maximum number of quadgrams to write.

        Returns:
            The tuple ``(self.full_training_quadgram_filename,
            self.full_test_quadgram_filename)``.
        """
        # The original 'training' / non-'training' branches built the same
        # path, so a single expression is enough.
        csv_path = self.disease_type + '-quadgram-freq-' + str(size) + '.csv'

        # The context manager guarantees the rows are flushed and the handle
        # is closed, which the bare open() call never did.
        with open(csv_path, 'w') as csv_file:
            writer = csv.writer(csv_file)

            finder = QuadgramCollocationFinder.from_words(self.word_set)
            top_quadgrams = sorted(
                finder.ngram_fd.items(),
                key=lambda item: (-item[1], item[0]))[:size]

            for quadgram_tuple, count in top_quadgrams:
                # NOTE(review): under Python 3 the encoded tuple is written as
                # the repr of a tuple of bytes -- confirm this is intended.
                encoded = type(quadgram_tuple)(
                    token.encode('utf-8') for token in quadgram_tuple)
                writer.writerow([encoded, count])

        # NOTE(review): neither attribute is assigned in this method; they must
        # be set elsewhere on the instance or this line raises AttributeError.
        return self.full_training_quadgram_filename, self.full_test_quadgram_filename
Exemplo n.º 3
0
def _get_quadgrams(words, top_n, min_freq):
    """Compile a regex that matches the best-scoring quadgrams of ``words``.

    Args:
        words: Iterable of tokens.
        top_n: Number of quadgrams to keep, ranked by the chi-square measure.
        min_freq: Quadgrams occurring fewer times than this are discarded.

    Returns:
        A compiled pattern with one capturing group whose alternatives are the
        selected quadgrams, tokens joined by a single space.  When no quadgram
        is selected the pattern is ``()``, which matches the empty string.
    """
    qcf = QuadgramCollocationFinder.from_words(iter(words))
    qcf.apply_freq_filter(min_freq)
    best = qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)
    # Tokens are arbitrary text: escape them so characters such as '(', '+'
    # or '|' are matched literally instead of corrupting the pattern.
    alternatives = [re.escape(' '.join(gram)) for gram in best]
    return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)
Exemplo n.º 4
0
def predict(test_string, models):
    """Score ``test_string`` against every language model.

    Args:
        test_string: Raw input; it is passed through ``pre_processing`` first.
        models: Sequence of entries where ``entry[0]`` is the model name,
            ``entry[1]`` is a container supporting ``ngram in ...`` and
            ``...[ngram]``, and ``entry[2]`` is used as the divisor when
            normalizing a frequency.

    Returns:
        ``(0, message)`` when the highest accumulated score is zero, otherwise
        ``(1, ranked)`` where ``ranked`` is a list of
        ``(normalized_score, model_name)`` tuples sorted in descending order.
    """
    # Clean the input before extracting n-grams.
    test_string = pre_processing(test_string)

    # Collect the 2-, 3- and 4-gram counts of the input into one list of
    # (ngram, freq) pairs.
    # NOTE(review): presumably test_string is a str here, which would make
    # these character n-grams -- confirm against pre_processing.
    bi_test = BigramCollocationFinder.from_words(test_string)
    tri_test = TrigramCollocationFinder.from_words(test_string)
    quad_test = QuadgramCollocationFinder.from_words(test_string)
    final_test = list(bi_test.ngram_fd.items()) + list(
        tri_test.ngram_fd.items()) + list(quad_test.ngram_fd.items())

    # Model names, in the same order as ``models`` / ``freq_sum``.
    model_name = []

    for model in models:
        model_name.append(model[0])

    # One accumulated score per model.
    freq_sum = np.zeros(len(models))
    for ngram, freq in final_test:
        # NOTE(review): ``exists`` is never changed after this line -- the
        # assignment further down writes to ``exist`` -- so the
        # ``if not exists`` branch below runs for every model.
        exists = 0

        for i, lang_model in enumerate(models):
            lang = lang_model[0]
            model = lang_model[1]
            total_ngram = lang_model[2]

            if ngram in model:
                if DEBUG:
                    print("Found", ngram, model[ngram], lang, total_ngram)
                # Scale by 10000 so freq / total_ngram does not vanish.
                freq_sum[i] = freq_sum[i] + (freq * 10000) / total_ngram
                exist = 1

            if not exists:
                freq_sum[i] += 1

        # NOTE(review): max_val and index are recomputed for every n-gram but
        # never read afterwards.
        max_val = freq_sum.max()
        index = freq_sum.argmax()

    # A zero maximum means no score was accumulated for any model.
    if not max(freq_sum):
        if DEBUG:
            print("[ERROR] Invalid string. String: {}".format(test_string))
        return 0, "Hmm, I do not know this word. Please try other words."

    # Pair each score with its model name, normalize via normalize_score and
    # rank from highest to lowest.
    # NOTE(review): _max is assigned but never used.
    _max = 0
    freq_to_model = list(zip(freq_sum, model_name))
    scores = [x for x, y in freq_to_model]
    normalized_scores_name = [(normalize_score(f, scores), m)
                              for f, m in freq_to_model]
    sorted_score_model = sorted(normalized_scores_name, reverse=True)

    if DEBUG: print("[DEBUG] Frequency to model: {}".format(freq_to_model))
    if DEBUG: print("[DEBUG] Scores: {}".format(scores))
    if DEBUG:
        print("[DEBUG] Normalized scores name: {}".format(
            normalized_scores_name))
    if DEBUG:
        print("[DEBUG] Reverse sorted score model: {}".format(
            sorted_score_model))

    return 1, sorted_score_model
Exemplo n.º 5
0
def rank_quadgrams(corpus, metric):
    """
    Find the quadgrams in the given corpus and score them with the supplied
    metric.

    Returns the scored quadgrams produced by ``score_ngrams``.
    """
    # Build the collocation finder from the corpus words, then score every
    # quadgram with the requested metric.
    finder = QuadgramCollocationFinder.from_words(corpus.words())
    return finder.score_ngrams(metric)
Exemplo n.º 6
0
def compute_ngrams_count(text_corpus, out_p, n=20):
    """Plot the ``n`` most common 1-, 2-, 3- and 4-grams of a corpus.

    Args:
        text_corpus: Iterable of documents, each an iterable of sentences;
            every sentence is tokenized with ``word_tokenize``.
        out_p: Passed through to ``util.plot_bar_chart_grid``.
        n: How many of the most common n-grams to keep per chart.

    Unigram counts exclude English stop words and a few punctuation tokens;
    the bigram, trigram and quadgram counts are taken over the unfiltered
    tokenized sentences.
    """
    print("Compute ngrams count...")
    list_of_tokens = [
        word_tokenize(sentence)
        for document in text_corpus
        for sentence in document
    ]

    # Unigram
    custom_sw = [".", "[", "]", ","]
    # A set gives constant-time membership tests in the filter below.
    sw = set(stopwords.words("english") + custom_sw)
    tokens = [w for w in util.flatten_one_level(list_of_tokens) if w not in sw]
    uni_mc = FreqDist(tokens).most_common(n)

    # Bigram
    bi = BigramCollocationFinder.from_documents(list_of_tokens)
    bi_mc = bi.ngram_fd.most_common(n)

    # Trigram
    tri = TrigramCollocationFinder.from_documents(list_of_tokens)
    tri_mc = tri.ngram_fd.most_common(n)

    # Quadgram
    quad = QuadgramCollocationFinder.from_documents(list_of_tokens)
    quad_mc = quad.ngram_fd.most_common(n)

    # Plot
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for position, most_common in enumerate(data):
        if position == 0:
            # Unigram keys are plain tokens.
            labels = [gram for gram, _ in most_common]
        else:
            # Longer n-gram keys are tuples of tokens.
            labels = [" ".join(gram) for gram, _ in most_common]
        counts = [count for _, count in most_common]
        # Reverse so the least frequent entry comes first in each series.
        x.append(labels[::-1])
        y.append(counts[::-1])
    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p, sup_title_font_size=16,
        tick_font_size=14, title_font_size=14, h_size=5, w_size=5, rotate=True)
Exemplo n.º 7
0
    def rank_grams(self, docs):
        """
        Build a collocation finder over ``docs`` and score its n-grams.

        The finder class is selected by ``self.n`` (2, 3 or 4).  The finder is
        stored in ``self.ngrams`` and the n-grams scored with ``self.metric``
        are stored in ``self.scored``; nothing is returned.

        Raises:
            ValueError: if ``self.n`` is not 2, 3 or 4.
        """
        # Create a collocation ranking utility from corpus words.
        if self.n == 2:
            self.ngrams = BigramCollocationFinder.from_words(docs)
        elif self.n == 3:
            self.ngrams = TrigramCollocationFinder.from_words(docs)
        elif self.n == 4:
            self.ngrams = QuadgramCollocationFinder.from_words(docs)
        else:
            # Without this guard an unsupported n would score a finder left
            # over from an earlier call, or fail on a missing attribute.
            raise ValueError(
                "rank_grams supports n = 2, 3 or 4, got {!r}".format(self.n))

        # Rank collocations by an association metric
        self.scored = self.ngrams.score_ngrams(self.metric)
Exemplo n.º 8
0
def top_words_quadcounter(job_type_list):
    """Return the 300 most frequent word quadgrams across the given texts.

    Args:
        job_type_list: Iterable of strings; they are joined with spaces,
            cleaned, lower-cased and tokenized with ``word_tokenize``.

    Returns:
        A list of at most 300 ``(quadgram, count)`` tuples sorted by
        descending count.
    """
    special_chars = ['--', '...', '\n', '•', '®', '·']
    text = ' '.join(job_type_list)
    # Turn separators into spaces BEFORE stripping punctuation.  In the
    # opposite order '--' and '...' are already deleted by the time they are
    # searched for, which glues the words they separated together.
    for char in special_chars:
        text = text.replace(char, ' ')
    # Remove the remaining punctuation and normalize case.
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    finder = QuadgramCollocationFinder.from_words(word_tokenize(text))
    ranked = sorted(finder.ngram_fd.items(), key=itemgetter(1), reverse=True)
    return ranked[:300]
Exemplo n.º 9
0
def generating_ngrams(words, n):
    """
    Build the collocation finder used as an individual language model.

    words: list of words present in a particular language (list(string))
    n: value of n for generating n-grams, values are 2,3,4 (int)

    Returns the collocation finder created from ``words``.

    Raises ValueError when ``n`` is not 2, 3 or 4.
    """
    if n == 2:
        return BigramCollocationFinder.from_words(words)  # 2-grams
    if n == 3:
        return TrigramCollocationFinder.from_words(words)  # 3-grams
    if n == 4:
        return QuadgramCollocationFinder.from_words(words)  # 4-grams
    # Previously this case printed a message and then failed with
    # UnboundLocalError on the return statement; fail explicitly instead.
    raise ValueError("Incorrect value of n")
Exemplo n.º 10
0
def rank_quadgrams(corpus, metric, path=None):
    """
    Score the quadgrams of ``corpus`` with the association ``metric``.

    With a truthy ``path`` the scores are written there as tab-separated
    lines under a header row and nothing is returned; otherwise the scored
    list is returned.
    """
    finder = QuadgramCollocationFinder.from_words(corpus.words())
    scored = finder.score_ngrams(metric)

    if not path:
        return scored

    with open(path, 'w') as out:
        out.write("Collocation\tScore ({})\n".format(metric.__name__))
        out.writelines(
            "{}\t{}\n".format(repr(ngram), score) for ngram, score in scored)
Exemplo n.º 11
0
def Collocation(contents, n):
    """Score the n-grams of ``contents`` by raw frequency.

    Args:
        contents: Sequence of tokens handed to the collocation finder.
        n: Size of the n-grams; must be 2, 3 or 4.

    Returns:
        The result of ``score_ngrams`` using the ``raw_freq`` measure of the
        matching association-measures class.

    Raises:
        ValueError: if ``n`` is not 2, 3 or 4.
    """
    # Validate first: previously an unsupported n printed a message and then
    # failed with UnboundLocalError on the return statement.
    if n not in (2, 3, 4):
        raise ValueError("Collocation is only available for n=2, 3, or 4.")

    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, TrigramAssocMeasures, TrigramCollocationFinder, QuadgramAssocMeasures, QuadgramCollocationFinder

    # n -> (finder class, association-measures class)
    by_size = {
        2: (BigramCollocationFinder, BigramAssocMeasures),
        3: (TrigramCollocationFinder, TrigramAssocMeasures),
        4: (QuadgramCollocationFinder, QuadgramAssocMeasures),
    }
    finder_cls, measures_cls = by_size[n]
    finder = finder_cls.from_words(contents)
    return finder.score_ngrams(measures_cls().raw_freq)
Exemplo n.º 12
0
def quadgram_feats(text, score_fn=NgramAssocMeasures.pmi, n_best=200):
    """Return a ``{quadgram: True}`` mapping for the ``n_best`` quadgrams of
    ``text`` as ranked by ``score_fn``."""
    finder = QuadgramCollocationFinder.from_words(text)
    return dict.fromkeys(finder.nbest(score_fn, n_best), True)
Exemplo n.º 13
0
    bfreq.append(v)
    print(k, v)
plot_bar_x(bigram, bfreq)

from nltk.collocations import TrigramCollocationFinder
# Count every trigram of the tokenized text, echo each count and chart them.
finder = TrigramCollocationFinder.from_words(word_tokenize(text))
trigram_counts = list(finder.ngram_fd.items())
trigram = [" ".join(gram) for gram, _ in trigram_counts]
tfreq = [count for _, count in trigram_counts]
for k, v in trigram_counts:
    print(k, v)
plot_bar_x(trigram, tfreq)

from nltk.collocations import QuadgramCollocationFinder
# Same procedure for quadgrams: count, echo each count and chart them.
finder = QuadgramCollocationFinder.from_words(word_tokenize(text))
quadgram_counts = list(finder.ngram_fd.items())
quadgram = [" ".join(gram) for gram, _ in quadgram_counts]
qfreq = [count for _, count in quadgram_counts]
for k, v in quadgram_counts:
    print(k, v)
plot_bar_x(quadgram, qfreq)

# Topic analysis

# Tokenizer built from the regular expression \w+
tokenizer = RegexpTokenizer(r'\w+')
# English stop words, held in a set
en_stop = set(stopwords.words('english'))
# Porter stemmer instance
p_stemmer = PorterStemmer()
Exemplo n.º 14
0
def _get_quadgrams(words, top_n, min_freq):
    """Compile a regex that matches the best-scoring quadgrams of ``words``.

    Args:
        words: Iterable of tokens.
        top_n: Number of quadgrams to keep, ranked by the chi-square measure.
        min_freq: Quadgrams occurring fewer times than this are discarded.

    Returns:
        A compiled pattern with one capturing group whose alternatives are the
        selected quadgrams, tokens joined by a single space.  When no quadgram
        is selected the pattern is ``()``, which matches the empty string.
    """
    qcf = QuadgramCollocationFinder.from_words(iter(words))
    qcf.apply_freq_filter(min_freq)
    best = qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)
    # Tokens are arbitrary text: escape them so characters such as '(', '+'
    # or '|' are matched literally instead of corrupting the pattern.
    alternatives = [re.escape(' '.join(gram)) for gram in best]
    return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)
Exemplo n.º 15
0
def _top_entries(tuples, num, token_filter, min_chars, max_chars):
    """Run ``df_top`` and return its 'entry' column as a list ([] if None)."""
    frame = df_top(tuples=tuples,
                   num=num,
                   token_filter=token_filter,
                   min_char=min_chars,
                   max_char=max_chars)
    return [] if frame is None else frame['entry'].tolist()


def process(text: str,
            num_1_grams: int = 100,
            num_2_grams: int = 100,
            num_3_grams: int = 100,
            num_4_grams: int = 100,
            min_chars: int = 3,
            max_chars: int = 30):
    """Extract keywords from text sources.

    Args:
        text: Source text; it is split into sentences by ``get_sentences``.
        num_1_grams: Number of 1-grams requested; a falsy value skips them.
        num_2_grams: Number of 2-grams requested; a falsy value skips them.
        num_3_grams: Number of 3-grams requested; a falsy value skips them.
        num_4_grams: Number of 4-grams requested; a falsy value skips them.
        min_chars: Forwarded to ``df_top`` as ``min_char``.
        max_chars: Forwarded to ``df_top`` as ``max_char``.

    Returns:
        A list of four lists holding the selected 1-, 2-, 3- and 4-gram
        entries.  A slot stays empty when that size is skipped or ``df_top``
        returns ``None``.

    Raises:
        AssertionError: if no sentence is left after de-duplication.
    """
    # Find all sentences in the text
    sents = get_sentences(text)

    # Keep only sentences that occur exactly once in the text
    sent_counter = collections.Counter(sents)
    sents = [sent for sent in sents if sent_counter[sent] == 1]

    # Tokenize each sentence
    sents = [RE_TOKEN.split(sent) for sent in sents]

    # Filter out non-alphabetic tokens and convert to lowercase
    sents = [[token.lower() for token in sent if is_alpha(token)]
             for sent in sents]

    # We look at two variants of the input sentences
    # a. For 1-grams, tokens rejected by filter_token are dropped and the
    #    rest are passed through RE_POSS.sub('', ...)
    # b. For 2-grams and longer, every token is kept, as stopwords and short
    #    tokens might provide some information in relation to other words
    sents_a = [[RE_POSS.sub('', t) for t in sent if filter_token(t)]
               for sent in sents]
    sents_b = sents

    # Raised explicitly (not via ``assert``) so the check also runs under
    # ``python -O``; the exception type is unchanged for callers.
    if not sents_a or not sents_b:
        raise AssertionError('Not enough words')

    counter = collections.Counter(
        token for sent in sents_a for token in sent)

    res = [[], [], [], []]

    if num_1_grams:
        # Represent tokens using a tuple with only one element to match the
        # format of the other ngrams with n > 1
        tuples = [((token, ), count) for token, count in counter.items()]
        res[0] = _top_entries(tuples, num_1_grams, filter_1_grams,
                              min_chars, max_chars)

    if num_2_grams:
        bigrams = BigramCollocationFinder.from_documents(sents_b)
        tuples = bigrams.score_ngrams(BigramAssocMeasures.raw_freq)
        res[1] = _top_entries(tuples, num_2_grams, filter_2_grams,
                              min_chars, max_chars)

    if num_3_grams:
        trigrams = TrigramCollocationFinder.from_documents(sents_b)
        tuples = trigrams.score_ngrams(TrigramAssocMeasures.raw_freq)
        res[2] = _top_entries(tuples, num_3_grams, filter_3_grams,
                              min_chars, max_chars)

    if num_4_grams:
        quadgrams = QuadgramCollocationFinder.from_documents(sents_b)
        tuples = quadgrams.score_ngrams(QuadgramAssocMeasures.raw_freq)
        res[3] = _top_entries(tuples, num_4_grams, filter_4_grams,
                              min_chars, max_chars)

    return res