Example #1
def bigrams(text, exclude=pronouns, freq=3, limit=10):
    # Rank bigram collocations in `text` by PMI, dropping excluded words
    # and bigrams that occur fewer than `freq` times.
    bigram_measures = collocations.BigramAssocMeasures()
    finder = collocations.BigramCollocationFinder.from_words(
        word_tokenize(text))
    finder.apply_word_filter(lambda w: w in exclude)
    finder.apply_freq_filter(freq)
    return finder.nbest(bigram_measures.pmi, limit)
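
A minimal call sketch for the helper above, assuming the usual NLTK imports and some `pronouns` exclusion set; neither is shown in the snippet.

# Sketch only: the imports and the `pronouns` set are assumptions, not part of the snippet.
from nltk import collocations
from nltk.tokenize import word_tokenize

# `pronouns` must exist before the bigrams() definition above is executed,
# because it is used as a default argument value.
pronouns = {'i', 'you', 'he', 'she', 'it', 'we', 'they'}

text = "the quick brown fox jumps over the lazy dog " * 5
print(bigrams(text, exclude=pronouns, freq=3, limit=5))
# e.g. [('brown', 'fox'), ('lazy', 'dog'), ...]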
Example #2
def bigrams(tweets_words, stop_words):
    """ Creates bigrams out of a dataset """
    bigrams_measures = collocations.BigramAssocMeasures()  # created but not used below
    bigram_finder = collocations.BigramCollocationFinder.from_words(tweets_words)
    bigram_freq = bigram_finder.ngram_fd.items()

    bigramFreqTable = pd.DataFrame(
        list(bigram_freq), columns=['bigram', 'freq']
    ).sort_values(by='freq', ascending=False)
    # keep only bigrams whose tokens pass the rightTypes() check
    filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x, stop_words))]
    freq_bi = filtered_bi.bigram.values
    return freq_bi
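
A usage sketch for the function above, assuming the pandas and nltk imports the snippet implies. rightTypes is not included in the source, so the stand-in below is a hypothetical filter that keeps only bigrams made of alphabetic, non-stopword tokens.

# Hypothetical stand-in for the rightTypes() helper the snippet relies on.
def rightTypes(ngram, stop_words):
    return all(w.isalpha() and w.lower() not in stop_words for w in ngram)

tweets_words = "good morning world good morning twitter good vibes only".split()
stop_words = {'only', 'the', 'a'}
print(bigrams(tweets_words, stop_words))
# array of bigram tuples, most frequent first, e.g. ('good', 'morning') at the top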
Example #3
def get_collocation_pairs(article, data_save_file):
    nouns = get_nouns(article)  # extracted here but not used below
    measures = collocations.BigramAssocMeasures()
    tagged_words = Mecab().pos(article)  # (word, POS-tag) pairs from the Mecab tagger

    if data_save_file != False:
        np.save(data_save_file, tagged_words)
    finder = collocations.BigramCollocationFinder.from_words(tagged_words)
    score_words = finder.score_ngrams(measures.likelihood_ratio)

    word_pairs = find_NNpairs(score_words)

    return word_pairs
Example #4
    def retrieve(self, request, *args, **kwargs):

        jpype.attachThreadToJVM()
        board = Board.objects.get(pk=kwargs['pk'])
        measures = collocations.BigramAssocMeasures()
        tagged_words = Twitter().pos(board.content)
        finder = collocations.BigramCollocationFinder.from_words(tagged_words)
        result = finder.nbest(measures.pmi,
                              10)  # top 10 bigrams with highest PMI
        text_result = ""
        for tuples in result:
            text_result += ",".join(tuples[0])
            text_result += "|"
            text_result += ",".join(tuples[1])

        response = BoardAnalyze.objects.create(board_id=board,
                                               result=text_result)
        serializer = self.get_serializer(response)
        return Response(serializer.data)
Example #5
    def retrieve(self, request, *args, **kwargs):

        jpype.attachThreadToJVM()
        board = Board.objects.get(pk=kwargs['pk'])
        measures = collocations.BigramAssocMeasures()
        tagged_words = Twitter().pos(board.content)
        words = [w for w, t in tagged_words]
        ignored_words = [u'안녕']
        finder = collocations.BigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
        finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
        result = finder.nbest(measures.pmi, 10)
        if result:
            text_result = ",".join(result[0])
        else:
            text_result = ""

        response = BoardAnalyze.objects.create(board_id=board,
                                               result=text_result)
        serializer = self.get_serializer(response)
        return Response(serializer.data)
Example #6
def extract_top_bigrams_collocations(collection, num=10, frequencyThreshold=3, windows_size=5, filter_word=None):
    """
    This methods extracts, for each document collection, the top N bigram collocations. Bigram collocations
    are pairs of words which commonly co-occur. With a windows_size < 2, only bigrams formed by consecutive words
    will be taken into account. In that case, the result is consistent with a list of 2-words expressions that
    frequently appear in the collection. For windows_size > 2, all pairs of words within a windows of windows_size
    words will be considered. In that case, the result is consistent with a list of 2 related words that frequently
    co-occur together and therefore commonly have a semantic relationship between them
    """
    from nltk import collocations
    words = tokenize_collection(collection)
    bigram_measures = collocations.BigramAssocMeasures()
    if windows_size > 2:
        finder = collocations.BigramCollocationFinder.from_words(words, windows_size)
    else:
        finder = collocations.BigramCollocationFinder.from_words(words)

    finder.apply_freq_filter(frequencyThreshold)
    if filter_word:
        finder.apply_ngram_filter(lambda *w: filter_word not in w)
    return finder.nbest(bigram_measures.chi_sq, num)
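
A hedged usage sketch to illustrate the windows_size behaviour the docstring describes; it assumes tokenize_collection (not shown here) flattens the document collection into a single token list.

# Sketch only: the documents are illustrative and tokenize_collection() is assumed.
docs = [
    "machine learning models learn patterns from data",
    "deep learning models learn representations from data",
    "machine learning needs lots of data",
]
# windows_size <= 2: consecutive-word bigrams, i.e. two-word expressions.
print(extract_top_bigrams_collocations(docs, num=5, frequencyThreshold=2, windows_size=2))
# windows_size > 2: any word pair co-occurring within a 5-word window.
print(extract_top_bigrams_collocations(docs, num=5, frequencyThreshold=2, windows_size=5))
# keep only pairs that contain a given word
print(extract_top_bigrams_collocations(docs, num=3, frequencyThreshold=2,
                                        windows_size=5, filter_word="learning"))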
Example #7
def main():
    total = len(sys.argv)

    if total < 2:
        print("Usage: python gen_graph.py <input_file>")
        exit(0)

    twts = read_json(str(sys.argv[1]))

    print("reading and cleanup done!")

    collocation = collocations.BigramCollocationFinder.from_documents(twts)

    bigram_measures = collocations.BigramAssocMeasures()

    print("Creating Bi-grams Collocation")

    c_list = []

    # keep only bigrams that occur more than once
    for each in collocation.ngram_fd.items():
        if each[1] > 1:
            c_list.append(each)

    c_list.sort(key=operator.itemgetter(1), reverse=True)

    print("Generating Graph")

    g = nx.Graph()

    # add the 50,000 most frequent bigrams as nodes and weighted edges
    for each in c_list[:50000]:
        g = add_node(g, each[0][0])
        g = add_node(g, each[0][1])
        g = add_edge(g, each[0][0], each[0][1], each[1])

    print(len(g))

    nx.write_graphml(g, '../data/test_graph_words_pos.graphml')

    print("Done")
Example #8
def phrase_list(filename):
    with open(filename, 'rb') as f:
        article = f.readlines()
    try:
        # skip the two header lines, then join the remaining paragraphs
        data = article[2:]
        content = data[0].decode("utf-8").rstrip('\n')
        for paragraph in data[1:]:
            content = content + " " + paragraph.decode("utf-8").rstrip('\n')
    except Exception:
        return []

    txt = content
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [
        w for sentence in normalized_sentences
        for w in nltk.tokenize.word_tokenize(sentence)
        if w not in stop_words and not re.match(prog_num, w)
    ]
    words = [w.strip(u'\u201c\u201d\u2018') for w in words]
    words = [w.strip(',.') for w in words]
    #print words
    #words = Lemmatizer([words])
    #print words

    bigram_measures = collocations.BigramAssocMeasures()
    bigram_finder = collocations.BigramCollocationFinder.from_words(words)
    #bigram_finder.apply_freq_filter(BIGRAM_FILTER)

    threshold = min(int(0.05 * len(words)), 100)
    phrase = []
    for bigram in bigram_finder.score_ngrams(
            bigram_measures.raw_freq)[:threshold]:
        phrase.append(bigram[0])

    return phrase
Example #9
    while tmp[tmpIndex] != ' ':
        if tmpIndex > 0:
            tmpIndex -= 1
        else:
            break
    while tmp[index] != ' ':
        if len(tmp) - 1 != index:
            index += 1
        else:
            break
    return " " + tmp[tmpIndex:tmp.find('-')] + " " + tmp[tmp.find('-') +
                                                         1:index]


mecab = Mecab()
bigram_measures = collocations.BigramAssocMeasures()

import time
import sys
from threading import Condition
_CONDITION = Condition()


@route('/classify')
def classify():

    # Python 2 only: reload(sys) restores setdefaultencoding, then force UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')

    specialLetter = "( ) [ ] { } % # & * @ § ※ ☆ ★ ○ ● ◎ ◇ ◆ □ ■ △ ▲ ▽ ▼ → ← ↑ ↓ ↔ 〓 ◁ ◀ ▷ ▶ ♤ ♠ ♡ ♥ ♧ ♣ ⊙ ◈ ▣ ◐ ◑ ▒ ▤ ▥ ▨ ▧ ▦ ▩ ♨ ☏ ☎ ☜ ☞ ¶ † ‡ ↕ ↗ ↙ ↖ ↘ ♭ ♩ ♪ ♬ ㉿ ㈜ № ㏇ ㏂ ㏘ ℡ ? ª º ☞ ☜ ▒ "
    specialLetter += "─ │ ┌ ┐ ┘ └ ├ ┬ ┤ ┴ │ ━ ┃ ┏ ┓ ┛ ┗ ┣ ┳ ┫ ┻ ╋ ┠ ┯ ┨ ┷ ┿ ┝ ┰ ┥ ┸ ╂ ┒ ┑ ┚ ┙ ┖ ┕ ┎ ┍ ┞ ┟ ┡ ┢ ┦ ┧ ┩ ┪ ┭ ┮ ┱ ┲ ┵ ┶ ┹ ┺ ┽ ┾ ╀ ╁ ╃ ╄ ╅ ╆ ╇ ╈ ╉ ╊ "
Example #10
def n_gram_creator(tokens,
                   top_n=20,
                   n=2,
                   freq_filter=None,
                   window_size=None,
                   counts=False,
                   show_freq=True,
                   show_pmi=False,
                   keep=None):
    """Helper function creating [2-4]-grams with a variety of options."""

    import nltk.collocations as colloc
    from nltk import bigrams, trigrams

    ## Check if n-gram is supported
    if n in [2, 3, 4]:

        ## Allowing for non-contiguous ngram creation
        if isinstance(window_size, int):
            window = window_size
        else:
            window = n

        ## Bigram setup
        if n == 2:
            word = 'Bi'

            if counts:
                ngrams = bigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.BigramAssocMeasures()
                ngram_finder = colloc.BigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Trigram setup
        elif n == 3:
            word = 'Tri'

            if counts:
                ngrams = trigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.TrigramAssocMeasures()
                ngram_finder = colloc.TrigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Quadgram setup (note: the counts shortcut is not implemented for quadgrams)
        elif n == 4:
            word = 'Quad'
            ngram_measures = colloc.QuadgramAssocMeasures()
            ngram_finder = colloc.QuadgramCollocationFinder.from_words(
                tokens, window_size=window)

        ## Applying frequency filter to results if selected for
        if isinstance(freq_filter, int):
            ngram_finder.apply_freq_filter(freq_filter)

        ## Create ngram scores
        ngram_score = ngram_finder.score_ngrams(ngram_measures.raw_freq)
        ngram_pmi_score = ngram_finder.score_ngrams(ngram_measures.pmi)

        ## Optional display
        if show_freq:
            print(f'Top {top_n} {word}-grams by frequency')
            display(ngram_score[:top_n])

        ## Optional display
        if show_pmi:
            print(f'PMI score for {top_n} {word}-grams')
            display(ngram_pmi_score[:top_n])

        ## Optional return
        if keep == 'score':
            return ngram_score
        elif keep == 'pmi':
            return ngram_pmi_score

    ## Messaging for non-supported ngrams
    else:
        return f"{n}-grams are not supported. Try 2, 3, or 4."
Example #11
import logging
import nltk
from nltk import collocations

from typing import List, Tuple

from freqdist.models import TextFile
from kwic.utils import _clean_texts, _get_texts
from core.models import Example

logger = logging.getLogger(__name__)

MEASURES_FINDERS_DICT = {
    'bigram': [
        collocations.BigramAssocMeasures(),
        collocations.BigramCollocationFinder,
    ],
    'trigram': [
        collocations.TrigramAssocMeasures(),
        collocations.TrigramCollocationFinder,
    ],
    'quadgram': [
        collocations.QuadgramAssocMeasures(),
        collocations.QuadgramCollocationFinder,
    ]
}


def get_collocates(ngram: str,
                   assoc_measure: str,
                   include_examples: bool,
Example #12
def setModel():
    measures = collocations.BigramAssocMeasures()
    twitter = Twitter()

    path = "./soma_classifier.csv"
    train_df = pd.read_csv(path)
    # train_df = pd.read_pickle("soma_goods_train.df")

    # nltk.download("stopwords")
    # nltk.download("punkt")
    # nltk.download("maxent_treebank_pos_tagger")
    # nltk.download('maxent_treebank_pos_tagger')

    # nltk.download("all")

    # pattern = r'''(?x) ([A-Z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
    # Build a list of the product names used for training.
    d_list = []
    # Build a list of category labels, joined with ';' as the separator
    cate_list = []


    # iterrows(): iterate over (index, row) pairs
    for each in train_df.iterrows():
        # join(): concatenate the category levels using ';' as the separator
        cate = ";".join([each[1]['cate1'], each[1]['cate2'], each[1]['cate3']])

        d_list.append(each[1]['name'])
        cate_list.append(cate)
    # print(type(d_list[0]))
    print(d_list)
    # print(len(d_list))
    # words =[]
    # print('Collocations among tagged words:')
    # for d in d_list :
    #     words.append(' '.join(twitter.nouns(d)))
    #
    # print(words)
    pattern = r'''(?x) ([A-Za-z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''

    pat = r'[a-zA-Z]+'
    # pat = r'[a - zA - Z]+'
    words = []
    words_kor = []
    # print('Collocations among tagged words:')
    for d in d_list:
        tokens = word_tokenize(d)
        # tokens = d.split()
        # print(tokens)
        # print(twitter.nouns(d))
        words_kor.append(' '.join(twitter.nouns(d)))
        words.append(' '.join(tokens))

    # print(words)
    # print(words_kor)
    # print(len(words_kor))
    words_en = []


    for word in words:
        temp = []
        # print("word : " + word)
        tokens_en = regexp_tokenize(word, pat)
        print(tokens_en)
        temp.append(' '.join(tokens_en))
        for temp in tokens_en:  # note: this loop variable shadows the temp list above
            # print("temp : " + temp)
            # tagged_word = nltk.pos_tag(temp.split())
            tagged_word = nltk.pos_tag(word_tokenize(temp))
            print(tagged_word)
            nouns = [token for token, pos in tagged_word if pos.startswith('N')]
            # print(nouns)
            words_en.append(nouns)

        # words_en.append(temp)


        # nouns = [word for word, pos in tagged_sent if pos == 'Noun']
        # print(nouns)
        # words_en.append(nouns)

    # print(words_en)
    # print(len(words_en))
    # for word in words_en :
    #     print(word)
        # print(word[0])

        # tagged_word = nltk.pos_tag(word[0].split())
        # nouns = [token for token, pos in tagged_word if pos.startswith('N')]
        # words_en.append(' '.join(nouns))


    # print(len(words_en))
    # print(len(words))
    # Up to here!


    # Group identical categories into one... same as a group by!!
    # print(set(cate_list))
    # object to list
    # print(list(set(cate_list)))
    # Assign a serial numeric id to each category name.
    # Takes the form cate_dict[category_name] = serial_id.
    # dict(): stored in the form value : key(id)

    cate_dict = dict(zip(list(set(cate_list)), range(len(set(cate_list)))))

    # print(cate_dict), {'디지털/가전;PC부품;CPU' : O, '패션의류;아동의류;한복':1}
    # print(cate_dict['디지털/가전;PC부품;CPU']), 0

    # Append the serial_id values corresponding to each textual category name.
    y_list = []
    for each in train_df.iterrows():
        cate = ";".join([each[1]['cate1'], each[1]['cate2'], each[1]['cate3']])
        y_list.append(cate_dict[cate])

    # print(y_list)
    # print(len(y_list))

    # Build a matrix of word frequencies from each product's name.
    # vectorizer = CountVectorizer()
    # x_list = vectorizer.fit_transform(d_list)
    # x_list = vectorizer.fit_transform(words)

    # print(x_list.shape)
    # print(x_list)
    # print the words
    # for x in x_list:
    #     print(x.indices)
    #     print(x.data)
    #     print(x.indptr)

        # for word in x.indices :
        #     doc = vectorizer.get_feature_names()[word]
        #     print(doc)


    # vectorizer100 = CountVectorizer(max_features=100)
    # x100_list = vectorizer100.fit_transform(d_list)
    # print(set(x100_list))

    # print(len(vectorizer100.get_feature_names()))

    # print the words
    # for x10 in x100_list:
    #     for word in x10.indices :
    #         temp = kkma.pos(vectorizer100.get_feature_names()[word])

    # svc_param = {'C': np.logspace(-2, 0, 20)}
    # # svc_param = {'C': np.logspace(-2, 0, 5)}
    # #
    # gs_svc = GridSearchCV(LinearSVC(loss='l2'), svc_param, cv=5, n_jobs=4)
    # gs_svc.fit(x_list, y_list)
    #
    # print(gs_svc.best_params_, gs_svc.best_score_)
    #
    #
    # clf = LinearSVC(C=gs_svc.best_params_['C'])
    # clf.fit(x_list, y_list)
    #
    # joblib.dump(clf, 'classify.model', compress=3)
    # joblib.dump(cate_dict, 'cate_dict.dat', compress=3)
    # joblib.dump(vectorizer, 'vectorizer.dat', compress=3)
    return
Example #13
    def fit_transform(self, comments, y=None):
        designed, filtered_words_lower, filtered_words, comments_prep = \
                self._preprocess(comments)

        empty_analyzer = lambda x: x
        self.unigram_vect = TfidfVectorizer(analyzer=empty_analyzer, min_df=3)
        print("vecorizing")
        unigrams = self.unigram_vect.fit_transform(filtered_words_lower)

        # pos tag vectorizer
        #self.pos_vect = TfidfVectorizer(analyzer=empty_analyzer).fit(tags)

        # fancy vectorizer
        self.you_are_vect = TfidfVectorizer(
            token_pattern="(?i)you are(?: an?)?(?: the)?(?: as)? (\w+)")
        you_are = self.you_are_vect.fit_transform(comments_prep)

        # get the google bad word list
        #with open("google_badlist.txt") as f:
        self.bigram_measures = col.BigramAssocMeasures()
        self.trigram_measures = col.TrigramAssocMeasures()

        # extract bigram collocations including "you" (and your?)
        #col.BigramCollocationFinder.from_words([w for c in
        #filtered_words_lower
        #for w in c], window_size=4)

        col_you_bi = col.BigramCollocationFinder.from_documents(
            filtered_words_lower)
        col_you_bi.apply_freq_filter(3)
        col_you_bi._apply_filter(lambda x, y: np.all([w != "you" for w in x]))
        # < 400 of these
        self.you_bigrams = col_you_bi.nbest(self.bigram_measures.chi_sq, 1000)
        self.col_you_bi = col_you_bi
        # make tfidfvectorizer that uses these bigrams
        self.bigram_vect_you = TfidfVectorizer(
            analyzer=make_collocation_analyzer(self.you_bigrams), min_df=3)
        you_bigrams = self.bigram_vect_you.fit_transform(filtered_words_lower)

        # extract trigram collocations
        col_you_tri = col.TrigramCollocationFinder.from_documents(
            filtered_words_lower)
        col_you_tri.apply_freq_filter(3)
        col_you_tri._apply_filter(lambda x, y: np.all([w != "you" for w in x]))
        # < 400 of these, too
        self.you_trigrams = col_you_tri.nbest(self.trigram_measures.chi_sq,
                                              1000)
        self.col_you_tri = col_you_tri
        self.trigram_vect_you = TfidfVectorizer(
            analyzer=make_collocation_analyzer(self.you_trigrams, 3), min_df=3)
        you_trigrams = self.trigram_vect_you.fit_transform(
            filtered_words_lower)

        ## some handcrafted features!
        designed.extend(
            self._handcrafted(
                filtered_words,
                comments,
                filtered_words_lower,
            ))
        designed = np.array(designed).T
        self.scaler = MinMaxScaler()
        designed = self.scaler.fit_transform(designed)
        features = []
        features.append(unigrams)
        features.append(you_bigrams)
        features.append(you_trigrams)
        features.append(you_are)
        #features.append(pos_unigrams)
        features.append(sparse.csr_matrix(designed))
        features = sparse.hstack(features).tocsr()

        return features
Example #14

if __name__ == '__main__':
    papers = load_papers()
    raw_spacy_papers, spacy_papers, clean_papers = clean_text(papers)
    sent_lengths, syllables, readability, asl, asw = get_Flesch_Kincaid(
        raw_spacy_papers)
    words, counts = get_top_grams([spacy_papers['Aryan'].text], n=1, top=3)
    make_zipf_plot(counts, words, title='Zipf plot of Aryan\'s essay')
    for stu in spacy_papers.keys():
        words, counts = get_top_grams([spacy_papers[stu].text], n=1, top=3)
        ti = 'Zipf plot of {}\'s essay'.format(stu)
        path = '/home/nate/Dropbox/regis/RCC200/zipf/'
        make_zipf_plot(counts, words, title=ti, savepath=path, save=True)

    bigram_measures = nc.BigramAssocMeasures()
    trigram_measures = nc.TrigramAssocMeasures()
    finder = nc.BigramCollocationFinder.from_documents(
        [[word.text for word in spacy_papers['Aryan']]])
    print('top 10 2-grams by PMI')
    top_bigrams = finder.nbest(bigram_measures.pmi, 10)
    if finder.ngram_fd[top_bigrams[0]] > 1:
        print('count of times the top bigram by PMI appears:')
        print(finder.ngram_fd[top_bigrams[0]])

    # analyze my essay for comparison
    with open(
            '/home/nate/Dropbox/regis/RCC200/essays/short_assignment_1_turing/raw_text.txt',
            'rb') as f:
        text = f.read().decode('utf-8')
Example #15
    def get_bigram_analyzer(self, n, words):
        LOGGER.info("Building Bigram Analyzer")
        bigram_measures = collocations.BigramAssocMeasures()
        finder = collocations.BigramCollocationFinder.from_words(words)
        return BigramAnalyzer(
            finder.above_score(bigram_measures.likelihood_ratio, n))
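
For reference, above_score() yields the bigrams whose likelihood-ratio score exceeds the threshold n; the BigramAnalyzer wrapper is not part of the snippet. A minimal sketch of the underlying NLTK call:

# Sketch only: shows the above_score() call without the BigramAnalyzer wrapper.
from nltk import collocations
from nltk.tokenize import word_tokenize

words = word_tokenize("strong tea and strong coffee and more strong tea with strong coffee")
bigram_measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(words)
for pair in finder.above_score(bigram_measures.likelihood_ratio, 3.0):
    print(pair)  # each bigram scoring above the 3.0 likelihood-ratio threshold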