def TkloadingTweetsAndUserInfoData(args, resultTextbox, window):
    '''
    Load the data and perform the preprocessing.
    '''

    # Check whether the pre-processed datasets (training, validation, test) already exist. If they do, load them instead of running the preprocessing again.
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):

        # Check whether the pre-processed df (the overall pandas DataFrame) exists. If it doesn't, load the original data and run the preprocessing.
        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):
            '''
            resultTextbox.insert("end", "String_Here") -> appends a string to the result box
            window.update_idletasks() -> makes the window refresh so the result box updates
            '''

            # Add the loading information to the result box
            resultTextbox.insert(
                "end",
                ("Loading " +
                 str(os.path.join(args.dataset,
                                  "FullTweetsDataNoOrdered.html")) + ' and ' +
                 str(
                     os.path.join(args.dataset,
                                  "FullExtraInfoDataNoOrdered.csv")) +
                 " to do the Proprocessing\n"))
            window.update_idletasks()

            # Load the original dataset, which consists of an HTML file (the tweet text) and a CSV file (other tweet and user information)

            # Load the tweets text
            tweets_df = pd.read_html(
                os.path.join(args.dataset, "FullTweetsDataNoOrdered.html"))
            tweets_df = pd.DataFrame(list(tweets_df[0].iloc[1:][0]))
            tweets_df.columns = ['text']

            # Load other information
            extraInfo_df = pd.read_csv(
                os.path.join(args.dataset, "FullExtraInfoDataNoOrdered.csv"))

            # Concatenate the two loaded DataFrames
            df = pd.concat([tweets_df, extraInfo_df], axis=1)

            # Delete the loaded DataFrames after concatenating, since we now have the combined df.
            del tweets_df
            del extraInfo_df

            # Show the dataset size in the result box
            resultTextbox.insert("end",
                                 ("Dataset size: " + str(len(df)) + "\n"))
            window.update_idletasks()

            def preprocessingInputTextData(colName):
                '''
                Preprocess the text column given by colName.
                '''
                inputCol = df[colName]
                ps = nltk.stem.PorterStemmer()  # Init Porter Stemmer
                tknzr = TweetTokenizer()  # Init Tweet Tokenizer
                allText = [i for i in inputCol]

                ## The detailed preprocessing steps are described in the report
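                ## In order: replace URLs (via matchingURL), strip digits, lowercase the
                ## sentence, tokenize with the TweetTokenizer, drop English stopwords and
                ## words shorter than 3 characters, then apply Porter stemming.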
                preprocessedText = [[
                    ps.stem(word) for word in tknzr.tokenize(
                        re.sub(
                            r'\d+', '',
                            re.sub(r"http\S+|www.\S+", matchingURL,
                                   sentence)).lower())
                    if word not in nltk.corpus.stopwords.words('english')
                    and len(word) >= 3
                ] for sentence in allText]
                df[colName] = preprocessedText

            def fillingNullValue(colName):
                '''
                Replace the NaN values in the given column.
                '''

                if args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MEAN:
                    ## replace the NaN by the mean
                    df[colName] = df[colName].astype('float')
                    df[colName].fillna(df[colName].mean(), inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MOST_COMMON:
                    ## replace the NaN by the most common value
                    df[colName] = df[colName].astype('category')
                    df[colName].fillna(
                        df[colName].astype('category').describe()['top'],
                        inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.CERTAIN_VALUE:
                    ## replace the NaN by a specified value
                    df[colName] = df[colName].astype('category')
                    df[colName] = df[colName].cat.add_categories(
                        [args.preprocessingStra[colName]['fillingNullValue']])
                    df[colName].fillna(
                        args.preprocessingStra[colName]['fillingNullValue'],
                        inplace=True)

            def TweetsWithUserInfoPreprocessing():
                '''
                Run every configured preprocessing step on each column.
                '''

                for colName in args.preprocessingStra.keys():
                    resultTextbox.insert(
                        "end",
                        ("Preprocessing feature: " + str(colName) + "\n"))
                    window.update_idletasks()
                    for step in args.preprocessingStra[colName]['steps']:
                        if step is not None:
                            step(colName)

            ###############  Preprocessing Strategy ###############

            args.preprocessingStra = defaultdict(dict)
            args.preprocessingStra['text']['steps'] = [
                preprocessingInputTextData
            ]
            args.preprocessingStra["numberOfHashtags_c"]['steps'] = [None]
            args.preprocessingStra['favorite_count']['steps'] = [None]
            args.preprocessingStra['retweet_count']['steps'] = [None]
            args.preprocessingStra['possibly_sensitive'] = {
                'fillingNullMethod': filling_method.CERTAIN_VALUE,
                'fillingNullValue': 'UNKNOWN',
                'steps': [fillingNullValue],
            }
            args.preprocessingStra['followers_count']['steps'] = [None]
            args.preprocessingStra['friends_count']['steps'] = [None]
            args.preprocessingStra['default_profile']['steps'] = [None]
            args.preprocessingStra['default_profile_image']['steps'] = [None]
            args.preprocessingStra['favourites_count']['steps'] = [None]
            args.preprocessingStra['listed_count']['steps'] = [None]
            args.preprocessingStra['statuses_count']['steps'] = [None]
            args.preprocessingStra['verified']['steps'] = [None]
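            # A 'steps' value of [None] means the column is used as-is; TweetsWithUserInfoPreprocessing skips None steps.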

            resultTextbox.insert("end", ('Preprocessing Strategy Set\n'))
            window.update_idletasks()

            #############################################################

            resultTextbox.insert("end", ('Start Preprocessing...\n'))
            window.update_idletasks()

            TweetsWithUserInfoPreprocessing()  # Apply inplace preprocessing

            # Get dummy variables
            df = pd.get_dummies(df,
                                drop_first=True,
                                columns=[
                                    'possibly_sensitive', 'default_profile',
                                    'default_profile_image', 'verified'
                                ])

            # Save the preprocessed-df
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx), "wb") as fp:
                pickle.dump(df, fp)
                resultTextbox.insert(
                    "end", ("The Pickle Data beforeMapToIdx Dumped to: " + str(
                        os.path.join(args.dataset,
                                     args.pickle_name_beforeMapToIdx)) + "\n"))
                window.update_idletasks()

        else:
            # If the pre-processed df exists, load it.
            print("Loading Existing BeforeMapToIdx file for Tweets and User: " +
                  str(os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)))
            resultTextbox.insert("end", (
                "Loading Existing BeforeMapToIdx file for Tweets and User: " +
                str(os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)) + "\n"))
            window.update_idletasks()
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx), "rb") as fp:
                df = pickle.load(fp)

        #################### After having the pre-processed df ####################

        resultTextbox.insert("end", ('Spliting Datasets...\n'))
        window.update_idletasks()

        ## Split the df into training, validation and test sets.

        if args.runningOnSmallDataset:
            # If the user wants to test the program on a small dataset,
            # do a preliminary split first: X_temp (deleted later) keeps 98% of the data, and the remaining 2% becomes the small dataset.

            # Fake split
            X_temp, X_train, Y_temp, Y_train = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=0.02,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            # Get the real training and test sets.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_train,
                Y_train,
                test_size=args.validation_portion,
                stratify=Y_train,
                random_state=args.random_seed)

            # delete X_temp and Y_temp
            del X_temp
            del Y_temp
        else:
            # If not running on the small dataset, do a normal data split.
            X_train, X_test, Y_train, Y_test = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)

        X_validation, X_test, Y_validation, Y_test = train_test_split(
            X_test,
            Y_test,
            test_size=args.test_portion,
            stratify=Y_test,
            random_state=args.random_seed)
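        # Note: test_size=args.test_portion here is a fraction of the held-out chunk (the X_test/Y_test from the split above), not of the full dataset.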

        ## Show the datasets' sizes
        resultTextbox.insert(
            "end",
            ("Dataset Size: " +
             str(len(X_train) + len(X_validation) + len(X_test)) + "\n"))
        resultTextbox.insert("end",
                             ("TrainingSet Size: " + str(len(X_train)) + "\n"))
        resultTextbox.insert(
            "end", ("ValidationSet Size: " + str(len(X_validation)) + "\n"))
        resultTextbox.insert("end",
                             ("TestSet Size: " + str(len(X_test)) + "\n"))
        window.update_idletasks()

        resultTextbox.insert("end", ('Creating Tweets_text...\n'))
        window.update_idletasks()

        ## Create an nltk.Text object, which will be used as the dictionary.
        tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))

        # If the hyper-parameter vocab_size is not set, default it to the full token count; then keep only the most frequent words (low term-frequency words are filtered out).
        args.vocab_size = args.vocab_size or len(tweets_text.tokens)
        if args.vocab_size:
            tweets_text.tokens = specialTokenList + \
                [w for w, _ in tweets_text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            tweets_text.tokens = specialTokenList + tweets_text.tokens
        args.vocab_size = len(tweets_text.tokens)  # update vocab_size to the actual vocabulary length

        ## Map the terms to indices for every dataset
        resultTextbox.insert("end", ('Mapping Word To Idx: training set\n'))
        window.update_idletasks()
        X_train['text'] = mapFromWordToIdx(X_train['text'], tweets_text)
        resultTextbox.insert("end", ('Maping Word To Idx: validation set\n'))
        window.update_idletasks()
        X_validation['text'] = mapFromWordToIdx(X_validation['text'],
                                                tweets_text)
        resultTextbox.insert("end", ('Maping Word To Idx: test set\n'))
        window.update_idletasks()
        X_test['text'] = mapFromWordToIdx(X_test['text'], tweets_text)

        resultTextbox.insert("end", ('Creating Torch Training Datasets...\n'))
        window.update_idletasks()

        # args.X_train = X_train
        # args.Y_train = Y_train

        # Create training, validation and test PyTorch datasets for feeding data into the PyTorch neural network
        # More details are in the utils.py CreateTweetsWithUserInfoDataset function
        training_dataset = CreateTweetsWithUserInfoDataset(
            X_train, list(map(int, list(Y_train))))

        resultTextbox.insert("end",
                             ('Creating Torch Validation Datasets...\n'))
        window.update_idletasks()
        validation_dataset = CreateTweetsWithUserInfoDataset(
            X_validation, list(map(int, list(Y_validation))))

        resultTextbox.insert("end", ('Creating Torch Test Datasets...\n'))
        window.update_idletasks()
        test_dataset = CreateTweetsWithUserInfoDataset(
            X_test, list(map(int, list(Y_test))))

        resultTextbox.insert("end", ('Dumping data...\n'))
        window.update_idletasks()

        # Dump the pre-processed datasets
        with open(os.path.join(args.dataset, args.pickle_name),
                  "wb") as fp:  # Pickling
            pickle.dump([
                training_dataset, validation_dataset, test_dataset, tweets_text
            ], fp)
            print("The Pickle Data Dumped to: ",
                  os.path.join(args.dataset, args.pickle_name))
            resultTextbox.insert(
                "end",
                ("The Pickle Data Dumped to: " +
                 str(os.path.join(args.dataset, args.pickle_name)) + "\n"))
            window.update_idletasks()

    else:

        # If the pre-processed datasets exist, load them.
        resultTextbox.insert(
            "end", ("Loading Existing File: " +
                    str(os.path.join(args.dataset, args.pickle_name)) + '\n'))
        window.update_idletasks()

        with open(os.path.join(args.dataset, args.pickle_name),
                  "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, tweets_text = pickle.load(
                fp)

    ## Some dataset hyper-parameters that will be used later
    args.vocab_size = len(tweets_text.tokens)
    args.num_extra_info = len(training_dataset[0][1])
    args.num_features = len(training_dataset[0][1]) + 1

    # Return the loaded or newly generated datasets.
    return training_dataset, validation_dataset, test_dataset, tweets_text
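The helper mapFromWordToIdx used above is defined elsewhere in the project (e.g. in utils.py) and is not shown here. A minimal sketch of what it might look like, assuming it maps each token to its position in tweets_text.tokens and falls back to a special unknown-token index:

def mapFromWordToIdx(textColumn, tweets_text, unk_token='<unk>'):
    # Hypothetical sketch only; '<unk>' and the fallback index are assumptions.
    word2idx = {word: idx for idx, word in enumerate(tweets_text.tokens)}
    unk_idx = word2idx.get(unk_token, 0)
    return [[word2idx.get(word, unk_idx) for word in sentence]
            for sentence in textColumn]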
Example No. 2
        # Split the text into words and pass to NLTK
        rawtext = "".join(progtext)

        tokens = rawtext.split()
        #for token in tokens:
        #    if token.lower() in exclusions:
        #        tokens.pop(tokens.index(token))
        #tokens = nltk.word_tokenize(rawtext)

        #tokens = nltk.regexp_tokenize(rawtext,pattern)
        newtokenlist = list()

        for token in tokens:
            newtokenlist.append(spellingFixer(token))

        nltktext = nltk.Text(newtokenlist)
        collocations = nltktext.collocations()
        word_fd = FreqDist(tokens)
        index = 0
        print "\nPopular words:"
        for entry in word_fd:
            if re.match("\W", entry) != None or entry.lower() in exclusions:
                index -= 1
                # Ignore this one, it's just symbols
            else:
                print entry
            index += 1
            if index == 10:
                break
        print "\nPopular topics:"
        bigram_fd = FreqDist(nltk.bigrams(tokens))
Example No. 3
stopwords = nltk.corpus.stopwords.words('english')

texts = [[word for word in abstract.lower().split() if word not in stopwords]
         for abstract in absCl]

phrases = Phrases(texts)
bigram = Phraser(phrases)
trigram = Phrases(bigram[texts])
trigram = Phraser(trigram)
texts = [trigram[bigram[text]] for text in texts]

y = []
i = 0
while i < len(texts):
    topics = TokenSearcher(nltk.Text(texts[i])).findall(
        r'<.*addict.*|opioid_use|.*dependence.*|.*abuse.*|.*abuse|.*alcoholi.*|.*inject_drugs|people_inject.*|drugs_people|.*sober.*|.*misuse.*|.*detox.*|.*heroin.*|hepatitis|.*illicit.*|.*overdose.*|drug_use|drug_use.*|substance_use|treatment_facility|recovering.*> <.*>'
    )
    if topics:
        y.append([texts[i], topics[:], 'Addiction/Abuse'])
    elif not topics:
        topics = TokenSearcher(nltk.Text(texts[i])).findall(
            r'<anesthe.*|.*anesthe.*|.*anesthe|icu|.*perioper.*|.*arthroplasti.*|.*postop.*|.*inpatient.*|.*outpatient.*|sevoflurane|midazolam|.*epidural.*|ropivacaine|.*cancer.*|.*surgic.*|.*surger.*|.*cesarean.*|.*caesarean.*|.*lymphoma.*|.*laparoscop.*|dexmedetomidin|.*sedat.*|.*operat.*|.*endoscop.*|.*radiolo.*|.*paracetamol.*> <.*>'
        )
        if topics:
            y.append([texts[i], topics[:], 'Medical Procedure'])
        elif not topics:
            topics = TokenSearcher(nltk.Text(texts[i])).findall(
                r'<.*pain.*|acetaminophen|.*analgesic.*|.*analgesi.*|ropivacain|.*antinocicept.*|.*nocicep.*|.*inflamm.*|.*epidural.*|.*formalin.*|.*fentanyl.*|oxycodone|remifentanil|.*hyperalgesia.*|nerve_block.*|gabapentin|kappa_opioid|pallative_care|.*paracetamol.*> <.*>'
            )
            if topics:
Example No. 4
reload(sys)
sys.setdefaultencoding('utf-8')

# Top n words (nouns) to look for
_toFind_ = 30

# Read the document
doc_ko = open('./k_tex2.txt').read()

# print(doc_ko)

# Extract nouns only
token_ko = tw.nouns(doc_ko)

# Wrap the tokens for use with nltk
res_ko = nltk.Text(token_ko, name=u'sutuk1')

print(len(res_ko.tokens))  # returns number of tokens (document length)
print(len(set(res_ko.tokens)))  # returns number of unique tokens
on_list = res_ko.vocab().most_common(_toFind_)

# on_list is a list; most_common returns a list of tuples, where each tuple holds a Unicode string as its first (0th) element and the occurrence count as its second (1st) element

# print(list(on_list[1])[0])  # test code

to_list = list()

# Convert each tuple in the list to a list and collect only its 0th element (the string) into a new list
for a in range(0, len(on_list)):
    to_list.append(list(on_list[a])[0])
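# The same extraction could be written more compactly as:
#   to_list = [word for word, count in on_list]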
Example No. 5
 def datacode(self):
     global image_File
     global stop_words
     filename = fname[0].split('/')
     f_stack = len(filename)
     taglist = filename[-1].split('_')
     searchtext = taglist[1]
     barValue = 0
     myWindow.pBar.setValue(barValue)
     myWindow.pBar.setMaximum(100)
     with open(filename[f_stack - 1], 'rt', encoding='utf-8') as rfile:
         with open('Processing_' + filename[f_stack - 1], 'w',
                   newline='') as wfile:
             cw = csv.writer(wfile)
             r = csv.reader(rfile)
             for row in r:
                 result_News = ''
                 check = 0
                 for c in row[3]:
                     if ord('[') == ord(c):
                         check = 1
                     elif ord(']') == ord(c):
                         check = 0
                         continue
                     if check == 0:
                         if ord('가') <= ord(c) <= ord('힣') or c.isdigit() or ord('A') <= ord(c) <= ord('z') \
                                 or ord(c) == ord(' '):
                             result_News += c
                         else:
                             result_News += ' '
                 cw.writerow([result_News])
     barValue = 10
     myWindow.pBar.setValue(barValue)
     image_File = 'Processing_' + filename[f_stack - 1] + '.png'
     with open('Processing_' + filename[f_stack - 1], 'r') as f:
         text = f.read()
     okt = Okt()
     barValue = 20
     myWindow.pBar.setValue(barValue)
     nouns = okt.nouns(text)
     barValue = 80
     myWindow.pBar.setValue(barValue)
     stop_words = [
         searchtext, '뉴스', '금지', '제공', '무단', '전재', '배포', '기자', '구독', '뉴시스',
         '연합뉴스', '사진', '저작권', '라며', '디스패치', '노컷뉴스', '네이버', '생방송투데이', '매일신문'
     ]
     nouns = [
         each_word for each_word in nouns
         if each_word not in stop_words and len(str(each_word)) > 1
     ]
     nns = nltk.Text(nouns, name='process_')
     wordChart = dict(nns.vocab().most_common(30))
     wordInfo = dict(nns.vocab().most_common(400))
     barValue = 85
     myWindow.pBar.setValue(barValue)
     MyWindow.showChart(wordChart, image_File)
     barValue = 90
     myWindow.pBar.setValue(barValue)
     MyWindow.saveWordCloud(wordInfo, image_File)
     barValue = 100
     myWindow.pBar.setValue(barValue)
     myWindow.Search.setEnabled(True)
     myWindow.Process.setEnabled(True)
Example No. 6
"""

import requests
import bs4
import names
import nltk

API_KEY = "7be7045a0a544e10916fcb867df3010d"

query = "Lava Jato"

url = ('https://newsapi.org/v2/everything?'
       'q={}&'
       'apiKey={}'.format(query, API_KEY))
response = requests.get(url)
txt = response.json()["articles"]
text = ""

for i in range(len(txt)):
    url_art = txt[i]["url"]
    html = requests.get(url_art).text
    soup = bs4.BeautifulSoup(html, "lxml")
    ps = soup.find_all("p")
    for p in ps:
        text = text + "\n" + p.get_text()

nomes = names.get_human_names(text)

print(nomes)
texto_anal = nltk.Text(text.split())
def main(argv, matches=2):

    fName = 'bbc/politics/' + str(argv)
    f = open(fName, 'r')
    raw_text = f.read()

    # Tokenize the words of the text
    tokenized_words = nltk.word_tokenize(raw_text)

    # Convert the tokenized words to lower case
    for i in range(len(tokenized_words)):
        tokenized_words[i] = tokenized_words[i].lower()

    # POS tag the words
    tagged_words = nltk.pos_tag(tokenized_words)

    # Extracting the tags of the text
    tags = set([tag for (word, tag) in tagged_words])
    word_tag_dict = {}
    tag_word_dict = {}

    for (word, tag) in tagged_words:
        if word in word_tag_dict.keys():
            word_tag_dict[word.lower()].append(tag)
        else:
            word_tag_dict[word.lower()] = [tag]

        if tag in tag_word_dict.keys():
            tag_word_dict[tag].append(word)
        else:
            tag_word_dict[tag] = [word]

    words = nltk.Text(tokenized_words)
    doc = nltk.ConcordanceIndex(words)

    stemmer = PorterStemmer()

    # # Call text Rank
    # sorted_text_rank = textRank(tokenized_words, tag_word_dict)
    # set1 = set([w.lower() for (w, val) in sorted_text_rank[:15]])
    # removeList = []
    # for w in set1:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set1:
    #         removeList.append(w)

    # for w in removeList:
    #     set1.remove(w)

    # sorted_text_rank = [(w, val) for (w, val) in sorted_text_rank[:15] if w not in removeList]

    # offset_dict_text_rank = {}
    # for words1 in set1:
    #     offset_dict_text_rank[words1] = doc.offsets(words1)

    # Call tf
    sorted_tfValues = tf(tokenized_words, word_tag_dict)
    set2 = set([w.lower() for (w, val) in sorted_tfValues[:15]])
    removeList = []
    for w in set2:
        if stemmer.stem(w) != w and stemmer.stem(w) in set2:
            removeList.append(w)

    for w in removeList:
        set2.remove(w)

    sorted_tfValues = [(w, val) for (w, val) in sorted_tfValues[:15]
                       if w not in removeList]

    offset_dict_tf = {}
    for words2 in set2:
        offset_dict_tf[words2] = doc.offsets(words2)

    # # Call tf-idf
    # sorted_tf_idf = tfIdf (raw_text, word_tag_dict)
    # set3 = set([w for (w, val) in sorted_tf_idf[:15]])
    # removeList = []
    # for w in set3:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set3:
    #         removeList.append(w)

    # for w in removeList:
    #     set3.remove(w)

    # sorted_tf_idf = [(w, val) for (w, val) in sorted_tf_idf[:15] if w not in removeList]

    # offset_dict_tf_idf = {}
    # for words3 in set3:
    #     offset_dict_tf_idf[words3] = doc.offsets(words3)
    """ Printing the resuts"""
    # print (raw_text)

    # print ("\n\nText Rank of the document:")
    # printResult (sorted_text_rank, word_tag_dict, offset_dict_text_rank)
    # printTable (sorted_text_rank, offset_dict_text_rank)
    # printMatrix (offset_dict_text_rank)

    print("\n\nTf Scores of the document:\n")
    printResult(sorted_tfValues, word_tag_dict, offset_dict_tf)
    out_list, tid_word_dict = printTable(sorted_tfValues, offset_dict_tf)
    words_list = printMatrix(offset_dict_tf)
    print_top_sentence(raw_text, sorted_tfValues, matches, out_list,
                       tid_word_dict, words_list)

    print_sentences(raw_text, sorted_tfValues, tid_word_dict, words_list)
Example No. 8
        if (minimum[word] > count): minimum[word] = count
    counts.append(len(gutenberg.words(fileid)))
    output.append(counts)
print tabulate(output, headers="firstrow")

difference = []
for word in wordList:
    difference.append([word, maximum[word] - minimum[word]])
print tabulate(difference)

# could and will
#'will': 0.004993612820810591		shakespeare-caesar.txt
#'will': 0.0003591094086665071		blake-poems.txt

# someone's Will is mentioned
t = nltk.Text(nltk.corpus.gutenberg.words('shakespeare-caesar.txt'))
t.concordance("will")

t = nltk.Text(nltk.corpus.gutenberg.words('blake-poems.txt'))
t.concordance("will")

#'could': 0.004522720559024559		austen-persuasion.txt
#'could': 0.00016326062134024106		bible-kjv.txt

# bible mostly only uses for 'could not'
t = nltk.Text(nltk.corpus.gutenberg.words('austen-persuasion.txt'))
t.concordance("could")

t = nltk.Text(nltk.corpus.gutenberg.words('bible-kjv.txt'))
t.concordance("could")
# Anzu (杏)
URL = 'http://dic.nicovideo.jp/a/%E5%8F%8C%E8%91%89%E6%9D%8F'  # address
user_agent = "anzu"
overloadText = ""
response = rq.get(URL, headers={'User-Agent': user_agent})
soup = BeautifulSoup(response.text,
                     'html.parser')  # Parse the full HTML so specific parts can be extracted
anzuArticle = soup.findAll('p')  # Collect every <p> element

for index in anzuArticle:
    overloadText = overloadText + index.text

tokens_jp = mecab_analysis(
    overloadText)  # Split into words with the konlpy/mecab analyzer; only adjectives (形容詞), verbs (動詞), nouns (名詞) and adverbs (副詞) are needed
jp = nltk.Text(tokens_jp, name='杏')  # Build the tokens into an nltk.Text to remove duplicates
data = jp.vocab().most_common(500)  # Convert to a sorted list of (word, count) tuples (up to 500)
tmp_data = dict(data)  # Convert the data to a dict

stop_words = [
    'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ', 'さん', 'して', 'くれる', 'やる',
    'くださる', 'そう', 'せる', 'した', '思う', 'それ', 'ここ', 'ちゃん', 'くん', '', 'て', 'に', 'を',
    'は', 'の', 'が', 'と', 'た', 'し', 'で', 'ない', 'も', 'な', 'い', 'か', 'ので', 'よう',
    '[', ']', '/'
]
# From Python 3.0 on, the u prefix is no longer needed for Unicode literals (strings are Unicode by default)

anzu_coloring = np.array(Image.open("WordCloudData/anzu.jpg"))

wc = WordCloud(font_path="/Library/Fonts/Hannari.otf",
               max_words=2000,
Example No. 10
Containing the sequence of letters pt
Having all lowercase letters except for an initial capital (i.e., titlecase)
'''
print(
    sorted(w for w in set(text6)
           if w.endswith('ise') and 'z' in w and 'pt' in w and w.istitle()))
'''
CHAPTER 2
'''

import nltk
print(nltk.corpus.gutenberg.fileids())
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print('Number of words in the text Emma', len(emma))

emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
print('Concordance of surprize', emma.concordance("surprize"))

print(
    'average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score).'
)
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars / num_words), round(num_words / num_sents),
          round(num_words / num_vocab), fileid)

from nltk.corpus import webtext
for fileid in webtext.fileids():
Example No. 11
        nouns = [word for (word, pos) in nltk.pos_tag(text2) if is_noun(pos)]
        V = set(nouns)
        long_words1 = [w for w in tokens if 4 < len(w) < 10]
        fdist01 = nltk.FreqDist(long_words1)
        a1 = fdist01.most_common(40)

        def lexical_diversity(text):
            return len(set(text)) / len(text)

        vocab = set(text)
        vocab_size = len(vocab)

        V = set(text)
        long_words = [w for w in tokens if 4 < len(w) < 13]

        text2 = nltk.Text(word.lower() for word in long_words)

        fdist1 = nltk.FreqDist(long_words)
        a = fdist1.most_common(15)

        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        import matplotlib.pyplot as plt
        from gensim import corpora
        from string import punctuation

        def strip_punctuation(s):
            return ''.join(c for c in s if c not in punctuation)

        documents = [
Example No. 12
import json
import sys
import nltk
import datetime
import numpy as np
from keras import models, layers, optimizers, losses, metrics

target = "json_files/ratings_train_dset.json"
# Loading json file
with open(target) as f:
    train_target = json.load(f)

# Only tokens
train_tokens = [tok for d in train_target for tok in d[0]]
train_text = nltk.Text(train_tokens, name="train_t")

# Build selected_word, which is used to construct the term-frequency vectors
common_tests = train_text.vocab().most_common(500)
selected_word = [word[0] for word in common_tests]


def term_frequency(doc):
    return [doc.count(s_word) for s_word in selected_word]


# Vectorization
# x is the input dataset (term-frequency vectors)
train_x = [term_frequency(d) for d, _ in train_target]
# y is the output dataset (labels, 0 or 1)
train_y = [c for _, c in train_target]
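The snippet ends with plain Python lists; a likely next step before feeding a Keras model (not shown in the original, and the x_train/y_train names are assumptions) is converting them to float arrays:

x_train = np.asarray(train_x).astype('float32')
y_train = np.asarray(train_y).astype('float32')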
Example No. 13
def user_review(request, reviewee_id):
    if request.method not in ['GET', 'POST']:
        return HttpResponseNotAllowed(['GET', 'POST'])

    if request.user.is_authenticated:
        if request.method == 'GET':
            reviewee_user = get_object_or_404(User, pk=reviewee_id)
            review_list = Review.objects.filter(reviewee=reviewee_user)

            json_review_list = []
            for review in review_list:
                json_review_list.append({'rating': review.rating,
                                         'content': review.content})
            random.shuffle(json_review_list)
            return JsonResponse(
                json_review_list,
                status=200,
                safe=False
            )


        # when request.method == 'POST':
        reviewee_user = get_object_or_404(User, pk=reviewee_id)

        try:
            req_data = json.loads(request.body.decode())
            content = req_data['content']
        except (KeyError, TypeError, ValueError, JSONDecodeError):
            return HttpResponse(status=400)

        # sentiment analysis
        model_path = os.path.join(os.getcwd(), 'review', 'nsmc', 'sentiment_model.h5')
        model = tf.keras.models.load_model(model_path)

        docs_path = os.path.join(os.getcwd(), 'review', 'nsmc', 'train_docs.json')
        with open(docs_path) as file:
            train_docs = json.load(file)

        tokens = [t for d in train_docs for t in d[0]]
        text = nltk.Text(tokens, name='NMSC')
        okt = Okt()
        selected_words = [f[0] for f in text.vocab().most_common(10000)]
        rating = sent.predict_score(model, okt, selected_words, content)

        new_review = Review(reviewee=reviewee_user,
                            rating=rating,
                            content=content)
        new_review.save()

        # Apply new rating to corresponding user
        review_list_len = Review.objects.filter(reviewee=reviewee_user).count()
        reviewee_user_profile = Profile.objects.get(user=reviewee_user)

        prev_rating = reviewee_user_profile.rating
        new_rating = (prev_rating * review_list_len + rating) / (review_list_len + 1)
        reviewee_user_profile.rating = round(new_rating, 2)
        reviewee_user_profile.save()

        return JsonResponse({'id': new_review.id,
                             'rating': rating,
                             'content': content},
                            status=201)

    # If user is not logged in
    return HttpResponse(status=401)
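sent.predict_score is imported from elsewhere in the project and its body is not shown above. A hedged sketch of what such a helper usually looks like for NSMC-style Keras sentiment models (the tokenisation, names, and output scale below are assumptions, not the project's confirmed code):

import numpy as np

def predict_score(model, okt, selected_words, content):
    # Tokenise the review the same way the training docs were tokenised (assumed format: 'word/POS').
    tokens = ['/'.join(t) for t in okt.pos(content, norm=True, stem=True)]
    # Term-frequency vector against the selected vocabulary.
    freqs = [tokens.count(w) for w in selected_words]
    data = np.expand_dims(np.asarray(freqs, dtype='float32'), axis=0)
    # Assumed single sigmoid output in [0, 1]; the real project may rescale this.
    return float(model.predict(data)[0][0])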
Example No. 14
                print(c)
num(6, 10)
num(20, 4)
        
#nltk
import nltk
from nltk import word_tokenize
from nltk.book import *
text1.count("give")
my_pers=text4.count("love")/len(text4)
pers=print(round(my_pers*100, 2), 'percent')

#processing my own texts on nltk
prose=open('F:\OneDriveGSU\OneDrive - Georgia State University\Python_practice\Julian.txt', 'r').read() #open and read the txt file
my_prose=word_tokenize(prose) #tokenize the open and read file
my_prose1=nltk.Text(my_prose) #convert the tokenized text into an nltk text
my_prose1.concordance("I") #start processign the text
dectic=['there', 'here', 'then', 'that'] #specified the things I want to find out
my_prose2=prose.split()#turning the text into a list
for word in my_prose2: #for every item in the list
    if word in dectic: #if it's in dectic
        myNum=my_prose2.count(word) #count their number and assign it to a variable
print(myNum) #print it

#from nltk: #doesn't  work
def word_vowels(words):
    words1=words.split()#turning the  input into a list
    count=0
    my_lis=['a', 'e', 'i', 'o', 'u']#specifying the list of vowels
    for word in words1:  #for each word in the list
        for i in my_lis: #for each item in the vowel list
Example No. 15
 def nltk(self):
     import nltk
     return nltk.Text(self.tokens)
Example No. 16
 def kwic(self, word):
     txt = nltk.Text(self.tokens)
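     # Note: nltk.Text.concordance prints its matches to stdout and returns None,
     # so this method effectively returns None to its caller.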
     return txt.concordance(word)
Example No. 17
def get_restrictive_intensifier():
    # 1. Build separate sets of nouns and adjectives that carry an intensity concept from the reference corpus (Gutenberg)
    noun_set = []
    adj_set = []
    intensity_keyword = ['degree', 'intensity']

    # Check whether data is produced already or not. Load the data if data is produced.
    if os.path.isfile('noun_set.txt') and os.path.isfile('adj_set.txt'):
        with open('noun_set.txt', 'rb') as f:
            noun_set = pickle.load(f)
        with open('adj_set.txt', 'rb') as f:
            adj_set = pickle.load(f)
        print("noun_set and adj_set data exists. Loaded data\n")

    # If data is not produced yet, start the process. Make noun_set and adj_set
    else:
        total_token = []

        # Get tokenized words from gutenberg corpus
        for each_fileid in gutenberg.fileids():
            each_tokenized_text = set(gutenberg.words(each_fileid)) # eliminate duplicated words
            for each_token in each_tokenized_text:
                total_token.append(each_token.lower()) # make all characters to lower case

        total_token = set(total_token)
        print('total words number for reference is %d' % len(total_token)) # for debugging

        total_token_with_tags = pos_tag(total_token) # tag part-of-speech to words
        for each_word in total_token_with_tags:
            # Select nouns among words
            if each_word[0].isalpha() and each_word[1] == "NN":
                found_keyword = False

                for synset in wn.synsets(each_word[0]):
                    for each_keyword in intensity_keyword:
                        if each_keyword in synset.definition():
                            found_keyword = True
                            break

                    if found_keyword:
                        break

                if found_keyword:
                    noun_set.append(each_word[0])

            # Select adjectives among words
            if each_word[0].isalpha() and each_word[1] == "JJ":
                found_keyword = False

                for synset in wn.synsets(each_word[0]):
                    for each_keyword in intensity_keyword:
                        if each_keyword in synset.definition():
                            found_keyword = True
                            break

                    if found_keyword:
                        break

                if found_keyword:
                    adj_set.append(each_word[0])

        # Save the produced data into file
        with open('noun_set.txt', 'wb') as f:
            pickle.dump(noun_set, f)
        with open('adj_set.txt', 'wb') as f:
            pickle.dump(adj_set, f)
        print('Saved noun_set and adj_set by pickle\n')

    print('nouns number is %d, adjective number is %d, total nouns and adjectives number is %d\n' % (len(noun_set), len(adj_set), len(noun_set) + len(adj_set)))
    print('First step is finished. Nouns & adjectives were selected.\n')  # for debugging

    # 2. Make noun_pair_set. Find intensifier from the noun
    intensifier_keyword = ['intensifier']
    gutenberg_fdists = dict()
    noun_pair_set = dict()
    total_intensifier_set = dict()

    # Check whether data is produced already or not. Load the data if data is produced.
    if os.path.isfile('gutenberg_fdists.txt') and os.path.isfile('noun_pair_set.txt') and os.path.isfile('total_intensifier_set_nouns.txt'):
        with open('gutenberg_fdists.txt', 'rb') as f:
            gutenberg_fdists = pickle.load(f)
        with open('noun_pair_set.txt', 'rb') as f:
            noun_pair_set = pickle.load(f)
        with open('total_intensifier_set_nouns.txt', 'rb') as f:
            total_intensifier_set = pickle.load(f)
        print("gutenberg_fdists and noun_pair_set, total_intensifier_set data exists. Loaded data\n")

    # If data is not produced yet, start the process. Make noun_pair_set
    else:
        # Create fdist for all of gutenberg corpus
        for each_fileid in gutenberg.fileids():
            each_tokenized_corpus_text = nltk.Text(gutenberg.words(each_fileid))
            gutenberg_fdists[each_fileid] = FreqDist(each_tokenized_corpus_text)

        count = 0
        for each_noun in noun_set:
            # Search in gutenberg corpus
            for each_fileid in gutenberg.fileids():
                each_tokenized_corpus = gutenberg.words(each_fileid)
                for each_index, each_token in enumerate(each_tokenized_corpus):
                    # Find the corresponding noun in corpus
                    if each_token.lower() == each_noun:
                        # Check previous word of given noun
                        if each_index != 0 and each_tokenized_corpus[each_index - 1].isalpha():
                            intensifier = each_tokenized_corpus[each_index - 1].lower()
                            # Find WordNet synsets and check whether any intensifier_keyword appears in the synset definition
                            for synset in wn.synsets(intensifier):
                                found_intensifier = False

                                for each_keyword in intensifier_keyword:
                                    if each_keyword in synset.definition():
                                        # Found intensifier. Keyword is included in the synset definition

                                        # If there is already body word in noun_pair_set, but not found intensifier yet, then add it to existing body word element
                                        if each_noun in noun_pair_set:
                                            if intensifier not in noun_pair_set[each_noun][1]:
                                                # Calculate the frequency of "intensifier body_word"
                                                bigram_frequency = find_frequency(intensifier, each_noun)

                                                # Add it to the noun_pair_set
                                                noun_pair_set[each_noun][1][intensifier] = [bigram_frequency, 0]

                                                # Add it to the total_intensifier_set to calculate frequency for later process
                                                if intensifier in total_intensifier_set:
                                                    if each_noun not in total_intensifier_set[intensifier]:
                                                        total_intensifier_set[intensifier][each_noun] = bigram_frequency

                                                else:
                                                    total_intensifier_set[intensifier] = {each_noun: bigram_frequency}

                                        # If there is no body word in noun_pair_set, then add the body word, intensifier pair into noun_pair_set
                                        else:
                                            # Calculate the frequency of "intensifier body_word"
                                            bigram_frequency = find_frequency(intensifier, each_noun)

                                            # Calculate the frequency of body_word alone
                                            bodyword_frequency = 0
                                            for gutenberg_each_fileid in gutenberg.fileids():
                                                bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(each_noun)

                                            # Add it to the noun_pair_set
                                            noun_pair_set[each_noun] = [bodyword_frequency, {intensifier: [bigram_frequency, 0]}]

                                            # Add it to the total_intensifier_set to calculate frequency for later process
                                            if intensifier in total_intensifier_set:
                                                if each_noun not in total_intensifier_set[intensifier]:
                                                    total_intensifier_set[intensifier][each_noun] = bigram_frequency

                                            else:
                                                total_intensifier_set[intensifier] = {each_noun: bigram_frequency}

                                        # look for intensifier's lemma's example
                                        for each_example in synset.examples():
                                            tokenized_example = word_tokenize(each_example)
                                            if intensifier in tokenized_example:
                                                next_word_idx = tokenized_example.index(intensifier) + 1
                                                if next_word_idx < len(tokenized_example):
                                                    next_word = tokenized_example[next_word_idx].lower()
                                                    if next_word.isalpha():
                                                        # If there is already body word in noun_pair_set, but not found intensifier yet, then add it to existing body word element
                                                        if next_word in noun_pair_set:
                                                            if intensifier not in noun_pair_set[next_word][1]:
                                                                # Calculate the frequency of "intensifier body_word"
                                                                bigram_frequency = find_frequency(intensifier, next_word)

                                                                # Add it to the noun_pair_set. Add default score 15 since it is found for synset's example
                                                                noun_pair_set[next_word][1][intensifier] = [bigram_frequency, 15]

                                                                # Add it to the total_intensifier_set to calculate frequency for later process
                                                                if intensifier in total_intensifier_set:
                                                                    if next_word not in total_intensifier_set[intensifier]:
                                                                        total_intensifier_set[intensifier][next_word] = bigram_frequency

                                                                else:
                                                                    total_intensifier_set[intensifier] = {next_word: bigram_frequency}

                                                        # If there is no body word in noun_pair_set, then add body word, intensifiier pair into noun_pair_set
                                                        else:
                                                            # Calculate the frequency of "intensifier body_word"
                                                            bigram_frequency = find_frequency(intensifier, next_word)

                                                            # Calculate the frequency of body_word alone
                                                            bodyword_frequency = 0
                                                            for gutenberg_each_fileid in gutenberg.fileids():
                                                                bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(next_word)

                                                            # Add it to the noun_pair_set. Add default score 15 since it is found for synset's example
                                                            noun_pair_set[next_word] = [bodyword_frequency, {intensifier: [bigram_frequency, 15]}]

                                                            # Add it to the total_intensifier_set to calculate frequency for later process
                                                            if intensifier in total_intensifier_set:
                                                                if next_word not in total_intensifier_set[intensifier]:
                                                                    total_intensifier_set[intensifier][next_word] = bigram_frequency

                                                            else:
                                                                total_intensifier_set[intensifier] = {next_word: bigram_frequency}

                                        found_intensifier = True
                                        break

                                if found_intensifier:
                                    break
            count+=1
            print(str(count) + "\n") # for debugging

        # Save the produced data into file
        with open('gutenberg_fdists.txt', 'wb') as f:
            pickle.dump(gutenberg_fdists, f)
        with open('noun_pair_set.txt', 'wb') as f:
            pickle.dump(noun_pair_set, f)
        with open('total_intensifier_set_nouns.txt', 'wb') as f:
            pickle.dump(total_intensifier_set, f)
        print('Saved gutenberg_fdists, total_intensifier_set_nouns and noun_pair_set by pickle\n')

    print('Second step is finished. noun_pair_set (dictionary) is created and its size is %d \n' %(len(noun_pair_set)))  # for debugging

    # 3. Make adj_pair_set. Find intensifier from the adjective
    adj_pair_set = dict()

    # Check whether data is produced already or not. Load the data if data is produced.
    if os.path.isfile('adj_pair_set.txt') and os.path.isfile('total_intensifier_set_nouns_adjs.txt'):
        with open('adj_pair_set.txt', 'rb') as f:
            adj_pair_set = pickle.load(f)
        with open('total_intensifier_set_nouns_adjs.txt', 'rb') as f:
            total_intensifier_set = pickle.load(f)
        print("adj_pair_set and total_intensifier_set_nouns_adjs data exists. Loaded data\n")

    # If data is not produced yet, start the process. Make adj_pair_set
    else:
        count = 0
        for each_adj in adj_set:
            # Search in gutenberg corpus
            for each_fileid in gutenberg.fileids():
                each_tokenized_corpus = gutenberg.words(each_fileid)
                for each_index,each_token in enumerate(each_tokenized_corpus):
                    # Find the corresponding adjective in corpus
                    if each_token.lower() == each_adj:
                        # Check the previous word of the given adjective
                        if each_index != 0 and each_tokenized_corpus[each_index-1].isalpha():
                            intensifier = each_tokenized_corpus[each_index - 1].lower()
                            # Find WordNet synsets and check whether any intensifier_keyword appears in the synset definition
                            for synset in wn.synsets(intensifier):
                                found_intensifier = False

                                for each_keyword in intensifier_keyword:
                                    if each_keyword in synset.definition():
                                        # Found intensifier. Keyword is included in the synset definition

                                        # If there is already body word in adj_pair_set, but not found intensifier yet, then add it to existing body word element
                                        if each_adj in adj_pair_set:
                                            if intensifier not in adj_pair_set[each_adj][1]:
                                                # Calculate the frequency of "intensifier body_word"
                                                bigram_frequency = find_frequency(intensifier, each_adj)

                                                # Add it to the adj_pair_set
                                                adj_pair_set[each_adj][1][intensifier] = [bigram_frequency, 0]

                                                # Add it to the total_intensifier_set to calculate frequency for later process
                                                if intensifier in total_intensifier_set:
                                                    if each_adj not in total_intensifier_set[intensifier]:
                                                        total_intensifier_set[intensifier][each_adj] = bigram_frequency

                                                else:
                                                    total_intensifier_set[intensifier] = {each_adj: bigram_frequency}

                                        # If there is no body word in adj_pair_set, then add the body word, intensifier pair into adj_pair_set
                                        else:
                                            # Calculate the frequency of "intensifier body_word"
                                            bigram_frequency = find_frequency(intensifier, each_adj)

                                            # Calculate the frequency of body_word alone
                                            bodyword_frequency = 0
                                            for gutenberg_each_fileid in gutenberg.fileids():
                                                bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(each_adj)

                                            # Add it to the adj_pair_set
                                            adj_pair_set[each_adj] = [bodyword_frequency, {intensifier: [bigram_frequency, 0]}]

                                            # Add it to the total_intensifier_set to calculate frequency for later process
                                            if intensifier in total_intensifier_set:
                                                if each_adj not in total_intensifier_set[intensifier]:
                                                    total_intensifier_set[intensifier][each_adj] = bigram_frequency

                                            else:
                                                total_intensifier_set[intensifier] = {each_adj: bigram_frequency}

                                        # look for intensifier's lemma's example
                                        for each_example in synset.examples():
                                            tokenized_example = word_tokenize(each_example)
                                            if intensifier in tokenized_example:
                                                next_word_idx = tokenized_example.index(intensifier) + 1
                                                if next_word_idx < len(tokenized_example):
                                                    next_word = tokenized_example[next_word_idx].lower()
                                                    if next_word.isalpha():
                                                        # If there is already body word in adj_pair_set, but not found intensifier yet, then add it to existing body word element
                                                        if next_word in adj_pair_set:
                                                            if intensifier not in adj_pair_set[next_word][1]:
                                                                # Calculate the frequency of "intensifier body_word"
                                                                bigram_frequency = find_frequency(intensifier, next_word)

                                                                # Add it to the adj_pair_set. Add default score 15 since it is found for synset's example
                                                                adj_pair_set[next_word][1][intensifier] = [bigram_frequency, 15]

                                                                # Add it to the total_intensifier_set to calculate frequency for later process
                                                                if intensifier in total_intensifier_set:
                                                                    if next_word not in total_intensifier_set[intensifier]:
                                                                        total_intensifier_set[intensifier][next_word] = bigram_frequency

                                                                else:
                                                                    total_intensifier_set[intensifier] = {next_word: bigram_frequency}

                                                        # If there is no body word in adj_pair_set, then add body word, intensifiier pair into adj_pair_set
                                                        else:
                                                            # Calculate the frequency of "intensifier body_word"
                                                            bigram_frequency = find_frequency(intensifier, next_word)

                                                            # Calculate the frequency of body_word alone
                                                            bodyword_frequency = 0
                                                            for gutenberg_each_fileid in gutenberg.fileids():
                                                                bodyword_frequency += gutenberg_fdists[gutenberg_each_fileid].freq(next_word)

                                                            # Add it to the adj_pair_set. Add default score 15 since it is found for synset's example
                                                            adj_pair_set[next_word] = [bodyword_frequency, {intensifier: [bigram_frequency, 15]}]

                                                            # Add it to the total_intensifier_set to calculate frequency for later process
                                                            if intensifier in total_intensifier_set:
                                                                if next_word not in total_intensifier_set[intensifier]:
                                                                    total_intensifier_set[intensifier][next_word] = bigram_frequency

                                                            else:
                                                                total_intensifier_set[intensifier] = {next_word: bigram_frequency}

                                        found_intensifier = True
                                        break

                                if found_intensifier:
                                    break
            count+=1
            print(str(count) + "\n") # for debugging

        # Save the produced data into file
        with open('adj_pair_set.txt', 'wb') as f:
            pickle.dump(adj_pair_set, f)
        with open('total_intensifier_set_nouns_adjs.txt', 'wb') as f:
            pickle.dump(total_intensifier_set, f)
        print('Saved adj_pair_set and total_intensifier_set_nouns_adjs by pickle\n')

    print('Third step is finished. adj_pair_set (dictionary) is created and its size is %d \n' % (len(adj_pair_set)))  # for debugging

    # 4. Calculate the score and combine noun_pair_set and adj_pair_set into total_pair_set
    total_pair_set = []

    # Check whether data is produced already or not. Load the data if data is produced.
    if os.path.isfile('adj_scored_pair_set.txt') and os.path.isfile('noun_scored_pair_set.txt') and os.path.isfile('total_pair_set.txt'):
        with open('adj_scored_pair_set.txt', 'rb') as f:
            adj_pair_set = pickle.load(f)
        with open('noun_scored_pair_set.txt', 'rb') as f:
            noun_pair_set = pickle.load(f)
        with open('total_pair_set.txt', 'rb') as f:
            total_pair_set = pickle.load(f)
        print("adj_scored_pair_set, noun_scored_pair_set and total_pair_set data exist. Loaded data\n")

    # If data is not produced yet, start the process. Calculate the score
    else:
        # Calculate score for nouns
        count = 0
        for each_body_word, body_word_data in noun_pair_set.items():
            body_word_frequency = body_word_data[0]
            for each_intensifier, intensifier_data in body_word_data[1].items():
                bigram_frequency = intensifier_data[0]

                # Average this intensifier's frequency across the other body words
                frequency_sum = 0
                frequency_count = 0
                for each_body_word_2, each_body_word_frequency_2 in total_intensifier_set[each_intensifier].items():
                    if each_body_word_2 != each_body_word:
                        frequency_count += 1
                        frequency_sum += each_body_word_frequency_2
                frequency_avg = 0
                if frequency_count != 0:
                    frequency_avg = frequency_sum / frequency_count

                # Calculate the score and save it
                score = calculate_score(body_word_frequency, bigram_frequency, frequency_avg)
                intensifier_data[1] += score

                # Add data into total_pair_set
                total_pair_set.append((each_intensifier, each_body_word, intensifier_data[1]))

            count += 1
            print(str(count) + "\n")  # for debugging

        print('Noun score calculation finished\n')  # for debugging

        # Calculate score for adjectives
        count = 0
        for each_body_word, body_word_data in adj_pair_set.items():
            body_word_frequency = body_word_data[0]
            for each_intensifier, intensifier_data in body_word_data[1].items():
                bigram_frequency = intensifier_data[0]

                # Average this intensifier's frequency across the other body words
                frequency_sum = 0
                frequency_count = 0
                for each_body_word_2, each_body_word_frequency_2 in total_intensifier_set[each_intensifier].items():
                    if each_body_word_2 != each_body_word:
                        frequency_count += 1
                        frequency_sum += each_body_word_frequency_2
                frequency_avg = 0
                if frequency_count != 0:
                    frequency_avg = frequency_sum / frequency_count

                # Calculate the score and save it
                score = calculate_score(body_word_frequency, bigram_frequency, frequency_avg)
                intensifier_data[1] += score

                # Add data into total_pair_set
                total_pair_set.append((each_intensifier, each_body_word, intensifier_data[1]))

            count += 1
            print(str(count) + "\n")  # for debugging

        print('Adjective score calculation finished\n')  # for debugging

        # Sort the total_pair_set by score
        total_pair_set = set(total_pair_set)
        total_pair_set = sorted(total_pair_set, key=lambda each_pair: each_pair[2], reverse=True)
        print('Total pair set sorting finish\n') # for debugging

        # Save the produced data into file
        with open('adj_scored_pair_set.txt', 'wb') as f:
            pickle.dump(adj_pair_set, f)
        with open('noun_scored_pair_set.txt', 'wb') as f:
            pickle.dump(noun_pair_set, f)
        with open('total_pair_set.txt', 'wb') as f:
            pickle.dump(total_pair_set, f)
        print('Saved adj, nouns scored sets and total_pair_set by pickle\n')

    print('Fourth step is finished. adj_pair_set and noun_pair_set are scored, and total_pair_set is created and sorted \n')  # for debugging

    # 5. Write the result to a CSV file
    hundred_selected_result = total_pair_set[:100]
    with open('result.csv', 'w') as f:  # create "result.csv"
        for each_pair in hundred_selected_result:
            f.write(each_pair[0] + ',' + each_pair[1] + '\n')
    print('Fifth step is finished. Selected the 100 most unique intensifier-body word pairs and wrote them to a CSV file \n')  # for debugging
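

# A minimal refactoring sketch (not part of the original script): the noun and
# adjective scoring loops above are identical, so one helper could serve both.
# calculate_score and the pair-set structures are the ones already used above.
def score_pair_set(pair_set, total_intensifier_set, total_pair_set):
    for body_word, (body_word_frequency, intensifiers) in pair_set.items():
        for intensifier, intensifier_data in intensifiers.items():
            bigram_frequency = intensifier_data[0]
            other_freqs = [freq for word, freq in total_intensifier_set[intensifier].items()
                           if word != body_word]
            frequency_avg = sum(other_freqs) / len(other_freqs) if other_freqs else 0
            intensifier_data[1] += calculate_score(body_word_frequency,
                                                   bigram_frequency, frequency_avg)
            total_pair_set.append((intensifier, body_word, intensifier_data[1]))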
Example #18
def tokenize(self):
    tokens = nltk.word_tokenize(self._text)
    self._nltk_text = nltk.Text(tokens)
import nltk
from soynlp.tokenizer import LTokenizer

# word_extractor (a trained soynlp WordExtractor), x_test and train_list are
# assumed to be defined earlier in the original example.
test_words = word_extractor.extract()
test_score = {
    word: score.cohesion_forward
    for word, score in test_words.items()
}
tokenizer = LTokenizer(scores=test_score)
test_list = []
cnt = 0
for sent in x_test:
    test_list.append([tokenizer.tokenize(sent)])
    cnt += 1

train_tokens = [token for data in train_list for token in data[0]]
test_tokens = [token for data in test_list for token in data[0]]

train_text = nltk.Text(train_tokens)
test_text = nltk.Text(test_tokens)

print('=====================selecting token======================')  # this step can take quite a long time
selected_tokens_1 = [t[0] for t in train_text.vocab().most_common(500)
                     ]  # select the 500 most frequent tokens
selected_tokens_2 = [t[0] for t in test_text.vocab().most_common(500)]


# Vectorization -> BOW (Bag of Words)
def term_frequency1(data):
    return [data.count(word) for word in selected_tokens_1]


def term_frequency2(data):
    return [data.count(word) for word in selected_tokens_2]
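

# A minimal usage sketch (not part of the original snippet): turning the token
# lists built above into bag-of-words count vectors with the two helpers.
train_x = [term_frequency1(data[0]) for data in train_list]
test_x = [term_frequency2(data[0]) for data in test_list]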
Example #20
#Tarot NLP Basic stats 
#req python3
import nltk
data_dir = "data"

rw_cards = [
		"fool", "magician", "high priestess", "empress", "emperor", "hierophant","lovers",
		"chariot", "strength","hermit", "wheel of fortune", "justice", "hanged man", "death", 
		"temperance", "devil", "tower", "star", "moon", "sun", "judgement", "world"]

all_files = ["".join(open("{}/{}_clean.txt".format(data_dir, name.replace(" ","_")),"r").readlines()) for name in rw_cards]

all_files_concat = "".join(all_files)
master_vocab = sorted(
	set(
		[word.lower() for word in nltk.Text(nltk.word_tokenize(all_files_concat))]
		)
	)

def process_raw(raw):
	tokens = nltk.word_tokenize("".join(raw))
	txt = nltk.Text(tokens)
	words = [word.lower() for word in txt]
	fdist = nltk.FreqDist(words)
	return fdist

#Generate distribution for full corpus text. 
all_lines = "".join(all_files)

stats = {}
names_copy = list(rw_cards)
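
# The snippet ends before stats is filled; a minimal sketch of the likely next
# step (an assumption, not part of the original): one frequency distribution per card.
for name, raw in zip(names_copy, all_files):
    stats[name] = process_raw(raw)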
Example #21
             "     Documentos Relevantes: " + str(len(datos_binario[1])) +
             "/" + str(len(qrels[1])),
             font=dict(family='Arial', size=12, color='rgb(50,50,50)'),
             showarrow=False))

    layout['annotations'] = annotations
    fig = dict(data=data, layout=layout)
    name = name + ".html"
    plot(fig, filename=name)


#------------------------------------------------------------------

doc = nltk.Text(
    nltk.regexp_tokenize(
        open(
            "/Users/gabriel/Documents/INAOE/2do Cuatrimestre/Recuperacion/corpus/cacm/cacm.all",
            "r").read(), "[A-Za-z'.]+"))
palabras = []
titulos = []
titulo = []
flag = 0
for word in doc:
    # ".T" marks the start of a title field in the CACM collection
    if word == ".T":
        flag = 1
    # ".B", ".W" or ".A" closes the title field; store the collected title
    if (word == ".B" and flag == 1) or (word == ".W"
                                        and flag == 1) or (word == ".A"
                                                           and flag == 1):
        flag = 0
        titulos.append(titulo)
        titulo = []
Example #23
import re

import nltk


# takes in: line of text made of WORDs
# returns: dictionary in form {'contains(WORD)': True}
def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)


# takes in: line of text made of WORDs
# returns: dictionary in form {'contains(WORD)': None}
def notfeatures(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, None) for w in words)


#reading wikipedia articles on apples and Apple
fruitText = open("apple-fruit.txt").read()
fruitTokens = re.split(r'\W+', fruitText)
fruitWords = nltk.Text(fruitTokens)

computersText = open("apple-computers.txt").read()
computersTokens = re.split(r'\W+', computersText)
computersWords = nltk.Text(computersTokens)

#calculating frequency distributions of words in wiki articles
#this should be followed by other processes like getting rid of common words like "the" or "and"
fruitWordsFreq = nltk.FreqDist(w.lower() for w in fruitWords)
computersWordsFreq = nltk.FreqDist(w.lower() for w in computersWords)
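
#the comment above suggests dropping common words; a minimal sketch, assuming the
#NLTK stopwords corpus has been downloaded (the names below are not in the original)
from nltk.corpus import stopwords

stopSet = set(stopwords.words('english'))
fruitWordsNoStop = nltk.FreqDist(w.lower() for w in fruitWords if w.lower() not in stopSet)
computersWordsNoStop = nltk.FreqDist(w.lower() for w in computersWords if w.lower() not in stopSet)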

#choosing arbitrarily only a subset of words for training the model
fruitFeatures = list(map(features, list(fruitWordsFreq.keys())[2:100]))
computersFeatures = list(map(notfeatures, list(computersWordsFreq.keys())[2:2000]))

#creating and training Positive Naive Bayes model with features from wiki articles
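#the original example is cut off here; a minimal sketch of how that step might
#look, assuming NLTK's PositiveNaiveBayesClassifier (an assumption, not the
#original author's code)
from nltk.classify import PositiveNaiveBayesClassifier

classifier = PositiveNaiveBayesClassifier.train(fruitFeatures, computersFeatures)
print(classifier.classify(features("I baked an apple pie")))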
Example #24
    return review_words


words = []
review_num = 0

for review in korean_review_df["document"]:
    words += review_to_wordlist(review)
    review_num += 1
    if review_num % 5000 == 0:
        print("review_num: %d" % (review_num))

len(words)
pprint(words[:10])
import nltk
words_nltk = nltk.Text(words, name="words in movie reviews")

pprint(words_nltk.vocab().most_common(10))

import matplotlib
matplotlib.rc('font', family='AppleGothic')
# matplotlib inline
words_nltk.plot(40)

Non_Stop_words = [
    "Noun", "Verb", "Adjective", "Adverb", "Exclamation", "KoreanParticle"
]
words_new = [w for w in words if w.split("/")[-1] in Non_Stop_words]
pprint(words_new[:10])
words_nltk_new = nltk.Text(words_new, name="new words in movie reviews")
pprint(words_nltk_new.vocab().most_common(10))
Example #25
import json
import os

from konlpy.tag import Okt

okt = Okt()

def tokenize(doc):
    # norm normalizes the text, stem reduces each word to its stem
    return ['/'.join(t) for t in okt.pos(doc, norm=True, stem=True)]

with open('./train_docs.json', encoding="utf-8") as f:
    train_docs = json.load(f)
with open('./test_docs.json', encoding="utf-8") as f:
    test_docs = json.load(f)

# collect every tagged token that appears in train_docs
tokens = [t for d in train_docs for t in d[0]]

import nltk
text = nltk.Text(tokens, name='NMSC')
print(text)

selected_size = 4000
selected_words = [f[0] for f in text.vocab().most_common(selected_size)]

def term_frequency(doc):
    return [doc.count(word) for word in selected_words]

def load_model():
    # use the saved model if one exists
    model = None
    if os.path.isfile('./review_model.json') and os.path.isfile('./review_model_weight.h5'):
        json_file = open("./review_model.json", "r")
        model_json = json_file.read()
        json_file.close()
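        # The snippet is cut off here; a minimal sketch of how loading might
        # continue, assuming a Keras model saved as JSON plus HDF5 weights
        # (an assumption, not the original author's code).
        from tensorflow.keras.models import model_from_json
        model = model_from_json(model_json)
        model.load_weights('./review_model_weight.h5')
    return model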
import re

import nltk
import numpy as np

# lowercase everything and drop the last 10 lines
# (`text`, a list of lines, is assumed to be loaded earlier in the original example)
text = [t.lower() for t in text][:-10]

# Remove 'chapter i' strings
regexp = re.compile(r'chapter \d')
text = [t for t in text if not re.match(regexp, t)]

# combine all the text together
raw = ' '.join(text)
print('type of the raw text: ' + str(type(raw)))

# Here's the magic
tokens = [t for t in nltk.word_tokenize(raw) if t not in (',', '“', '”', '"')]
#tokens = [t for t in jieba.cut(raw) if t not in (',', '“', '”', '"')]
test_ndarr = np.array(tokens)

# a list of tokens
print('current tokens size is ' + str(test_ndarr.shape))

distinct_tokens = set(tokens)
lexical_richness = len(distinct_tokens) / len(tokens)

from pylab import mpl

mpl.rcParams['font.sans-serif'] = ['SimHei']
ntext = nltk.Text(tokens)
# draw the word dispersion (offset) plot
# a typical word-dispersion picture for these tokens
ntext.dispersion_plot(
    ['乐视', '资金', '变革', '生态', '布局', '硬件', '用户', '承诺', '责任', '质疑', '窒息', '歉意'])
Example #27
import nltk

nltk.download()
# the stopwords collection gathers the words that are usually excluded in text mining.
# nltk lemmatization is based on WordNet.

from nltk.corpus import gutenberg
ids = gutenberg.fileids()  # old texts, so there are no licensing issues

text = gutenberg.open(ids[0]).read()  # start the analysis with Emma

nltk.download('punkt')
from nltk import word_tokenize
tokens = word_tokenize(text)
tokens[:100]

en = nltk.Text(tokens)
#tokens = en.tokens  # nltk.Text exposes every token it was built from
dic = en.vocab()
en.plot(50)

lower_tokens = [x.lower() for x in tokens]  # lowercase every token
en_lw = nltk.Text(lower_tokens)
dic_lw = en_lw.vocab()

words = list(dic_lw.keys())

# practice page: 9
en.concordance('Emma', lines=5)  # concordance shows each place the term occurs, with surrounding context
en.similar('Emma')  # words appearing in similar contexts, judged by the frequency of neighbouring words
en.collocations()  # prints the top collocations (20 by default)
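
# The comments above mention WordNet-based lemmatization, but the example never
# applies it; a minimal sketch, assuming the 'wordnet' corpus has been downloaded.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('running', pos='v'))  # -> 'run'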
Example #28
"""
Assignment 2 for "Applied Text Mining in Python" from University of Michigan on Coursera 
Part 1 - Analyzing Moby Dick
"""

import nltk

# If you would like to work with the raw text you can use 'moby_raw'
with open('moby.txt', 'r') as f:
    moby_raw = f.read()

# If you would like to work with the novel in nltk.Text format you can use 'text1'
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)
"""
Example 1
How many tokens (words and punctuation symbols) are in text1?
"""


def totalTokens():
    return len(nltk.word_tokenize(moby_raw))  # or alternatively len(text1)


totalTokens()
"""
Example 2
How many unique tokens (unique words and punctuation) does text1 have?
This function should return an integer.

"""
Example #29
from konlpy.tag import Okt

okt = Okt()
token_ko = okt.nouns(ko_con_text)

# stopwords: words excluded from the analysis regardless of their frequency
stop_word_file = 'stopword.txt'
stop_file = open(stop_word_file, 'rt', encoding='utf-8')
stop_words = [ word.strip() for word in stop_file.readlines()]
# print(stop_words)

token_ko = [each_word for each_word in token_ko if each_word not in stop_words]

# nltk: Natural Language Toolkit
# token: a small unit of text
import nltk
ko = nltk.Text(tokens=token_ko)

wordlist = list()  # list that will hold (word, count) tuples
# keep only the 500 most frequent tokens
data = ko.vocab().most_common(500)
# print(data)
for word, count in data :
    if (count >= 50 and len(word) >= 2):
        wordlist.append((word, count))
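
# The Visualization class used below is not shown in this snippet; one possible
# minimal implementation, assuming the wordcloud and matplotlib packages and a
# Korean font file whose path must be adjusted (everything here is an assumption,
# not the original author's code).
import matplotlib.pyplot as plt
from wordcloud import WordCloud


class Visualization:
    def __init__(self, wordlist):
        self.words = dict(wordlist)  # {word: count}

    def makeWorCloud(self):  # name kept as it is called below
        wc = WordCloud(font_path='AppleGothic.ttf',  # assumed Korean font path
                       background_color='white').generate_from_frequencies(self.words)
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.show()

    def makeBarChart(self):
        top = sorted(self.words.items(), key=lambda x: x[1], reverse=True)[:20]
        plt.bar([w for w, _ in top], [c for _, c in top])
        plt.xticks(rotation=45)
        plt.show()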

visual = Visualization(wordlist)
visual.makeWorCloud()
visual.makeBarChart()

print('finished')
Example #30
# pip install twython
# the JAVA_HOME environment variable must be set
# Python and Java must have the same bitness (32/64-bit)
doc1 = """배우 남궁민(39)이 SBS TV 드라마 '조작'(극본 김현정, 연출 이정흠)에 출연 확정했다고 소속사 935엔터테인먼트가 20일 밝혔다.
'조작'은 사회 부조리를 파헤치는 기자들의 이야기를 그린다. 남궁민은 사고뭉치 기자 '한무영'을 맡는다. 기자였던 형이 비리를 고발하다 억울하게 죽는 모습을 본 후 복수를 위해 직접 기자가 된 인물이다.
소속사는 "전작 '김과장'이 많은 사랑을 받아 차기작을 결정하는 데 많은 고민이 있었다. '조작'은 '김과장' 때와 달리 남궁민의 진지하고 카리스마 넘치는 매력을 보여줄 드라마"라고 말했다.
한편 '조작'은 2015년 방송된 SBS 2부작 드라마 '너를 노린다'에서 호흡을 맞춘 이정흠 PD와 김현정 작가가 다시 한번 의기투합한 작품이다. 드라마는 '엽기적인 그녀' 후속으로 7월 방송 예정이다.
"""

from konlpy.corpus import kobill
from konlpy.tag import Twitter
t = Twitter()
tokens_doc = t.morphs(doc1)

import nltk
ko = nltk.Text(tokens_doc, name='뉴스')
#print(len(ko.tokens))
#print(len(set(ko.tokens)))
print(list(set(ko.tokens)))
keys = list(set(ko.tokens))
ko.vocab()
#ko.plot(50)
#print(ko.count(str('드라마')))
#print(len(tokens_doc))
# count how many times each token occurs (one entry per distinct token)
count_map = {}
for to in tokens_doc:
    count_map[to] = ko.count(str(to))
print(count_map)