Example #1
 def ne_concat(node, result):
     if isinstance(node, nltk.Tree):
         if node.label() != 'S':
             node = (' '.join(word for word, tag in node), node.label())
             result.append(nltk.tuple2str(node))
         else:
             for child in node:
                 ne_concat(child, result)
     else:
         #node = simplify_tag(node)
         result.append(nltk.tuple2str(node))
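A minimal usage sketch (not from the original project) of how ne_concat flattens an nltk.ne_chunk parse: each named-entity subtree becomes a single word/label token, everything else stays word/tag. The sentence and the printed output are illustrative and assume the standard NLTK tokenizer, tagger, and NE-chunker models are installed.

import nltk

sent = "Barack Obama visited New York."
tagged = nltk.pos_tag(nltk.word_tokenize(sent))
tree = nltk.ne_chunk(tagged)      # Tree labelled 'S' with entity subtrees

result = []
ne_concat(tree, result)           # ne_concat as defined above
print(result)
# e.g. ['Barack Obama/PERSON', 'visited/VBD', 'New York/GPE', './.']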
Example #2
 def word_tokenize(sent):
     nonlocal time_pos
     nonlocal time_chunk
     
     # replace typographic marks with simple marks
     sent = sent.replace('…', '...')
     sent = sent.replace('”', "''")
     sent = sent.replace('“', ',,')
     sent = sent.replace(',', ',')
     sent = sent.replace('’', "'")
     
     words = nltk.word_tokenize(sent)
     # strip punctuation from words
     words = [word.strip(string.punctuation) for word in words]
     words = [word for word in words if len(word) > 0]
     
     if not analyse_pos:
         return words
     else:
         start = time.time()
         tagged = tagger.tag(words)
         time_pos += (time.time() - start)
     
         if preserve_entities:
             start = time.time()
             chunks = nltk.ne_chunk(tagged, binary=ner_binary)
             time_chunk += (time.time() - start)
         
             word_list = []
             ne_concat(chunks, word_list)
             return word_list
         else:
             return [nltk.tuple2str(t) for t in tagged]
Example #3
def import_reuters_flat_pos(ds, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    
    tagger = nltk.data.load("./models/treebank_brill_aubt/treebank_brill_aubt.pickle")
    
    if not silent:
        total = len(reuters.sents())
        counter = 0
    root_handle = ds.insert("#reuters")
    for sent in reuters.sents():
        sent = tagger.tag(sent)
        norm = [nltk.tuple2str(t) for t in sent]
        sen_handle = ds.insert(norm)
        ds.link(root_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 100 == 0):
                print("importing %s of %s sentences..." % (counter, total), 
                    file=log)
Example #4
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                sent = [simplify_tag(t) for t in sent]
            norm = [nltk.tuple2str(t) for t in sent]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), 
                        file=log)
Example #5
def transfrom_data(data):
    """Convert tagged sentences into space-separated word/tag strings."""
    sentences = []
    for sent in data:
        words = ''
        for tuple_word_tag in sent:
            words += tuple2str(tuple_word_tag)
            words += ' '
        sentences.append(words)
    return sentences
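An illustrative call (not from the source project), showing that each tagged sentence becomes one space-separated string of word/tag tokens (with a trailing space, as the loop above leaves it):

from nltk import tuple2str

tagged_data = [[('The', 'DT'), ('cat', 'NN')], [('It', 'PRP'), ('ran', 'VBD')]]
print(transfrom_data(tagged_data))
# ['The/DT cat/NN ', 'It/PRP ran/VBD ']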
Example #6
def default_tag(reviews):
    """
    Return the reviews with default tag
    """

    for review in reviews:
        text = nltk.word_tokenize(review.content)
        tagged_tokens = nltk.pos_tag(text)
        tagged_content = ''
        for token in tagged_tokens:
            str_token = nltk.tuple2str(token, '/')
            tagged_content += str_token + ' '
        review.content = tagged_content.strip()

    return reviews
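Since this example passes an explicit '/' separator, the output round-trips through nltk.str2tuple, which splits on the last separator and upper-cases the tag:

import nltk

s = nltk.tuple2str(('movie', 'NN'), '/')   # 'movie/NN'
print(nltk.str2tuple(s, '/'))              # ('movie', 'NN')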
Example #7
def tag_by_training(trained_reviews, test_reviews):
    """
    Train the trained reviews into Tagger Model, and tag test_reviews to be returned
    """

    train_sent = review_to_sent(trained_reviews)

    unigram_tagger = nltk.UnigramTagger(train_sent)

    for test_review in test_reviews:
        text = nltk.word_tokenize(test_review.content)
        tagged_tokens = unigram_tagger.tag(text)
        tagged_content = ''
        for token in tagged_tokens:
            str_token = nltk.tuple2str(token, '/')
            tagged_content += str_token + ' '
        test_review.content = tagged_content.strip()

    return test_reviews
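One detail worth noting: a UnigramTagger returns None for tokens it never saw in training, and nltk.tuple2str emits just the bare word in that case, so tagged_content never contains a literal "None" tag. A quick check:

import nltk

print(nltk.tuple2str(('hello', 'UH'), '/'))    # hello/UH
print(nltk.tuple2str(('zyzzyva', None), '/'))  # zyzzyva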
Example #8
def process_corpus(corpus_name):
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)

    # testing
    #corpus_contents = input_file.read().decode('utf-8')

    # 1. Tokenizing

    # (a) Write the name of the corpus to stdout
    print("author:", corpus_name)

    # (b) Delimit the sentences for each document in the corpus.

    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # POS Tag Output File
    pos_file = open(corpus_name + "-pos.txt", 'w')

    for doc in corpus_contents:
        sentences = nltk.sent_tokenize(doc)

        for sentence in sentences:
            totalsent.append(sentence)
            sentcount = sentcount + 1
            # Tokenize the words in each sentences of each document
            words = nltk.word_tokenize(sentence)
            for word in words:
                totalwords.append(word)

            # Part-of-Speech
            tagged = nltk.pos_tag(words)
            for tag in tagged:
                totaltags.append(tag)
                string = nltk.tuple2str(tag)
                print(string, file=pos_file, end=" ")
            print("\n", file=pos_file, end="")
        print("\n", file=pos_file, end="")

        print("word count:", len(totalwords))
        waverage = sum(len(word) for word in totalwords) / len(totalwords)
        print("word avg:", waverage)
        saverage = len(totalwords) / sentcount
        print("sent avg:", saverage)

    pos_file.close()

    # average sentence length
    #for fileid in inaugural.fileids():
    #    avg = sum(len(sent) for sent in inaugural.sents(fileids=[fileid])) / len(inaugural.sents(fileids=[fileid]))
    #    print(fileid, avg)

    # number of total words in the corpus
    wordCount = 0
    wordCount = len(totalwords)
    #print("2. Total Words in the Corpus:", wordCount)

    # Frequency
    flat_words = [word.lower() for word in totalwords]
    vocabCount = 0
    vocabCount = len(set(flat_words))
    #print("3. Vocabulary Size of the Corpus:", vocabCount)

    tagged_fd = nltk.FreqDist(tag for (word, tag) in totaltags)
    #print("4. The most frequent part-of-speech tag is", tagged_fd.most_common(1))

    # Frequency Output File
    freq_file = open(corpus_name + "-word-freq.txt", 'w')

    fdist = nltk.FreqDist(flat_words)
    # words sorted by descending frequency
    print([word for word, count in fdist.most_common()], file=freq_file)

    freq_file.close()

    # Conditional Frequency Distribution

    sys.stdout = open(corpus_name + "-pos-word-freq.txt", 'w')

    # Reverse
    pos_reversed = [(b, a.lower()) for a, b in totaltags]

    cdf1 = nltk.ConditionalFreqDist(pos_reversed)

    cdf1.tabulate()

    sys.stdout = sys.__stdout__

    # Similar Words
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] == "." or "," or ";" or "-":
            punctags.append(x)

    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)

    print("punctratio:", len(punctags) / len(totalwords))
    print("NNratio:", len(NNtags) / len(totalwords))
    print("VBDratio:", len(VBDtags) / len(totalwords))
    print("JJratio:", len(JJtags) / len(totalwords))
    print("RBratio:", len(RBtags) / len(totalwords))

    download_dir = "training.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    #csv.write(columnTitleRow)
    row = corpus_name + "," + str(waverage) + "," + str(saverage) + "," + str(
        punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
            RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)

    #print("5. The most frequent word in the POS (NN/VBD/JJ/RB) and respective similar words:")

    text = nltk.Text(flat_words)

    NN_fd = nltk.FreqDist(NNtags)
    #print("Most frequent NN =", NN_fd.most_common(1))

    commonNN = NN_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonNN, ":")
    #text.similar(commonNN)
    #print()

    VBD_fd = nltk.FreqDist(VBDtags)
    #print("Most frequent VBD =", VBD_fd.most_common(1))

    commonVBD = VBD_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonVBD, ":")
    #text.similar(commonVBD)
    #print()

    JJ_fd = nltk.FreqDist(JJtags)
    #print("Most frequent JJ =", JJ_fd.most_common(1))

    commonJJ = JJ_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonJJ, ":")
    #text.similar(commonJJ)
    #print()

    RB_fd = nltk.FreqDist(RBtags)
    #print("Most frequent RB =", RB_fd.most_common(1))

    commonRB = RB_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonRB, ":")
    #text.similar(commonRB)
    #print()

    # 5. Collocations
    co_text = nltk.Text(flat_words)
    co_text.collocations()
Example #9
File: readers.py Project: hobson/nlup
 def __str__(self):
     return " ".join(tuple2str(tt) for tt in zip(self.tokens, self.tags))
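What the join produces, shown with plain lists standing in for self.tokens and self.tags (a sketch, not code from hobson/nlup):

from nltk import tuple2str

tokens = ['Time', 'flies', '.']
tags = ['NN', 'VBZ', '.']
print(" ".join(tuple2str(tt) for tt in zip(tokens, tags)))
# Time/NN flies/VBZ ./.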
Example #10
def result():
    # get text from textbox
    data = request.form.get('text')

    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # tokenize into sentences
    sentences = nltk.sent_tokenize(data)

    # tokenize into words and create part of speech tags
    for sentence in sentences:
        totalsent.append(sentence)
        sentcount = sentcount + 1
        words = nltk.word_tokenize(sentence)
        for word in words:
            totalwords.append(word)
        tagged = nltk.pos_tag(words)
        for tag in tagged:
            totaltags.append(tag)
            string = nltk.tuple2str(tag)

    # calculate word and sentence average
    waverage = sum(len(word) for word in totalwords) / len(totalwords)
    wtrunc = '%.3f' % (waverage)
    saverage = len(totalwords) / sentcount
    strunc = '%.3f' % (saverage)

    # add up total number of pos tags
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] == "." or "," or ";" or "-":
            punctags.append(x)

    # calculate part of speech ratios
    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)

    # create csv for machine learning model
    open('user.csv', 'w').close()  # erase file
    download_dir = "user.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    csv.write(columnTitleRow)
    row = "user," + str(waverage) + "," + str(saverage) + "," + str(punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
        RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)
    csv.close()

    # use already generated pickle file to predict author
    test_file = "user.csv"
    df1 = pd.read_csv(test_file, header=0)
    test_data = df1.iloc[:, 1:]
    model2 = joblib.load("file.pkl")
    preds2 = model2.predict(test_data)

    # truncate to 3 decimal places and add %
    NNratio = NNratio * 100
    ntrunc = '%.3f' % (NNratio)
    VBDratio = VBDratio * 100
    vtrunc = '%.3f' % (VBDratio)
    JJratio = JJratio * 100
    jtrunc = '%.3f' % (JJratio)
    RBratio = RBratio * 100
    rtrunc = '%.3f' % (RBratio)

    # put author guess and stats into an array
    response = []
    response.append(preds2[0])
    response.append(str(wtrunc))
    response.append(str(strunc))
    response.append(str(ntrunc) + "%")
    response.append(str(vtrunc) + "%")
    response.append(str(rtrunc) + "%")
    response.append(str(jtrunc) + "%")

    # redirecting to the matching author view with the user stats
    author_views = {
        "John Steinbeck": "steinbeck",
        "Mark Twain": "twain",
        "Mary Shelley": "shelley",
        "Jane Austen": "austen",
        "Jacob Grimm and Wilhelm Grimm": "grimm",
        "Isaac Asimov": "asimov",
        "H.P Lovecraft": "lovecraft",
        "F Scott Fitzgerald": "fitzgerald",
        "Ernest Hemingway": "hemingway",
        "Edgar Allan Poe": "poe",
        "CS Lewis": "lewis",
        "Arthur C Clark": "clarke",
        "Agatha Christie": "christie",
    }
    endpoint = author_views.get(response[0])
    if endpoint is not None:
        return redirect(url_for(endpoint, awl=response[1], asl=response[2],
                                nr=response[3], vr=response[4],
                                avr=response[5], ajr=response[6]))
    return response
Example #11
 def __str__(self):
     return " ".join(tuple2str(tt) for tt in zip(self.tokens, self.tags))
Example #12
def process_corpus():
    #corpus_contents = ' '.join(sys.argv[1:])

    # corpus_name is assumed to be defined at module scope
    inputfile = corpus_name + ".txt"
    corpus_contents = open(inputfile, 'r').read()

    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # tokenize into sentences
    sentences = nltk.sent_tokenize(corpus_contents)

    # tokenize into words and create part of speech tags
    for sentence in sentences:
        totalsent.append(sentence)
        sentcount = sentcount + 1
        words = nltk.word_tokenize(sentence)
        for word in words:
            totalwords.append(word)
        tagged = nltk.pos_tag(words)
        for tag in tagged:
            totaltags.append(tag)
            string = nltk.tuple2str(tag)

    # calculate word and sentence average
    waverage = sum(len(word) for word in totalwords) / len(totalwords)
    wtrunc = '%.3f' % (waverage)
    saverage = len(totalwords) / sentcount
    strunc = '%.3f' % (saverage)

    # add up total number of pos tags
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] == "." or "," or ";" or "-":
            punctags.append(x)

    # calculate part of speech ratios
    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)

    # create csv for machine learning model
    open('user.csv', 'w').close()  #erase file
    download_dir = "user.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    csv.write(columnTitleRow)
    row = "user," + str(waverage) + "," + str(saverage) + "," + str(
        punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
            RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)
    csv.close()

    # use already generated pickle file to predict author
    test_file = "user.csv"
    df1 = pd.read_csv(test_file, header=0)
    test_data = df1.iloc[:, 1:]
    model2 = joblib.load("file.pkl")
    preds2 = model2.predict(test_data)

    # truncate to 3 decimal places and add %
    NNratio = NNratio * 100
    ntrunc = '%.3f' % (NNratio)
    VBDratio = VBDratio * 100
    vtrunc = '%.3f' % (VBDratio)
    JJratio = JJratio * 100
    jtrunc = '%.3f' % (JJratio)
    RBratio = RBratio * 100
    rtrunc = '%.3f' % (RBratio)

    # put author guess and stats into an array
    response = []
    response.append(preds2[0])
    response.append(str(wtrunc))
    response.append(str(strunc))
    response.append(str(ntrunc) + "%")
    response.append(str(vtrunc) + "%")
    response.append(str(jtrunc) + "%")
    response.append(str(rtrunc) + "%")

    print(response)
    return (response)
Example #13
def process_corpus(corpus_name):
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)
    # Your code goes here
    file_name = corpus_name+"-pos.txt"
    freq_file = corpus_name+"-word-freq.txt"
    cond_file = corpus_name+"-pos-word-freq.txt"
    # write the name of the corpus to stdout
    totalcount = 0
    vocabsize = 0
    poslist = []
    taglist = []
    wordslist = []
    uniquelist = []
    wordstr = []
    tuplearr = []
    normaltext = []
    beginarr = []
    corpusarr = []
    with open(file_name,'a') as f:
        for doc in corpus_contents:
            #delimit the sentences for each document in the corpus
            sent = nltk.sent_tokenize(doc)
            #part-of-speech tagger to each tokenize sent
            #tokenize the words of each sentence of each doc
            words = [nltk.word_tokenize(item) for item in sent]
            for word in words:
                corpusarr.append(word)
                #lowercase words
                flat_words = [term.lower() for term in word]
                lowerfreq = nltk.FreqDist(flat_words)
                vocabsize += lowerfreq.B() 
                
                #make array of tokenized words of corpus  
                for i in word:
                    normaltext.append(i)
                #make an array of lowercase tokenized words of corpus
                for i in flat_words:
                    wordstr.append(i)

                #count total words in corpus
                freq = nltk.FreqDist(word)
                totalcount += freq.N()
                    
                #pos tagging
                poslist = nltk.pos_tag(word)
                #reverse tuple of pos tagging
                for item in poslist:
                    tlist = tuple(reversed(item))
                    tuplearr.append(tlist)
                    beginarr.append(item)
                #make second value of tuple lowercase
                newtuple = [(pos,word.lower()) for pos,word in tuplearr]
#                print(newtuple)
                #most freq part of speech
                pos_counter = nltk.FreqDist(pos for (word, pos) in poslist)
                for word,tag in poslist:
                    wordslist.append(word)
                    taglist.append(tag)
                for val in poslist:
                    combined = nltk.tuple2str(val)
                    f.write(combined)
                    f.write(" ")
                f.write("\n")
            f.write("\n")
        # get the frequency of unique words
        uniquefreq = nltk.FreqDist(wordstr)
        uniquedict = uniquefreq.most_common(15000)

        with open(freq_file,'a') as r:
            for k,j in uniquedict:
                r.write(k+", "+ str(j))
                r.write("\n")
        tagfreq = nltk.FreqDist(taglist)
        winner = tagfreq.most_common(50)
    print("1. Corpus name:", corpus_name)
    print("2. Total words in corpus", totalcount)
    print("3. Vocabulary size of the corpus", uniquefreq.B())
    print("4. The most frequent part-of-speech tag is", winner[0][0], "with frequency", winner[0][1])
    condfileoutput = open(cond_file,'a')
    sys.stdout = condfileoutput
    cflist = nltk.ConditionalFreqDist(newtuple)
    cflist.tabulate()
    sys.stdout = sys.__stdout__
    print(cflist)

    noun = cflist['NN'].most_common(1)
    noun1 = noun[0][0]
    print(normaltext)
    text = nltk.Text(normaltext)
    
    print("5. The most frequent word in the POS(NN) is:", noun1,"and its similar words are:")
    text.similar(noun1)
    vbd = cflist['VBD'].most_common(1)
    vbd1 = vbd[0][0]
    print("5. The most frequent word in the POS(VBD) is:",vbd1,"and its similar words are:")
    text.similar(vbd1)
    jj = cflist['JJ'].most_common(1)
    jj1 = jj[0][0]
    print("5. The most frequent word in the POS(JJ) is:",jj1,"and its similar words are:")
    text.similar(jj1)
    rb = cflist['RB'].most_common(1)
    rb1 = rb[0][0]
    print("5. The most frequent word in the POS(RB) is:",rb1,"and its similar words are:")
    text.similar(rb1)
    print("6. Collocations:")
    text.collocations()
