Example #1
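    # Method excerpt: assumes imports such as nltk.corpus.brown, nltk.corpus.indian,
    # nltk.corpus.reader.TaggedCorpusReader and nltk.data.find in the enclosing module.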
    def data_preparation(self):
        """
        Splits one of Brown, BNC News, Indian corpora into train set and
        test set

        Returns:
        --------
            sentences (list):
                Sentences without POS-tags
            tagged_sentences (list):
                Sentences with POS-tags
        """
        if self.corpus == 'brown':
            tagged_sentences = brown.tagged_sents(categories='news')
            sentences = brown.sents(categories='news')
        elif self.corpus == 'bnc':
            root = find('corpora/bnc')
            bncnews = TaggedCorpusReader(root,
                                         'bnc-news-wtp.txt',
                                         tagset='en-claws')
            if self.tagset is None:
                tagged_sentences = bncnews.tagged_sents()
            elif self.tagset == 'universal':
                tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
            sentences = bncnews.sents()
        elif self.corpus == 'indian':
            if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
                tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
                sentences = indian.sents(f'{self.lang}.pos')
            else:
                raise ValueError('Language not part of the Indian corpus.')
        return sentences, tagged_sentences
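For reference, a minimal sketch (not part of the original snippet) of the train/test split that typically follows data preparation, shown here for the Brown news category; the 90/10 ratio is an assumption:

from nltk.corpus import brown

tagged_sentences = brown.tagged_sents(categories='news')
cutoff = int(0.9 * len(tagged_sentences))  # assumed 90% train share
train_sents = tagged_sentences[:cutoff]    # tagged sentences used for training
test_sents = tagged_sentences[cutoff:]     # held-out tagged sentences for evaluation
print(len(train_sents), len(test_sents))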
Example #2
import string

from nltk.corpus import indian
from nltk.tag import tnt


def train():

    taggedSet = "hindi.pos"
    wordSet = indian.sents(taggedSet)
    count = 0
    print(wordSet[0])
    for sen in wordSet:
        count = count + 1
        sen = "".join([
            " " +
            i if not i.startswith("'") and i not in string.punctuation else i
            for i in sen
        ]).strip()
        print(count, sen, "sentences")
    print("Total sentences in the tagged file are", count)

    trainPerc = 0.9

    trainRows = int(trainPerc * count)

    data = indian.tagged_sents(taggedSet)
    train_data = data[:trainRows]
    test_data = data[trainRows:]

    print("Training dataset length: ", len(train_data))
    print("Testing dataset length: ", len(test_data))

    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    print("Accuracy: ", pos_tagger.evaluate(test_data))
    return pos_tagger
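An illustrative call of the tagger returned by train(); the Hindi sentence is borrowed from Example #7 below, and nltk.word_tokenize requires the punkt data:

import nltk

tagger = train()
print(tagger.tag(nltk.word_tokenize("पूर्ण प्रतिबंध हटाओ : इराक")))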
Example #3
import string

import nltk
from nltk.corpus import indian
from nltk.tag import tnt


def bangla_chunk():
    # Parts of speech tagging part................

    tagged_set = 'bangla.pos'  # the pre-tagged Bengali file in NLTK's Indian corpus
    word_set = indian.sents(
        tagged_set
    )  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus, detokenizing each one for display;
    startswith() checks whether a token starts with an apostrophe so that no
    space is inserted before it.

    The training share is set to 0.96 because the dataset is small.
    '''
    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " +
            i if not i.startswith("'") and i not in string.punctuation else i
            for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)

    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", count - train_rows)

    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[train_rows:]
    '''
    now tokenize and check the parts of speech
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল । বিজেআইটি একটি কম্পানি , কি কম্পানি সেটা জানার দরকার নাই । "

    tokenized = nltk.word_tokenize(sentence)
    words = pos_tagger.tag(tokenized)
    '''
    Chunk grammar: first chunk every tag sequence ({<.*>+}), then chink
    (remove) verbs, prepositions, determiners, TO and common nouns.
    '''
    chunkGram = r"""Chunk: {<.*>+}
                          }<VB.?|IN|DT|TO|NN>+{"""
    chunkParser = nltk.RegexpParser(chunkGram)  # use a regex chunk parser
    chunked = chunkParser.parse(words)
    chunked.draw()
Example #4
import string

import nltk
from nltk.corpus import indian
from nltk.tag import tnt


def bangla_chunk():
    # Parts of speech tagging part................

    tagged_set = 'bangla.pos'  # the pre-tagged Bengali file in NLTK's Indian corpus
    word_set = indian.sents(
        tagged_set
    )  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus, detokenizing each one for display;
    startswith() checks whether a token starts with an apostrophe so that no
    space is inserted before it.
    The training share is set to 0.96 because the dataset is small.
    '''

    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " +
            i if not i.startswith("'") and i not in string.punctuation else i
            for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)

    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", count - train_rows)

    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[train_rows:]
    '''
    now tokenize and check the parts of speech
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল । বিজেআইটি একটি কম্পানি , কি কম্পানি সেটা জানার দরকার নাই । মানিক ভাইয়ের কি হইল আবার রোকেয়া কাবিলার জন্য চিল্লাইতেছে  "

    tokenized = nltk.word_tokenize(sentence)
    words = pos_tagger.tag(tokenized)
    namedEnt = nltk.ne_chunk(words)
    namedEnt.draw()
Example #5
import string

import nltk
from nltk.corpus import indian
from nltk.tag import tnt


def bangla_pos_tagger():
    tagged_set = 'bangla.pos'  # the pre-tagged Bengali file in NLTK's Indian corpus
    word_set = indian.sents(
        tagged_set
    )  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus, detokenizing each one for display;
    startswith() checks whether a token starts with an apostrophe so that no
    space is inserted before it.

    The training share is set to 0.96 because the dataset is small.
    '''
    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " +
            i if not i.startswith("'") and i not in string.punctuation else i
            for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)

    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", count - train_rows)

    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[train_rows:]
    '''
    now tokenize and check the parts of speech
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল ।"

    tokenized = nltk.word_tokenize(sentence)
    print(pos_tagger.tag(tokenized))
Example #6
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
import string

nltk.download('punkt')
nltk.download('indian')

tagged_set = 'hindi.pos'
word_set = indian.sents(tagged_set)
count = 0
for sen in word_set:
    count = count + 1
    sen = "".join([
        " " + i if not i.startswith("'") and i not in string.punctuation else i
        for i in sen
    ]).strip()
    print(sen)
print(count)

train_perc = .9

train_rows = int(train_perc * count)

print(train_rows, count - train_rows)

data = indian.tagged_sents(tagged_set)
train_data = data[:train_rows]
test_data = data[train_rows:]
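# A likely continuation (assumed here, mirroring Example #2): train a TnT tagger
# on the split above and report its accuracy on the held-out sentences.
pos_tagger = tnt.TnT()
pos_tagger.train(train_data)
print("Accuracy: ", pos_tagger.evaluate(test_data))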
Example #7
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 12 14:39:14 2015

@author: suppu
"""

import nltk
import nltk.data
from nltk.corpus import indian

word_to_be_tagged = u"पूर्ण प्रतिबंध हटाओ : इराक"

hindi_sents = indian.sents("hindi.pos")
train_data = indian.tagged_sents('hindi.pos')[:300]
test_data = indian.tagged_sents('hindi.pos')[300:]

from nltk.tag import tnt

tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)

tagged = (tnt_pos_tagger.tag(nltk.word_tokenize(word_to_be_tagged)))
with open("ooutput.txt", 'w') as out:
    for i in tagged:
        out.write(i[0])
        out.write(" " + i[1] + "\n")
'''
OUTPUT ooutput.txt file (first line):
पूर्ण JJ
'''
Example #8
#!/usr/bin/python
import os
from nltk.corpus import indian
import txt2tajson

# can also choose from: marathi, bangla, telugu, hindi
lang = "hindi"

if not os.path.exists("txt/" + lang):
    os.mkdir("txt/" + lang)

sents = indian.sents(lang + ".pos")

# arbitrarily put 10 sentences per document.
num = 0
for i in range(0, len(sents), 10):
    with open("txt/" + lang + "/" + str(i), "w") as out:
        for sent in sents[i:i + 10]:
            out.write(" ".join(sent) + "\n")
    num += 1

print("Wrote {} text files to {}".format(num, "txt/" + lang))

# Now convert txt to tajson.
txt2tajson.convert("txt/" + lang, "tajson/" + lang)

print(
    "Now run:\n  $ ./scripts/buildindex.sh data/tajson/hindi/ data/index_hindi"
)
Example #9
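# Note: this fragment is the tail of a confusion-matrix plotting helper; cm and
# normalize come from the enclosing function. The code below assumes imports such
# as itertools, numpy as np, matplotlib.pyplot as plt, nltk, nltk.corpus.indian
# and (presumably) sklearn.utils.shuffle, plus a local file
# 'marathi_pos_rad_3NOV17.pos' that is not part of NLTK's stock Indian corpus.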
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j,
                 i,
                 format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


marathi_sent = indian.sents('marathi_pos_rad_3NOV17.pos')
mpos = indian.tagged_sents('marathi_pos_rad_3NOV17.pos')
mp = shuffle(mpos)
size = int(len(marathi_sent) * 0.8)
tags = [
    tag for (word, tag) in indian.tagged_words('marathi_pos_rad_3NOV17.pos')
]
print(np.unique(tags))
#print("no. of tags=",len(nltk.FreqDist(tags)))
defaultTag = nltk.FreqDist(tags).max()

#print(defaultTag)
train_sents = mp[:size]
#print(len(train_sents))
test_sents = mp[size:]
Example #10
import nltk

from nltk.corpus import indian
data = indian.tagged_sents()
test_data = indian.sents()

data_m = []
test_data_m = []
with open('IIIT_data') as f:
    fh = f.readlines()
    lis = []
    lis1 = []
    for i in fh:
        if (i == '\n'):
            data_m.append(lis)
            test_data_m.append(lis1)
            lis = []
            lis1 = []
        else:
            temp = (i.strip()).split('\t')
            lis.append((temp[0], temp[1]))
            lis1.append(temp[0])
print(len(test_data_m))

tot_data = data_m
test_data = test_data_m
train_size = int(len(tot_data) / 10) * 9

import random
import numpy as np
training_data = []
Example #11
def get_list_of_sentences():
    sentences = indian.sents('bangla.pos')
    return sentences
Example #13
from nltk.corpus import indian
'''
Generate files of sentences in Indian languages from the Indian languages corpus available in NLTK.
'''
print("Number of characters is:")
for f in indian.fileids():
    print(f)
    print(len(indian.raw(f)))
print("No of words in each language are:")
for f in indian.fileids():
    print(f)
    print(len(indian.words(f)))
print("Number of sentences in each language:")
for f in indian.fileids():
    print(f)
    print(len(indian.sents(f)))
'''POS for Hindi
'''
hindi_sent = indian.sents("hindi.pos")
with open("hws.txt", 'w') as hsent:
    for i in hindi_sent:
        hsent.write(" ".join(i) + "\n")
hpos = indian.tagged_sents("hindi.pos")
with open("hpossent.txt", 'w') as hpossent:
    for i in hpos:
        for j in i:
            hpossent.write(j[0] + " " + j[1] + "\n")
'''
POS for bangla
'''
Example #14
print("Files of Indian languages:-")
# check files for each language in NLTK
print(indian.fileids())
print()

print("Language details :-")
# find no. of characters in each language
for f in indian.fileids():
    print("Language :-", f)
    print(
        "     No of Characters",
        len(indian.raw(f)),
    )
    print("     No of words :-", len(indian.words(f)))
    print("     No of Sentences :-", len(indian.sents(f)))
print()

print("Checking raw sentences of languages:-")
# print(indian.raw('bangla.pos'))
# print(indian.raw('hindi.pos'))
# print(indian.raw('marathi.pos'))
# print(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file,  from Marathi language")
sentencesMarathi = open("marathiSentences.txt", "w")
# This will print sentence as a list of words
for sentence in indian.sents('marathi.pos'):
    #print(sentence)
    sentencesMarathi.write(" ".join(sentence))
Example #15
#This is for my corpus (indian)

import nltk
from nltk.corpus import indian
import matplotlib

print(indian.raw())
print(indian.fileids())
print(indian.sents())

word1 = 'country'
word2 = 'city'
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in indian.fileids()
                               for w in indian.words(fileid)
                               for target in [word1, word2]
                               if w.lower().startswith(target))
cfd.plot()
Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
INFORMATION RETRIEVAL IN INDIAN LANGUAGE(HINDI)
@author: narayanashanmukhavenkat
"""
import numpy as np
from nltk.corpus import indian, stopwords
from gensim import corpora, models, similarities, matutils
from gensim.models import lsimodel, nmf

documents = indian.sents("hindi.pos")

with open("hindisw.txt", 'r') as temp:
    stop_words = temp.read().split()

texts = [[word for word in document if word not in stop_words]
         for document in documents]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)

lsi = models.LsiModel(corpus, num_topics=43, id2word=dictionary)

index = similarities.MatrixSimilarity(lsi[corpus])
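A possible follow-up query step (not in the original snippet) showing how the LSI similarity index built above could be searched; the Hindi query string is only illustrative:

query = "पूर्ण प्रतिबंध हटाओ"                  # assumed example query
query_bow = dictionary.doc2bow(query.split())  # bag-of-words vector for the query
sims = index[lsi[query_bow]]                   # cosine similarity to every sentence
top5 = sorted(enumerate(sims), key=lambda x: -x[1])[:5]
print(top5)                                    # (sentence index, similarity) pairs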