Пример #1
0
def clearCorpus():
    """Clean every raw corpus file and store the result in ``corpus/cleanSource/``.

    Each ``*.txt`` file found in ``corpus/rawSource/`` is read as UTF-8 and
    passed to ``l.regex`` together with its language code (the first two
    characters of the file name). Element ``[1]`` of the result is written to
    a file of the same name in ``corpus/cleanSource/``, replacing any
    previous version.

    Processing stops at the first file that raises ``IOError``; the timing
    summary is printed only when every file was processed.
    """
    os.system('clear')
    print('\n\n{}'.format('='*100))
    print('AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - CORPUS CLEANER'.center(100,' '))
    print('-'*100)

    print("\nLoading corpus files to memory ... ")
    source_dir = 'corpus/rawSource/'
    target_dir = 'corpus/cleanSource/'

    started = datetime.datetime.now()

    for infile in glob.glob(os.path.join(source_dir, '*.txt')):
        # basename() also works when glob returns paths with a non-'/' separator.
        filename = os.path.basename(infile)
        lang = filename[:2]
        try:
            with open(infile, 'r', encoding='utf8') as source:
                rawtext = [lang, source.read()]

            print('-'*100)
            print('\nOpening relevant files ...  \t\t\t\t\t\t{}'.format(l.timer(started)))

            cleantext = l.regex(rawtext)[1]

            # 'w' truncates an existing file, which replaces the former
            # remove-then-append sequence. The encoding is pinned so that the
            # output does not depend on the platform's default locale.
            with open(os.path.join(target_dir, filename), 'w', encoding='utf8') as target:
                target.write(str(cleantext))

            print('\nSuccessfuly cleande {} file '.format(filename))
        except IOError:
            print('Error: Can not open the file: ', lang)
            return

    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
Пример #2
0
def classification(ct,readsampled,vocabulary,frequencyDict,uniquengrams,totalngrams,phraselength=25,wordbased=0,location=0,infinity=0,maxg=5,lines=0):
    """Classify the prepared test strings and print overall CFA/NBC metrics.

    Parameters
    ----------
    readsampled : indexable
        ``[0]`` is the test data handed to ``l.overallmyclassifier``;
        ``[1]``, ``[2]`` and ``[3]`` are converted to ``int`` and used as the
        average byte count, average character count and number of phrases.
    vocabulary, frequencyDict, uniquengrams, totalngrams
        Model data forwarded unchanged to ``l.overallmyclassifier``.
    phraselength, lines
        Only used in the printed report.
    ct, wordbased, location, infinity, maxg
        Accepted for interface compatibility; not used by this function.

    ``l.overallmyclassifier`` is expected to fill ``overallwrongs``,
    ``overalltotal`` and ``overallrecall`` in place. The confusion matrix is
    then built with the ``overallrecall`` counts on the diagonal and the
    ``overallwrongs`` counts elsewhere. Precision divides each diagonal cell by
    its column total, recall by its row total, and both are averaged over the
    languages. Nothing is returned; the results are printed.
    """
    started = datetime.datetime.now()

    testing = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])
    phrases = int(readsampled[3])

    print('-'*100)
    mytime = datetime.datetime.now()

    print('Reading language models ...  \t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    print('Reading test strings ...  \t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    languages = ('am', 'ge', 'gu', 'ti')
    classifiers = ('CFA', 'NBC')
    maxofg = len(languages)

    # Containers handed to the classifier, which updates them in place.
    overallrecall = {c: dict.fromkeys(languages, 0) for c in classifiers}
    overalltotal = dict.fromkeys(classifiers, 0)
    overallwrongs = {c: {a: dict.fromkeys(languages, 0) for a in languages} for c in classifiers}

    overallprecision = dict.fromkeys(classifiers, 0)
    overallaccuracy = dict.fromkeys(classifiers, 0)
    overallfscore = dict.fromkeys(classifiers, 0)
    overallconfusion = copy.deepcopy(overallwrongs)

    print('Creating language dictionaries ...  \t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    l.overallmyclassifier(testing,frequencyDict,overallwrongs,overalltotal,overallrecall,uniquengrams,totalngrams,phrases,vocabulary)

    print('Performing classifications ...  \t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    for c in classifiers:
        for i in languages:
            for j in languages:
                if i == j:
                    overallconfusion[c][i][j] = overallrecall[c][j]
                else:
                    overallconfusion[c][i][j] = overallwrongs[c][i][j]

    print('overallconfusion {}'.format(overallconfusion))

    meanrecall = {}
    for c in classifiers:
        matrix = overallconfusion[c]

        for i in languages:
            column = sum(matrix[j][i] for j in languages)
            overallprecision[c] += (matrix[i][i]/column) if column != 0 else 0
        overallprecision[c] /= maxofg

        # From here on overallrecall holds ratios instead of raw counts.
        for x in languages:
            row = sum(matrix[x][y] for y in languages)
            overallrecall[c][x] = (matrix[x][x]/row) if row != 0 else 0
            overallaccuracy[c] += matrix[x][x]

        # Guard against an empty test set instead of raising ZeroDivisionError.
        overallaccuracy[c] /= overalltotal[c] if overalltotal[c] != 0 else 1

        meanrecall[c] = sum(overallrecall[c].values())/maxofg
        if overallprecision[c] != 0 or meanrecall[c] != 0:
            overallfscore[c] = 2*((overallprecision[c]*meanrecall[c])/(overallprecision[c]+meanrecall[c]))
        else:
            overallfscore[c] = 0

    print('Generating performance metrices - precision, recall and f-score ...  \t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    print('\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes\tModel: {:,} lines.'.format(phraselength,averagecharacters,averagebyte,lines))
    print('='*100)
    print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format('Ngrams','Observations','Accuracy','Precision','Recall','F-score'))
    print('-'*100)
    for c in classifiers:
        print('{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'.format(c,'(2,3,4,5)',overalltotal[c],overallaccuracy[c],overallprecision[c],meanrecall[c],overallfscore[c]))

    print('-'*100)
    print('\nGenerating clasification performance results ...  \t\t\t{}'.format(l.timer(mytime)))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    print('End    :', ended)
    print('Elapsed: {}'.format(l.timer(started)))
Пример #3
0
def main():
	os.system('clear') # on linux 
	selection=1; phraselength=0 ; modeltype = {1:'bl', 2:'by', 3:'fl', 4:'in', 5:'il'} 
	while selection!=0:
		choice=1
		print ('\n')
		print ('='*100)
		print ('AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION'.center(100,' '))
		print ('-'*100)  
		
		started = datetime.datetime.now()
		location=0; wordbased=0

		lang = dict(am={},ge={},gu={},ti={})
		mylang = dict(am=0,ge=0,gu=0,ti=0)
		totalngrams = copy.deepcopy(mylang)
		frequencyDict = copy.deepcopy(lang)
		uniquengrams = copy.deepcopy(mylang)

		ct = int(input('\nSelect Test number  - 1 to 10 :   '))
		modelselector = int(input('\nSelect Model type number below: \n\n   1. The Model is based on Fixed Length N-grams without location features - Baseline [BL]. \n   2. The Model is based on source text - Byteorder N-grams [BY]. \n   3. The Model is based on Fixed Length N-grams with location features [FL]. \n   4. The Model is based on Infiniti-grams without location features [IN]. \n   5. The Model is based on Infiniti-grams with location features [IL]. \n   6. Exit.:   '))
		
		if modelselector==6: choice=0;break		

		if location<0 or modelselector<=0 or modelselector>6 or ct<=0 or ct>10:
			print ('\n\nPlease check your entry on percent, ngram value, and/or phrase length')
			pause=input(''); print('{}'.format(pause)) ; continue
		else:
			if modeltype[modelselector]=='bl':
				wordbased=1 ; location=0 ; infinity=0 ; mod='bl'
			elif modeltype[modelselector]=='fl': 
				wordbased=1 ; location=1 ; infinity=0 ; mod='fl'
			elif modeltype[modelselector]=='in': 
				wordbased=1 ; location=0 ; infinity=1 ; mod='in'
			elif modeltype[modelselector]=='il': 
				wordbased=1 ; location=1 ; infinity=1 ; mod='il'
			else: 
				wordbased=0 ; location=0 ; mod=modeltype[modelselector]
			
			path='models/'+str(ct)
			filename = mod+'.txt'
			print ('-'*100) 
			print ('\nFiles {}/{} located and opened ...  \t\t\t{}'.format(path,filename,l.timer(started)))

			params=l.readmodel(path,mod,frequencyDict,totalngrams,uniquengrams)
			frequencyDict = params[0]; uniquengrams = params[1] ; lines=params[5]
			totalngrams = params[2]; maxg=params[3];vocabulary=params[4]

		modelselected = {	1:' The Model is based on Fixed Length N-grams without location features - Baseline [bl] on test {}. ',
						2:' The Model is based on source text - Byteorder Ngrams [by] on test {}. ', 
						3:' The Model is based on Fixed Length N-grams with location features [fl] on test {}. ', 
						4:' The Model is based on Infiniti-grams without location features [in] on test {}. ',
						5:' The Model is based on Infiniti-grams with location features. [il] on test {}. '
					}
		os.system('clear')
		selection=1 # on linux 
		while choice!=0:
			phraselength=0 ; infinity=0
			print ('\n\n{}'.format('='*100))
			print ('AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - CLASSIFIER'.center(100,' '))
			print ('-'*100)
			print (modelselected[modelselector].format(ct).center(100,'*'))
			
			try:
				choice = int(input('\nPress 1 to classify 2 to change model [0 to exit] :   '))
				if choice==0: selection=0 ; break
				elif choice==2: choice=0; break
				else:
					phraselength = int(input('\nInsert between 1 and 25 to set the test phrase length from testing files   :   '))
					if phraselength>25 or phraselength<1:
						print ('\n\nPlease check your entry on percent, ngram value, and/or phrase length')
					else:
						path2='samples/'
						s=open(os.path.join(path2,str(ct)+'.txt'),'r') ; sample = s.readlines() ; s.close()
						readsampled = l.readsample(sample,phraselength,wordbased,location,infinity)
						classification(ct,readsampled,vocabulary,frequencyDict,uniquengrams,totalngrams,phraselength,wordbased,location,infinity,maxg,lines)
			except ValueError:
				print ('\n\nPlease check your entry on percent, ngram value, and/or phrase length')
				continue
Пример #4
0
def main():
    """Interactive driver: choose a model variant, then classify ``sample.txt``.

    The outer loop asks for a model type (1-5, 6 to exit). For models 1-3 it
    also asks for the fraction of most frequent items to keep and, when that
    fraction is 1, for a low-frequency cut-off. The model is read from
    ``models/<code>/model.txt`` and parsed by ``l.readmodel``. The inner loop
    asks for a phrase length (1-25) and calls ``classification``. Invalid or
    non-numeric input prints a hint and prompts again.
    """
    os.system('clear')
    selection = 1
    # B. The Model is based on Byteorder Ngrams.
    # I. The Model is based on Infinitigrams without location features.
    # L. The Model is based on Infinitigrams with location features.
    # N. The Model is based on #2 above, for top n% of most frequent words.
    # T. The Model is based on #3 above, for top n% of most frequent words.:
    modeltype = {1: 'B', 2: 'I', 3: 'L', 4: 'N', 5: 'T'}
    # model code -> (wordbased, location)
    modelflags = {'B': (0, 0), 'I': (1, 0), 'L': (1, 1), 'N': (1, 0), 'T': (1, 1)}
    modelselected = {
        1:
        ' Model is based on Byteorder Ngrams. Considered {}%, frequencies > {}. ',
        2:
        ' Model is based on Infinitigrams without location features. Considered {}%, frequencies > {}. ',
        3:
        ' Model is based on Infinitigrams with location features. Considered {}%, frequencies > {}. ',
        4:
        ' Model is based on Infinitigrams without location features, for top {}% of most frequent words. ',
        5:
        ' Model is based on Infinitigrams with location features, for top {}% of most frequent words. '
    }
    invalid_entry = '\n\nPlease check your entry on percent, ngram value, and/or phrase length'

    while selection != 0:
        choice = 1
        print('\n\n{}'.format('=' * 100))
        print(
            'AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - CLASSIFIER'
            .center(100, ' '))
        print('-' * 100)

        started = datetime.datetime.now()
        percent = 1
        lowfreq = 0

        lang = dict(am={}, ge={}, gu={}, ti={})
        mylang = dict(am=0, ge=0, gu=0, ti=0)
        totalngrams = copy.deepcopy(mylang)
        frequencyDict = copy.deepcopy(lang)
        uniquengrams = copy.deepcopy(mylang)

        # Non-numeric answers are reported like any other invalid entry
        # instead of ending the program with an uncaught ValueError.
        try:
            modelselector = int(
                input(
                    '\nSelect Model type number below: \n\n   1. The Model is based on Byteorder Ngrams. \n   2. The Model is based on Infinitigrams without location features. \n   3. The Model is based on Infinitigrams with location features. \n   4. The Model is based on #2 above, for top n% of most frequent words. \n   5. The Model is based on #3 above, for top n% of most frequent words. \n   6. Exit.:   '
                ))
            if modelselector == 6:
                selection = 0
                break
            if 0 < modelselector < 4:
                percent = float(
                    input(
                        '\nEnter a percentage as 0.xx to select the top x frequent items of the model :   '
                    ))
                if percent == 1:
                    lowfreq = int(
                        input(
                            'Enter 1 to 5 not to consider lowest counts in the model [0 to consider all]:   '
                        ))
        except ValueError:
            valid = False
        else:
            valid = 0 < percent <= 1 and 1 <= modelselector <= 5

        if not valid:
            print(invalid_entry)
            pause = input('')
            print('{}'.format(pause))
            continue

        mod = modeltype[modelselector]
        wordbased, location = modelflags[mod]

        path = 'models/' + mod
        # The context manager closes the model file, which was previously
        # left open for the lifetime of the process.
        with open(os.path.join(path, 'model.txt'), 'r') as f:
            print('-' * 100)
            print('\nFiles {}/{} located and opened ...  \t\t\t{}'.format(
                path, 'model.txt', l.timer(started)))
            model = f.readlines()

        params = l.readmodel(model, frequencyDict, totalngrams,
                             uniquengrams, percent, lowfreq)
        frequencyDict = params[0]
        uniquengrams = params[1]
        totalngrams = params[2]
        top = params[3]
        maxg = params[4]

        os.system('clear')  # on linux
        while choice != 0:
            phraselength = 0
            # NOTE(review): the original code set infinity=1 for models I/L/N/T
            # but always overwrote it with 0 here before classifying. That
            # behaviour is preserved; confirm whether the reset is intended.
            infinity = 0
            print('\n\n{}'.format('=' * 100))
            print(
                'AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - CLASSIFIER'
                .center(100, ' '))
            print('-' * 100)
            if modelselector < 4:
                print(modelselected[modelselector].format(
                    percent * 100, lowfreq).center(100, '*'))
            else:
                print(modelselected[modelselector].format(top * 100).center(
                    100, '*'))
            try:
                choice = int(
                    input(
                        '\nPress 1 to classify 2 to change model [0 to exit] :   '
                    ))

                if choice == 0:
                    selection = 0
                    break
                elif choice == 2:
                    # Leave the inner loop only; the outer loop asks for a new model.
                    choice = 0
                    break
                else:
                    phraselength = int(
                        input(
                            '\nInsert between 1 and 25 to set the test phrase length from testing files   :   '
                        ))
                    if phraselength > 25 or phraselength < 1:
                        print(invalid_entry)
                    else:
                        classification(frequencyDict, uniquengrams,
                                       totalngrams, phraselength, wordbased,
                                       location, infinity, maxg)

            except ValueError:
                print(invalid_entry)
                continue
Пример #5
0
def classification(frequencyDict,
                   uniquengrams,
                   totalngrams,
                   phraselength=25,
                   wordbased=0,
                   location=0,
                   infinity=0,
                   maxg=5):
    """Classify the strings in ``sample.txt`` and print per-n-gram metrics.

    Parameters
    ----------
    frequencyDict, uniquengrams, totalngrams
        Model data forwarded unchanged to ``l.myclassifier``.
    phraselength
        Passed to ``l.readsample`` and shown in the report.
    wordbased
        ``0`` builds the test n-grams from the whole cleaned sample; any other
        value builds them word by word.
    location, infinity
        Forwarded to ``l.ngram`` in word-based mode.
    maxg
        Largest n-gram size; sizes ``2 .. maxg`` are evaluated.

    ``l.myclassifier`` is expected to fill ``wrongs``, ``totaltests``,
    ``myrecall`` and ``total`` in place. For every n-gram size the confusion
    matrix takes its diagonal from ``myrecall`` and the rest from ``wrongs``.
    Precision divides each diagonal cell by its column total, recall by its
    row total; both are averaged over the languages, and the summary row
    averages over the n-gram sizes. An existing ``result.txt`` is deleted.
    Nothing is returned; the results are printed.
    """
    started = datetime.datetime.now()

    with open('sample.txt', 'r') as s:
        print('-' * 100)
        print('\nFiles {} loaded to memory ...  \t\t\t\t\t\t{}'.format(
            'sample.txt', l.timer(started)))
        mytime = datetime.datetime.now()
        print('Opening relevant files ...  \t\t\t\t\t\t\t{}'.format(
            l.timer(mytime)))
        mytime = datetime.datetime.now()
        print('Reading language models ...  \t\t\t\t\t\t\t{}'.format(
            l.timer(mytime)))
        mytime = datetime.datetime.now()
        sample = s.readlines()

    print('Reading test strings ...  \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    readsampled = l.readsample(sample, phraselength)
    sampled = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])

    # Shape of an entry, e.g. ['am', [['እው', 2], ['ውነ', 2], ['እውነ', 3], ...]]
    testing = []
    for item in sampled:
        if wordbased == 0:
            testing.append(l.ngram(l.regex(item), 1))
        else:
            wordlist = []
            for word in l.regex(item)[1].split():
                wordlist.extend(
                    l.ngram([item[0], word], 1, location, infinity)[1])
            testing.append([item[0], wordlist])
    phrases = len(testing)

    languages = ('am', 'ge', 'gu', 'ti')
    classifiers = ('CFA', 'NBC')
    grams = list(range(2, maxg + 1))

    # Divisors for the macro averages. They were hard-coded as 4, which is
    # only right for four languages and maxg == 5.
    nlang = len(languages)
    averaging = len(grams) or 1

    fscore = {c: dict.fromkeys(grams, 0) for c in classifiers}
    precision = copy.deepcopy(fscore)
    total = copy.deepcopy(fscore)
    totaltests = copy.deepcopy(fscore)
    recall = copy.deepcopy(fscore)
    accuracy = copy.deepcopy(fscore)

    averageprecision = dict.fromkeys(classifiers, 0)
    averageaccuracy = dict.fromkeys(classifiers, 0)
    averagefscore = dict.fromkeys(classifiers, 0)
    averagerecall = dict.fromkeys(classifiers, 0)
    averagetotal = dict.fromkeys(classifiers, 0)

    myrecall = {
        c: {a: dict.fromkeys(grams, 0) for a in languages}
        for c in classifiers
    }
    # Wrong classifications, e.g. Amharic classified as Guragigna.
    wrongs = {
        c: {a: {b: dict.fromkeys(grams, 0) for b in languages}
            for a in languages}
        for c in classifiers
    }

    print('Creating language dictionaries ...  \t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    l.myclassifier(testing, frequencyDict, grams, wrongs, totaltests, myrecall,
                   total, uniquengrams, totalngrams, phrases)

    print('\tPerforming classifications ...  \t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    confusion = {
        c: {
            i: {
                j: (myrecall[c][j] if i == j else wrongs[c][i][j])
                for j in languages
            }
            for i in languages
        }
        for c in classifiers
    }

    for g in grams:
        for c in classifiers:
            matrix = confusion[c]

            for i in languages:
                column = sum(matrix[j][i][g] for j in languages)
                precision[c][g] += (matrix[i][i][g] / column /
                                    nlang) if column != 0 else 0

            for x in languages:
                row = sum(matrix[x][y][g] for y in languages)
                recall[c][g] += (matrix[x][x][g] / row /
                                 nlang) if row != 0 else 0
                accuracy[c][g] += matrix[x][x][g]

            accuracy[c][g] /= total[c][g] if total[c][g] != 0 else 1

            averageaccuracy[c] += accuracy[c][g] / averaging
            averagetotal[c] += totaltests[c][g]
            averageprecision[c] += precision[c][g] / averaging
            averagerecall[c] += recall[c][g] / averaging

    for g in grams:
        for c in classifiers:
            p = precision[c][g]
            r = recall[c][g]
            fscore[c][g] = 2 * ((p * r) / (p + r)) if (p != 0
                                                       or r != 0) else 0
            averagefscore[c] += fscore[c][g] / averaging

    print(
        'Generating performance metrices - precision, recall and f-score ...  \t\t{}'
        .format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    if os.path.isfile('result.txt'):
        os.remove('result.txt')

    # '(2,3,4,5)' for the default maxg; follows maxg otherwise.
    gramlabel = '({})'.format(','.join(str(g) for g in grams))

    for c in classifiers:
        print(
            '\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes'
            .format(phraselength, averagecharacters, averagebyte))
        print('=' * 100)
        print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format(
            'Ngrams', 'Observations', 'Accuracy', 'Precision', 'Recall',
            'F-score'))
        print('-' * 100)

        for g in grams:
            print(
                '{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'
                .format(c, g, totaltests[c][g], accuracy[c][g],
                        precision[c][g], recall[c][g], fscore[c][g]))
        print('-' * 100)

        print(
            '{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'
            .format(c, gramlabel, averagetotal[c], averageaccuracy[c],
                    averageprecision[c], averagerecall[c], averagefscore[c]))
        print('-' * 100)

    print('\nGenerating clasification performance results ...  \t\t\t\t{}'.
          format(l.timer(mytime)))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    print('End    :', ended)
    print('Elapsed: {}'.format(l.timer(started)))
Пример #6
0
def classification(ct,readsampled,vocabulary,mod,frequencyDict,uniquengrams,totalngrams,phraselength=25,lines=0,maxg=5):
    """Classify the prepared test strings, print and append overall metrics.

    Parameters
    ----------
    ct : test-set number; selects the ``result/ov/<ct>/`` directory and is
        written to every result row.
    readsampled : indexable
        ``[0]`` is the test data handed to ``l.overallmyclassifier``;
        ``[1]``, ``[2]`` and ``[3]`` are converted to ``int`` and used as the
        average byte count, average character count and number of phrases.
    vocabulary, frequencyDict, uniquengrams, totalngrams
        Model data forwarded unchanged to ``l.overallmyclassifier``.
    mod : model code; names the result file ``<mod>.txt``.
    phraselength, lines : only used in the report and the result rows.
    maxg : accepted for interface compatibility; not used by this function.

    ``l.overallmyclassifier`` is expected to fill ``overallwrongs``,
    ``overalltotal`` and ``overallrecall`` in place. Precision divides each
    diagonal cell of the confusion matrix by its column total, recall by its
    row total; both are averaged over the languages. One comma-separated row
    per classifier is appended to ``result/ov/<ct>/<mod>.txt``.
    """
    testing = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])
    phrases = int(readsampled[3])

    languages = ('am', 'ge', 'gu', 'ti')
    classifiers = ('CFA', 'NBC')
    maxofg = len(languages)

    # Containers handed to the classifier, which updates them in place.
    overallrecall = {c: dict.fromkeys(languages, 0) for c in classifiers}
    overalltotal = dict.fromkeys(classifiers, 0)
    overallwrongs = {c: {a: dict.fromkeys(languages, 0) for a in languages} for c in classifiers}

    overallprecision = dict.fromkeys(classifiers, 0)
    overallaccuracy = dict.fromkeys(classifiers, 0)
    overallfscore = dict.fromkeys(classifiers, 0)
    overallconfusion = copy.deepcopy(overallwrongs)

    mytime = datetime.datetime.now()

    l.overallmyclassifier(testing,frequencyDict,overallwrongs,overalltotal,overallrecall,uniquengrams,totalngrams,phrases,vocabulary)

    print('\tPerforming classifications ...  \t\t\t\t\t{}'.format(l.timer(mytime)))

    for c in classifiers:
        for i in languages:
            for j in languages:
                if i == j:
                    overallconfusion[c][i][j] = overallrecall[c][j]
                else:
                    overallconfusion[c][i][j] = overallwrongs[c][i][j]

    meanrecall = {}
    for c in classifiers:
        matrix = overallconfusion[c]

        for i in languages:
            column = sum(matrix[j][i] for j in languages)
            overallprecision[c] += (matrix[i][i]/column) if column != 0 else 0
        overallprecision[c] /= maxofg

        # From here on overallrecall holds ratios instead of raw counts.
        for x in languages:
            row = sum(matrix[x][y] for y in languages)
            overallrecall[c][x] = (matrix[x][x]/row) if row != 0 else 0
            overallaccuracy[c] += matrix[x][x]

        # Guard against an empty test set instead of raising ZeroDivisionError.
        overallaccuracy[c] /= overalltotal[c] if overalltotal[c] != 0 else 1

        meanrecall[c] = sum(overallrecall[c].values())/maxofg
        if overallprecision[c] != 0 or meanrecall[c] != 0:
            overallfscore[c] = 2*((overallprecision[c]*meanrecall[c])/(overallprecision[c]+meanrecall[c]))
        else:
            overallfscore[c] = 0

    path1 = 'result/ov/'+str(ct)+'/'
    filename = mod+'.txt'

    # The context manager closes the result file even if formatting a row fails.
    with open(os.path.join(path1, filename),'a+') as r:
        print('\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes\tModel: {:,} lines.'.format(phraselength,averagecharacters,averagebyte,lines))
        print('='*100)
        print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format('Ngrams','Observations','Accuracy','Precision','Recall','F-score'))
        print('-'*100)
        for c in classifiers:
            row = [c, ct, mod, phraselength, overalltotal[c], overallaccuracy[c], overallprecision[c], meanrecall[c], overallfscore[c]]
            r.write(','.join(str(value) for value in row)+'\r\n')
            print('{:<3} {:<10}\t|{:,}\t\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}\t|{:10.4f}'.format(c,'(2,3,4,5)',overalltotal[c],overallaccuracy[c],overallprecision[c],meanrecall[c],overallfscore[c]))
        print('-'*100)
Пример #7
0
def main():
    """Batch driver: run every model on every test set and phrase length.

    For each of the five model types and each test set 1-10, any previous
    result file ``result/ov/<ct>/<mod>.txt`` is removed, the model is loaded
    with ``l.readmodel``, and ``classification`` is run once per phrase length
    on the strings prepared by ``l.readsample`` from ``samples/<ct>.txt``.
    Timing information is printed after each model type.
    """
    os.system('clear') # on linux
    modeltype = {1:'bl', 2:'by', 3:'fl', 4:'in', 5:'il'}
    # model code -> (wordbased, location, infinity)
    modelflags = {'bl': (1, 0, 0), 'by': (0, 0, 0), 'fl': (1, 1, 0), 'in': (1, 0, 1), 'il': (1, 1, 1)}
    phrases = [1,2,3,4,5,10,15,20,25]

    print('\n\n{}'.format('='*100))
    print('AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - OVERALL TESTER'.center(100,' '))
    print('-'*100)

    # The former while/for/break construct always made exactly one pass over
    # models 1-5; a plain loop states that directly.
    for modelselector in range(1,6):
        mod = modeltype[modelselector]
        wordbased, location, infinity = modelflags[mod]

        os.system('clear') # on linux
        started = datetime.datetime.now()

        for ct in range(1,11):
            lang = dict(am={},ge={},gu={},ti={})
            mylang = dict(am=0,ge=0,gu=0,ti=0)
            totalngrams = copy.deepcopy(mylang)
            frequencyDict = copy.deepcopy(lang)
            uniquengrams = copy.deepcopy(mylang)

            path1 = 'result/ov/'+str(ct)+'/'
            filename = mod+'.txt'
            resultfile = os.path.join(path1,filename)
            # classification() appends to this file, so start from a clean one.
            if os.path.isfile(resultfile):
                os.remove(resultfile)

            path = 'models/'+str(ct)+'/'
            print('-'*100)
            print('\nFiles {}{} located and opened ...  \t\t\t{}'.format(path,filename,l.timer(started)))
            params = l.readmodel(path,mod,frequencyDict,totalngrams,uniquengrams)
            frequencyDict = params[0]
            uniquengrams = params[1]
            totalngrams = params[2]
            maxg = params[3]
            vocabulary = params[4]
            lines = params[5]

            # The sample file does not change between phrase lengths: read it once.
            with open(os.path.join('samples/',str(ct)+'.txt'),'r') as s:
                sample = s.readlines()

            for phraselength in phrases:
                # A fresh list per call, as before, in case readsample modifies its input.
                readsampled = l.readsample(list(sample),phraselength,wordbased,location,infinity)
                classification(ct,readsampled,vocabulary,mod,frequencyDict,uniquengrams,totalngrams,phraselength,lines,maxg)

        print('\nStarted:', started)
        ended = datetime.datetime.now()
        print('End    :', ended)
        print('Elapsed: {}'.format(l.timer(started)))
Пример #8
0
def classification(ct,
                   readsampled,
                   vocabulary,
                   frequencyDict,
                   uniquengrams,
                   totalngrams,
                   phraselength=25,
                   wordbased=0,
                   location=0,
                   infinity=0,
                   maxg=5,
                   lines=0):
    """Classify the sampled test strings and print quality tables.

    Zeroed counter dictionaries are handed to ``l.myclassifier`` together
    with the test strings and the model data. Its return value is ignored
    and the counters are read afterwards, so the classifier presumably
    fills them in place (confirm against ``l.myclassifier``). From the
    counters a confusion matrix is assembled for each of the two classifiers
    ('CFA' and 'NBC'); accuracy, macro-averaged precision, macro-averaged
    recall and the F-score derived from those two are then computed for
    every n-gram size from 2 to ``maxg`` and printed, followed by one row
    averaged over all sizes.

    Args:
        ct: Not used by this function; kept for call compatibility.
        readsampled: Indexable value. Item 0 is passed to the classifier as
            the test set, items 1 and 2 are printed as the average byte and
            character length of the test strings, item 3 is passed to the
            classifier as the phrase count.
        vocabulary: Forwarded unchanged to ``l.myclassifier``.
        frequencyDict: Forwarded unchanged to ``l.myclassifier``.
        uniquengrams: Forwarded unchanged to ``l.myclassifier``.
        totalngrams: Forwarded unchanged to ``l.myclassifier``.
        phraselength: Only printed in the table header.
        wordbased: Not used by this function; kept for call compatibility.
        location: Not used by this function; kept for call compatibility.
        infinity: Not used by this function; kept for call compatibility.
        maxg: Largest n-gram size to evaluate (sizes start at 2).
        lines: Only printed in the table header as the model's line count.

    Returns:
        None. All results are written to standard output.
    """
    started = datetime.datetime.now()

    testing = readsampled[0]
    averagebyte = int(readsampled[1])
    averagecharacters = int(readsampled[2])
    phrases = int(readsampled[3])

    print('-' * 100)
    # These three progress lines time nothing (the clock is restarted right
    # before each one); they are kept so the console output keeps its shape.
    mytime = datetime.datetime.now()
    print('Opening relevant files ...  \t\t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()
    print('Reading language models ...  \t\t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()
    print('Reading test strings ...  \t\t\t\t\t\t\t{}'.format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    grams = list(range(2, maxg + 1))
    languages = ('am', 'ge', 'gu', 'ti')
    classifiers = ('CFA', 'NBC')
    maxofg = len(languages)
    averaging = len(grams)

    def per_gram():
        # classifier -> n-gram size -> 0
        return {name: {g: 0 for g in grams} for name in classifiers}

    def per_language():
        # language -> n-gram size -> 0
        return {code: {g: 0 for g in grams} for code in languages}

    total = per_gram()
    totaltests = per_gram()
    precision = per_gram()
    recall = per_gram()
    accuracy = per_gram()
    fscore = per_gram()

    # classifier -> language -> n-gram size -> count
    myrecall = {name: per_language() for name in classifiers}
    # classifier -> language -> language -> n-gram size -> count
    # NOTE(review): the metric code below treats the first language key as
    # the row and the second as the column; presumably row = actual and
    # column = predicted -- confirm against l.myclassifier.
    wrongs = {
        name: {code: per_language() for code in languages}
        for name in classifiers
    }

    averageprecision = {name: 0 for name in classifiers}
    averageaccuracy = {name: 0 for name in classifiers}
    averagefscore = {name: 0 for name in classifiers}
    averagerecall = {name: 0 for name in classifiers}
    averagetotal = {name: 0 for name in classifiers}

    print('Creating language dictionaries ...  \t\t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    l.myclassifier(testing, frequencyDict, grams, wrongs, totaltests, myrecall,
                   total, uniquengrams, totalngrams, phrases, vocabulary)

    print('\tPerforming classifications ...  \t\t\t\t\t{}'.format(
        l.timer(mytime)))
    mytime = datetime.datetime.now()

    # Confusion matrix: the diagonal comes from `myrecall`, every other cell
    # from `wrongs`.
    confusion = {
        name: {
            row: {
                col: (myrecall[name][col]
                      if row == col else wrongs[name][row][col])
                for col in languages
            }
            for row in languages
        }
        for name in classifiers
    }

    for g in grams:
        for name in classifiers:
            matrix = confusion[name]
            correct = 0
            for code in languages:
                hit = matrix[code][code][g]
                column = 0
                row = 0
                for other in languages:
                    column += matrix[other][code][g]
                    row += matrix[code][other][g]
                # Each language contributes an equal share (1 / maxofg) to
                # the precision and recall of this n-gram size.
                precision[name][g] += (hit / column /
                                       maxofg) if column != 0 else 0
                recall[name][g] += (hit / row / maxofg) if row != 0 else 0
                correct += hit

            divisor = total[name][g] if total[name][g] != 0 else 1
            accuracy[name][g] = correct / divisor

            p = precision[name][g]
            r = recall[name][g]
            # Guard the harmonic mean against a zero denominator.
            fscore[name][g] = 2 * ((p * r) / (p + r)) if (p + r) != 0 else 0

            averageaccuracy[name] += accuracy[name][g] / averaging
            averagetotal[name] += totaltests[name][g]
            averageprecision[name] += p / averaging
            averagerecall[name] += r / averaging
            averagefscore[name] += fscore[name][g] / averaging

    print(
        'Generating performance metrices - precision, recall and f-score ...  \t\t{}'
        .format(l.timer(mytime)))
    mytime = datetime.datetime.now()

    # Label of the averaged row lists the sizes actually evaluated, e.g.
    # '(2,3,4,5)' for maxg=5.
    summarylabel = '({})'.format(','.join(str(g) for g in grams))
    rowformat = ('{:<3} {:<10}\t|{:,}\t\t|{:10.2f}%\t|{:10.2f}%'
                 '\t|{:10.2f}%\t|{:10.2f}%')

    for name in classifiers:
        print(
            '\nAverage length of test strings: {:,} word(s) / {:,} character(s) / {:,} bytes\tModel: {:,} lines.'
            .format(phraselength, averagecharacters, averagebyte, lines))
        print('=' * 100)
        print('{:<16}|{:<15}|{:<15}|{:<15}|{:<15}|{:<15}'.format(
            'Ngrams', 'Observations', 'Accuracy', 'Precision', 'Recall',
            'F-score'))
        print('-' * 100)

        for g in grams:
            print(
                rowformat.format(name, g, totaltests[name][g],
                                 accuracy[name][g] * 100,
                                 precision[name][g] * 100,
                                 recall[name][g] * 100,
                                 fscore[name][g] * 100))
        print('-' * 100)

        print(
            rowformat.format(name, summarylabel, averagetotal[name],
                             averageaccuracy[name] * 100,
                             averageprecision[name] * 100,
                             averagerecall[name] * 100,
                             averagefscore[name] * 100))
        print('-' * 100)

    print('\nGenerating clasification performance results ...  \t\t\t\t{}'.
          format(l.timer(mytime)))
    print('\nStarted:', started)
    ended = datetime.datetime.now()
    print('End    :', ended)
    print('Elapsed: {}'.format(l.timer(started)))
Пример #9
0
def clearCommons():
    """Remove words shared between languages from the corpus files.

    Every ``*.txt`` file in ``corpus/cr/300`` is read as UTF-8 and cleaned
    with ``l.regex``; the first two characters of the file name are taken as
    the language code. A word that occurs in the word sets of at least two
    language codes is treated as common. Each file's cleaned text, minus the
    common words, is written under the same file name to ``corpus/cc/300``
    (replacing any existing file), and the set of common words is written to
    ``commons.txt`` in the working directory.

    Returns:
        None. If a corpus file cannot be opened, an error line is printed
        and the function returns before anything is written.
    """
    os.system('clear')
    print('\n\n{}'.format('=' * 100))
    print('AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - COMMONS REMOVER'.center(100, ' '))
    print('-' * 100)

    print("\nLoading corpus files to memory ... ")
    path = 'corpus/cr/300'
    path1 = 'corpus/cc/300'

    started = datetime.datetime.now()
    language = dict(am='Amharic', ge='Geez', gu='Guragigna', ti='Tigrigna')

    content = {}        # language code -> set of distinct cleaned words
    cleaned = {}        # file name -> cleaned words in original order
    vocabulary = set()  # every distinct word seen in any file

    for infile in glob.glob(os.path.join(path, '*.txt')):
        filename = os.path.basename(infile)
        lang = filename[:2]

        try:
            with open(infile, 'r', encoding='utf8') as f:
                rawtext = [lang, f.read()]
        except IOError:
            print('Error: Can not open the file: ', lang)
            return

        print('-' * 100)
        print('\nOpening relevant files ...  \t\t\t\t\t\t{}'.format(l.timer(started)))

        # Clean once and reuse the result for both the word set and the
        # rewrite step below.
        words = l.regex(rawtext)[1].split()
        cleaned[filename] = words
        content[lang] = set(words)
        vocabulary.update(content[lang])

        # Fall back to the raw code for prefixes without a display name.
        print('{} - Completed building relevant dictionaries for {} language'.format(
            datetime.datetime.now(), language.get(lang, lang)))

    r = len(vocabulary)
    w = sum(len(words) for words in content.values())

    print('-' * 100)
    print('{} - Matching {:,} vocabulary items to {:,} ngrams in all language'.format(datetime.datetime.now(), r, w))

    # A word is common when it was already seen in an earlier language's
    # set; this equals the union of all pairwise intersections.
    commons = set()
    seen = set()
    for words in content.values():
        commons.update(seen & words)
        seen.update(words)

    for filename, words in cleaned.items():
        cleared = ' '.join(word for word in words if word not in commons)
        # Mode 'w' truncates, so an existing output file is replaced.
        with open(os.path.join(path1, filename), 'w', encoding='utf8') as c:
            c.write(cleared)

    print('\nA total of {} common terms in listed in commons.txt are removed from the corpus '.format(len(commons)))

    with open('commons.txt', 'w', encoding='utf8') as s:
        s.write(str(commons))

    print('\nStarted:', started)
    ended = datetime.datetime.now()
    elapsed = ended - started
    print('End    :', ended)
    print('Elapsed:', elapsed)
Пример #10
0
def main():
    """Run the n-gram tester for the 'in' and 'il' models on folds 1-10.

    For each selected model and each fold ``ct`` in 1..10 the model is
    loaded from ``models/<ct>/`` with ``l.readmodel``, the sample file
    ``samples/<ct>.txt`` is read, and ``classification`` is called once per
    phrase length in 1, 2, 3, 4, 5, 10, 15, 20, 25. A result file
    ``result/av/<ct>/<model>.txt`` left over from an earlier run is deleted
    first. Start, end and elapsed time are printed after each model.

    Returns:
        None.
    """
    os.system('clear')

    modeltype = {
        1: 'bl',
        2: 'by',
        3: 'fl',
        4: 'in',
        5: 'il'
    }
    # model code -> (wordbased, location, infinity); codes not listed here
    # (e.g. 'by') run with all three switched off.
    modelflags = {
        'bl': (1, 0, 0),
        'fl': (1, 1, 0),
        'in': (1, 0, 1),
        'il': (1, 1, 1),
    }
    phrases = [1, 2, 3, 4, 5, 10, 15, 20, 25]

    print('\n\n{}'.format('=' * 100))
    print(
        'AUTHOMATIC LANGUAGE IDENTIFIER USING CUMMULATIVE FREQUENCY ADDITION - N-GRAMS TESTER'
        .center(100, ' '))
    print('-' * 100)

    # Only selectors 4 and 5 are run.
    for modelselector in range(4, 6):
        mod = modeltype[modelselector]
        wordbased, location, infinity = modelflags.get(mod, (0, 0, 0))

        os.system('clear')  # on linux
        started = datetime.datetime.now()

        for ct in range(1, 11):
            # Fresh, empty containers for l.readmodel on every fold.
            frequencyDict = dict(am={}, ge={}, gu={}, ti={})
            totalngrams = dict(am=0, ge=0, gu=0, ti=0)
            uniquengrams = dict(am=0, ge=0, gu=0, ti=0)

            path1 = 'result/av/' + str(ct)
            filename = mod + '.txt'
            stale = os.path.join(path1, filename)
            if os.path.isfile(stale):
                os.remove(stale)

            path = 'models/' + str(ct) + '/'
            print('-' * 100)
            print('\nFiles {}{} located and opened ...  \t\t\t{}'.format(
                path, filename, l.timer(started)))

            params = l.readmodel(path, mod, frequencyDict, totalngrams,
                                 uniquengrams)
            frequencyDict = params[0]
            uniquengrams = params[1]
            totalngrams = params[2]
            maxg = params[3]
            vocabulary = params[4]
            lines = params[5]

            # The sample file does not depend on the phrase length, so it
            # is read once per fold.
            with open(os.path.join('samples/', str(ct) + '.txt'), 'r') as s:
                sample = s.readlines()

            for phraselength in phrases:
                # Hand over a copy so each phrase length starts from the
                # same untouched list of lines.
                readsampled = l.readsample(list(sample), phraselength,
                                           wordbased, location, infinity)
                # classification() has no model-name parameter; the
                # optional settings are passed by keyword so they cannot
                # be shifted out of position.
                classification(ct, readsampled, vocabulary, frequencyDict,
                               uniquengrams, totalngrams,
                               phraselength=phraselength,
                               wordbased=wordbased,
                               location=location,
                               infinity=infinity,
                               maxg=maxg,
                               lines=lines)

        print('\nStarted:', started)
        ended = datetime.datetime.now()
        print('End    :', ended)
        print('Elapsed: {}'.format(l.timer(started)))