# Example no. 1 (score: 0)
def crossValidationLidstone(dataSet):
    """Cross-validate the Lidstone language classifier on every 5th sample.

    Each selected sample is held out as the test item, the classifier is
    trained on all remaining samples, and the mean misclassification rate
    over the tested samples is returned.

    Args:
        dataSet: list of (text, language_label) pairs, where the label is
            one of 'en', 'es', 'fr', 'pt'.

    Returns:
        float: fraction of tested samples that were misclassified
            (0.0 for an empty data set).

    Raises:
        KeyError: if a sample carries a label other than en/es/fr/pt
            (the original code hit a NameError in that case).
    """
    # Map language labels to the class indices the classifier returns.
    labelToIndex = {'en': 0, 'es': 1, 'fr': 2, 'pt': 3}

    errors = 0.0
    tested = 0
    step = 5  # evaluate every 5th sample to keep runtime manageable

    for idx in range(0, len(dataSet), step):
        # Work on a copy: the original code popped from dataSet itself,
        # permanently shrinking it between iterations and shifting every
        # subsequent index.
        trainSet = list(dataSet)
        testSample = trainSet.pop(idx)

        # Train on the remaining samples only (held-out item excluded).
        allTexts = formatDataset(getAllLanguagesSet(trainSet))

        t0 = time.time()
        predictedLang = llc.lidstoneLanguageClassification(testSample[0], allTexts)
        print(time.time() - t0)

        expected = labelToIndex[testSample[1]]

        tested += 1
        if predictedLang != expected:
            errors += 1
        print('error' + str(errors))

    # Explicit counter replaces the fragile post-loop reconstruction
    # `(iter - step)/step + 1`; guard against an empty data set.
    meanError = errors / tested if tested else 0.0
    print(meanError)
    return meanError
# Example no. 2 (score: 0)
#sentence = 'It is known for being the first to print many English manuscripts, including Cotton Nero A.x, which contains Pearl, Sir Gawain and the Green Knight, and other poems.'
# sentence = 'O portugues foi usado, naquela epoca,'
# sentence = "una frase en espanol, es una prueba de que el programa funcione"
# sentence = 'La France metropolitaine possede une grande variete de paysages, entre des plaines agricoles ou boisees, des chaines de montagnes plus ou moins erodees, des littoraux diversifies et des vallees melant villes et espaces neo-naturels.'
# sentence = 'today i will go home with my brother and sister because i like it, the mountain is a thing in english'

# Build the labelled data set from one tweet file per language; the argument
# order presumably fixes the class indices (en=0, es=1, fr=2, pt=3) — matches
# the label mapping used by the cross-validation code above; verify in utils.
dataSet = utils.createDataSet("datasets/en_tweets.txt","datasets/es_tweets.txt","datasets/fr_tweets.txt","datasets/pt_tweets.txt")


# Collect and normalise the training texts once, up front.
allTexts = utils.getAllLanguagesSet(dataSet)
allTexts = utils.formatDataset(allTexts)

# The sentence to classify comes from the command line and is cleaned the
# same way the training tweets were.
sentence = sys.argv[1]
sentence = utils.cleanTweets(sentence)

# First prediction: Lidstone-smoothed language model.
language = llc.lidstoneLanguageClassification(sentence, allTexts)

# Second prediction: out-of-place ranking measure, voted across n-gram
# sizes 2..4.  m and n look like ranking-list cut-offs for the measure —
# TODO confirm against rmc.outofplaceMeasureSet.
predictedLabel = list()
m = 80
n = 100

for nGramSize in xrange(2,5):
    # allTexts = utils.getAllLanguagesSet(allTexts)
    # Rank n-gram frequencies per language, score the sentence against each
    # language's ranking, and record the index of the best-scoring language.
    allFreq = utils.returnNgramFreqSetRanking(allTexts,nGramSize)
    probList = rmc.outofplaceMeasureSet(m,n,allFreq,sentence,nGramSize)
    predictedLabel.append(probList.index(max(probList)))

# Majority vote: keep any language index predicted by more than one n-gram
# size.  NOTE(review): the initial `k=[]` is immediately overwritten, and
# the comprehension's `k` shadows the outer `k`.
k=[]
k = [k for k,v in Counter(predictedLabel).items() if v>1]

# No n-gram-size majority — handling continues past this chunk.
if not k: