示例#1
0
def test_language_detector():
    """Detected languages for the fixture input must match the expected output."""
    detector = LanguageDetector(minimum_score=0.2, fallback_language="es")
    # Sort so row order never affects the column-wise comparison below.
    result_df = detector.detect_languages_df(INPUT_DF, "input_text")
    result_df = result_df.sort_values(by=["input_text"])
    for column in result_df.columns:
        np.testing.assert_array_equal(
            result_df[column].values, OUTPUT_DF[column].values)
class LanguageDetectorTests(unittest.TestCase):

    def setUp(self):
        self.detector = LanguageDetector()

    def test_language_not_detected(self):
        self.assertEqual('Language not detected', self.detector.guess_language(''))

    def test_dutch_detected(self):
        message = "de het en een"
        self.assertEqual('Dutch', self.detector.guess_language(message))

    def test_english_detected(self):
        message = "the it and a"
        self.assertEqual('English', self.detector.guess_language(message))

    def test_german_detected(self):
        message = "der die das und"
        self.assertEqual('German', self.detector.guess_language(message))

    def tearDown(self):
        print "Finished {0}".format(self._testMethodName)
                       presidenteSoloPostId_List[x] + '.json')
 sys.stdout = io.TextIOWrapper(sys.stdout.detach(), sys.stdout.encoding,
                               'backslashreplace')
 totalPositivos = 0
 totalComentarios = 0
 for data_json in datas_json:
     comentarios = []
     with open(
             data_json,
             mode='r',
             encoding='utf-8',
     ) as file:
         lector = json.load(file)
         for x in range(0, len(lector)):
             comentarios.append(lector[x]['message'])
     Id = LanguageDetector()
     # comentarios = [text for text in comentarios if Id.detect(text) == 'es']
     # for text in comentarios:
     #	print('{}: {}'.format(Id.detect(text), text))
     if len(comentarios) > 0:
         lista = cm.predict(comentarios, params)
         publiNoVacias += 1
     else:
         publiVacias += 1
         #todo with open(postIdsVacios )
     print(lista)
     comentariosPositivos = 0
     total = len(lista)
     totalComentarios += total
     print(total)
     for index in range(0, len(lista)):
示例#4
0
# Train a sentiment model on the Spanish corpus, then score the comments in
# each JSON dump and print approval / disapproval percentages per file.
ch = CorpusHelper(language='spanish')
ch.load()
cm = CorpusModel(corpus=ch)
params = cm.fit()
print('Our model has an AUC of {}'.format(cm.x_validation(params)))

# One JSON file per batch of comments; the loop re-runs the whole pipeline
# for each file. (Translated from the original Spanish comments.)
datas_json = ['data{}.json'.format(i) for i in range(1, 11)]
for data_json in datas_json:
    comentarios = []
    with open(data_json, mode='r', encoding='utf-8') as file:
        lector = json.load(file)  # list of dicts; each has a 'message' field
        for entrada in lector:
            comentarios.append(entrada['message'])
    Id = LanguageDetector()
    # Keep only comments the detector classifies as Spanish before predicting.
    comentarios = [text for text in comentarios if Id.detect(text) == 'es']
    lista = cm.predict(comentarios, params)
    print(lista)
    total = len(lista)
    if total == 0:
        # Fix: original divided by `total` unconditionally, crashing with
        # ZeroDivisionError when a file yields no Spanish comments.
        continue
    # Predictions are 0/1 flags, so the positive count is just their sum.
    comentariosPositivos = sum(lista)
    porcentajePositivo = ((comentariosPositivos / total) * 100)
    print(porcentajePositivo)
    print("% aprobación")
    comentariosNegativos = total - comentariosPositivos
    porcentajeNegativo = ((comentariosNegativos / total) * 100)
    print(porcentajeNegativo)
    print("% de reprobación")
示例#5
0
# -*- coding: utf-8 -*-
"""Language Detection recipe script"""

from plugin_config_loading import load_plugin_config_langdetect
from language_detector import LanguageDetector
from dku_io_utils import process_dataset_chunks, set_column_descriptions

# Setup: load the recipe configuration once and build the detector from it.
config = load_plugin_config_langdetect()
language_detector = LanguageDetector(
    language_scope=config["language_scope"],
    minimum_score=config["minimum_score"],
    fallback_language=config["fallback_language"],
)

# Run: detect languages chunk by chunk, then describe the output columns.
process_dataset_chunks(
    input_dataset=config["input_dataset"],
    output_dataset=config["output_dataset"],
    text_column=config["text_column"],
    func=language_detector.detect_languages_df,
)
set_column_descriptions(
    input_dataset=config["input_dataset"],
    output_dataset=config["output_dataset"],
    column_descriptions=language_detector.column_descriptions,
)
示例#6
0
# -*- coding: utf-8 -*-

import sys
from language_detector import LanguageDetector

# Build the detector with the n-gram size given on the command line, train it,
# then score user input interactively, printing the top-5 candidate languages.
language_detector = LanguageDetector(ngrams_max=int(sys.argv[1]), data_dir="../data")
language_detector.process()

while True:
    text = input("\nPlease enter the text: ")
    ranking = language_detector.detect_language(text)
    for rank in range(5):
        candidate = ranking[rank]
        print(rank + 1, candidate[0], candidate[1])
示例#7
0
from corpus import CorpusHelper, CorpusModel

from language_detector import LanguageDetector

if __name__ == '__main__':
    # Train the sentiment model on the Spanish corpus and report validation.
    helper = CorpusHelper(language='spanish')
    helper.load()
    model = CorpusModel(corpus=helper)
    fitted = model.fit()
    print(model.x_validation(fitted))

    texts = [
        'El candidato es un ladrón y un mentiroso',
        '@AlgoMortal Muchas felicidades que lo pases muy bien :)',
        '@eslatarde @PPopular En una palabra, INSULTANTE!!!'
    ]
    # Score only the texts the detector identifies as Spanish.
    detector = LanguageDetector()
    spanish_only = [text for text in texts if detector.detect(text) == 'es']
    print(model.predict(spanish_only, fitted))
 def setUp(self):
     # unittest fixture hook: a fresh LanguageDetector per test keeps tests independent.
     self.detector = LanguageDetector()