def test_language_detector():
    """Detection output on INPUT_DF must match the reference OUTPUT_DF column by column."""
    detector = LanguageDetector(minimum_score=0.2, fallback_language="es")
    result = detector.detect_languages_df(INPUT_DF, "input_text")
    result = result.sort_values(by=["input_text"])
    for column in result.columns:
        np.testing.assert_array_equal(result[column].values, OUTPUT_DF[column].values)
class LanguageDetectorTests(unittest.TestCase):
    """Unit tests for LanguageDetector.guess_language on tiny stop-word samples."""

    def setUp(self):
        # Fresh detector for every test method.
        self.detector = LanguageDetector()

    def test_language_not_detected(self):
        # Empty input yields the sentinel string, not an exception.
        self.assertEqual('Language not detected', self.detector.guess_language(''))

    def test_dutch_detected(self):
        message = "de het en een"
        self.assertEqual('Dutch', self.detector.guess_language(message))

    def test_english_detected(self):
        message = "the it and a"
        self.assertEqual('English', self.detector.guess_language(message))

    def test_german_detected(self):
        message = "der die das und"
        self.assertEqual('German', self.detector.guess_language(message))

    def tearDown(self):
        # Fixed: the original used the Python 2 print statement
        # (`print "..."`), which is a SyntaxError under Python 3.
        print("Finished {0}".format(self._testMethodName))
presidenteSoloPostId_List[x] + '.json') sys.stdout = io.TextIOWrapper(sys.stdout.detach(), sys.stdout.encoding, 'backslashreplace') totalPositivos = 0 totalComentarios = 0 for data_json in datas_json: comentarios = [] with open( data_json, mode='r', encoding='utf-8', ) as file: lector = json.load(file) for x in range(0, len(lector)): comentarios.append(lector[x]['message']) Id = LanguageDetector() # comentarios = [text for text in comentarios if Id.detect(text) == 'es'] # for text in comentarios: # print('{}: {}'.format(Id.detect(text), text)) if len(comentarios) > 0: lista = cm.predict(comentarios, params) publiNoVacias += 1 else: publiVacias += 1 #todo with open(postIdsVacios ) print(lista) comentariosPositivos = 0 total = len(lista) totalComentarios += total print(total) for index in range(0, len(lista)):
# Fit the sentiment model on the Spanish corpus and report cross-validation AUC.
ch = CorpusHelper(language='spanish')
ch.load()
cm = CorpusModel(corpus=ch)
params = cm.fit()
print('Our model has an AUC of {}'.format(cm.x_validation(params)))

# One JSON file per candidate/post; extend this list to score more posts.
datas_json = ['data1.json', 'data2.json', 'data3.json', 'data4.json', 'data5.json',
              'data6.json', 'data7.json', 'data8.json', 'data9.json', 'data10.json']

for data_json in datas_json:
    with open(data_json, mode='r', encoding='utf-8') as file:
        lector = json.load(file)  # assumes a list of records with a 'message' field — TODO confirm
    # Collect the raw comment texts.
    comentarios = [registro['message'] for registro in lector]
    # Keep only comments the detector classifies as Spanish.
    Id = LanguageDetector()
    comentarios = [text for text in comentarios if Id.detect(text) == 'es']
    lista = cm.predict(comentarios, params)  # presumably one 0/1 label per comment — verify
    print(lista)
    total = len(lista)
    if total == 0:
        # Fixed: the original divided by `total` unconditionally, raising
        # ZeroDivisionError whenever no Spanish comments survived the filter.
        print('No Spanish comments found in {}'.format(data_json))
        continue
    comentariosPositivos = sum(lista)
    porcentajePositivo = (comentariosPositivos / total) * 100
    print(porcentajePositivo)
    print("% aprobación")
    comentariosNegativos = total - comentariosPositivos
    porcentajeNegativo = (comentariosNegativos / total) * 100
    print(porcentajeNegativo)
    print("% de reprobación")
# -*- coding: utf-8 -*-
"""Language Detection recipe script"""
from plugin_config_loading import load_plugin_config_langdetect
from language_detector import LanguageDetector
from dku_io_utils import process_dataset_chunks, set_column_descriptions

# Load the user-facing recipe parameters from the plugin configuration.
config = load_plugin_config_langdetect()

# Build the detector from the configured scope, threshold and fallback.
detector = LanguageDetector(
    language_scope=config["language_scope"],
    minimum_score=config["minimum_score"],
    fallback_language=config["fallback_language"],
)

# Stream the input dataset through the detector, chunk by chunk.
process_dataset_chunks(
    input_dataset=config["input_dataset"],
    output_dataset=config["output_dataset"],
    text_column=config["text_column"],
    func=detector.detect_languages_df,
)

# Attach human-readable descriptions to the columns the detector added.
set_column_descriptions(
    input_dataset=config["input_dataset"],
    output_dataset=config["output_dataset"],
    column_descriptions=detector.column_descriptions,
)
# -*- coding: utf-8 -*-
"""Interactive language-detection prompt.

Usage: python <script> <ngrams_max>
Reads text from stdin in a loop and prints the top five detected languages.
"""
import sys

from language_detector import LanguageDetector

# Fixed: the original read sys.argv[1] unguarded, crashing with an opaque
# IndexError/ValueError when the argument was missing or non-numeric.
try:
    ngrams_max = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: {} <ngrams_max>".format(sys.argv[0]))

ld = LanguageDetector(ngrams_max=ngrams_max, data_dir="../data")
ld.process()

while True:
    var = input("\nPlease enter the text: ")
    results = ld.detect_language(var)
    # Print the top five (language, score) candidates, 1-indexed; unlike the
    # original fixed range(5), this tolerates fewer than five results.
    for rank, (language, score) in enumerate(results[:5], start=1):
        print(rank, language, score)
from corpus import CorpusHelper, CorpusModel
from language_detector import LanguageDetector

if __name__ == '__main__':
    # Fit the model on the Spanish corpus and report cross-validation.
    helper = CorpusHelper(language='spanish')
    helper.load()
    model = CorpusModel(corpus=helper)
    fitted_params = model.fit()
    print(model.x_validation(fitted_params))

    sample_texts = [
        'El candidato es un ladrón y un mentiroso',
        '@AlgoMortal Muchas felicidades que lo pases muy bien :)',
        '@eslatarde @PPopular En una palabra, INSULTANTE!!!'
    ]
    # Keep only the texts the detector classifies as Spanish, then score them.
    detector = LanguageDetector()
    spanish_texts = [text for text in sample_texts if detector.detect(text) == 'es']
    print(model.predict(spanish_texts, fitted_params))
def setUp(self):
    """Create a fresh LanguageDetector instance before each test."""
    self.detector = LanguageDetector()