def test_xgb_lang_sentence():
    """Check that the XGB language detector labels a Malay sentence as 'MALAY'.

    Builds the detector through ``malaya.xgb_detect_languages()`` and runs
    a single-string ``predict`` call on one Malay sentence.
    """
    sentence = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
    detector = malaya.xgb_detect_languages()
    predicted_label = detector.predict(sentence)
    assert predicted_label == 'MALAY'
def test_xgb_lang_sentences_proba():
    """Check the batch prediction path with ``get_proba=True``.

    The same Malay sentence is submitted twice through ``predict_batch``;
    the 'MALAY' entry of the first result must be strictly positive.
    """
    sentence = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
    detector = malaya.xgb_detect_languages()
    batch = [sentence, sentence]
    results = detector.predict_batch(batch, get_proba=True)
    first_result = results[0]
    assert first_result['MALAY'] > 0
import time import requests from bs4 import BeautifulSoup import json from fake_useragent import UserAgent from newspaper import Article import threading from datetime import datetime, timedelta from dateutil import parser from queue import Queue from urllib.parse import quote import urllib.request from unidecode import unidecode import malaya xgb_language = malaya.xgb_detect_languages() NUMBER_OF_CALLS_TO_GOOGLE_NEWS_ENDPOINT = 0 GOOGLE_NEWS_URL = 'https://www.google.com.my/search?q={}&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{}%2Ccd_max%3A{}&tbm=nws&start={}' logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') def get_date(load): try: date = re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", load) return '%s-%s-%s'%(date[2],date[0],date[1]) except Exce: return False def run_parallel_in_threads(target, args_list):