예제 #1
0
def test_xgb_lang_sentence():
    """Check that predicting on one Malay sentence yields the label 'MALAY'."""
    sentence = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
    detector = malaya.xgb_detect_languages()
    predicted_label = detector.predict(sentence)
    assert predicted_label == 'MALAY'
예제 #2
0
def test_xgb_lang_sentences_proba():
    """Check that batch prediction with probabilities scores 'MALAY' above zero.

    The same Malay sentence is submitted twice; only the first result of the
    batch is inspected.
    """
    sentence = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
    detector = malaya.xgb_detect_languages()
    batch_results = detector.predict_batch([sentence, sentence], get_proba=True)
    first_result = batch_results[0]
    assert first_result['MALAY'] > 0
예제 #3
0
import json
import logging
import re
import threading
import time
import urllib.request
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import quote

import malaya
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from fake_useragent import UserAgent
from newspaper import Article
from unidecode import unidecode

# Language detector obtained from malaya once, at import time.
xgb_language = malaya.xgb_detect_languages()

# Module-level counter, initialised to zero.
# NOTE(review): presumably incremented for each request sent to Google News —
# confirm against the code that uses it.
NUMBER_OF_CALLS_TO_GOOGLE_NEWS_ENDPOINT = 0

# Google News search URL template with four positional `{}` slots, in order:
# the `q` value, the `cd_min` and `cd_max` values embedded in `tbs`, and the
# `start` value.
# NOTE(review): cd_min/cd_max look like date-range bounds and `start` like a
# result offset — confirm against the caller that formats this URL.
GOOGLE_NEWS_URL = 'https://www.google.com.my/search?q={}&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{}%2Ccd_max%3A{}&tbm=nws&start={}'

# Configure logging at DEBUG level; each record is formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def get_date(load):
    """Rearrange the first three numbers found in ``load`` into one string.

    Numeric tokens are pulled out of ``load`` with a regular expression and
    joined as ``'<third>-<first>-<second>'``, so ``'12/25/2018'`` becomes
    ``'2018-12-25'``.

    Note that the pattern accepts an optional leading sign, so a ``-`` used
    as a separator is captured as part of the following number
    (``'12-25-2018'`` yields ``'-2018-12--25'``).

    Args:
        load: Text to scan for numbers.

    Returns:
        The rearranged string, or ``False`` when ``load`` is not text or
        contains fewer than three numeric tokens.
    """
    # Raw string so the regex escapes (\d, \.) are not treated as (invalid)
    # string escape sequences.
    number_pattern = r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
    try:
        numbers = re.findall(number_pattern, load)
        return '-'.join((numbers[2], numbers[0], numbers[1]))
    except (IndexError, TypeError):
        # IndexError: fewer than three numbers were found.
        # TypeError: ``load`` is not a string (e.g. None).
        return False

def run_parallel_in_threads(target, args_list):