def __init__(self):
    """Load the DeepSpeech model and prepare working paths.

    NOTE(review): the model path is a hard-coded absolute local path --
    consider making it configurable.
    """
    # Directory containing this source file; used to anchor the temp wav path.
    self.file_path = Path(__file__).parent
    # DeepSpeech 0.6.x acoustic model; aBeamWidth is the decoder beam width.
    self.model = Model('/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm', aBeamWidth=500)
    # Sample rate the model expects; input audio presumably must be
    # resampled to match -- confirm at the call sites.
    self.desired_sample_rate = self.model.sampleRate()
    self.logger = getLogger(self.__module__)
    # Scratch wav file for intermediate audio conversions.
    self.tmp_path = self.file_path / 'tmp.wav'
# NOTE(review): the lines below are the tail of a function whose `def` is
# outside this chunk; `m` (a regex match), `url`, and `cli` (a redis-like
# client) are defined there. Indentation reconstructed as one level.
    if m is None:
        print(url)
        return None
    # Non-empty path segments; m.group(2) presumably holds the URL path --
    # TODO confirm against the regex defined above this chunk.
    path_parts = [p for p in m.group(2).split("/") if p != '']
    if len(path_parts) <= 1:
        print(url)
        return None
    # Label = all path segments except the last, joined with underscores.
    label = "_".join(path_parts[:-1])
    cli.set(f"{label}|{url}", 1)


if __name__ == '__main__':
    logger = getLogger("cnn")
    # NOTE(review): fw is never closed or flushed explicitly -- relies on
    # interpreter exit; consider wrapping in `with`.
    fw = open("/hdd/crawl_result/english_classification/crawl_edition.cnn.com_01.json_extract.json.json", "w")
    with open("/hdd/crawl_result/english_classification/crawl_edition.cnn.com_01.json_extract.json", "rb") as fr:
        for lineno, line in enumerate(fr):
            line = line.decode("utf-8").strip()
            jobj = json.loads(line)
            jobj['source'] = 'cnn'
            t = jobj['content'][0]
            # Insert a space after a leading "(CNN...)" dateline token so it
            # does not run into the first word of the article body.
            # NOTE(review): pattern should be a raw string (r"...").
            jobj['content'][0] = re.sub("\(CNN ?\w*\)", lambda m: m.group(0)+' ', t)
            fw.write(json.dumps(jobj) + '\n')
import argparse
from pathlib import Path
from tqdm import tqdm
from processor.asr.yitu_asr_processor import yitu_asr_wrapper
from util.log_util import getLogger
from util.redis_util import getRedisClient
from util.util import mapLineCount

logger = getLogger('read_selenium_output')

# Star import placed after logger creation; pulls the line-processor
# functions into module scope.
from postprocess.line_processors import *

if __name__ == '__main__':
    # CLI: --input is a file or directory; --pattern (glob) is required when
    # --input is a directory; --postfix tags the output files.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--pattern')
    parser.add_argument('--postfix')
    args = parser.parse_args()
    input_path = Path(args.input)
    assert input_path.exists()
    if input_path.is_dir():
        # Directory mode: collect every file matching the glob, recursively.
        assert args.pattern is not None
        logger.info(f'find files in {input_path}/{args.pattern}')
        files = [file for file in input_path.rglob(args.pattern)]
        # (chunk truncated here -- processing of `files` continues beyond view)
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import Optional
from util.log_util import getLogger

logger = getLogger("guardian")

# Second-level URL path segments grouped per Guardian section.
# ("sencond" is an existing typo, kept so external references still resolve.)
culture_sencond_level_labels = [
    "books", "music", "tv-and-radio", "artanddesign", "film", "games", "stage"
]
food_sencond_level_labels = ["food"]
travel_sencond_level_labels = ["travel"]
# NOTE(review): duplicate assignment of the same value -- harmless, but the
# repeated line should be removed.
travel_sencond_level_labels = ["travel"]


def extract(line: str, line_key: str, *args) -> Optional[str]:
    """Validate a crawled Guardian record; returns None for foreign netlocs.

    `line` is one JSON record with at least a 'url' key; `line_key`
    identifies the record in log/print output.
    (Function body continues beyond this chunk.)
    """
    jobj = json.loads(line)
    url = jobj['url']
    url_parsed = urlparse(url)
    # Only accept pages actually hosted on theguardian.com.
    if url_parsed.netloc != "www.theguardian.com":
        print(f"{line_key} {jobj['url']} error: invalid netloc")
        return None
    # sencond_level_label = [p for p in url_parsed.path.split('/') if p != ''][0]
    #
    # if sencond_level_label not in food_sencond_level_labels:
    #     print(f"{line_key} {jobj['url']} error: unknown label")
    #     return None
import json
import re
import time
import requests
from util.log_util import getLogger
from util.redis_util import getRedisClient
from typing import Optional

logger = getLogger("huffpost_apis")

# Desktop-browser User-Agent so HuffPost serves the full page markup.
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'
}

# HuffPost URL slugs crawled as "sections".
sections = [
    'sports', 'entertainment', 'business', 'science', 'technology',
    'relationships', 'women', 'religion', 'travel', 'green', 'taste'
]

# HuffPost URL slugs crawled as "topics".
topics = [
    'the-worldpost', 'politics', 'college', 'education', 'divorce',
    'weddings', 'arts-and-culture', 'art', 'environment', 'health-living',
    'health-and-wellness', 'worklife'
]

# Maps a HuffPost slug to (list of classification labels, numeric label id)
# -- presumably; the mapping is truncated in this chunk, TODO confirm.
category_labels = {
    'the-worldpost': (["world"], 0),
    'sports': (["sports"], 1),
import argparse
from pathlib import Path
from tqdm import tqdm
from util.util import mapLineCount
from postprocess.pool_wrapper import PoolWrapper
from classification_data_eng.guardian.guardian_extractor import extract
from classification_data_eng.check_label_count import redis_deduplicate
from util.log_util import getLogger
from util.redis_util import getRedisClient

if __name__ == '__main__':
    # Worker pool that applies redis_deduplicate to each input line.
    pw = PoolWrapper(redis_deduplicate)
    logger = getLogger('process_line_by_line_multiprocess')
    # CLI: --input file/dir, --pattern glob for dir mode, --postfix for the
    # output name, plus flags choosing where results go (file and/or redis).
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--pattern')
    parser.add_argument('--postfix', required=True)
    parser.add_argument('--save_result', action='store_true')
    parser.add_argument('--save_redis', action='store_true')
    parser.add_argument('--redis_db', type=int, default=0)
    args = parser.parse_args()
    save_result = args.save_result
    save_redis = args.save_redis
    # (chunk truncated here -- main processing continues beyond view)
import json
import re
from urllib.parse import unquote
import requests
from util.log_util import getLogger
from util.redis_util import getRedisClient
from bs4 import BeautifulSoup

logger = getLogger("daypop_apis")

# Channel names accepted by the daypop API.
categorys = [
    "UAE", "Arab", "World", "Entertainment", "Sport", "ScienceTechnology",
    "Business", "Health"
]


def get_list_by_category(category: str, page: int, lang: str):
    """Fetch one page of the daypop article list for a channel.

    :param category: channel name, one of ``categorys``
    :param page: 1-based page index of the listing
    :param lang: language code passed through to the API
    :return: list of article dicts from the response's ``data[category]``
    :raises AssertionError: on a non-200 HTTP status
    """
    url = f"https://api.daypop.ai/daypop/v1/channel/{category}?lang={lang}&page={page}"
    # NOTE(review): no timeout -- a stalled server hangs the crawler; and
    # assert-based validation disappears under `python -O`. Kept as-is to
    # preserve the exception type callers may rely on.
    response = requests.get(url)
    # Fix: error message typo "Faild" -> "Failed".
    assert response.status_code == 200, f"Request {get_list_by_category.__name__}:{category}:{page} Failed with code {response.status_code}"
    res = json.loads(response.text)["data"][category]
    logger.debug(
        f"{get_list_by_category.__name__}:{category}:{page} have list of [{len(res)}] articles"
    )
    return res
import json
import os
import time
from pathlib import Path
import requests
from util.log_util import getLogger

logger = getLogger('AcademiaApis')


def login():
    """Log in to academia-arabia.com.

    (Function body continues beyond this chunk -- presumably POSTs the login
    form and returns a session cookie; callers treat None as failure.)
    """
    # Browser-like headers for the form-encoded login request; values were
    # captured from a real browser session.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'https://academia-arabia.com',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Sec-Fetch-Dest': 'document',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '******',
        'Referer': 'https://academia-arabia.com/Account/Login',
        'Accept-Language': 'en-US,en;q=0.9,mt;q=0.8',
    }
import json
import math
import random
from util.log_util import getLogger

logger = getLogger('line_processors')


def random_pick(line: str):
    '''
    Randomly keep roughly 1 in 5 JSON lines that contain an 'ar_sen' field.

    :param line: one JSON-encoded record
    :return: the unchanged line when kept, None when dropped or invalid
    '''
    try:
        json_obj = json.loads(line)
    except json.JSONDecodeError:
        logger.error('JSONDecodeError')
        return None
    ar_sen = json_obj.get('ar_sen')
    # Downsample: drop ~80% of lines at random.
    if random.random() > 1 / 5:
        return None
    # Lines without 'ar_sen' are logged and dropped.
    if ar_sen is None:
        logger.warning(line)
        return None
    return line


def print_len(line: str):
    # (function body continues beyond this chunk)
# NOTE(review): this chunk is the middle of a download script -- `args` and
# the helpers used below (readBooks, login, getRedisClient, getLogger) are
# defined outside this view, and the original indentation level cannot be
# recovered here.
filename = args.filename
'''
curl "https://academia-arabia.com/Pages/72131/76/${page}/false/1/2" -H 'Sec-Fetch-Mode: cors' -H 'Sec-Fetch-Site: same-origin' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36' -H 'Accept: */*' -H 'Referer: https://academia-arabia.com/Assets/Viewer/oldVersion/build/pdf.worker.js' -H 'Sec-Fetch-Dest: empty' -H 'Cookie: _ga=GA1.2.355087959.1583647731; _gid=GA1.2.1177952183.1583647731; _culture=en-US; __gads=ID=cdb3a5f28402c03d:T=1583666253:S=ALNI_MaIYp3HDwAT_skRJlxFdrHF5Dfjvw; cb-enabled=accepted; ASP.NET_SessionId=ozkti0ya22cnt5umbxgwj55z; APP-SRV=1; _gat_gtag_UA_23555050_5=1' -H 'Connection: keep-alive'
'''
redis_cli = getRedisClient(db=0)
logger = getLogger('Download')
books = readBooks(filename)
# Process books in a stable order by their ORG id.
books = sorted(books, key=lambda d: d['ORG'])
cookie = login()
if cookie is None:
    exit('log in failed')
for iter_id, book in enumerate(books):
    book_id = book.get('ORG')
    total_pages_num = book.get('NumOfPages')
    logger.info(f'Downloading book {book_id}')
    # page_id from 1 to total_pages_num
import json
from html import unescape
from typing import Optional, List
from lxml import html
from util.log_util import getLogger

logger = getLogger(__file__)


def htmlResponseToLines(response: str) -> Optional[List[str]]:
    '''
    1. split by nltk and \n
    2. remove continuing blank
    3. deduplicate on one page
    :param response: JSON string whose 'response' key holds the raw HTML
    :return: list of lines if success, None if fail
    '''
    # The crawler wraps the page HTML in a JSON envelope; unwrap it first.
    json_obj = json.loads(response)
    response = json_obj.get('response')
    try:
        # soup = BeautifulSoup(response, 'html.parser')
        # lines2d = [splitLine(para.get_text()) for para in soup.body.find_all(re.compile(r'^p$|^h[1-6]$|^span$|^a$|^li$'))]
        # lines = list(itertools.chain(*lines2d))
        # (chunk truncated -- the lxml-based parsing continues beyond view)
# Fix: hmac and hashlib were used in get_HmacSha256 but never imported,
# raising NameError at call time.
import hashlib
import hmac
import os
import time
import uuid
from pathlib import Path

import librosa
import numpy as np
import requests
import scipy.io.wavfile as wavfile
import soundfile as sf

from util.log_util import getLogger
from util.redis_util import getRedisClient
from util.time_util import get_utc_timestamp

logger = getLogger("YiTUASR")


def get_HmacSha256(message, secret_key):
    """Return the hex HMAC-SHA256 digest of *message* keyed by *secret_key*.

    Both arguments are str; they are encoded as latin-1 before hashing.
    """
    return hmac.new(bytes(secret_key, 'latin-1'),
                    msg=bytes(message, 'latin-1'),
                    digestmod=hashlib.sha256).hexdigest()


def check_amend_wav(filename: str, amend_after_check):
    """Inspect a wav file's duration (body continues beyond this chunk)."""
    assert Path(filename).is_file(), "input file not exists"
    tmpfiles = []
    audio_file = sf.SoundFile(filename)
    # Duration in seconds = frame count / sample rate.
    audio_seconds = len(audio_file) / audio_file.samplerate
import json
import bs4
import re
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from util.log_util import getLogger

logger = getLogger("bbc_extractor")


def extract_food(line: str, line_key: str, *args) -> Optional[str]:
    """Extract a BBC article from one crawled JSON record.

    `line` carries 'url' and 'response' (the raw HTML); `line_key`
    identifies the record in log output.
    (Function body continues beyond this chunk.)
    """
    jobj = json.loads(line)
    url = jobj['url']
    url_parsed = urlparse(url)
    try:
        label_id = 0
        response = jobj['response']
        soup = BeautifulSoup(response, 'html.parser')
        # Exactly one headline node is expected; anything else is a parse
        # failure for this page layout.
        title_node = soup.select("h1.blocks-article__headline")
        if len(title_node) != 1:
            logger.error(f"no title {line_key} {url}")
            return None
        title = title_node[0].get_text()
        # **************** content ***************
        # (chunk truncated -- content extraction continues beyond view)
import json import mmap import time from datetime import datetime from util.log_util import getLogger from util.redis_util import getRedisClient from util.regex_util import REGPATTERNS logger = getLogger('Util') from bisect import bisect_left def mapLineCount(filename): try: f = open(filename, "r+") buf = mmap.mmap(f.fileno(), 0) lines = 0 readline = buf.readline while readline(): lines += 1 return lines except: return 0 def mapLineCharCount(filename): try: f = open(filename, "r+") buf = mmap.mmap(f.fileno(), 0)
import json
import pickle
from pathlib import Path
from typing import List
from util.log_util import getLogger

logger = getLogger('read_books_json')


def readBooks(filename) -> List[dict]:
    '''
    get list of book dict from local json file

    :param filename: name of a JSON-lines file located next to this module
    :return: list of book dicts, deduplicated on the 'ORG' book id
    '''
    # Track seen 'ORG' ids so each book is kept only once, in file order.
    book_id_set = set()
    books = []
    with open(Path(__file__).with_name(filename), 'r', encoding='utf-8') as f:
        for line in f:
            try:
                jobj = json.loads(line.strip())
            except json.JSONDecodeError:
                logger.error('JSONDecodeError')
                continue
            if jobj['ORG'] in book_id_set:
                logger.warning(f"duplicate book {jobj['ORG']}")
            else:
                # (chunk truncated -- the keep-branch continues beyond view)