def redis_write():
    redis_cli = getRedisClient(db=15)
    fw = open("/hdd/crawl_result/daypop.json", "w")
    for key in redis_cli.scan_iter():
        label = key.split(":")[0]
        value = redis_cli.get(key)
        d = json.loads(value)
        text = BeautifulSoup(d['html'], 'html.parser').get_text()
        # text = re.sub("\n+", "\n", text)
        text = '\n'.join(
            [t.strip() for t in text.split("\n") if t.strip() != ''])
        if text.strip() == "":
            continue
        print("*" * 50 + d['article_id'] + '*' * 50 + d['url'] + "*" * 50)
        print(text)
        save_str = json.dumps(dict(id=d['article_id'],
                                   url=unquote(d['url']),
                                   title=d['title'],
                                   daypop_label=label,
                                   text=text),
                              ensure_ascii=False)
        fw.write(save_str + '\n')

def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=14)
    json_obj = json.loads(line)
    url = json_obj.get('url')
    url_parsed = urllib.parse.urlparse(url)
    if url_parsed.netloc != "arabic.rt.com":
        return None
    if url_parsed.query != '':
        return None
    path_words = [w for w in url_parsed.path.split("/") if w != '']
    if len(path_words) != 2:
        return None
    response = json_obj.get('response')
    dom = html.fromstring(response)
    try:
        label_node = dom.xpath("//div[@class='info-panel']//a[@href]")[0]
        href_label = label_node.attrib['href']
        redis_cli.set(f"{href_label}|{url}", 1)
    except (IndexError, KeyError):
        return None

class BBCSpider(scrapy.Spider):
    name = 'bbc_spider_food'
    base_url = 'https://www.bbc.co.uk/food/articles/'
    start_urls = ['https://www.bbc.co.uk/food']
    redis_cli = getRedisClient(db=11)

    def parse(self, response):
        url_parsed = urlparse(response.url)
        if url_parsed.netloc == 'www.bbc.co.uk' and response.url.startswith(
                self.base_url) and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")
        else:
            self.redis_cli.set(response.url, 0)
        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])
            if not absolute_url.startswith(self.base_url):
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue
            yield response.follow(absolute_url, self.parse)

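# Usage sketch (assuming this spider lives inside a standard Scrapy project;
# the output file name is illustrative): the yielded dicts can be exported as
# JSON lines from the CLI, and crawled URLs are marked in Redis db=11 so that
# re-runs skip pages already seen.
#
#   scrapy crawl bbc_spider_food -o bbc_food.jsonl
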
def write_url_to_redis(line: str, *args):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=8)
    jobj = json.loads(line)
    url = jobj['url']
    url_parsed = urlparse(url)
    if url_parsed.netloc != "edition.cnn.com":
        print(url)
        return None
    m = re.match(r"^(/\d+){3}((?:/[^/]+)+)/index\.html$", url_parsed.path)
    if m is None:
        print(url)
        return None
    path_parts = [p for p in m.group(2).split("/") if p != '']
    if len(path_parts) <= 1:
        print(url)
        return None
    label = "_".join(path_parts[:-1])
    cli.set(f"{label}|{url}", 1)

def request_LibriSpeech():
    wav_path = '/Users/shihangyu/Downloads/LibriSpeech_merged/test-clean/flac/'
    wavs = Path(wav_path).glob("*.flac")
    wavs = [wav for wav in wavs]
    wavs = sorted(wavs, key=lambda w: w.name)
    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=4)
    for idx, wav in enumerate(wavs):
        redis_result = redis_cli.get(wav.stem)
        if redis_result is not None:
            # logger.warning(f"[{idx}] wav {wav} already has a result in redis")
            continue
        else:
            try:
                res_list = getYituASR(str(wav),
                                      timeout=2000,
                                      amend_after_check=True)
                logger.info(f"[{idx}] wav {wav}: res {res_list}")
                # bail out on HTTP 403 so the caller can back off
                if 403 in [
                        res.status_code for res in res_list
                        if isinstance(res, requests.Response)
                ]:
                    return -1
                redis_cli.set(wav.stem, json.dumps(res_list))
            except Exception as e:
                logger.error(f"[{idx}] wav {wav}: error {e}")
    return 0

def yitu_asr_wrapper(line: str, line_key: str) -> str:
    redis_cli = getRedisClient(db=0)
    redis_infer = redis_cli.get(line_key)
    parts = line.split('|')
    wavname = parts[0]
    wavpath = Path('/Users/shihangyu/Data/LJSpeech-1.1/wavs').joinpath(
        f'{wavname}.wav')
    label1 = parts[1]
    label2 = parts[2]
    if redis_infer is not None:
        return json.dumps(
            dict(wavname=wavname,
                 label1=label1,
                 label2=label2,
                 yitu_infer=redis_infer))
    else:
        try:
            yitu_infer = getYituASR(str(wavpath))['text']
            redis_cli.set(line_key, yitu_infer)
        except Exception as e:
            logger.error(f'{line_key}: {e}')
            yitu_infer = None
        return json.dumps(
            dict(wavname=wavname,
                 label1=label1,
                 label2=label2,
                 yitu_infer=yitu_infer))

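# Usage sketch: `line` is pipe-delimited, "<wavname>|<label1>|<label2>", which
# matches the LJSpeech metadata.csv layout, and `line_key` is the Redis cache
# key. The values below are hypothetical:
#
#   yitu_asr_wrapper("LJ001-0001|raw transcript|normalized transcript",
#                    "metadata.csv:1")
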
def read_file_to_redis(filename, host='localhost', port=6379, db=0):
    redis_cli = getRedisClient(host=host, port=port, db=db)
    t = time.time()
    with open(filename, 'rb') as fr:
        for line_count, line in enumerate(fr, start=1):
            line = line.decode('utf-8', errors='ignore')
            line = line.strip()
            try:
                json_obj = json.loads(line)
            except json.JSONDecodeError:
                print('Json Decode Error')
                continue
            key = json_obj.get('key')
            value = json_obj.get('value')
            if None in [key, value]:
                print('Null in key or value')
                continue
            redis_cli.set(key, value)
    print(f'line count {line_count}')
    print(f'db size {redis_cli.dbsize()}')
    print(f'insert use time {time.time() - t}')

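# `getRedisClient` is imported from util/redis_util.py, which is not shown
# here. A minimal sketch of what it presumably looks like, assuming the
# standard redis-py client with decoded (str) responses, since the callers
# above split and compare keys as strings:
import redis


def getRedisClient(host='localhost', port=6379, db=0):
    # decode_responses=True makes get()/scan_iter() return str instead of bytes
    return redis.Redis(host=host, port=port, db=db, decode_responses=True)
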
def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)
    json_obj = json.loads(line)
    url = json_obj.get('url')
    url_parsed = urllib.parse.urlparse(url)
    if url_parsed.netloc != "www.alriyadh.com":
        return None
    if url_parsed.query != '':
        return None
    path_words = [w for w in url_parsed.path.split("/") if w != '']
    if len(path_words) != 1 or not re.match(r"^\d+$", path_words[0]):
        return None
    response = json_obj.get('response')
    dom = html.fromstring(response)
    try:
        label_node = dom.xpath("//h3/ol/li[@class='active']/a")[0]
        eng_label_word = label_node.attrib['href']
        cli.set(f"{eng_label_word}|{url}", 1)
    except (IndexError, KeyError):
        return None

class CNNSpider(scrapy.Spider):
    name = 'cnn_spider'
    start_urls = [
        'https://edition.cnn.com/2020/04/20/investing/premarket-stocks-trading/index.html'
    ]
    redis_cli = getRedisClient(db=10)

    def parse(self, response):
        url_parsed = urlparse(response.url)
        if re.match(r"^(/\d+){3}(/[^/]+)+$", url_parsed.path):
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")
        else:
            self.redis_cli.set(response.url, 0)
        for next_page in response.css("a[href^='/']"):
            if self.redis_cli.get(next_page.attrib['href']) is not None:
                continue
            yield response.follow(next_page, self.parse)

def check_redis_huffpost():
    redis_cli = getRedisClient(db=9)
    total_labels = []
    for key in redis_cli.scan_iter():
        total_labels.append(category_labels[key.split(":")[0]])
    from collections import Counter
    from pprint import pprint
    pprint(Counter(total_labels))

def check_redis():
    redis_cli = getRedisClient(db=15)
    total_labels = []
    for key in redis_cli.scan_iter():
        total_labels.append(key.split(":")[0])
    from collections import Counter
    from pprint import pprint
    pprint(Counter(total_labels))

def write_redis_to_file(filename, host='localhost', port=6379, db=0):
    redis_cli = getRedisClient(host=host, port=port, db=db)
    with open(filename, 'w') as fw:
        count = 0
        t = time.time()
        for key in redis_cli.scan_iter():
            d = dict(key=key, value=redis_cli.get(key))
            fw.write(json.dumps(d) + '\n')
            count += 1
        print(f'total keys {count}')
        print(f'scan keys use time {time.time() - t}')
        t = time.time()
        print(f'db size {redis_cli.dbsize()}')
        print(f'dbsize use time {time.time() - t}')

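# Round-trip sketch: write_redis_to_file and read_file_to_redis share the
# {"key": ..., "value": ...} JSON-lines layout, so a db can be dumped and
# restored. The file name and db numbers below are illustrative:
#
#   write_redis_to_file('backup_db0.jsonl', db=0)
#   read_file_to_redis('backup_db0.jsonl', db=1)  # restore into another db
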
class ArabBusinessSpider(scrapy.Spider):
    name = 'arab_business_spider_education'
    base_url = 'https://www.arabianbusiness.com/education'
    start_urls = [
        'https://www.arabianbusiness.com/education/433068-abu-dhabi-al-ain-to-get-new-kindergartens-in-45m-investment'
    ]
    redis_cli = getRedisClient(db=11)

    def parse(self, response):
        url_parsed = urlparse(response.url)
        path_parts = [p for p in url_parsed.path.split('/') if p != '']
        if url_parsed.netloc == 'www.arabianbusiness.com' and len(
                path_parts) > 1 and path_parts[0] in [
                    "education"
                ] and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")
        else:
            self.redis_cli.set(response.url, 0)
        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])
            absolute_url_parts = [
                p for p in urlparse(absolute_url).path.split('/') if p != ''
            ]
            if not absolute_url.startswith(self.base_url):
                continue
            if len(absolute_url_parts) <= 1 or absolute_url_parts[0] not in [
                    "education"
            ]:
                continue
            if '.' in absolute_url_parts[-1]:
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue
            yield response.follow(absolute_url, self.parse)

class GuardianSpider(scrapy.Spider):
    name = 'guardian_spider_travel'
    base_url = 'https://www.theguardian.com/travel'
    start_urls = ['https://www.theguardian.com/uk/travel']
    redis_cli = getRedisClient(db=11)

    def parse(self, response):
        url_parsed = urlparse(response.url)
        path_parts = [p for p in url_parsed.path.split('/') if p != '']
        if url_parsed.netloc == 'www.theguardian.com' and len(
                path_parts) > 1 and path_parts[0] in [
                    "travel"
                ] and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")
        else:
            self.redis_cli.set(response.url, 0)
        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])
            absolute_url_parts = [
                p for p in urlparse(absolute_url).path.split('/') if p != ''
            ]
            if not absolute_url.startswith(self.base_url):
                continue
            if len(absolute_url_parts) <= 1 or absolute_url_parts[0] not in [
                    "travel"
            ]:
                continue
            if '.' in absolute_url_parts[-1]:
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue
            yield response.follow(absolute_url, self.parse)

def write_pdf_to_redis():
    redis_cli = getRedisClient(db=0)
    read_path = Path('/hdd/academia-pdf')
    books = read_path.glob("*")
    books = sorted(books, key=lambda p: int(p.name))
    for book in books:
        book_id = book.name
        pages = book.glob("*.pdf")
        pages = sorted(pages, key=lambda p: int(p.stem))
        for page in pages:
            page_id = int(page.stem)
            # skip empty placeholder files
            if page.lstat().st_size <= 0:
                continue
            redis_cli.set(f'{book_id}:{page_id}', 1)

def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)
    json_obj = json.loads(line)
    url = json_obj.get('url')
    url_parsed = urllib.parse.urlparse(url)
    if url_parsed.netloc != "www.albayan.ae":
        return None
    if url_parsed.query != '':
        return None
    path_words = [w for w in url_parsed.path.split("/") if w != '']
    if not path_words or not re.match(r"^[\d\.\-]+$", path_words[-1]):
        return None
    cli.set(f"{path_words[0]}|{url}", 1)

def iterate_articles_by_category(category: str):
    redis_cli = getRedisClient(db=15)
    lang = "lang"
    page_id = 0
    article_count = 0
    while True:
        page_id += 1
        try:
            article_list = get_list_by_category(category, page_id, lang)
        except Exception as e:
            logger.error(e)
            continue
        if len(article_list) == 0:
            break
        for article in article_list:
            article_id = article["article_id"]
            redis_key = f"{category}:{article_id}"
            if redis_cli.get(redis_key) is not None:
                logger.debug(f"{redis_key} found in redis")
                continue
            try:
                article_detail = get_article_detail(article_id, lang)
            except Exception as e:
                logger.error(e)
                continue
            redis_cli.set(redis_key,
                          json.dumps(article_detail, ensure_ascii=False))
            article_count += 1
    logger.info(f"category {category} total articles count: {article_count}")
    return article_count

def request_ljspeech():
    wav_path = '/Users/shihangyu/Data/LJSpeech-1.1/wavs'
    wavs = Path(wav_path).glob("*.wav")
    wavs = [wav for wav in wavs]
    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=2)
    for idx, wav in enumerate(wavs):
        redis_result = redis_cli.get(wav.stem)
        if redis_result is not None:
            logger.warning(f"[{idx}] wav {wav} already has a result in redis")
            continue
        else:
            try:
                res_list = getYituASR(str(wav),
                                      timeout=2000,
                                      amend_after_check=True)
                logger.info(f"[{idx}] wav {wav}: res {res_list}")
                redis_cli.set(wav.stem, json.dumps(res_list))
            except Exception as e:
                logger.error(f"[{idx}] wav {wav}: error {e}")

def check_all_labels(line: str, *args):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)
    json_obj = json.loads(line)
    url = json_obj.get('url')
    if not re.match(r"^\d+$", url.replace("http://alwatan.com/details/", "")):
        return None
    response = json_obj.get('response')
    dom = html.fromstring(response)
    try:
        label_node = dom.xpath("//div[@class='content']/div/a")[1]
        label_eng_word = label_node.attrib['href'].replace(
            "http://alwatan.com/section/", "")
        cli.set(f"{label_eng_word}|{url}", 1)
    except (IndexError, KeyError):
        return None

def iterate_cards_by_category(category: str):
    time.sleep(1)
    redis_cli = getRedisClient(db=9)
    page_id = 0
    while True:
        try:
            cards = get_cards_by_category(category, page_id)
            if len(cards) == 0:
                logger.warning(
                    f"{get_cards_by_category.__name__}:{category}:{page_id} finished with empty result"
                )
                break
            for card in cards:
                url = card['url']
                redis_key = f"{category}:{url}"
                if redis_cli.get(redis_key) is not None:
                    continue
                response = requests.get(url, headers=headers)
                card['response'] = response.text
                redis_cli.set(redis_key, json.dumps(card))
        except Exception as e:
            logger.error(e)
        page_id += 1

def test():
    import urllib.parse
    base = 'https://www.example-page-xl.com'
    print(urllib.parse.urljoin(base, 'index.php'))
    print(urllib.parse.urljoin(base, '../index.php'))
    print(urllib.parse.urljoin(base, '/helloworld/index.php'))
    print(urllib.parse.urljoin(base, '//helloworld/index.php'))
    print(
        urllib.parse.urljoin(
            base, 'https://www.example-page-xl.com/helloworld/index.php'))


from util.redis_util import getRedisClient

deduplicate_redis_cli = getRedisClient(db=15)

import json


def redis_deduplicate(line: str, line_key: str):
    jobj = json.loads(line)
    if deduplicate_redis_cli.get(jobj['url']) is None:
        deduplicate_redis_cli.set(jobj['url'], 1)
        return line
    else:
        return None


if __name__ == '__main__':
    # test()
    pass

jobj = json.loads(line)
tag = jobj['tags'][0]
tags.append(tag)
if tag in tag2labelid:
    jobj['label_id'] = tag2labelid[tag]
    fw.write(json.dumps(jobj) + '\n')

from collections import Counter
from pprint import pprint

pprint(Counter(tags))
fw.close()

from util.redis_util import getRedisClient

cli = getRedisClient(db=10)


def write_url_to_redis(line: str, *args):
    from urllib.parse import urlparse
    jobj = json.loads(line)
    url = jobj['url']
    url_parsed = urlparse(url)
    try:
        label = [p for p in url_parsed.path.split('/') if p != ''][0]
    except IndexError:
        return None
    cli.set(f"{label}|{url}", 1)

    assert args.pattern is not None
    logger.info(f'find files in {input_path}/{args.pattern}')
    files = [file for file in input_path.rglob(args.pattern)]
else:
    files = [input_path]

output_path = input_path.parent.joinpath(f'{input_path.stem}{args.postfix}')
fw = output_path.open('w')
logger.info(f'write file into {output_path}')

if len(files) == 0:
    logger.warning('no file found, exit')
    exit(1)

redis_cli = getRedisClient(db=0)
for file in files:
    total_line_num = mapLineCount(str(file))
    # note: this per-file handle shadows the `fw` opened above
    fw = file.parent.joinpath(f'{file.name}_{args.postfix}').open('w')
    with file.open('rb') as fr:
        for lineno, line in tqdm(enumerate(fr, start=1),
                                 total=total_line_num):
            line = line.decode('utf-8', errors='ignore')
            line = line.strip()
            if line == '':
                continue
            line_key = f"{file}:{lineno}"

parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--pattern')
parser.add_argument('--postfix', required=True)
parser.add_argument('--save_result', action='store_true')
parser.add_argument('--save_redis', action='store_true')
parser.add_argument('--redis_db', type=int, default=0)
args = parser.parse_args()

save_result = args.save_result
save_redis = args.save_redis
redis_cli = getRedisClient(db=args.redis_db)

input_path = Path(args.input)
assert input_path.exists()
print(args)

if input_path.is_dir():
    assert args.pattern is not None
    logger.info(f'find files in {input_path}/{args.pattern}')
    files = [file for file in input_path.rglob(args.pattern)]
    files = sorted(files, key=lambda x: str(x))
else:
    files = [input_path]
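
# Invocation sketch (the script name and paths are hypothetical; the flags
# match the argparse setup above):
#
#   python map_file_lines.py --input /hdd/crawl_result --pattern '*.jsonl' \
#       --postfix mapped --save_result --save_redis --redis_db 0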