Example #1
def redis_write():
    redis_cli = getRedisClient(db=15)

    # use a context manager so the output file is flushed and closed properly
    with open("/hdd/crawl_result/daypop.json", "w") as fw:
        for key in redis_cli.scan_iter():
            label = key.split(":")[0]
            value = redis_cli.get(key)
            d = json.loads(value)
            # strip markup, then drop empty lines
            text = BeautifulSoup(d['html'], 'html.parser').get_text()
            text = '\n'.join(
                [t.strip() for t in text.split("\n") if t.strip() != ''])
            if text.strip() == "":
                continue
            print("*" * 50 + d['article_id'] + '*' * 50 + d['url'] + "*" * 50)
            print(text)

            save_str = json.dumps(dict(id=d['article_id'],
                                       url=unquote(d['url']),
                                       title=d['title'],
                                       daypop_label=label,
                                       text=text),
                                  ensure_ascii=False)

            fw.write(save_str + '\n')
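
For orientation, a minimal sketch of the record shape this function appears to expect in db 15 (the field names are the ones redis_write() reads; the key format matches what iterate_articles_by_category in Example #17 writes into the same db; the concrete values are made-up placeholders):

import json

# hypothetical stored record, matching the fields redis_write() reads
example_key = "politics:12345"                 # "<daypop_label>:<article_id>"
example_value = json.dumps({
    "article_id": "12345",
    "url": "https%3A//example.com/article-1",  # unquote() above suggests a percent-encoded URL
    "title": "Example title",
    "html": "<p>Article body ...</p>",
}, ensure_ascii=False)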
Example #2
def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=14)

    json_obj = json.loads(line)

    url = json_obj.get('url')

    url_parsed = urllib.parse.urlparse(url)

    # only keep clean article URLs of the form arabic.rt.com/<section>/<slug>
    if url_parsed.netloc != "arabic.rt.com":
        return None
    if url_parsed.query != '':
        return None

    path_words = [w for w in url_parsed.path.split("/") if w != '']

    if len(path_words) != 2:
        return None

    response = json_obj.get('response')

    dom = html.fromstring(response)

    try:
        # the first link inside the info panel points at the category page
        label_node = dom.xpath("//div[@class='info-panel']//a[@href]")[0]

        href_label = label_node.attrib['href']

        redis_cli.set(f"{href_label}|{url}", 1)
    except (IndexError, KeyError):
        return None
Example #3
class BBCSpider(scrapy.Spider):
    name = 'bbc_spider_food'
    base_url = 'https://www.bbc.co.uk/food/articles/'
    start_urls = ['https://www.bbc.co.uk/food']
    redis_cli = getRedisClient(db=11)

    def parse(self, response):

        url_parsed = urlparse(response.url)

        if url_parsed.netloc == 'www.bbc.co.uk' and response.url.startswith(
                self.base_url) and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")

        else:
            self.redis_cli.set(response.url, 0)

        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])

            if not absolute_url.startswith(self.base_url):
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue

            yield response.follow(absolute_url, self.parse)
Example #4
def write_url_to_redis(line: str, *args):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=8)

    jobj = json.loads(line)

    url = jobj['url']
    url_parsed = urlparse(url)

    if url_parsed.netloc != "edition.cnn.com":
        print(url)
        return None

    # CNN article paths look like /YYYY/MM/DD/<section>/.../<slug>/index.html
    m = re.match(r"^(/\d+){3}((?:/[^/]+)+)/index\.html$", url_parsed.path)

    if m is None:
        print(url)

        return None

    path_parts = [p for p in m.group(2).split("/") if p != '']

    if len(path_parts) <= 1:
        print(url)

        return None
    label = "_".join(path_parts[:-1])
    cli.set(f"{label}|{url}", 1)
Example #5
def request_LibriSpeech():
    wav_path = '/Users/shihangyu/Downloads/LibriSpeech_merged/test-clean/flac/'
    wavs = sorted(Path(wav_path).glob("*.flac"), key=lambda w: w.name)

    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=4)

    for idx, wav in enumerate(wavs):
        redis_result = redis_cli.get(wav.stem)
        if redis_result is not None:
            # logger.warning(f"[{idx}] wav {wav} already have result in redis")
            continue
        else:
            try:
                res_list = getYituASR(str(wav),
                                      timeout=2000,
                                      amend_after_check=True)
                logger.info(f"[{idx}] wav {wav}: res {res_list}")
                if 403 in [
                        res.status_code for res in res_list
                        if isinstance(res, requests.Response)
                ]:
                    return -1

                redis_cli.set(wav.stem, json.dumps(res_list))
            except Exception as e:
                logger.error(f"[{idx}] wav {wav}: error {e}")

    return 0
Example #6
def yitu_asr_wrapper(line: str, line_key: str) -> str:
    redis_cli = getRedisClient(db=0)
    redis_infer = redis_cli.get(line_key)

    # expected line format: "<wavname>|<label1>|<label2>"
    parts = line.split('|')
    wavname = parts[0]
    wavpath = Path('/Users/shihangyu/Data/LJSpeech-1.1/wavs').joinpath(
        f'{wavname}.wav')
    label1 = parts[1]
    label2 = parts[2]

    if redis_infer is not None:
        return json.dumps(
            dict(wavname=wavname,
                 label1=label1,
                 label2=label2,
                 yitu_infer=redis_infer))
    else:
        try:
            yitu_infer = getYituASR(str(wavpath))['text']
            redis_cli.set(line_key, yitu_infer)
        except Exception as e:
            logger.error(f'{line_key}: {e}')
            yitu_infer = None
        return json.dumps(
            dict(wavname=wavname,
                 label1=label1,
                 label2=label2,
                 yitu_infer=yitu_infer))
Example #7
def read_file_to_redis(filename, host='localhost', port=6379, db=0):
    redis_cli = getRedisClient(host=host, port=port, db=db)

    t = time.time()
    line_count = 0
    with open(filename, 'rb') as fr:
        for line_count, line in enumerate(fr, start=1):
            line = line.decode('utf-8', errors='ignore')
            line = line.strip()

            try:
                json_obj = json.loads(line)
            except json.JSONDecodeError:
                print('Json Decode Error')
                continue

            key = json_obj.get('key')
            value = json_obj.get('value')

            if None in [key, value]:
                print('Null in key or value')
                continue

            redis_cli.set(key, value)

    print(f'line count {line_count}')
    print(f'db size {redis_cli.dbsize()}')
    print(f'insert use time {time.time() - t}')
Example #8
def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)

    json_obj = json.loads(line)

    url = json_obj.get('url')

    url_parsed = urllib.parse.urlparse(url)

    if url_parsed.netloc != "www.alriyadh.com":
        return None
    if url_parsed.query != '':
        return None

    path_words = [w for w in url_parsed.path.split("/") if w != '']

    if len(path_words) != 1 or not re.match(r"^\d+$", path_words[0]):
        return None

    response = json_obj.get('response')

    dom = html.fromstring(response)

    try:
        label_node = dom.xpath("//h3/ol/li[@class='active']/a")[0]

        eng_label_word = label_node.attrib['href']

        cli.set(f"{eng_label_word}|{url}", 1)
    except (IndexError, KeyError):
        return None
Example #9
class CNNSpider(scrapy.Spider):
    name = 'cnn_spider'
    start_urls = [
        'https://edition.cnn.com/2020/04/20/investing/premarket-stocks-trading/index.html'
    ]
    redis_cli = getRedisClient(db=10)

    def parse(self, response):

        url_parsed = urlparse(response.url)

        # CNN article paths look like /YYYY/MM/DD/<section>/.../<slug>
        if re.match(r"^(/\d+){3}(/[^/]+)+$", url_parsed.path):

            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")

        else:
            self.redis_cli.set(response.url, 0)

        for next_page in response.css("a[href^='/']"):
            if self.redis_cli.get(next_page.attrib['href']) is not None:
                continue
            yield response.follow(next_page, self.parse)
Example #10
def check_redis_huffpost():
    redis_cli = getRedisClient(db=9)

    total_labels = []
    for key in redis_cli.scan_iter():
        total_labels.append(category_labels[key.split(":")[0]])

    from collections import Counter
    from pprint import pprint
    pprint(Counter(total_labels))
Example #11
def check_redis():
    redis_cli = getRedisClient(db=15)

    total_labels = []
    for key in redis_cli.scan_iter():
        total_labels.append(key.split(":")[0])

    from collections import Counter
    from pprint import pprint
    pprint(Counter(total_labels))
Example #12
def write_redis_to_file(filename, host='localhost', port=6379, db=0):
    redis_cli = getRedisClient(host=host, port=port, db=db)

    with open(filename, 'w') as fw:
        count = 0
        t = time.time()
        for key in redis_cli.scan_iter():
            d = dict(key=key, value=redis_cli.get(key))
            fw.write(json.dumps(d) + '\n')
            count += 1
    print(f'total keys {count}')
    print(f'scan keys use time {time.time() - t}')

    t = time.time()
    print(f'db size {redis_cli.dbsize()}')
    print(f'dbsize use time {time.time() - t}')
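
A minimal usage sketch pairing this with read_file_to_redis from Example #7 (assuming both helpers live in the same module; the file name and db numbers are placeholders):

# dump db 0 to a JSON-lines file, then restore it into db 1
write_redis_to_file('redis_dump.jsonl', db=0)
read_file_to_redis('redis_dump.jsonl', db=1)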
Example #13
class ArabBusinessSpider(scrapy.Spider):
    name = 'arab_business_spider_education'
    base_url = 'https://www.arabianbusiness.com/education'
    start_urls = [
        'https://www.arabianbusiness.com/education/433068-abu-dhabi-al-ain-to-get-new-kindergartens-in-45m-investment'
    ]
    redis_cli = getRedisClient(db=11)

    def parse(self, response):

        url_parsed = urlparse(response.url)

        path_parts = [p for p in url_parsed.path.split('/') if p != '']

        if url_parsed.netloc == 'www.arabianbusiness.com' and len(
                path_parts) > 1 and path_parts[0] in [
                    "education"
                ] and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")

        else:
            self.redis_cli.set(response.url, 0)

        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])
            absolute_url_parts = [
                p for p in urlparse(absolute_url).path.split('/') if p != ''
            ]

            if not absolute_url.startswith(self.base_url):
                continue
            if len(absolute_url_parts) <= 1 or absolute_url_parts[0] not in [
                    "education"
            ]:
                continue
            if '.' in absolute_url_parts[-1]:
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue

            yield response.follow(absolute_url, self.parse)
Example #14
class GuardianSpider(scrapy.Spider):
    name = 'guardian_spider_travel'
    base_url = 'https://www.theguardian.com/travel'
    start_urls = ['https://www.theguardian.com/uk/travel']
    redis_cli = getRedisClient(db=11)

    def parse(self, response):

        url_parsed = urlparse(response.url)

        path_parts = [p for p in url_parsed.path.split('/') if p != '']

        if url_parsed.netloc == 'www.theguardian.com' and len(
                path_parts) > 1 and path_parts[0] in [
                    "travel"
                ] and self.redis_cli.get(response.url) is None:
            yield {
                'url': response.url,
                'response': response.body.decode('utf-8', errors='ignore')
            }
            self.redis_cli.set(response.url, 1)
            self.logger.info(f"found page: {response.url}")

        else:
            self.redis_cli.set(response.url, 0)

        for next_page in response.css("a[href], link[href]"):
            absolute_url = urljoin(self.base_url, next_page.attrib['href'])
            absolute_url_parts = [
                p for p in urlparse(absolute_url).path.split('/') if p != ''
            ]

            if not absolute_url.startswith(self.base_url):
                continue
            if len(absolute_url_parts) <= 1 or absolute_url_parts[0] not in [
                    "travel"
            ]:
                continue
            if '.' in absolute_url_parts[-1]:
                continue
            if self.redis_cli.get(absolute_url) is not None:
                continue

            yield response.follow(absolute_url, self.parse)
Example #15
def write_pdf_to_redis():
    redis_cli = getRedisClient(db=0)

    read_path = Path('/hdd/academia-pdf')

    books = read_path.glob("*")
    books = sorted(books, key=lambda p: int(p.name))

    for book in books:
        book_id = book.name
        pages = book.glob("*.pdf")
        pages = sorted(pages, key=lambda p: int(p.stem))

        for page in pages:
            page_id = int(page.stem)

            if page.lstat().st_size <= 0:
                continue
            redis_cli.set(f'{book_id}:{page_id}', 1)
Example #16
def check_all_labels(line: str, line_key: str):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)

    json_obj = json.loads(line)

    url = json_obj.get('url')

    url_parsed = urllib.parse.urlparse(url)

    if url_parsed.netloc != "www.albayan.ae":
        return None
    if url_parsed.query != '':
        return None

    path_words = [w for w in url_parsed.path.split("/") if w != '']

    if not path_words or not re.match(r"^[\d\.\-]+$", path_words[-1]):
        return None

    cli.set(f"{path_words[0]}|{url}", 1)
Example #17
def iterate_articles_by_category(category: str):
    redis_cli = getRedisClient(db=15)

    lang = "lang"

    page_id = 0
    article_count = 0
    while True:
        page_id += 1

        try:
            article_list = get_list_by_category(category, page_id, lang)
        except Exception as e:
            logger.error(e)
            continue

        if len(article_list) == 0:
            break

        for article in article_list:
            article_id = article["article_id"]

            redis_key = f"{category}:{article_id}"

            if redis_cli.get(redis_key) is not None:
                logger.debug(f"{redis_key} found in redis")
                continue

            try:
                article_detail = get_article_detail(article_id, lang)
            except Exception as e:
                logger.error(e)
                continue

            redis_cli.set(redis_key,
                          json.dumps(article_detail, ensure_ascii=False))
            article_count += 1

    logger.info(f"category {category} total articles count: {article_count}")
    return article_count
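
A minimal sketch of driving this over several categories (the category names are placeholders; the return value is the per-category article count):

def crawl_all_categories():
    total = 0
    for category in ["politics", "sports", "technology"]:  # placeholder names
        total += iterate_articles_by_category(category)
    logger.info(f"all categories done, {total} new articles in total")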
Example #18
def request_ljspeech():
    wav_path = '/Users/shihangyu/Data/LJSpeech-1.1/wavs'
    wavs = list(Path(wav_path).glob("*.wav"))

    from util.redis_util import getRedisClient
    redis_cli = getRedisClient(db=2)

    for idx, wav in enumerate(wavs):
        redis_result = redis_cli.get(wav.stem)
        if redis_result is not None:
            logger.warning(f"[{idx}] wav {wav} already have result in redis")
            continue
        else:
            try:
                res_list = getYituASR(str(wav),
                                      timeout=2000,
                                      amend_after_check=True)
                logger.info(f"[{idx}] wav {wav}: res {res_list}")
                redis_cli.set(wav.stem, json.dumps(res_list))
            except Exception as e:
                logger.error(f"[{idx}] wav {wav}: error {e}")
Example #19
def check_all_labels(line: str, *args):
    from util.redis_util import getRedisClient
    cli = getRedisClient(db=14)
    json_obj = json.loads(line)

    url = json_obj.get('url')

    if not re.match(r"^\d+$", url.replace("http://alwatan.com/details/", "")):
        return None

    response = json_obj.get('response')

    dom = html.fromstring(response)

    try:
        label_node = dom.xpath("//div[@class='content']/div/a")[1]

        label_eng_word = label_node.attrib['href'].replace(
            "http://alwatan.com/section/", "")

        cli.set(f"{label_eng_word}|{url}", 1)
    except (IndexError, KeyError):
        return None
Example #20
def iterate_cards_by_category(category: str):
    time.sleep(1)

    redis_cli = getRedisClient(db=9)

    page_id = 0
    while True:

        try:

            cards = get_cards_by_category(category, page_id)

            if len(cards) == 0:
                logger.warning(
                    f"{get_cards_by_category.__name__}:{category}:{page_id} finished with empty result"
                )
                break

            for card in cards:

                url = card['url']

                redis_key = f"{category}:{url}"

                if redis_cli.get(redis_key) is not None:
                    continue

                response = requests.get(url, headers=headers)

                card['response'] = response.text

                redis_cli.set(redis_key, json.dumps(card))

        except Exception as e:
            logger.error(e)

        page_id += 1
Example #21

def test():
    import urllib.parse
    base = 'https://www.example-page-xl.com'
    # a relative path is resolved against the (empty) base path
    print(urllib.parse.urljoin(base, 'index.php'))
    # "../" segments are collapsed away
    print(urllib.parse.urljoin(base, '../index.php'))
    # an absolute path replaces the base path
    print(urllib.parse.urljoin(base, '/helloworld/index.php'))
    # a "//" prefix is a network-path reference: it replaces the host as well
    print(urllib.parse.urljoin(base, '//helloworld/index.php'))
    # an absolute URL is returned unchanged
    print(
        urllib.parse.urljoin(
            base, 'https://www.example-page-xl.com/helloworld/index.php'))


from util.redis_util import getRedisClient
deduplicate_redis_cli = getRedisClient(db=15)
import json


def redis_deduplicate(line: str, line_key: str):
    jobj = json.loads(line)
    if deduplicate_redis_cli.get(jobj['url']) is None:
        deduplicate_redis_cli.set(jobj['url'], 1)
        return line

    else:
        return None
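
A minimal sketch of applying redis_deduplicate line by line to a JSON-lines crawl file (both paths are placeholders):

def deduplicate_file(in_path: str, out_path: str):
    # keep only the first occurrence of each URL, using db 15 as the seen-set
    with open(in_path, encoding='utf-8') as fr, \
            open(out_path, 'w', encoding='utf-8') as fw:
        for lineno, line in enumerate(fr, start=1):
            line = line.strip()
            if not line:
                continue
            kept = redis_deduplicate(line, f"{in_path}:{lineno}")
            if kept is not None:
                fw.write(kept + '\n')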


if __name__ == '__main__':
    # test()
    pass
Example #22
        jobj = json.loads(line)
        tag = jobj['tags'][0]
        tags.append(tag)
        if tag in tag2labelid:
            jobj['label_id'] = tag2labelid[tag]
            fw.write(json.dumps(jobj) + '\n')
    from collections import Counter
    from pprint import pprint

    pprint(Counter(tags))
    fw.close()


from util.redis_util import getRedisClient

cli = getRedisClient(db=10)


def write_url_to_redis(line: str, *args):

    from urllib.parse import urlparse

    jobj = json.loads(line)

    url = jobj['url']
    url_parsed = urlparse(url)
    try:
        label = [p for p in url_parsed.path.split('/') if p != ''][0]
    except IndexError:
        return None
    cli.set(f"{label}|{url}", 1)
        assert args.pattern is not None
        logger.info(f'find files in {input_path}/{args.pattern}')
        files = [file for file in input_path.rglob(args.pattern)]
    else:
        files = [input_path]

    output_path = input_path.parent.joinpath(
        f'{input_path.stem}{args.postfix}')
    fw = output_path.open('w')
    logger.info(f'write file into {output_path}')

    if len(files) == 0:
        logger.warning('no files found, exiting')
        exit(1)

    redis_cli = getRedisClient(db=0)

    for file in files:
        total_line_num = mapLineCount(str(file))
        fw = file.parent.joinpath(f'{file.name}_{args.postfix}').open('w')

        with file.open('rb') as fr:
            for lineno, line in tqdm(enumerate(fr, start=1),
                                     total=total_line_num):

                line = line.decode('utf-8', errors='ignore')
                line = line.strip()
                if line == '':
                    continue

                line_key = f"{file}:{lineno}"
    parser = argparse.ArgumentParser()

    parser.add_argument('--input', required=True)
    parser.add_argument('--pattern')
    parser.add_argument('--postfix', required=True)
    parser.add_argument('--save_result', action='store_true')
    parser.add_argument('--save_redis', action='store_true')
    parser.add_argument('--redis_db', type=int, default=0)

    args = parser.parse_args()

    save_result = args.save_result

    save_redis = args.save_redis

    redis_cli = getRedisClient(db=args.redis_db)

    input_path = Path(args.input)

    assert input_path.exists()

    print(args)

    if input_path.is_dir():
        assert args.pattern is not None
        logger.info(f'find files in {input_path}/{args.pattern}')
        files = [file for file in input_path.rglob(args.pattern)]
        files = sorted(files, key=lambda x: str(x))
    else:
        files = [input_path]