示例#1
0
def _call_scrape(lang_settings, config, tsv_path):
    """Scrapes one language into a TSV file, retrying on network errors.

    Each attempt reopens `tsv_path` in write mode, which discards any partial
    output left by a previous attempt, and writes one "word<TAB>pron" line
    per pair yielded by `wikipron.scrape(config)`. If every attempt fails
    with a timeout or connection error, the failure is logged and the TSV
    is removed.

    Args:
        lang_settings: Mapping describing the language; its "key" entry is
            used in log messages.
        config: Configuration object passed to `wikipron.scrape`.
        tsv_path: Path of the TSV file to write.
    """
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        # The encoding is explicit because scraped words and pronunciations
        # may contain non-ASCII characters, and the default encoding of
        # open() depends on the platform's locale.
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                for (word, pron) in wikipron.scrape(config):
                    print(f"{word}\t{pron}", file=source)
                return
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logging.info(
                    'Exception detected while scraping: "%s", "%s".',
                    lang_settings["key"],
                    tsv_path,
                )
        # Pauses execution for 10 min before the next attempt. There is
        # nothing to wait for once the last attempt has failed.
        if attempt < max_attempts:
            time.sleep(600)
    # Log and remove TSVs for languages that failed
    # to be scraped within 10 retries.
    logging.info(
        'Failed to scrape "%s" within 10 retries. %s',
        lang_settings["key"],
        lang_settings,
    )
    os.remove(tsv_path)
示例#2
0
def _call_scrape(lang_settings, config, tsv_path):
    """Scrapes one language into a TSV file and reports how many pairs it wrote.

    Each attempt reopens `tsv_path` in write mode, which discards any partial
    output left by a previous attempt, and writes one "word<TAB>pron" line
    per pair yielded by `wikipron.scrape(config)`. Timeouts and connection
    errors trigger a retry after a pause.

    Args:
        lang_settings: Mapping describing the language; its "key" entry is
            used in log messages.
        config: Configuration object passed to `wikipron.scrape`.
        tsv_path: Path of the TSV file to write.

    Returns:
        The number of pairs written by the successful attempt, or 0 if all
        attempts failed. On failure the TSV file is left in place.
    """
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        # Restart the tally on every attempt, since the file is rewritten too.
        count = 0
        # The encoding is explicit because scraped words and pronunciations
        # may contain non-ASCII characters, and the default encoding of
        # open() depends on the platform's locale.
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                for (word, pron) in wikipron.scrape(config):
                    count += 1
                    print(f"{word}\t{pron}", file=source)
                return count
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logger.info(
                    'Exception detected while scraping: "%s", "%s".',
                    lang_settings["key"],
                    tsv_path,
                )
        # Pauses execution for 10 min before the next attempt. There is
        # nothing to wait for once the last attempt has failed.
        if attempt < max_attempts:
            time.sleep(600)
    logger.info(
        'Failed to scrape "%s" within 10 retries. %s',
        lang_settings["key"],
        lang_settings,
    )
    return 0
示例#3
0
def test_scrape():
    """Smoke-tests scrape() by collecting its first few word-pron pairs."""
    limit = 10  # how many word-pron pairs to collect
    config = _config_factory()
    collected = []
    for word, pron in scrape(config):
        # Stop as soon as enough pairs have been gathered.
        if len(collected) >= limit:
            break
        collected.append((word, pron))
    # The scrape must yield at least `limit` pairs, none with an empty field.
    assert len(collected) == limit
    assert all(word and pron for (word, pron) in collected)
示例#4
0
def _call_scrape(
    lang_settings: Dict[str, str],
    config: wikipron.Config,
    tsv_path: str,
    phones_set: FrozenSet[str] = None,
    tsv_filtered_path: str = "",
) -> None:
    """Scrapes one language into a TSV file, retrying on network errors.

    Every pair yielded by `wikipron.scrape(config)` is written to `tsv_path`
    as a "word<TAB>pron" line. When `phones_set` is given, pairs accepted by
    `_filter` are also written to `tsv_filtered_path`. Each attempt reopens
    the output files in write mode, discarding partial output from a previous
    attempt. If every attempt fails with a timeout or connection error, the
    failure is logged and the output files are removed.

    Args:
        lang_settings: Mapping describing the language; its "key" entry is
            used in log messages.
        config: Configuration object passed to `wikipron.scrape`.
        tsv_path: Path of the unfiltered TSV file.
        phones_set: Optional set passed to `_filter`; when falsy, no
            filtered TSV is written.
        tsv_filtered_path: Path of the filtered TSV file; only opened when
            `phones_set` is given.
    """
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                scrape_results = wikipron.scrape(config)
                # Given phones, opens up a second TSV for scraping.
                if phones_set:
                    with open(
                        tsv_filtered_path, "w", encoding="utf-8"
                    ) as source_filtered:
                        for (word, pron) in scrape_results:
                            line = f"{word}\t{pron}"
                            if _filter(word, pron, phones_set):
                                print(line, file=source_filtered)
                            print(line, file=source)
                else:
                    for (word, pron) in scrape_results:
                        print(f"{word}\t{pron}", file=source)
                return
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logging.info(
                    "Exception detected while scraping: %r, %r, %r",
                    lang_settings["key"],
                    tsv_path,
                    tsv_filtered_path,
                )
        # Pauses execution for 10 min before the next attempt. There is
        # nothing to wait for once the last attempt has failed.
        if attempt < max_attempts:
            time.sleep(600)
    # Log and remove TSVs for languages that failed.
    logging.info(
        "Failed to scrape %r with 10 retries (%s)",
        lang_settings["key"],
        lang_settings,
    )
    # The filtered TSV only exists if a phone set was supplied, so a failed
    # removal here is deliberately ignored.
    try:
        os.remove(tsv_filtered_path)
    except OSError:
        pass
    os.remove(tsv_path)
示例#5
0
def _call_scrape(
    lang_settings: Dict[str, str],
    config: wikipron.Config,
    tsv_path: str,
    phones_set: FrozenSet[str] = None,
    tsv_filtered_path: str = "",
) -> None:
    """Writes scraped word-pron pairs to a TSV, optionally with a filtered copy.

    Every pair yielded by `wikipron.scrape(config)` goes to `tsv_path` as a
    "word<TAB>pron" line. When `phones_set` is truthy, pairs accepted by
    `_filter` are additionally written to `tsv_filtered_path`.
    """
    with open(tsv_path, "w", encoding="utf-8") as source:
        scrape_results = wikipron.scrape(config)

        # Without phones there is nothing to filter: write the plain TSV only.
        if not phones_set:
            for word, pron in scrape_results:
                print(f"{word}\t{pron}", file=source)
            return

        # With phones, a second TSV receives the pairs that pass the filter.
        with open(
            tsv_filtered_path, "w", encoding="utf-8"
        ) as source_filtered:
            for word, pron in scrape_results:
                line = f"{word}\t{pron}"
                if _filter(word, pron, phones_set):
                    print(line, file=source_filtered)
                print(line, file=source)
示例#6
0
# -*- coding: utf-8 -*-
"""
Created on Sat May 30 14:54:44 2020

@author: qtckp
"""

import wikipron
import os
import json

# Maps each multi-character word to its scraped pronunciation; a word seen
# more than once keeps the last pronunciation encountered.
dic = {}

config = wikipron.Config(key="en")

# `t` is the 1-based count of scraped entries; it stays 0 if nothing is scraped.
t = 0
for t, (word, pron) in enumerate(wikipron.scrape(config), start=1):
    # Progress report every 100 entries.
    if not t % 100:
        print(f'{t} {word} {pron}')
    # Single-character (and empty) words are left out of the dictionary.
    if len(word) <= 1:
        continue
    dic[word] = pron

# Dump the collected dictionary as indented JSON.
with open("english.json", "w") as write_file:
    json.dump(dic, fp=write_file, indent=4)