#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-05
#

"""Generate eBay engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant

path = datapath('ebay-variants.tsv')

SEARCH_URL = 'https://www.ebay.{tld}/sch/i.html?_nkw={{query}}'
SUGGEST_URL = 'https://autosug.ebay.com/autosug?fmt=osr&sId={site}&kwd={{query}}'

Variant = namedtuple('Variant', 'site uid tld name')


def variants():
    """International eBay variants.

    Yields:
        Variant: eBay variant

    """
    with open(path) as fp:
        for line in csv.reader(fp, delimiter='\t'):
            # Assumed completion of the truncated loop, mirroring the
            # decode-and-yield pattern in the Google languages script.
            yield Variant(*[s.decode('utf-8') for s in line])
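
# Note on the URL templates above: they are formatted in two stages.
# str.format() fills the variant-specific fields and collapses the
# doubled braces in '{{query}}' to a literal '{query}', which the
# workflow substitutes at search time. An illustrative example
# ('co.uk' is just a sample TLD, not from the source data):
#
#   >>> SEARCH_URL.format(tld='co.uk')
#   'https://www.ebay.co.uk/sch/i.html?_nkw={query}'
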
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-06
#

"""Generate engine JSON for a Google search."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant

path = datapath('google-languages.tsv')

Lang = namedtuple('Lang', 'id name')


def langs():
    """All languages supported by Google.

    Yields:
        Lang: Google languages

    """
    with open(path) as fp:
        for line in csv.reader(fp, delimiter='\t'):
            yield Lang(*[s.decode('utf-8') for s in line])
# Created on 2016-12-17
#

"""Generate Wikipedia engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
cachepath = datapath('Wikipedia.html')

# Ignore wikis whose article count is below...
MIN_ARTICLE_COUNT = 10000

SEARCH_URL = 'https://{l.code}.wikipedia.org/wiki/{{query}}'
SUGGEST_URL = ('https://{l.code}.wikipedia.org/w/api.php?'
               'action=opensearch&search={{query}}')

# superset of Lang
Wiki = namedtuple('Wiki', 'name code size')


def html():
    """Encoded HTML data from URL or cache (if it exists)."""
    # Assumed completion of the truncated body, mirroring the
    # Wiktionary script's html() helper.
    return httpget(url, cachepath)
# Copyright (c) 2016 Dean Jackson <*****@*****.**>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-03-13
#

"""Output TSV list of ISO-639-1 language ``code,name``."""

from __future__ import print_function, absolute_import

from common import datapath, httpget, Lang, print_lang
from bs4 import BeautifulSoup as BS

url = 'https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes'
cachepath = datapath('ISO 3316-1 Country Codes.html')


def main():
    html = httpget(url, cachepath)
    soup = BS(html, 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 10:
            continue

        cells = cells[3:5]
        # print(cells)
        name, abbr = [e.get_text().strip() for e in cells]
        if len(abbr) != 2:
            continue

        # Assumed completion: emit the pair via the imported print_lang
        # helper; the Lang(code, name) field order is inferred from the
        # module docstring, not confirmed.
        print_lang(Lang(abbr, name))
# Created on 2016-03-13
#

"""Wiktionary variants."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

url = 'https://www.wiktionary.org'
path = datapath('Wiktionary.html')

SEARCH_URL = 'https://{w.lang}.wiktionary.org/wiki/{{query}}'
SUGGEST_URL = 'https://{w.lang}.wiktionary.org/w/api.php?action=opensearch&search={{query}}'

Wiki = namedtuple('Wiki', 'id url lang name')


def html():
    """Wiktionary HTML.

    Returns:
        str: HTML at ``url``.

    """
    return httpget(url, path)
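
# Templates like '{w.lang}' above are filled by attribute access on a
# Wiki tuple, again leaving '{query}' intact for the workflow. A sketch
# with made-up example values:
#
#   >>> w = Wiki('enwiktionary', 'https://en.wiktionary.org', 'en', 'English')
#   >>> SEARCH_URL.format(w=w)
#   'https://en.wiktionary.org/wiki/{query}'
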
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-03-12
#

"""Generate TSV of the languages supported by Google."""

from __future__ import print_function, absolute_import

from HTMLParser import HTMLParser

from common import datapath, httpget, Lang, print_lang

# Google's preferences pages
url = 'https://www.google.com/preferences'
cachepath = datapath('Google Prefs.html')


def html():
    return httpget(url, cachepath).decode('ISO-8859-1')


def parse_page(html):
    """Parse language id-name pairs from HTML.

    Args:
        html (unicode): Google's preferences page.

    Returns:
        list: Sequence of 2-tuples: `(id, name)`.

    """
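
# A minimal sketch of how parse_page might be completed with the
# imported HTMLParser. The <option value="id">Name</option> markup is
# an assumption about Google's preferences page, not the author's
# confirmed approach, and _LangParser is a hypothetical helper name.
class _LangParser(HTMLParser):
    """Collect `(id, name)` pairs from <option> elements."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.pairs = []
        self._id = None

    def handle_starttag(self, tag, attrs):
        # Remember the option's value until its text content arrives.
        if tag == 'option':
            self._id = dict(attrs).get('value')

    def handle_data(self, data):
        if self._id is not None and data.strip():
            self.pairs.append((self._id, data.strip()))
            self._id = None
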
# Copyright (c) 2016 Dean Jackson <*****@*****.**>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-05
#

"""Generate Duck Duck Go engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant

path = datapath('ddg-variants.tsv')

SEARCH_URL = 'https://duckduckgo.com/?kp=-1&kz=-1&kl={kl}&q={{query}}'
SUGGEST_URL = 'https://duckduckgo.com/ac/?kp=-1&kz=-1&kl={kl}&q={{query}}'

Variant = namedtuple('Variant', 'id name')


def variants():
    """DDG variants from `ddg-variants.tsv`.

    Yields:
        Variant: DDG variant

    """
    with open(path) as fp:
        for line in csv.reader(fp, delimiter='\t'):
            # Assumed completion of the truncated loop, mirroring the
            # decode-and-yield pattern in the Google languages script.
            yield Variant(*[s.decode('utf-8') for s in line])
"""Generate Wikia engine JSON.""" from __future__ import print_function, absolute_import from collections import namedtuple import json import re import sys from bs4 import BeautifulSoup as BS from common import datapath, httpget, mkdata, mkvariant SOURCES = [ ('http://community.wikia.com/wiki/Hub:Big_wikis', datapath('Wikia-Biggest.html')), ('http://community.wikia.com/wiki/Hub:Wikis_with_many_active_members', datapath('Wikia-Most-Active.html')), ('http://community.wikia.com/wiki/Hub:Sci-Fi', datapath('Wikia-SF.html')), ] SEARCH_URL = 'http://{w.subdomain}.wikia.com/wiki/{{query}}' SUGGEST_URL = ('http://{w.subdomain}.wikia.com/api.php?' 'action=opensearch&search={{query}}') Wiki = namedtuple('Wiki', 'name subdomain') match = re.compile(r'http://(.+?)\..+').match def log(s, *args):
"""Generate YouTube variants.""" from __future__ import print_function, absolute_import from collections import namedtuple import json import re from bs4 import BeautifulSoup as BS from common import ( datapath, log, mkdata, mkvariant, sanitise_ws, ) cachepath = datapath('YouTube.html') SEARCH_URL = ('https://www.youtube.com/results?' 'gl={y.country}&persist_gl=1&search_query={{query}}') SUGGEST_URL = ('https://suggestqueries.google.com/complete/search?' 'client=firefox&ds=yt&hl={y.lang}&q={{query}}') # superset of Lang YT = namedtuple('YT', 'name lang country') def html(): """Encoded HTML data from URL or cache (if it exists). Returns: str: Raw bytes returned from URL/file