def query(self, word: str):
    try:
        content = self._get_raw(word)
    except QueryError as exception:
        raise NotFoundError(exception.word)

    content = json.loads(content)

    try:
        # Get the first definition string from the JSON.
        definition = content['en'][0]['definitions'][0]['definition']
    except KeyError:
        # The API can return JSON that does not contain the 'en' language.
        raise NotFoundError(word)
    else:
        # Clean the definition string of HTML tags.
        definition = BeautifulSoup(definition, "html.parser").text

    content = {}
    content['definition'] = definition

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
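# A minimal sketch (not part of the provider) of the JSON shape the
# parser above assumes; the payload below is illustrative, not a real
# API response.
import json

from bs4 import BeautifulSoup

payload = json.dumps({
    'en': [{
        'partOfSpeech': 'Noun',
        'definitions': [{'definition': 'An <b>example</b> definition.'}],
    }],
})
definition = json.loads(payload)['en'][0]['definitions'][0]['definition']
# Strip the embedded HTML tags, exactly as query() does above.
print(BeautifulSoup(definition, "html.parser").text)
# -> An example definition.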
def query(self, word: str):
    webpage = self._get_raw(word)
    data = bs4.BeautifulSoup(webpage, "html.parser")
    content = {}

    # Please bump the version if the format changes again.
    # The `show` function will act with respect to the version number.
    content['version'] = 2

    # Here are the details of each version.
    #
    # The original one; in the old era there wasn't any concept of a
    # version number:
    #
    #     content = {
    #         'word': ...,
    #         'pronounce': ...,
    #         'sound': (optional),
    #         'explain': [...],
    #         'verbose': [...],
    #     }
    #
    # Version 2, when Yahoo dictionary content was provided by Dr.eye:
    #
    #     content = {
    #         'version': 2,
    #         'summary': {
    #             'word': ...,
    #             'pronounce': [('KK', '...'), (...)],  # optional, e.g. 'google'
    #             'explain': [(optional)],  # 'hospitalized' is summary-only
    #             'grammar': [(optional)],
    #         },
    #         'explain': [...],
    #         'verbose': [(optional)],
    #     }

    # Construct summary (required)
    try:
        content['summary'] = self.parse_summary(data, word)
    except AttributeError:
        raise NotFoundError(word)

    # Handle explain (required)
    try:
        content['explain'] = self.parse_explain(data)
    except IndexError:
        raise NotFoundError(word)

    # Extract verbose (optional)
    content['verbose'] = self.parse_verbose(data)

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
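# A hedged sketch of consuming the versioned content documented above.
# `render` is a hypothetical helper, not the project's show(); it only
# demonstrates dispatching on the version field.
def render(content: dict):
    if content.get('version') == 2:
        summary = content['summary']
        print(summary['word'], summary.get('pronounce', []))
    else:
        # Legacy, unversioned layout.
        print(content['word'], content.get('pronounce'))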
def query(self, word: str):
    try:
        content = self._get_raw(word)
    except QueryError as exception:
        raise NotFoundError(exception.word)

    content = json.loads(content)

    try:
        # Get the definition entries for the English language from the JSON.
        content = content['en']
    except KeyError:
        # The API can return JSON that does not contain the 'en' language.
        raise NotFoundError(word)

    # Define a list that will be used to create a Record.
    r_content = []

    # For every part of speech, append a corresponding entry to r_content.
    for i, d in enumerate(content):
        # Record which part of speech the current definitions refer to.
        r_content.append({'part_of_speech': d['partOfSpeech']})

        # Create a list that will store the English definitions
        # of the current part of speech.
        r_content[i]['definitions'] = []

        for j, d2 in enumerate(d['definitions']):
            # Parse the definition and append it to the definitions list.
            definition = BeautifulSoup(d2['definition'], "html.parser").text
            r_content[i]['definitions'].append({'definition': definition})

            # If the API provides examples for the current definition,
            # create a new list and append them.
            if 'examples' in d2:
                r_content[i]['definitions'][j]['examples'] = []
                for ex in d2['examples']:
                    ex = BeautifulSoup(ex, "html.parser").text
                    r_content[i]['definitions'][j]['examples'].append(ex)

    record = Record(
        word=word,
        content=json.dumps(r_content),
        source=self.provider,
    )
    return record
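# Illustrative shape (not real API data) of the r_content list built
# above, for one part of speech whose first definition carries one
# example:
#
#     [
#         {
#             'part_of_speech': 'Noun',
#             'definitions': [
#                 {
#                     'definition': 'An example definition.',
#                     'examples': ['An example sentence.'],
#                 },
#             ],
#         },
#     ]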
def query(self, word: str):
    content = self._get_raw(word)
    content_json = json.loads(content)

    if not content_json['data']:
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=content,
        source=self.provider,
    )
    return record
def query(self, word: str):
    try:
        content = self._get_raw(word)
    except QueryError as exception:
        raise NotFoundError(exception.word)

    content_json = json.loads(content)
    status = content_json.get('code')
    if status != 200:
        # https://tech.yandex.com/translate/doc/dg/reference/translate-docpage/#codes
        message = self.status_code.get(
            status, 'Some bad thing happened with Yandex')
        print('Yandex: ' + message)
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=content,
        source=self.provider,
    )
    return record
def query(self, word: str):
    content = self._get_raw(word)

    if "no_results" in content:
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=content,
        source=self.provider,
    )
    return record
def query(self, word: str):
    try:
        content = self._get_raw(word)
    except QueryError as exception:
        raise NotFoundError(exception.word)

    record = Record(
        word=word,
        content=content,
        source=self.provider,
    )
    return record
def query(self, word: str):
    content_str = self._get_raw(word)
    content_dict = json.loads(content_str)

    if not content_dict['list']:
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=content_str,
        source=self.provider,
    )
    return record
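# A hedged sketch (not project code): simple JSON providers like the
# one above can be exercised offline by stubbing _get_raw. `provider`
# stands for an instance of the class above; NotFoundError is the same
# exception the surrounding modules already use.
import json
from unittest import mock

def check_not_found(provider):
    payload = json.dumps({'list': []})  # empty result set
    with mock.patch.object(provider, '_get_raw', return_value=payload):
        try:
            provider.query('nonexistent-word')
        except NotFoundError:
            return True  # the empty "list" triggered NotFoundError
    return False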
def parse_summary(self, data, word):
    def get_explain(e: bs4.element.Tag):
        def f(ks):
            return ('pos' if 'pos_button' in ks else
                    'explain' if 'dictionaryExplanation' in ks else
                    '?')
        return [(f(m.attrs['class']), m.text)
                for n in e.select('ul > li')
                for m in n.select('div')]

    def get_pronounce(p: bs4.element.Tag):
        return list(
            map(lambda x: re.match(r'(.*)(\[.*\])', x).groups(),
                p.find('ul').text.strip().split()))

    def get_grammar(d: bs4.element.Tag):
        s = ('div#web ol.searchCenterMiddle '
             'div.dictionaryWordCard > ul > li')
        return list(map(text, d.select(s)))

    node = data.select_one('div#web ol.searchCenterMiddle')
    node = node.select('div.sys_dict_word_card > div.grp-main > div')

    p = None  # optional
    if node is None or len(node) <= 1:  # e.g. "fabor"
        raise NotFoundError(word)
    elif len(node) == 2:  # e.g. "apples"
        w, e = node
    elif len(node) == 3:  # e.g. ?
        w, _, e = node
    elif len(node) == 4:  # e.g. ?
        w, _, _, e = node
    elif len(node) == 5:  # e.g. "metadata"
        w, p, _, _, e = node
    elif len(node) == 6:
        w, p, _, _, _, e = node

    return {
        'word': w.find('span').text.strip(),
        'pronounce': get_pronounce(p) if p else [],  # optional
        'explain': get_explain(e),
        'grammar': get_grammar(data),  # optional
    }
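# For reference (values illustrative, not from a real page), the dict
# returned above looks like:
#
#     {
#         'word': 'metadata',
#         'pronounce': [('KK', '[...]')],  # optional, [] when absent
#         'explain': [('pos', 'n.'), ('explain', '...')],
#         'grammar': [],                   # optional
#     }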
def query(self, word: str):
    try:
        app_id, app_key = self._get_app_key()
        content = self._get_raw(word, headers={
            'app_id': app_id,
            'app_key': app_key,
        })
    except QueryError as exception:
        msg = self.status_code.get(exception.status_code,
                                   'Some bad thing happened')
        self.color.print('Oxford: ' + msg, 'red')
        raise NotFoundError(exception.word)

    record = Record(
        word=word,
        content=content,
        source=self.provider,
    )
    return record
def query(self, word: str):
    requests.packages.urllib3.disable_warnings()
    content = self._get_raw(word, verify=False)
    data = {"title": word, "sources": defaultdict(list)}
    soup = BeautifulSoup(content, "html.parser")

    for tr in soup.find_all("tr", {"class": "dash"}):
        source = (
            tr.find("td", attrs={"class": "sourceW"}).find("a").text
        ).strip()
        en = tr.find("td", attrs={"class": "ennameW"}).text.strip()
        zhtw = tr.find("td", attrs={"class": "zhtwnameW"}).text.strip()
        data["sources"][source].append((en, zhtw))

    if len(data["sources"]) == 0:
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=json.dumps(data),
        source=self.provider,
    )
    return record
def parse_summary(self, data, word):
    def gete(x: 'bs4 node'):
        def f(ks):
            return ('pos' if 'pos_button' in ks else
                    'explain' if 'dictionaryExplanation' in ks else
                    '?')
        return [(f(m.attrs['class']), m.text)
                for n in x.select('ul > li')
                for m in n.select('div')]

    def getp(p):
        return list(
            map(lambda x: re.match(r'(.*)(\[.*\])', x).groups(),
                p.find('ul').text.strip().split()))

    def getg(d):
        s = ('div#web ol.searchCenterMiddle '
             'div.dictionaryWordCard > ul > li')
        return list(map(text, d.select(s)))

    node = data.select_one('div#web ol.searchCenterMiddle > li > div')
    node = node.select('> div')

    p = None  # optional
    if len(node) == 6:  # e.g. "metadata"
        _, w, p, _, _, e = node
    elif len(node) == 5:
        _, w, p, _, e = node
    elif len(node) == 4:  # e.g. "hold on"
        _, w, _, e = node
    elif len(node) == 3:  # e.g. "google"
        _, w, e = node
    elif len(node) <= 2:  # e.g. "fabor"
        raise NotFoundError(word)

    return {
        'word': w.find('span').text.strip(),
        'pronounce': getp(p) if p else [],  # optional
        'explain': gete(e),
        'grammar': getg(data),  # optional
    }
def query(self, word: str):
    webpage = self._get_raw(word)
    data = BeautifulSoup(webpage, "html.parser")
    content = {}

    card = data.find('div', attrs={'class': 'card'})
    entry = card.find(  # just get the first one
        attrs={'class': 'dictionary-entry'})
    if not entry:
        raise NotFoundError(word)

    content['explains'] = []

    # The word can exist in both English and Spanish.
    word_element = (card.find(attrs={'id': 'headword-en'}) or
                    card.find(attrs={'id': 'headword-es'}))
    if word_element is None:
        raise NotFoundError(word)
    content['word'] = word_element.text

    pattern1 = {'class': 'dictionary-neodict-indent-1'}
    pattern2 = {'class': 'dictionary-neodict-indent-2'}
    pattern3 = {'class': 'dictionary-neodict-indent-3'}
    pattern_order = {'class': 'dictionary-neodict-translation'}
    pattern_example = {'class': 'dictionary-neodict-example'}
    pattern1_en = {'class': 'dictionary-neoharrap-indent-1'}
    pattern2_en = {'class': 'dictionary-neoharrap-indent-2'}
    pattern_order_en = {'class': 'dictionary-neoharrap-translation'}

    speeches = card.find_all(attrs={'class': 'part_of_speech'})
    for (speech, category) in zip(
            speeches,
            entry.find_all(attrs=pattern1) or
            entry.find_all(attrs=pattern1_en)):
        result = []
        content['explains'].append([speech.text, result])
        context = category.find(attrs={'class': 'context'}).text

        explains = []
        for explain in (category.find_all(attrs=pattern2) or
                        category.find_all(attrs=pattern2_en)):
            orders = (explain.find_all(attrs=pattern_order) or
                      explain.find_all(attrs=pattern_order_en))
            if orders:
                # e.g.
                #
                # ('a. forgiveness', 'b. pardon (law)')
                #
                indices = tuple(
                    map(lambda x: x.text.replace('\xa0', ' ').strip(),
                        orders))
            else:
                continue

            examples = explain.find_all(attrs=pattern3)
            for (example, index) in zip(examples, indices):
                t = tuple(example.find(attrs=pattern_example))
                (spanish, english) = (t[0].text, t[2].text)
                explains.append((index, spanish, english))

            if (not examples) and (len(indices) > 0):
                for index in indices:
                    explains.append((index,))

        result.append([context, explains])

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
def query(self, word: str):
    webpage = self._get_raw(word)
    data = BeautifulSoup(webpage, "html.parser")
    content = {}

    # Handle record.word
    try:
        content['word'] = data.find('span', id='term').text
    except AttributeError:
        raise NotFoundError(word)

    # Handle pronounce
    pronu_value = data.find('span', id='pronunciation_pos').text
    if pronu_value:
        content['pronounce'] = []
        for match in re.finditer(r'(\w+)(\[.*?\])', pronu_value):
            content['pronounce'].append(match.group(1, 2))

    # Handle sound
    proun_sound = data.find(
        'span',
        style="display: none;",
        id="iconStyle",
        class_="tri",
        title="http://product.dreye.com.tw/",
    )
    if proun_sound:
        content['sound'] = {}
        d = json.loads(proun_sound.text)
        sound_types_and_urls = (d.get('sound_url_1', []) +
                                d.get('sound_url_2', []))
        sound_accents = (d.get('sound_type_1', []) +
                         d.get('sound_type_2', []))
        for sound_type_and_url, sound_accent in zip(
                sound_types_and_urls, sound_accents):
            if sound_type_and_url:
                sound_type, sound_url = list(
                    sound_type_and_url.items())[0]
                content['sound'].setdefault(sound_type, {}).setdefault(
                    sound_accent, []).append(sound_url)

    # Handle explain
    main_explanations = data.find(
        class_='dd algo explain mt-20 lst DictionaryResults')
    if main_explanations:
        main_explanations = itertools.zip_longest(
            main_explanations.find_all(class_='compTitle mb-10'),
            main_explanations.find_all(
                class_='compArticleList mb-15 ml-10',
            ))
    else:
        main_explanations = ""

    content['explain'] = []
    for part_of_speech, meaning in main_explanations:
        node = [part_of_speech.text] if part_of_speech else ['']
        for item in meaning.find_all('li', class_='ov-a'):
            pack = [item.find('h4').text]
            for example in (
                    tag for tag in item.find_all('span')
                    if 'line-height: 17px;' not in tag.get('style', '')):
                sentence = ''
                for w in example.contents:
                    if w.name == 'b':
                        sentence += '*' + w.text + '*'
                    else:
                        try:
                            sentence += w
                        except Exception:
                            pass
                pack.append(sentence.strip())
            node.append(pack)
        content['explain'].append(node)

    # Verbose info
    part_of_speech_list, meaning_list = [], []
    content['verbose'] = []

    variation_explanations = data.find(
        class_='dd algo variation fst DictionaryResults')
    if variation_explanations:
        part_of_speech_list.extend(
            variation_explanations.find_all(class_='compTitle'))
        meaning_list.extend(
            variation_explanations.find_all(class_='compArticleList'))

    additional_explanations = data.find(
        class_='dd algo othersNew lst DictionaryResults')
    if additional_explanations:
        part_of_speech_list.extend(
            additional_explanations.find_all(class_='compTitle mt-26'))
        meaning_list.extend(
            additional_explanations.find_all(class_='compArticleList'))

    more_explanations = itertools.zip_longest(part_of_speech_list,
                                              meaning_list)
    for part_of_speech, meaning in more_explanations:
        node = [part_of_speech.text] if part_of_speech else ['']
        if meaning:
            for item in meaning.find_all('li', class_='ov-a'):
                pack = [item.find('h4').text]
                for example in (
                        tag for tag in item.find_all('span')
                        if 'line-height: 17px;' not in tag.get('style', '')):
                    sentence = ''
                    for w in example.contents:
                        if w.name == 'b':
                            sentence += '*' + w.text + '*'
                        else:
                            try:
                                sentence += w
                            except Exception:
                                pass
                    pack.append(sentence.strip())
                node.append(pack)
        content['verbose'].append(node)

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
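# A tiny, self-contained illustration of the sentence-building loop
# above: <b> children become *word* markers, while plain text children
# are concatenated as-is (NavigableString subclasses str).
from bs4 import BeautifulSoup

example = BeautifulSoup('I <b>like</b> apples.', 'html.parser')
sentence = ''
for w in example.contents:
    if w.name == 'b':
        sentence += '*' + w.text + '*'
    else:
        sentence += w
print(sentence)  # I *like* apples.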
def query(self, word: str):
    r = requests.post(
        self.POST_API,
        json={
            "c": "1",
            "t": "all",
            "q": word,
        },
    )
    content = self._get_raw(word, cookies=r.cookies)
    data = {
        "title": word,
        "exact_sources": defaultdict(list),
        "fuzzy_sources": defaultdict(list),
    }
    soup = BeautifulSoup(content, "html.parser")

    # Exact matching
    exact = soup.find(id="accordion_cross")
    if exact:
        for div in exact.find_all("div", {"class": "panel"}):
            title = div.find("div", {"class": "title"}).find("a").text
            defs = [
                div.find("strong", {
                    "class": "word"
                }).find_all("span")[-1].text.strip()
            ]
            source = div.find("strong", {"class": "race"}).text
            for i in div.find_all("li")[1:]:
                d = i.find("strong", {"class": "word"})
                if d:
                    defs.append(d.find_all("span")[-1].text.strip())
            link = (self.BASE_URL +
                    div.find("a", {"class": "btn-more"})["href"])
            data["exact_sources"][source] = {
                "title": title,
                "defs": defs,
                "link": link,
            }

    # Fuzzy matching
    fuzzy = soup.find(id="accordion")
    if fuzzy:
        for div in fuzzy.find_all("div", {"class": "panel"}):
            title = div.find("div", {"class": "title"}).find("a").text
            defs = [
                div.find("strong", {
                    "class": "word"
                }).find_all("span")[-1].text.strip()
            ]
            source = div.find("strong", {"class": "race"}).text
            for i in div.find_all("li")[1:]:
                d = i.find("strong", {"class": "word"})
                if d:
                    defs.append(d.find_all("span")[-1].text.strip())
            link = (self.BASE_URL + div.find("div", {
                "class": "title"
            }).find("a")["href"])
            data["fuzzy_sources"][source].append({
                "title": title,
                "defs": defs,
                "link": link,
            })

    if not exact and not fuzzy:
        raise NotFoundError(word)

    record = Record(
        word=word,
        content=json.dumps(data),
        source=self.provider,
    )
    return record
def query(self, word: str):
    webpage = self._get_raw(word)
    soup = BeautifulSoup(webpage, "html.parser")
    response = json.loads(soup.text)

    # Not found
    if not response.get("列表"):
        raise NotFoundError(word)

    # Show the Chinese word from iTaigi instead of the user input
    # if possible.
    with suppress(KeyError, IndexError):
        word = response["列表"][0]["外語資料"]

    content = {}

    # Fetch basic words with text, pronounce and sentences.
    try:
        basic_words = response["列表"][0]["新詞文本"]
    except Exception:
        raise
    else:
        content['basic_words'] = []
        for basic_word in basic_words:
            d = {}
            text = self._get_word_text(basic_word)
            d['text'] = text
            pronounce = self._get_word_pronounce(basic_word)
            d['pronounce'] = pronounce
            if self.args.verbose:
                sentences = self._get_word_sentences(text, pronounce)
                d['sentences'] = sentences
            content['basic_words'].append(d)

        # Fix issue-452 for the iTaigi tests:
        # iTaigi returns basic_words in random order. Since we store
        # basic_words in a list, we have to sort it before saving into
        # the database, or the unit tests would fail.
        content['basic_words'].sort(key=lambda word: word['text'])

    # Fetch related words.
    try:
        related_words = response["其他建議"]
    except Exception:
        raise
    else:
        content['related_words'] = []
        for related_word in related_words:
            d = {}
            text = self._get_word_text(related_word)
            d['text'] = text
            pronounce = self._get_word_pronounce(related_word)
            d['pronounce'] = pronounce
            if self.args.verbose:
                sentences = self._get_word_sentences(text, pronounce)
                d['sentences'] = sentences
            content['related_words'].append(d)

    # Save the content with the word and provider.
    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
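# A minimal sketch of why the sort above matters: iTaigi may return the
# same basic_words in a different order on each call, so without the
# sort the JSON stored in the Record would differ between runs. The
# sample entries are illustrative.
import json

basic_words = [{'text': '水', 'pronounce': 'tsuí'},
               {'text': '媠', 'pronounce': 'suí'}]
basic_words.sort(key=lambda word: word['text'])
print(json.dumps(basic_words, ensure_ascii=False))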
def query(self, word: str):
    webpage = self._get_raw(word)
    data = BeautifulSoup(webpage, "html.parser")
    content = {}

    # Handle record.word
    try:
        content['word'] = data.find('span', id='term').text
    except AttributeError:
        raise NotFoundError(word)

    # Handle pronounce
    pronu_value = data.find('span', id='pronunciation_pos').text
    if pronu_value:
        content['pronounce'] = []
        for match in re.finditer(r'(\w+)(\[.*?\])', pronu_value):
            content['pronounce'].append(match.group(1, 2))

    # Handle sound
    pronu_sound = data.find(class_='proun_sound')
    if pronu_sound:
        content['sound'] = [
            ('mp3', pronu_sound.find(class_='source', attrs={
                'data-type': 'audio/mpeg'
            }).attrs['data-src']),
            ('ogg', pronu_sound.find(class_='source', attrs={
                'data-type': 'audio/ogg'
            }).attrs['data-src']),
        ]

    # Handle explain
    main_explanations = data.find(
        class_='dd algo explain mt-20 lst DictionaryResults')
    if main_explanations:
        main_explanations = itertools.zip_longest(
            main_explanations.find_all(class_='compTitle mb-10'),
            main_explanations.find_all(
                class_='compArticleList mb-15 ml-10',
            ))
    else:
        main_explanations = ""

    content['explain'] = []
    for part_of_speech, meaning in main_explanations:
        node = [part_of_speech.text] if part_of_speech else ['']
        for item in meaning.find_all('li', class_='ov-a'):
            pack = [item.find('h4').text]
            for example in (
                    tag for tag in item.find_all('span')
                    if 'line-height: 17px;' not in tag.get('style', '')):
                sentence = ''
                for w in example.contents:
                    if w.name == 'b':
                        sentence += '*' + w.text + '*'
                    else:
                        try:
                            sentence += w
                        except Exception:
                            pass
                pack.append(sentence.strip())
            node.append(pack)
        content['explain'].append(node)

    # Verbose info
    part_of_speech_list, meaning_list = [], []
    content['verbose'] = []

    variation_explanations = data.find(
        class_='dd algo variation fst DictionaryResults')
    if variation_explanations:
        part_of_speech_list.extend(
            variation_explanations.find_all(class_='compTitle'))
        meaning_list.extend(
            variation_explanations.find_all(class_='compArticleList'))

    additional_explanations = data.find(
        class_='dd algo othersNew lst DictionaryResults')
    if additional_explanations:
        part_of_speech_list.extend(
            additional_explanations.find_all(class_='compTitle mt-26'))
        meaning_list.extend(
            additional_explanations.find_all(class_='compArticleList'))

    more_explanations = itertools.zip_longest(part_of_speech_list,
                                              meaning_list)
    for part_of_speech, meaning in more_explanations:
        node = [part_of_speech.text] if part_of_speech else ['']
        if meaning:
            for item in meaning.find_all('li', class_='ov-a'):
                pack = [item.find('h4').text]
                for example in (
                        tag for tag in item.find_all('span')
                        if 'line-height: 17px;' not in tag.get('style', '')):
                    sentence = ''
                    for w in example.contents:
                        if w.name == 'b':
                            sentence += '*' + w.text + '*'
                        else:
                            try:
                                sentence += w
                            except Exception:
                                pass
                    pack.append(sentence.strip())
                node.append(pack)
        content['verbose'].append(node)

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
def query(self, word: str):
    webpage = self._get_raw(word)
    soup = BeautifulSoup(webpage, "html.parser")
    content = {}

    en_css = "#dictionary-neodict-en"
    es_css = "#dictionary-neodict-es"
    card = soup.select_one(en_css) or soup.select_one(es_css)
    if card is None:
        raise NotFoundError(word)

    word_css = "div > div:nth-child(1) > span"
    word_element = card.select_one(word_css)
    if word_element is None:
        raise NotFoundError(word)
    content['word'] = word_element.text

    # The page is laid out like this (annotated with the variable
    # names used below):
    #
    #     COPULAR VERB                               # speech
    #                                                # categories_card
    #     1. (used to express a permanent quality)   # category_text
    #                                                # explanation
    #     a. ser                                     # index
    #                                                # examples
    #     The ocean is blue.                         # example
    #     El océano es azul.
    #
    #     2. (used to express a temporary state)
    #     a. estar
    #
    #     I'm not in a good mood today.
    #     Hoy no estoy de buen humor.
    #
    #     The sky is cloudy.
    #     El cielo está nublado.
    #
    #     ...
    #     (Another speech, if there is one.)
    speech_pattern = "div > div:nth-child(2)"
    # i.e. "#dictionary-neodict-en > div > div:nth-child(2)"

    # Start grabbing.
    content['explains'] = []
    speech = card.select_one(speech_pattern)
    while speech:
        result = []
        speech_text, categories_card = speech.children
        speech_text_element = speech_text.find(['a', 'span'])
        content['explains'].append([speech_text_element.text, result])
        for category in categories_card.children:
            category_text_element, explanations_card = category.children
            category_text = category_text_element.text
            explains = []
            for explanation in explanations_card.children:
                for _ in explanation.children:
                    index_elements, examples = (_.contents[:-1],
                                                _.contents[-1])
                    index = ' '.join([
                        _.text.strip() for _ in index_elements
                        if _ != ' '
                    ])
                    if (not examples) and index:
                        explains.append((index,))
                        continue
                    sentences = []
                    for example in examples:
                        t = example.find_all()
                        # Should be only 3 elements: [text, —, text].
                        # When querying Spanish => English the Spanish
                        # text shows first; English => Spanish shows
                        # English first, so the order of the pair is
                        # not fixed.
                        sentences.append((t[0].text, t[2].text))
                    explains.append((index, sentences))
            result.append([category_text, explains])
        speech = speech.next_sibling

    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record
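# Illustrative nesting (not real data) of content['explains'] as built
# above, for a copular-verb entry like the layout sketched in the
# comment:
#
#     [
#         ['COPULAR VERB', [
#             ['(used to express a permanent quality)', [
#                 ('a. ser',
#                  [('The ocean is blue.', 'El océano es azul.')]),
#             ]],
#         ]],
#     ]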
def query(self, word: str):
    webpage = self._get_raw(word)
    soup = BeautifulSoup(webpage, "html.parser")
    response = json.loads(soup.text)

    # Not found
    if not response.get("列表"):
        raise NotFoundError(word)

    # Show the Chinese word from iTaigi instead of the user input
    # if possible.
    with suppress(KeyError, IndexError):
        word = response["列表"][0]["外語資料"]

    content = {}

    # Fetch basic words with text, pronounce and sentences.
    try:
        basic_words = response["列表"][0]["新詞文本"]
    except Exception:
        raise
    else:
        content['basic_words'] = []
        for basic_word in basic_words:
            d = {}
            text = self._get_word_text(basic_word)
            d['text'] = text
            pronounce = self._get_word_pronounce(basic_word)
            d['pronounce'] = pronounce
            if self.args.verbose:
                sentences = self._get_word_sentences(text, pronounce)
                d['sentences'] = sentences
            content['basic_words'].append(d)

    # Fetch related words.
    try:
        related_words = response["其他建議"]
    except Exception:
        raise
    else:
        content['related_words'] = []
        for related_word in related_words:
            d = {}
            text = self._get_word_text(related_word)
            d['text'] = text
            pronounce = self._get_word_pronounce(related_word)
            d['pronounce'] = pronounce
            if self.args.verbose:
                sentences = self._get_word_sentences(text, pronounce)
                d['sentences'] = sentences
            content['related_words'].append(d)

    # Save the content with the word and provider.
    record = Record(
        word=word,
        content=json.dumps(content),
        source=self.provider,
    )
    return record