def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    """Look up a word, or one of its hunspell stems, in an MDX dictionary.

    The word is first stripped of surrounding spaces and newlines. When
    ``morphology`` is true and hunspell accepts the spelling and reports at
    least one stem, the stems replace the word as lookup candidates.
    Candidates are tried in order; the first one with an entry wins.

    :param word: the word to look up
    :param dict: dictionary file handed to ``IndexBuilder``
    :param morphology: whether to try hunspell stems instead of the raw word
    :return: ``(matched_word, first_meaning)``; ``(stripped_word, None)``
        when no candidate has an entry
    """
    word = word.strip(' \n')
    candidates = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        if hobj.spell(word):
            # Stem once and reuse the result instead of asking hunspell twice.
            stems = hobj.stem(word)
            if stems:
                candidates = [b.decode() for b in stems]
                logger.debug('Get stems: {}.'.format(', '.join(candidates)))

    builder = IndexBuilder(dict)
    builder.check_build()
    for candidate in candidates:
        meanings = builder.mdx_lookup(candidate, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), candidate, dict))
        # Report the candidate that actually matched (it may be a stem).
        return candidate, meanings[0]

    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
def generate_soup(word):
    """Build the BeautifulSoup instance for the queried word's entry."""
    entries = IndexBuilder('柯林斯高阶双解.mdx').mdx_lookup(str(word))
    # Only the first entry returned by the lookup is parsed.
    first_entry = entries[0]
    return BeautifulSoup(first_entry, features='lxml')
def indexing(builder: IndexBuilder) -> int:
    """Index all examples of the lsc4 dictionary.

    TODO: performance is poor; the indexing step should happen while the
    mdx file is being parsed.

    :param builder: dict builder
    :return: 0 when ``USE_ES``/``CONNECTED_ES`` is falsy or the index cannot
        be created, otherwise the number of dictionary keys processed
    """
    if not USE_ES or not CONNECTED_ES:
        return 0
    # create index
    if not create_index():
        return 0

    conn = sqlite3.connect(builder.get_mdx_db())
    try:
        keys = [row[0] for row in conn.execute('SELECT key_text FROM MDX_INDEX')]
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    examples = []
    for key in keys:
        # Join all entries of the key, dropping CRLFs and "entry:/" prefixes.
        str_content = "".join(
            c.replace("\r\n", "").replace("entry:/", "")
            for c in builder.mdx_lookup(key)
        )
        exs = example_parse_lsc4("lsc4", key, str_content)
        if exs:
            examples.extend(exs)
        if len(examples) > 100000:
            # Flush in batches so the pending list does not grow unbounded.
            ingest("lsc4", examples)
            examples = []
    # Flush whatever is left after the last full batch.
    ingest("lsc4", examples)
    print("indexing done", len(keys))
    return len(keys)
def build_dict() -> Dict[str, IndexBuilder]:
    """Build a builder for every configured dictionary.

    The front end later selects which of these dictionaries to query.
    Dictionaries whose file does not exist are reported and left out.

    :return: the module-level ``BUILDERS`` mapping (name -> builder)
    """
    global BUILDERS
    for d, f in DICTS_MAP.items():
        if not os.path.exists(f):
            log.warning(f"the dict({d}) file:{f} doesn't exist,removed")
            # Honour the warning: never hand a missing file to IndexBuilder.
            continue
        BUILDERS[d] = IndexBuilder(f)
    log.info(f"all dictionaries= {DICTS_MAP}")
    return BUILDERS
def test_builder_noindex(self):
    """Build without the SQL index, then time exact and pattern lookups."""
    # Delete the matching .db files first, presumably so the build below
    # starts from scratch.
    for f in glob.glob("mdx/Vocabulary*.db"):
        os.remove(f)
    print("***without sql index***\n")

    start = time.time()
    bd = IndexBuilder(self._mdx_file, sql_index=False, check=True)
    print("takes {0} seconds to build without sql index\n".format(
        time.time() - start))

    start = time.time()
    word = 'dedicate'
    for _ in range(self._repeat):
        self.assertTrue(bd.mdx_lookup(word))
    print("takes {0} second to lookup {1} {2} times\n".format(
        time.time() - start, word, self._repeat))

    # Restart the timer so this figure excludes the exact lookups above.
    start = time.time()
    for _ in range(self._repeat):
        bd.get_mdx_keys("dedi*")
    print("takes {0} second to lookup {1} {2} times\n".format(
        time.time() - start, "dedi*", self._repeat))
def process_data(self, config, data): """"获取柯思林,高阶英汉双解词典信息""" # 获得所有数据 # Cobuild词典信息 print("Cobuild词典服务开始处理数据!") file_name = dict_builder = IndexBuilder(config.cobuild) # 1:查询到所有单词的字典html信息 (doc_items, invalid_items) = self._query_word_items(file_name, dict_builder, data) # 2:记录扫描无效信息 print("\t检索字典:%s检索完毕!\n\t开始写入无效数据" % file_name) self.write_invalid_items(config, file_name + "_invalid", invalid_items) print("\t开始写入正常数据") # 3:扫描文本内资源信息 print("\t开始写入资源数据") self.save_resource(config, dict_builder, doc_items, file_name) # 4:重置资源引用信息 self.reset_html_res(doc_items) # 5:记录扫描结果 self.write_items(config, file_name, doc_items) # 返回doc信息 self.result = doc_items
def _build_index(self):
    """Collect a config entry for every loadable ``.mdx`` file in the dictionary directory.

    The resulting list is stored under ``self._config['dicts']``.
    """
    found = []
    for entry in os.listdir(self._mdict_dir):
        path = os.path.join(self._mdict_dir, entry)
        if not os.path.isfile(path):
            continue
        stem, ext = os.path.splitext(path)
        if ext != '.mdx':
            continue
        # A companion .mdd file shares the dictionary's base name.
        has_mdd = os.path.isfile(stem + '.mdd')
        try:
            ib = IndexBuilder(path)
        except Exception:
            # Dictionaries that fail to load are left out of the list.
            continue
        found.append({
            'title': ib._title,
            'description': ib._description,
            'mdx_name': path,
            'has_mdd': has_mdd,
        })
    self._config['dicts'] = found
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy
import ydcv
import sys

# Make the bundled mdict library importable before importing from it.
sys.path.append('./mdictlib')
from mdict_query import IndexBuilder

builder = IndexBuilder('./Collins.mdx')


def has_explains(result):
    """Tell whether *result* has a 'basic' section containing 'explains'."""
    if 'basic' not in result:
        return False
    return 'explains' in result['basic']


def get_explian_from_result(result):
    """Join the 'explains' items of *result* with ';', or return '' if absent."""
    return ';'.join(result['basic']['explains']) if has_explains(result) else ''


def translate_from_yd(word):
    """Return the raw ydcv lookup result for *word*."""
    return ydcv.lookup_word_inner(word)


def if_in_collins(word):
    """Tell whether the Collins dictionary lookup yields any entry for *word*."""
    entries = builder.mdx_lookup(word)
    return len(entries) > 0
from mdict_query import IndexBuilder

# Ad-hoc smoke script: open the dictionary, then run one key-pattern query
# and one lookup. The results are only bound to names, never printed.
bd = IndexBuilder("mdx\\oed.mdx")
# Keys for the pattern "ded*".
keys = bd.get_mdx_keys("ded*")
# Lookup result for the key 'a'.
result = bd.mdx_lookup('a')
# No-op; presumably kept as a convenient breakpoint line.
pass
import re
from collections import namedtuple

import bs4
from bs4 import BeautifulSoup
from mdict_query import IndexBuilder

# Test: write the first entry found for 'great' to an HTML file.
builder = IndexBuilder('柯林斯高阶双解.mdx')
with open('./dicta.html', 'w+') as wp:
    wp.write(builder.mdx_lookup('great')[0])
#soup = BeautifulSoup(builder.mdx_lookup('f**k')[0], features="lxml")

# TODO: write the results to a database and run preliminary tests

# Record types; presumably filled by parsing code outside this chunk.
Word_Title = namedtuple('Word_Title', 'name star level')
Word_Collins_Content = namedtuple(
    'Word_Collins_Content', 'interpretation usage usage_note word_format')
Word_Interpretation = namedtuple('Word_Interpretation', 'en cn')
Word_Usage = namedtuple('Word_Usage', 'description examples')
Word_Usage_Note = namedtuple('Word_Usage_Note', 'en cn')
Word_Format = namedtuple('Word_Format', 'format examples')


def generate_soup(word):
    """Produce the BeautifulSoup instance for the queried word."""
    builder = IndexBuilder('柯林斯高阶双解.mdx')
    the_word = builder.mdx_lookup(str(word))[0]
# NOTE(review): this `return` is the tail of a function whose header lies
# outside this chunk; re-indent it under that header when merging.
return [b'<h1>WSGIServer ok!</h1>']


# Code executed in the new thread
def loop():
    """Create a server for ``application`` on port 8000 and serve forever."""
    # Create a server: empty IP address, port 8000, handler is application:
    httpd = make_server('', 8000, application)
    print("Serving HTTP on port 8000...")
    # Start listening for HTTP requests:
    httpd.serve_forever()


if __name__ == '__main__':
    # The dictionary path is hard-coded; the alternative path and the
    # argparse-based variant are kept below as commented-out code.
    filename = "d:/Program Files (x86)/MDictPC/doc/Longman 5/Longman Dictionary of Contemporary English.mdx"
    # filename = "d:/Program Files (x86)/MDictPC/doc/Collins COBUILD (CN)/Collins COBUILD (CN).mdx"
    builder = IndexBuilder(filename)
    # Run the HTTP server loop in its own thread.
    t = threading.Thread(target=loop, args=())
    t.start()
    # if not os.path.exists(args.filename):
    #     print("Please specify a valid MDX/MDD file")
    # else:
    #     builder = IndexBuilder(args.filename)
    #     t = threading.Thread(target=loop, args=())
    #     t.start()
    # import argparse
    # parser = argparse.ArgumentParser()
    # parser.add_argument("filename", nargs='?', help="mdx file name")
    # args = parser.parse_args()
# NOTE(review): this `return` is the tail of a function whose header lies
# outside this chunk; re-indent it under that header when merging.
return [b'<h1>WSGIServer ok!</h1>']


# Code executed in the new thread
def loop():
    """Create a server for ``application`` on port 8000 and serve forever."""
    # Create a server: empty IP address, port 8000, handler is application:
    httpd = make_server('', 8000, application)
    print("Serving HTTP on port 8000...")
    # Start listening for HTTP requests:
    httpd.serve_forever()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # The file name is optional on the command line.
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()
    # use GUI to select file, default to extract
    if not args.filename:
        # Hide the root window so only the file dialog is shown.
        root = tk.Tk()
        root.withdraw()
        args.filename = filedialog.askopenfilename(parent=root)
    if not os.path.exists(args.filename):
        print("Please specify a valid MDX/MDD file")
    else:
        builder = IndexBuilder(args.filename)
        # Run the HTTP server loop in its own thread.
        t = threading.Thread(target=loop, args=())
        t.start()
# NOTE(review): the statements down to serve_forever() are the tail of a
# function whose header lies outside this chunk (presumably loop(), which the
# thread below targets); re-indent them under that header when merging.
# Create a server: empty IP address, port 8000, handler is application:
httpd = make_server('', 8000, application)
print("Serving HTTP on port 8000...")
# Start listening for HTTP requests:
httpd.serve_forever()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs='*', help="mdx file name")
    args = parser.parse_args()
    # use GUI to select file, default to extract
    if not args.filenames:
        if sys.version_info < (3, 0, 0):
            import Tkinter as tk
            import tkFileDialog as filedialog
        else:
            import tkinter as tk
            import tkinter.filedialog as filedialog
        # Hide the root window so only the file dialog is shown.
        root = tk.Tk()
        root.withdraw()
        # Use the multi-select dialog: the single-file variant returns one
        # string, which the checks below would iterate character by
        # character. splitlist() normalises the result to a sequence of paths.
        args.filenames = list(
            root.tk.splitlist(filedialog.askopenfilenames(parent=root)))
    # An empty selection (dialog cancelled) is rejected too; all() of an
    # empty sequence is True and would start a server with no dictionary.
    if not args.filenames or not all(
            os.path.exists(filename) for filename in args.filenames):
        print("Please specify a valid MDX/MDD file")
    else:
        builders = [IndexBuilder(file) for file in args.filenames]
        # Run the HTTP server loop in its own thread.
        t = threading.Thread(target=loop, args=())
        t.start()
def _add_builder(self):
    """Attach an ``IndexBuilder`` to every dictionary entry in the config."""
    for entry in self._config['dicts']:
        # Each entry names its dictionary file under 'mdx_name'.
        mdx_path = entry['mdx_name']
        entry['builder'] = IndexBuilder(mdx_path)
def _close_if_open(fd):
    """Close *fd* unless it is ``None`` or already closed."""
    if fd is not None and not fd.closed:
        fd.close()


def main():
    """Look up every word of the input file and write text/HTML results.

    Command-line options:
      -i FILE           input word list (default ``input.txt``)
      -o FILE           output text file; must have an extension
                        (default ``output.txt``); the HTML output uses the
                        same base name with ``.html``
      -k                keep the word list as read (no de-duplication/sorting)
      -b, --break-file  write one pair of output files per leading letter
      -h, --help        print usage and exit

    Words whose lookup returns ``None`` are printed and appended to
    ``<output>.fail``.
    """
    global idxBuilder

    fileIn = 'input.txt'
    fileOut = ('output', '.txt')
    keepOriginal = False
    breakFileFlag = False

    # parse command line args
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hkbi:o:', ['help', 'break-file'])
    except getopt.GetoptError as err:
        print('Error: %s!' % err)
        sys.exit(2)
    for o, a in opts:
        if o == '-i':
            fileIn = a
        elif o == '-o':
            fileOut = os.path.splitext(a)
            if len(fileOut[1]) == 0:
                print('Error: Regular name of output file required! For example: output.txt')
                sys.exit()
        elif o == '-k':
            keepOriginal = True
        elif o in ('-b', '--break-file'):
            breakFileFlag = True
        elif o in ('-h', '--help'):
            usage()
            sys.exit()

    idxBuilder = IndexBuilder('C:/Users/DELL/Desktop/hello/Oxford+Advanced+Learner+English-Chinese+Dictionary+8th+Edition.mdx')

    # read/write files
    with open(fileIn, 'r') as fdIn:
        # Tabs become spaces, surrounding whitespace is dropped, and blank
        # lines are ignored.
        words = [w for w in [line.replace('\t', ' ').strip() for line in fdIn] if len(w) > 0]
    if not keepOriginal:
        words = list(set(words))
        words.sort(key=str.lower)
    failWords = []
    print("Total words: %d." % len(words))

    # output:
    fdOutTxt = None
    fdOutHtml = None
    rememberedLeadingLetter = None
    fileOpenedFlag = False
    try:
        for word in words:
            # open right file
            if breakFileFlag:
                if rememberedLeadingLetter != word[0].lower():
                    # Leading letter changed: switch to that letter's files.
                    _close_if_open(fdOutTxt)
                    _close_if_open(fdOutHtml)
                    rememberedLeadingLetter = word[0].lower()
                    fileOutTxt = fileOut[0] + '_' + rememberedLeadingLetter + fileOut[1]
                    fileOutHtml = fileOut[0] + '_' + rememberedLeadingLetter + '.html'
                    fdOutTxt = open(fileOutTxt, mode='a+', encoding='utf-8')
                    fdOutHtml = open(fileOutHtml, mode='a+', encoding='utf-8')
            elif not fileOpenedFlag:
                fileOutTxt = fileOut[0] + fileOut[1]
                fileOutHtml = fileOut[0] + '.html'
                fdOutTxt = open(fileOutTxt, mode='a+', encoding='utf-8')
                fdOutHtml = open(fileOutHtml, mode='a+', encoding='utf-8')
                fileOpenedFlag = True

            # lookup word
            result = extractSentence(word)
            if result is not None:
                markupResult = formatContent(result)
                if markupResult[0] is not None:
                    fdOutTxt.write(markupResult[0] + '\n')
                if markupResult[1] is not None:
                    fdOutHtml.write(markupResult[1] + '\n')
            else:
                failWords.append(word)
                print('[FAIL] ' + word)
    finally:
        # Close the outputs even when a lookup or write raises.
        _close_if_open(fdOutTxt)
        _close_if_open(fdOutHtml)

    if len(failWords) > 0:
        with open(fileOut[0] + fileOut[1] + '.fail', 'a+', encoding='utf-8') as fdFailTxt:
            for word in failWords:
                fdFailTxt.write(word + '\n')

    # done
    print("result: " + fileOut[0] + fileOut[1] + ' / ' + fileOut[0] + '.html')
# /bin/python3
import requests as rq
import sys
import re
from mdict_query import IndexBuilder

builder = IndexBuilder('mdx/bing.mdx')
dic = {}
sen = ''


def get_exp(word):
    """Return the dictionary lookup result for *word*."""
    return builder.mdx_lookup(word)


def split_words(passage):
    """Return the distinct lower-cased words of *passage*, sorted.

    The passage is split on every character that is not an ASCII letter.
    """
    tokens = re.split(r'[^a-zA-Z]', passage)
    return sorted({token.lower() for token in tokens if token})


def split_sentences(passage):
    # NOTE(review): this definition continues beyond the visible chunk and is
    # reproduced unchanged. The unescaped '?' starting the second alternative
    # makes re raise "nothing to repeat"; r'\. |\? ' was probably intended.
    l = re.split(r'\. |? ', passage)
from mdict_query import IndexBuilder from bs4 import BeautifulSoup import re from collections import OrderedDict builder = IndexBuilder('oxford.mdx') def getSentList(word): dict_str = builder.mdx_lookup(word) bs = BeautifulSoup(dict_str[0]) l = bs.find_all('span', class_="x") u = [] for span in l: if "." not in span.text and "?" not in span.text and "!" not in span.text: continue contents = span.contents if len(contents) > 3: continue chn = span.find_next_sibling().text if len(contents) == 1: eng = contents[0] elif len(contents) == 3: temp = ['', '', ''] for i in range(3): if hasattr(contents[i], 'text'): if contents[i].findChild(): continue temp[i] = contents[i].text else:
import sys
import json
from mdict_query import IndexBuilder

# NOTE(review): the disabled guard below checks for fewer than 3 arguments,
# but the script reads sys.argv[3], which needs at least 4 -- confirm.
#if sys.argv.__len__() < 3:
#    sys.exit(1)
#print sys.argv

# Query results keyed by dictionary key.
dict = {}
query_type = sys.argv[2]
query_word = sys.argv[3].strip()
builder = IndexBuilder('/Users/david/Desktop/G/ciku/Longman Dictionary of Contemporary English.mdx')
if query_type == "key":
    # Single lookup; the second positional argument is presumably
    # `ignorecase` -- confirm against mdx_lookup's signature.
    dict[query_word] = builder.mdx_lookup(query_word, True)
elif query_type == "wildcard":
    keys = builder.get_mdx_keys(query_word)
    count = 0
    for key in keys:
        count += 1
        dict[key] = builder.mdx_lookup(key)
        # The check runs after the store, so up to 11 keys are collected.
        if count > 10:
            break
elif query_type == "wildcardcount":
    # Only report how many keys match, then stop.
    keys = builder.get_mdx_keys(query_word)
    print keys.__len__()
    sys.exit(0)
"tags": [ ] } """ % (FrontStr, BackStr) return newnote if __name__ == '__main__': config = configparser.ConfigParser() # get absolute path fp_dir = os.path.dirname(os.path.realpath(sys.argv[0])) iniFile = os.path.join(fp_dir, "Config.ini") # print(iniFile), encoding='utf-8') mdict = config['Default']['mdxfile'] builder = IndexBuilder(mdict) # Word="abandon" Word = sys.argv[1] Meanings = builder.mdx_lookup(Word, ignorecase=True) record = Meanings[0] CardNote = NoteContent(Word, record) # print(CardNote) # t3=time.time() newnote = json.loads(CardNote, strict=False) # print(newnote) # t4=time.time() try: result = invoke('addNote', note=newnote) print(result) winsound.Beep(440, 250) # frequency, duration