Пример #1
0
def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    """Look up ``word`` in the MDX dictionary file ``dict``.

    When ``morphology`` is true and Hunspell accepts the spelling and yields
    stems, the stems are tried (in the order Hunspell returns them) instead
    of the word itself.

    :param word: word to look up; surrounding spaces and newlines are stripped
    :param dict: dictionary file handed to ``IndexBuilder`` (the name shadows
        the builtin but is kept so keyword callers keep working)
    :param morphology: whether to replace the word by its Hunspell stems
    :return: ``(matched_word, first_meaning)`` for the first candidate that
        has an entry, otherwise ``(word, None)``
    """
    word = word.strip(' \n')
    candidates = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        if hobj.spell(word):
            # Stem once and reuse the result instead of calling stem() twice.
            stems = hobj.stem(word)
            if stems:
                candidates = [b.decode() for b in stems]
                logger.debug('Get stems: {}.'.format(', '.join(candidates)))

    builder = IndexBuilder(dict)
    builder.check_build()
    for candidate in candidates:
        meanings = builder.mdx_lookup(candidate, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), candidate, dict))
        # Report the candidate that actually matched (may be a stem).
        return candidate, meanings[0]
    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
Пример #2
0
def generate_soup(word):
    """Build the BeautifulSoup instance for the dictionary entry of ``word``.

    The word is converted with ``str()`` before the lookup, and only the
    first lookup result is parsed (using the lxml parser).
    """
    entries = IndexBuilder('柯林斯高阶双解.mdx').mdx_lookup(str(word))
    return BeautifulSoup(entries[0], features='lxml')
Пример #3
0
def indexing(builder: IndexBuilder) -> int:
    """Index all examples of the lsc4 dictionary.

    TODO: performance is poor; indexing should happen while the mdx file
    is being parsed.

    :param builder: dict builder
    :return: 0 when ``USE_ES`` or ``CONNECTED_ES`` is false or the index
        cannot be created; otherwise the number of dictionary keys processed
    """
    if not USE_ES or not CONNECTED_ES:
        return 0

    # create index
    if not create_index():
        return 0

    conn = sqlite3.connect(builder.get_mdx_db())
    try:
        keys = [row[0] for row in conn.execute('SELECT key_text FROM MDX_INDEX')]
    finally:
        # Release the database handle even if the query fails.
        conn.close()

    examples = []
    for key in keys:
        # Concatenate every entry for the key, dropping CRLFs and the
        # "entry:/" prefix.
        str_content = "".join(
            c.replace("\r\n", "").replace("entry:/", "")
            for c in builder.mdx_lookup(key))
        exs = example_parse_lsc4("lsc4", key, str_content)
        if exs:
            examples.extend(exs)
            # Flush in batches so the pending list does not grow unbounded.
            if len(examples) > 100000:
                ingest("lsc4", examples)
                examples = []
    # Flush whatever is left after the last full batch.
    ingest("lsc4", examples)
    print("indexing done", len(keys))
    return len(keys)
Пример #4
0
def build_dict() -> Dict[str, IndexBuilder]:
    """Create a builder for every configured dictionary.

    The front end selects which dictionary to query, so each entry of
    ``DICTS_MAP`` whose file exists gets an ``IndexBuilder`` stored in
    ``BUILDERS`` under the same key. Entries whose file is missing are
    logged and skipped instead of being handed to ``IndexBuilder``.

    :return: the module-level ``BUILDERS`` mapping
    """
    for d, f in DICTS_MAP.items():
        if not os.path.exists(f):
            log.warning(f"the dict({d}) file:{f} doesn't exist,removed")
            continue
        BUILDERS[d] = IndexBuilder(f)
    log.info(f"all dictionaries= {DICTS_MAP}")
    return BUILDERS
Пример #5
0
    def test_builder_noindex(self):
        '''Build the dictionary without an SQL index and time the lookups.

        Existing ``mdx/Vocabulary*.db`` files are removed first so the build
        starts from scratch. Three timings are printed: the build, repeated
        exact lookups, and repeated ``dedi*`` key queries.
        '''
        for f in glob.glob("mdx/Vocabulary*.db"):
            os.remove(f)
        print("***without sql index***\n")
        start = time.time()
        bd = IndexBuilder(self._mdx_file, sql_index=False, check=True)
        print("takes {0} seconds to build without sql index\n".format(
            time.time() - start))

        start = time.time()
        word = 'dedicate'
        for _ in range(self._repeat):
            self.assertTrue(bd.mdx_lookup(word))
        print("takes {0} second to lookup {1} {2} times\n".format(
            time.time() - start, word, self._repeat))

        # Restart the clock so this figure does not include the exact-lookup
        # loop above.
        start = time.time()
        for _ in range(self._repeat):
            bd.get_mdx_keys("dedi*")
        print("takes {0} second to lookup {1} {2} times\n".format(
            time.time() - start, "dedi*", self._repeat))
Пример #6
0
 def process_data(self, config, data):
     """Collect Collins COBUILD advanced English-Chinese dictionary data.

     Opens the dictionary at ``config.cobuild``, queries it for ``data``,
     writes the invalid and valid items, saves referenced resources, and
     stores the valid items in ``self.result``.
     """
     print("Cobuild词典服务开始处理数据!")
     name = self.name
     cobuild = IndexBuilder(config.cobuild)
     # 1: fetch the dictionary HTML for every word
     found, missing = self._query_word_items(name, cobuild, data)
     # 2: record the items that could not be resolved
     print("\t检索字典:%s检索完毕!\n\t开始写入无效数据" % name)
     self.write_invalid_items(config, name + "_invalid", missing)
     print("\t开始写入正常数据")
     # 3: scan the text for referenced resources and save them
     print("\t开始写入资源数据")
     self.save_resource(config, cobuild, found, name)
     # 4: reset the resource references
     self.reset_html_res(found)
     # 5: record the scan result
     self.write_items(config, name, found)
     # expose the doc items to the caller
     self.result = found
Пример #7
0
 def _build_index(self):
     """Scan ``self._mdict_dir`` and record every loadable ``.mdx`` file.

     For each regular file with the ``.mdx`` extension an ``IndexBuilder``
     is created; files for which that raises are skipped. The resulting
     list of dicts (title, description, mdx path, whether a sibling
     ``.mdd`` file exists) is stored in ``self._config['dicts']``.
     """
     found = []
     for entry in os.listdir(self._mdict_dir):
         path = os.path.join(self._mdict_dir, entry)
         if not os.path.isfile(path):
             continue
         stem, extension = os.path.splitext(path)
         if extension != '.mdx':
             continue
         has_mdd = os.path.isfile(stem + '.mdd')
         try:
             ib = IndexBuilder(path)
         except Exception:
             # Best effort: a dictionary that cannot be opened is left out.
             continue
         found.append({
             'title': ib._title,
             'description': ib._description,
             'mdx_name': path,
             'has_mdd': has_mdd,
         })
     self._config['dicts'] = found
Пример #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy
import ydcv
import sys
sys.path.append('./mdictlib')
from mdict_query import IndexBuilder
builder = IndexBuilder('./Collins.mdx')


def has_explains(result):
    """Return True when ``result['basic']`` exists and contains 'explains'."""
    if 'basic' not in result:
        return False
    return 'explains' in result['basic']


def get_explian_from_result(result):
    """Join the basic explanations of ``result`` with ';'.

    Returns an empty string when ``has_explains(result)`` is false.
    """
    if has_explains(result):
        return ';'.join(result['basic']['explains'])
    return ''


def translate_from_yd(word):
    """Return the result of ``ydcv.lookup_word_inner`` for ``word``."""
    return ydcv.lookup_word_inner(word)


def if_in_collins(word):
    """Return True when the module-level builder has an entry for ``word``."""
    entries = builder.mdx_lookup(word)
    return len(entries) > 0

Пример #9
0
from mdict_query import IndexBuilder

# Open the dictionary file oed.mdx (the path uses a Windows-style separator).
bd = IndexBuilder("mdx\\oed.mdx")
# Query keys with the pattern "ded*" (presumably a wildcard match - confirm
# against get_mdx_keys).
keys = bd.get_mdx_keys("ded*")
# Look up the single key 'a'.
result = bd.mdx_lookup('a')
# No-op statement; the results above are not used further in this snippet.
pass
Пример #10
0
import re
from collections import namedtuple

import bs4
from bs4 import BeautifulSoup

from mdict_query import IndexBuilder

# Test: dump the first entry for "great" to an HTML file.
builder = IndexBuilder('柯林斯高阶双解.mdx')
# Write as UTF-8 explicitly: with the locale's default encoding, writing
# non-ASCII dictionary content can raise UnicodeEncodeError.
with open('./dicta.html', 'w+', encoding='utf-8') as wp:
    wp.write(builder.mdx_lookup('great')[0])

#soup = BeautifulSoup(builder.mdx_lookup('f**k')[0], features="lxml")
# TODO: 将结果写入数据库并进行初步测试

# Lightweight record types for the parts of a dictionary entry
# (presumably filled in from the parsed Collins HTML - confirm against the
# parsing code, which is not visible here).

# Headword with its star and level fields.
Word_Title = namedtuple('Word_Title', 'name star level')
# One entry body: interpretation, usage, usage note and word format.
Word_Collins_Content = namedtuple(
    'Word_Collins_Content', 'interpretation usage usage_note word_format')
# Interpretation with an `en` and a `cn` field.
Word_Interpretation = namedtuple('Word_Interpretation', 'en cn')
# Usage description together with its examples.
Word_Usage = namedtuple('Word_Usage', 'description examples')
# Usage note with an `en` and a `cn` field.
Word_Usage_Note = namedtuple('Word_Usage_Note', 'en cn')
# Word format together with its examples.
Word_Format = namedtuple('Word_Format', 'format examples')


def generate_soup(word):
    """产出所查询单词对应的 BeautifulSoup 实例"""

    builder = IndexBuilder('柯林斯高阶双解.mdx')
    the_word = builder.mdx_lookup(str(word))[0]
Пример #11
0
    return [b'<h1>WSGIServer ok!</h1>']


# Code executed in the new thread
def loop(host='', port=8000):
    """Serve ``application`` over HTTP until the server stops.

    :param host: address passed to ``make_server``; empty by default
    :param port: TCP port to listen on; 8000 by default
    """
    # Create a server on host:port whose handler is application:
    httpd = make_server(host, port, application)
    print("Serving HTTP on port %s..." % port)
    # Start listening for HTTP requests:
    httpd.serve_forever()


if __name__ == '__main__':
    # Hard-coded dictionary path; the alternative dictionary below is disabled.
    filename = "d:/Program Files (x86)/MDictPC/doc/Longman 5/Longman Dictionary of Contemporary English.mdx"
    # filename = "d:/Program Files (x86)/MDictPC/doc/Collins COBUILD (CN)/Collins COBUILD (CN).mdx"
    # Module-level builder (presumably read by the WSGI application - confirm).
    builder = IndexBuilder(filename)
    # Run the HTTP server loop in a separate thread.
    t = threading.Thread(target=loop, args=())
    t.start()

    # Disabled: start-up driven by a command-line file name.
    # if not os.path.exists(args.filename):
    #     print("Please specify a valid MDX/MDD file")
    # else:
    #     builder = IndexBuilder(args.filename)
    #     t = threading.Thread(target=loop, args=())
    #     t.start()

    # Disabled: argument parsing for the block above.
    # import argparse
    # parser = argparse.ArgumentParser()
    # parser.add_argument("filename", nargs='?', help="mdx file name")
    # args = parser.parse_args()
Пример #12
0
    return [b'<h1>WSGIServer ok!</h1>']


# Code executed in the new thread
def loop(host='', port=8000):
    """Serve ``application`` over HTTP until the server stops.

    :param host: address passed to ``make_server``; empty by default
    :param port: TCP port to listen on; 8000 by default
    """
    # Create a server on host:port whose handler is application:
    httpd = make_server(host, port, application)
    print("Serving HTTP on port %s..." % port)
    # Start listening for HTTP requests:
    httpd.serve_forever()


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()

    # No file name on the command line: ask for one with a file dialog.
    if not args.filename:
        root = tk.Tk()
        root.withdraw()
        args.filename = filedialog.askopenfilename(parent=root)

    if os.path.exists(args.filename):
        # Build the dictionary index, then serve HTTP from a separate thread.
        builder = IndexBuilder(args.filename)
        t = threading.Thread(target=loop, args=())
        t.start()
    else:
        print("Please specify a valid MDX/MDD file")
Пример #13
0
    # 创建一个服务器,IP地址为空,端口是8000,处理函数是application:
    httpd = make_server('', 8000, application)
    print("Serving HTTP on port 8000...")
    # 开始监听HTTP请求:
    httpd.serve_forever()

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs='*', help="mdx file name")
    args = parser.parse_args()

    # use GUI to select files when none were given on the command line
    if not args.filenames:
        if sys.version_info < (3, 0, 0):
            import Tkinter as tk
            import tkFileDialog as filedialog
        else:
            import tkinter as tk
            import tkinter.filedialog as filedialog
        root = tk.Tk()
        root.withdraw()
        # askopenfilename() returns a single path string, which the checks
        # below would iterate character by character. The plural variant
        # returns a sequence of paths (empty when the dialog is cancelled).
        args.filenames = list(filedialog.askopenfilenames(parent=root))

    # An empty selection is rejected too, so the server never starts
    # without a dictionary.
    if not args.filenames or not all(
            os.path.exists(filename) for filename in args.filenames):
        print("Please specify a valid MDX/MDD file")
    else:
        builders = [IndexBuilder(filename) for filename in args.filenames]
        t = threading.Thread(target=loop, args=())
        t.start()
Пример #14
0
    def _add_builder(self):
        """Attach an ``IndexBuilder`` to every configured dictionary entry.

        Each entry of ``self._config['dicts']`` gets a ``'builder'`` key
        holding an ``IndexBuilder`` created from the entry's ``'mdx_name'``.
        """
        for entry in self._config['dicts']:
            entry['builder'] = IndexBuilder(entry['mdx_name'])
Пример #15
0
def _close_files(*files):
    """Close every given file object that is still open; skip ``None``."""
    for fd in files:
        if fd is not None and not fd.closed:
            fd.close()


def main():
    """Look up every word of an input file and write text/HTML results.

    Command-line options:
      -i FILE           input word list (default ``input.txt``)
      -o FILE           output text file; must have an extension
                        (default ``output.txt``); the HTML output uses the
                        same base name with ``.html``
      -k                keep the original word order and duplicates
      -b, --break-file  write separate output files per leading letter
      -h, --help        print usage and exit

    Words for which ``extractSentence`` returns ``None`` are printed and
    appended to ``<output>.fail``.
    """
    global idxBuilder

    fileIn = 'input.txt'
    fileOut = ('output', '.txt')
    keepOriginal = False
    breakFileFlag = False
    # parse command line args
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hkbi:o:', ['help', 'break-file'])
    except getopt.GetoptError as err:
        print('Error: %s!' % err)
        sys.exit(2)
    for o, a in opts:
        if o == '-i':
            fileIn = a
        elif o == '-o':
            fileOut = os.path.splitext(a)
            if not fileOut[1]:
                print('Error: Regular name of output file required! For example: output.txt')
                # A usage error must not report success to the caller;
                # use the same status as the getopt failure above.
                sys.exit(2)
        elif o == '-k':
            keepOriginal = True
        elif o in ('-b', '--break-file'):
            breakFileFlag = True
        elif o in ('-h', '--help'):
            usage()
            sys.exit()

    idxBuilder = IndexBuilder('C:/Users/DELL/Desktop/hello/Oxford+Advanced+Learner+English-Chinese+Dictionary+8th+Edition.mdx')

    # Read the word list; the input file is closed as soon as it is read.
    # Tabs become spaces, surrounding whitespace is stripped, blank lines
    # are dropped.
    with open(fileIn, 'r') as fdIn:
        words = [w for w in (line.replace('\t', ' ').strip() for line in fdIn) if w]
    if not keepOriginal:
        # De-duplicate and sort case-insensitively.
        words = sorted(set(words), key=str.lower)
    failWords = []
    print("Total words: %d." % len(words))

    # output:
    fdOutTxt = None
    fdOutHtml = None
    rememberedLeadingLetter = None
    try:
        for word in words:
            # open right file
            if breakFileFlag:
                leadingLetter = word[0].lower()
                if rememberedLeadingLetter != leadingLetter:
                    # Leading letter changed: switch to that letter's files.
                    _close_files(fdOutTxt, fdOutHtml)
                    rememberedLeadingLetter = leadingLetter
                    fileOutTxt = fileOut[0] + '_' + leadingLetter + fileOut[1]
                    fileOutHtml = fileOut[0] + '_' + leadingLetter + '.html'
                    fdOutTxt = open(fileOutTxt, mode='a+', encoding='utf-8')
                    fdOutHtml = open(fileOutHtml, mode='a+', encoding='utf-8')
            elif fdOutTxt is None:
                # Single-file mode: open the outputs once, on the first word.
                fileOutTxt = fileOut[0] + fileOut[1]
                fileOutHtml = fileOut[0] + '.html'
                fdOutTxt = open(fileOutTxt, mode='a+', encoding='utf-8')
                fdOutHtml = open(fileOutHtml, mode='a+', encoding='utf-8')

            # lookup word
            result = extractSentence(word)
            if result is not None:
                markupResult = formatContent(result)
                if markupResult[0] is not None:
                    fdOutTxt.write(markupResult[0] + '\n')
                if markupResult[1] is not None:
                    fdOutHtml.write(markupResult[1] + '\n')
            else:
                failWords.append(word)
                print('[FAIL] ' + word)
    finally:
        # Close the outputs even when a lookup or write raises.
        _close_files(fdOutTxt, fdOutHtml)

    if failWords:
        with open(fileOut[0] + fileOut[1] + '.fail', 'a+', encoding='utf-8') as fdFailTxt:
            fdFailTxt.writelines(word + '\n' for word in failWords)
    # done
    print("result: " + fileOut[0] + fileOut[1] + ' / ' + fileOut[0] + '.html')
Пример #16
0
# /bin/python3
import requests as rq
import sys
import re


from mdict_query import IndexBuilder
builder = IndexBuilder('mdx/bing.mdx')

dic = {}
sen = ''


def get_exp(word):
    """Return the module-level builder's lookup result for ``word``."""
    return builder.mdx_lookup(word)


def split_words(passage):
    """Return the distinct words of ``passage``, lower-cased and sorted.

    Every character that is not an ASCII letter separates words; empty
    fragments produced by the split are ignored.
    """
    fragments = re.split(r'[^a-zA-Z]', passage)
    return sorted({fragment.lower() for fragment in fragments if fragment})


def split_sentences(passage):
    l = re.split(r'\. |? ', passage)
Пример #17
0
from mdict_query import IndexBuilder
from bs4 import BeautifulSoup
import re
from collections import OrderedDict

builder = IndexBuilder('oxford.mdx')


def getSentList(word):
    dict_str = builder.mdx_lookup(word)
    bs = BeautifulSoup(dict_str[0])
    l = bs.find_all('span', class_="x")
    u = []
    for span in l:
        if "." not in span.text and "?" not in span.text and "!" not in span.text:
            continue
        contents = span.contents
        if len(contents) > 3:
            continue
        chn = span.find_next_sibling().text

        if len(contents) == 1:
            eng = contents[0]
        elif len(contents) == 3:
            temp = ['', '', '']
            for i in range(3):
                if hasattr(contents[i], 'text'):
                    if contents[i].findChild():
                        continue
                    temp[i] = contents[i].text
                else:
Пример #18
0
import sys
import json
from mdict_query import IndexBuilder


#if sys.argv.__len__() < 3:
#    sys.exit(1)

#print sys.argv

# Maps each queried key to its lookup result.
# NOTE: the name shadows the builtin ``dict``; it is kept because it is a
# module-level name that code outside this view may read.
dict = {}

# NOTE(review): arguments are read from argv[2] and argv[3]; argv[1] is not
# used here - confirm against how the script is invoked.
query_type = sys.argv[2]
query_word = sys.argv[3].strip()

builder = IndexBuilder('/Users/david/Desktop/G/ciku/Longman Dictionary of Contemporary English.mdx')

if query_type == "key":
    # Single-key lookup; the second positional argument is passed as True.
    dict[query_word] = builder.mdx_lookup(query_word, True)
elif query_type == "wildcard":
    keys = builder.get_mdx_keys(query_word)
    count = 0
    for key in keys:
        count += 1
        dict[key] = builder.mdx_lookup(key)
        # Stop once more than 10 keys have been looked up.
        if count > 10:
            break
elif query_type == "wildcardcount":
    keys = builder.get_mdx_keys(query_word)
    # print() with a single argument behaves the same on Python 2 and 3.
    print(len(keys))
    sys.exit(0)
Пример #19
0
        "tags": [ ]
    }
    """ % (FrontStr, BackStr)

    return newnote


if __name__ == '__main__':
    config = configparser.ConfigParser()
    # get absolute path
    fp_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    iniFile = os.path.join(fp_dir, "Config.ini")
    #    print(iniFile)
    config.read(iniFile, encoding='utf-8')
    mdict = config['Default']['mdxfile']
    builder = IndexBuilder(mdict)
    #    Word="abandon"
    Word = sys.argv[1]
    Meanings = builder.mdx_lookup(Word, ignorecase=True)
    record = Meanings[0]

    CardNote = NoteContent(Word, record)
    #    print(CardNote)
    #    t3=time.time()
    newnote = json.loads(CardNote, strict=False)
    #    print(newnote)
    #    t4=time.time()
    try:
        result = invoke('addNote', note=newnote)
        print(result)
        winsound.Beep(440, 250)  # frequency, duration