Example #1
0
    def make_sqlite(self):
        """Export the MDX dictionary into an SQLite database.

        Creates ``<mdx_file>.sqlite.db`` next to the source file
        (removing any previous database), fills table ``MDX_DICT`` with
        cleaned key/value pairs, stores dictionary metadata in ``META``,
        and optionally indexes the key column.
        """
        db_path = self._mdx_file + '.sqlite.db'
        if os.path.exists(db_path):
            os.remove(db_path)

        dictionary = MDX(self._mdx_file)
        connection = sqlite3.connect(db_path)
        cur = connection.cursor()
        cur.execute(''' CREATE TABLE MDX_DICT
                (key text not null,
                value text
                )''')

        # Strip pinyin annotations like '(pīnyīn)' and backquoted digit
        # markers like `1` from every definition before inserting.
        aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
        pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?" % (
            aeiou, aeiou, aeiou)
        rows = []
        for word, definition in dictionary.items():
            cleaned = re.sub(pattern, '', definition.decode())
            rows.append((word.decode(), cleaned))

        cur.executemany('INSERT INTO MDX_DICT VALUES (?,?)', rows)

        # Persist the dictionary's own metadata plus the tool version.
        index = dictionary.get_index(check_block=self._check)
        meta = index['meta']
        cur.execute('''CREATE TABLE META (key text, value text)''')
        meta_rows = [('encoding', meta['encoding']),
                     ('stylesheet', meta['stylesheet']),
                     ('title', meta['title']),
                     ('description', meta['description']),
                     ('version', version)]
        cur.executemany('INSERT INTO META VALUES (?,?)', meta_rows)

        if self._sql_index:
            cur.execute('''
                CREATE INDEX key_index ON MDX_DICT (key)
                ''')
        connection.commit()
        connection.close()
Example #2
0
def collins(filename):
    """Print explanations and example sentences for the first 12 entries
    of a Collins MDX dictionary.

    For each entry, prints the headword, then for every explanation item
    either a "see also" cross reference or the Chinese label, the English
    gloss (plain text interleaved with highlighted words), and each
    example sentence with its translation.
    """
    mdx = MDX(filename)
    items = mdx.items()
    # Fix: the original reused loop variable `i` in three nested loops;
    # `_` makes clear the outer index is unused.
    for _ in range(12):
        headword, html = next(items)
        print(headword)
        dom = etree.HTML(html)
        for explain in dom.xpath(r'//*[@class="C1_explanation_item"]'):
            seealso = explain.xpath(r'div/*/a[@class="C1_explain"]/text()')
            if seealso:
                print('see also', seealso)
                continue
            cn = (explain.xpath(r'div/span[@class="C1_explanation_label"]/text()')
                  + explain.xpath(r'div/span[@class="C1_text_blue"]/text()'))
            print(''.join(cn))
            entemp = explain.xpath(r'div/text()')
            enword = explain.xpath(r'div/span[@class="C1_inline_word"]/text()')
            # Interleave plain-text segments with highlighted words, then
            # close with the trailing text segment (guarded so an empty
            # segment list no longer raises IndexError).
            en = [part for pair in zip(entemp, enword) for part in pair]
            if entemp:
                en.append(entemp[-1])
            en = explain.xpath(r'div/span[@class="C1_word_gram"]/text()') + en
            print(''.join(en))
            for sen in explain.xpath(r'ul/li'):
                segments = sen.xpath(r'p[1]/text()')
                words = sen.xpath(r'p[1]/span[@class="C1_text_blue"]/text()')
                sentence_en = [part for pair in zip(segments, words)
                               for part in pair]
                # Fix: removed leftover debug `print(entemp)`.
                if segments:
                    sentence_en.append(segments[-1])
                print(''.join(sentence_en))
                sentence_cn = sen.xpath(r'p[2]/text()')
                print(''.join(sentence_cn))
Example #3
0
File: test.py  Project: Duum/bkd
from readmdict import MDX, MDD

# NOTE(review): truncated excerpt — `logging`, `re` and `BeautifulSoup` are
# used below but their imports lie outside this view; the try-block at the
# end has no matching except clause here.
dirstring = "资料来源整理/资料来源整理/"
mdx = MDX("21世纪大英汉词典.mdx")
# Regex spanning from marker `1` to `2`, an optional [...] tag and <br>.
r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# Regex matching any single backquoted digit marker such as `3`.
r1 = r"`\d`"
#logging.basicConfig()
logger = logging.getLogger('root')
fh = logging.FileHandler('test.log')
fh.setLevel(logging.INFO)
logger.addHandler(fh)
pattern = re.compile(r)
pattern1 = re.compile(r1)
dic = {}
valuelist = []
for key, value in mdx.items():
    try:
        soup = BeautifulSoup(value)

        #print soup.prettify()
        # Pick out the pronunciation and translation spans of the entry.
        phone = soup.find("span", class_="phone")
        tree = soup.find_all("span", class_="trs")
        # print key
        synonym = soup.find("span", class_="syno")
        # synonyms
        antonym = soup.find("span", class_="anto")
        # antonyms

        if key == "abandon":
            #print soup.prettify()
            #print tree.text
Example #4
0
File: test.py  Project: Duum/bkd
from bs4 import BeautifulSoup
from readmdict import MDX,MDD
# NOTE(review): truncated excerpt — `logging` and `re` are used below but
# imported outside this view; indentation inside the try-block is uneven
# and the try has no matching except clause here.
dirstring="资料来源整理/资料来源整理/"
mdx=MDX("21世纪大英汉词典.mdx")
# Regex spanning from marker `1` to `2`, an optional [...] tag and <br>.
r=r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# Regex matching any single backquoted digit marker such as `3`.
r1=r"`\d`"
#logging.basicConfig()
logger = logging.getLogger('root')
fh = logging.FileHandler('test.log')
fh.setLevel(logging.INFO)
logger.addHandler(fh)
pattern=re.compile(r)
pattern1=re.compile(r1)
dic={}
valuelist=[]
for key,value in mdx.items():
    try:
       soup=BeautifulSoup(value)

          #print soup.prettify()
       # Pick out the pronunciation and translation spans of the entry.
       phone= soup.find("span",class_="phone")
       tree=soup.find_all("span",class_="trs")
      # print key
       synonym=soup.find("span",class_="syno")
       # synonyms
       antonym=soup.find("span",class_="anto")
       # antonyms

       if key=="abandon":
           #print soup.prettify()
           #print tree.text
#!/bin/python
# -*- coding: utf-8 -*-

from readmdict import MDX#or MDD  #You need to install lzo Library for gcc and python-lzo,readmdict,ripemd128 and pureSalsa20 Library for python
import csv
import re

csvaddress='XXXX.csv' #Your CSV file name to export

def remove_html_tags(text):
    """Strip every HTML tag from *text* and return the remaining string."""
    return re.compile('<.*?>').sub('', text)

mdx = MDX('XXXX.mdx')  # Open the original MDX dictionary file.

# Collect every (headword, definition) pair with HTML markup stripped.
# Fixes: the original used Python-2 `generator.next()` (AttributeError on
# Python 3), created a second `mdx.items()` iterator that duplicated the
# first entry, and relied on a bare `except:` — which hides real errors —
# to detect the end of iteration and write the CSV from the handler.
# NOTE(review): readmdict may yield bytes — decode at this point if so.
lst = []
for key, value in mdx.items():
    lst.append([key, remove_html_tags(value)])

# Write all rows once, after the whole dictionary has been read.
with open(csvaddress, 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(lst)
Example #6
0
             # Tail of the preceding branch (its start is outside this
             # excerpt): write one extracted resource to disk under pages/.
             print(f"[ ] {path}", end='\r')
             path_tuple = path.split('\\')
             path_str = str(os.path.join(
                 "pages", *path_tuple[:-1])) if len(path_tuple) > 1 else ''
             if path_str != '' and not os.path.exists(path_str):
                 os.mkdir(path_str)
             src_file = open(os.path.join("pages", path), 'wb')
             src_file.write(data)
             src_file.flush()
             src_file.close()
             print("[v]")
 # An .mdx input: extract every dictionary page into pages/ as HTML files.
 elif sys.argv[1].split('.')[-1] == "mdx":
     mdx_file = MDX(sys.argv[1], encoding='utf-8')
     index_info = {}
     print("正在提取词典页面文件...")
     for mdx_item in mdx_file.items():
         # The headword is base64-url-encoded so any entry name maps to a
         # filesystem-safe filename.
         item_name = base64.urlsafe_b64encode(
             mdx_item[0]).decode() + '.html'
         item_content = mdx_item[1].decode()
         page_path = os.path.join("pages", item_name)
         if os.path.exists(page_path):
             print(f"[x] {page_path} 已存在!")
         else:
             print(f"[ ] {page_path}", end='\r')
             # "@@@LINK=" marks a redirect entry; emit an <a> pointing at
             # the encoded target page instead of real content.
             if "@@@LINK=" in item_content:
                 link = item_content[8:]
                 item_content = "<a href=\"{}\">{}</a>".format(
                     base64.urlsafe_b64encode(link.encode()).decode(), link)
             else:
                 index_info.update({mdx_item[0].decode(): item_name})
                 # NOTE(review): excerpt ends here — the handle opened below
                 # is written and closed outside this view.
                 page_file = open(page_path, 'w', encoding='utf-8')