Пример #1
0
    def _make_mdx_index(self, db_name):
        """Rebuild the SQLite index database for ``self._mdx_file``.

        Any existing file at ``db_name`` is deleted first.  The new database
        gets an ``MDX_INDEX`` table with one row per item of the
        ``'index_dict_list'`` returned by ``MDX.get_index`` and a ``META``
        key/value table holding the encoding, stylesheet, title, description
        and ``version`` (a name defined outside this method).  When
        ``self._sql_index`` is truthy an SQL index on ``key_text`` is added.

        Side effects: sets ``self._mdx_db`` and, once the database has been
        committed and closed, ``self._encoding``, ``self._stylesheet``
        (decoded from the JSON text in the metadata), ``self._title`` and
        ``self._description``.

        Args:
            db_name: Path of the SQLite file to (re)create.

        Raises:
            sqlite3.Error: If creating or filling the tables fails.  The
                connection is closed before the error propagates.
        """
        if os.path.exists(db_name):
            os.remove(db_name)
        mdx = MDX(self._mdx_file)
        self._mdx_db = db_name
        returned_index = mdx.get_index(check_block=self._check)
        index_list = returned_index['index_dict_list']
        meta = returned_index['meta']

        conn = sqlite3.connect(db_name)
        # try/finally guarantees the connection is released even when one of
        # the statements below raises; nothing is committed in that case.
        try:
            c = conn.cursor()
            c.execute(''' CREATE TABLE MDX_INDEX
               (key_text text not null,
                file_pos integer,
                compressed_size integer,
                decompressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )''')

            tuple_list = [
                (item['key_text'], item['file_pos'], item['compressed_size'],
                 item['decompressed_size'], item['record_block_type'],
                 item['record_start'], item['record_end'], item['offset'])
                for item in index_list
            ]
            c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                          tuple_list)

            # build the metadata table
            c.execute('''CREATE TABLE META
               (key text,
                value text
                )''')
            c.executemany('INSERT INTO META VALUES (?,?)',
                          [('encoding', meta['encoding']),
                           ('stylesheet', meta['stylesheet']),
                           ('title', meta['title']),
                           ('description', meta['description']),
                           ('version', version)])

            if self._sql_index:
                c.execute('''
                CREATE INDEX key_index ON MDX_INDEX (key_text)
                ''')

            conn.commit()
        finally:
            conn.close()

        # set class members only after the database was written successfully
        self._encoding = meta['encoding']
        self._stylesheet = json.loads(meta['stylesheet'])
        self._title = meta['title']
        self._description = meta['description']
Пример #2
0
    def _make_mdx_index(self, db_name):
        """Create the SQLite index database for ``self._mdx_file``.

        Writes an ``MDX_INDEX`` table with one row per item of the
        ``'index_dict_list'`` returned by ``MDX.get_index`` and a ``META``
        key/value table holding the encoding, stylesheet, title and
        description.  An existing file at ``db_name`` is not removed here, so
        ``CREATE TABLE`` fails if the tables are already present.

        Side effects: sets ``self._mdx_db`` and, once the database has been
        committed and closed, ``self._encoding``, ``self._stylesheet``
        (decoded from the JSON text in the metadata), ``self._title`` and
        ``self._description``.

        Args:
            db_name: Path of the SQLite file to create.

        Raises:
            sqlite3.Error: If creating or filling the tables fails.  The
                connection is closed before the error propagates.
        """
        mdx = MDX(self._mdx_file)
        self._mdx_db = db_name
        # Ask for the index once and reuse both parts of the result instead
        # of calling get_index() separately for the entries and the metadata.
        returned_index = mdx.get_index()
        index_list = returned_index['index_dict_list']
        meta = returned_index['meta']

        conn = sqlite3.connect(db_name)
        # try/finally guarantees the connection is released even when one of
        # the statements below raises; nothing is committed in that case.
        try:
            c = conn.cursor()
            c.execute(''' CREATE TABLE MDX_INDEX
               (key_text text,
                file_pos integer,
                compressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )''')

            tuple_list = [
                (item['key_text'], item['file_pos'],
                 item['compressed_size'], item['record_block_type'],
                 item['record_start'], item['record_end'], item['offset'])
                for item in index_list
            ]
            c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?)',
                          tuple_list)

            # build the metadata table
            c.execute('''CREATE TABLE META
               (key text,
                value text
                )''')
            c.executemany('INSERT INTO META VALUES (?,?)',
                          [('encoding', meta['encoding']),
                           ('stylesheet', meta['stylesheet']),
                           ('title', meta['title']),
                           ('description', meta['description'])])

            conn.commit()
        finally:
            conn.close()

        # set class members only after the database was written successfully
        self._encoding = meta['encoding']
        self._stylesheet = json.loads(meta['stylesheet'])
        self._title = meta['title']
        self._description = meta['description']
Пример #3
0
 def _():
     """Load the dictionary header and cache its metadata on ``self``.

     Marks the header as built, opens the MDX file with ``only_header=True``
     and copies encoding, stylesheet (parsed from JSON), title and
     description from the header metadata onto the enclosing object.
     """
     self.header_build_flag = True
     header = MDX(self._mdx_file, only_header=True).meta
     self._encoding = header['encoding']
     self._stylesheet = json.loads(header['stylesheet'])
     self._title = header['title']
     self._description = header['description']
Пример #4
0
    def _make_mdx_index(self):
        """Rebuild the SQLite index database at ``self._mdx_db``.

        Any existing file at ``self._mdx_db`` is deleted first.  The new
        database gets an ``MDX_INDEX`` table with one row per item returned
        by ``MDX.get_index`` and a ``META`` key/value table filled from
        ``self.meta`` (the stylesheet is stored as JSON text) plus
        ``version`` (a name defined outside this method).  When
        ``self._sql_index`` is truthy an SQL index on ``key_text`` is added.

        Raises:
            sqlite3.Error: If creating or filling the tables fails.  The
                connection is closed before the error propagates.
        """
        if os.path.exists(self._mdx_db):
            os.remove(self._mdx_db)
        mdx = MDX(self._mdx_file, only_header=False)
        index_list = mdx.get_index(check_block=self._check)

        conn = sqlite3.connect(self._mdx_db)
        # try/finally guarantees the connection is released even when one of
        # the statements below raises; nothing is committed in that case.
        try:
            c = conn.cursor()
            c.execute(''' CREATE TABLE MDX_INDEX
               (key_text text not null,
                file_pos integer,
                compressed_size integer,
                decompressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )''')

            tuple_list = [
                (item['key_text'], item['file_pos'], item['compressed_size'],
                 item['decompressed_size'], item['record_block_type'],
                 item['record_start'], item['record_end'], item['offset'])
                for item in index_list
            ]
            c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                          tuple_list)

            # build the metadata table
            c.execute('''CREATE TABLE META
               (key text,
                value text
                )''')
            c.executemany('INSERT INTO META VALUES (?,?)',
                          [('encoding', self.meta['encoding']),
                           ('stylesheet', json.dumps(self.meta['stylesheet'])),
                           ('title', self.meta['title']),
                           ('description', self.meta['description']),
                           ('version', version)])

            if self._sql_index:
                c.execute('''
                CREATE INDEX key_index ON MDX_INDEX (key_text)
                ''')

            conn.commit()
        finally:
            conn.close()
Пример #5
0
    def make_sqlite(self):
        """Export the whole dictionary into ``<mdx file>.sqlite.db``.

        Any existing export file is deleted first.  The database gets an
        ``MDX_DICT`` table with one ``(key, value)`` row per dictionary
        entry, where the value has the spans matched by the cleanup pattern
        removed, and a ``META`` key/value table holding the encoding,
        stylesheet, title, description and ``version`` (a name defined
        outside this method).  When ``self._sql_index`` is truthy an SQL
        index on ``key`` is added.

        Raises:
            sqlite3.Error: If creating or filling the tables fails.  The
                connection is closed before the error propagates.
        """
        sqlite_file = self._mdx_file + '.sqlite.db'
        if os.path.exists(sqlite_file):
            os.remove(sqlite_file)
        mdx = MDX(self._mdx_file)

        # remove '(pīnyīn)', remove `1`:
        # the pattern matches a backtick-wrapped number, or an optionally
        # parenthesised run of a-z / apostrophe / tone-marked letters that
        # contains at least one tone-marked letter.
        aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
        pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?" % (aeiou, aeiou,
                                                                 aeiou)
        cleaner = re.compile(pattern)

        conn = sqlite3.connect(sqlite_file)
        # try/finally guarantees the connection is released even when one of
        # the statements below raises; nothing is committed in that case.
        try:
            cursor = conn.cursor()
            cursor.execute(''' CREATE TABLE MDX_DICT
                (key text not null,
                value text
                )''')

            # Feed rows lazily so the full dictionary text is never held in
            # one in-memory list; executemany accepts any iterable of rows.
            rows = ((key.decode(), cleaner.sub('', value.decode()))
                    for key, value in mdx.items())
            cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', rows)

            returned_index = mdx.get_index(check_block=self._check)
            meta = returned_index['meta']
            cursor.execute('''CREATE TABLE META (key text, value text)''')

            cursor.executemany('INSERT INTO META VALUES (?,?)',
                               [('encoding', meta['encoding']),
                                ('stylesheet', meta['stylesheet']),
                                ('title', meta['title']),
                                ('description', meta['description']),
                                ('version', version)])

            if self._sql_index:
                cursor.execute('''
                CREATE INDEX key_index ON MDX_DICT (key)
                ''')
            conn.commit()
        finally:
            conn.close()
Пример #6
0
def collins(filename):
    """Print a text rendering of the first 12 entries of an MDX dictionary.

    For each entry the key is printed, the value is parsed as HTML and every
    element with class ``C1_explanation_item`` is rendered.  An item that
    has ``C1_explain`` links is printed as a "see also" line.  Any other item
    prints its label line, its definition line (plain text interleaved with
    the inline words, prefixed by the ``C1_word_gram`` text) and, for each
    ``ul/li`` example, the first paragraph rebuilt the same way followed by
    the text of the second paragraph.

    Args:
        filename: Path handed to ``MDX``.
    """

    def weave(texts, words):
        # Alternate texts[k] and words[k] for every word.  Indexing (rather
        # than zip) is deliberate: a too-short ``texts`` still raises
        # IndexError.
        woven = []
        for pos, word in enumerate(words):
            woven.extend((texts[pos], word))
        return woven

    dictionary = MDX(filename)
    entries = dictionary.items()
    for _ in range(12):
        entry = next(entries)
        print(entry[0])
        tree = etree.HTML(entry[1])
        for block in tree.xpath(r'//*[@class="C1_explanation_item"]'):
            see_also = block.xpath(r'div/*/a[@class="C1_explain"]/text()')
            if see_also:
                print('see also', see_also)
                continue

            # Label line: explanation label followed by the blue text.
            labels = block.xpath(
                r'div/span[@class="C1_explanation_label"]/text()')
            labels = labels + block.xpath(
                r'div/span[@class="C1_text_blue"]/text()')
            print(''.join(labels))

            # Definition line: text nodes interleaved with inline words, then
            # the trailing text node, all prefixed by the grammar tag.
            plain = block.xpath(r'div/text()')
            inline = block.xpath(r'div/span[@class="C1_inline_word"]/text()')
            definition = weave(plain, inline)
            definition.append(plain.pop())
            grammar = block.xpath(r'div/span[@class="C1_word_gram"]/text()')
            print(''.join(grammar + definition))

            for example in block.xpath(r'ul/li'):
                plain = example.xpath(r'p[1]/text()')
                inline = example.xpath(
                    r'p[1]/span[@class="C1_text_blue"]/text()')
                first_paragraph = weave(plain, inline)
                print(plain)
                first_paragraph.append(plain.pop())
                print(''.join(first_paragraph))
                print(''.join(example.xpath(r'p[2]/text()')))
Пример #7
0
Файл: test.py Проект: Duum/bkd
# -*- coding: utf-8 -*-
from collections import Counter
import re
import logging
from bs4 import BeautifulSoup
from readmdict import MDX, MDD

# Relative data directory; not referenced in the visible part of this script.
dirstring = "资料来源整理/资料来源整理/"
# Open the dictionary file (path is relative to the working directory).
mdx = MDX("21世纪大英汉词典.mdx")
# r: a lazy "`1`...`2`" span, followed by an optional [...] group, optional
# whitespace and an optional <br> tag.
r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# r1: a single digit wrapped in backticks, e.g. `3`.
r1 = r"`\d`"
#logging.basicConfig()
# Logger named 'root' with a file handler writing to test.log.  Only the
# handler's level is set to INFO; the logger's own level is not set here.
logger = logging.getLogger('root')
fh = logging.FileHandler('test.log')
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# Compile both patterns once, before the loop over the dictionary entries.
pattern = re.compile(r)
pattern1 = re.compile(r1)
# Accumulators; how they are filled is not visible in this excerpt.
dic = {}
valuelist = []
for key, value in mdx.items():
    try:
        soup = BeautifulSoup(value)

        #print soup.prettify()
        phone = soup.find("span", class_="phone")
        tree = soup.find_all("span", class_="trs")
        # print key
        synonym = soup.find("span", class_="syno")
        #同义词
        antonym = soup.find("span", class_="anto")
Пример #8
0
Файл: test.py Проект: Duum/bkd
# -*- coding: utf-8 -*-
from collections import Counter
import re
import logging
from bs4 import BeautifulSoup
from readmdict import MDX,MDD
# Relative data directory; not referenced in the visible part of this script.
dirstring="资料来源整理/资料来源整理/"
# Open the dictionary file (path is relative to the working directory).
mdx=MDX("21世纪大英汉词典.mdx")
# r: a lazy "`1`...`2`" span, followed by an optional [...] group, optional
# whitespace and an optional <br> tag.
r=r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# r1: a single digit wrapped in backticks, e.g. `3`.
r1=r"`\d`"
#logging.basicConfig()
# Logger named 'root' with a file handler writing to test.log.  Only the
# handler's level is set to INFO; the logger's own level is not set here.
logger = logging.getLogger('root')
fh = logging.FileHandler('test.log')
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# Compile both patterns once, before the loop over the dictionary entries.
pattern=re.compile(r)
pattern1=re.compile(r1)
# Accumulators; how they are filled is not visible in this excerpt.
dic={}
valuelist=[]
for key,value in mdx.items():
    try:
       soup=BeautifulSoup(value)

          #print soup.prettify()
       phone= soup.find("span",class_="phone")
       tree=soup.find_all("span",class_="trs")
      # print key
       synonym=soup.find("span",class_="syno")
       #同义词
       antonym=soup.find("span",class_="anto")
       #反义词
Пример #9
0
import logging
from collections import Counter
import re
import json
import psycopg2
# Connect to PostgreSQL as soon as the script runs.  User and password appear
# as "******" in this listing.
conn = psycopg2.connect(database="word",
                        user="******",
                        password="******",
                        host="rm-2zeg1e0w5v7w5v7y8o.pg.rds.aliyuncs.com",
                        port="3432")
# NOTE(review): neither conn nor cur is used in the visible part of the script.
cur = conn.cursor()
logging.basicConfig()
from bs4 import BeautifulSoup
from readmdict import MDX, MDD
# Relative data directory; not referenced in the visible part of this script.
dirstring = "资料来源整理/资料来源整理/"
# Open the dictionary file (path is relative to the working directory).
mdx = MDX("牛津英汉简明词典.mdx")
# r: a lazy "`1`...`2`" span, followed by an optional [...] group, optional
# whitespace and an optional <br> tag.
r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# r1: a single digit wrapped in backticks, e.g. `3`.
r1 = r"`\d`"
pattern = re.compile(r)
pattern1 = re.compile(r1)
dic = {}
# For every entry: drop the spans matched by `pattern`, split the rest on
# "<br>", strip each piece, remove the backtick-digit markers and print it.
for key, value in mdx.items():
    # valuelist is rebuilt from scratch for each entry.
    valuelist = []
    match = pattern.sub("", value)
    #print match
    items = match.split("<br>")
    items = [item.strip() for item in items]
    for item in items:
        item = pattern1.sub("", item)
        valuelist.append(item)
        # Python 2 print statement: this script does not run under Python 3.
        print item
Пример #10
0
#!/bin/python
# -*- coding: utf-8 -*-

from readmdict import MDX#or MDD  #You need to install lzo Library for gcc and python-lzo,readmdict,ripemd128 and pureSalsa20 Library for python
import csv
import re

# Destination CSV path; 'XXXX.csv' is a placeholder to be replaced by the user.
csvaddress='XXXX.csv' #Your CSV file name to export

def remove_html_tags(text):
    """Return *text* with every ``<...>`` span deleted.

    A span runs from a ``<`` to the nearest following ``>`` on the same
    line; spans that would have to cross a newline are left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

mdx = MDX('XXXX.mdx')  # Open the original MDX dictionary file
items = mdx.items()

# Visit every entry exactly once.  Only the for loop advances the iterator:
# pulling extra items inside the loop body would make the loop skip entries.
lst = []
for entry in items:
    row = list(entry)
    row[1] = remove_html_tags(row[1])  # column 1 holds the entry's HTML body
    lst.append(row)

# Write the CSV once, after all entries are collected, so the file is
# produced regardless of how many entries the dictionary contains.
with open(csvaddress, 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(lst)
Пример #11
0
             print(f"[x] {path} 已存在!")
             continue
         else:
             print(f"[ ] {path}", end='\r')
             path_tuple = path.split('\\')
             path_str = str(os.path.join(
                 "pages", *path_tuple[:-1])) if len(path_tuple) > 1 else ''
             if path_str != '' and not os.path.exists(path_str):
                 os.mkdir(path_str)
             src_file = open(os.path.join("pages", path), 'wb')
             src_file.write(data)
             src_file.flush()
             src_file.close()
             print("[v]")
 elif sys.argv[1].split('.')[-1] == "mdx":
     mdx_file = MDX(sys.argv[1], encoding='utf-8')
     index_info = {}
     print("正在提取词典页面文件...")
     for mdx_item in mdx_file.items():
         item_name = base64.urlsafe_b64encode(
             mdx_item[0]).decode() + '.html'
         item_content = mdx_item[1].decode()
         page_path = os.path.join("pages", item_name)
         if os.path.exists(page_path):
             print(f"[x] {page_path} 已存在!")
         else:
             print(f"[ ] {page_path}", end='\r')
             if "@@@LINK=" in item_content:
                 link = item_content[8:]
                 item_content = "<a href=\"{}\">{}</a>".format(
                     base64.urlsafe_b64encode(link.encode()).decode(), link)