Example #1
    def write(self, file_path_absolute, compress=True):
        # get cover path
        cover_path_absolute = self.args.cover_path
        if cover_path_absolute is not None:
            cover_path_absolute = os.path.abspath(cover_path_absolute)

        # get custom css path
        custom_css_path_absolute = self.args.apply_css
        if custom_css_path_absolute is not None:
            custom_css_path_absolute = os.path.abspath(
                custom_css_path_absolute)

        # create new tmp directory and cd there
        self.root_directory_path = create_temp_directory()
        cwd = os.getcwd()
        os.chdir(self.root_directory_path)
        os.makedirs(u"META-INF")
        os.makedirs(u"OEBPS")

        # add mimetype and container.xml
        if self.ebook_format in [self.EPUB2]:  # add EPUB3 here
            self.add_file(u"mimetype",
                          self.MIMETYPE_CONTENTS,
                          mode=zipfile.ZIP_STORED)
            self.add_file(u"META-INF/container.xml",
                          self.CONTAINER_XML_CONTENTS)

        # add cover
        self.write_cover(cover_path_absolute)

        # write CSS
        self.write_css(custom_css_path_absolute)

        # write index
        if self.args.include_index_page:
            self.write_index()

        # write groups
        self.write_groups()

        # write ncx
        if self.ebook_format in [self.EPUB2]:  # add EPUB3 here
            self.write_ncx()

        # write opf
        self.write_opf()

        # compress
        if compress:
            output_file_obj = zipfile.ZipFile(file_path_absolute,
                                              "w",
                                              compression=zipfile.ZIP_DEFLATED)
            for file_to_compress in self.files:
                output_file_obj.write(file_to_compress["path"],
                                      compress_type=file_to_compress["mode"])
            output_file_obj.close()

        # return to previous cwd
        os.chdir(cwd)
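
The examples in this collection all call a handful of project helpers (create_temp_directory, delete_directory, print_debug, print_info, print_error) whose definitions are not shown. A minimal sketch of what they might look like, assuming plain tempfile/shutil semantics; the project's real versions may differ:

import shutil
import sys
import tempfile


def create_temp_directory():
    # create a fresh private temp directory and return its absolute path
    return tempfile.mkdtemp()


def delete_directory(path):
    # remove the directory tree, ignoring files that are already gone
    shutil.rmtree(path, ignore_errors=True)


def print_debug(message, debug=False):
    # emit debug messages only when debugging was requested
    if debug:
        print("[DEBU] %s" % message)


def print_info(message):
    print("[INFO] %s" % message)


def print_error(message):
    print("[ERRO] %s" % message, file=sys.stderr)
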
Example #2
    def write(self, file_path_absolute, compress=True):
        # get cover path
        cover_path_absolute = self.args.cover_path
        if cover_path_absolute is not None:
            cover_path_absolute = os.path.abspath(cover_path_absolute)

        # get custom css path
        custom_css_path_absolute = self.args.apply_css
        if custom_css_path_absolute is not None:
            custom_css_path_absolute = os.path.abspath(custom_css_path_absolute)

        # create new tmp directory and cd there
        self.root_directory_path = create_temp_directory()
        cwd = os.getcwd()
        os.chdir(self.root_directory_path)
        os.makedirs(u"META-INF")
        os.makedirs(u"OEBPS")

        # add mimetype and container.xml
        if self.ebook_format in [self.EPUB2]:  # add EPUB3 here
            self.add_file(u"mimetype", self.MIMETYPE_CONTENTS, mode=zipfile.ZIP_STORED)
            self.add_file(u"META-INF/container.xml", self.CONTAINER_XML_CONTENTS)

        # add cover
        self.write_cover(cover_path_absolute)

        # write CSS
        self.write_css(custom_css_path_absolute)

        # write index
        if self.args.include_index_page:
            self.write_index()

        # write groups
        self.write_groups()

        # write ncx
        if self.ebook_format in [self.EPUB2]:  # add EPUB3 here
            self.write_ncx()

        # write opf
        self.write_opf()

        # compress
        if compress:
            output_file_obj = zipfile.ZipFile(file_path_absolute, "w", compression=zipfile.ZIP_DEFLATED)
            for file_to_compress in self.files:
                output_file_obj.write(file_to_compress["path"], compress_type=file_to_compress["mode"])
            output_file_obj.close()

        # return to previous cwd
        os.chdir(cwd)
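
Note the ZIP_STORED mode used for the mimetype entry above: the EPUB Open Container Format requires mimetype to be the first entry in the archive and to be stored uncompressed, so readers can sniff the media type at a fixed offset. A standalone sketch of that container skeleton (the book.epub name and the OPF path are illustrative):

import zipfile

# "mimetype" must be written first and uncompressed (ZIP_STORED);
# everything else in the container may be deflated
with zipfile.ZipFile("book.epub", "w") as epub:
    epub.writestr("mimetype", "application/epub+zip",
                  compress_type=zipfile.ZIP_STORED)
    epub.writestr(
        "META-INF/container.xml",
        "<?xml version=\"1.0\"?>"
        "<container version=\"1.0\""
        " xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">"
        "<rootfiles><rootfile full-path=\"OEBPS/content.opf\""
        " media-type=\"application/oebps-package+xml\"/></rootfiles>"
        "</container>",
        compress_type=zipfile.ZIP_DEFLATED)
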
Example #3
    def read_single_dict(dictionary, args, single_dict):
        # create tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            zip_file_path = single_dict[0]
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
            for entry in zip_file_obj.namelist():
                if entry.endswith(".dict.idx"):
                    zip_entry = zip_file_obj.open(entry)
                    idx_file_obj = io.open(idx_file_path, "wb")
                    idx_file_obj.write(zip_entry.read())
                    idx_file_obj.close()
                    zip_entry.close()
                elif entry.endswith(".dict"):
                    zip_entry = zip_file_obj.open(entry)
                    dict_file_obj = io.open(dict_file_path, "wb")
                    dict_file_obj.write(zip_entry.read())
                    dict_file_obj.close()
                    zip_entry.close()
            zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        for entry in zip_file_obj.namelist():
            if not entry.endswith("/"):
                zip_entry = zip_file_obj.open(entry)
                entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                entry_file_obj = io.open(entry_file_path, "wb")
                entry_file_obj.write(zip_entry.read())
                entry_file_obj.close()
                zip_entry.close()
        zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        sql_cursor = sql_connection.cursor()
        sql_cursor.execute("select * from T_DictIndex")
        index_data = sql_cursor.fetchall()
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index not in chunk_index_to_entries:
                chunk_index_to_entries[chunk_index] = []
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries[chunk_index].append([headword, offset, size])
        sql_cursor.close()
        sql_connection.close()
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            print_debug("  Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            chunk_file_obj = io.open(chunk_file_path, "rb")
            for entry in chunk_index_to_entries[chunk_index]:
                headword = entry[0]
                offset = entry[1]
                size = entry[2]
                chunk_file_obj.seek(offset)
                definition_bytes = chunk_file_obj.read(size)
                definition_unicode = definition_bytes.decode(args.input_file_encoding)
                dictionary.add_entry(headword=headword, definition=definition_unicode)
            chunk_file_obj.close()
            print_debug("  Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
        return True
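
The reader above assumes a SQLite index whose T_DictIndex rows carry, in order, a row id, the headword, the byte offset and size of the definition, and the number of the c_* chunk file that holds it. A toy sketch with that assumed layout; the column names below are hypothetical, since only the column order matters to the reader:

import sqlite3

# hypothetical schema mirroring the (id, headword, offset, size,
# chunk_index) order the reader unpacks from each row
connection = sqlite3.connect(":memory:")
cursor = connection.cursor()
cursor.execute("create table T_DictIndex "
               "(F_Id integer, F_Word text, F_Offset integer, "
               "F_Size integer, F_ChunkNum integer)")
cursor.execute("insert into T_DictIndex values (?,?,?,?,?)",
               (0, u"apple", 0, 12, 1))
cursor.execute("select * from T_DictIndex")
for _row_id, headword, offset, size, chunk_index in cursor.fetchall():
    print(headword, offset, size, chunk_index)  # apple 0 12 1
connection.close()
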
Example #4
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # get absolute path for collation function file
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]

    # copy empty.idx into tmp_path
    idx_file_path = base + u".dict.idx"
    dict_file_path = base + u".dict"
    copy_file(EMPTY_FILE_PATH, idx_file_path)

    # open index
    sql_connection = sqlite3.connect(idx_file_path)

    # install collation in the index
    collation_function = collate_function_default
    if bookeen_collation_function_path is not None:
        try:
            collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
            print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
        except Exception:
            print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
    sql_connection.create_collation("IcuNoCase", collation_function)
    sql_connection.text_factory = str

    # get a cursor and delete any data from the index file
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("delete from T_DictIndex")

    # write c_* files
    # each c_* file has CHUNK_SIZE < size <= (CHUNK_SIZE * 2) bytes (tentatively)
    print_debug("Writing c_* files...", args.debug)
    files_to_compress = []
    current_offset = 0
    chunk_index = 1
    chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
    files_to_compress.append(chunk_file_path)
    chunk_file_obj = io.open(chunk_file_path, "wb")
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        chunk_file_obj.write(definition_bytes)
        # insert headword into index file
        sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
        sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # insert synonyms into index file
        if not args.ignore_synonyms:
            for synonym in entry.get_synonyms():
                sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # update offset
        current_offset += definition_size
        # if we reached CHUNK_SIZE, open the next c_* file
        if current_offset > CHUNK_SIZE:
            chunk_file_obj.close()
            chunk_index += 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = io.open(chunk_file_path, "wb")
            current_offset = 0
    chunk_file_obj.close()
    print_debug("Writing c_* files... done", args.debug)

    # compress
    print_debug("Compressing c_* files...", args.debug)
    file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
    for file_to_compress in files_to_compress:
        file_to_compress = os.path.basename(file_to_compress)
        file_zip_obj.write(file_to_compress)
    file_zip_obj.close()
    print_debug("Compressing c_* files... done", args.debug)

    # update index metadata
    print_debug("Updating index metadata...", args.debug)
    header = HEADER % (args.language_from)
    sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
    sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
    sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
    sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
    sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
    sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
    sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
    sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
    # the meaning of the following is unknown
    sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
    sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
    sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
    sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
    print_debug("Updating index metadata... done", args.debug)

    # compact and close
    sql_cursor.execute("vacuum")
    sql_cursor.close()
    sql_connection.close()

    # create .install file or copy .dict.idx and .dict into requested output directory
    parent_output_directory = os.path.split(output_file_path_absolute)[0]
    if args.bookeen_install_file:
        print_debug("Creating .install file...", args.debug)
        file_zip_path = os.path.join(parent_output_directory, base + u".install")
        file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
        for file_to_compress in [dict_file_path, idx_file_path]:
            file_to_compress = os.path.basename(file_to_compress)
            file_zip_obj.write(file_to_compress)
        file_zip_obj.close()
        result = [file_zip_path]
        print_debug("Creating .install file... done", args.debug)
    else:
        print_debug("Copying .dict.idx and .dict files...", args.debug)
        dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
        idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
        copy_file(dict_file_path, dict_file_path_final)
        copy_file(idx_file_path, idx_file_path_final)
        result = [idx_file_path_final, dict_file_path_final]
        print_debug("Copying .dict.idx and .dict files... done", args.debug)

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
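
The collation installed as "IcuNoCase" above follows the sqlite3.create_collation() contract: the callable receives two strings and returns a negative, zero, or positive integer. A minimal sketch of a plausible default; the project's actual collate_function_default may behave differently:

import sqlite3


def collate_function_default(string1, string2):
    # case-insensitive comparison; must return <0, 0, or >0
    lower1, lower2 = string1.lower(), string2.lower()
    if lower1 == lower2:
        return 0
    return -1 if lower1 < lower2 else 1


connection = sqlite3.connect(":memory:")
connection.create_collation("IcuNoCase", collate_function_default)
cursor = connection.cursor()
cursor.execute("create table T_DictIndex (F_Word text)")
cursor.executemany("insert into T_DictIndex values (?)",
                   [(u"Banana",), (u"apple",), (u"Cherry",)])
cursor.execute("select F_Word from T_DictIndex "
               "order by F_Word collate IcuNoCase")
print([row[0] for row in cursor.fetchall()])  # ['apple', 'Banana', 'Cherry']
connection.close()
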
Example #5
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # sort by headword
    dictionary.sort(by_headword=True)

    # group by prefix
    files_to_compress = []
    prefix_length = int(args.group_by_prefix_length)
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function=get_prefix_kobo,
        prefix_length=prefix_length,
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    if special_group is not None:
        special_group_key = u"1" * prefix_length
        group_dict[special_group_key] = special_group
        group_keys = [special_group_key] + group_keys

    # write files
    for key in group_keys:
        # write html file
        file_html_path = key + u".html"
        file_html_obj = io.open(file_html_path, "wb")
        file_html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode("utf-8"))
        for entry in group_dict[key]:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write((u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" % (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write((u"</html>").encode("utf-8"))
        file_html_obj.close()

        # compress in gz format
        file_html_obj = io.open(file_html_path, "rb")
        file_gz_path = file_html_path + u".gz"
        file_gz_obj = gzip.open(file_gz_path, "wb")
        file_gz_obj.writelines(file_html_obj)
        file_gz_obj.close()
        file_html_obj.close()

        # delete .html file
        delete_file(None, file_html_path)
        # rename .html.gz file into .html
        rename_file(file_gz_path, file_html_path)
        files_to_compress.append(file_html_path)

    # write words
    file_words_path = WORDS_FILE_NAME
    keys = sorted(dictionary.entries_index.keys())
    try:
        import marisa_trie
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        result = [file_words_path]
    except ImportError as exc:
        # call MARISA with subprocess
        print_info("  MARISA cannot be imported as Python module. You might want to install it with:")
        print_info("  $ [sudo] pip install marisa_trie")
        marisa_build_path = MARISA_BUILD
        if args.marisa_bin_path is None:
            print_info("  Running '%s' from $PATH" % MARISA_BUILD)
        else:
            marisa_build_path = os.path.join(args.marisa_bin_path, MARISA_BUILD)
            print_info("  Running '%s' from '%s'" % (MARISA_BUILD, args.marisa_bin_path))
        # TODO this is ugly, but it works
        query = (u"\n".join([x for x in keys]) + u"\n").encode("utf-8")

        try:
            proc = subprocess.Popen(
                [marisa_build_path, "-l", "-o", file_words_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate(input=query)[0].decode("utf-8")
            result = [file_words_path]
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" % (MARISA_BUILD, marisa_build_path))
            print_error("  Please make sure '%s':" % MARISA_BUILD)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --marisa-bin-path or")
            print_error("    3. install the marisa_trie Python module")
            result = None

    if result is not None:
        # add file_words_path to files to compress
        files_to_compress.append(file_words_path)
        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except Exception:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
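
When the marisa_trie module is importable, the words file is simply a saved MARISA trie over the headwords. A minimal round-trip sketch, assuming the marisa-trie package is installed:

import marisa_trie

# build a trie over some headwords, save it, and read it back
keys = [u"apple", u"apricot", u"banana"]
trie = marisa_trie.Trie(keys)
trie.save("words")

restored = marisa_trie.Trie()
restored.load("words")
print(u"apple" in restored)   # True
print(restored.keys(u"ap"))   # [u'apple', u'apricot'] (order may vary)
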
Example #6
    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        input_file_obj = zipfile.ZipFile(input_file_path)
        found_files = find_files(input_file_obj.namelist())
        extracted_files = {}
        if len(found_files) > 0:
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                ext_file_obj = open(ext_file_path, "wb")
                zip_entry = input_file_obj.open(entry)
                ext_file_obj.write(zip_entry.read())
                zip_entry.close()
                ext_file_obj.close()
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path
                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(ext_file_path, tmp_path, "d.idx")
                # extract from compressed file, but only if ".dict" is not present as well
                if ((key == "d.dict.dz") or (key == "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(ext_file_path, tmp_path, "d.dict")
        input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn

        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug("Dictionary has synonyms, but ignoring them (--ignore-synonym)", args.debug)
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

        # read dict file
        dict_file_obj = open(extracted_files["d.dict"], "rb")
        dict_file_bytes = dict_file_obj.read()
        dict_file_obj.close()

        # read idx file
        idx_file_obj = open(extracted_files["d.idx"], "rb")
        byte_read = idx_file_obj.read(1)
        headword = b""
        while byte_read:
            if byte_read == b"\0":
                # end of current word: read offset and size
                offset_bytes = idx_file_obj.read(4)
                offset_int = int((struct.unpack('>i', offset_bytes))[0])
                size_bytes = idx_file_obj.read(4)
                size_int = int((struct.unpack('>i', size_bytes))[0])
                definition = dict_file_bytes[offset_int:offset_int+size_int].decode(args.input_file_encoding)
                headword = headword.decode("utf-8")
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
                headword = b""
            else:
                # read next byte
                headword += byte_read
            byte_read = idx_file_obj.read(1)
        idx_file_obj.close()
        result = True

        # read syn file, if present
        if has_syn:
            print_debug("The input StarDict file contains a .syn file, parsing it...", args.debug)
            result = False
            syn_file_obj = open(extracted_files["d.syn"], "rb")
            byte_read = syn_file_obj.read(1)
            synonym = b""
            while byte_read:
                if byte_read == b"\0":
                    # end of current synonym: read index of original word
                    index_bytes = syn_file_obj.read(4)
                    index_int = int((struct.unpack('>i', index_bytes))[0])
                    synonym = synonym.decode("utf-8")
                    if index_int < len(dictionary):
                        dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                    else:
                        # emit a warning?
                        print_debug("Synonym '%s' points to index %d >= len(dictionary), skipping it" % (index_int, synonym), args.debug)
                    synonym = b""
                else:
                    # read next byte
                    synonym += byte_read
                byte_read = syn_file_obj.read(1)
            syn_file_obj.close()
            result = True
            print_debug("The input StarDict file contains a .syn file, parsing it... done", args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

        return result
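
Each .idx record decoded by the loop above is the UTF-8 headword, a NUL byte, and two 32-bit big-endian integers giving the offset and size of the definition inside the .dict file. A compact sketch of that record layout; the reader uses the signed format '>i', which agrees with the unsigned spec for files under 2 GB:

import struct

# one .idx record: headword bytes + b"\0" + offset (uint32 BE) + size (uint32 BE)
record = u"apple".encode("utf-8") + b"\0" + struct.pack(">II", 0, 12)

nul = record.index(b"\0")
headword = record[:nul].decode("utf-8")
offset, size = struct.unpack(">II", record[nul + 1:nul + 9])
print(headword, offset, size)  # apple 0 12
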
Example #7
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"

    # TODO by spec, the index should be sorted
    # TODO using the comparator stardict_strcmp() defined in the spec
    # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ),
    # TODO or with a user-defined collation function
    #
    # From https://developer.gnome.org/glib/2.28/glib-String-Utility-Functions.html#g-ascii-strcasecmp
    # gint g_ascii_strcasecmp (const gchar *s1, const gchar *s2);
    # Compare two strings, ignoring the case of ASCII characters.
    # Unlike the BSD strcasecmp() function, this only recognizes standard ASCII letters and ignores the locale, treating all non-ASCII bytes as if they are not letters.
    # This function should be used only on strings that are known to be in encodings where the bytes corresponding to ASCII letters always represent themselves. This includes UTF-8 and the ISO-8859-* charsets, but not for instance double-byte encodings like the Windows Codepage 932, where the trailing bytes of double-byte characters include all ASCII letters. If you compare two CP932 strings using this function, you will get false matches. 
    #
    # using Python's builtin lower() and sort() by headword
    # should be equivalent for UTF-8 encoded dictionaries (and it is fast)
    #
    dictionary.sort(by_headword=True, ignore_case=True)

    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    idx_file_obj = open(idx_file_path, "wb")
    dict_file_obj = open(dict_file_path, "wb")
    current_offset = 0
    current_idx_size = 0
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        headword_bytes = entry.headword.encode("utf-8")
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        # write .idx
        idx_file_obj.write(headword_bytes)
        idx_file_obj.write(b"\0")
        idx_file_obj.write(struct.pack('>i', current_offset))
        idx_file_obj.write(struct.pack('>i', definition_size))
        current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
        # write .dict
        dict_file_obj.write(definition_bytes)
        current_offset += definition_size
    idx_file_obj.close()
    dict_file_obj.close()
    print_debug("Writing .idx and .dict files... done", args.debug)

    # list files to compress
    files_to_compress = []
    files_to_compress.append(ifo_file_path)
    files_to_compress.append(idx_file_path)

    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them", args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...", args.debug)
            syn_file_obj = open(syn_file_path, "wb")
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            for pair in dict_syns:
                synonym_bytes = pair[0].encode("utf-8")
                index = pair[1]
                syn_file_obj.write(synonym_bytes)
                syn_file_obj.write(b"\0")
                syn_file_obj.write(struct.pack('>i', index))
            syn_file_obj.close()
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done", args.debug)

    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
        result = [dict_file_path] 
    else:
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            dictzip_path = DICTZIP
            if args.dictzip_path is None:
                print_info("  Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info("  Running '%s' from '%s'" % (DICTZIP, dictzip_path))
            proc = subprocess.Popen(
                [dictzip_path, "-k", dict_file_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate()
            result = [dict_dz_file_path] 
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done", args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" % (DICTZIP, dictzip_path))
            print_error("  Please make sure '%s':" % DICTZIP)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --dictzip-path or")
            print_error("    3. specify --no-dictzip to avoid compressing the .dict file")
            result = None 

    if result is not None:
        # create ifo file
        ifo_file_obj = open(ifo_file_path, "wb")
        ifo_file_obj.write((u"StarDict's dict ifo file\n").encode("utf-8"))
        ifo_file_obj.write((u"version=2.4.2\n").encode("utf-8"))
        ifo_file_obj.write((u"wordcount=%d\n" % (len(dictionary))).encode("utf-8"))
        ifo_file_obj.write((u"idxfilesize=%d\n" % (current_idx_size)).encode("utf-8"))
        ifo_file_obj.write((u"bookname=%s\n" % (args.title)).encode("utf-8"))
        ifo_file_obj.write((u"date=%s\n" % (args.year)).encode("utf-8"))
        ifo_file_obj.write((u"sametypesequence=m\n").encode("utf-8"))
        ifo_file_obj.write((u"description=%s\n" % (args.description)).encode("utf-8"))
        ifo_file_obj.write((u"author=%s\n" % (args.author)).encode("utf-8"))
        ifo_file_obj.write((u"email=%s\n" % (args.email)).encode("utf-8"))
        ifo_file_obj.write((u"website=%s\n" % (args.website)).encode("utf-8"))
        if dict_syns_len > 0:
            ifo_file_obj.write((u"synwordcount=%d\n" % (dict_syns_len)).encode("utf-8"))
        ifo_file_obj.close()

        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except Exception:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
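
One caveat in the dictzip call above: proc.communicate() waits for the process, but the exit status is never inspected, so a failed compression would only surface later when the missing .dict.dz is zipped. A hedged sketch of one way to verify the call (example.dict is a placeholder path):

import subprocess

# run dictzip keeping the input file (-k), and fail loudly on a
# non-zero exit status instead of assuming the .dz file exists
proc = subprocess.Popen(["dictzip", "-k", "example.dict"],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
    raise RuntimeError("dictzip failed: %s" %
                       stderr.decode("utf-8", "replace"))
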
Example #8
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # sort by headword
    dictionary.sort(by_headword=True)

    # group by prefix
    files_to_compress = []
    prefix_length = int(args.group_by_prefix_length)
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function=get_prefix_kobo,
        prefix_length=prefix_length,
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first)
    if special_group is not None:
        special_group_key = u"1" * prefix_length
        group_dict[special_group_key] = special_group
        group_keys = [special_group_key] + group_keys

    # write files
    for key in group_keys:
        # write html file
        file_html_path = key + u".html"
        file_html_obj = io.open(file_html_path, "wb")
        file_html_obj.write(
            u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode(
                "utf-8"))
        for entry in group_dict[key]:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write(
                (u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" %
                 (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write((u"</html>").encode("utf-8"))
        file_html_obj.close()

        # compress in gz format
        file_html_obj = io.open(file_html_path, "rb")
        file_gz_path = file_html_path + u".gz"
        file_gz_obj = gzip.open(file_gz_path, "wb")
        file_gz_obj.writelines(file_html_obj)
        file_gz_obj.close()
        file_html_obj.close()

        # delete .html file
        delete_file(None, file_html_path)
        # rename .html.gz file into .html
        rename_file(file_gz_path, file_html_path)
        files_to_compress.append(file_html_path)

    # write words
    file_words_path = WORDS_FILE_NAME
    keys = sorted(dictionary.entries_index.keys())
    try:
        import marisa_trie
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        result = [file_words_path]
    except ImportError as exc:
        # call MARISA with subprocess
        print_info(
            "  MARISA cannot be imported as Python module. You might want to install it with:"
        )
        print_info("  $ [sudo] pip install marisa_trie")
        marisa_build_path = MARISA_BUILD
        if args.marisa_bin_path is None:
            print_info("  Running '%s' from $PATH" % MARISA_BUILD)
        else:
            marisa_build_path = os.path.join(args.marisa_bin_path,
                                             MARISA_BUILD)
            print_info("  Running '%s' from '%s'" %
                       (MARISA_BUILD, args.marisa_bin_path))
        # TODO this is ugly, but it works
        query = (u"\n".join([x for x in keys]) + u"\n").encode("utf-8")

        try:
            proc = subprocess.Popen(
                [marisa_build_path, "-l", "-o", file_words_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE)
            proc.communicate(input=query)[0].decode("utf-8")
            result = [file_words_path]
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" %
                        (MARISA_BUILD, marisa_build_path))
            print_error("  Please make sure '%s':" % MARISA_BUILD)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --marisa-bin-path or")
            print_error("    3. install the marisa_trie Python module")
            result = None

    if result is not None:
        # add file_words_path to files to compress
        files_to_compress.append(file_words_path)
        # create output zip file
        try:
            print_debug(
                "Writing to file '%s'..." % (output_file_path_absolute),
                args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w",
                                           zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug(
                "Writing to file '%s'... success" %
                (output_file_path_absolute), args.debug)
        except Exception:
            print_error("Writing to file '%s'... failure" %
                        (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
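
The HTML fragments above interpolate headwords and definitions verbatim, so a headword containing &, < or a double quote would produce malformed markup. A small sketch, not part of the original code, showing how the standard library could escape both the attribute and the element text:

from xml.sax.saxutils import escape, quoteattr

headword = u"R&D \"alpha\" <beta>"
definition = u"research & development"
# quoteattr() adds the surrounding quotes itself; escape() handles text nodes
fragment = u"<w><a name=%s/><div><b>%s</b><br/>%s</div></w>" % (
    quoteattr(headword), escape(headword), escape(definition))
print(fragment)
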
Example #9
    def read_single_dict(dictionary, args, single_dict):
        # create tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            zip_file_path = single_dict[0]
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
            for entry in zip_file_obj.namelist():
                if entry.endswith(".dict.idx"):
                    zip_entry = zip_file_obj.open(entry)
                    idx_file_obj = open(idx_file_path, "wb")
                    idx_file_obj.write(zip_entry.read())
                    idx_file_obj.close()
                    zip_entry.close()
                elif entry.endswith(".dict"):
                    zip_entry = zip_file_obj.open(entry)
                    dict_file_obj = open(dict_file_path, "wb")
                    dict_file_obj.write(zip_entry.read())
                    dict_file_obj.close()
                    zip_entry.close()
            zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        for entry in zip_file_obj.namelist():
            if not entry.endswith("/"):
                zip_entry = zip_file_obj.open(entry)
                entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                entry_file_obj = open(entry_file_path, "wb")
                entry_file_obj.write(zip_entry.read())
                entry_file_obj.close()
                zip_entry.close()
        zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        sql_cursor = sql_connection.cursor()
        sql_cursor.execute("select * from T_DictIndex")
        index_data = sql_cursor.fetchall()
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index not in chunk_index_to_entries:
                chunk_index_to_entries[chunk_index] = []
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries[chunk_index].append([headword, offset, size])
        sql_cursor.close()
        sql_connection.close()
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            print_debug("  Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            chunk_file_obj = open(chunk_file_path, "rb")
            for entry in chunk_index_to_entries[chunk_index]:
                headword = entry[0]
                offset = entry[1]
                size = entry[2]
                chunk_file_obj.seek(offset)
                definition_bytes = chunk_file_obj.read(size)
                definition_unicode = definition_bytes.decode(args.input_file_encoding)
                dictionary.add_entry(headword=headword, definition=definition_unicode)
            chunk_file_obj.close()
            print_debug("  Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
        return True 
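
The membership test and list creation inside the index loop above can be expressed with collections.defaultdict. A minimal equivalent sketch over two toy index rows:

from collections import defaultdict

index_data = [(0, u"apple", 0, 12, 1), (0, u"banana", 12, 7, 2)]
chunk_index_to_entries = defaultdict(list)
max_chunk_index = 1
for _row_id, headword, offset, size, chunk_index in index_data:
    chunk_index_to_entries[chunk_index].append([headword, offset, size])
    max_chunk_index = max(max_chunk_index, chunk_index)
print(dict(chunk_index_to_entries))  # {1: [...], 2: [...]}
print(max_chunk_index)               # 2
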
Example #10
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # get absolute path for collation function file 
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]

    # copy empty.idx into tmp_path
    idx_file_path = base + u".dict.idx"
    dict_file_path = base + u".dict"
    copy_file(EMPTY_FILE_PATH, idx_file_path)

    # open index
    sql_connection = sqlite3.connect(idx_file_path)

    # install collation in the index
    collation_function = collate_function_default
    if bookeen_collation_function_path is not None:
        try:
            collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
            print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
        except Exception:
            print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
    sql_connection.create_collation("IcuNoCase", collation_function)
    sql_connection.text_factory = str

    # get a cursor and delete any data from the index file
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("delete from T_DictIndex")

    # write c_* files
    # each c_* file has CHUNK_SIZE < size <= (CHUNK_SIZE * 2) bytes (tentatively)
    print_debug("Writing c_* files...", args.debug)
    files_to_compress = []
    current_offset = 0
    chunk_index = 1
    chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
    files_to_compress.append(chunk_file_path)
    chunk_file_obj = open(chunk_file_path, "wb")
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        chunk_file_obj.write(definition_bytes)
        # insert headword into index file
        sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
        sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # insert synonyms into index file
        if not args.ignore_synonyms:
            for synonym in entry.get_synonyms():
                sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # update offset
        current_offset += definition_size
        # if we reached CHUNK_SIZE, open the next c_* file
        if current_offset > CHUNK_SIZE:
            chunk_file_obj.close()
            chunk_index += 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = open(chunk_file_path, "wb")
            current_offset = 0
    chunk_file_obj.close()
    print_debug("Writing c_* files... done", args.debug)

    # compress
    print_debug("Compressing c_* files...", args.debug)
    file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
    for file_to_compress in files_to_compress:
        file_to_compress = os.path.basename(file_to_compress)
        file_zip_obj.write(file_to_compress)
    file_zip_obj.close()
    print_debug("Compressing c_* files... done", args.debug)

    # update index metadata
    print_debug("Updating index metadata...", args.debug)
    header = HEADER % (args.language_from)
    sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
    sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
    sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
    sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
    sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
    sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
    sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
    sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
    # the meaning of the following is unknown 
    sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
    sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
    sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
    sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
    print_debug("Updating index metadata... done", args.debug)

    # compact and close
    sql_cursor.execute("vacuum")
    sql_cursor.close()
    sql_connection.close()

    # create .install file or copy .dict.idx and .dict into requested output directory
    parent_output_directory = os.path.split(output_file_path_absolute)[0]
    if args.bookeen_install_file:
        print_debug("Creating .install file...", args.debug)
        file_zip_path = os.path.join(parent_output_directory, base + u".install")
        file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
        for file_to_compress in [dict_file_path, idx_file_path]:
            file_to_compress = os.path.basename(file_to_compress)
            file_zip_obj.write(file_to_compress)
        file_zip_obj.close()
        result = [file_zip_path]
        print_debug("Creating .install file... done", args.debug)
    else:
        print_debug("Copying .dict.idx and .dict files...", args.debug)
        dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
        idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
        copy_file(dict_file_path, dict_file_path_final)
        copy_file(idx_file_path, idx_file_path_final)
        result = [idx_file_path_final, dict_file_path_final]
        print_debug("Copying .dict.idx and .dict files... done", args.debug)

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
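
The chunk-rotation policy above (append to c_1 until the running offset passes CHUNK_SIZE, then close it and start c_2, and so on) is self-contained enough to factor out. A hedged sketch of the same policy as a small helper class; the class and its names are illustrative, not part of the project:

import io


class ChunkWriter(object):
    # write byte blobs into files prefix1, prefix2, ..., starting a
    # new file once the current one has grown past max_size bytes
    def __init__(self, prefix, max_size):
        self.prefix = prefix
        self.max_size = max_size
        self.chunk_index = 1
        self.offset = 0
        self.paths = []
        self.file_obj = self._open_next()

    def _open_next(self):
        path = "%s%d" % (self.prefix, self.chunk_index)
        self.paths.append(path)
        return io.open(path, "wb")

    def write(self, blob):
        # return the (chunk_index, offset) where the blob was stored,
        # i.e. exactly what a T_DictIndex row needs to record
        location = (self.chunk_index, self.offset)
        self.file_obj.write(blob)
        self.offset += len(blob)
        if self.offset > self.max_size:
            self.file_obj.close()
            self.chunk_index += 1
            self.offset = 0
            self.file_obj = self._open_next()
        return location

    def close(self):
        self.file_obj.close()
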
Example #11
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"

    # TODO by spec, the index should be sorted
    # TODO using the comparator stardict_strcmp() defined in the spec
    # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ),
    # TODO or with a user-defined collation function
    #
    # From https://developer.gnome.org/glib/2.28/glib-String-Utility-Functions.html#g-ascii-strcasecmp
    # gint g_ascii_strcasecmp (const gchar *s1, const gchar *s2);
    # Compare two strings, ignoring the case of ASCII characters.
    # Unlike the BSD strcasecmp() function, this only recognizes standard ASCII letters and ignores the locale, treating all non-ASCII bytes as if they are not letters.
    # This function should be used only on strings that are known to be in encodings where the bytes corresponding to ASCII letters always represent themselves. This includes UTF-8 and the ISO-8859-* charsets, but not for instance double-byte encodings like the Windows Codepage 932, where the trailing bytes of double-byte characters include all ASCII letters. If you compare two CP932 strings using this function, you will get false matches.
    #
    # using Python's builtin lower() and sort() by headword
    # should be equivalent for UTF-8 encoded dictionaries (and it is fast)
    #
    dictionary.sort(by_headword=True, ignore_case=True)

    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    idx_file_obj = io.open(idx_file_path, "wb")
    dict_file_obj = io.open(dict_file_path, "wb")
    current_offset = 0
    current_idx_size = 0
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        headword_bytes = entry.headword.encode("utf-8")
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        # write .idx
        idx_file_obj.write(headword_bytes)
        idx_file_obj.write(b"\0")
        idx_file_obj.write(struct.pack('>i', current_offset))
        idx_file_obj.write(struct.pack('>i', definition_size))
        current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
        # write .dict
        dict_file_obj.write(definition_bytes)
        current_offset += definition_size
    idx_file_obj.close()
    dict_file_obj.close()
    print_debug("Writing .idx and .dict files... done", args.debug)

    # list files to compress
    files_to_compress = []
    files_to_compress.append(ifo_file_path)
    files_to_compress.append(idx_file_path)

    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them",
                        args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...",
                        args.debug)
            syn_file_obj = io.open(syn_file_path, "wb")
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            for pair in dict_syns:
                synonym_bytes = pair[0].encode("utf-8")
                index = pair[1]
                syn_file_obj.write(synonym_bytes)
                syn_file_obj.write(b"\0")
                syn_file_obj.write(struct.pack('>i', index))
            syn_file_obj.close()
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done",
                        args.debug)

    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
        result = [dict_file_path]
    else:
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            dictzip_path = DICTZIP
            if args.dictzip_path is None:
                print_info("  Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info("  Running '%s' from '%s'" %
                           (DICTZIP, dictzip_path))
            proc = subprocess.Popen([dictzip_path, "-k", dict_file_path],
                                    stdout=subprocess.PIPE,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            proc.communicate()
            result = [dict_dz_file_path]
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done",
                        args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" %
                        (DICTZIP, dictzip_path))
            print_error("  Please make sure '%s':" % DICTZIP)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --dictzip-path or")
            print_error(
                "    3. specify --no-dictzip to avoid compressing the .dict file"
            )
            result = None

    if result is not None:
        # create ifo file
        ifo_file_obj = io.open(ifo_file_path, "wb")
        ifo_file_obj.write((u"StarDict's dict ifo file\n").encode("utf-8"))
        ifo_file_obj.write((u"version=2.4.2\n").encode("utf-8"))
        ifo_file_obj.write(
            (u"wordcount=%d\n" % (len(dictionary))).encode("utf-8"))
        ifo_file_obj.write(
            (u"idxfilesize=%d\n" % (current_idx_size)).encode("utf-8"))
        ifo_file_obj.write((u"bookname=%s\n" % (args.title)).encode("utf-8"))
        ifo_file_obj.write((u"date=%s\n" % (args.year)).encode("utf-8"))
        ifo_file_obj.write((u"sametypesequence=m\n").encode("utf-8"))
        ifo_file_obj.write(
            (u"description=%s\n" % (args.description)).encode("utf-8"))
        ifo_file_obj.write((u"author=%s\n" % (args.author)).encode("utf-8"))
        ifo_file_obj.write((u"email=%s\n" % (args.email)).encode("utf-8"))
        ifo_file_obj.write((u"website=%s\n" % (args.website)).encode("utf-8"))
        if dict_syns_len > 0:
            ifo_file_obj.write(
                (u"synwordcount=%d\n" % (dict_syns_len)).encode("utf-8"))
        ifo_file_obj.close()

        # create output zip file
        try:
            print_debug(
                "Writing to file '%s'..." % (output_file_path_absolute),
                args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w",
                                           zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug(
                "Writing to file '%s'... success" %
                (output_file_path_absolute), args.debug)
        except Exception:
            print_error("Writing to file '%s'... failure" %
                        (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
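
The .ifo written above is a plain UTF-8 key=value file headed by the magic line "StarDict's dict ifo file". A minimal sketch that assembles the same shape from an ordered list of pairs; all field values below are placeholders:

import io

fields = [
    (u"version", u"2.4.2"),
    (u"wordcount", u"2"),
    (u"idxfilesize", u"42"),
    (u"bookname", u"Example Dictionary"),
    (u"sametypesequence", u"m"),
]
lines = [u"StarDict's dict ifo file"]
lines.extend(u"%s=%s" % (key, value) for key, value in fields)
with io.open("example.ifo", "w", encoding="utf-8") as ifo_file_obj:
    ifo_file_obj.write(u"\n".join(lines) + u"\n")
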
Example #12
    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        input_file_obj = zipfile.ZipFile(input_file_path)
        found_files = find_files(input_file_obj.namelist())
        extracted_files = {}
        if len(found_files) > 0:
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                ext_file_obj = io.open(ext_file_path, "wb")
                zip_entry = input_file_obj.open(entry)
                ext_file_obj.write(zip_entry.read())
                zip_entry.close()
                ext_file_obj.close()
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path
                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(
                        ext_file_path, tmp_path, "d.idx")
                # extract from compressed file, but only if ".dict" is not present as well
                if ((key == "d.dict.dz") or
                    (key == "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(
                        ext_file_path, tmp_path, "d.dict")
        input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn

        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug(
                "Dictionary has synonyms, but ignoring them (--ignore-synonym)",
                args.debug)
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)),
                    args.debug)

        # read dict file
        dict_file_obj = io.open(extracted_files["d.dict"], "rb")
        dict_file_bytes = dict_file_obj.read()
        dict_file_obj.close()

        # read idx file
        idx_file_obj = io.open(extracted_files["d.idx"], "rb")
        byte_read = idx_file_obj.read(1)
        headword = b""
        while byte_read:
            if byte_read == b"\0":
                # end of current word: read offset and size
                offset_bytes = idx_file_obj.read(4)
                offset_int = int((struct.unpack('>i', offset_bytes))[0])
                size_bytes = idx_file_obj.read(4)
                size_int = int((struct.unpack('>i', size_bytes))[0])
                definition = dict_file_bytes[offset_int:(
                    offset_int + size_int)].decode(args.input_file_encoding)
                headword = headword.decode("utf-8")
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
                headword = b""
            else:
                # read next byte
                headword += byte_read
            byte_read = idx_file_obj.read(1)
        idx_file_obj.close()
        result = True

        # read syn file, if present
        if has_syn:
            print_debug(
                "The input StarDict file contains a .syn file, parsing it...",
                args.debug)
            result = False
            syn_file_obj = io.open(extracted_files["d.syn"], "rb")
            byte_read = syn_file_obj.read(1)
            synonym = b""
            while byte_read:
                if byte_read == b"\0":
                    # end of current synonym: read index of original word
                    index_bytes = syn_file_obj.read(4)
                    index_int = int((struct.unpack('>i', index_bytes))[0])
                    synonym = synonym.decode("utf-8")
                    if index_int < len(dictionary):
                        dictionary.add_synonym(synonym=synonym,
                                               headword_index=index_int)
                    else:
                        # emit a warning?
                        print_debug(
                            "Synonym '%s' points to index %d >= len(dictionary), skipping it"
                            % (synonym, index_int), args.debug)
                    synonym = b""
                else:
                    # read next byte
                    synonym += byte_read
                byte_read = syn_file_obj.read(1)
            syn_file_obj.close()
            result = True
            print_debug(
                "The input StarDict file contains a .syn file, parsing it... done",
                args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file",
                        args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

        return result
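
The byte-at-a-time .syn loop above can also be written over the whole buffer: each record is the synonym bytes, a NUL, and a 32-bit big-endian index of the original headword. A minimal equivalent sketch on a fabricated two-record buffer:

import struct

syn_bytes = (b"pome\0" + struct.pack(">I", 0) +
             b"musa\0" + struct.pack(">I", 1))

position = 0
while position < len(syn_bytes):
    nul = syn_bytes.index(b"\0", position)
    synonym = syn_bytes[position:nul].decode("utf-8")
    (headword_index,) = struct.unpack(">I", syn_bytes[nul + 1:nul + 5])
    print(synonym, headword_index)  # pome 0 / musa 1
    position = nul + 5
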