Пример #1
0
def create_index(root):
	"""Scan ``root`` for EPUB files and build the document index.

	Two files are (re)created under ``os.path.join(root, config.IndexPath)``:

	* ``docid``     -- the encoded full paths of all indexed books, written
	  back to back with no separator;
	* ``docid.idx`` -- one ``struct.pack('i', offset)`` record per book,
	  holding the position in ``docid`` where that book's path starts.

	While scanning, an in-memory term dictionary (lexeme -> lexeme id) and
	posting lists (lexeme id -> ascending list of document ids) are built
	from each book's authors and titles.

	NOTE(review): the dictionary and posting lists are neither written to
	disk nor returned by the code visible here -- confirm whether a
	persistence step is missing.

	:param root: directory that is walked recursively for ``*.epub`` files.
	"""
	index_root = os.path.join(root, config.IndexPath)
	if not os.path.exists(index_root):
		os.makedirs(index_root, config.DirectoryDefaultMode)

	dictionary = {}  # lexeme -> lexeme id (position in ``index``)
	index = []       # lexeme id -> list of document ids containing it
	lexid = 0
	docid = 0
	root = os.path.abspath(root)

	# ``with`` guarantees both files are flushed and closed, even when an
	# unexpected error interrupts the scan.
	with open(os.path.join(index_root, 'docid'), 'wb') as docid_file, \
			open(os.path.join(index_root, 'docid.idx'), 'wb') as docid_idx:
		for parent, _dirs, files in os.walk(root):
			for name in files:
				if not name.lower().endswith('.epub'):
					continue

				full_name = os.path.join(parent, name)
				try:
					info = epub.Info(full_name)
				except Exception as ex:
					# Best effort: report the unreadable book and keep going.
					print('error while parsing file:', full_name, ex)
					continue

				# Record where this document's path starts, then the path itself.
				docid_idx.write(struct.pack('i', docid_file.tell()))
				docid_file.write(full_name.encode(config.FileCodePage))

				lex = lexems.get(' '.join(info.authors()))
				lex += lexems.get(' '.join(info.titles()))
				for w in lex:
					if w not in dictionary:
						dictionary[w] = lexid
						lexid += 1
						index.append([])
					postings = index[dictionary[w]]
					# Document ids only grow, so checking the last entry is
					# enough to avoid listing the same document twice.
					if not postings or postings[-1] != docid:
						postings.append(docid)
				docid += 1
Пример #2
0
def create_index(root):
    """Scan ``root`` for EPUB files and build the document index.

    Two files are (re)created under ``os.path.join(root, config.IndexPath)``:

    * ``docid``     -- the encoded full paths of all indexed books, written
      back to back with no separator;
    * ``docid.idx`` -- one ``struct.pack('i', offset)`` record per book,
      holding the position in ``docid`` where that book's path starts.

    While scanning, an in-memory term dictionary (lexeme -> lexeme id) and
    posting lists (lexeme id -> ascending list of document ids) are built
    from each book's authors and titles.

    NOTE(review): the dictionary and posting lists are neither written to
    disk nor returned by the code visible here -- confirm whether a
    persistence step is missing.

    :param root: directory that is walked recursively for ``*.epub`` files.
    """
    index_root = os.path.join(root, config.IndexPath)
    if not os.path.exists(index_root):
        os.makedirs(index_root, config.DirectoryDefaultMode)

    dictionary = {}  # lexeme -> lexeme id (position in ``index``)
    index = []       # lexeme id -> list of document ids containing it
    lexid = 0
    docid = 0
    root = os.path.abspath(root)

    # ``with`` guarantees both files are flushed and closed, even when an
    # unexpected error interrupts the scan.
    with open(os.path.join(index_root, 'docid'), 'wb') as docid_file, \
            open(os.path.join(index_root, 'docid.idx'), 'wb') as docid_idx:
        for parent, _dirs, files in os.walk(root):
            for name in files:
                if not name.lower().endswith('.epub'):
                    continue

                full_name = os.path.join(parent, name)
                try:
                    info = epub.Info(full_name)
                except Exception as ex:
                    # Best effort: report the unreadable book and keep going.
                    print('error while parsing file:', full_name, ex)
                    continue

                # Record where this document's path starts, then the path.
                docid_idx.write(struct.pack('i', docid_file.tell()))
                docid_file.write(full_name.encode(config.FileCodePage))

                lex = lexems.get(' '.join(info.authors()))
                lex += lexems.get(' '.join(info.titles()))
                for w in lex:
                    if w not in dictionary:
                        dictionary[w] = lexid
                        lexid += 1
                        index.append([])
                    postings = index[dictionary[w]]
                    # Document ids only grow, so checking the last entry is
                    # enough to avoid listing the same document twice.
                    if not postings or postings[-1] != docid:
                        postings.append(docid)
                docid += 1
Пример #3
0
def search(keywords, idx):
    """Return the ids of documents that match every keyword.

    ``keywords`` is split into lexemes with ``lexems.get``; each lexeme is
    lower-cased and looked up through ``idx.docids``. The resulting
    id lists are then combined pairwise with ``intersect``.

    :param keywords: raw query text.
    :param idx: index object exposing ``docids(lexeme)``.
    :return: the intersection of all per-lexeme id lists, or an empty list
        when the query yields no lexemes.
    """
    docs = [idx.docids(keyword.lower()) for keyword in lexems.get(keywords)]
    if not docs:
        return docs

    # Shortest list first, so every intersection starts from the smallest
    # candidate set. ``key=len`` gives the same stable ordering as the old
    # cmp-style comparison and works on both Python 2 and Python 3.
    docs.sort(key=len)

    docids = docs[0]
    for other in docs[1:]:
        docids = intersect(docids, other)

    return docids
Пример #4
0
def search(keywords, idx):
	"""Return the ids of documents that match every keyword.

	``keywords`` is split into lexemes with ``lexems.get``; each lexeme is
	lower-cased and looked up through ``idx.docids``. The resulting
	id lists are then combined pairwise with ``intersect``.

	:param keywords: raw query text.
	:param idx: index object exposing ``docids(lexeme)``.
	:return: the intersection of all per-lexeme id lists, or an empty list
		when the query yields no lexemes.
	"""
	docs = [idx.docids(keyword.lower()) for keyword in lexems.get(keywords)]
	if not docs:
		return docs

	# Shortest list first, so every intersection starts from the smallest
	# candidate set. ``key=len`` gives the same stable ordering as the old
	# cmp-style comparison and works on both Python 2 and Python 3.
	docs.sort(key=len)

	docids = docs[0]
	for other in docs[1:]:
		docids = intersect(docids, other)

	return docids