Exemplo n.º 1
0
def buildIndex(databaseName, linksSourcePath, currSettings, lang):
	"""Build the index of a named database from a file of links.

	Settings are loaded from ``DATA_FOLDER + SETTINGS_FILE`` and every
	entry of ``currSettings`` is applied on top of them via
	``Settings.set``. The content of ``linksSourcePath`` is split into
	lines, and those lines are passed to ``IndexManager.build`` together
	with the database path, the stop words for ``lang`` and ``lang``.

	Args:
		databaseName: Name appended to ``DATABASES_FOLDER`` (followed by
			``'/'``) to form the database path.
		linksSourcePath: Path handed to ``readfile``; the lines of its
			content are used as the links.
		currSettings: Mapping whose items override the loaded settings.
		lang: Language passed to ``getStopWords`` and to the build call.
	"""
	config = Settings(DATA_FOLDER + SETTINGS_FILE)
	for option, optionValue in currSettings.items():
		config.set(option, optionValue)

	databasePath = DATABASES_FOLDER + databaseName + '/'
	linkLines = readfile(linksSourcePath).splitlines()

	builder = IndexManager(config)
	# The flag is explicitly switched off before the build starts.
	builder.shutUp = False
	builder.build(linkLines, databasePath, getStopWords(lang), lang)
Exemplo n.º 2
0
	def __init__(self):
		"""Set the default option values and create the index manager."""
		# Plain default values first; none of them depends on the manager.
		self.keylen = 1
		self.dynamicKeywords = True
		self.minKeywords, self.maxKeywords = 1, 2

		# The manager is built from a default-constructed Settings object.
		defaultSettings = Settings()
		self.manager = IndexManager(defaultSettings)
Exemplo n.º 3
0
class TempSearch:
	"""Index a batch of documents and expose the result as an ``IndexProxy``.

	``build`` is the entry point. ``getDocsInfo`` is called by ``build`` and
	relies on ``self.indexProxy`` having been set by it.
	"""

	# Keys copied from every document record into the returned metadata.
	_METADATA_KEYS = ('title', 'url', 'keywords', 'words', 'id', 'description')

	def __init__(self):
		"""Set the default option values and create the index manager."""
		self.keylen = 1
		self.manager = IndexManager(Settings())
		self.dynamicKeywords = True
		self.maxKeywords = 2
		self.minKeywords = 1
		# Read by getDocsInfo only when dynamicKeywords is False. It used to
		# be left unset, so that branch raised AttributeError.
		self.keywordsCount = self.maxKeywords

	def build(self, documents, stopwords):
		"""Index ``documents`` and return the populated ``IndexProxy``.

		Args:
			documents: Mapping with a required ``'data'`` entry (the
				documents to index) and an optional ``'options'`` mapping
				whose ``'lang'`` entry defaults to ``'cs'``.
			stopwords: Stop words forwarded to ``toIndex``.

		Returns:
			The ``IndexProxy`` stored in ``self.indexProxy``, carrying both
			the index info and the info database built here.

		Raises:
			Exception: If ``documents['data']`` is empty.
		"""
		options = documents.get('options', {})
		lang = options.get('lang', 'cs')
		documentsData = documents['data']

		if not documentsData:
			raise Exception('No documents downloaded')

		indexInfo = toIndex(documentsData, stopwords, self.keylen, lang)
		# getDocsInfo reads self.indexProxy, so it has to exist first.
		self.indexProxy = IndexProxy()
		self.indexProxy.setIndexInfo(indexInfo)
		metadata, scoresTable = self.getDocsInfo(indexInfo, lang)

		# self.totalKeywords is set as a side effect of getDocsInfo.
		stemsDict = self.manager._getStemDict(self.totalKeywords)
		contents = [doc['content'] for doc in indexInfo['documents']]
		keywordsInDocuments = self.manager._getKeywordsInfo(self.totalKeywords, contents)

		# Return value is not used; presumably this fills the descriptions
		# in ``metadata`` in place -- confirm against IndexManager.
		self.manager.findDescription(metadata, indexInfo['documents'], lang)

		infoDtb = {
			SCORES_TABLE: scoresTable,
			STEMSDICT_NAME: stemsDict,
			KEYWORDSINDOCUMENTS_NAME: keywordsInDocuments,
			'allwords': indexInfo['allRealWords'],
			DOCUMENT_INFO_NAME: metadata,
		}
		self.indexProxy.setInfoDtb(infoDtb)
		return self.indexProxy

	def getDocsInfo(self, indexInfo, lang):
		"""Choose the keywords of every document and collect its metadata.

		Each record in ``indexInfo['documents']`` gets a ``'keywords'``
		entry, and the set of all chosen keyword entries is stored in
		``self.totalKeywords``.

		Args:
			indexInfo: Index info whose ``'documents'`` entry holds the
				document records.
			lang: Language forwarded to ``getKeywords``.

		Returns:
			A ``(newDocInfo, scoresTable)`` tuple: one dict per document
			restricted to ``_METADATA_KEYS``, and the table returned by
			``IndexManager._getKeywordsScoreTable``.
		"""
		documentsInfo = indexInfo['documents']
		keywordsScore = getKeywords(documentsInfo, self.indexProxy, nothing, lang)
		realLimit = self.manager.countKeywordsLimit(keywordsScore)

		totalKeywords = set()
		for docInfo, allDocKeywords in zip(documentsInfo, keywordsScore):
			# Entries are indexed as entry[0] / entry[1]; presumably
			# (name, score) pairs -- confirm against getKeywords.
			if self.dynamicKeywords:
				topKeywords = [kw for kw in allDocKeywords if kw[1] > realLimit][:self.maxKeywords]
				# Too few entries above the limit: take the leading ones.
				if len(topKeywords) < self.minKeywords:
					topKeywords = allDocKeywords[:self.minKeywords]
			else:
				topKeywords = allDocKeywords[:self.keywordsCount]

			docInfo['keywords'] = topKeywords
			# In-place update avoids copying the whole set per document.
			totalKeywords.update(topKeywords)

		self.totalKeywords = totalKeywords
		totalKeywordsName = [kw[0] for kw in totalKeywords]

		scoresTable = self.manager._getKeywordsScoreTable(keywordsScore, totalKeywordsName)

		newDocInfo = [
			{key: docInfo[key] for key in self._METADATA_KEYS}
			for docInfo in documentsInfo
		]
		return newDocInfo, scoresTable
Exemplo n.º 4
0
 def __init__(self):
     """Create the index manager and set the default option values."""
     self.keylen = 1

     # The manager is built from a default-constructed Settings object.
     defaultSettings = Settings()
     self.manager = IndexManager(defaultSettings)

     self.dynamicKeywords = True
     self.maxKeywords, self.minKeywords = 2, 1
Exemplo n.º 5
0
class TempSearch:
    """Index a batch of documents and expose the result as an ``IndexProxy``.

    ``build`` is the entry point. ``getDocsInfo`` is called by ``build`` and
    relies on ``self.indexProxy`` having been set by it.
    """

    # Keys copied from every document record into the returned metadata.
    _METADATA_KEYS = ('title', 'url', 'keywords', 'words', 'id',
                      'description')

    def __init__(self):
        """Set the default option values and create the index manager."""
        self.keylen = 1
        self.manager = IndexManager(Settings())
        self.dynamicKeywords = True
        self.maxKeywords = 2
        self.minKeywords = 1
        # Read only when dynamicKeywords is False. It used to be left
        # unset, so that branch raised AttributeError.
        self.keywordsCount = self.maxKeywords

    def build(self, documents, stopwords):
        """Index ``documents`` and return the populated ``IndexProxy``.

        Args:
            documents: Mapping with a required ``'data'`` entry (the
                documents to index) and an optional ``'options'`` mapping
                whose ``'lang'`` entry defaults to ``'cs'``.
            stopwords: Stop words forwarded to ``toIndex``.

        Returns:
            The ``IndexProxy`` stored in ``self.indexProxy``, carrying both
            the index info and the info database built here.

        Raises:
            Exception: If ``documents['data']`` is empty.
        """
        options = documents.get('options', {})
        lang = options.get('lang', 'cs')
        documentsData = documents['data']

        if not documentsData:
            raise Exception('No documents downloaded')

        indexInfo = toIndex(documentsData, stopwords, self.keylen, lang)
        # getDocsInfo reads self.indexProxy, so it has to exist first.
        self.indexProxy = IndexProxy()
        self.indexProxy.setIndexInfo(indexInfo)
        metadata, scoresTable = self.getDocsInfo(indexInfo, lang)

        # self.totalKeywords is set as a side effect of getDocsInfo.
        stemsDict = self.manager._getStemDict(self.totalKeywords)
        contents = [doc['content'] for doc in indexInfo['documents']]
        keywordsInDocuments = self.manager._getKeywordsInfo(
            self.totalKeywords, contents)

        # Return value is not used; presumably this fills the descriptions
        # in ``metadata`` in place -- confirm against IndexManager.
        self.manager.findDescription(metadata, indexInfo['documents'], lang)

        infoDtb = {
            SCORES_TABLE: scoresTable,
            STEMSDICT_NAME: stemsDict,
            KEYWORDSINDOCUMENTS_NAME: keywordsInDocuments,
            'allwords': indexInfo['allRealWords'],
            DOCUMENT_INFO_NAME: metadata,
        }
        self.indexProxy.setInfoDtb(infoDtb)
        return self.indexProxy

    def _pickTopKeywords(self, allDocKeywords, realLimit):
        """Return the keyword entries to keep for a single document."""
        if not self.dynamicKeywords:
            return allDocKeywords[:self.keywordsCount]

        # Entries are indexed as entry[0] / entry[1]; presumably
        # (name, score) pairs -- confirm against getKeywords.
        topKeywords = [kw for kw in allDocKeywords
                       if kw[1] > realLimit][:self.maxKeywords]
        # Too few entries above the limit: take the leading ones instead.
        if len(topKeywords) < self.minKeywords:
            topKeywords = allDocKeywords[:self.minKeywords]
        return topKeywords

    def getDocsInfo(self, indexInfo, lang):
        """Choose the keywords of every document and collect its metadata.

        Each record in ``indexInfo['documents']`` gets a ``'keywords'``
        entry, and the set of all chosen keyword entries is stored in
        ``self.totalKeywords``.

        Args:
            indexInfo: Index info whose ``'documents'`` entry holds the
                document records.
            lang: Language forwarded to ``getKeywords``.

        Returns:
            A ``(newDocInfo, scoresTable)`` tuple: one dict per document
            restricted to ``_METADATA_KEYS``, and the table returned by
            ``IndexManager._getKeywordsScoreTable``.
        """
        documentsInfo = indexInfo['documents']
        keywordsScore = getKeywords(documentsInfo, self.indexProxy, nothing,
                                    lang)
        realLimit = self.manager.countKeywordsLimit(keywordsScore)

        totalKeywords = set()
        for docInfo, allDocKeywords in zip(documentsInfo, keywordsScore):
            topKeywords = self._pickTopKeywords(allDocKeywords, realLimit)
            docInfo['keywords'] = topKeywords
            # In-place update avoids copying the whole set per document.
            totalKeywords.update(topKeywords)

        self.totalKeywords = totalKeywords
        totalKeywordsName = [kw[0] for kw in totalKeywords]

        scoresTable = self.manager._getKeywordsScoreTable(
            keywordsScore, totalKeywordsName)

        newDocInfo = [
            {key: docInfo[key] for key in self._METADATA_KEYS}
            for docInfo in documentsInfo
        ]
        return newDocInfo, scoresTable
Exemplo n.º 6
0
	def setUpClass(cls):
		"""Build the index in ``cls.databaseFolder`` and open it as ``cls.index``.

		Settings are loaded from ``DATA_FOLDER + 'settings.json'``. The URLs
		returned by ``cls.getURLs()`` are passed to ``IndexManager.build``
		with an empty list as its third argument, after which an ``Index``
		over the same folder and settings is stored on the class.
		"""
		settingsPath = DATA_FOLDER + 'settings.json'
		config = Settings(settingsPath)
		sourceUrls = cls.getURLs()

		builder = IndexManager(config)
		builder.build(sourceUrls, cls.databaseFolder, [])

		# Opened only after the build above has finished.
		cls.index = Index(cls.databaseFolder, config)