def PDF(self, url, enc='UTF-8'):
    """Download the PDF at *url*, convert it to text with pdftotext and
    return the normalized text content."""
    # Both intermediate artifacts share one temp basename.
    base = TEMP_FOLDER + "temp."
    pdf_path = base + "pdf"
    txt_path = base + "txt"
    downloadFile(url, pdf_path)
    # NOTE(review): assumes PDFTOTEXT ends with a trailing space (or is
    # otherwise a well-formed command prefix) — confirm, otherwise the
    # command becomes e.g. "pdftotext-enc".
    os.system(PDFTOTEXT + "-enc " + enc + " " + pdf_path + " " + txt_path)
    return normalizePDF(readfile(txt_path))
def tempSearch(path):
    """Run an ad-hoc search described by the JSON file at *path* and
    return the query result serialized via getJson."""
    data = loads(readfile(path))
    options = data.get('options', {})
    lang = options.get('lang', 'en')
    # Build a throwaway index from the supplied data, then query it.
    index = TempSearch().build(data, getStopWords(lang))
    query = options.get('query', 'lion')
    return getJson(tempSearchQuery(index, query, {}, lang))
def PDF(self, url, enc='UTF-8'):
    """Fetch *url* as a PDF, run it through pdftotext and return the
    normalized plain-text result."""
    pdfdest = TEMP_FOLDER + "temp.pdf"
    txtdest = TEMP_FOLDER + "temp.txt"
    downloadFile(url, pdfdest)
    # NOTE(review): assumes PDFTOTEXT already ends with a space so the
    # "-enc" flag does not fuse with the executable name — confirm.
    command = PDFTOTEXT + "-enc " + enc + " " + pdfdest + " " + txtdest
    os.system(command)
    raw = readfile(txtdest)
    return normalizePDF(raw)
def buildIndex(databaseName, linksSourcePath, currSettings, lang):
    """Build a search index for *databaseName* from the newline-separated
    link list at *linksSourcePath*, applying *currSettings* overrides on
    top of the default settings file."""
    settings = Settings(DATA_FOLDER + SETTINGS_FILE)
    # Overlay caller-supplied settings on the defaults.
    for option, value in currSettings.items():
        settings.set(option, value)
    databasePath = DATABASES_FOLDER + databaseName + '/'
    urlList = readfile(linksSourcePath).splitlines()
    manager = IndexManager(settings)
    manager.shutUp = False  # keep progress output enabled
    manager.build(urlList, databasePath, getStopWords(lang), lang)
def test_toIndex(self):
    """Compare toIndex output against stored fixtures for three
    stop-word / depth combinations."""
    urls = self.getURLs()
    pages = downloads(urls)
    sites = [{'type': 'html', 'content': page, 'url': 'url'} for page in pages]
    # Fixture regeneration helpers (uncomment to rewrite expected files):
    # savefile(repr(toIndex(sites, [], 1)), TEST_FOLDER + 'index1.txt')
    # savefile(repr(toIndex(sites, getStopWords(), 1)), TEST_FOLDER + 'index2.txt')
    # savefile(repr(toIndex(sites, getStopWords(), 2)), TEST_FOLDER + 'index3.txt')
    cases = [
        ([], 1, 'index1.txt'),
        (getStopWords(), 1, 'index2.txt'),
        (getStopWords(), 2, 'index3.txt'),
    ]
    for stopWords, depth, fixture in cases:
        result = toIndex(sites, stopWords, depth)
        desired = readfile(TEST_FOLDER + fixture)
        self.assertEqual(repr(result), desired)
def testSearchCommand(self):
    """Run shell searches against the matweb-test database and compare
    each decoded result with its stored fixture file.

    Fix: removed the dead locals `save` and `queries`, which were
    assigned but never used anywhere in the method.
    """
    path = TEST_FOLDER
    fun = self.runShell
    # Decode raw shell output so it compares equal to the text fixtures.
    dfun = lambda dtb, q: fun(dtb, q).decode("utf-8")
    read = lambda name: readfile(path + name + '.txt')
    ass = self.assertEqual
    # search matweb
    ass(dfun('matweb-test', 'derivace'), read('matweb0'))
    ass(dfun('matweb-test', 'nesmysl'), read('matweb1'))
    ass(dfun('matweb-test', '(spocetne OR nespocetne) mnoziny'), read('matweb2'))
    # NOTE(review): the 'rovnice'/matweb3 case is disabled — confirm whether
    # the fixture is stale or the query result changed before re-enabling.
    # ass(dfun('matweb-test', 'rovnice'), read('matweb3'))
    ass(dfun('matweb-test', 'rovnice NOT (linearni OR pravdepodobnost)'), read('matweb4'))
def testSearchCommand(self):
    """Run shell searches against the matweb-test database and compare
    each decoded result with its stored fixture file.

    Fix: removed the dead locals `save` and `queries`, which were
    assigned but never used anywhere in the method.
    """
    path = TEST_FOLDER
    fun = self.runShell
    # Decode raw shell output so it compares equal to the text fixtures.
    dfun = lambda dtb, q: fun(dtb, q).decode("utf-8")
    read = lambda name: readfile(path + name + '.txt')
    ass = self.assertEqual
    # search matweb
    ass(dfun('matweb-test', 'derivace'), read('matweb0'))
    ass(dfun('matweb-test', 'nesmysl'), read('matweb1'))
    ass(dfun('matweb-test', '(spocetne OR nespocetne) mnoziny'), read('matweb2'))
    # NOTE(review): the 'rovnice'/matweb3 case is disabled — confirm whether
    # the fixture is stale or the query result changed before re-enabling.
    # ass(dfun('matweb-test', 'rovnice'), read('matweb3'))
    ass(dfun('matweb-test', 'rovnice NOT (linearni OR pravdepodobnost)'), read('matweb4'))
def getStopWords(lang):
    """Return the stop-word set for *lang*, loaded (and cached) from
    DATA_FOLDER/STOPWORDS_NAME.<lang>.txt.

    Fix: the cache key now includes the language. The old fixed key
    'stopwords' meant that — assuming _cache_result keys its cache on
    the first argument (TODO confirm) — requesting a second language
    returned the first language's cached set.
    """
    return _cache_result(
        'stopwords.' + lang,
        lambda: set(readfile(DATA_FOLDER + STOPWORDS_NAME + "." + lang + ".txt").split()))
def loadData(self):
    """Load recorded matweb search queries, keeping at most
    self.maxQueries of them in self.pureData."""
    lines = readfile(DATA_FOLDER + 'matwebsearches.txt').splitlines()
    # Truncate to the configured query budget.
    self.pureData = lines[:self.maxQueries]
def loadSettings(self, path):
    """Read the settings file at *path*, storing both the raw text
    (self.text) and the parsed JSON object (self.settings)."""
    raw = readfile(path)
    self.text = raw
    self.settings = json.loads(raw)
def getURLs(cls):
    """Return the test URLs listed one per line in DATA_FOLDER/test.txt."""
    return readfile(DATA_FOLDER + 'test.txt').splitlines()