def main(args):
    indexer = Indexer()
    with io.open(args.texts, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            bag = BagOfWords(line, filter_stopwords=False)
            indexer.index(bag)
    open_func = gzip.open if args.zip else io.open
    index_ext = ".json.gz" if args.zip else ".json"
    with open_func(args.index + index_ext, mode="wb") as f:
        indexer.dump(f)
    return 0
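A minimal sketch of how this main() might be wired to a command line. The flag names mirror the attributes the function reads (args.texts, args.index, args.zip) but are otherwise assumptions, not taken from the original project.

# Hypothetical CLI wiring for main() above; flag names are assumptions.
import argparse
import sys

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Build a JSON index from a line-per-document text file.")
    parser.add_argument("texts", help="UTF-8 input file, one document per line")
    parser.add_argument("index", help="output index path (extension is appended)")
    parser.add_argument("--zip", action="store_true", help="gzip-compress the JSON index")
    sys.exit(main(parser.parse_args()))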
def __init__(self, cluster_size=2000, dhash_size=4):
    self.cluster_size = cluster_size
    self.dhash_size = dhash_size
    self.indexer = Indexer()
    self.db_path = '../index/deeprelevance.db'
    self.temp_table = 'CREATE TEMPORARY TABLE dhash_filtered (dbid TEXT PRIMARY KEY)'
    self.temp_insert = 'INSERT INTO dhash_filtered VALUES(?)'
    self.dhash_stmt = 'SELECT dbid, dhash FROM features'
    self.dhash_bit_stmt = 'SELECT dbid, (dhash|?)&~(dhash&?) FROM features'
    self.feats_stmt = '''
def indexCorpus():
    indexer = Indexer(database)
    # index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # index lemmatized articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
def main(search_terms):
    dbname = 'ftp_files.db'
    db = Database(dbname)
    xname = 'xapian.db'
    corpus = Indexer(xname)
    result = corpus.search(str(search_terms))
    print_results(result[0], result[1], result[2], db)
    # clean up
    corpus.close()
    db.close()
def __init__(self, readers=10, start='Main/HomePage', outfile=None):
    self.url_queue = SetQueue()
    self.url_queue.put(start)
    self.link_queue = Queue()
    self.index = Indexer()
    self.readers = [
        TVTropes_Reader(self.url_queue, self.link_queue, daemon=True)
        for _ in range(readers)
    ]
    self.counter = TVTropes_Counter(self.link_queue, self.index,
                                    outfile=outfile, daemon=True)
def parse_html(url, bs):
    print('Start parsing html from url: ' + str(url))
    body = bs.find('body')
    if body is None:
        return
    raw_text = body.get_text()
    words = get_words_from_raw_text(raw_text)
    dict_words = get_dict_words(words[:100])
    # print(dict_words)
    print('Start indexing url: ' + str(url))
    indexer = Indexer(url=url, words=dict_words)
    indexer.save()
def form_example():
    from videosplitter import VideoSplitter
    from indexer import Indexer
    from search import Search
    if request.method == 'POST':
        # this block is only entered when the form is submitted
        dataset_path = 'video_frames'
        index_path = 'index.csv'
        query_path = 'static/defaultvalues'
        result_path = 'static/result'
        video = request.files['video']
        query_image = request.files['image']
        copyfile('static/defaultfiles/Video4_amn_cs445.mp4',
                 'static/defaultvalues/Video4_amn_cs445.mp4')
        copyfile('static/defaultfiles/query4_amn_cs445.png',
                 'static/defaultvalues/query4_amn_cs445.png')
        if video.filename == '':
            videofilename = 'static/defaultvalues/Video4_amn_cs445.mp4'
        else:
            # save video
            if os.path.exists(video.filename):
                os.remove(video.filename)
            video.save(os.path.join("", video.filename))
            videofilename = video.filename
        if query_image.filename == '':
            query_path = 'static/defaultvalues/query4_amn_cs445.png'
        else:
            # save query image
            if os.path.exists("static/defaultvalues/" + query_image.filename):
                os.remove("static/defaultvalues/" + query_image.filename)
            query_image.save(os.path.join(query_path, query_image.filename))
            query_path = os.path.join(query_path, query_image.filename)
        videoSplitter = VideoSplitter('video_frames')
        videoSplitter.splitVideo(videofilename)
        indexer = Indexer('index.csv', 'video_frames')
        indexer.indexImages()
        search = Search(dataset_path, index_path, query_path, result_path)
        results = search.performSearch()
        output = []
        for image in results:
            output.append(image)
        return render_template("result.html", images=output)
    return render_template("index_ssd.html")
def indexit(tokenizer, filenames, store_positions=False, calculate_tfidf=False,
            memory_usage=20):
    indexer = Indexer(tokenizer, 'indexer', store_positions=store_positions,
                      max_memory_usage=memory_usage)
    for filename in filenames:
        corpus_reader = CorpusReader(filename)
        indexer.index(corpus_reader)
    indexer.merge(calculate_tfidf)
    return indexer
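A possible call to indexit(), for illustration only; the Tokenizer class and corpus file names are assumptions, not taken from the original project.

# Hypothetical usage of indexit(); Tokenizer and the file names are assumed.
indexer = indexit(Tokenizer(), ["corpus_part1.csv", "corpus_part2.csv"],
                  store_positions=True, calculate_tfidf=True)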
def __init__(self, config=None, run_config=None):
    if not config:
        config = ConfigClass()
    if not run_config:
        run_config = RunConfigClass()
    self._run_config = run_config
    self._config = config
    self._parser = Parse(run_config)
    self._indexer = Indexer(run_config)
    self._model = None
    self.searcher = Searcher(self._parser, self._indexer, run_config,
                             model=self._model)
def main():
    idx = Indexer()
    scan_queue = [
        # 'http://s28.bitdl.ir/Video/',
        # 'http://128.199.129.79:666/',
        # 'https://korea-dpr.com/mp3/',
        # 'http://46.4.132.219:999/',
        'https://mirror.futureweb.be/manjaro/arm-stable/',
    ]
    urls = idx.scan(scan_queue)
    idx.save('urls.txt', urls)
def test_highlight_window_one(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_highlight_window.txt', 'w')
    test_file_one.write('Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_highlight_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window('test_highlight_window.txt',
                                               Position_Plus(0, 6, 15), 1)
    result = window.highlight_window()
    output_string = 'Alina <b>Zakharova</b> is'
    self.assertEqual(result, output_string)
    os.remove('test_highlight_window.txt')
def __init__(self, config=None):
    self._config = config
    if self._config:
        if not hasattr(self._config, 'toStem'):
            self._config.toStem = False
        if not hasattr(self._config, 'toLemm'):
            self._config.toLemm = False
    self._parser = Parse()
    self._indexer = Indexer(config)
    self._model = None
    self.corpus_size = 0
    self.load_precomputed_model()
def test_index_creation(self):
    self.maxDiff = None
    indexer = Indexer()
    for text in self.texts:
        text = text.strip()
        bag = BagOfWords(text, enable_stemming=False, filter_stopwords=False)
        indexer.index(bag)
    got = indexer.to_dict()
    self.assertSequenceEqual(self.expected["docs_index"], got["docs_index"])
    self.assertDictEqual(self.expected["terms_index"], got["terms_index"])
def test_extend_window_rus_one(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write('Пьер с грустью слышал над собою насмешки.')
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 0, 4), 1)
    window.extend_window()
    extended_window = Context_Window(
        'Пьер с грустью слышал над собою насмешки.',
        [Position_Plus(0, 0, 4)], 0, 41)
    self.assertEqual(window, extended_window)
    os.remove('test_extend_window_rus.txt')
def test_extend_window(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window.txt', 'w')
    test_file_one.write('Alina Zakharova is a student!!')
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window('test_extend_window.txt',
                                               Position_Plus(0, 6, 15), 1)
    window.extend_window()
    extended_window = Context_Window('Alina Zakharova is a student!!',
                                     [Position_Plus(0, 6, 15)], 0, 30)
    self.assertEqual(window, extended_window)
    os.remove('test_extend_window.txt')
def test_not_crossed(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_not_crossed_window.txt', 'w')
    test_file_one.write('The girl named Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_not_crossed_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window_A = windows.Context_Window.get_window(
        'test_not_crossed_window.txt', Position_Plus(0, 31, 33), 1)
    window_B = windows.Context_Window.get_window(
        'test_not_crossed_window.txt', Position_Plus(0, 8, 14), 1)
    crossed_AB = window_A.is_crossed(window_B)
    self.assertEqual(False, crossed_AB)
    os.remove('test_not_crossed_window.txt')
def test_dump(self):
    indexer = Indexer()
    for text in self.texts:
        text = text.strip()
        bag = BagOfWords(text, enable_stemming=False, filter_stopwords=False)
        indexer.index(bag)
    fd = StringIO()
    indexer.dump(fd)
    fd.seek(0)
    got = json.load(fd)
    self.assertSequenceEqual(self.expected["docs_index"], got["docs_index"])
    self.assertDictEqual(self.expected["terms_index"], got["terms_index"])
def __init__(self, dictionary_file, postings_file, rate=0.01, alpha=0.1,
             expand=True, feedback=True, pagerank=True, pivoted=False,
             score=False):
    self.dictionary_file = dictionary_file
    self.postings_file = postings_file
    self.rate = rate
    self.alpha = alpha
    self.pagerank = pagerank
    self.pivoted = pivoted
    self.score = score
    self.stemmer = PorterStemmer()
    self.indexer = Indexer(dictionary_file, postings_file)
    self.refiner = Refiner(indexer=self.indexer, expand=expand, feedback=feedback)
    self.indexer.LoadDict()
def create_new_indexer(self):
    candidate_link = self.links_queue.pop(timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
    candidate_indexedPage, was_created = IndexedPage.objects.get_or_create(
        pk=candidate_link)
    if was_created:
        Indexer(indexed_page=candidate_indexedPage,
                on_finished_indexing=self.on_indexer_finished,
                main_thread_cmd_queue=self.main_thread_cmd_queue,
                links_queue=self.links_queue).start()
        return True
    else:
        logger.info("Skipping {url}. Index already exists".format(url=candidate_link))
        return False
def __init__(self, config=None):
    """
    Init engine with the relevant model - Thesaurus_Searcher.
    :param config:
    """
    self._config = config
    try:
        self._reader = ReadFile(corpus_path=config.get__corpusPath())
    except Exception:
        # fall back to an empty corpus path when config is missing or malformed
        self._reader = ReadFile("")
    self._parser = Parse()
    self._parser.STEMMER = config.toStem
    self._indexer = Indexer(config)
    self._model = Thesaurus_Searcher(self._indexer)
    self.last_parquet = False
def main():
    parser = ArgumentParser()
    parser.add_argument("-p", "--path", dest="path", default=None,
                        help="Document path")
    parser.add_argument("-t", "--threads", dest="threads",
                        help="Number of threads to launch")
    args = parser.parse_args()
    path = os.path.abspath(args.path) if args.path else "docs"
    threads = int(args.threads) if args.threads else 5
    indexer = Indexer(path, threads)
    indexer.create_index()
def start_app(self):
    if self.box is None:
        self.box = Box(self.config)
        saveConfig(self.configPath, self.config)
    indexer = Indexer(self.config['path'], self.config['encryption_key'], self.box)
    indexer.synchronize()
    self.eventList = EventList()
    self.remoteObserver = startRemote(self.config['path'],
                                      self.config['encryption_key'],
                                      self.box, self.eventList)
    self.localObserver = startLocal(self.config['path'],
                                    self.config['encryption_key'],
                                    self.box, self.eventList)
    self.set_icon('img/icon_active.png')
    self.isAppRunning = True
def picture(structure='reporting', datadir=os.getcwd(), engine='dot',
            teammembers=True, openimage=True):
    """
    Render an org chart PNG image and open it.

    STRUCTURE: reporting|teams
    """
    indexer = Indexer(datadir)
    indexer.load()
    indexer.index()
    orggraph = OrgGraph(indexer, engine, teammembers)
    orggraph.buildgraph(structure)
    imagepath = orggraph.render()
    if openimage:
        showpicture(imagepath)
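A possible invocation of picture(), for illustration only; the data directory is an assumption.

# Hypothetical usage of picture(); './orgdata' is an assumed data directory.
picture(structure='teams', datadir='./orgdata', engine='dot',
        teammembers=False, openimage=False)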
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve, word2vec):
    """
    :return:
    """
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # TODO - handle all files ~50 (can do with from multiprocessing.pool import ThreadPool)
    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
                counter = 0
                num_of_writes += 1
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
    # drop terms that appear only once in the corpus
    indexer.inverted_idx = {key: val for key, val in indexer.inverted_idx.items()
                            if val != 1}
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    return num_of_writes
def main(flist, plist='prefix.conf', dbname='ftp_files.db',
         xname='xapian.db', verbose=False):
    '''
    Main method: dispatches tasks to catalogue and index remote FTP servers.
    '''
    db = Database(dbname)
    indexer = Indexer(xname, writeable=True)
    # Read list of prefixes
    with open(plist) as f:
        prefixes = f.read().splitlines()
    # Read list of remote FTP servers
    with open(flist) as f:
        servers = f.read().splitlines()
    # Compile list of all servers
    for server in servers[:]:
        idx = servers.index(server)
        for prefix in prefixes:
            servers.insert(idx, prefix + '.' + server)
    for server in servers:
        if verbose:
            print("Scanning: %s" % server)
        # Determine if server is a valid FTP site
        if not is_open_ftp_server(server):
            continue
        if verbose:
            print("\tServer is valid, connecting...")
        # Record all files on a remote server
        if not enumerate_files(server, db, verbose=verbose):
            print("\tCould not enumerate files on %s" % server)
            continue
        # Download text and add to corpus
        if not index_content(server, indexer, db, verbose=verbose):
            print("\tCould not index %s" % server)
    if verbose:
        print("\nCataloguing and indexing complete.")
    # clean up
    indexer.close()
    db.close()
def main():
    # File_path is a class that stores the file paths for required documents.
    f = File_path()
    f.declare_paths()
    corpus = Create_corpus(f.raw_files_folder, True, True)
    corpus.parse_files(f.raw_files_folder, f.parsed_file_folder, True, True)
    a = Indexer()
    a.create_unigram_index(f.parsed_file_folder, f.index_file_path)
    c = Context()
    index = c.read_inverted_index(f.index_file_path)
    DL = c.calculate_document_length(f.parsed_file_folder)
    AvDL = c.calculate_avg_doc_length(f.parsed_file_folder)
    q = Query_Parser()
    q.parse_queries(f.query_file_path, f.parsed_query_file_path)
    query = dict()
    with open(f.parsed_query_file_path, "r") as f1:
        for line in f1:
            parts = line.split(":")
            query[parts[0]] = parts[1].strip()
    bm = BM25WithRelevance("BM25WithRelevance")
    bm.retrieve_bm25_scores(query, f.parsed_file_folder, AvDL, DL, index,
                            f.relevance_file_path, f.output_folder_path)
    tf = Tf_idf("TfIdfRanking")
    tf.retrieve_tfidf_scores(DL, query, index, f.output_folder_path)
    ql = QueryLikelihood("QLModel")
    ql.retrieve_QL_scores(DL, query, index, f.output_folder_path)
    # task 2 - pseudo relevance feedback
    pr = PseudoRelFeedback()
    pr.PRmain(f.parsed_file_folder, f.index_file_path, f.parsed_query_file_path,
              f.relevance_file_path, f.stop_file_path, f.output_folder_path)
    # task 3 - stemmed queries
    t = Task3()
    t.driver_stemmed(f)
    t.ranking_with_stopwords(f)
    # phase 2 - snippet generation
    sg = SnippetGeneration(f.raw_files_folder)
    sg.get_queries(f.parsed_query_file_path)
    output_file_path = f.output_folder_path + "/" + "BM25WithRelevance" + ".txt"
    sg.get_ranklist(output_file_path)
    sg.generate_snippet(f.snippet_file)
def test_two_files(self):
    test = open("text.txt", 'w')
    test.write("test")
    test.close()
    test = open("text1.txt", 'w')
    test.write("my my")
    test.close()
    self.indexer.indexing("text.txt")
    self.indexer = Indexer('database')
    self.indexer.indexing("text1.txt")
    words1 = dict(shelve.open("database"))
    # "test" comes from text.txt; "my my" lives in text1.txt
    words2 = {"my": {"text1.txt": [Position(0, 2), Position(3, 5)]},
              "test": {"text.txt": [Position(0, 4)]}}
    self.assertEqual(words1, words2)
def test_extend_window_rus(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write(
        'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.'
    )
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 28, 36), 1)
    window.extend_window()
    extended_window = Context_Window(
        'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
        [Position_Plus(0, 28, 36)], 22, 55)
    self.assertEqual(window, extended_window)
    os.remove('test_extend_window_rus.txt')
def test_extend_window_rus_two(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write(
        'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.'
    )
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 34, 38), 1)
    window.extend_window()
    extended_window = Context_Window(
        'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
        [Position_Plus(0, 34, 38)], 0, 119)
    self.assertEqual(window, extended_window)
    os.remove('test_extend_window_rus.txt')
def run_engine(corpus_path_, output_path_, stemming_):
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_,
                         stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        documents_list = r.get_documents(pathes[i][0], pathes[i][0])
        for j, doc in enumerate(documents_list):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            # flush the buffered documents to a posting file every 200k docs,
            # and once more on the very last document of the last file
            if number_of_documents % 200000 == 0:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()