def __init__(self, index=None):
    """Store the index this object works with.

    Args:
        index: The index to use. When omitted (or ``None``), a fresh
            ``Index()`` is created for this instance.
    """
    # A default of ``Index()`` written in the signature is evaluated once,
    # when the function is defined, so every instance created without an
    # argument would share (and mutate) the very same Index object.
    self.index = Index() if index is None else index
def __init__(self, name, num_columns, key, rid_space, buffer_pool):
    """Set up the bookkeeping state of a new table.

    Args:
        name: Name of the table.
        num_columns: Number of columns in the table.
        key: Index of the primary key column.
        rid_space: Global rid space allocator. One rid block is requested
            from it here and the allocator is kept for later requests.
        buffer_pool: Buffer pool object, stored as ``self.bp``.
    """
    self.name = name

    # key is the index of the primary key column.
    self.key = key

    # The rid_block specifies a range/interval of possible values that can
    # be used as rids for the table's records. When all the values in this
    # range have been assigned, the table requests a new rid_block.
    self.rid_block = rid_space.assign_space()

    # global_rid_space is the table's reference to the global rid space
    # allocator. The table makes requests to this allocator for another
    # rid_block when the current rid_block is 'depleted'.
    self.global_rid_space = rid_space

    # The rid_block_offset keeps track of our current position in the
    # current rid space that has been allocated for the table.
    self.rid_block_offset = 0

    # num_columns simply represents the number of columns in the table.
    self.num_columns = num_columns

    # Number of records associated with this table, for book-keeping
    # purposes.
    self.num_records = 0

    # Number of updates made to records in the table. Merge is initiated
    # upon every 512 updates.
    self.num_updates = 0

    # Record offset is used to determine the position of the record data
    # within a given page. For example, if the column values for a record
    # are located in the 8th - 15th bytes of the base pages associated with
    # that record, then the record offset would be 1. If the column values
    # are located in the 16th - 23rd bytes of those base pages, the record
    # offset is 2, and so on.
    self.record_offset = 0

    # The directory_lock is used to regulate concurrent access to the page
    # directory. Currently, only two threads have to compete for the lock:
    # the background merge thread and the main foreground thread.
    self.directory_lock = threading.Lock()

    # Simple flag that indicates if a merge is currently occurring.
    self.merging = False

    # page_ranges contains a list of page ranges. A page range is purely
    # logical: it only contains the list of page ids for each physical page
    # in the page range. Each page range consists of 512 records. Every
    # time a set of base pages gets filled up, we allocate a new set of
    # base pages, write the data to those base pages and place the ids for
    # these base pages into a new page range. A new page range is only
    # created when the base pages in the most recently created page range
    # are full.
    #
    # For example, we create a new table of four columns and insert 512
    # records into this table. The data for these records will span the
    # first set of 4 base pages and 4 metadata base pages allocated for
    # these records. Then page_ranges would look like:
    #
    #     [ [1,2,3,4,5,6,7,8] ]
    #
    # Now suppose we immediately insert another 100 records. The 'current'
    # page range is full, so we allocate a new set of base pages (and 4
    # pages to hold the metadata), write the data to these pages, and
    # page_ranges will now look like:
    #
    #     [ [1,2,3,4,5,6,7,8] [9,10,11,12,13,14,15,16] ]
    #
    # The purpose of the page range is to establish an associative
    # relationship between base pages (which contain base records) and tail
    # pages (which contain tail records). Suppose we make an update (see
    # update_record) to a record whose base record spans pages 1 - 8 (i.e.
    # a record contained in the first page range). We allocate a set of
    # tail pages, place the ids for these tail pages into the appropriate
    # page range, and then write the tail record data to the tail pages.
    # Now page_ranges would look like:
    #
    #     [ [1,2,3,4,5,6,7,8,19,20,21,22,23,24,25,26]
    #       [9,10,11,12,13,14,15,16] ]
    self.page_ranges = []

    # The page directory maps the rid of each record to a python tuple.
    # That tuple contains the range of page ids for all base pages that
    # contain the record data. It also contains the record offset.
    #
    # For example, suppose we have a table of 4 columns and there's a
    # record within this table that has an rid of 1. Also suppose that this
    # record spans base pages with ids 0,1,2,3, and that each of the column
    # values for the record is located in the first 8 bytes of each of the
    # physical base pages. Then the entry for this rid would look like:
    #
    #     1 -> (0,7,0,0)
    #
    # since base pages would have ids 0-3, metadata pages would have ids
    # 4-7 and these pages would be located in page range 0.
    self.page_directory = {}

    # Keeps track of the page ranges that are ready for merging. Only page
    # ranges with full base pages are merged.
    self.merge_queue = []

    self.page_ids = 0
    self.bp = buffer_pool

    # Built last: the Index receives this table object itself.
    self.index = Index(self)
def index(self):
    """Build and return a new ``Index`` constructed from this object."""
    fresh_index = Index(self)
    return fresh_index
parser.add_option('-p', '--profile', action='store_true', default=False, dest='profile', help='perform profiling of the indexing process') parser.add_option('-t', '--thesaurus', action='store', default=None, dest='thesaurus', help='ID of thesaurus to be used') options, files = parser.parse_args() I = Index(fields=('SearchableText', ), autoexpand_limit=4) ts = time.time() count = 0 bytes = 0 ID2FILES = {} def do_index(options, files): global count, bytes if not files: print >> sys.stderr, 'Reading files from %s' % options.directory files = [] for dirname, dirs, filenames in os.walk(options.directory):
def test_should_add_documents_with_name_and_content(self):
    """Adding two named documents registers both names in the index."""
    index = Index()
    index.add_document('test', 'this is my first document')
    index.add_document('test2', 'this is my second document')

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(len(index), 2)
    self.assertEqual(index._documents, {'test', 'test2'})
# Interactive setup for a query-evaluation run: prompt for the four input
# paths, then initialise the containers used while processing the documents.
# NOTE(review): this is only the visible start of the script; the nesting
# below is reconstructed from the statement order -- confirm against the
# full file.
dir_path = input(
    "Enter the path to Directory for the documents ex. cranfieldDocs:")
queries_path = input("Enter the path for queries to be evaluated:")
relevant_doc_path = input("Enter the path for relevant documents file:")
stopwords_file = input("Enter path to the stopwords file:")
# Proceed only when every prompt received a non-empty answer.
if dir_path and queries_path and relevant_doc_path and stopwords_file:
    print("Evaluating your queries...")
    files = os.listdir(dir_path)
    no_doc = len(files)  # total number of documents in collection
    tp = TextProcessor()
    index = Index()
    # inverted index
    inverted_index = {}
    document_frequency = {}
    # tf-idf for each term
    tf_idf = {}
    # cosine similarity
    cos_similarities = {}
    for file in files:
        # One parser per directory entry, built from "<dir_path>/<file>".
        parser = SGMLParser(dir_path + "/" + file)
def __init__(self, table):
    """Keep a reference to ``table`` and build an ``Index`` from it."""
    self.table = table
    self.idx = Index(table)
ctr += 1 if args.show: logging.info(" Found in - %s", filename) except: pass logging.info("full regular expression search took %s", str(time() - st)) return ctr, time() - st if args.index: if args.directory == 'empty': logging.info('Please read usage --help') quit() else: if os.path.exists(args.directory): index = Index(args.directory) else: logging.info('No such file exists') quit() if args.test: try: os.system("rm my.pkl") except: pass regexs = ["a(b+|c+)d", "(abc|cba)def", "abc+de", "ab(cd)*ef", "def|lambda", "a*(bcd|efg)", "(a|b|c)+@(a|b|c)+(\.(a|b|c))+"] for i in range(len(regexs)): os.system("python3 csearch.py -a demo " + '"' + regexs[i] + '"')
def test_unique_entry():
    """A single character added under a name comes back as a one-item set."""
    name, char = "COLON", ":"
    idx = Index()
    idx.add(name, char)
    assert idx[name] == {char}
def __init__(self, country='us', unknown_value='unknown'):
    """Build the per-country helpers and remember the fallback value.

    Args:
        country: Country identifier handed to both ``Index`` and
            ``Country``. Defaults to ``'us'``.
        unknown_value: Stored unchanged as ``self.unknown_value``.
            Defaults to ``'unknown'``.
    """
    # Index is constructed before Country, matching the original order.
    self.index = Index(country)
    self.country = Country(country)
    self.unknown_value = unknown_value
def test_three_occurrences():
    """Three characters added under one word are returned as a single set."""
    pairs = [("7", "DIGIT"), ("8", "DIGIT"), ("9", "DIGIT")]
    idx = Index()
    for pair in pairs:
        symbol, label = pair
        # Each pair is (char, word); add() takes the word first.
        idx.add(label, symbol)
    expected = {"7", "8", "9"}
    assert idx["DIGIT"] == expected
def backward(kbase, query, *pmode):
    """Run ``bchain`` for ``query`` against ``kbase``, absorbing ``stop_proof``.

    Args:
        kbase: Passed through to ``bchain`` as its first argument.
        query: Placed in the initial state tuple given to ``bchain``.
        *pmode: Extra positional arguments, forwarded to ``bchain`` as one
            tuple.

    Returns:
        None, whether ``bchain`` finishes normally or raises ``stop_proof``.
    """
    # The "known" index starts with a single (['true'], ['true']) entry.
    seed_entries = [(['true'], ['true'])]
    known = Index().init(seed_entries)

    initial_state = ({}, query, 0, None, None)
    try:
        bchain(kbase, initial_state, None, known, pmode)
    except stop_proof:
        # stop_proof is swallowed on purpose; nothing is reported back.
        return
# Command-line entry point: run a search over the index directory given as
# the first argument.
# Import order is kept as in the original (lucene before the org.apache.*
# modules).
import sys
import os
import lucene
import json
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, DirectoryReader, IndexOptions, IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis import LowerCaseFilter, StopFilter
from org.apache.lucene.analysis.en import PorterStemFilter, EnglishAnalyzer
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.lucene.store import SimpleFSDirectory
from java.nio.file import Paths
from index import Index

if __name__ == "__main__":
    # The index directory is mandatory; print the usage hint and stop
    # without it.
    if len(sys.argv) < 2:
        print(
            'To run index directory is required as an argument. e.g.: python index.py \"/index\"'
        )
        sys.exit()

    index_dir = sys.argv[1]

    # required to run java functions for lucene
    lucene.initVM(classpath=lucene.CLASSPATH)

    analyzer = StandardAnalyzer()
    index = Index(index_dir, analyzer)

    store = SimpleFSDirectory(Paths.get(index_dir))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)

    # open file for searching
    index.SearchIndex(searcher, analyzer, 20)
def set_index(self, value):
    """Wrap ``value`` in an ``Index`` and hand its data to the series.

    Args:
        value: Passed to ``Index`` to build the new ``self._index``.
    """
    self._index = Index(value)
    # Resolve the series setter first, then read the data back from the
    # stored index, keeping the original evaluation order.
    apply_index = self._series.setIndex
    apply_index(self._index.data)
def test_passing_a_stemmer_should_stem_search_term_before_matching(self):
    """A term is found by ``find_by_term`` when the index has a stemmer."""
    porter_stemmer = PorterStemmer()
    index = Index(stemmer=porter_stemmer)
    index.add_document('coffee', 'I liked it')

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(index.find_by_term('liked'), {'coffee'})
# Small driver script: load a previously written index file and look up a
# fixed list of search words in it.
# NOTE(review): only the beginning of the script is visible here; `result`
# and `index_arr` are presumably filled further down -- confirm.
from index import Index
import pretreatment
from search import Search
import nltk
import re
import conf
import linecache
import json

if __name__ == '__main__':
    # pretreatment.pre_process()
    my_index = Index()
    # my_index.gen_index()
    # my_index.write_index_file()
    my_index.load_index_file()
    print("Get index successfully.")
    search_word = ["food"]
    print("search: ", search_word)
    stemmer = nltk.stem.PorterStemmer()
    # Strip everything matching conf.clean_rule from each word, then stem it.
    search_word = [
        stemmer.stem(re.sub(conf.clean_rule, "", w)) for w in search_word
    ]
    result = []
    index_arr = []
    for w in search_word:
        # A word is reported as missing when it has no entry in word2id_map,
        # or when its mapped id is absent from the index itself.
        if w not in my_index.word2id_map or my_index.word2id_map[
                w] not in my_index.index:
            print("There is no word:", w)
# Scratch script for the CACM collection: build an Index and a Weighter,
# then print the 'links' entry of the first parsed document.
from evaluation import EvalMeasure, IRList, P, AP, EvalIRModel
from index import Index
from modeles import Weighter, Vectoriel, Okapi
from ParserCACM import ParserCACM, QueryParser
from TextRepresenter import PorterStemmer

if __name__ == '__main__':
    # Only referenced by the disabled experiments at the bottom.
    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'

    index = Index(
        name='test',
        docFrom=None,
        parser=ParserCACM,
        textRepresenter=PorterStemmer,
        create_index=False,
    )
    weighter = Weighter(index)

    parser = ParserCACM()
    parser.initFile('cacm/cacm.txt')
    doc = parser.nextDocument()
    links = doc.others['links']
    print(links)

    # Disabled experiments, kept for reference:
    # for d in range(20,22):
    #     docId = str(d)
    #     print(ParserCACM().getDocument(docId))
    #     print(weighter.getDocWeightsForDoc(docId), index.getDocsLength(docId))
    # q = QueryParser(query_filename, rel_filename)
    # train_queries, test_queries = q.split_query_dataset()
    # print(len(train_queries), len(test_queries))