Example #1
 def __init__(self, index=Index()):
     self.index = index
Example #2
    def __init__(self, name, num_columns, key, rid_space, buffer_pool):
        self.name = name
        """
        key is the index of the primary key column
        """
        self.key = key
        """
        The rid_block will specify a range/interval of possible values that can
        be used as rids for the table's records. When all the values in this
        range have been assigned, the table requests a new rid_block.
        """
        self.rid_block = rid_space.assign_space()
        """
        global_rid_space is the table's reference to the global rid space
        allocator. The table will make requests to this global rid space
        allocator for another rid_block when the current rid_block is
        'depleted'.
        """
        self.global_rid_space = rid_space
        """
        The rid_block_offset keeps track of our current position within the
        rid_block that has been allocated for the table.
        """
        self.rid_block_offset = 0
        """
        num_columns simply represents the number of columns in the table.
        """
        self.num_columns = num_columns
        """
        Keep track of the number of records associated with this table, for
        book-keeping purposes.
        """
        self.num_records = 0
        """
        Keep track of the number of updates made to records in the table. A
        merge is initiated after every 512 updates.
        """
        self.num_updates = 0
        """
        Record offset is used to determine the position of the record data
        within a given page. For example, if the column values for a record are
        located in the 8th - 15th bytes of the base pages associated with that
        record, then the record offset would be 1. If the column values for
        a record are located in the 16th - 23rd bytes of the base pages
        associated with that record, the record offset is 2, and so on.   
        """
        self.record_offset = 0
        """
        The directory_lock is used to regulate concurrent access to the page
        directory. Currently, only two threads compete to acquire and release
        the lock: the background merge thread and the main foreground thread.
        """
        self.directory_lock = threading.Lock()
        """
        Simple flag that indicates whether a merge is currently occurring.
        """
        self.merging = False
        """
        page_ranges contains a list of page ranges. A page range is purely
        logical. It only contains the list of page ids for each physical page
        in a page range.
        Each page range consists of 512 records. This way, every time a set of
        base pages gets filled up, we allocate a new set of base pages and
        write the data to those base pages and place the id's for these base
        pages into a new page range. We only create a new page range when the
        base pages in the most recently created page range are full. When this
        happens, we allocate a new set of base pages, place their id's in a page
        range and insert this into page_ranges.

        For example, we create a new table of four columns and insert 512
        records into this table. The data for these records will span the first
        set of 4 base pages and 4 metadata base pages allocated for these
        records (say these eight pages have ids 1 through 8). Then page_ranges
        would look like:
        [ [1,2,3,4,5,6,7,8] ].
        
        Now suppose we immediately insert another 100 records. The 'current'
        page range is full. So, we allocate a new set of base pages (and
        4 pages to hold the metadata), write the data to these pages, and
        page_ranges will now look like:
        [ [1,2,3,4,5,6,7,8], [9,10,11,12,13,14,15,16] ]

        The purpose of the page range is to establish an associative
        relationship between base pages (which contain base records) and tail
        pages (which contain tail records).
        Suppose we make an update (see update_record) to a record whose base
        record spans pages 1 - 8 (i.e. a record contained in the first page
        range). We allocate a set of tail pages, place the id's for these tail
        pages into the appropriate page range, and then write the tail record
        data to the tail pages. Now page_ranges would look like:
        [ [1,2,3,4,5,6,7,8,19,20,21,22,23,24,25,26], [9,10,11,12,13,14,15,16] ]
        """
        self.page_ranges = []
        """
        The page directory maps the rid of each record to a Python tuple.
        That tuple contains the range of page ids for all base pages that
        contain the record data, along with the record offset and the index
        of the page range that holds those pages.

        For example, suppose we have a table of 4 columns and there's a record
        within this table that has an rid of 1. Also suppose that this record
        spans base pages with ids 0, 1, 2, 3, and that each of the column
        values for the record is located in the first 8 bytes of its physical
        base page. Then the entry for this rid would look like:
        1 -> (0, 7, 0, 0), since the base pages would have ids 0-3, the
        metadata pages would have ids 4-7, the record offset is 0, and these
        pages would be located in page range 0.
        """
        self.page_directory = {}
        """
        Keeps track of the page ranges that are ready for merging. Only page
        ranges with full base pages are merged.
        """
        self.merge_queue = []

        self.page_ids = 0

        self.bp = buffer_pool

        self.index = Index(self)

        pass
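
The docstrings in the example above describe how page_ranges grows and what each page_directory entry holds. Below is a minimal, standalone sketch of that bookkeeping, assuming (as in the docstrings) 4 data columns, one metadata page per base page, 512 records per page range, and one 8-byte slot per record per page; the names RECORDS_PER_RANGE, PAGES_PER_BASE_SET, and insert_record are hypothetical and are not part of the Table class shown above.

RECORDS_PER_RANGE = 512
NUM_COLUMNS = 4
PAGES_PER_BASE_SET = NUM_COLUMNS * 2   # 4 data pages + 4 metadata pages

page_ranges = []      # list of lists of page ids, as described in the docstring
page_directory = {}   # rid -> (first_page_id, last_page_id, record_offset, page_range_index)
next_page_id = 0
num_records = 0

def insert_record(rid):
    """Track where a new base record would land, allocating page ids as needed."""
    global next_page_id, num_records
    if num_records % RECORDS_PER_RANGE == 0:
        # The current page range is full (or none exists yet): reserve ids for a
        # fresh set of base pages and start a new page range holding those ids.
        new_ids = list(range(next_page_id, next_page_id + PAGES_PER_BASE_SET))
        next_page_id += PAGES_PER_BASE_SET
        page_ranges.append(new_ids)
    base_ids = page_ranges[-1][:PAGES_PER_BASE_SET]
    # The record offset is the record's slot index within its base pages
    # (slot n occupies bytes 8n through 8n+7 of each physical page).
    record_offset = num_records % RECORDS_PER_RANGE
    page_directory[rid] = (base_ids[0], base_ids[-1], record_offset, len(page_ranges) - 1)
    num_records += 1

for rid in range(1, 514):      # 513 inserts: fills one page range and starts a second
    insert_record(rid)

print(page_directory[1])       # (0, 7, 0, 0)  -- matches the docstring example
print(page_directory[513])     # (8, 15, 0, 1) -- first record of the second page range
print(page_ranges)             # [[0, 1, ..., 7], [8, 9, ..., 15]]
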
Example #3
 def index(self):
     return Index(self)
Example #4
parser.add_option('-p',
                  '--profile',
                  action='store_true',
                  default=False,
                  dest='profile',
                  help='perform profiling of the indexing process')
parser.add_option('-t',
                  '--thesaurus',
                  action='store',
                  default=None,
                  dest='thesaurus',
                  help='ID of thesaurus to be used')

options, files = parser.parse_args()

I = Index(fields=('SearchableText', ), autoexpand_limit=4)

ts = time.time()
count = 0
bytes = 0

ID2FILES = {}


def do_index(options, files):
    global count, bytes

    if not files:
        print >> sys.stderr, 'Reading files from %s' % options.directory
        files = []
        for dirname, dirs, filenames in os.walk(options.directory):
Example #5
 def test_should_add_documents_with_name_and_content(self):
     index = Index()
     index.add_document('test', 'this is my first document')
     index.add_document('test2', 'this is my second document')
     self.assertEquals(len(index), 2)
     self.assertEquals(index._documents, set(['test', 'test2']))
Example #6
    dir_path = input(
        "Enter the path to Directory for the documents ex. cranfieldDocs:")
    queries_path = input("Enter the path for queries to be evaluated:")
    relevant_doc_path = input("Enter the path for relevant documents file:")
    stopwords_file = input("Enter path to the stopwords file:")

    if dir_path and queries_path and relevant_doc_path and stopwords_file:

        print("Evaluating your queries...")

        files = os.listdir(dir_path)

        no_doc = len(files)  # total number of documents in collection

        tp = TextProcessor()
        index = Index()

        # inverted index
        inverted_index = {}

        document_frequency = {}

        # tf-idf for each term
        tf_idf = {}

        # cosine similarity
        cos_similarities = {}

        for file in files:
            parser = SGMLParser(dir_path + "/" + file)
Example #7
 def __init__(self, table):
     self.table = table
     self.idx = Index(table)
     pass
Example #8
                    ctr += 1
                    if args.show:
                        logging.info(" Found in -  %s", filename)
        except:
            pass
    logging.info("full regular expression search took %s", str(time() - st))
    return ctr, time() - st


if args.index:
    if args.directory == 'empty':
        logging.info('Please read usage --help')
        quit()
    else:
        if os.path.exists(args.directory):
            index = Index(args.directory)
        else:
            logging.info('No such file exists')
            quit()

if args.test:
    try:
        os.system("rm my.pkl")
    except:
        pass

    regexs = ["a(b+|c+)d", "(abc|cba)def", "abc+de", "ab(cd)*ef", "def|lambda", "a*(bcd|efg)",
              "(a|b|c)+@(a|b|c)+(\.(a|b|c))+"]

    for i in range(len(regexs)):
        os.system("python3 csearch.py -a demo " + '"' + regexs[i] + '"')
Example #9
def test_unique_entry():
    idx = Index()
    idx.add("COLON", ":")
    assert idx["COLON"] == {":"}
Example #10
 def __init__(self, country='us', unknown_value='unknown'):
     self.index = Index(country)
     self.country = Country(country)
     self.unknown_value = unknown_value
Example #11
def test_three_occurrences():
    sample = [("7", "DIGIT"), ("8", "DIGIT"), ("9", "DIGIT")]
    idx = Index()
    for char, word in sample:
        idx.add(word, char)
    assert idx["DIGIT"] == {"7", "8", "9"}
Example #12
def backward(kbase, query, *pmode):
    known = Index().init([(['true'], ['true'])])
    try:
        bchain(kbase, ({}, query, 0, None, None), None, known, pmode)
    except stop_proof:
        pass
Example #13
import sys, os, lucene, json
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, DirectoryReader, IndexOptions, IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis import LowerCaseFilter, StopFilter
from org.apache.lucene.analysis.en import PorterStemFilter, EnglishAnalyzer
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.lucene.store import SimpleFSDirectory
from java.nio.file import Paths
from index import Index

if __name__ == "__main__":
    if (len(sys.argv) <= 1):
        print(
            'To run index directory is required as an argument. e.g.: python index.py \"/index\"'
        )
        sys.exit()

    # required to run java functions for lucene
    lucene.initVM(classpath=lucene.CLASSPATH)

    analyzer = StandardAnalyzer()
    index = Index(sys.argv[1], analyzer)

    store = SimpleFSDirectory(Paths.get(sys.argv[1]))
    searcher = IndexSearcher(DirectoryReader.open(store))
    # open file for searching
    index.SearchIndex(searcher, analyzer, 20)
Example #14
 def set_index(self, value):
     self._index = Index(value)
     self._series.setIndex(self._index.data)
Example #15
 def test_passing_a_stemmer_should_stem_search_term_before_matching(self):
     porter_stemmer = PorterStemmer()
     index = Index(stemmer=porter_stemmer)
     index.add_document('coffee', 'I liked it')
     self.assertEquals(index.find_by_term('liked'), set(['coffee']))
Example #16
from index import Index
import pretreatment
from search import Search
import nltk
import re
import conf
import linecache
import json

if __name__ == '__main__':

    # pretreatment.pre_process()

    my_index = Index()
    # my_index.gen_index()
    # my_index.write_index_file()
    my_index.load_index_file()
    print("Get index successfully.")

    search_word = ["food"]
    print("search: ", search_word)
    stemmer = nltk.stem.PorterStemmer()
    search_word = [
        stemmer.stem(re.sub(conf.clean_rule, "", w)) for w in search_word
    ]
    result = []
    index_arr = []
    for w in search_word:
        if w not in my_index.word2id_map or my_index.word2id_map[
                w] not in my_index.index:
            print("There is no word:", w)
Example #17
from evaluation import EvalMeasure, IRList, P, AP, EvalIRModel
from index import Index
from modeles import Weighter, Vectoriel, Okapi
from ParserCACM import ParserCACM, QueryParser
from TextRepresenter import PorterStemmer


if __name__=='__main__':

    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'

    index = Index(name='test', docFrom=None,
                  parser=ParserCACM, textRepresenter=PorterStemmer,
                  create_index=False)


    weighter = Weighter(index)
    parser = ParserCACM()
    parser.initFile('cacm/cacm.txt')
    doc = parser.nextDocument()
    print(doc.others['links'])
    # for d in range(20,22):
    #     docId = str(d)
    #     print(ParserCACM().getDocument(docId))
        # print(weighter.getDocWeightsForDoc(docId), index.getDocsLength(docId))

    # q = QueryParser(query_filename, rel_filename)
    # train_queries, test_queries = q.split_query_dataset()
    # print(len(train_queries), len(test_queries))