예제 #1
0
import tabulate as tab
import numpy as np
import settings as ENV

sys.path.insert(0, 'src')
import utils as util
from indexing import docProcessor as dp
from indexing import indexing as idx
from indexing import tripleBuilder as tb
from object_definitions import document as d

# Module-level containers, all initially empty. Nothing in this section fills
# them; presumably the indexing passes further down do -- verify there.
termList, dfList, tripleList, documentList = [], [], [], []
runStats, timeStats = {}, {}

# Stop terms are obtained once, up front, via the project's utils helper.
stopTerms = util.extractStopTerms()

# Index types to build: all four listed types when BUILD_ALL_INDEXES is set,
# otherwise just the single configured one. The explicit `== True` comparison
# is kept so only True (or values equal to it, such as 1) selects the full set.
indexTypes = (
    ["INVERTED", "POSITIONAL", "STEM", "PHRASE"]
    if ENV.BUILD_ALL_INDEXES == True
    else [ENV.INDEX_TYPE]
)


# Empty our index folder.
# Paths are built with os.path.join rather than string concatenation so the
# cleanup works whether or not ENV.INDEX_LOCATION ends with a path separator.
# Real sub-directories are skipped: os.remove() cannot delete a directory and
# would otherwise abort the whole cleanup on the first one it meets.
indexFiles = os.listdir(ENV.INDEX_LOCATION)
for fileName in indexFiles:
    filePath = os.path.join(ENV.INDEX_LOCATION, fileName)
    if os.path.isdir(filePath) and not os.path.islink(filePath):
        continue
    os.remove(filePath)

# For every index type we wish to create...
import utils as util
from query import queryProcessor as qp
from query import index as i
from query import vectorSpace as vsm
from query import bm25
from query import languageModel as lang
from query import queryReducer as q_red
from query import queryExpander as q_exp
from indexing import indexing as idx
from object_definitions import document as d
from object_definitions.query import Query


# Record the wall-clock start before any other work in this section, so the
# stop-term extraction below is included in whatever is measured from it.
start_time = datetime.datetime.now()

# Stash the stop terms on the shared settings module (imported as ENV) so
# other modules that import settings can read them from there.
ENV.STOP_TERMS = util.extractStopTerms()

# we always want to extract phrases for the lexicon
# (overrides whatever value settings.EXTRACT_PHRASES had before this point)
ENV.EXTRACT_PHRASES = True

''' LOAD NECESSARY INDEXES '''
# Pick the on-disk index files according to ENV.QUERY_PROCESSING_METHOD.
# Every path is ENV.INDEX_LOCATION + lower-cased index type + a file suffix,
# joined by plain concatenation -- so ENV.INDEX_LOCATION presumably already
# ends with a path separator (TODO confirm in settings).
if ENV.QUERY_PROCESSING_METHOD == "STANDARD":
    # STANDARD: use the index type named by ENV.QUERY_PROCESSING_INDEX as-is.
    # e.g. an index type of "PHRASE" yields "<INDEX_LOCATION>phraseLexicon.txt"
    lexicon_path = ENV.INDEX_LOCATION + ENV.QUERY_PROCESSING_INDEX.lower() + "Lexicon.txt"
    doc_list_path = ENV.INDEX_LOCATION + ENV.QUERY_PROCESSING_INDEX.lower() + ENV.DOC_FILE_NAME + ".txt"
    posting_list_path = ENV.INDEX_LOCATION + ENV.QUERY_PROCESSING_INDEX.lower() + ENV.POSTING_LIST_NAME + ".txt"
    # Note the argument order: lexicon, posting list, then document list.
    ENV.primary_index = i.Index(lexicon_path, posting_list_path, doc_list_path)
# if the index we use is query dependent
elif ENV.QUERY_PROCESSING_METHOD == "CONDITIONAL":
    # CONDITIONAL: overwrite the configured index type with "PHRASE" before
    # building the paths, so the files named below are the phrase index's.
    ENV.QUERY_PROCESSING_INDEX = "PHRASE"
    lexicon_path = ENV.INDEX_LOCATION + ENV.QUERY_PROCESSING_INDEX.lower() + "Lexicon.txt"
    doc_list_path = ENV.INDEX_LOCATION + ENV.QUERY_PROCESSING_INDEX.lower() + ENV.DOC_FILE_NAME + ".txt"