Exemplo n.º 1
0
def getDocuments(dimensions, date_dimenstions):
    """Collect unit-normalized feature vectors for every document matching a query.

    A document qualifies if it contains any term in `dimensions` or is
    indexed under any date in `date_dimenstions`.

    :param dimensions: iterable of query terms (keys of the inverted index)
    :param date_dimenstions: iterable of date tokens (keys of the dates index)
    :return: dict mapping document name -> numpy unit vector
    """
    index_table = indexer.index()
    dates_index = loader.loadJsonFile('./storage/dates_index.json')
    if not dates_index:
        # loadJsonFile returns a falsy value when the file is missing.
        dates_index = {}

    documents = {}

    def _add_normalized(doc):
        # Build each document's vector exactly once and store its
        # unit-norm form (the duplicated logic lived in both loops before).
        if doc not in documents:
            v = makeVector(doc, dimensions, date_dimenstions)
            documents[doc] = v / np.linalg.norm(v)

    try:
        for dim in dimensions:
            # Each posting is (document, weight); only the name matters here.
            for posting in index_table[dim]:
                _add_normalized(posting[0])
        for dim in date_dimenstions:
            if dim not in dates_index:
                continue
            for doc in dates_index[dim]:
                _add_normalized(doc)
    except Exception as e:
        # Best-effort: report and return whatever was gathered so far.
        print("Exception in get Docs\n" + str(e))
        traceback.print_exc()

    return documents
Exemplo n.º 2
0
 def __init__(self):
     """Load the abbreviation dictionary from disk and prepare the
     abbreviation-matching regex pattern."""
     self.abbrev = loader.loadJsonFile("./storage/abbreviations.json")

     if self.abbrev is False:
         # loadJsonFile signals a missing/unreadable file with False;
         # fall back to an empty dictionary so lookups still work.
         print("no abbrevs get, reset dict")
         self.abbrev = {}

     # Raw string avoids the invalid-escape SyntaxWarning for \w and \.
     # (identical pattern value); matches dotted abbreviations like "e.g".
     self.pattern = r"((\w\.)+\w)"
     self.abbrev_keys = self.abbrev.keys()
Exemplo n.º 3
0
def makeVector(document, dimensions, date_dimenstions):
    """Build the feature vector of `document` over the query dimensions.

    One entry per term in `dimensions` (the document's indexed weight, or 0
    when the document lacks the term), followed by one binary entry per date
    in `date_dimenstions` (1 when the document is indexed under that date).

    :return: plain list of numbers, in dimension order
    """
    index_table = indexer.index()
    dates_index = loader.loadJsonFile('./storage/dates_index.json')
    if not dates_index:
        dates_index = {}

    vector = []
    for dim in dimensions:
        # Single scan of the posting list (the original scanned it twice:
        # once for membership, once to extract the weight).
        # Each posting is (document, weight).
        weight = next(
            (p[1] for p in index_table[dim] if p[0] == document), 0)
        vector.append(weight)
    for dim in date_dimenstions:
        present = dim in dates_index and document in dates_index[dim]
        vector.append(1 if present else 0)
    return vector
Exemplo n.º 4
0
def test(fresh=False):
    """Run every stored query and record where each relevant document ranked.

    Results are cached in ./storage/test_cases.json; pass `fresh=True` to
    bypass the cache and re-run the matcher.

    :return: dict mapping query -> list of (document, rank) pairs, where
             rank is 1-based position in the results or -1 if unranked
    """
    queries = loadQueries()
    relevances = loadRelevance()

    if not fresh:
        cached_results = loader.loadJsonFile('./storage/test_cases.json')
        if cached_results:
            return cached_results

    test_cases = {}
    for query, relevance in zip(queries, relevances):
        matched = matcher.match(query.lower())
        ranked = [name.replace(".txt", "") for name in matched.keys()]
        # Only score documents that are both relevant and returned.
        relevant_hits = set(relevance) & set(ranked)
        pairs = []
        for doc in relevant_hits:
            rank = ranked.index(doc) + 1 if doc in ranked else -1
            pairs.append((doc, rank))
        test_cases[query] = pairs

    loader.saveJsonFile('./storage/test_cases.json', test_cases)
    return test_cases
Exemplo n.º 5
0
def index(fresh=False, dir='./docs/', selected_files=None):
    """Build (or load from cache) the inverted index over the documents in `dir`.

    Also builds and persists the auxiliary indexes used for spelling
    correction: a soundex index (phonetic), a bigram index (isolated-term
    and context-sensitive correction), and a date index.

    :param fresh: when True, ignore the in-memory and on-disk caches
    :param dir: directory containing the documents to index
    :param selected_files: optional subset of file names to (re)index
                           incrementally on top of the persisted term
                           frequencies
    :return: OrderedDict term -> postings with IDF weights, sorted by term
    """
    global index_table_cached

    if not fresh:
        if index_table_cached:
            return index_table_cached
        index_table_cached = loader.loadJsonFile('./storage/index_table.json')
        if index_table_cached:
            return index_table_cached

    soundex_index = {}
    bigram_index = {}
    index_table = {}
    terms_frequency = {}
    dates_index = {}

    if selected_files:
        # Incremental mode: start from the persisted term frequencies so
        # unselected files keep their postings.
        terms_frequency = loader.loadJsonFile('./storage/terms_frequency.json')
        if not terms_frequency:
            terms_frequency = {}

    files = loader.getFilesInDir(dir)
    for file in files:
        if selected_files and file not in selected_files:
            continue
        print("Indexing File: " + (dir + file))

        for date in getDocDates(dir + file):
            # BUG FIX: the original fell through to `dates_index[date] = [file]`
            # when the file was *already* recorded for the date, wiping every
            # other file indexed under it.
            if date not in dates_index:
                dates_index[date] = [file]
            elif file not in dates_index[date]:
                dates_index[date].append(file)

        for token in getDocTokens(dir + file):
            # token is (term, frequency-in-file).
            term = token[0]
            if term in terms_frequency:
                terms_frequency[term].append((file, token[1]))
                continue

            # First sighting of the term: register it in every auxiliary index.
            terms_frequency[term] = [(file, token[1])]
            soundex_index[term] = getPhoneticHash(term)
            for pair in getBigramForWord(term):
                bigram_index.setdefault(pair, []).append(term)

    index_table = relevance.getIndexTableWithIDF(terms_frequency, len(files))
    # sort the dictionaries so lookups can use binary search
    index_table = OrderedDict(sorted(index_table.items(), key=lambda t: t[0]))
    bigram_index = OrderedDict(
        sorted(bigram_index.items(), key=lambda t: t[0]))
    soundex_index = OrderedDict(
        sorted(soundex_index.items(), key=lambda t: t[0]))

    index_table_cached = index_table
    loader.saveJsonFile('./storage/index_table.json', index_table)
    loader.saveJsonFile('./storage/terms_frequency.json', terms_frequency)
    loader.saveJsonFile('./storage/bigram_index.json', bigram_index)
    loader.saveJsonFile('./storage/soundex_index.json', soundex_index)
    loader.saveJsonFile('./storage/dates_index.json', dates_index)

    return index_table
Exemplo n.º 6
0
def loadIndex(path):
    """Deserialize an index previously persisted as JSON at `path`."""
    data = loader.loadJsonFile(path)
    return data
Exemplo n.º 7
0
def soundexIndex():
    """Load the persisted soundex (phonetic) index from storage."""
    index = loader.loadJsonFile('./storage/soundex_index.json')
    return index
Exemplo n.º 8
0
def bigramIndex():
    """Load the persisted bigram index from storage."""
    index = loader.loadJsonFile('./storage/bigram_index.json')
    return index
Exemplo n.º 9
0
def getCorrectDate(date_tokens):
    """Return the recognized date tokens of a query, unchanged.

    Validation against the persisted dates index used to live here but was
    disabled; its commented-out form also removed items from the list while
    iterating it, which skips elements. The dead JSON load that remained has
    been dropped along with the commented-out code.
    """
    return date_tokens