Example No. 1
    def findSimilar(self,link,limit):

        # we call the Crawler's getText function to read the new link
        # the constructor is called with empty placeholder arguments
        crawler = Crawler.Crawler('', 0, 0, 0)


        self.limit = limit
        file = open("Data/page%d.txt" % self.limit, 'w')

        try:
            self.title, text = crawler.getText(link)
            # join the list of strings into a single string
            text = ''.join(text)
            file.write(text)
            file.close()
        except Exception:
            print("Link is not accessible")
            file.close()
            sys.exit(0)

        indexer = Indexer.Indexer()
        indexer.start()

        cosineSimilarity = indexer.getCosineSimilarity()



        linksId = [i for i in range(self.limit)]

        # sort the link ids by their cosine similarity, highest first
        linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]

        return cosineSimilarity, linksIdSorted
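A note on the ranking step above: zip pairs each cosine-similarity score with its link id, sorted(..., reverse=True) orders those pairs by score, and the comprehension keeps only the ids, most similar first. A minimal standalone sketch of that idiom with made-up scores (the values below are illustrative only):

# hypothetical similarity scores for three links with ids 0, 1, 2
cosineSimilarity = [0.12, 0.87, 0.45]
linksId = list(range(len(cosineSimilarity)))

# sort the (score, id) pairs by score, highest first, and keep only the ids
linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]
print(linksIdSorted)  # [1, 2, 0]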
Example No. 2
def run_index():
    # run an entire index build
    global docs_path
    global postings_path
    global is_stemming
    global indexer
    global dict_cache_path
    try:
        # check validation conditions
        if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
            return
        result = tkMessageBox.askquestion("Run Index",
                                          "Are you sure?\nDon't worry if the GUI"
                                          " looks stuck or not responding - it is working", icon='warning')
        if result != 'yes':
            return
        print ('START TIME - ' + time.strftime("%H:%M:%S"))
        start_time = datetime.now()
        # reset the current memory of the project
        if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
            globs.main_dictionary.clear()
        if (globs.cache is not None) and (bool(globs.cache)):
            globs.cache.clear()
        if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
            globs.documents_dict.clear()
        # start indexing
        globs.stop_words = load_stop_words(docs_path.get())
        indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
        read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()),
                                      indexer, globs.constants, globs.stop_words, is_stemming.get())
        read_file.index_folder()
        globs.num_of_documents = len(read_file.documents_dict)

        globs.documents_dict = read_file.documents_dict
        del read_file
        indexer.unite_temp_postings()
        globs.main_dictionary = indexer.main_dict
        indexer.build_document_weight(globs.documents_dict)
        # to print indexing stats, uncomment this block
        # with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''),'w') as my_stats_file:
        #    my_stats_file.write('term,tf,df\n'.format())
        #    for key,val in main_dictionary.iteritems():
        #        my_stats_file.write('{},{},{}\n'.format(key,val.tf,val.df))
        globs.cache = indexer.cache_dict
        globs.average_doc_size = globs.average_doc_size/globs.num_of_documents
        dict_cache_path = postings_path
        print ('END TIME - ' + time.strftime("%H:%M:%S"))
        end_time = datetime.now()
        print_stats_at_end_of_indexing(end_time - start_time)
    except Exception as err:
        tkMessageBox.showinfo('ERROR', err)
        traceback.print_exc(file=stdout)
Example No. 3
import Tokenizer
import Indexer

# (modifiedTokens, DocumentId)
pairs = []
docFiles = ["Almeida Garrett - Viagens na Minha Terra.txt", "Eça de Queirós - A Cidade e as Serras.txt"]

for docId in range(len(docFiles)):
    tokenizer = Tokenizer.Tokenizer(docFiles[docId])
    tokenizer.createTokens()

    pairs += [(token, docId) for token in tokenizer.getTokens()]

indexer = Indexer.Indexer(pairs, len(docFiles))
indexer.indexTerms(None)  # no minimum term length
print("Most frequent terms:")
for docId in range(len(docFiles)):
    print(docFiles[docId] + "\n\t" + str(indexer.getFreqTerms()[docId]))
print("\nMost frequent terms with 4 or more letters:")
indexer.indexTerms(4)
for docId in range(len(docFiles)):
    print(docFiles[docId] + "\n\t" + str(indexer.getFreqTerms()[docId]))
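The Indexer here is fed (token, docId) pairs and later reports the most frequent terms per document; its implementation is not shown in this listing. As a rough sketch of how such pairs can be reduced to per-document term frequencies (an assumed illustration, not the project's actual Indexer):

from collections import defaultdict

def term_frequencies(pairs, num_docs):
    # freqs[doc_id][term] -> number of occurrences of term in document doc_id
    freqs = [defaultdict(int) for _ in range(num_docs)]
    for token, doc_id in pairs:
        freqs[doc_id][token] += 1
    return freqs

# toy usage with hypothetical pairs
sample_pairs = [("terra", 0), ("terra", 0), ("serras", 1)]
freqs = term_frequencies(sample_pairs, 2)
print(max(freqs[0], key=freqs[0].get))  # 'terra'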
Example No. 4
from Indexer import *
from QueryProcessor import QueryProcessor
import numpy as np
import math
import json
import time
from nltk.stem import PorterStemmer
from string import ascii_lowercase

if __name__ == "__main__":

    index = Indexer()
    #index.start_index()

    query = input("Enter query: ")
    start_time = time.time()  # record the time at which the search starts
    qp = QueryProcessor()
    urlid = qp.search(query.lower())
    temp = []
    if not urlid:
        print('no URL found for the given query')
    else:
        with open('doc_id.json', 'r') as url_id:
            url_dict = json.load(url_id, strict=False)
        index = 1
        for i in urlid:
            try:
                if index > 20:
                    break
                result_str = "#%3d: %s" % (index, url_dict[str(i)])
                print(result_str)
                index += 1
            except KeyError:
                # skip ids that are missing from the url dictionary
                pass
    total_time = time.time() - start_time  # the total time used to complete the search
    print("The search took %f seconds" % total_time)
Example No. 5
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    start = []
    end = []
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                     "rankType="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    if len(opts) != 4:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg

    # Indexer
    (Indexer(collectionFile, tokenizerType)).writeIndexToFile('index')

    f = open(queriesFile, 'r')
    queries = f.read().splitlines()
    f.close()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    for query in queries:

        # Query Operations
        tokenizer.changeText(query)
        queryTerms = tokenizer.getTerms()

        
        # Searcher
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index')

        # Ranker
        ranker = Ranker(documentsInfo, avgDocLen)
        
        # Start time (latency purpose)
        start.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            scores += [ranker.bm25(1.2, 0.75)]

        # End time (latency purpose)
        end.append(timer())

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start, end)
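The ranker is invoked as ranker.bm25(1.2, 0.75), i.e. with the usual k1 = 1.2 and b = 0.75 parameters. The Ranker class itself is not shown in this listing; as a sketch, the standard Okapi BM25 contribution that such a method typically computes for one term/document pair looks like this (an illustration of the formula, not this project's code):

import math

def bm25_term_score(tf, df, doc_len, avg_doc_len, num_docs, k1=1.2, b=0.75):
    # tf: term frequency in the document, df: number of documents containing the term
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return idf * norm

# toy usage
print(bm25_term_score(tf=3, df=10, doc_len=120, avg_doc_len=100.0, num_docs=1000))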
Example No. 6
def main(argv):

    # ----------------------------------------- HANDLING PROGRAM INPUT -------------------------------------------------
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    storePos = ''
    proximity = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:p:b:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                     "rankType=", "storePositions=", "proximityBoost="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    if len(opts) != 6:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
                  '-b <proximityBoost: 0 - No, 1 - Yes>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg
        elif opt in ("-p", "--storePositions"):
            if arg != '0' and arg != '1':
                print('\nIncorrect store positions choice. No: 0, Yes: 1.')
                sys.exit()
            storePos = arg
        elif opt in ("-b", "--proximityBoost"):
            if arg != '0' and arg != '1':
                print('\nIncorrect proximity boost choice. No: 0, Yes: 1.')
                sys.exit()
            proximity = arg

    # ----------------------------------------------- INDEXER ----------------------------------------------------------
    indexer = Indexer(collectionFile, tokenizerType, storePos == '1')

    start = timeit.default_timer()
    indexer.index()
    stop = timeit.default_timer()

    print('Indexing total time - {} tokenizer: {} min and {} seconds'.format("simple" if tokenizerType == "0" else "better", (stop - start)//60, (stop - start) % 60))

    f = open(queriesFile, 'r')
    queries = f.read().splitlines()
    f.close()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    start_queries = []
    end_queries = []
    time_searcher = 0
    time_ranker = 0
    for query in queries:

        # --------------------------------------- QUERY OPERATIONS -----------------------------------------------------
        tokenizer.changeText(query)

        #queryTerms, queryTermsPositions = tokenizer.getTerms(withPositions=True if storePos == '1' else False)
        queryTerms = tokenizer.getTerms(withPositions=False)

        # ------------------------------------------- SEARCHER ---------------------------------------------------------
        start = timeit.default_timer()
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index', storePos == '1')
        stop = timeit.default_timer()
        time_searcher = time_searcher + stop - start

        # -------------------------------------------- RANKER ----------------------------------------------------------
        start = timeit.default_timer()
        ranker = Ranker(documentsInfo, avgDocLen)
        
        # Start time (latency purpose)
        start_queries.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.lnc_ltc(), queryTerms)]
            else:
                scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.bm25(1.2, 0.75), queryTerms)]
            else:
                scores += [ranker.bm25(1.2, 0.75)]

        stop = timeit.default_timer()
        time_ranker = time_ranker + stop - start

        # End time (latency purpose)
        end_queries.append(timer())


    print('Searching time for all queries: {} min and {} seconds'.format(time_searcher // 60, time_searcher % 60))
    print('Ranking time for all queries: {} min and {} seconds'.format(time_ranker // 60, time_ranker % 60))

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start_queries, end_queries)
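The proximity_boost call above combines the base scores with the query terms, but its internals are not shown. A common way to implement such a boost is to reward documents in which different query terms occur close together, for example by scaling each score by a bonus inversely proportional to the smallest gap between occurrences of distinct query terms. A rough sketch of that idea (an assumption about the general technique, not this project's Ranker):

def proximity_boost(scores, term_positions, alpha=0.5):
    # scores: {doc_id: base_score}
    # term_positions: {doc_id: {term: [token positions]}}  (illustrative shapes)
    boosted = {}
    for doc_id, base in scores.items():
        tagged = sorted((pos, term)
                        for term, plist in term_positions.get(doc_id, {}).items()
                        for pos in plist)
        # gaps between adjacent occurrences of *different* query terms
        gaps = [b[0] - a[0] for a, b in zip(tagged, tagged[1:]) if a[1] != b[1]]
        boosted[doc_id] = base * (1 + alpha / max(min(gaps), 1)) if gaps else base
    return boosted

# toy usage: the terms are adjacent in doc 1 and far apart in doc 2
positions = {1: {"web": [10], "search": [11]}, 2: {"web": [3], "search": [90]}}
print(proximity_boost({1: 1.0, 2: 1.0}, positions))  # doc 1 gets the larger boost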
Example No. 7
    def get_context_data(self, **kwargs):

        context = super(SearchView, self).get_context_data(**kwargs)

        #{'view': <googlesearch.views.SearchView object at 0x1036cd0d0>}

        results = []
        try:
            results = []

            index = Indexer()
            query = self.request.GET.get('q', '')
            start_time = time.time()  # record the time at which the search starts
            qp = QueryProcessor()
            urlid = qp.search(query.lower())
            temp = []
            if not urlid:
                print('no URL found for the given query')
            else:                
                with open('doc_id.json', 'r') as url_id:
                    url_dict = json.load(url_id, strict=False)
                index = 1
                for i in urlid:
                    try:
                        if index > 20:
                            break
                        result_str = "#%3d: %s" %(index,url_dict[str(i)])
                        results.append( (result_str, url_dict[str(i)]))
                        index += 1
                    except:
                        pass

            total_time = time.time() - start_time #The total time used to complete the search
            #time_str = "The search took time %f seconds" % (total_time)
            #print(time_str)
            #results = SearchResults(results)
            pages = self.calculate_pages()
            
        except Exception:
            print("Error occurred")
            total_time = 0
            page = 1
            pages = [0, 1, 2]


        # Defaults
        context.update({
            'items': [],
            'total_results': 0,
            'current_page': 0,
            'prev_page': 0,
            'next_page': 0,
            'search_terms': self.request.GET.get('q', ''),
            'error': results,
            'total_time': 0,
        })

        context.update({
            'items': results,
            'total_results': 20, 
            'current_page': pages[1],
            'prev_page': pages[0],
            'next_page': pages[2],
            'search_terms': self.request.GET.get('q', ''),
            'total_time': total_time,

        })

        return context
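calculate_pages is not shown in this snippet; from the way its result is consumed (pages[0], pages[1] and pages[2] become prev_page, current_page and next_page, with [0, 1, 2] as the fallback), it presumably returns a [prev, current, next] triple. A guess at such a helper, with an assumed 'page' query parameter (not the project's actual code):

def calculate_pages(self):
    # derive prev/current/next page numbers from the request; the parameter name is an assumption
    try:
        current = int(self.request.GET.get('page', 1))
    except (TypeError, ValueError):
        current = 1
    return [max(current - 1, 0), current, current + 1]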
Example No. 8
    def __init__(self, directory, load_file=False, stop_word_path=None):
        self.indexer = Indexer.Indexer(directory, load_file, stop_word_path)
        self.inverted_index = self.indexer.inverted_idx
        self.stop_list = self.indexer.stop_words
    def corpusReader():

        # choose the file to read and open it
        #filename_input = "OneDrive_1_9-26-2019/2004_TREC_ASCII_MEDLINE_1"
        filename_input = Interact.openFile()
        fi = open(filename_input, 'r', encoding="latin-1")

        #filename_input = "OneDrive_1_9-26-2019/2004_TREC_ASCII_MEDLINE_2"
        #fi2 = open(filename_input, 'r', encoding="latin-1")
        
        #open file to write the results; not needed
        filename_output = "output.txt"
        try:
            fo = open(filename_output, 'w')
        except IOError:
            print("Could not open the output file!")

        
        # read the file and send each document separately to the identifier reader
        doc = ""
        var = False

        idx = Indexer()
        token = Tokenizer()
        

        start = time.time()
        for line in fi:
            if(line.strip() == ""):
                # a blank line marks the end of a document
                # call IdentifierReader on the accumulated lines to find the identifiers (PMID and TI)
                docdict = IdentifierReader.identReader(doc)
                #basic Tokenizer
                tokenizer_dict = token.tokenizer(docdict)
                #improved Tokenizer with Porter stemmer
                tok_dict = ImprovedTokenizer.improvedTokenizer(tokenizer_dict)
                indexed_dict = idx.indexer(tok_dict)
                var = False
                doc = ""
                continue
            if(line[4] == '-'):
                key = line.split("-", 1)
                # beginning of a document
                if(key[0] == "PMID"):
                    var = True
            if(var):
                doc += line

        #read 2nd file
        # for line in fi2:
        #     if(line.strip() == ""):
        #         #here ends a document
        #         #call IdentifierReader on read lines and find Identitifiers (PMID and TI)
        #         docdict = IdentifierReader.identReader(doc)
        #         #basic Tokenizer
        #         tokenizer_dict = token.tokenizer(docdict)
        #         #improved Tokenizer with Porter stemmer
        #         tok_dict = ImprovedTokenizer.improvedTokenizer(tokenizer_dict)
        #         #indexing
        #         indexed_dict = idx.indexer(tok_dict)
        #         var = False
        #         doc = ""
        #         continue
        #     if(line[4] == '-'):
        #         key = line.split("-", 1)
        #         if(key[0] == "PMID"):
        #             var = True
        #             #here starts a document
        #     if(var):
        #         doc += line
        

        #write results to output.txt file
        print("Writing in file\n")
        indexed_dict = idx.sort_indexer(indexed_dict)
        for i in indexed_dict:
            tmp = ""
            fo.write(i)
            for j in indexed_dict[i]:
                tmp = tmp + "," + j + ":" + str(indexed_dict[i][j])
            fo.write(tmp + "\n")
        

        end = time.time()
       

        # to answer question 4
        # first ten terms (in alphabetical order) that appear in only one document
        doc_freq_1 = []
        high_doc_freq = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}  # placeholder entries, replaced by real terms below
        current_min = 0
        count1 = 0
        count2 = 0
        for i in indexed_dict:
            if((len(indexed_dict[i]) == 1) and count1 <= 9):
                doc_freq_1.append(i)
                count1 += 1
            if(len(indexed_dict[i]) > current_min):
                term_to_replace = [k for k, h in high_doc_freq.items() if h == current_min]
                high_doc_freq.pop(term_to_replace[0])
                high_doc_freq[i] = len(indexed_dict[i])
                current_min = min(list(high_doc_freq.values()))
      
        
        print("RESULTS")
        print("Time to run: ", end - start)
        print("Vocabulary size: ", len(indexed_dict))
        print("Doc frequency 1: ", doc_freq_1)
        print("Highest doc frequency: ", high_doc_freq)
Example No. 10
            count += 1

def answer_phrase_queries(phrase_string):
    query = Query.Query()
    results = query.phrase_query(phrase_string)
    count = 1
    for score, file_name in results:
        print("Choice number: ", count, " --> File: ", file_name, "Score = ", score)
        count += 1

def answer_text_queries(query_string):
    query = Query.Query()
    results = query.text_query(query_string)
    count = 1
    for score, file_name in results:
        print("Choice number: ", count, " --> File: ", file_name, "Score = ", score)
        count += 1

if __name__ == "__main__":
    print("Aloha!")
    path_to_text_corpus = "/home/nikhil/Desktop/Text-Search-Engine/text_corpus"
    indexer = Indexer.Indexer()
    print("Indexer object created!")
    #indexer.build_index(path_to_text_corpus)
    print("Index building success!!!")
    #listen_for_queries()
    root = Tk()
    root.geometry("400x300")
    app = gui.Window(root)
    root.mainloop()
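listen_for_queries() is referenced (commented out) but not defined in this snippet. A plausible sketch of such a loop built on the answer_phrase_queries and answer_text_queries helpers defined above (an assumption, not the original function):

def listen_for_queries():
    # quoted input is treated as a phrase query, anything else as a free-text query
    while True:
        q = input("Enter a query (or 'quit' to exit): ")
        if q.strip().lower() == "quit":
            break
        if q.startswith('"') and q.endswith('"'):
            answer_phrase_queries(q.strip('"'))
        else:
            answer_text_queries(q)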
Example No. 11
    parser = argparse.ArgumentParser(description='This is project4 driver.')

    parser.add_argument('original', help='Original text file name.')
    parser.add_argument('preprocessed',
                        help='preprocessed file name for indexing.')
    parser.add_argument('--map',
                        dest='mapType',
                        help='Map type used for the multimap.')
    parser.add_argument('--index',
                        dest='indexFile',
                        help='File for the indexed output')

    args = parser.parse_args()

    myIndexer = Indexer(args)
    myIndexer.index()

    run = True
    while (run):
        toSearch = input("Enter a word to search for: ")
        toSearch = toSearch.strip('\n')
        myIndexer.search(toSearch)

        quit = input("Quit? (y/n): ")

        if (quit == 'y'):
            run = False
            if (args.indexFile):
                myIndexer.writeIndex()
            else: