Example #1
File: run.py Project: wakewalker/eecs767
def index():
    if request.method == 'GET':
        return render_template('index.html')  #, request=request)
    else:
        abstract = '<strong>Lorem ipsum</strong> dolor sit amet, consectetur adipiscing elit. Suspendisse vitae purus sit amet magna iaculis rhoncus. Aenean ullamcorper nibh vitae lacus commodo condimentum. Aenean ornare pharetra est id porttitor. <strong>Lorem ipsum</strong> dolor sit amet, consectetur adipiscing elit. Morbi eu ante sed arcu maximus imperdiet. Phasellus id nisl quis sem consectetur sagittis. Duis placerat nisi ut nisl condimentum ornare. Sed pulvinar arcu nisl, eu faucibus dui tincidunt aliquam. Aliquam malesuada faucibus nisl, et malesuada turpis sagittis nec. Aliquam id pretium augue.'

        start_time = timeit.default_timer()
        query = request.form['query']

        dproc = DocProcessor()
        dproc.prep_query(query)
        
        iidx = InvertedIndex(
            '/home/ubuntu/eecs767/var/wikipedia-3833/term.dct',
            '/home/ubuntu/eecs767/var/wikipedia-3833/doc.lst'
        )

        dlist = DocList(
            '/home/ubuntu/eecs767/var/wikipedia-3833/doc.lst'
        )

        results = []
        if '_enhanced' in request.form:
            rel_docs = iidx.enhanced_query(dproc.tokens)
            #ranked_docs = sorted(rel_docs, key=itemgetter('fscore'), reverse=True)
            #cos_ranked_docs = sorted(rel_docs, key=itemgetter('cos_sim'), reverse=True)
            ranked_docs = sorted(
                rel_docs,
                key=itemgetter('cos_sim', 'term_prox', 'i_win_loc'),
                reverse=True
            )
            for doc in ranked_docs[:10]:
                results.append({
                    'url': dlist[doc['did']]['url'],
                    'title': dlist[doc['did']]['title'],
                    'abstract': abstract,
                    'cos_sim': doc['cos_sim'],
                    'term_prox': doc['term_prox'],
                    'win_loc': int(round(1/doc['i_win_loc'])),
                    'fscore': doc['fscore']
                })
        else:
            rel_docs = iidx.query(dproc.tokens)
            ranked_docs = sorted(rel_docs.items(), key=itemgetter(1), reverse=True)

            for doc in ranked_docs[:10]:
                results.append({
                    'url': dlist[doc[0]]['url'],
                    'title': dlist[doc[0]]['title'],
                    'score': doc[1],
                    'abstract': abstract
                })

        elapsed_time = timeit.default_timer() - start_time

        return render_template('index.html', 
            #request=request,
            #tokens=dproc.tokens,
            #docs=ranked_docs[:10],
            query=query,
            results=results,
            elapsed_time=round(elapsed_time, 3),
            total_docs=len(dlist)
        )
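
The three-key sort above relies on operator.itemgetter returning a tuple, so ties on cos_sim fall through to term_prox and then i_win_loc. A minimal standalone sketch of that behavior (the sample values are made up):

from operator import itemgetter

docs = [
    {'cos_sim': 0.9, 'term_prox': 0.1, 'i_win_loc': 0.5},
    {'cos_sim': 0.9, 'term_prox': 0.4, 'i_win_loc': 0.2},
]
# Compared as tuples: (0.9, 0.4, 0.2) sorts ahead of (0.9, 0.1, 0.5) when descending.
print(sorted(docs, key=itemgetter('cos_sim', 'term_prox', 'i_win_loc'), reverse=True))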
Example #2
def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data to the
    inverted index.
    '''
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    TERM_DICT_FILE = config.get('indexer', 'term_dict_file')
    DOC_LIST_FILE = config.get('indexer', 'doc_list_file')
    
    dl = DocList(DOC_LIST_FILE)
    if len(dl) < MAX_DOCS:
        did = md5(url).hexdigest()

        if did not in dl:
            dl.append(url, title)

            iidx = InvertedIndex(
                TERM_DICT_FILE,
                DOC_LIST_FILE
            )
            iidx.append(plist, did)
            iidx.update()
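
Since the docstring describes write() as an RQ worker, here is a minimal sketch of how it might be enqueued; the queue setup and Redis connection details are assumptions, not from the original project:

from redis import Redis
from rq import Queue

q = Queue(connection=Redis())
# plist, url, title and config must be prepared by the caller (e.g. the crawler).
q.enqueue(write, plist, url, title, config)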
Example #3
def main():
    alphabets = [chr(x) for x in range(ord('a'), ord('z') + 1)]
    alphabets.append("num")
    N = 55393  # total number of documents in the collection

    for i in alphabets:
        f = open(i + ".txt", 'rb')
        current_indexer = InvertedIndex()
        current_indexer.merge(pickle.load(f))
        tf_idf_score = dict()
        f.close()
        # Calculating tf-idf score for each separate index
        for token, dictionary in current_indexer.getDict().items():
            df = len(dictionary)
            current_dict = dict()
            for docID, tf in dictionary.items():
                current_dict[docID] = round(tf * math.log(N / df, 10), 4)
            current_indexer.getDict()[token] = current_dict
        # Store the results into tf_score_(insert here).txt
        f = open("tf_score_" + i + ".txt", 'wb')
        pickle.dump(current_indexer.getDict(), f)
        f.close()
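
A worked instance of the weighting line above, with an illustrative df and tf (the values below are made up; only N = 55393 comes from the code):

import math

N, df, tf = 55393, 10, 3
w = round(tf * math.log(N / df, 10), 4)
print(w)  # ~11.23: rare terms (small df) get large idf multipliers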
Example #5
def main():
    final_result = InvertedIndex()

    # Load all indexes into one InvertedIndex() (class object for index)
    for i in range(88):
        f = open("indexer" + str(i) + ".txt", 'rb')
        final_result.merge(pickle.load(f))
        f.close()

    # length of alphabet + numbers bucket = 26 + 1 = 27
    alphanumIndex = [dict() for i in range(27)]

    for i in final_result.getDict():
        if 97 <= ord(i[0]) <= 122:  # the key starts with an English letter
            # the first index selects which partial dictionary the key belongs to;
            # the second maps the key i to its value from the merged index
            alphanumIndex[ord(i[0]) - 97][i] = final_result.getDict()[i]
        else:
            # the key does not start with an English letter -> numbers bucket
            alphanumIndex[26][i] = final_result.getDict()[i]

    ## create a partial file for each letter and dump its dict into that file
    ASCII_code = 97  # ASCII code of 'a'
    for i in range(len(alphanumIndex)):  # == 27
        if i == len(alphanumIndex) - 1:  # the last index is reserved for numbers
            filename = "num"
        else:
            filename = chr(ASCII_code + i)

        fileObject = open(f"{filename}.txt", "wb")
        pickle.dump(alphanumIndex[i], fileObject)
        fileObject.close()
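
A quick standalone check of the routing rule above (the sample terms are made up): keys are bucketed by their first character, with anything non-alphabetic going to the "num" bucket.

for term in ('apple', 'zebra', '42nd'):
    first = ord(term[0])
    bucket = first - 97 if 97 <= first <= 122 else 26
    print(term, '->', 'num' if bucket == 26 else chr(97 + bucket))
# apple -> a, zebra -> z, 42nd -> num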
Example #6
from operator import itemgetter

from indexer import InvertedIndex, DocList
from tokenizer import DocProcessor

# sample queries; only the last assignment takes effect
query = 'mary had a little lamb whos fleece was white as snow'
query = 'tom colwell'
query = 'information retrieval'
query = 'asian women alumni'

dproc = DocProcessor()
dproc.prep_query(query)
iidx = InvertedIndex(
    #'/home/ubuntu/eecs767/var/ku/term.dct',
    #'/home/ubuntu/eecs767/var/ku/doc.lst',
    '/home/ubuntu/eecs767/var/wikipedia/term.dct',
    '/home/ubuntu/eecs767/var/wikipedia/doc.lst',
)
rel_docs = iidx.query(dproc.tokens)
#print(rel_docs.items())

ranked_docs = sorted(rel_docs.items(), key=itemgetter(1), reverse=True)
#print(ranked_docs[:10])

dlist = DocList(
    '/home/ubuntu/eecs767/var/wikipedia/doc.lst'
)
for doc in ranked_docs[:10]:
    print('%s: %s' % (doc[1], dlist[doc[0]]))
Example #8
from indexer import InvertedIndex
import csv

with open('/home/ubuntu/eecs767/var/sample.csv') as f:
    term_list = []
    for row in csv.DictReader(f, skipinitialspace=True):
        term_list.append(row)

tdict = InvertedIndex()
tdict.build(term_list)
tdict.write()

for term in sorted(tdict):
    tnode = tdict[term]['tnode']
    pl_str = ''
    for p in tnode.plist:
        pl_str += ' -> %s x %s (%.3f)' % (p['did'], p['tf'], p['w'])
    print('(%s) %s (tf:%s; df:%s; idf:%.3f):%s' % (
        tdict[term]['loc'],
        tnode.term,
        tnode.tf,
        tnode.df,
        tnode.idf,
        pl_str
    ))
Example #9
            unprocessed_location, most_similar_documents, document_titles,
            document_snapshots
        ]

    except Exception:
        raise  # re-raise; `raise ()` would raise a TypeError instead


if __name__ == "__main__":
    try:
        doc_basename = "docsnew"  # the actual name of the folder containing the processed files
        doc_location = "../file_cache/processed/" + doc_basename

        dp = DPClass()
        #dp.runDocProc("../file_cache/unprocessed/" + doc_basename)
        iic = InvertedIndexClass()
        #iic.createInvertedIndex("../file_cache/processed/docsnew")
        #iic.createInvertedIndex("../file_cache/processed/testdoc")
        iic.loadInvertedIndex(doc_location)
        vsm = VSMClass(iic, doc_basename)
        vsm.createEntireModel()
        stemmer = PorterStemmer()

        continueLoop = True

        print("Welcome to the Search Engine\n")
        while continueLoop:
            fromUser = ""
            user_query = ""
            print(
                "\n\nSelect from the Following Options:\n\t1.) Search\n\t2.) Exit"
Example #10
from tokenizer import DocProcessor
from indexer import InvertedIndex

docs = {
    1: '/home/ubuntu/eecs767/var/docs/doc1.html',
    2: '/home/ubuntu/eecs767/var/docs/doc2.html',
    3: '/home/ubuntu/eecs767/var/docs/doc3.html',
    4: '/home/ubuntu/eecs767/var/docs/doc4.html',
    5: '/home/ubuntu/eecs767/var/docs/doc5.html',
}

dproc = DocProcessor()
iidx = InvertedIndex()

for did, doc in docs.items():
    print('-- Processing Doc #%s: %s' % (did, doc))
    dproc.parse(doc)
    plist = dproc.gen_posting_list()

    iidx.append(plist, did)
    iidx.update()
    iidx.clear()
Example #11
from operator import itemgetter

from indexer import InvertedIndex, DocList
from tokenizer import DocProcessor

# sample queries; only the last assignment takes effect
query = 'mary had a little lamb whos fleece was white as snow'
query = 'tom colwell'
query = 'asian women alumni'
query = 'information retrieval'

dproc = DocProcessor()
dproc.prep_query(query)
iidx = InvertedIndex(
    #'/home/ubuntu/eecs767/var/ku/term.dct',
    #'/home/ubuntu/eecs767/var/ku/doc.lst',
    '/home/ubuntu/eecs767/var/wikipedia/term.dct',
    '/home/ubuntu/eecs767/var/wikipedia/doc.lst',
)
rel_docs = iidx.enhanced_query(dproc.tokens)

#for doc in rel_docs:
#    print doc

#print '-----------------'

cos_ranked_docs = sorted(rel_docs, key=itemgetter('cos_sim'), reverse=True)

#for doc in cos_ranked_docs[:10]:
    #print doc

#print '-----------------'
Example #12
        return [unprocessed_location, most_similar_documents, document_titles, document_snapshots]

    except Exception:
        raise  # re-raise; `raise()` would raise a TypeError instead




if __name__ == "__main__":
    output = {}
    try:
        doc_basename = "newly_crawled" # the actual name of the folder containing the processed files
        doc_location = "../file_cache/processed/" + doc_basename

        dp = DPClass()
        iic = InvertedIndexClass()
        iic.loadInvertedIndex("../file_cache/processed/" + doc_basename)

        stemmer = PorterStemmer()

        english_file = open("./nltk-3.3/nltk_data/corpora/stopwords/english", "r", encoding="UTF8")
        english_words = english_file.read().strip().split()
        english_file.close()

        if len(sys.argv) < 2:
            output = {"ERROR MESSAGE": "You Need to Give a Search Term"}

        else:
            arguments = ""
            query = []
            # argument 0 is the file name
Example #13
from indexer import InvertedIndex

tdict = InvertedIndex()
tdict.init_index()
for term in sorted(tdict):
    print('(%s) %s' % (tdict[term]['loc'], term))

Example #14
def main():
    # define command line parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--documents_folder', action='store')
    parser.add_argument('-o', '--index_output_file', action='store')
    parser.add_argument('-c', '--docid_map_output_file', action='store')
    parser.add_argument('-i', '--index_file', action='store')
    parser.add_argument('-b', '--docid_map_file', action='store')
    parser.add_argument('-q', '--evaluate_queries', action='store_true')
    parser.add_argument('-a', '--queries_result_file', action='store')
    parser.add_argument('-p', '--part_three_file', action='store')

    # parse command line parameters
    args = parser.parse_args()
    print('received the following arguments')
    for k, v in vars(args).items():
        print(k, v)

    inverted_index = None

    # build index if requested
    if args.documents_folder is not None:
        inverted_index = build_inverted_index(args.documents_folder)

        # export inverted index to a file if requested
        if args.index_output_file is not None and args.docid_map_output_file is not None:
            inverted_index.index_to_file(args.index_output_file)
            inverted_index.docidmap_to_file(args.docid_map_output_file)

    # load inverted index from file
    if args.index_file is not None and args.docid_map_file is not None:
        inverted_index = InvertedIndex(args.index_file, args.docid_map_file)

    # write queries results to a file
    if args.evaluate_queries and inverted_index and args.queries_result_file:
        queries_results = BooleanRetrieval(inverted_index)
        with open(args.queries_result_file, 'w') as file:
            file.write('\n'.join(queries_results))

    # write hw part 3 answers to a file
    if args.part_three_file and inverted_index:
        answer = ''
        top = 10
        bottom = 10

        # Write the top 10 terms with the highest document frequency
        top_df = inverted_index.get_top_df_ids(top)
        answer += '--------------------------------------\n'
        answer += 'Top {} df terms:\n'.format(top)
        answer += '\n'.join(['{}: {}'.format(term, df) for term, df in top_df])

        # Write the top 10 terms with the lowest document frequency
        bottom_df = inverted_index.get_bottom_df_ids(bottom)
        answer += '\n--------------------------------------\n'
        answer += 'Bottom {} df terms:\n'.format(bottom)
        answer += '\n'.join(['{}: {}'.format(term, df) for term, df in bottom_df])

        # Explain the different characteristics of the above two sets of terms
        answer += '\n--------------------------------------\n'
        answer += 'The different characteristics of the above two sets of terms:\n'
        answer += '   Top df terms are the most common terms in the collection, located in a large number of documents\n'
        answer += '   Bottom df terms are the rarest terms in the collection, located in only a few documents\n'
        answer += '   * our inverted index does not keep track of term frequency inside a document, ' \
                  'so the df of a term signifies the number of documents in which the term appears' \
                  ' at least once\n'

        with open(args.part_three_file, 'w') as file:
            file.write(answer)
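
Hypothetical invocations of the script above (the file name main.py is an assumption; the flag names come from the argparse definitions):

# Build an index from a folder of documents and export it:
#   python main.py -d ./documents -o index.txt -c docid_map.txt
# Load a saved index, then write query results and the part-three answers:
#   python main.py -i index.txt -b docid_map.txt -q -a queries_results.txt -p part_three.txt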
Example #15
from indexer import InvertedIndex

iidx = InvertedIndex()

iidx.calc_scores()
iidx.write()