import functools
import math
import operator
import os
import pickle
import sqlite3
import string
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from pyparsing import Word, alphas

# Assumed bindings: a Porter stemmer and NLTK's English stopword list
# (neither is defined in this file as given).
stem = PorterStemmer().stem
stop = set(stopwords.words('english'))


def map():
    # Mapper step: emit one "stemmed_first second 1" triple per line, for
    # every bigram of filtered words in every sentence of every paper.
    with sqlite3.connect(os.getcwd() + '/../../data/database.sqlite') as database:
        cursor = database.cursor()
        cursor.execute('SELECT paper_text FROM papers')
        for doc in cursor.fetchall():
            text = doc[0]
            lines = sent_tokenize(text)
            for line in lines:
                words = word_tokenize(line.lower())
                # Drop stopwords, tokens containing digits or punctuation,
                # and single-character tokens.
                words = [i for i in words
                         if i not in stop
                         and not any((c.isdigit() or c in string.punctuation) for c in i)
                         and len(i) != 1]
                for first, second in zip(words, words[1:]):
                    sFirst = stem(first)
                    print(sFirst, second, 1)
    return True
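# Hypothetical companion reducer for map() above. This is an assumption, not
# part of the original module: map() emits MapReduce-style "first second 1"
# triples, so a reducer would sum the counts per bigram.
def reduce_counts(lines):
    from collections import Counter
    counts = Counter()
    for line in lines:
        first, second, count = line.split()
        counts[(first, second)] += int(count)
    return counts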
def stemWord(matchobj):
    # If necessary, ignore keywords during stemming; the Porter stemmer
    # doesn't affect them anyway.
    return stem(matchobj.group(0))
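# Usage sketch (an assumption about how stemWord is applied elsewhere):
# stemWord is shaped as a re.sub callback, so it can stem every alphabetic
# token in a string in one pass. The pattern below is illustrative.
def _demo_stemWord():
    import re
    text = "searching indexed documents efficiently"
    # Each alphabetic token is replaced by its Porter stem.
    return re.sub(r'[A-Za-z]+', stemWord, text)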
def Search(query, type="BM25"):
    """
    Search the corpus using VSM or BM25.

    :param query: query to search in the corpus
    :param type: "VSM" or "BM25"
    :return: list of doc ids ranked by their similarity to the query
    """
    # Tokenize the query and compute its term frequencies.
    q_grammar = Word(alphas)
    q_tf = {}
    q_length = 0
    for token, start, end in q_grammar.scanString(query):
        token = stem(str(token[0]).lower())
        if token in q_tf:
            q_tf[token] += 1
        else:
            q_tf[token] = 1
        q_length += 1

    # Fetch postings, idf values, and document lengths for the query terms.
    # Alternatively, build the full index in memory:
    #   collection, idf, docs = Index()
    q_terms = list(q_tf.keys())
    vocabulary = GetDbPostings(q_terms)
    d_idf = GetDbIdf(q_terms)
    d_length = GetDbDocs(q_terms)

    d_score_list = {}
    d_avg = 0
    for doc in d_length:
        d_avg += d_length[doc]
    d_count = len(d_length)
    d_avg = d_avg / d_count

    # Combine the collection's tf-idf with the query's tf-idf.
    for term in q_terms:
        if term in d_idf:
            q_score = q_tf[term] * d_idf[term]
            for doc in vocabulary[term]:
                posting = vocabulary[term][doc]
                d_tf = len(posting)
                if type == "BM25":
                    k = 2.0
                    b = 0.75
                    # BM25 term weight: term frequency saturated by k,
                    # normalized by document length relative to the average
                    # document length.
                    d_score = (((k + 1) * d_tf) /
                               ((k * (1 - b + b * (d_length[doc] / d_avg))) + d_tf)) \
                              * d_idf[term] * q_score
                if type == "VSM":
                    d_score = d_tf * d_idf[term] * q_score
                if doc in d_score_list:
                    d_score_list[doc] += d_score
                else:
                    d_score_list[doc] = d_score

    # Normalize the document weights by document length for VSM.
    if type == "VSM":
        for d in d_score_list:
            d_score_list[d] = d_score_list[d] / d_length[d]

    # Sort the results in descending order of score.
    sorted_scores = sorted(d_score_list.items(), key=operator.itemgetter(1), reverse=True)
    result = []
    for doc, weight in sorted_scores:
        result.append(str(doc))
    return result
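# Usage sketch: rank documents for a free-text query (the query string here
# is illustrative). BM25 is the default model; pass type="VSM" for the
# vector space model.
def _demo_Search():
    bm25_ids = Search("neural network optimization")
    vsm_ids = Search("neural network optimization", type="VSM")
    return bm25_ids[:10], vsm_ids[:10]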
def Index(file_dataset="../../data/papers.csv", file_dump="../../data/derived/",
          id_col=0, text_col=-1, title_col=2):
    try:
        # Reuse a previously built index if one has been dumped to disk.
        with open(file_dump + "index.lol", "rb") as f:
            collection = pickle.load(f)
        with open(file_dump + "idf.lol", "rb") as f:
            idf = pickle.load(f)
        with open(file_dump + "doc_length.lol", "rb") as f:
            docs = pickle.load(f)
    except (OSError, pickle.PickleError):
        collection = defaultdict(functools.partial(defaultdict, list))
        doc_nr = 0
        idf = {}
        docs = {}
        print("Building indexer...")
        with sqlite3.connect(os.getcwd() + '/../../data/database.sqlite') as database:
            cursor = database.cursor()
            cursor.execute('SELECT * FROM papers')
            for doc in cursor.fetchall():
                id = doc[id_col]
                title = doc[title_col]
                text = doc[text_col]
                # Extract tokens from the title and the text, recording each
                # token's position in the document's postings list.
                token_pos = 0
                print("Examining doc: " + str(id))
                tokenized_text = Word(alphas).searchString(title)
                docs[id] = len(tokenized_text)
                for token in tokenized_text:
                    token = stem(str(token[0]).lower())
                    token_pos += 1
                    collection[token][id].append(token_pos)
                tokenized_text = Word(alphas).searchString(text)
                docs[id] += len(tokenized_text)
                for token in tokenized_text:
                    token = stem(str(token[0]).lower())
                    token_pos += 1
                    collection[token][id].append(token_pos)
                doc_nr += 1
        print("Calculating idf...")
        for term in collection.keys():
            idf[term] = math.log10(doc_nr / len(collection[term]))
        print("Dumping index...")
        with open(file_dump + "index.lol", "wb") as f:
            pickle.dump(collection, f)
        print("Dumping idf...")
        with open(file_dump + "idf.lol", "wb") as f:
            pickle.dump(idf, f)
        print("Dumping doc_length...")
        with open(file_dump + "doc_length.lol", "wb") as f:
            pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)
    return [collection, idf, docs]
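# Usage sketch: build the positional index (or load the cached pickles) and
# inspect it. The probe term "network" is illustrative; postings are keyed by
# stemmed tokens, and collection[term][doc_id] is a list of token positions.
def _demo_Index():
    collection, idf, docs = Index()
    print("terms:", len(collection), "docs:", len(docs))
    for doc_id, positions in list(collection.get("network", {}).items())[:3]:
        print(doc_id, positions[:5])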