def parse_tokens(tokens):
    """
    Implementation of the shunting-yard algorithm, with modifications to
    handle unary right-associative operators.
    """
    output = []
    op_stack = []
    phrasal = []
    for tok in tokens:
        if tok == '"':
            if op_stack and op_stack[-1] == '"':
                # Closing quote: emit the collected phrase as one unit
                output.append(PhrasePostings(phrasal))
                phrasal = []
                op_stack.pop()
            else:
                op_stack.append(tok)
        elif tok == ')':
            # Apply everything back to the matching open parenthesis
            while op_stack[-1] != '(':
                apply_op(op_stack, output)
            op_stack.pop()
        elif tok in prec:
            if tok in right:
                # Right-associative: pop only strictly higher precedence
                while op_stack and op_stack[-1] != '(' and prec[op_stack[-1]] > prec[tok]:
                    apply_op(op_stack, output)
            else:
                while op_stack and op_stack[-1] != '(' and prec[op_stack[-1]] >= prec[tok]:
                    apply_op(op_stack, output)
            op_stack.append(tok)
        else:
            if op_stack and op_stack[-1] == '"':
                # Inside a quoted phrase: accumulate preprocessed terms
                phrasal.append(preprocess(tok))
            else:
                output.append(process_token(tok))
    while op_stack:
        apply_op(op_stack, output)
    return output[0]
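# parse_tokens relies on the module-level tables prec and right and on
# apply_op(), none of which appear in this file. Below is a minimal sketch
# of what they could look like for a NOT/AND/OR boolean grammar; the names,
# precedences, and set-based postings model are assumptions, not the
# author's confirmed definitions. Note that '(' must appear in prec with
# the lowest precedence, since parse_tokens pushes it through the same
# operator branch.

all_docs_sketch = set()  # stand-in for the universal postings set
prec_sketch = {'(': 0, 'OR': 1, 'AND': 2, 'NOT': 3}
right_sketch = {'NOT'}  # NOT is the unary, right-associative operator

def apply_op_sketch(op_stack, output):
    # Pop one operator and combine its operand(s) from the output stack;
    # here postings are modelled as plain sets of doc ids, with
    # all_docs_sketch standing in for the universal set used by NOT.
    op = op_stack.pop()
    if op == 'NOT':
        output.append(all_docs_sketch - output.pop())
    else:
        b, a = output.pop(), output.pop()
        output.append(a & b if op == 'AND' else a | b)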
def split_query(query):
    """Count the raw term frequency of each preprocessed query term."""
    result = {}
    tokens = query.split()
    for tok in tokens:
        term = preprocess(tok)
        result[term] = result.get(term, 0) + 1
    return result
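# Hypothetical usage of split_query, assuming preprocess() lower-cases and
# Porter-stems single tokens (preprocess is defined elsewhere, so the
# exact output below is an assumption):
#
#   >>> split_query("Cats cat running")
#   {'cat': 2, 'run': 1}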
def process_token(tok):
    # Stopwords match every document, so treat them as the universal set
    if tok.lower() in stop_words:
        return all_postings
    tok = preprocess(tok)
    try:
        return Postings(tok)
    except KeyError:
        # Term absent from the dictionary: empty postings list
        return EmpPostings()
def queryToScore(query):
    """Compute the ltc (log-tf, idf, cosine-normalised) weight of every
    unique query term."""
    postingHandler = PostingHandler(dictionary_file, postings_file)
    uniDict = makeUniGrams(preprocess(query))
    q_len = tf_idf.getLtcLen(postingHandler, uniDict)
    N = postingHandler.getNumDoc()
    for word in uniDict:
        df = postingHandler.getDocFreq(word)
        uniDict[word] = tf_idf.get_ltc(uniDict[word], N, df, q_len)
    return uniDict
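# The tf_idf module used above is not shown here. For reference, a minimal
# sketch of what get_ltc is assumed to compute (standard SMART ltc
# weighting, with q_len the query vector's Euclidean length as returned by
# getLtcLen); the signature and formula are assumptions:

import math

def get_ltc_sketch(tf, N, df, q_len):
    # ltc: (1 + log10 tf) * log10(N / df), cosine-normalised by q_len
    if tf == 0 or df == 0:
        return 0.0
    return (1 + math.log10(tf)) * math.log10(N / df) / q_len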
def query_tf(query):
    '''Filter the given query for stopwords, stem the remaining terms,
    then return the length-normalised term frequency of each term.'''
    d_qr = {}
    Q = preprocess(query).split()
    # NB: normalised by the length of the raw query, not the filtered one
    length = len(query.split())
    for term in Q:
        d_qr[term] = d_qr.get(term, 0) + 1 / length
    return d_qr
def free_text_query(self, query):
    """
    Retrieve the documents matching a free-text query.
    :param query: the query string; an empty query matches nothing
    :return: the union of the postings lists of all query terms
    """
    words = word_tokenize(query)
    words = preprocess(words)
    if len(words) == 0:
        return []
    res = self.postings.get_postings_list(words.pop())
    while len(words) > 0:
        posting = self.postings.get_postings_list(words.pop())
        res = union(res, posting)
    return res
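# The union() helper used above is defined elsewhere. A minimal sketch of
# the merge it is assumed to perform, given postings lists as sorted lists
# of document ids (the sorted-list representation is an assumption):

def union_sketch(a, b):
    # Linear two-pointer merge of two sorted postings lists
    merged, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            merged.append(a[i]); i += 1; j += 1
        elif a[i] < b[j]:
            merged.append(a[i]); i += 1
        else:
            merged.append(b[j]); j += 1
    merged.extend(a[i:])
    merged.extend(b[j:])
    return merged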
def __identify_query(self, query_string):
    """Identify the query type and process the query accordingly."""
    if "AND" in query_string:
        # Boolean query: treat as a list of sub-queries
        out = []
        split_word = query_string.split(' ')
        i = 0
        while i < len(split_word):
            if split_word[i][0] == '"':
                # Collect tokens until one closes the quoted phrase
                combined = []
                d = i
                while d < len(split_word):
                    combined.append(split_word[d].replace('"', ''))
                    # The phrase closes when a token ends with '"'; skip
                    # this check on the opening token unless it is a
                    # one-word phrase like "foo"
                    if split_word[d].endswith('"') and (d > i or len(split_word[d]) > 1):
                        break
                    d += 1
                i = d + 1
                out.append(preprocess(combined))
            else:
                if split_word[i] != "AND":
                    out.append(preprocess([split_word[i]]))
                i += 1
        # Flatten out so we can compute the query term frequencies
        flat_term_list = [item for sublist in out for item in sublist]
        self.__get_tf(flat_term_list)
        return True, out
    else:
        # Free-text query: preprocess as per normal
        self.__get_tf(self.__get_term_list(query_string))
        return False, query_string
def processBoolQuery(query):
    # Since the only boolean operator is AND, every element in this list
    # will be AND-merged with the next
    terms = []
    for part in query.split(" AND "):
        terms.extend(preprocess(part))
    query = terms
    convertToScores(query)
    # Since skip lists aren't implemented, there is no need to prioritise
    # the shortest postings list
    while len(query) > 1:
        query[0] = doAnd(query[0], query[1])
        del query[1]
    query = query[0]
    if len(query) == 0:
        return ""
    else:
        # Rank the surviving documents by score, highest first
        query.sort(key=lambda x: x[1], reverse=True)
        result = [str(doc_id) for doc_id in list(zip(*query))[0]]
        return ' '.join(result)
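# doAnd() is defined elsewhere. A minimal sketch of the AND merge it is
# assumed to perform, with postings as sorted lists of (doc_id, score)
# pairs (consistent with the sort and zip above; summing scores on a
# match is an assumption):

def doAnd_sketch(a, b):
    # Intersect two sorted postings lists, combining scores of documents
    # that appear in both
    merged, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        if a[i][0] == b[j][0]:
            merged.append((a[i][0], a[i][1] + b[j][1]))
            i += 1
            j += 1
        elif a[i][0] < b[j][0]:
            i += 1
        else:
            j += 1
    return merged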
import bson
import numpy as np
from math import log
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def query_type_two(q):
    # "Clean" the query and compute its term frequencies
    query = voc_lyrics(preprocess(q))
    vocabulary = db.INVERTED.find({"vocabulary": {"$exists": True}})[1]['vocabulary']
    tot_num_doc = db.ADMHMW3.count()
    tot_doc = []
    # Fetch the inverted-index posting list of each query term
    for q in query:
        term = q[0]
        try:
            # Look up the term's id in the vocabulary
            term_id = list(vocabulary.keys())[list(vocabulary.values()).index(term)]
            posting = db.INVERTED.find({term_id: {"$exists": True}})[1][term_id]
        except (ValueError, IndexError, KeyError):
            print("The query is impossible: one or more terms aren't in the db")
            return
        res = list(map(tuple, posting))
        docs = list(zip(*res))[0]
        tot_doc.append(set(docs))
    # Intersect the posting lists
    common_doc = list(set.intersection(*tot_doc))
    # Show the result count, then ask the user for k
    n = len(common_doc)
    print("The number of results is", n, "- insert the value of k")
    k = int(input())
    # Cluster only if the number of documents is larger than k
    if k < n:
        X = np.zeros((n, n))
        updated_common_docs = {}
        dict_docs = {}
        # For each document in the intersection, take its index entry
        # (doc_id -> [(term, tf)]) and reweight each tf as tf*idf.
        # Doing this once for all songs and storing the result in another
        # collection would have been faster here, since it would have
        # avoided this double loop.
        for idx, doc in enumerate(common_doc):
            dict_docs[idx] = doc
            doc_new = []
            list_term = db.INDEX.find({doc: {"$exists": True}})[0][doc]
            for term, tf in list_term:
                term_id = list(vocabulary.keys())[list(vocabulary.values()).index(term)]
                posting_length = len(db.INVERTED.find({term_id: {"$exists": True}})[0][term_id])
                idf = 1 + log(tot_num_doc / posting_length)
                doc_new.append((term, tf * idf))
            updated_common_docs[doc] = doc_new
        # Compute the (symmetric) distance matrix
        for i in range(n - 1):
            doc_i = updated_common_docs[common_doc[i]]
            for j in range(i + 1, n):
                doc_j = updated_common_docs[common_doc[j]]
                d = distance(doc_i, doc_j)
                X[i][j] = d
                X[j][i] = d
        # Cluster the songs
        clusters = KMeans(n_clusters=k).fit(PCA(n_components=n).fit_transform(X)).labels_
        clust_dict = {}
        for idx, label in enumerate(clusters):
            clust_dict.setdefault(label, []).append(dict_docs[idx])
        res = {}
        text = ""
        # Print the result, one (Artist, Title) list per cluster
        for label, doc_ids in clust_dict.items():
            for doc_id in doc_ids:
                d = db.ADMHMW3.find({"_id": bson.ObjectId(doc_id)})[0]
                res.setdefault(label, []).append((d['Artist'], d['Title']))
                text += d["Lyrics"] + " "
        print(res)
        generate_word_cloud(text)
    else:
        print("k is too big, bye!")
        return
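# distance() above is defined elsewhere. A minimal sketch of a cosine
# distance between two sparse (term, weight) vectors, which is one
# plausible choice for tf-idf weighted documents (an assumption, not the
# author's confirmed metric):

from math import sqrt

def distance_sketch(doc_i, doc_j):
    # doc_i and doc_j are lists of (term, tf*idf) pairs
    vi, vj = dict(doc_i), dict(doc_j)
    dot = sum(w * vj.get(t, 0.0) for t, w in vi.items())
    norm_i = sqrt(sum(w * w for w in vi.values()))
    norm_j = sqrt(sum(w * w for w in vj.values()))
    if norm_i == 0 or norm_j == 0:
        return 1.0
    return 1.0 - dot / (norm_i * norm_j)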
def __get_term_list(self, query_string):
    term_list = preprocess(word_tokenize(query_string))
    return term_list