def analyze_doc(filename):
    f = open(input_directory + str(filename))
    terms = []
    for line in f:
        terms = terms + get_terms(line)
    f.close()
    return get_term_freq_dict(terms)
def __init__(self, query_text):
    self.text = query_text
    self.terms = get_terms(self.text)
    self.tf = {}
    for term in self.terms:
        if term not in self.tf:
            self.tf[term] = 0
        self.tf[term] += 1
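# A minimal, self-contained sketch of the same query term-frequency counting
# using collections.Counter. The whitespace tokenizer below is only a stand-in
# assumption for the real get_terms, and the Query class name mirrors the
# snippet above rather than any confirmed original.
from collections import Counter

def get_terms(text):
    return text.lower().split()  # assumption: the real tokenizer may normalize/stem

class Query(object):
    def __init__(self, query_text):
        self.text = query_text
        self.terms = get_terms(self.text)
        self.tf = dict(Counter(self.terms))  # same counts as the manual loop

query = Query("cats and dogs and birds")
print(query.tf)  # {'cats': 1, 'and': 2, 'dogs': 1, 'birds': 1}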
from collections import Counter, defaultdict

def build_inverted_index(index):
    inverted_index = defaultdict(lambda: dict())
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not len(tokens):
            continue
        terms_frequencies = Counter(tokens)
        # Normalize by the most frequent term in the document.
        max_frequency = terms_frequencies.most_common(1)[0][1]
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency) / max_frequency  # 0 <= tf <= 1
            inverted_index[term][document_id] = tf
    return dict(inverted_index)
from collections import Counter

def build_inverted_index(index, term_to_id):
    voc_size = len(term_to_id)
    # One posting dict per term id: term_id -> {document_id: tf}.
    inverted_index = [dict() for _ in xrange(voc_size)]
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not len(tokens):
            continue
        terms_frequencies = Counter(tokens)
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency)  # raw (unnormalized) term frequency
            inverted_index[term_to_id[term]][document_id] = tf
    return inverted_index
from collections import Counter

def build_inverted_index():
    # Relies on module-level index, term_to_id and voc_size being defined.
    inverted_index = [dict() for _ in xrange(voc_size)]
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not len(tokens):
            continue
        terms_frequencies = Counter(tokens)
        # Normalize by the most frequent term in the document.
        max_frequency = terms_frequencies.most_common(1)[0][1]
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency) / max_frequency  # 0 <= tf <= 1
            inverted_index[term_to_id[term]][document_id] = tf
    return inverted_index
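# A hedged usage sketch for the dict-keyed build_inverted_index variant shown
# first above: term -> {document_id: normalized tf}. The toy corpus and the
# whitespace get_terms are illustrative assumptions, and .items() replaces the
# Python 2 .iteritems() so the sketch runs on either interpreter.
from collections import Counter, defaultdict

def get_terms(text):
    return text.lower().split()  # stand-in tokenizer (assumption)

def build_inverted_index(index):
    inverted_index = defaultdict(dict)
    for document_id, document in index.items():
        tokens = get_terms(document)
        if not tokens:
            continue
        terms_frequencies = Counter(tokens)
        max_frequency = terms_frequencies.most_common(1)[0][1]
        for term, frequency in terms_frequencies.items():
            inverted_index[term][document_id] = float(frequency) / max_frequency
    return dict(inverted_index)

corpus = {1: "to be or not to be", 2: "to do is to be"}
print(build_inverted_index(corpus)["to"])  # {1: 1.0, 2: 1.0}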
import numpy as np

def get_vocabulary(index):
    # Collect unique terms across all documents and assign sequential ids.
    vocabulary = []
    term_to_id = {}
    term_id = 0
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not len(tokens):
            continue
        for term in tokens:
            if term not in term_to_id:
                term_to_id[term] = term_id
                vocabulary.append(term)
                term_id += 1
    return np.array(vocabulary), term_to_id
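# A small sketch of the invariant get_vocabulary maintains: vocabulary[i] is the
# term whose id is i, so the array and the term_to_id dict invert each other.
# The toy corpus and whitespace tokenizer are assumptions.
import numpy as np

def get_terms(text):
    return text.lower().split()  # stand-in tokenizer (assumption)

def get_vocabulary(index):
    vocabulary, term_to_id = [], {}
    for document in index.values():
        for term in get_terms(document):
            if term not in term_to_id:
                term_to_id[term] = len(vocabulary)
                vocabulary.append(term)
    return np.array(vocabulary), term_to_id

vocabulary, term_to_id = get_vocabulary({1: "red fish", 2: "blue fish"})
assert vocabulary[term_to_id["blue"]] == "blue"  # id -> term round trip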
from __future__ import print_function

import os
import sys
import cPickle
from time import clock

import numpy as np
from tabulate import tabulate


def main():
    print("Loading resources... ", end="")
    sys.stdout.flush()
    start_time = clock()
    term_to_id, inverted_index, doc_id_to_path = cPickle.load(open(PICKLE_PATH, "rb"))
    tfidf_matrix = np.load(TFIDF_PATH)
    vocabulary = np.load(VOCABULARY_PATH)
    print("{:.0f} s".format(clock() - start_time))

    while True:
        query = raw_input("\nType your query: ")
        print()
        query_terms = get_terms(query)
        # Get ids for terms present in the vocabulary
        query_terms = [term_to_id[t] for t in query_terms if t in term_to_id]
        if not len(query_terms):
            continue

        documents_ids = get_documents_ids(query_terms, inverted_index)
        tfidf_submatrix = tfidf_matrix[documents_ids, :]
        npmi_submatrix = load_npmi_submatrix(query_terms)
        npmi_sum = npmi_submatrix.sum(axis=0)  # sum NPMI vectors for query terms (due to q=1)
        similarities = (tfidf_submatrix * npmi_sum.reshape((1, -1))).sum(axis=1)
        ranked_order = similarities.argsort()[::-1]

        top_table = []
        for i in ranked_order[:TOP_SIZE]:
            doc_id = documents_ids[i]
            file_name = os.path.basename(doc_id_to_path[documents_ids[i]])
            tfidf_vector = tfidf_matrix[doc_id]
            doc_terms = np.where(tfidf_vector)[0]
            tfidf_subvector = tfidf_vector[doc_terms]
            npmi_subsubmatrix = npmi_submatrix[:, doc_terms]
            tuples = get_top_pairs(tfidf_subvector, npmi_subsubmatrix)
            # Map term ids back to readable terms for the report.
            tuples = map(lambda tup: (vocabulary[query_terms[tup[0]]],
                                      vocabulary[doc_terms[tup[1]]]) + tup[2:], tuples)
            top_table.append([similarities[i], file_name] + list(tuples[0]))
            for tup in tuples[1:]:
                top_table.append(["", ""] + list(tup))
            top_table.append([""] * 6)

        print(tabulate(top_table, showindex=False,
                       headers=["sim", "doc", "u", "v", "v_tfidf", "uv_npmi"] + query_terms,
                       numalign="left"))
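# A toy sketch of the scoring line above: each document's TF-IDF row is
# weighted element-wise by the summed NPMI vector of the query terms, then
# summed into a single similarity per document. All numbers here are made-up
# assumptions.
import numpy as np

tfidf_submatrix = np.array([[0.5, 0.0, 1.0],    # rows = candidate documents
                            [0.25, 1.0, 0.0]])  # cols = vocabulary terms
npmi_submatrix = np.array([[0.2, 0.0, 0.4],     # rows = query terms
                           [0.0, 0.6, 0.4]])
npmi_sum = npmi_submatrix.sum(axis=0)           # sum NPMI vectors for query terms
similarities = (tfidf_submatrix * npmi_sum.reshape((1, -1))).sum(axis=1)
print(similarities)  # -> [0.9  0.65]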
import sys


def computor():
    if len(sys.argv) != 2:
        print("Wrong number of arguments")
        print(utils.usage)
        sys.exit(0)
    else:
        equ1 = utils.check_equation(sys.argv[1])
        equ = utils.get_terms(equ1)
        equation = Equation(equ)
        print("Reduced form : {}".format(equation))
        print("Rational reduced form : {}\n".format(equation.rational()))
        print("Natural reduced form : {}".format(equation.natural()))
        print("Natural rational reduced form : {}\n".format(equation.natural_rat()))
        print("Polynomial degree = {}".format(equation.po()))
        if equation.po() > 2:
            print("The polynomial degree is strictly greater than 2, I can't solve.")
            sys.exit(0)
        solve.def_sol(equation)
import os
import cPickle

from tabulate import tabulate


def main():
    inverted_index, id_to_path, term_to_idf = cPickle.load(open(PICKLE_PATH, "rb"))
    while True:
        query = raw_input("\nType your query: ")
        print
        terms = get_terms(query)
        if not len(terms):
            continue
        documents_ids = get_documents_ids(terms, inverted_index)
        tfidf_matrix = get_tfidf_matrix(documents_ids, terms, inverted_index, term_to_idf)
        # Average TF-IDF over the query terms gives the document score.
        similarities = tfidf_matrix.sum(axis=1) / len(terms)

        # Sort arrays by descending similarity
        ranked_order = similarities.argsort()[::-1]
        similarities = similarities[ranked_order]
        documents_ids = documents_ids[ranked_order]
        tfidf_matrix = tfidf_matrix[ranked_order, :]

        top_table = []
        for i in range(min(TOP_SIZE, len(documents_ids))):
            file_name = os.path.basename(id_to_path[documents_ids[i]])
            top_table.append([similarities[i], file_name] + list(tfidf_matrix[i, :]))
        print tabulate(top_table, showindex=False,
                       headers=["sim", "doc"] + terms, numalign="left")
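# A self-contained sketch of the ranking step above: average the per-term
# TF-IDF scores of each candidate document and sort descending. The tiny
# matrix, document names, and query terms are made-up assumptions; the numpy
# array stands in for the result of get_tfidf_matrix.
import numpy as np

terms = ["cat", "dog"]                       # assumed query terms
tfidf_matrix = np.array([[0.0, 1.0],         # rows = documents, cols = terms
                         [1.0, 0.5],
                         [0.0, 0.5]])
doc_names = ["a.txt", "b.txt", "c.txt"]      # assumed document names

similarities = tfidf_matrix.sum(axis=1) / len(terms)  # average over query terms
ranked_order = similarities.argsort()[::-1]           # best document first
for i in ranked_order:
    print("{} {}".format(doc_names[i], similarities[i]))
# b.txt 0.75 / a.txt 0.5 / c.txt 0.25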
def search(self, term, args):
    # Collapse repeated whitespace before tokenizing the search string.
    term = ' '.join(term.split())
    search_pieces = utils.get_terms(term)
    self._filter(search_pieces, args)
    self.FINISHED.emit()
def analyze_query(line):
    terms = get_terms(line)
    return get_term_freq_dict(terms)
def search(self, term):
    term = ' '.join(term.split())
    search_pieces = utils.get_terms(term)
    self._filter(search_pieces)
    self.FINISHED.emit()