Example #1
def analyze_doc(filename):
    # input_directory is a module-level setting; terms are collected line by line.
    terms = []
    with open(input_directory + str(filename)) as f:
        for line in f:
            terms += get_terms(line)
    return get_term_freq_dict(terms)
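Every example on this page assumes a get_terms tokenizer (and, here and in Example #11, a get_term_freq_dict counter) that the snippets themselves do not define. A minimal sketch of what these helpers might look like, purely as an assumption:

import re
from collections import Counter

def get_terms(text):
    # Hypothetical tokenizer: lowercase, then split on non-word characters.
    return re.findall(r"\w+", text.lower())

def get_term_freq_dict(terms):
    # Hypothetical counter: term -> number of occurrences.
    return dict(Counter(terms))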
Example #2
    def __init__(self, query_text):
        self.text = query_text
        self.terms = get_terms(self.text)
        # Build a term -> frequency map for the query.
        self.tf = {}
        for term in self.terms:
            if term not in self.tf:
                self.tf[term] = 0
            self.tf[term] += 1
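The counting loop above reproduces what collections.Counter does out of the box; under that observation the constructor could be shortened to:

from collections import Counter  # at module level

    def __init__(self, query_text):
        self.text = query_text
        self.terms = get_terms(self.text)
        # Counter yields the same term -> frequency mapping as the explicit loop.
        self.tf = dict(Counter(self.terms))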
Example #3
from collections import Counter, defaultdict

def build_inverted_index(index):
    # index: document_id -> raw document text.
    inverted_index = defaultdict(dict)
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not tokens:
            continue
        terms_frequencies = Counter(tokens)
        # Normalize by the most frequent term in the document.
        max_frequency = terms_frequencies.most_common(1)[0][1]
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency) / max_frequency  # 0 <= tf <= 1
            inverted_index[term][document_id] = tf
    return dict(inverted_index)
Example #4
from collections import Counter

def build_inverted_index(index, term_to_id):
    # One posting dict per vocabulary term, addressed by term id.
    voc_size = len(term_to_id)
    inverted_index = [dict() for _ in xrange(voc_size)]
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not tokens:
            continue
        terms_frequencies = Counter(tokens)
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency)  # raw, unnormalized term frequency
            inverted_index[term_to_id[term]][document_id] = tf
    return inverted_index
Example #5
from collections import Counter

def build_inverted_index():
    # Relies on module-level voc_size, index and term_to_id.
    inverted_index = [dict() for _ in xrange(voc_size)]
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not tokens:
            continue
        terms_frequencies = Counter(tokens)
        max_frequency = terms_frequencies.most_common(1)[0][1]
        for term, frequency in terms_frequencies.iteritems():
            tf = float(frequency) / max_frequency  # 0 <= tf <= 1
            inverted_index[term_to_id[term]][document_id] = tf
    return inverted_index
Example #6
import numpy as np

def get_vocabulary(index):
    # Assign a sequential id to every distinct term in the collection.
    vocabulary = []
    term_to_id = {}
    term_id = 0
    for document_id, document in index.iteritems():
        tokens = get_terms(document)
        if not tokens:
            continue
        for term in tokens:
            if term not in term_to_id:
                term_to_id[term] = term_id
                vocabulary.append(term)
                term_id += 1
    return np.array(vocabulary), term_to_id
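A minimal usage sketch showing how get_vocabulary and the parameterized build_inverted_index (Example #4) fit together; the two-document corpus is hypothetical:

index = {0: "to be or not to be", 1: "to do is to be"}  # hypothetical corpus

vocabulary, term_to_id = get_vocabulary(index)
inverted_index = build_inverted_index(index, term_to_id)

# Posting list for "be": document_id -> raw term frequency
print(inverted_index[term_to_id["be"]])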
Example #7
from __future__ import print_function

import cPickle
import os
import sys
from time import clock

import numpy as np
from tabulate import tabulate

def main():
    print("Loading resources... ", end="")
    sys.stdout.flush()
    start_time = clock()
    with open(PICKLE_PATH, "rb") as f:
        term_to_id, inverted_index, doc_id_to_path = cPickle.load(f)
    tfidf_matrix = np.load(TFIDF_PATH)
    vocabulary = np.load(VOCABULARY_PATH)
    print("{:.0f} s".format(clock() - start_time))

    while True:
        query = raw_input("\nType your query: ")
        print()
        query_terms = get_terms(query)
        # Keep only ids of terms present in the vocabulary
        query_terms = [term_to_id[t] for t in query_terms if t in term_to_id]
        if not query_terms:
            continue
        documents_ids = get_documents_ids(query_terms, inverted_index)

        tfidf_submatrix = tfidf_matrix[documents_ids, :]
        npmi_submatrix = load_npmi_submatrix(query_terms)
        npmi_sum = npmi_submatrix.sum(axis=0)  # sum NPMI vectors for query terms (due to q=1)
        similarities = (tfidf_submatrix * npmi_sum.reshape((1, -1))).sum(axis=1)
        ranked_order = similarities.argsort()[::-1]

        top_table = []
        for i in ranked_order[:TOP_SIZE]:
            doc_id = documents_ids[i]
            file_name = os.path.basename(doc_id_to_path[doc_id])

            tfidf_vector = tfidf_matrix[doc_id]
            doc_terms = np.where(tfidf_vector)[0]
            tfidf_subvector = tfidf_vector[doc_terms]
            npmi_subsubmatrix = npmi_submatrix[:, doc_terms]

            # Best (query term, document term) pairs for this document
            tuples = get_top_pairs(tfidf_subvector, npmi_subsubmatrix)
            tuples = [(vocabulary[query_terms[tup[0]]], vocabulary[doc_terms[tup[1]]]) + tup[2:]
                      for tup in tuples]

            top_table.append([similarities[i], file_name] + list(tuples[0]))
            for tup in tuples[1:]:
                top_table.append(["", ""] + list(tup))
            top_table.append([""] * 6)

        print(tabulate(top_table, showindex=False,
                       headers=["sim", "doc", "u", "v", "v_tfidf", "uv_npmi"],
                       numalign="left"))
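get_documents_ids is not shown in this example. A minimal sketch of what it might do, assuming inverted_index is a list indexed by term id whose entries map document ids to weights (as in Examples #4 and #5):

import numpy as np

def get_documents_ids(query_terms, inverted_index):
    # Hypothetical helper: union of the posting lists of all query terms.
    ids = set()
    for term_id in query_terms:
        ids.update(inverted_index[term_id])
    return np.array(sorted(ids))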
Example #8
import sys

# utils, solve and the Equation class come from the project's own modules.

def computor():
    if len(sys.argv) != 2:
        print("wrong nb of arguments")
        print(utils.usage)
        sys.exit(0)
    equ1 = utils.check_equation(sys.argv[1])
    equ = utils.get_terms(equ1)
    equation = Equation(equ)
    print("Reduced form : {}".format(equation))
    print("Rational reduced form : {}\n".format(equation.rational()))
    print("Natural reduced form : {}".format(equation.natural()))
    print("Natural rational reduced form : {}\n".format(
        equation.natural_rat()))
    print("Polynomial degree = {}".format(equation.po()))
    if equation.po() > 2:
        print(
            "The polynomial degree is strictly greater than 2, I can't solve.")
        sys.exit(0)
    solve.def_sol(equation)
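A hypothetical invocation; both the script name and the input format are assumptions, not taken from the example:

python computor.py "5 * X^0 + 4 * X^1 - 9.3 * X^2 = 1 * X^0"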
Example #9
import cPickle
import os

from tabulate import tabulate

def main():
    with open(PICKLE_PATH, "rb") as f:
        inverted_index, id_to_path, term_to_idf = cPickle.load(f)

    while True:
        query = raw_input("\nType your query: ")
        print
        terms = get_terms(query)
        if not terms:
            continue
        documents_ids = get_documents_ids(terms, inverted_index)
        tfidf_matrix = get_tfidf_matrix(documents_ids, terms, inverted_index, term_to_idf)
        # Average tf-idf weight over the query terms
        similarities = tfidf_matrix.sum(axis=1) / len(terms)

        # Sort all arrays by descending similarity
        ranked_order = similarities.argsort()[::-1]
        similarities = similarities[ranked_order]
        documents_ids = documents_ids[ranked_order]
        tfidf_matrix = tfidf_matrix[ranked_order, :]

        top_table = []
        for i in range(min(TOP_SIZE, len(documents_ids))):
            file_name = os.path.basename(id_to_path[documents_ids[i]])
            top_table.append([similarities[i], file_name] + list(tfidf_matrix[i, :]))
        print tabulate(top_table, showindex=False, headers=["sim", "doc"] + terms, numalign="left")
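get_tfidf_matrix is likewise not shown. A minimal sketch under the assumption that inverted_index maps each term to a dict of document_id -> normalized tf (the Example #3 layout) and term_to_idf holds precomputed idf values:

import numpy as np

def get_tfidf_matrix(documents_ids, terms, inverted_index, term_to_idf):
    # Hypothetical helper: one row per candidate document, one column per query term.
    matrix = np.zeros((len(documents_ids), len(terms)))
    for j, term in enumerate(terms):
        postings = inverted_index.get(term, {})
        for i, doc_id in enumerate(documents_ids):
            matrix[i, j] = postings.get(doc_id, 0.0) * term_to_idf.get(term, 0.0)
    return matrix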
Example #10
	def search(self, term, args):
		# Normalize whitespace, tokenize, filter, then signal completion.
		term = ' '.join(term.split())
		search_pieces = utils.get_terms(term)

		self._filter(search_pieces, args)
		self.FINISHED.emit()
Example #11
def analyze_query(line):
    terms = get_terms(line)
    return get_term_freq_dict(terms)
Example #12
	def search(self, term):
		term = ' '.join(term.split())
		search_pieces = utils.get_terms(term)

		self._filter(search_pieces)
		self.FINISHED.emit()