Example #1
from indexing import create_index

create_index()
Example #2
import sys
import time
import pickle

import extract_results
import parsing_search_results
import indexing
import clustering
import labeling


if __name__ == "__main__":
        if len(sys.argv) != 1:                                          ## no command-line arguments expected; inputs are read interactively
                sys.exit(2)
        query = raw_input("Please enter the search string: ")           ## read the two inputs: the search query and the number of clusters
        k = int(raw_input("Please enter the number of clusters: "))
##        query = "jaguar"                                              ## hard-coded test inputs, left commented out
##        k = 7
        start = time.clock()
        interim_path = extract_results.get_search_results(query)        ## extract search results
               
        parsing_search_results.parse_file(interim_path)                 ## parse the search results
        title_dict = pickle.load(open("title_dict","rb"))               ## unpickle the document dictionaries
        desc_dict = pickle.load(open("desc_dict","rb"))
        url_dict = pickle.load(open("url_dict","rb"))

        
        indexing.create_index(title_dict,desc_dict,url_dict)            ## create an index
        index = pickle.load(open("index","rb"))                         ## unpickle the index dictionaries
        np_ind = pickle.load(open("np_ind","rb"))
        stem_dict = pickle.load(open("stem_dict","rb"))
        
        indexing.calc_tf_idf(title_dict,index,np_ind)                   ## calculate the tf-idf values for the document vector
        doc_word_dict = pickle.load(open("doc_word_dict","rb"))         ## unpickle the tf-idf dictionaries representing document vectors
        doc_np_dict = pickle.load(open("doc_np_dict","rb"))

        norm_doc_word_dict = clustering.normalize_doc_dict(doc_word_dict)               ## normalize the document vector
        dij = clustering.calc_eucl_dist(norm_doc_word_dict)                             ## calculate the Euclidean distances
        clustering.get_mediods(k,dij)                                                   ## use k-medoids to get the clusters
        cluster = pickle.load(open("cluster","rb"))                                     ## unpickle the clusters

        
        label_dict = labeling.label(cluster,doc_np_dict, stem_dict,np_ind,query)                        ## label the clusters
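
The clustering calls above treat clustering.normalize_doc_dict and clustering.calc_eucl_dist as black boxes. As a rough sketch of what they plausibly compute (the {doc_id: {term: tf-idf weight}} shape and both function bodies below are assumptions for illustration, not this project's code), each document vector is scaled to unit length and the pairwise distances are collected into the dij mapping that get_mediods consumes:

import math

def normalize_doc_dict(doc_word_dict):
    ## Scale each sparse document vector to unit length so that Euclidean
    ## distance between documents tracks cosine similarity.
    normed = {}
    for doc, weights in doc_word_dict.items():
        length = math.sqrt(sum(w * w for w in weights.values()))
        normed[doc] = dict((t, w / length) for t, w in weights.items()) if length else {}
    return normed

def calc_eucl_dist(norm_doc_word_dict):
    ## Pairwise Euclidean distances between the normalized sparse vectors,
    ## keyed by (doc_i, doc_j); the mapping is symmetric with a zero diagonal.
    docs = list(norm_doc_word_dict)
    dij = {}
    for i in docs:
        for j in docs:
            terms = set(norm_doc_word_dict[i]) | set(norm_doc_word_dict[j])
            dij[(i, j)] = math.sqrt(sum(
                (norm_doc_word_dict[i].get(t, 0.0) - norm_doc_word_dict[j].get(t, 0.0)) ** 2
                for t in terms))
    return dij

For unit-length vectors the squared distance equals 2 - 2*cos(theta), so clustering on these distances is equivalent to clustering on cosine similarity.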
Example #3
import sys
import math
import random
import re
from xml.dom.minidom import parse, parseString
import xml.dom.minidom as minidom
from collections import OrderedDict,defaultdict
import pickle
from sets import Set
from operator import itemgetter
import os
import nltk
from nltk.tokenize.regexp import RegexpTokenizer
import time
import Queue

import parsing
import indexing



if __name__ == "__main__":
        if len(sys.argv)!=2:                # Expect exactly 1 argument
                sys.exit(2)

        path1 = sys.argv[1]
        
        parsing.parse_file(path1)

        indexing.create_index()
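
Here indexing.create_index() takes no arguments, so it presumably reads whatever parsing.parse_file pickled to disk. A minimal sketch of the inverted-index step, assuming a {doc_id: text} dictionary as input (the doc_dict parameter and the function body are assumptions; the "index" pickle name is the one example #2 loads):

from collections import defaultdict
import pickle

def create_index(doc_dict):
    ## Build an inverted index mapping each token to the set of
    ## document ids that contain it, then persist it for later stages.
    index = defaultdict(set)
    for doc_id, text in doc_dict.items():
        for token in text.lower().split():
            index[token].add(doc_id)
    pickle.dump(dict(index), open("index", "wb"))
    return index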

Example #4
#!/usr/bin/python

import sys
import math
import random
import re
from collections import OrderedDict
import os
import pickle
import time
import nltk
from nltk.stem.porter import PorterStemmer
import itertools
import indexing
import train


if __name__ == "__main__":
    if len(sys.argv) != 2:  # Expect exactly 1 argument: the training data file
        sys.exit(2)
    input1 = open(sys.argv[1], "r")

    indexing.create_index(input1)  ## this function will do the basic pre-processing and create the index
    train.train_classifier()  ## this function will calculate the model parameters for the classifier
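
The snippet imports PorterStemmer but never shows the pre-processing itself. A plausible sketch of the "basic pre-processing" step, assuming lowercasing, whitespace tokenization, and Porter stemming (the preprocess helper is hypothetical):

from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def preprocess(line):
    ## Lowercase, split on whitespace, and Porter-stem each token,
    ## e.g. "Running quickly" -> ["run", "quickli"].
    return [_stemmer.stem(tok) for tok in line.lower().split()]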