from indexing import create_index

create_index()
import sys
import time
import pickle

import extract_results
import parsing_search_results
import indexing
import clustering
import labeling

if __name__ == "__main__":
    if len(sys.argv) != 1:  ## no command-line arguments; the two inputs below are read interactively
        sys.exit(2)
    ## Expect exactly 2 inputs: the search query and the number of clusters
    query = raw_input("Please enter the search string: ")
    k = raw_input("Please enter the number of clusters: ")
    k = int(k)
    ## query = "jaguar"
    ## k = 7
    start = time.clock()
    interim_path = extract_results.get_search_results(query)  ## extract the search results
    parsing_search_results.parse_file(interim_path)           ## parse the search results
    title_dict = pickle.load(open("title_dict", "rb"))        ## unpickle the document dictionaries
    desc_dict = pickle.load(open("desc_dict", "rb"))
    url_dict = pickle.load(open("url_dict", "rb"))
    indexing.create_index(title_dict, desc_dict, url_dict)    ## create an index
    index = pickle.load(open("index", "rb"))                  ## unpickle the index dictionaries
    np_ind = pickle.load(open("np_ind", "rb"))
    stem_dict = pickle.load(open("stem_dict", "rb"))
    indexing.calc_tf_idf(title_dict, index, np_ind)           ## calculate the tf-idf values for the document vectors
    doc_word_dict = pickle.load(open("doc_word_dict", "rb"))  ## unpickle the tf-idf dictionaries representing document vectors
    doc_np_dict = pickle.load(open("doc_np_dict", "rb"))
    norm_doc_word_dict = clustering.normalize_doc_dict(doc_word_dict)  ## normalize the document vectors
    dij = clustering.calc_eucl_dist(norm_doc_word_dict)       ## calculate the pairwise Euclidean distances
    clustering.get_mediods(k, dij)                            ## use k-medoids to get the clusters
    cluster = pickle.load(open("cluster", "rb"))              ## unpickle the clusters
    label_dict = labeling.label(cluster, doc_np_dict, stem_dict, np_ind, query)  ## label the clusters
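## clustering.get_mediods(k, dij) above is where the actual k-medoids clustering happens.
## As a rough, hedged sketch of that step (not the project's implementation: the helper name,
## the doc_ids argument, and the assumption that dij maps a (doc_i, doc_j) pair to a distance
## are all illustrative guesses), a minimal PAM-style loop over a precomputed distance matrix
## looks like this:
import random

def kmedoids_sketch(k, dij, doc_ids, max_iter=100):
    def dist(a, b):
        ## dij is assumed symmetric and stored under one of the two key orders
        if a == b:
            return 0.0
        return dij[(a, b)] if (a, b) in dij else dij[(b, a)]
    medoids = random.sample(doc_ids, k)  ## start from k random documents as medoids
    for _ in range(max_iter):
        ## assignment step: attach every document to its nearest medoid
        clusters = dict((m, []) for m in medoids)
        for d in doc_ids:
            nearest = min(medoids, key=lambda m: dist(d, m))
            clusters[nearest].append(d)
        ## update step: within each cluster, the member with the smallest
        ## total distance to its co-members becomes the new medoid
        new_medoids = [min(members, key=lambda c: sum(dist(c, o) for o in members))
                       for members in clusters.values()]
        if set(new_medoids) == set(medoids):  ## medoids stopped moving: converged
            break
        medoids = new_medoids
    return clusters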
import sys
import math
import random
import re
from xml.dom.minidom import parse, parseString
import xml.dom.minidom as minidom
from collections import OrderedDict, defaultdict
import pickle
from sets import Set
from operator import itemgetter
import os
import nltk
from nltk.tokenize.regexp import RegexpTokenizer
import time
import Queue
import parsing
import indexing

if __name__ == "__main__":
    if len(sys.argv) != 2:  # Expect exactly 1 argument: the input file
        sys.exit(2)
    path1 = sys.argv[1]
    parsing.parse_file(path1)  ## parse the input file into the document dictionaries
    indexing.create_index()    ## build the index from the parsed documents
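## For context, indexing.create_index() builds the index over the documents produced by
## parsing.parse_file(). As a hedged illustration only (the real module's data layout is not
## shown here; `docs` as a doc_id -> text dict and the \w+ tokenizer are assumptions), a
## minimal inverted index keyed on term frequencies could look like this:
from collections import defaultdict
from nltk.tokenize.regexp import RegexpTokenizer

def build_inverted_index_sketch(docs):
    tokenizer = RegexpTokenizer(r"\w+")
    index = defaultdict(lambda: defaultdict(int))  ## term -> {doc_id: term frequency}
    for doc_id, text in docs.items():
        for token in tokenizer.tokenize(text.lower()):
            index[token][doc_id] += 1
    return index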
#!/usr/bin/python
import sys
import math
import random
import re
from collections import OrderedDict
import os
import pickle
import time
import nltk
from nltk.stem.porter import PorterStemmer
import itertools
import indexing
import train

if __name__ == "__main__":
    if len(sys.argv) != 2:  # Expect exactly 1 argument: the training data file
        sys.exit(2)
    input1 = open(sys.argv[1], "r")
    indexing.create_index(input1)  ## this function will do the basic pre-processing and create the index
    train.train_classifier()       ## this function will calculate the model parameters for the classifier
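## train.train_classifier() is where the model parameters are estimated. The classifier type
## is not visible in this file; purely as a hedged sketch (assuming a multinomial Naive Bayes
## model and an input of (label, token list) pairs, both assumptions rather than the project's
## confirmed design), parameter estimation with Laplace smoothing looks like this:
import math
from collections import defaultdict

def train_naive_bayes_sketch(examples):
    class_counts = defaultdict(int)                      ## label -> number of training documents
    word_counts = defaultdict(lambda: defaultdict(int))  ## label -> {token: count}
    vocab = set()
    total_docs = 0
    for label, tokens in examples:
        total_docs += 1
        class_counts[label] += 1
        for t in tokens:
            word_counts[label][t] += 1
            vocab.add(t)
    priors = {}       ## log P(class)
    likelihoods = {}  ## log P(token | class), Laplace-smoothed
    for label in class_counts:
        priors[label] = math.log(class_counts[label] / float(total_docs))
        denom = sum(word_counts[label].values()) + len(vocab)
        likelihoods[label] = dict((t, math.log((word_counts[label][t] + 1) / float(denom)))
                                  for t in vocab)
    return priors, likelihoods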