def train(num_lsa_topics, k):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceFragmentData.SentenceFragmentData()
    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    #Filter to just the sm codes
    sm_code_lsa_matrix = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)

    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    labels = clusterer.Run(sm_code_lsa_matrix)

    #OUTPUT - filter by SM code only this time
    file_name_code_clusters = "LSA_SMCODES_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    sm_codes_per_doc = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, sm_codes_per_doc, "Chicago")

    file_name_category_clusters = "LSA_Categories_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    categories_per_doc = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, categories_per_doc, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
def train(num_lda_topics):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()
    tokenizer = dbnetwork.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lda = Lda.Lda(tfidf, num_topics=num_lda_topics)

    # Pull out the topic labels
    topic_labels = extract_topic_labels(lda.distance_matrix)

    #OUTPUT
    file_name_code_clusters = "LDA_SMCODES_topics_{0}.csv".format(num_lda_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, topic_labels, xs.codes_per_document, "Chicago")

    file_name_category_clusters = "LDA_categories_topics_{0}.csv".format(num_lda_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, topic_labels, xs.categories_per_document, "Chicago")

    print "Finished processing lda clustering for dims: {0}".format(num_lda_topics)
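# extract_topic_labels is referenced above but not defined in this snippet. A plausible sketch,
# assuming lda.distance_matrix holds one gensim-style list of (topic_id, weight) pairs per
# document: label each document with its highest-weighted topic.
def extract_topic_labels(distance_matrix):
    labels = []
    for doc_topics in distance_matrix:
        # Fall back to -1 for documents with no topic assignments.
        best_topic = max(doc_topics, key=lambda pair: pair[1])[0] if doc_topics else -1
        labels.append(best_topic)
    return labels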
def main():
    my_tfidf = TfIdf.TfIdf("tfidf_corpus.txt", DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    files = os.listdir("txt")
    n_files = len(files) - 150
    print("Initializing information retrieval!\n")
    for i in range(n_files):
        print("Processing[" + str(int(i * 100 / n_files)) + "%]: (" + str(i) + ") " + files[i])
        file_act = open(".\\txt\\" + files[i], "r")
        string_act = ""
        for line in file_act.readlines():
            string_act += line
        my_tfidf.add_input_document(string_act)
    print("Processing finished!")
    my_tfidf.save_corpus_to_file("out_tfidf.txt", "out_stopword.txt")

    print("Starting query input!")
    while True:
        '''
        n = int(input("Choose a document number:"))
        if n >= 0 and n < n_files:
            print("File chosen: " + str(files[n]))
            file_act = open(".\\txt\\" + files[n], "r")
            string_act = ""
            for line in file_act.readlines():
                string_act += line
            q = input("Choose your query:")
            print(str(my_tfidf.get_tfipc(string_act, q)))
        '''
        q = input("Choose your query (#q to exit):")
        if q == "#q":
            break
        dic_tfipf = {}
        for i in range(n_files):
            print("Processing query[" + str(int(i * 100 / n_files)) + "%]: (" + str(i) + ") " + files[i])
            file_act = open(".\\txt\\" + files[i], "r")
            string_act = ""
            for line in file_act.readlines():
                string_act += line
            dic_tfipf[files[i]] = my_tfidf.get_tfipc(string_act, q)
            print(dic_tfipf[files[i]])
        sorted_dic_tfipf = OrderedDict(sorted(dic_tfipf.items(), key=itemgetter(1), reverse=True))
        print("\nSorted list of files with TF-IDF:\n")
        for file_name, tfipf_value in sorted_dic_tfipf.items():
            print("#" + str(tfipf_value) + " :->: " + file_name)
def test_on_data():
    import GwData
    import WordTokenizer
    import TfIdf
    import Converter
    import MatrixHelper

    data = GwData.GwData()
    tokenized = WordTokenizer.tokenize(data.documents)
    # Build the TF-IDF model over the tokenized documents
    tfidf = TfIdf.TfIdf(tokenized)
def set_tfidf():
    articles = mod.articles.find()
    res = []
    l = []
    for i in articles:
        l.append((i["title"], i["keywords"]))

    articles = mod.articles.find()
    for article in articles:
        #print article["title"]
        x = (article["title"], article["keywords"])
        tf_idf = {word: tf.tfidf(word, x[1], l) for word in x[1].keys()}
        mod.articles.update({"title": x[0]}, {"$set": {"tfidf": tf_idf}})
def __init__(self, num_topics, directory=None, min_sentence_length=3):
    if directory is None:
        directory = Settings.Settings().data_directory + "\\GlobalWarming"
    if not directory.endswith("\\"):
        directory += "\\"
    self.directory = directory

    logging.log(logging.INFO, "GwLsaClass: Processing Data from directory \n\t'%s'", directory)

    lsa_file = "{0}lsa_{1}.lsi".format(directory, num_topics)
    id2Word_file = "{0}id2Word.txt".format(directory, num_topics)

    if os.path.isfile(lsa_file):
        pass
        #TODO
        #self.__lsa__ = LsiModel.load(lsa_file)
        #self.id2Word = corpora.Dictionary.load(id2Word_file)
        #return

    lines = self.__loadLines__("globalwarming_specific_space11.txt")
    lines.append("")

    sentences = []
    current = ""
    for line in lines:
        current += line
        if len(line.strip()) == 0 and len(current.strip()) > 0:
            sent = nltk.sent_tokenize(current.strip().lower())
            sentences.extend(sent)
            current = ""
            #if len(sentences) > 100:
            #    print " >> STOPPING EARLY TO SPEED DEBUGGING, PLEASE REMOVE"
            #    break

    documents = []
    wt = WordTokenizer.WordTokenizer(min_word_count=3)
    tokenized = wt.tokenize(sentences)
    for tokenized_doc in tokenized:
        if len(tokenized_doc) >= min_sentence_length:
            documents.append(tokenized_doc)

    tfidf = TfIdf.TfIdf(documents)
    self.__lsa__ = Lsa.Lsa(tfidf, num_topics=num_topics)
    self.id2Word = tfidf.id2Word
    self.num_topics = num_topics
def search():
    phasil = {}
    a = 0
    for x in range(len(content)):
        data[x] = preprocessing.preprocess(content[x], queryinp)
    hasil = TfIdf.__init__(data, queryinp)
    for x in range(len(hasil)):
        phasil[x] = printhasil(content[x], hasil[x])
    for key, value in sorted(phasil.items(), key=lambda e: e[1][2], reverse=True):
        if value[2] > 0.002:
            a += 1
            print(value[0] + '\n' + value[1][:100] + '\n' + value[1][100:200] + '\n')
    print('got ' + str(a) + ' documents')
def generate(n):
    if n > 0:
        title = wikipedia.random(pages=1)
        try:
            page = wikipedia.WikipediaPage(title)
            summary = page.summary
            content = tb(page.content)
            tff = {word: tf.tf(word, content) for word in content.words}
            res = {}
            for i in tff:
                if tff[i] == 0.0:
                    continue
                else:
                    res[i] = tff[i]
            article = art.Article(title, summary, kw=res, tfidf={})
            mod.insert(article)
            generate(n - 1)
        except wikipedia.exceptions.DisambiguationError:
            generate(n)
        except bson.errors.InvalidDocument:
            generate(n)
def train(num_lsa_topics, k, window_size):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()
    tokenizer = dbnetwork.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)
    windowed_docs, window_indices = split_documents_into_windows(tokenized_docs, window_size)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(windowed_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    window_labels = clusterer.Run(full_lsa_matrix)

    # Extract the labels for the original sentences using the indices built earlier
    labels = pivot_window_labels(window_labels, window_indices)

    #OUTPUT
    file_name_code_clusters = "Windowed_LSA_SMCODES_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, xs.codes_per_document, "Chicago")

    file_name_category_clusters = "Windowed_LSA_Categories_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, xs.categories_per_document, "Chicago")

    logging.info("Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k))
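# split_documents_into_windows and pivot_window_labels are referenced above but not defined in
# this snippet. The sketch below is hypothetical: it chops each tokenized document into
# fixed-size windows while recording which document each window came from, and then assigns
# each original document the most common label among its windows.
from collections import Counter, defaultdict

def split_documents_into_windows(tokenized_docs, window_size):
    windowed_docs, window_indices = [], []
    for doc_index, doc in enumerate(tokenized_docs):
        # max(len(doc), 1) guarantees at least one (possibly empty) window per document,
        # so every document receives a label when the window labels are pivoted back.
        for start in range(0, max(len(doc), 1), window_size):
            windowed_docs.append(doc[start:start + window_size])
            window_indices.append(doc_index)
    return windowed_docs, window_indices

def pivot_window_labels(window_labels, window_indices):
    labels_by_doc = defaultdict(list)
    for label, doc_index in zip(window_labels, window_indices):
        labels_by_doc[doc_index].append(label)
    return [Counter(labels_by_doc[i]).most_common(1)[0][0] for i in sorted(labels_by_doc.keys())]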
def main():
    #SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    #TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    #NLTK Decision Tree
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix, initial_value=0)

    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))

    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]
    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))
    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        #RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results

        all_results += results
        #print "EXPLAIN:\n"
        #me.explain(condensed_data[0], 100)

    #DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion='entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
    # NOTE: this fragment starts mid-class. The return below is the tail of the single-word
    # projection method; the name project_word is assumed here so that project() below has a
    # concrete method to call rather than recursing on itself indefinitely.
    def project_word(self, wd):
        return self.distance_matrix[self.words[wd]].flatten().tolist()[0]

    def project(self, item):
        # A bare string is treated as a single word; a sequence is projected word by word,
        # skipping words that are not in the vocabulary.
        if type(item) == type(""):
            return self.project_word(item)
        l = []
        for w in item:
            if w in self.words:
                l.append(self.project_word(w))
        return l


if __name__ == "__main__":
    import GwData
    import TfIdf
    import WordTokenizer

    e = Embeddings()
    d = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(d.documents, min_word_count=1, stem=False, remove_stop_words=False)
    tf = TfIdf.TfIdf(tokenized_docs)

    ewds = set(e.words)
    dwds = set([w for w in tf.id2Word.values()])
    pass
def tfidf_vspace(self, tokenized_docs):
    tfidf = TfIdf.TfIdf(tokenized_docs)
    return (tfidf.distance_matrix, tfidf.id2Word)
# Get the count of the number of documents.
# We know that this number gets updated only once every 6 hours. This would need to be
# modified if we are going to work with a system in which articles are being added to the
# database constantly.
lengthOfCorpus = tableWithDocs.count()
#tableWithDocs.query_count(last_name__eq='Doe')
#rows = tableWithDocs.scan(body__contains='obama')
#index = 0
#for row in rows:
#    index += 1
#print index
#exit()

tdIdfCalculator = TfIdf.TfIdf(lengthOfCorpus, numDocsWithKeyword, keyword)

columnWithBody = sys.argv[4]
columnWithUniqueId = sys.argv[5]

rows = tableWithDocs.scan()

# This is the table used to store the TF-IDF values.
tdidfIndexTable = Table('TfIdfNew', connection=db)
#tdidfTl = tdidfIndexTable.query_2(word__eq='obama')
#for row in tdidfTl:
#    print row['articleId']
#    print row['tdIdfRoundTo7']
def train(num_lsa_topics, k):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()
    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    #TODO Partition into docs by LSA similarity
    txt_codes = xs.text_codes
    clusters_per_text_code = int(round(k / float(len(txt_codes))))

    #Extract the sm code rows from the LSA matrix
    smCodeRows = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)
    smCodeClassifications = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    smCodeCategoryClassifications = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)

    # Dict of <code, list[list]> - LSA row vectors
    logging.info("Partitioning LSA distance_matrix by Source Document")
    txtMatrixByCode = PartitionByCode.partition(full_lsa_matrix, xs, xs.text_codes)
    closest_docs = [find_closest_document(txtMatrixByCode, row) for row in smCodeRows]
    matrix_by_doc = collections.defaultdict(list)
    for i, doc in enumerate(closest_docs):
        matrix_by_doc[doc].append(smCodeRows[i])

    #Stores all cluster labels
    logging.info("Clustering within a document")
    all_smcode_labels = []
    label_offset = 0
    for doc in xs.text_codes:
        distance_matrix = matrix_by_doc[doc]

        #CLUSTER
        clusterer = Clusterer.Clusterer(clusters_per_text_code)
        labels = clusterer.Run(distance_matrix)
        all_smcode_labels = all_smcode_labels + [int(l + label_offset) for l in labels]
        label_offset += clusters_per_text_code

    #OUTPUT
    file_name_code_clusters = "Partition_By_Doc_LSA_SMCODES_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeCategoryClassifications, "Chicago")

    #TODO - filter the category and the docs per docs to the sm codes and output
    #file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.txt".format(k, num_lsa_topics)
    #ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
def lsa_vspace(self, tokenized_docs):
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, self.num_topics)
    return (lsa.distance_matrix, lsa.id2Word)
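# A minimal sketch (not part of the original sources) of how the lsa_vspace helper above could
# feed the k-means Clusterer used by the training scripts in this collection. The MatrixHelper
# and Clusterer module names and the Run() signature are assumed from the surrounding snippets.
def cluster_lsa_space(self, tokenized_docs, k):
    distance_matrix, id2word = self.lsa_vspace(tokenized_docs)
    # Convert the gensim-style matrix into plain Python lists, one row per document.
    rows = MatrixHelper.gensim_to_python_mdarray(distance_matrix, self.num_topics)
    clusterer = Clusterer.Clusterer(k)
    # Run() returns one cluster label per document.
    return clusterer.Run(rows)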
def train():
    #SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    #TOKENIZE
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    #SVM with a linear kernel (scikit-learn LinearSVC).
    #Keep xs (the GwData instance) distinct from the LSA feature matrix so that
    #xs.sm_codes and xs.labels_for(...) below still work.
    np_matrix = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix, initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5
    MIN_CODE_COUNT = 1

    codes = [c for c in xs.sm_codes
             # Exclude pure vague codes
             if c != "v" and
             # Exclude doc codes. Need the whole doc to classify them
             not c.startswith("s")]

    for code in codes:
        code_count = xs.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, xs.labels_for(code))
        classifier = svm.LinearSVC(C=1)

        recall, precision, f1_score = cross_validation_score(np_matrix, labels, classifier,
                                                             cv_folds, class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)

        all_results += results
        total_recall += recall
        total_precision += precision
        total_f1 += f1_score
        print results,

    #num_codes = len(xs.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes, total_f1 / num_codes)
    all_results += result
    print result

    #DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
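# cross_validation_score is referenced above but not defined in this snippet. A minimal sketch
# of what it might do, assuming scikit-learn is available: run k-fold cross-validation and
# return the mean recall, precision and F1 for the positive class_value.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, f1_score as f1_metric

def cross_validation_score(xs, ys, classifier, cv_folds, class_value=1.0):
    xs, ys = np.asarray(xs), np.asarray(ys)
    recalls, precisions, f1s = [], [], []
    for train_idx, test_idx in KFold(n_splits=cv_folds).split(xs):
        classifier.fit(xs[train_idx], ys[train_idx])
        predictions = classifier.predict(xs[test_idx])
        recalls.append(recall_score(ys[test_idx], predictions, pos_label=class_value))
        precisions.append(precision_score(ys[test_idx], predictions, pos_label=class_value))
        f1s.append(f1_metric(ys[test_idx], predictions, pos_label=class_value))
    return np.mean(recalls), np.mean(precisions), np.mean(f1s)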
def __init__(self, tokenized_docs, latentSpaceFactory, aggregation_method="doc",
             normalize=False, unit_vectors=False, term_frequency_only=False):
    """ Projects words to a vector space """
    tokenized_docs = [t for t in tokenized_docs if len(t) > 0]

    def pivot_by_words(dct, doc):
        for word1 in doc:
            for word2 in doc:
                if word1 != word2:
                    dct[word1].append(word2)

    """ Pivot Docs Around Words """
    d = defaultdict(list)
    if aggregation_method == "doc":
        """ term - doc space """
        for i, doc in enumerate(tokenized_docs):
            for word in doc:
                d[word].append(str(i))
    elif aggregation_method == "sentence":
        """ word space - words to words """
        for i, doc in enumerate(tokenized_docs):
            pivot_by_words(d, doc)
    elif aggregation_method.startswith("window:"):
        _, str_size = aggregation_method.split(":")
        win_size = int(str_size)
        print "Window Size:", win_size

        win_id = 0
        for doc in tokenized_docs:
            windows = split_into_windows(doc, win_size)
            for win in windows:
                for word in win:
                    d[word].append(str(win_id))
                win_id += 1
                #pivot_by_words(d, win)
        print "Size of windowed method:", len(d)
    else:
        raise Exception("Unexpected aggregation_method value: %s. "
                        "Accepted values are <'doc', 'sentence', 'window:n'>" % aggregation_method)

    tokenized_docs = d.values()
    self.word_to_index = dict()
    for i, wd in enumerate(d.keys()):
        self.word_to_index[wd] = i

    if term_frequency_only:
        tf = TermFrequency.TermFrequency(tokenized_docs)
        latent_space = latentSpaceFactory(tf, tokenized_docs)
    else:
        tfidf = TfIdf.TfIdf(tokenized_docs)
        latent_space = latentSpaceFactory(tfidf, tokenized_docs)

    """ Construct Vector Space """
    self.latent_vector = []
    for i, v in enumerate(latent_space):
        vec = [val for idx, val in v]
        """ Example Normalization """
        if unit_vectors:
            vec = unit_vector(vec)
        self.latent_vector.append(vec)

    """ Normalize """
    if normalize:
        tmp_arr = np.array(self.latent_vector)
        means = np.mean(tmp_arr, axis=0)
        sds = np.std(tmp_arr, axis=0)
        norm = (tmp_arr - means) / sds
        self.latent_vector = norm
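# split_into_windows is referenced above but not defined in this fragment. A minimal sketch,
# assuming fixed-size, non-overlapping windows over a tokenized document (the original code
# may instead use overlapping / sliding windows):
def split_into_windows(doc, win_size):
    # Chunk the token list into consecutive windows of at most win_size words each.
    return [doc[start:start + win_size] for start in range(0, len(doc), win_size)]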