def processPOS(s1, s2, stem):
    processed_s1 = preprocess_text.preprocess(s1, stem)
    processed_s2 = preprocess_text.preprocess(s2, stem)
    s1_text_obj = TextObj(processed_s1.decode('utf-8', 'replace'))
    s1_tokens = s1_text_obj.tokens
    s2_text_obj = TextObj(processed_s2.decode('utf-8', 'replace'))
    s2_tokens = s2_text_obj.tokens
    vecs = buildVector(s1_tokens, s2_tokens)
    return cosim(vecs[0], vecs[1])
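# --- Hedged sketch (assumption, not from the original source) ---
# processPOS relies on two helpers, buildVector and cosim, that are not shown
# in this section. One plausible implementation builds aligned term-frequency
# vectors over the union vocabulary of both token lists and computes their
# cosine similarity:
import math
from collections import Counter

def buildVector(tokens1, tokens2):
    # Align both token lists on a shared vocabulary so the vectors are comparable.
    vocab = sorted(set(tokens1) | set(tokens2))
    c1, c2 = Counter(tokens1), Counter(tokens2)
    v1 = [c1[term] for term in vocab]
    v2 = [c2[term] for term in vocab]
    return v1, v2

def cosim(v1, v2):
    # Cosine similarity of two equal-length count vectors; 0.0 if either is empty.
    dot = sum(a * b for a, b in zip(v1, v2))
    n1 = math.sqrt(sum(a * a for a in v1))
    n2 = math.sqrt(sum(b * b for b in v2))
    return dot / (n1 * n2) if n1 and n2 else 0.0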
def Create_Agg_cluster(self, stem, stop, processing, remS):
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.StringsFile)
    Allstrings = [rowdict_str[self.clusterdfield] for rowdict_str in Allrow_dicts]
    if self.POS == "ALL":
        Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                              for string_text in Allstrings]
    else:
        POS_Strings = list()
        if self.POS == "Noun_Verb_AdJ":
            POS_List = ["Noun", "Adj", "Verb"]
        elif self.POS == "Noun_AdJ":
            POS_List = ["Noun", "Adj"]
        else:
            print "Error in Part of speech in function Create_Agg_cluster"
            sys.exit(0)
        for string in Allstrings:
            POS_String = Add_POS.ADDPOS_string(string, POS_List)["AllPOSstring"]
            POS_Strings.append(POS_String)
        Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                              for string_text in POS_Strings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text) for text in Allstrings_process]
    if self.vec == "CountVectorizer":
        vectorizer = CountVectorizer()
    elif self.vec == "TFIdfCountVectorizer":
        vectorizer = TfidfVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)
    #=======================================================================
    # Optional LSA dimensionality reduction (kept disabled):
    # svd = TruncatedSVD(n_components=5, random_state=42)
    # lsa = make_pipeline(svd, Normalizer(copy=False))
    # term_doc = lsa.fit_transform(term_doc)
    # term_doc = svd.fit_transform(term_doc)
    #=======================================================================
    feature_names = vectorizer.get_feature_names()
    Array = term_doc.toarray()
    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster, affinity='euclidean')
    elif self.affinity in ('cosine', 'l1'):
        # Non-Euclidean affinities require a non-Ward linkage.
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average', affinity=self.affinity)
    Res_Labels = Agg_cluster.fit_predict(term_doc.toarray())
    self.cluster_tup_list = self.tuple_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)
    self.metric = metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
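# --- Hedged usage sketch (assumption, not from the original source) ---
# A minimal, standalone illustration of the clustering step used by
# Create_Agg_cluster: vectorize a few short strings with CountVectorizer,
# cluster them with average-linkage agglomerative clustering under the cosine
# affinity, and score the result with the silhouette coefficient. Written for
# an older scikit-learn in which AgglomerativeClustering accepts `affinity=`.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

docs = ["the cat sat on the mat",
        "a cat and a kitten",
        "stock markets fell sharply",
        "shares and markets dropped"]

term_doc = CountVectorizer().fit_transform(docs)
clusterer = AgglomerativeClustering(n_clusters=2, linkage='average', affinity='cosine')
labels = clusterer.fit_predict(term_doc.toarray())
print("labels:", labels)
print("silhouette:", metrics.silhouette_score(term_doc.toarray(), labels, metric='cosine'))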
def Create_Ext_Agg_cluster(self, stem, stop, processing, remS):
    Allrow_dicts = data_pkg.FileHandling.read_csv(self.ExtStringCSv)
    Allstrings = list()
    for row_dict in Allrow_dicts:
        if self.POS == "ALL_EXT":
            Stringrow = (row_dict["Text_original"] + row_dict["Adj_Extended"]
                         + row_dict["Noun_Extended"] + row_dict["Verb_Extended"])
        else:
            # Only adjectives and nouns (plain and extended); verbs are currently excluded.
            Stringrow = (row_dict["Adj"] + row_dict["Adj_Extended"]
                         + row_dict["Noun"] + row_dict["Noun_Extended"])
            # + row_dict["Verb"] + row_dict["Verb_Extended"]
        Allstrings.append(Stringrow)
    Allstrings_process = [preprocess_text.preprocess(string_text, stem, stop)
                          for string_text in Allstrings]
    if remS:
        Allstrings_process = [preprocess_text.removeS(text) for text in Allstrings_process]
    vectorizer = CountVectorizer()
    term_doc = vectorizer.fit_transform(Allstrings_process)
    feature_names = vectorizer.get_feature_names()
    Array = term_doc.toarray()
    if self.affinity == 'euclidean':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster, affinity='euclidean')
    elif self.affinity == 'cosine':
        Agg_cluster = AgglomerativeClustering(n_clusters=self.num_cluster,
                                              linkage='average', affinity='cosine')
    Res_Labels = Agg_cluster.fit_predict(term_doc.toarray())
    self.cluster_tup_list = self.tuple_Ext_cluster_doc(Res_Labels, Allstrings, Allrow_dicts)
    print type(term_doc)
    self.metric = metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
    print Res_Labels
    print("n_samples: %d, n_features: %d" % term_doc.shape)
    sys.exit('ERR: LDA model file ./ldamodel' + str(NUM_TOPICS) + '.lda not found!')
print 'Loading LDA model from file ./ldamodel' + str(NUM_TOPICS) + '.lda ...',
sys.stdout.flush()
ldamodel = models.LdaModel.load('ldamodel' + str(NUM_TOPICS) + '.lda')
print ' Done!'

# transform ALL documents into LDA space
target_labels = {}
for img_path in train_dict.keys():
    with open(db_dir + train_dict[img_path]) as fp:
        raw = fp.read()
    tokens = preprocess(raw)
    bow_vector = dictionary.doc2bow(tokens)
    #lda_vector = ldamodel[bow_vector]
    lda_vector = ldamodel.get_document_topics(bow_vector, minimum_probability=None)
    lda_vector = sorted(lda_vector, key=lambda x: x[1], reverse=True)
    topic_prob = {}
    for instance in lda_vector:
        topic_prob[instance[0]] = instance[1]
    # Build a dense per-topic probability vector (0 for topics not returned).
    labels = []
    for topic_num in range(0, NUM_TOPICS):
        if topic_num in topic_prob.keys():
            labels.append(topic_prob[topic_num])
        else:
            labels.append(0)
    target_labels[img_path] = labels
    sys.exit('ERR: Dataset metadata folder ' + xml_dir + ' not found!')
if not os.path.isfile(train_dict_path):
    sys.exit('ERR: Train dictionary file ' + train_dict_path + ' not found!')

with open(train_dict_path) as f:
    train_dict = json.load(f)

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    for text_path in train_dict.values():
        with open(db_dir + text_path) as f:
            raw = f.read()
        # add tokens to corpus list
        texts.append(preprocess(raw))
        sys.stdout.write(
            '\rCreating a list of tokenized documents: %d/%d documents processed...'
            % (len(texts), len(train_dict.values())))
        sys.stdout.flush()
    sys.stdout.write(' Done!\n')

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
import os
import sys
import json

sys.path.insert(1, '../LDA/')
from gensim import corpora
from preprocess_text import preprocess

NUM_TOPICS = 2

print('Learning LDA topic model with ' + str(NUM_TOPICS) + ' topics')

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    with open('data_pairs.json') as f:
        data_pairs = json.load(f)
    for data_pair in data_pairs:
        texts.append(preprocess(str(data_pair['text'])))
        sys.stdout.write("\rNum texts processed: " + str(len(texts)))
        sys.stdout.flush()
    del data_pairs
    print("")

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
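# --- Hedged sketch (assumption, not from the original source) ---
# The LDA training step itself is not shown in this section. With gensim the
# usual continuation is to build a bag-of-words corpus from `texts` and fit an
# LdaModel with NUM_TOPICS topics, saving it under the filename the loading
# code elsewhere expects (./ldamodelN.lda). `passes=20` is an illustrative value.
from gensim import models

if not os.path.isfile('./ldamodel' + str(NUM_TOPICS) + '.lda'):
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    ldamodel = models.LdaModel(corpus, num_topics=NUM_TOPICS,
                               id2word=dictionary, passes=20)
    ldamodel.save('./ldamodel' + str(NUM_TOPICS) + '.lda')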
if not os.path.isdir(xml_dir):
    sys.exit('ERR: Dataset metadata folder ' + xml_dir + ' not found!')
if not os.path.isfile(train_dict_path):
    sys.exit('ERR: Train dictionary file ' + train_dict_path + ' not found!')

with open(train_dict_path) as f:
    train_dict = json.load(f)

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    for text_path in train_dict.values():
        with open(db_dir + text_path) as f:
            raw = f.read()
        # add tokens to corpus list
        texts.append(preprocess(raw))
        sys.stdout.write('\rCreating a list of tokenized documents: %d/%d documents processed...'
                         % (len(texts), len(train_dict.values())))
        sys.stdout.flush()
    sys.stdout.write(' Done!\n')

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print ' Done!'
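# --- Hedged sketch (assumption, not from the original source) ---
# The scripts above test for ./bow.mm, but its creation is not part of this
# section. With gensim, a bag-of-words corpus is typically serialized in
# Matrix Market format roughly as follows:
if not os.path.isfile('./bow.mm'):
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    corpora.MmCorpus.serialize('./bow.mm', bow_corpus)
else:
    bow_corpus = corpora.MmCorpus('./bow.mm')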
# load LDA model
if not os.path.isfile('ldamodel' + str(NUM_TOPICS) + '.lda'):
    sys.exit('ERR: LDA model file ./ldamodel' + str(NUM_TOPICS) + '.lda not found!')
print 'Loading LDA model from file ./ldamodel' + str(NUM_TOPICS) + '.lda ...',
sys.stdout.flush()
ldamodel = models.LdaModel.load('ldamodel' + str(NUM_TOPICS) + '.lda')
print ' Done!'

# transform ALL documents into LDA space
target_labels = {}
for img_path in train_dict.keys():
    with open(db_dir + train_dict[img_path]) as fp:
        raw = fp.read()
    tokens = preprocess(raw)
    bow_vector = dictionary.doc2bow(tokens)
    #lda_vector = ldamodel[bow_vector]
    lda_vector = ldamodel.get_document_topics(bow_vector, minimum_probability=None)
    lda_vector = sorted(lda_vector, key=lambda x: x[1], reverse=True)
    topic_prob = {}
    for instance in lda_vector:
        topic_prob[instance[0]] = instance[1]
    labels = []
    for topic_num in range(0, NUM_TOPICS):
        if topic_num in topic_prob.keys():
            labels.append(topic_prob[topic_num])
        else:
            labels.append(0)
    target_labels[img_path] = labels
    sys.stdout.write('\r%d/%d text documents processed...'
                     % (len(target_labels), len(train_dict.keys())))
from pomegranate import *
from preprocess_text import preprocess
from perturb_text import perturb

#input_text = "La-vita-sul-pianeta-Marte.txt"
input_text = "books.txt"

# call function that preprocesses the input text
preprocess(input_text)
# call function that introduces noise into the text
perturb('preprocessed_text.txt')

# define states list -> states = ['a', 'b', ..., 'z']
states = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# initialize prior probability distribution dictionary and end probability
# distribution dictionary. That is:
# -> start_probs = {'a': 0, 'b': 0, ..., 'z': 0}
# -> end_probs = {'a': 0, 'b': 0, ..., 'z': 0}
# start_probs[x] is the probability that a word starts with char x
# end_probs[x] is the probability that a word ends with char x
start_probs = {}
end_probs = {}
for state in states:
    start_probs[state] = 0
    end_probs[state] = 0

# initialize transition probability distribution dictionary and observation
# probability distribution dictionary. That is:
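# --- Hedged sketch (assumption, not from the original source) ---
# Once start_probs, end_probs, and the transition/observation dictionaries have
# been estimated from the corpus, a character-level HMM can be assembled with
# the classic pomegranate API (HiddenMarkovModel, State, DiscreteDistribution).
# The names trans_probs and obs_probs below are hypothetical placeholders for
# the dictionaries initialized later in the original script.
def build_hmm(states, start_probs, end_probs, trans_probs, obs_probs):
    model = HiddenMarkovModel('char-hmm')
    hmm_states = {}
    for s in states:
        # obs_probs[s] maps each observed (noisy) character to its emission probability
        hmm_states[s] = State(DiscreteDistribution(obs_probs[s]), name=s)
        model.add_state(hmm_states[s])
    for s in states:
        model.add_transition(model.start, hmm_states[s], start_probs[s])
        model.add_transition(hmm_states[s], model.end, end_probs[s])
        for t in states:
            model.add_transition(hmm_states[s], hmm_states[t], trans_probs[s][t])
    model.bake()
    return model

# Hypothetical usage: most likely hidden character sequence for a noisy word
# logp, path = build_hmm(states, start_probs, end_probs, trans_probs, obs_probs).viterbi(list("rhe"))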