Example #1
def processPOS(s1, s2, stem):
    # Preprocess both strings, tokenize them, build aligned term vectors,
    # and return their cosine similarity.
    processed_s1 = preprocess_text.preprocess(s1, stem)
    processed_s2 = preprocess_text.preprocess(s2, stem)

    s1_text_obj = TextObj(processed_s1.decode('utf-8', 'replace'))
    s1_tokens = s1_text_obj.tokens

    s2_text_obj = TextObj(processed_s2.decode('utf-8', 'replace'))
    s2_tokens = s2_text_obj.tokens

    vecs = buildVector(s1_tokens, s2_tokens)
    return cosim(vecs[0], vecs[1])
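The helpers used above (preprocess_text.preprocess, TextObj, buildVector, cosim) live in the example's own project. As a rough, self-contained sketch of the same bag-of-words cosine-similarity idea, using only the standard library and purely illustrative names:

# Hypothetical standalone equivalent; names and tokenization are illustrative,
# not taken from the source project.
import math
from collections import Counter

def cosine_similarity(s1, s2):
    c1, c2 = Counter(s1.lower().split()), Counter(s2.lower().split())
    dot = sum(c1[w] * c2[w] for w in set(c1) | set(c2))
    norm1 = math.sqrt(sum(v * v for v in c1.values()))
    norm2 = math.sqrt(sum(v * v for v in c2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

sim = cosine_similarity("the cat sat", "the cat ran")  # ~0.67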
Example #2
 def Create_Agg_cluster(self,stem,stop,processing,remS): 
     
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.StringsFile)
     Allstrings=[rowdict_str[self.clusterdfield] for rowdict_str in Allrow_dicts]
     if self.POS=="ALL":
         Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings] 
     else:
         POS_Strings=list()
         if self.POS=="Noun_Verb_AdJ" :
             POS_List=["Noun","Adj","Verb"]
         else:    
             if  self.POS=="Noun_AdJ" :
                 POS_List=["Noun","Adj"] 
             else:
                 print "Error in Part of speech in function Create_Agg_cluster"
                 sys.exit(0)
                 
         
         for string in Allstrings:
             POS_String=Add_POS.ADDPOS_string(string,POS_List)["AllPOSstring"] 
             POS_Strings.append(POS_String)                  
         Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in POS_Strings]  
     
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     if self.vec=="CountVectorizer":
         vectorizer = CountVectorizer()
     else:
         if self.vec=="TFIdfCountVectorizer":
             vectorizer= TfidfVectorizer()      
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #=======================================================================
     # svd = TruncatedSVD(n_components=5, random_state=42)
     # lsa = make_pipeline(svd, Normalizer(copy=False))
     # term_doc = lsa.fit_transform(term_doc)
     # term_doc = svd.fit_transform(term_doc)
     #=======================================================================
     
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #------------------------------------------------ Array=term_doc.toarray
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster, affinity='euclidean')
     elif self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity=self.affinity)
     elif self.affinity=='l1':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity=self.affinity)
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #print type (term_doc)
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
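For reference, the core pipeline this method wraps (vectorize the strings, cluster with a chosen affinity, score with the silhouette coefficient) can be sketched on a toy corpus. The sketch follows the older scikit-learn API that the example itself uses (affinity=, renamed metric= in newer releases); the corpus and parameter values are illustrative only:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

docs = ["red apples and ripe pears", "green apples", "fast new cars", "slow old cars"]

term_doc = TfidfVectorizer().fit_transform(docs)        # sparse term-document matrix
agg = AgglomerativeClustering(n_clusters=2, linkage='average', affinity='cosine')
labels = agg.fit_predict(term_doc.toarray())            # dense input required
score = metrics.silhouette_score(term_doc.toarray(), labels, metric='cosine')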
Example #3
 def Create_Ext_Agg_cluster(self,stem,stop,processing,remS): 
      
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.ExtStringCSv)
     Allstrings=list()
     #Allstrings=[rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
     for row_dict in Allrow_dicts:
         if self.POS =="ALL_EXT":
             Stringrow=row_dict["Text_original"]+row_dict["Adj_Extended"]+row_dict["Noun_Extended"] +row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
         else:
             Stringrow=row_dict["Adj"]+row_dict["Adj_Extended"]+row_dict["Noun"]+row_dict["Noun_Extended"]#+row_dict["Verb"]#+row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
              
     Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings]  
      
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     vectorizer = CountVectorizer()    
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #------------------------------------------------ Array=term_doc.toarray
      
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster, affinity='euclidean')
     elif self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity='cosine')
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_Ext_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #term_doc_lsa = lsa.fit_transform(term_doc)
     print type(term_doc)
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
     print Res_Labels
     print("n_samples: %d, n_features: %d" % term_doc.shape) 
Example #4
if not os.path.isfile('ldamodel' + str(NUM_TOPICS) + '.lda'):
    sys.exit('ERR: LDA model file ./ldamodel' + str(NUM_TOPICS) +
             '.lda not found!')

print 'Loading LDA model from file ./ldamodel' + str(NUM_TOPICS) + '.lda ...',
sys.stdout.flush()
ldamodel = models.LdaModel.load('ldamodel' + str(NUM_TOPICS) + '.lda')
print ' Done!'

# transform ALL documents into LDA space
target_labels = {}
for img_path in train_dict.keys():

    with open(db_dir + train_dict[img_path]) as fp:
        raw = fp.read()

    tokens = preprocess(raw)
    bow_vector = dictionary.doc2bow(tokens)
    #lda_vector = ldamodel[bow_vector]
    lda_vector = ldamodel.get_document_topics(bow_vector,
                                              minimum_probability=None)
    lda_vector = sorted(lda_vector, key=lambda x: x[1], reverse=True)
    topic_prob = {}
    for instance in lda_vector:
        topic_prob[instance[0]] = instance[1]
    labels = []
    for topic_num in range(0, NUM_TOPICS):
        if topic_num in topic_prob.keys():
            labels.append(topic_prob[topic_num])
        else:
            labels.append(0)
    target_labels[img_path] = labels
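    # Alternative to the padding block above (illustrative, not from the source):
    # gensim returns sparse (topic_id, probability) pairs, so a dict lookup with
    # a default of 0 yields the same fixed-length vector.
    #   topic_prob = dict(lda_vector)
    #   labels = [topic_prob.get(t, 0) for t in range(NUM_TOPICS)]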

if not os.path.isdir(xml_dir):
    sys.exit('ERR: Dataset metadata folder ' + xml_dir + ' not found!')

if not os.path.isfile(train_dict_path):
    sys.exit('ERR: Train dictionary file ' + train_dict_path + ' not found!')

with open(train_dict_path) as f:
    train_dict = json.load(f)

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    for text_path in train_dict.values():
        with open(db_dir + text_path) as f:
            raw = f.read()
        # add tokens to corpus list
        texts.append(preprocess(raw))
        sys.stdout.write(
            '\rCreating a list of tokenized documents: %d/%d documents processed...'
            % (len(texts), len(train_dict.values())))
        sys.stdout.flush()
    sys.stdout.write(' Done!\n')

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
Example #6
import json
import os
import sys

from gensim import corpora, models

# NOTE: xml_dir, train_dict_path and db_dir are configured elsewhere in the
# original script; only this excerpt is shown.

sys.path.insert(1, '../LDA/')
from preprocess_text import preprocess

NUM_TOPICS = 2

print('Learning LDA topic model with ' + str(NUM_TOPICS) + ' topics')

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    with open('data_pairs.json') as f:
        data_pairs = json.load(f)
    for data_pair in data_pairs:
        texts.append(preprocess(str(data_pair['text'])))
        sys.stdout.write("\rNum texts processed: " + str(len(texts)))
        sys.stdout.flush()
    del data_pairs
    print("")

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
if not os.path.isdir(xml_dir):
    sys.exit('ERR: Dataset metadata folder '+xml_dir+' not found!')

if not os.path.isfile(train_dict_path):
    sys.exit('ERR: Train dictionary file '+train_dict_path+' not found!')

with open(train_dict_path) as f:
    train_dict = json.load(f)

if not os.path.isfile('./dictionary.dict') or not os.path.isfile('./bow.mm'):
    # list for tokenized documents in loop
    texts = []
    for text_path in train_dict.values():
        with open(db_dir+text_path) as f: raw = f.read()
        # add tokens to corpus list
        texts.append(preprocess(raw))
        sys.stdout.write('\rCreating a list of tokenized documents: %d/%d documents processed...' % (len(texts),len(train_dict.values())))
        sys.stdout.flush()
    sys.stdout.write(' Done!\n')

# turn our tokenized documents into a id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into a id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print ' Done!'
# load LDA model
if not os.path.isfile('ldamodel'+str(NUM_TOPICS)+'.lda'):
    sys.exit('ERR: LDA model file ./ldamodel'+str(NUM_TOPICS)+'.lda not found!')

print 'Loading LDA model from file ./ldamodel'+str(NUM_TOPICS)+'.lda ...',
sys.stdout.flush()
ldamodel = models.LdaModel.load('ldamodel'+str(NUM_TOPICS)+'.lda')
print ' Done!'

# transform ALL documents into LDA space
target_labels = {}
for img_path in train_dict.keys():

    with open(db_dir+train_dict[img_path]) as fp: raw = fp.read()

    tokens = preprocess(raw)
    bow_vector = dictionary.doc2bow(tokens)
    #lda_vector = ldamodel[bow_vector]
    lda_vector = ldamodel.get_document_topics(bow_vector, minimum_probability=None)
    lda_vector = sorted(lda_vector,key=lambda x:x[1],reverse=True)
    topic_prob = {}
    for instance in lda_vector:
        topic_prob[instance[0]] = instance[1]
    labels = []
    for topic_num in range(0, NUM_TOPICS):
        if topic_num in topic_prob.keys():
            labels.append(topic_prob[topic_num])
        else:
            labels.append(0)
    target_labels[img_path] = labels
    sys.stdout.write('\r%d/%d text documents processed...' % (len(target_labels),len(train_dict.keys())))
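The excerpt checks for and loads the saved LDA model file, but the training call itself falls outside the snippet. With gensim, and assuming the texts list and dictionary built above, the missing step would look roughly like the following (passes is an illustrative value, not from the source):

corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words corpus
corpora.MmCorpus.serialize('./bow.mm', corpus)          # matches the ./bow.mm check above
ldamodel = models.LdaModel(corpus, num_topics=NUM_TOPICS,
                           id2word=dictionary, passes=10)
ldamodel.save('ldamodel' + str(NUM_TOPICS) + '.lda')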
Example #9
from pomegranate import *

from preprocess_text import preprocess
from perturb_text import perturb

#input_text = "La-vita-sul-pianeta-Marte.txt"
input_text = "books.txt"

# call the function that preprocesses the input text
preprocess(input_text)

# call the function that introduces noise into the text
perturb('preprocessed_text.txt')

# define states list -> states = ['a', 'b', ..., 'z']
states = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# initialize prior probability distribution dictionary and end probability
# distribution dictionary. That is:
# -> start_probs = {'a': 0, 'b': 0, ..., 'z': 0}
# -> end_probs = {'a': 0, 'b': 0, ..., 'z': 0}
# start_probs[x] is the probability that a word starts with char x
# end_probs[x] is the probability that a word ends with char x
start_probs = {}
end_probs = {}
for state in states:
    start_probs[state] = 0
    end_probs[state] = 0
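
A possible next step, shown only as a sketch: count word-initial and word-final letters in the perturbed text and normalize the counts into probabilities. The file name below is an assumption (the output of perturb is not shown in the source).

# Assumption: perturb() wrote its output to 'perturbed_text.txt'.
with open('perturbed_text.txt') as f:
    words = f.read().split()
for word in words:
    if word[0] in start_probs:
        start_probs[word[0]] += 1
    if word[-1] in end_probs:
        end_probs[word[-1]] += 1
total_start = float(sum(start_probs.values())) or 1.0
total_end = float(sum(end_probs.values())) or 1.0
for state in states:
    start_probs[state] /= total_start
    end_probs[state] /= total_end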

# initialize transition probability distribution dictionary and observation
# probability distribution dictionary. That is: