Example #1
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


class StemmerTokenizer:
    """Callable tokenizer: lowercase, word-tokenize, then Porter-stem each token."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, string):
        tokens = word_tokenize(string.lower())
        # stem_word is the older NLTK spelling of this API; recent releases expose stem()
        return [self.stemmer.stem_word(t) for t in tokens]
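A callable class like this is typically handed to a vectorizer so that stemming happens during feature extraction. Below is a minimal usage sketch, assuming scikit-learn is installed; TfidfVectorizer and the two sample strings are illustrative additions, not part of the snippet above.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["Cats are running fast", "A cat runs faster than a dog"]

# pass an instance so the vectorizer calls StemmerTokenizer.__call__ on each document
vectorizer = TfidfVectorizer(tokenizer=StemmerTokenizer())
tfidf = vectorizer.fit_transform(corpus)
print(sorted(vectorizer.vocabulary_))  # stemmed terms such as 'cat' and 'run'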
Example #2
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words("english")
    result = word_tokenize(text)
    # note: the stopword check uses the original casing, and the string.digits /
    # string.punctuation checks only catch tokens that are substrings of those strings
    result = [stemmer.stem_word(word.lower()) for word in result
              if word not in stop
              and word not in string.punctuation
              and word not in string.digits]
    return result
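A quick usage sketch, assuming the NLTK 'punkt' and 'stopwords' data have already been downloaded; the sentence below is only illustrative.

# nltk.download('punkt'); nltk.download('stopwords')  # one-time data setup
print(preprocess("The striped bats were hanging on their feet"))
# expect stems such as 'stripe', 'bat', 'hang'; "The" slips past the stopword
# filter because the membership check happens before lowercasing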
Example #3
import nltk
from collections import defaultdict
from nltk.stem import PorterStemmer


def dialogue_act_features(post):
    words = nltk.word_tokenize(post)
    sentences = nltk.sent_tokenize(post)
    features = {
        # total tokens per distinct token (higher means more repetition)
        'word_diversity': len(words)/len(set(words)),
    }

    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem_word(w) for w in words]
        
    # one boolean feature per distinct stemmed word
    for word in set(stemmed_words):
        features['contains(%s)' % word.lower()] = True

    # check for presence/absence of specific words
    check_words = [
        'who', 'what', 'where', 'why', 'how',    # question words
        'love', 'hate', 'despis',                # emotional words ('despis' is the Porter stem of 'despise')
        ] 

    for word in check_words:
        features['contains(%s)' % word] = word in stemmed_words
         
    # punctuation
    for punctuation in ['?', '!', '!!', '?!', '"', '...', '.']:
        features['punctuation_count(%s)' % punctuation] = post.count(punctuation)

    # skip the part-of-speech features for now (slow, not helping much);
    # this early return leaves the POS-counting code below unreachable
    return features

    # get counts for parts of speech
    pos_count = defaultdict(int)
    for sentence in sentences:
        # tokenize the sentence into words and tag parts of speech
        sentence_words = nltk.word_tokenize(sentence)
        # - using the nltk parts-of-speech tagger for now
        #  (other options may be faster/more accurate)
        pos_sentence = nltk.pos_tag(sentence_words)
        for word, pos in pos_sentence:
            pos_count['pos_%s' % pos] += 1

    # include final counts by part of speech in the features
    features.update(pos_count)

    return features
Example #4
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


class PreProcess:
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stemmer = PorterStemmer()
        self.punct = string.punctuation
        self.digits = string.digits
        self.stop = stopwords.words("english")

    def process_sent(self, snt):
        snt = self.tokenizer(snt)
        snt = [self.stemmer.stem_word(wrd.lower()) for wrd in snt
               if wrd not in self.stop
               and wrd not in self.digits
               and wrd not in self.punct]
        return snt

    def process(self, snts):
        return [self.process_sent(snt) for snt in snts]
Example #6
    def stem(self, input_text):
        """Whitespace-tokenize, drop stopwords, and Porter-stem the longer alphabetic words."""
        tokenizer = RegexpTokenizer(r'\s+', gaps=True)
        stemmed_text = []
        stemmer = PorterStemmer()
        text = tokenizer.tokenize(str(input_text))
        filtered_text = self.stopword(text)
        for word in filtered_text:
            if word.isalpha():
                if len(word) > 4:
                    stemmed_text.append(stemmer.stem_word(word).lower())
                else:
                    stemmed_text.append(word.lower())
        # filter into a new list; removing items while iterating the same list
        # (as the original did) silently skips elements
        stemmed_text = [word for word in stemmed_text if len(word) >= 3]
        return stemmed_text
Example #7
""" General approach: serach for control-type structures which may be ambigious (with raising)
then search for those verbs to see if they exist in "There[ex] VERB" contexts
e.g we find "John seems to be beside himself today"
    so we search for "/[tT]here/ . (/VB/ < /^(seem)/)"
    if this returns any results, "seem" must be a raising verb
"""

from pdb import set_trace
import runTregex as tx
from nltk.stem import PorterStemmer
ps = PorterStemmer()
treebank_dir = "/home/chase/CompLing/stanford-tregex-2012-03-09/treebank_3/parsed/mrg/wsj/"

# `trees` is the result of an earlier Treebank query (not shown in this excerpt)
unfiltered = set()
for t in trees:
    unfiltered.add(ps.stem_word(t.matchTree.leaves()[0]).lower())

# this takes forever and isn't really too effective...
for word in unfiltered:
    pat = "(/[Tt]here/ > EX) . /^%s/"%word
    reload(tx) 
    trees = tx.Treebank(treebank_dir, pat)
    trees.run()
    if len(trees) > 0:
        print word

Example #8
        True_iD = True
        continue
    if inPage and line.find( "<text" ) != -1:
        inText = True
        continue
    if inPage and True_iD and line.find("<id>") != -1:
        iD.append(line[len("<id>") : -len("</id>")])
        True_iD = False

    if inPage and line.find( "/text" ) != -1:
        inText = False
        text = ' '.join(list)
        #Tokenizing Text For Each XML
        temp = text.decode("utf-8","ignore")
        temp = temp.replace(u'\ufeff',' ')
        temp_1 = re.sub(pattern," ",temp)
        temp_1 = temp_1.lower()
        res = []
        for x in temp_1.split():
            if x not in stopwords.words('english'):
                res.append(x)
        # `stem` is a PorterStemmer instance created earlier in the (truncated) script
        clean_text = " ".join(stem.stem_word(word) for word in res)
        tokens = nltk.word_tokenize(clean_text)
        cnt = Counter(tokens)
        print("[[%s]]\t[[%.0f]]" % (dict(cnt), int(iD[0])))
        list = []
        continue
    


Example #9
# (imports and `example_sentence` are defined earlier in the original script)
stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

filter_sentence = [w for w in words if w not in stop_words]

print(filter_sentence)

##### STEMMER EXAMPLE #####

ps = PorterStemmer()

example_words = ["pythone", "pythoner", "pythoning", "pythoned", "pythonly"]

for w in example_words:
    print(ps.stem_word(w))

##### SENTENCE TOKENIZER EXAMPLE #####

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# a PunktSentenceTokenizer is not callable; use its tokenize() method
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def proce_content():
    try:
        for w in tokenized:
            words = nltk.word_tokenize(w)
Example #10
def __stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem_word(token) for token in tokens]


def blah(str_inp):
    print str_inp
    reload(sys)
    sys.setdefaultencoding('utf-8')
    porter = PorterStemmer()
    stop_words = set(stopwords.words("english"))  # load stopwords
    tag = open("hashtags.txt", "r")
    text = open("tweet_text_lower.txt", "r")
    from collections import defaultdict
    distict_terms = defaultdict(int)
    hash_tags = defaultdict(int)

    count = -1
    ht_index_dict = {}
    ht_index_list = []
    x_size = 0
    count = 0
    with open("hashtags.txt", "r") as f:

        x = []
        for line in f:
            x.append(ast.literal_eval(line.strip()))

        for ht in x:
            x_size += 1
            ht_str = ""
            for ele in ht:
                ht_str += str(ele)
            if ht_str in ht_index_dict:
                ht_index_list.append(ht_index_dict[ht_str])
            else:
                ht_index_dict[ht_str] = count
                count += 1
                ht_index_list.append(ht_index_dict[ht_str])

            if (x_size == 1197):
                break

    # print ("x_size"+str(x_size))

    #print x

    #count =-1
    #for a in x:
    #    for t in a:
    #        if t.lower() not in hash_tags:
    #            count+=1
    #            hash_tags[t.lower()]=count'''

    tagsize = count

    #dictionary of terms
    count = -1
    with open("tweet_text_lower.txt", "r") as f:
        for rec in f:
            for word in rec.split():
                # print word
                if word[0] != '#':
                    if word.strip() not in stop_words:
                        # print word.strip()
                        if porter.stem_word(word.strip()) not in distict_terms:
                            count += 1
                            distict_terms[porter.stem_word(
                                word.strip())] = count

    # print f
    dictsize = count
    # print len(distict_terms), count
    #dictionary of hashtags
    # print len(hash_tags), count

    count = 0
    count2 = 0
    line_num = 0
    with open("tweet_text_lower.txt", "r") as f:
        for i, l in enumerate(f):
            pass
        line_num = i + 1

    mat = [[0] * (dictsize + 1) for i in range(1197)]  #replace by line_num+1
    # mat = [[0]]
    # print mat
    # print  text
    for (line, i) in zip(text, range(1197)):
        # print line
        for word in line.split():
            #print word
            if (word[0] != '#'):
                if word.strip() not in stop_words:
                    #for tags in ast.literal_eval(hash_.strip()):
                    #print distict_terms[word] + hash_tags[tags]
                    mat[i][distict_terms[porter.stem_word(word.lower())]] += 1

    #print mat
    document_freq = [0] * (dictsize + 1)
    #print document_freq

    for i in distict_terms:
        for j in range(1197):
            if mat[j][distict_terms[i]] >= 1:
                document_freq[distict_terms[i]] += 1

    #print line_num
    #print document_freq
    #for t in distict_terms:
    #    if distict_terms[t]==1:
    #        print t
    #print tagsize
    #print len(distict_terms)
    import math
    for i in distict_terms:
        for j in range(1197):
            mat[j][distict_terms[i]] = math.log(
                1 + mat[j][distict_terms[i]]) * (math.log(
                    (1197) / (document_freq[distict_terms[i]] + 1)))
            #print mat[distict_terms[i]][hash_tags[j]]

    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn.datasets import make_multilabel_classification
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.decomposition import PCA
    from sklearn.cross_decomposition import CCA
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from scipy import spatial

    from sklearn.metrics.pairwise import cosine_similarity

    # print line_num

    clf = KNeighborsClassifier(n_neighbors=4)
    #mat = CCA(n_components=2).fit(mat, x).transform(mat)
    #print feature
    #clf = OneVsRestClassifier(SVC(kernel='poly'))
    #mat=np.array(mat)
    #print  len(x)
    #print len(mat)
    #print x[0]

    #test =["not"]*1197
    #print test
    clf.fit(mat, ht_index_list)

    #print type(mat), type(Y)

    query = [0] * (dictsize + 1)
    for word in str_inp.split():
        if word[0] != '#':
            if word not in stop_words:
                if porter.stem_word(word) in distict_terms:
                    query[distict_terms[porter.stem_word(word)]] += 1

    for word in distict_terms:
        query[distict_terms[word]] = math.log(
            1 + query[distict_terms[word]]) * (math.log(
                (1197) / (document_freq[distict_terms[word]] + 1)))

    # scikit-learn expects a 2-D array of samples, so wrap the single query vector
    res_index = clf.predict([query])
    # print res_index

    for key in ht_index_dict:
        if (ht_index_dict[key] == res_index[0]):
            print key
            break

    return key
Example #12
import csv
import json
import sys

from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier


class TitleSim:
    def __init__(self, features_conf, features_deleted):
        print 'Start initialization'

        # initial model training
        features = features_deleted + features_conf
        target = [0 for x in range(len(features_deleted))] +\
                [1 for x in range(len(features_conf))]
        self.classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=1,
                                            min_samples_split=10,
                                            random_state=1)
        self.classifier.fit(features, target)

        # loading relational data which will be used
        paths = json.loads(open("SETTINGS.json").read())
        paper_doc = paths["paper_doc"]
        self.paper = dict([(entry[0], entry[1]) for entry in csv.reader(open(paper_doc))])

        # loading setting file
        self.paths = json.loads(open("SETTINGS.json").read())

        # loading word map of titles
        self.wordmap = self.load_titlemap()

        # do other initializations
        self.stemmer = PorterStemmer()
        print 'End initialization'


    def label_predict(self, fea_dict):
        # fea_dict is a dictionary whose key is 'user id'
        prob_dict = {}
        for key in fea_dict:
            features = [feature[1:] for feature in fea_dict[key]]
            predictions = self.classifier.predict_proba(features)[:,1]
            prob_dict[key]=[(item[0],prob) for item,prob in zip(fea_dict[key],predictions)]
        return prob_dict


    def load_titlemap(self):
        return dict([(entry[0],entry[1]) for entry in \
                csv.reader(open(self.paths["title_wordmap"]))])

    def calsim(self, author_doc, pairs):
        # calculate the similarity between titles
        title_features = []
        for pair in pairs:
            if pair[0] not in author_doc:
                print 'Key error.'
                sys.exit(1)
            title_features.append(self.calpairsim(author_doc[pair[0]], pair[1]))

        return title_features

    def calpairsim(self, doclist, target_doc):
        author_words = {}
        for doc in doclist:
            words = self.paper[doc].lower().split(' ')
            for word in words:
                stemmed_word = self.stemmer.stem_word(word)
                if stemmed_word in self.wordmap:
                    if stemmed_word in author_words:
                        author_words[stemmed_word] += 1
                    else:
                        author_words[stemmed_word] = 1

        doc_words = {}
        words = self.paper[target_doc].lower().split(' ')
        for word in words:
            stemmed_word = self.stemmer.stem_word(word)
            if stemmed_word in self.wordmap:
                if stemmed_word in doc_words:
                    doc_words[stemmed_word] += 1
                else:
                    doc_words[stemmed_word] = 1

        # number of common words
        comm_num = len(set(author_words.keys()) & set(doc_words.keys()))

        # overlap score (named `pearson` here): shared words / total distinct words in the two sets
        if (len(set(author_words.keys())) + len(set(doc_words.keys()))) != 0:
            pearson = comm_num*1.0/ (len(set(author_words.keys())) + len(set(doc_words.keys())))
        else:
            pearson = 0.0

        return [comm_num, pearson]