-
Notifications
You must be signed in to change notification settings - Fork 1
/
Find Attributes.py
175 lines (135 loc) · 4.93 KB
/
Find Attributes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 14 19:34:40 2016
@author: Nick Cohron
The script includes the following pre-processing steps for text:
- Sentence Splitting
- Term Tokenization
- Ngrams
- POS tagging
The run function extracts n-grams of various sizes that pair an <ADJECTIVE> with a <NOUN>
POS tags list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
"""
import csv
import re
from operator import itemgetter
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import load
#==============================================================================
# # read file with review subset and return one corpus of reviews
# def read_file(in_path):
# reviews = []
# with open(in_path, 'rb') as f:
# reader = csv.reader(f)
# for row in reader:
# reviews.append(row[2])
#
# return reviews
#==============================================================================
# return all the 'adjective noun' n-grams
def getNounAdjNgrams(terms, nouns, adjectives, n):
    """Return every n-gram of *terms* that starts with an adjective
    followed by a noun.

    Merge conflict resolved in favor of the (terms, nouns, adjectives, n)
    parameter order, which is what the call site in run() uses; the other
    branch's order would silently swap the two sets.

    terms      -- tokenized sentence (list of lowercase strings)
    nouns      -- set of tokens tagged as nouns
    adjectives -- set of tokens tagged as adjectives
    n          -- gram size; only positions 0 and 1 are inspected, so n >= 2

    Returns a list of n-tuples of tokens.
    """
    result = []
    # sliding window of n consecutive tokens; stdlib zip() replaces
    # nltk.util.ngrams and yields the same tuples
    for gram in zip(*(terms[i:] for i in range(n))):
        # keep grams shaped 'adjective noun ...'
        if gram[0] in adjectives and gram[1] in nouns:
            result.append(gram)
    return result
# return all the terms that belong to a specific POS type
def getPOSterms(terms, POStags, tagger):
    """Map each POS-tag prefix in POStags to the set of terms whose
    assigned tag starts with that prefix.

    terms   -- tokenized sentence (list of strings)
    POStags -- list of tag prefixes of interest, e.g. ['JJ', 'NN']
    tagger  -- object exposing .tag(terms) -> list of (term, tag) pairs

    Returns a dict {tag_prefix: set(matching terms)}.
    """
    # POS-tag the tokenized sentence
    tagged_pairs = tagger.tag(terms)
    # one (possibly empty) bucket per requested tag prefix
    POSterms = {prefix: set() for prefix in POStags}
    for term, full_tag in tagged_pairs:
        for prefix in POStags:
            # startswith lets 'NNS', 'NNP', ... match the 'NN' bucket
            if full_tag.startswith(prefix):
                POSterms[prefix].add(term)
    return POSterms
# main body of program
def run(path):
    """Read reviews from the CSV at *path* (review text in column index 2),
    POS-tag each sentence, and return all 2-grams where an adjective is
    immediately followed by a noun.

    Fixes over the original:
    - the original opened `in_path` (a global) instead of the `path`
      parameter, raising NameError when called with any other path;
    - a stray `continue` after a successful sent_tokenize skipped the
      whole sentence loop, so the function always returned [];
    - `sentences` could be unbound after the bare `except`; tokenization
      failures now skip just that row;
    - the unused `stopLex` local was removed.

    Returns a list of (adjective, noun) tuples accumulated over all rows.
    """
    adjWithNoun = []
    # build a POS tagger from the bundled treebank model
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    # NOTE(review): original opened in 'rb'; text mode is required for csv
    # under Python 3 and works for plain-ASCII CSVs under Python 2
    with open(path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            review = row[2]
            print(review)
            try:
                # split the review into sentences
                sentences = sent_tokenize(review)
            except Exception:
                # best-effort: skip rows that cannot be tokenized
                print("Oops! That was not tokenizable. Try again...")
                continue
            print('NUMBER OF SENTENCES: %d' % len(sentences))
            for sentence in sentences:
                print(sentence)
                # replace chars that are not letters or digits with a space
                sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)
                # collapse duplicate spaces
                sentence = re.sub(' +', ' ', sentence).strip()
                # tokenize the lowercased sentence
                terms = nltk.word_tokenize(sentence.lower())
                print(terms)
                # POS tags of interest: adjectives and nouns
                POStags = ['JJ', 'NN']
                POSterms = getPOSterms(terms, POStags, tagger)
                adjectives = POSterms['JJ']
                nouns = POSterms['NN']
                # collect adjective-noun 2-grams from this sentence
                n = 2
                adjWithNoun += getNounAdjNgrams(terms, nouns, adjectives, n)
    return adjWithNoun
#tag_list = nltk.pos_tag(sentence)
if __name__ == '__main__':
    # file with raw text reviews
    # (merge conflict resolved: kept the local HEAD path; adjust per machine)
    in_path = '/Users/Nick/Stevens Institute of Technology/Web Analytics/Final Project/data_repo/chinese_reviews.csv'
    # send raw text for processing of attributes
    print(run(in_path))