"""POS-tag every review and append the tagger output to ``taged.data``.

For each entry of ``fo.reviews`` the whitespace-split tokens are passed to
the Stanford POS tagger; the result is JSON-encoded and appended to
``taged.data``, one line per review.
"""
import json
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
import os

# Set the Java environment variables:
# CLASSPATH is the path to the stanford-postagger.jar on your local disk.
# STANFORD_MODELS is the path to the tagger file on your local disk.
os.environ['CLASSPATH'] = (
    '/home/sol315/Downloads/stanford-postagger-2015-12-09/'
    'stanford-postagger.jar'
)
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("../input.json")
# Presumably populates fo.reviews and fo.num_lines, which are read below --
# verify against FileOperations.
fo.get_json()

# NOTE(review): STANFORD_MODELS above names the left3words model while the
# constructor asks for the bidirectional one -- confirm which model is
# actually intended/loaded.
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# Append mode: re-running the script adds to an existing taged.data rather
# than overwriting it. The ``with`` block guarantees the file is flushed and
# closed even if the tagger raises part-way through.
with open('taged.data', 'a') as f:
    for cur, line in enumerate(fo.reviews, 1):
        # Progress: current index and integer percentage done. Floor division
        # keeps the Python 2 integer result (assumes fo.num_lines is an int);
        # the single-argument print form works on Python 2 and 3 alike.
        print("%d %d %%" % (cur, cur * 100 // fo.num_lines))
        res = st.tag(line.split())
        f.write(json.dumps(res))
        f.write('\n')
import operator import os import re # set the java environment variables: # CLASSPATH is the path to the stanford-postagger.jar in your local disk # STANFORD_MODELS is the path to the tagger file in your local disk os.environ[ 'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar' os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger' fo = FileOperations("taged.data") tages = fo.get_taged_data() origin = FileOperations("../input.json") origin.get_json() stop = set(stopwords.words('english')) pairs = dict() attributes = dict() regex = re.compile('[^a-zA-Z]') #this for loop is only used for get the attributes of task 2 for line in tages: for tag in line: if tag[1] == 'NN' or tag[1] == 'NNS': tag[0] = regex.sub('', tag[0]).lower() if tag[0] in stop or len(tag[0]) <= 1: tag[1] = 'STOP' elif tag[0] in attributes: