示例#1
0
import json
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
import os

# Set the Java environment variables for the Stanford POS tagger:
# CLASSPATH is the path to stanford-postagger.jar on the local disk
# STANFORD_MODELS is the path to the tagger model file on the local disk
os.environ[
    'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'
# NOTE(review): STANFORD_MODELS names the left3words model, while the tagger
# below is constructed with the bidirectional model -- confirm which one is
# intended and that the lookup actually resolves.

fo = FileOperations("../input.json")
fo.get_json()
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# Append one JSON-encoded tagging result per review, one per line.
# The `with` block guarantees the output file is flushed and closed even if
# tagging raises part-way through (the handle was previously never closed).
with open('taged.data', 'a') as f:
    for cur, line in enumerate(fo.reviews, 1):
        # Progress report: reviews processed so far and percentage done.
        # Written as a single formatted string so the output is identical
        # under the Python 2 print statement and valid under Python 3.
        print('%s %s %%' % (cur, cur * 100 / fo.num_lines))
        res = st.tag(line.split())
        f.write(json.dumps(res))
        f.write('\n')
示例#2
0
import operator
import os
import re

# Java environment variables for the Stanford tagger:
#   CLASSPATH       -> path to stanford-postagger.jar on the local disk
#   STANFORD_MODELS -> path to the tagger model file on the local disk
os.environ.update({
    'CLASSPATH': '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar',
    'STANFORD_MODELS': './models/english-left3words-distsim.tagger',
})

# Tagged data read back from "taged.data" (presumably one list of
# [word, tag] pairs per review -- confirm against FileOperations).
fo = FileOperations("taged.data")
tages = fo.get_taged_data()

# The original input reviews.
origin = FileOperations("../input.json")
origin.get_json()

# NOTE(review): `stopwords` is not imported in the visible part of this
# script; presumably `from nltk.corpus import stopwords` -- confirm.
stop = set(stopwords.words('english'))

pairs = {}
attributes = {}
# Matches any character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')

# This loop is only used to collect the attributes for task 2.
for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
            tag[0] = regex.sub('', tag[0]).lower()
            if tag[0] in stop or len(tag[0]) <= 1:
                tag[1] = 'STOP'
            elif tag[0] in attributes: