# Older NLTK versions expose the Stanford tagger (with batch_tag) here.
from nltk.tag.stanford import POSTagger


def stanford_batch_tag(sentences):
    """Use the Stanford tagger to batch-tag a list of tokenized sentences."""
    import src.experiment.path as path
    # Replace the model path and tagger path of the Stanford tagger with the
    # ones on your machine (two helper functions are used here; you can hard
    # code the paths if you prefer).
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path())
    return tagger.batch_tag(sentences)
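# Hedged usage sketch (not part of the original module): assuming
# src.experiment.path resolves to a valid Stanford model and jar on this
# machine, stanford_batch_tag can be called with pre-tokenized sentences.
if __name__ == '__main__':
    sentences = [['This', 'is', 'a', 'test', '.'],
                 ['Another', 'tokenized', 'sentence', '.']]
    for tagged in stanford_batch_tag(sentences):
        # each item is a list of (token, tag) tuples
        print tagged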
__author__ = 'Luke'

import cPickle as pickle
import random

from nltk.tag.stanford import POSTagger


def tokenise_tweet():
    pass


objective_tweets = pickle.load(open('../../Data/Training/objective-tweets.obj'))
subjective_tweets = pickle.load(open('../../Data/Training/subjective-tweets.obj'))

objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]

total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)
cut_off = int(0.85 * len(total_set))  # train/test split point (unused below)

tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar',
                   encoding='utf8')
tagged_sentences = tagger.batch_tag([sent.split() for sent, label in total_set])
target_values = [label for sent, label in total_set]

to_disk = zip(tagged_sentences, target_values)
pickle.dump(to_disk,
            open('../../Data/Training/sentiment_detector_training.obj', 'wb'))
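# Hedged sketch (illustrative only): reading the pickled training data back.
# The path mirrors the dump above; each entry is a (tagged_sentence, label)
# pair as produced by zip() above.
training = pickle.load(
    open('../../Data/Training/sentiment_detector_training.obj', 'rb'))
for tagged_sentence, label in training[:3]:
    print label, tagged_sentence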
import itertools
import logging

from nltk.tag.stanford import POSTagger
from sklearn.feature_extraction import DictVectorizer

import ma_util  # project-local tree-walking helpers (assumed import path)

logger = logging.getLogger(__name__)


class POSExtractor(object):

    def __init__(self, gold, toClassify,
                 base="/resources/processors/tagger/stanford-postagger-3.0/"):
        self.posTagger = POSTagger(base + "/models/german.tagger",
                                   base + "/stanford-postagger.jar")
        self.posCache = {}
        self.pos_dv = self._trainPOSDictVectorizer(gold, toClassify)

    def _trainPOSDictVectorizer(self, goldTree, to_classify=None):
        sentences = list(goldTree)
        if to_classify:
            sentences.extend(to_classify)
        pos_tagged = self.get_pos_tags_for_sentences(sentences)
        items = []
        assert len(pos_tagged) == len(sentences)
        for sentence, pos in itertools.izip(sentences, pos_tagged):
            # feels silly, but there is the occasional encoding error
            # when using str(sentence)
            self.posCache[sentence.pprint().encode('utf-8')] = pos
            items.extend(self.extract_POS(sentence, pos))
        dv = DictVectorizer(sparse=False)
        dv.fit(items)
        # logger.debug("DictVectorizer vocab: %s", dv.vocabulary_)
        return dv

    def get_pos_tags_for_sentences(self, sentences):
        tokenizedSentences = []
        for parseTree in sentences:
            tokens = parseTree.leaves()
            # Example tree and the tags the Stanford tagger produces for it:
            # (PROAV Deshalb)
            # (@S-:-PROAV-..
            #   (@S-:-PROAV-...-$.
            #     (VVFIN 3 1/2)
            #     (NP-SB (NN Sterne) (PP (APPR von) (PPER mir))))
            #   ($. .)))
            # -> [('Deshalb', 'PROAV'), ('3', 'CARD'), ('1/2', 'CARD'), ...]
            #
            # Encode as UTF-8: the POSTagger object hands the tokens over to a
            # separate object, i.e. at some point str() is called on them.
            tokens = map(lambda x: x.encode('utf-8'), tokens)
            # "3 1/2" is separated by a non-breaking space, which prevented
            # correct tokenization in the parse tree; the POS tagger, however,
            # breaks it up correctly, so replace "3 1/2" with "3-1/2".
            tokens = map(lambda x: x.replace('3\xc2\xa01/2', '3-1/2'), tokens)
            tokenizedSentences.append(tokens)
        pos_tagged = self.posTagger.batch_tag(tokenizedSentences)
        assert len(pos_tagged) == len(tokenizedSentences)
        return pos_tagged

    def transform(self, posTag):
        return self.pos_dv.transform(posTag)

    def extract_POS(self, goldSentence, tagged=None):
        if tagged is None:
            # .get() so a missing sentence falls through to the explicit error
            tagged = self.posCache.get(goldSentence.pprint().encode('utf-8'))
        if tagged is None:
            # tagged = self.get_pos_tags_for_sentences([goldSentence])[0]
            raise ValueError("Should have seen sentence in cache: %s"
                             % goldSentence)
        leaves = goldSentence.leaves()
        if not len(leaves) == len(tagged):
            logger.error("leaves do not correspond to tagged!")
            logger.error("leaves: %s, tagged: %s", leaves, tagged)
        # TODO: there is a chance that identical leaves will have their POS
        # tags overridden, but this is good enough for now.
        leafDict = {}
        for (leaf, pos) in itertools.izip(leaves, tagged):
            pos = pos[1]
            leafDict[leaf] = pos
        items = []
        all_pos_tags = set()
        for goldNode in ma_util.walkTree(goldSentence):
            res = {}
            for subTreeLeaf in goldNode.leaves():
                key = leafDict[subTreeLeaf]  # [0]
                if key not in res:
                    res[key] = 0
                res[key] += 1
                all_pos_tags.add(key)
            items.append(res)
        return items
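# Hedged usage sketch (not part of the original class): assuming gold_trees and
# unlabeled_trees are iterables of nltk.Tree parse trees, the extractor fits a
# DictVectorizer over per-node POS-tag counts and can then vectorize nodes of
# previously seen sentences. The variable names below are illustrative
# placeholders, not defined in this repository.
extractor = POSExtractor(gold_trees, unlabeled_trees)
for sentence in gold_trees:
    pos_counts = extractor.extract_POS(sentence)      # list of {tag: count} dicts
    feature_matrix = extractor.transform(pos_counts)  # dense numpy array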