Exemplo n.º 1
0
def stanford_batch_tag(sentences):
    '''Batch-tag a list of tokenized sentences with the Stanford tagger.

    ``sentences`` is handed unchanged to ``POSTagger.batch_tag`` and the
    value returned by that call is returned to the caller.
    '''
    # The model and tagger locations are looked up through two helper
    # functions of the project's path module.  Adapt those helpers to your
    # own machine (or hard-code the two paths here if you prefer).
    import src.experiment.path as path

    model_location = path.stanford_tagger_model_path()
    tagger_location = path.stanford_tagger_path()
    return POSTagger(model_location, tagger_location).batch_tag(sentences)
Exemplo n.º 2
0

__author__ = 'Luke'
import cPickle as pickle


def tokenise_tweet():
    '''Placeholder: not implemented yet, takes no arguments, returns None.'''
    return None



# Load the pickled training tweets.  Each file handle is closed as soon as
# its content has been read instead of being left open until interpreter
# shutdown.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
# NOTE(review): the input files are still opened in the default (text) mode,
# exactly as before -- confirm how they were written before switching to 'rb'.
with open('../../Data/Training/objective-tweets.obj') as objective_file:
    objective_tweets = pickle.load(objective_file)
with open('../../Data/Training/subjective-tweets.obj') as subjective_file:
    subjective_tweets = pickle.load(subjective_file)

# Pair every tweet with its class label.  The subjective entries are
# 2-tuples whose second element is discarded here.
objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]
total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)
# 85% index into the shuffled set (not used in the visible part of the script).
cut_off = int(0.85*len(total_set))

# POS-tag the whitespace-tokenized tweets in a single batch call.
tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')
tagged_sentences = tagger.batch_tag([sent.split() for sent, label in total_set])

target_values = [label for sent, label in total_set]

# Persist (tagged sentence, label) pairs.  The ``with`` block guarantees the
# output file is flushed and closed once the dump has finished.
to_disk = zip(tagged_sentences, target_values)
with open('../../Data/Training/sentiment_detector_training.obj', 'wb') as training_file:
    pickle.dump(to_disk, training_file)



Exemplo n.º 3
0
class POSExtractor(object):
    """Build POS-tag count features for the nodes of parse trees.

    On construction all sentences of ``gold`` and ``toClassify`` are tagged
    with a ``POSTagger``, the tagging of every sentence is cached, and a
    ``DictVectorizer`` is fitted on the per-node POS-count dictionaries.
    """

    def __init__(self, gold, toClassify,
                 base="/resources/processors/tagger/stanford-postagger-3.0/"):
        """Create the tagger, tag all sentences and fit the vectorizer.

        :param gold: iterable of parse trees (objects offering ``leaves()``
            and ``pprint()``).
        :param toClassify: further parse trees to tag as well; may be empty
            or None.
        :param base: directory prefix to which ``/models/german.tagger`` and
            ``/stanford-postagger.jar`` are appended.
        """
        self.posTagger = POSTagger(base + "/models/german.tagger",
                                   base + "/stanford-postagger.jar")
        # Maps the UTF-8 encoded pretty-printed sentence to its tagging.
        self.posCache = {}
        self.pos_dv = self._trainPOSDictVectorizer(gold, toClassify)

    @staticmethod
    def _cache_key(sentence):
        """Return the key under which ``sentence`` is stored in the cache."""
        # feels silly, but there is the occasional encoding error
        # when using str(sentence)
        return sentence.pprint().encode('utf-8')

    def _trainPOSDictVectorizer(self, goldTree, to_classify=None):
        """Tag all sentences, fill the cache and return a fitted vectorizer.

        :param goldTree: iterable of parse trees.
        :param to_classify: optional additional parse trees.
        :returns: a ``DictVectorizer(sparse=False)`` fitted on the POS-count
            dictionaries of every node of every sentence.
        """
        sentences = list(goldTree)
        if to_classify:
            sentences.extend(to_classify)
        pos_tagged = self.get_pos_tags_for_sentences(sentences)
        assert len(pos_tagged) == len(sentences)
        items = []
        for sentence, pos in itertools.izip(sentences, pos_tagged):
            self.posCache[self._cache_key(sentence)] = pos
            items.extend(self.extract_POS(sentence, pos))
        dv = DictVectorizer(sparse=False)
        dv.fit(items)
        return dv

    def get_pos_tags_for_sentences(self, sentences):
        """POS-tag the leaves of every parse tree in one batch call.

        :param sentences: iterable of parse trees offering ``leaves()``.
        :returns: the result of ``self.posTagger.batch_tag``, one entry per
            input sentence.
        """
        tokenizedSentences = []
        for parseTree in sentences:
            # Example of a problematic tree and the tagger output for it:
            #  (PROAV Deshalb)
            #  (@S-:-PROAV-..
            #      (@S-:-PROAV-...-$.
            #             (VVFIN 3 1/2)
            #             (NP-SB (NN Sterne) (PP (APPR von) (PPER mir))))
            #      ($. .)))
            # [('Deshalb', 'PROAV'), ('3', 'CARD'), ('1/2', 'CARD')
            #
            # Tokens are encoded as utf-8 because the POSTagger object hands
            # them over to a separate object, i.e. at some point str() is
            # called on the tokens.
            # 3 1/2 is separated by a non-breaking space which prevented
            # correct tokenization in the parse tree; the pos tagger however
            # breaks it up correctly, so 3 1/2 is replaced with 3-1/2.
            tokens = [leaf.encode('utf-8').replace('3\xc2\xa01/2', '3-1/2')
                      for leaf in parseTree.leaves()]
            tokenizedSentences.append(tokens)
        pos_tagged = self.posTagger.batch_tag(tokenizedSentences)
        assert len(pos_tagged) == len(tokenizedSentences)
        return pos_tagged

    def transform(self, posTag):
        """Vectorize ``posTag`` with the fitted ``DictVectorizer``."""
        return self.pos_dv.transform(posTag)

    def extract_POS(self, goldSentence, tagged=None):
        """Count the POS tags below every node of ``goldSentence``.

        :param goldSentence: parse tree offering ``leaves()`` and ``pprint()``.
        :param tagged: tagging of the sentence as a sequence whose items
            carry the POS tag at index 1; looked up in the cache when None.
        :returns: a list with one dict per node yielded by
            ``ma_util.walkTree(goldSentence)``, mapping POS tag to the number
            of leaves below that node carrying the tag.
        :raises ValueError: if ``tagged`` is None and the sentence is not in
            the cache.
        """
        if tagged is None:
            # .get() so that a cache miss reaches the ValueError below
            # instead of escaping as a bare KeyError.
            tagged = self.posCache.get(self._cache_key(goldSentence))
        if tagged is None:
            raise ValueError("Should have seen sentence in cache: %s" %
                             (goldSentence,))
        leaves = goldSentence.leaves()
        if len(leaves) != len(tagged):
            logger.error("leaves do not correspond to tagged!")
            logger.error("leaves: %s, tagged: %s", leaves, tagged)
        # TODO: there's a chance that similar leaves will have their POS tags
        # overridden (the mapping is keyed by the leaf itself),
        # but yeah, good enough for now.
        leafDict = {}
        for leaf, pos in itertools.izip(leaves, tagged):
            leafDict[leaf] = pos[1]
        items = []
        for goldNode in ma_util.walkTree(goldSentence):
            res = {}
            for subTreeLeaf in goldNode.leaves():
                key = leafDict[subTreeLeaf]
                res[key] = res.get(key, 0) + 1
            items.append(res)
        return items