def read_sstb_data(fpath='sstb/sstb_condensed_{}.csv'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger('pos-tag/english-left3words-distsim.tagger',
                                   'pos-tag/stanford-postagger.jar', 'utf8',
                                   False, '-mx2000m')

    dataset_split = ['train', 'test', 'dev']

    for split in dataset_split:
        with open(fpath.format(split), "rb") as f:
            rdr = csv.reader(f)
            tokens_list = []
            labels = []

            # read all the lines
            for row in rdr:
                tokens = clean_str(row[0]).split()
                tokens_list.append(tokens)
                labels.append(row[1])

            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)

            for i in range(len(tokens_list_tagged)):
                tokens_tagged = tokens_list_tagged[i]
                label = labels[i]
                text_tokens = list(zip(*tokens_tagged)[0])
                tag_tokens = list(zip(*tokens_tagged)[1])

                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)

                # get max len
                max_len = max(max_len, len(text_tokens))

                # create an entry for the current rev and add to the list
                curr_rev = {
                    'text_tokens': text_tokens,
                    'tag_tokens': tag_tokens,
                    'label': conv_label_to_label_vec(label),
                    'fold_num': get_fold_num(split)
                }
                revs.append(curr_rev)

    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)

    return revs, vocab, pos_vocab, max_len
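# Usage sketch (not from the original source): how read_sstb_data might be called,
# assuming the sstb CSV files, the Stanford tagger files under pos-tag/, and the
# helpers clean_str, conv_label_to_label_vec, get_fold_num and PAD_WORD are all
# defined in the surrounding module.
if __name__ == '__main__':
    revs, vocab, pos_vocab, max_len = read_sstb_data()
    print('{} revs, vocab size {}, {} POS tags, max length {}'.format(
        len(revs), len(vocab), len(pos_vocab), max_len))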
Example #2
def postag_sents(sents):
    if not os.environ.get('STANFORD_MODELS'):
        os.environ["STANFORD_MODELS"] = STANFORD_MODELS
        
    st = StanfordPOSTagger('arabic.tagger', STANFORD_POSTAGGER + '/combined.jar')
    tagged_sents = st.tag_sents(sents)
    tagged_sents = [[tuple(t[1].split('/'))  for t in sent] for sent in tagged_sents]
    
    return tagged_sents
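# Usage sketch (not from the original source): postag_sents expects pre-tokenized
# sentences and returns, per token, the (word, tag) pair recovered by splitting the
# tagger output on '/'. STANFORD_MODELS and STANFORD_POSTAGGER are assumed to be
# module-level constants pointing at a local Stanford POS tagger installation that
# ships arabic.tagger and combined.jar.
sents = [[u'هذا', u'مثال'], [u'جملة', u'ثانية']]
for tagged_sent in postag_sents(sents):
    print(tagged_sent)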
Example #4
    def get_tagged_sents(self, sents):
        if self.tagger == 'stanford':
            # java_options='-mx3000m' caps the tagger JVM heap at 3 GB
            tagger = StanfordPOSTagger(self.__tag_path_to_model,
                                       self.__tag_path_to_jar,
                                       java_options='-mx3000m')
            tagged = tagger.tag_sents(sents)
            # TODO: unfold the vimp vector ... so that 'verb' = True and 'verb & singular' = True
        else:
            tagged = pos_tag(sents, lang='es')
        return tagged
Example #5
class TMStanfordPOSTagger:
    # Available Stanford POS models. TODO: fill entries for other languages
    models = {
        'EN': 'english-bidirectional-distsim.tagger',
        'ES': 'spanish.tagger',
        'FR': 'french.tagger',
        'DE': 'german-fast.tagger',
        'ZH': 'chinese-distsim.tagger',
        'AR': 'arabic.tagger'
    }

    def __init__(self, language):

        self.language = language
        model = self.models.get(language)
        if not model:
            raise Exception(
                "Unsupported language for POS tagging: {}".format(language))
        # Initialize Stanford POS tagger
        self.st = StanfordPOSTagger(
            os.path.join(stanford_posTagger_home, 'models', model),
            os.path.join(stanford_posTagger_home, 'stanford-postagger.jar'))
        self.preprocessor = TMTokenizer(language)

    def tag_segments(self, texts):

        # The Stanford POS tagger expects a list of tokenized words per sentence.
        tok_sents = [
            self.preprocessor.tokenizer.process(s).split(' ') for s in texts
        ]
        target_sents = [[[tag.split('#')[0],
                          tag.split('#')[1]] for word, tag in sentence]
                        for sentence in self.st.tag_sents(tok_sents)]
        return target_sents

    # POS tagging without running the tokenizer first
    def only_tag_segments(self, texts):
        return [
            [[word, tag] for word, tag in sentence]
            for sentence in self.st.tag_sents(texts)
        ]
def read_mr_data(num_folds, fpath='mr/rt-polarity.{}'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger('pos-tag/english-left3words-distsim.tagger',
                                   'pos-tag/stanford-postagger.jar', 'utf8',
                                   False, '-mx2000m')
    sentiments = ['pos', 'neg']

    for sentiment in sentiments:
        with open(fpath.format(sentiment), "rb") as f:
            tokens_list = []
            label_vec = conv_sent_to_vec(sentiment)

            # read all the lines
            for line in f.read().splitlines():
                tokens = clean_str(line).split()
                tokens_list.append(tokens)

            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)

            for tokens_tagged in tokens_list_tagged:
                text_tokens = list(zip(*tokens_tagged)[0])
                tag_tokens = list(zip(*tokens_tagged)[1])

                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)

                # get max len
                max_len = max(max_len, len(text_tokens))

                # create an entry for the current rev and add to the list
                curr_rev = {
                    'text_tokens': text_tokens,
                    'tag_tokens': tag_tokens,
                    'label': label_vec,
                    'fold_num': np.random.randint(0, num_folds)
                }
                revs.append(curr_rev)

    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)

    return revs, vocab, pos_vocab, max_len
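# Usage sketch (not from the original source): each entry in revs pairs the tokens
# with their POS tags, a label vector and a randomly chosen cross-validation fold,
# assuming the rt-polarity files, the Stanford tagger files, numpy as np, and the
# helpers clean_str, conv_sent_to_vec and PAD_WORD are available.
revs, vocab, pos_vocab, max_len = read_mr_data(num_folds=10)
sample = revs[0]
print(sample['text_tokens'][:5], sample['tag_tokens'][:5], sample['fold_num'])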
Example #8
class POSTagger():
    def __init__(self, modelfile, jarfile, max_seq_len):
        self.tagger = StanfordPOSTagger(model_filename=modelfile,
                                        path_to_jar=jarfile)
        self.pos2index = pos2index
        self.num_tags = len(self.pos2index)
        self.index2vec = np.zeros((self.num_tags + 1, self.num_tags))
        self.max_seq_len = max_seq_len
        self.text_pos_seq = None
        self.aspect_pos_seq = None

        for i in range(self.num_tags):
            self.index2vec[i + 1] = np.zeros(self.num_tags)
            self.index2vec[i + 1, i] = 1

    def get_pos_tags(self, ind, flag='text'):
        if flag == 'text':
            return self.text_pos_seq[ind]
        else:
            return self.aspect_pos_seq[ind]

    def get_pos_tags_list(self,
                          text_list,
                          padding='post',
                          truncating='post',
                          flag='text'):
        tagged_text_list = self.tagger.tag_sents(
            word_tokenize_text(sent).strip().split() for sent in text_list)
        res = []
        for text in tagged_text_list:
            ans = [
                self.pos2index[i[1]] if i[1] in self.pos2index else 0
                for i in text
            ]
            ans = pad_and_truncate(ans,
                                   self.max_seq_len,
                                   padding=padding,
                                   truncating=truncating)
            res.append(ans)
        if flag == 'text':
            self.text_pos_seq = res
        else:
            self.aspect_pos_seq = res
        return res
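# Usage sketch (not from the original source): turn raw sentences into padded POS-index
# sequences. The model/jar paths are placeholders, and pos2index, pad_and_truncate and
# word_tokenize_text are assumed to be defined in the surrounding module.
pos_tagger = POSTagger(modelfile='models/english-left3words-distsim.tagger',
                       jarfile='stanford-postagger.jar',
                       max_seq_len=80)
pos_tagger.get_pos_tags_list(['The movie was great .', 'The service was slow .'])
print(pos_tagger.get_pos_tags(0))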
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    split_list = ['train', 'test']
    class_to_label = {}

    for split in split_list:
        with open(data_file.format(split), "rb") as f:
            revs_text = []
            ys = []
            for line in f:
                # split only on the first ':' so colons inside the question are kept
                qclass, rev = line.split(':', 1)
                rev = clean_str(rev)
                if qclass not in class_to_label:
                    class_to_label[qclass] = len(class_to_label)
                y = class_to_label[qclass]
                revs_text.append(rev.split())
                ys.append(y)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])[1:]
                tag = list(zip(*rev_tagged)[1])
                y = ys[i]
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                datum = {"y": y,
                         "text": ' '.join(text),
                         "tag": ' '.join(tag),
                         "num_words": len(text),
                         "split": 0 if split == 'train' else 1}
                revs.append(datum)

    return revs, vocab, pos_vocab, len(class_to_label)
Example #10
class POSTagger():
    def __init__(self):
        jar = '/home/joe32140/stanford/stanford-postagger-2018-02-27/stanford-postagger.jar'
        model = '/home/joe32140/stanford/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger'
        self.tagger = StanfordPOSTagger(model, jar, encoding='utf8')

    def getPOS_sents(self, sents):
        tokenized_sents = [word_tokenize(sent) for sent in sents]
        classified_sents = self.tagger.tag_sents(tokenized_sents)
        return classified_sents

    def get_Noun(self, sents):
        classified_sents =self.getPOS_sents(sents)
        new_sentences=[]
        for i, sent in enumerate(classified_sents):
            tmp=[]
            for w in sent:
                if w[1][0] == 'N':
                    tmp.append(w[0])
            new_sentences.append(' '.join(tmp))
        return new_sentences
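# Usage sketch (not from the original source): keep only the noun tokens of each
# sentence, assuming the hard-coded jar/model paths above exist locally and nltk's
# word_tokenize (with its punkt data) is available in the surrounding module.
pos_tagger = POSTagger()
print(pos_tagger.get_Noun(["The quick brown fox jumps over the lazy dog.",
                           "Stanford taggers need a local Java installation."]))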
Example #11
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx6000m')
    splits = ['train', 'test', 'dev']

    for split in splits:
        with open(data_file.format(split), "rb") as f:
            lines = f.read().splitlines()
            revs_text = []
            ratings = []
            for line in lines:
                line_split = line.split('\t\t')
                rating = int(line_split[2]) - 1
                rev = line_split[3]
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                ratings.append(rating)

            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": ratings[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)

    return revs, vocab, pos_vocab
Example #12
    def __init__(self, file_path, tagged_words_path=None):
        '''Creates a Collocations instance with a text

        file_path - string path to .txt input file; used to generate full
            description of results in output file, whether or not tagged_words
            is given
        tagged_words_path - string path to .txt file containing string
            representation of list of tagged words in input file; saves time and
            resources on computation
        '''

        self.file_path = file_path

        if tagged_words_path == None:
            #open input file, extract text, and close file
            document = open(file_path, 'r', encoding='utf-8')
            raw = document.read().lower()
            document.close()

            #tokenize text into words and tag parts of speech using the
            #Stanford part-of-speech tagger
            sentences = nltk.sent_tokenize(raw)
            tokenized_sentences = [nltk.word_tokenize(w) for w in sentences]

            java_path = 'C:/Program Files/Java/jdk-9.0.1/bin/java.exe'
            os.environ['JAVAHOME'] = java_path
            path_to_model = ('stanford-postagger-2017-06-09/models/'
                'english-left3words-distsim.tagger')
            path_to_jar = ('stanford-postagger-2017-06-09/'
                'stanford-postagger.jar')
            tagger = StanfordPOSTagger(path_to_model, path_to_jar)
            tagger.java_options='-mx4096m'
            tagged_sentences = tagger.tag_sents(tokenized_sentences)
            self.tagged_words = sum(tagged_sentences, [])
        else:
            #load pre-tagged words
            import ast
            document = open(tagged_words_path, 'r', encoding='utf-8')
            self.tagged_words = ast.literal_eval(document.read())
            document.close()
Example #13
class StanfordNLTKWrapper:
    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools'][
            'stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        return self._dep_parser.tagged_parse(
            input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self,
                                    input_tokenized_pos_tagged_sentences):
        return self._dep_parser.tagged_parse_sents(
            input_tokenized_pos_tagged_sentences)
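# Usage sketch (not from the original source): tag a tokenized sentence and feed the
# result to the dependency parser, assuming aida_event/config/xmie.json exists, the
# configured CoreNLP server is reachable, and the Stanford model/jar paths are valid.
wrapper = StanfordNLTKWrapper()
tokens = wrapper.tokenizer('The cat sat on the mat.')
tagged = wrapper.pos_tag(tokens)
for graph in wrapper.dependency_parser(tagged):
    print(graph.tree())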
Example #14
class NLPCore:
    """
    NLP processing including the Stanford Word Segmenter, Stanford POS Tagger,
    Stanford Named Entity Recognizer and Stanford Parser
    """
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)

        if DEBUG:
            print(t1, t2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be a list of sentences
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]

        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')

        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))

            if DEBUG:
                print(split1[i], split2[i])

        return rlist

    def split_sent_jieba(self, textPair):

        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)

        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should already be split into tokens
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])

        if DEBUG:
            print(t1_tag, t2_tag)

        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be a list of sentences
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_tag, t2_tag,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_tag, t2_tag)

        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should already be split into tokens
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])

        if DEBUG:
            print(t1_ner, t2_ner)

        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be a list of sentences
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_ner, t2_ner,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_ner, t2_ner)

        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should already be split into tokens
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
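# Usage sketch (not from the original source): POS-tag a pair of already-segmented
# Chinese sentences, assuming the model files under ../Models/stanfordNLP/ and the
# text_pair module used above are available.
nlp = NLPCore()
pair = text_pair.TextPair(u'我 喜欢 自然 语言 处理', u'他 喜欢 机器 学习', 1)
tagged_pair = nlp.pos_tag(pair)
print(tagged_pair.t1, tagged_pair.t2)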
print(user_mentions_with_words)

import re

punctuation = {
    '/', '"', '(', ')', '%', ';', '?', '¿', '!', '¡', "'", ':', '#', '$', '&',
    '>', '<', '-', '_', '°', '|', '¬', '\\', '*', '+', '[', ']', '{', '}', '=',
    '\n', '&amp', '&gt', '&lt', '@'
}
text = re.sub('(ja){2,}', '', text)
print(text)
tokenized_text = nltk.word_tokenize(text, "spanish")
print(tokenized_text)

start_time = time()
tagged_text = sum(spanish_pos_tagger.tag_sents([tokenized_text]), [])
processed_text = []
for s in tagged_text:
    for tag in eagles_standard:
        if s[1] in eagles_standard[tag] and tag != "puntuacion":
            processed_text.append({s[0]: tag})
print(processed_text)
execution_time = time() - start_time
print(str(timedelta(seconds=execution_time)))
print()

snowball_stemmer = SnowballStemmer("spanish")
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmed_list = list()
porter_stemmed_list = list()
Example #16
from nltk.tag import StanfordPOSTagger
import os
java_path = "C:/Program Files/Java/jdk1.8.0_181/bin/java.exe"
os.environ["JAVAHOME"] = java_path
stanford_dir = "C:/NLP_Programs/stanford-postagger-2018-10-16"
modelfile = stanford_dir + "/models/english-bidirectional-distsim.tagger"
jarfile = stanford_dir + "/stanford-postagger.jar"

tagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

print(
    tagger.tag_sents(
        sent.split() for sent in
        ["Yo im your deep learning mama", "Modi is besht", "Please work ma!"]))
x = [['11', '222'], ['33', '444']]
x = x + [['33', '444'], ['11', '222']]
print(x)
Example #17
def build_data_cv(data_file, all_phrases, binary, min_len=4):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    splits = ['train', 'test', 'dev']
    sentence_set = set()

    for split in splits:
        with open(data_file.format(split), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            for row in reader:
                rev, sent = row[0], int(row[1])
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev = clean_str_sst(rev)
                if split == 'train':
                    sentence_set.add(rev)
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                sent = sentiment_label_for_binary(sent) if binary else sent  # check for binary case
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)

    if all_phrases:
        with open(data_file.format("train_phrases"), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            count = 0
            for row in reader:
                rev, sent = row[0], int(row[1])
                rev = clean_str_sst(rev)
                if rev in sentence_set:
                    count += 1
                    continue
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev_tokens = rev.split()
                if len(rev_tokens) < min_len:
                    continue
                revs_text.append(rev_tokens)
                sent = sentiment_label_for_binary(sent) if binary else sent  # check for binary case
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num('train')}
                revs.append(rev_datum)

            print "{} sentences in phrases".format(count)

    return revs, vocab, pos_vocab
class CTBCreator(object):
    '''Data path is assumed to be a directory with
       pkl files and a corpora subdirectory.
    '''
    def __init__(self,
                 wordembed_dim=300,
                 embeddingstd=0.1,
                 data_path=None,
                 tagger_path=None):
        assert data_path is not None
        assert tagger_path is not None
        dict_filepath = os.path.join(data_path, 'dict.pkl')
        data_filepath = os.path.join(data_path, 'parsed.pkl')
        train_filepath = os.path.join(data_path, "train.txt")
        valid_filepath = os.path.join(data_path, "dev.txt")
        test_filepath = os.path.join(data_path, "test.txt")

        self.st = StanfordPOSTagger(
            os.path.join(tagger_path, 'models/chinese-distsim.tagger'),
            os.path.join(tagger_path, 'stanford-postagger.jar'))

        print("building dictionary ...")
        f_dict = open(dict_filepath, 'wb')
        self.dictionary = Dictionary()

        print("loading trees from {}".format(train_filepath))
        train_trees = load_trees(train_filepath)
        print("loading trees from {}".format(valid_filepath))
        valid_trees = load_trees(valid_filepath)
        print("loading trees from {}".format(test_filepath))
        test_trees = load_trees(test_filepath)

        self.add_words(train_trees)
        self.dictionary.rebuild_by_freq()
        self.arc_dictionary = Dictionary()
        self.stag_dictionary = Dictionary()
        self.train = self.preprocess(train_trees, is_train=True)
        self.valid = self.preprocess(valid_trees, is_train=False)
        self.test = self.preprocess(test_trees, is_train=False)
        with open(dict_filepath, "wb") as file_dict:
            pickle.dump(self.dictionary, file_dict)
        with open(data_filepath, "wb") as file_data:
            pickle.dump(
                (self.train, self.arc_dictionary, self.stag_dictionary),
                file_data)
            pickle.dump(self.valid, file_data)
            pickle.dump(self.test, file_data)

        print(len(self.arc_dictionary.idx2word))
        print(self.arc_dictionary.idx2word)

    def add_words(self, trees):
        words, tags = [], []
        for tree in trees:
            tree = process_NONE(tree)
            words, tags = zip(*tree.pos())
            words = ['<s>'] + list(words) + ['</s>']
            for w in words:
                self.dictionary.add_word(w)

    def preprocess(self, parse_trees, is_train=False):
        sens_idx = []
        sens_tag = []
        sens_stag = []
        sens_arc = []
        distances = []
        sens = []
        trees = []

        print('\nConverting trees ...')
        for i, tree in enumerate(parse_trees):
            tree = process_NONE(tree)
            if i % 10 == 0:
                print("Done %d/%d\r" % (i, len(parse_trees)), end='')
            word_lexs, _ = zip(*tree.pos())
            idx = []
            for word in (['<s>'] + list(word_lexs) + ['</s>']):
                idx.append(self.dictionary[word])

            listerized_tree, arcs, tags = tree2list(tree)
            tags = ['<unk>'] + tags + ['<unk>']
            arcs = ['<unk>'] + arcs + ['<unk>']

            if type(listerized_tree) is str:
                listerized_tree = [listerized_tree]
            distances_sent, _ = distance(listerized_tree)
            distances_sent = [0] + distances_sent + [0]

            idx_arcs = []
            for arc in arcs:
                arc = precess_arc(arc)
                arc_id = self.arc_dictionary.add_word(
                    arc) if is_train else self.arc_dictionary[arc]
                idx_arcs.append(arc_id)

            # the "tags" are the collapsed unary chains, i.e. FRAG+DT
            # at evaluation, we swap the word tag "DT" with the true tag in "stags" (see after)
            idx_tags = []
            for tag in tags:
                tag = precess_arc(tag)
                tag_id = self.arc_dictionary.add_word(
                    tag) if is_train else self.arc_dictionary[tag]
                idx_tags.append(tag_id)

            assert len(distances_sent) == len(idx) - 1
            assert len(arcs) == len(idx) - 1
            assert len(idx) == len(word_lexs) + 2

            sens.append(word_lexs)
            trees.append(tree)
            sens_idx.append(idx)
            sens_tag.append(idx_tags)
            sens_arc.append(idx_arcs)
            distances.append(distances_sent)

        print('\nLabelling POS tags ...')
        st_outputs = self.st.tag_sents(sens)
        for i, word_tags in enumerate(st_outputs):
            if i % 10 == 0:
                print("Done %d/%d\r" % (i, len(parse_trees)), end='')
            word_tags = [t[1].split('#')[1] for t in word_tags]
            stags = ['<s>'] + list(word_tags) + ['</s>']

            # the "stags" are the original word tags included in the data files
            # we keep track of them so that, during evaluation, we can swap them with the original ones.
            idx_stags = []
            for stag in stags:
                stag_id = self.stag_dictionary.add_word(
                    stag) if is_train else self.stag_dictionary[stag]
                idx_stags.append(stag_id)

            sens_stag.append(idx_stags)

        return sens_idx, sens_tag, sens_stag, \
               sens_arc, distances, sens, trees
        30. VBN Verb, past participle
        31. VBP Verb, non-3rd person singular present
        32. VBZ Verb, 3rd person singular present
        33. WDT Wh-determiner
        34. WP  Wh-pronoun
        35. WP$ Possessive wh-pronoun
        36. WRB Wh-adverb


        '''
        ts = [
            tknzr.tokenize(line_en.strip().strip('"')) for line_en in tqdm(f)
        ]

        # p = nltk.pos_tag(t, tagset='universal')
        ps = tagger.tag_sents(ts)

        for p in tqdm(ps):
            r = []
            t = []
            for pos in p:
                t.append(pos[0].lower().strip('.,!'))
                if pos[1][0] == 'J':
                    r.append(
                        wnl.lemmatize(pos[0].lower().strip('.,!-'), pos='a') +
                        u'/A')
                elif pos[1][0] == 'V':
                    r.append(
                        wnl.lemmatize(pos[0].lower().strip('.,!-'), pos='v') +
                        u'/V')
                elif pos[1][0] == 'N':
Example #20
training_sentences = sentences[split_idx:]

# original tags of sentences in the brown corpus
ground_tags = [[tag for word, tag in testing_sentences[sentence_idx]] for
        sentence_idx in range(split_idx)]
testing_tokens = [[word for word, tag in testing_sentences[sentence_idx]] for
        sentence_idx in range(split_idx)]

if (True):
    print ("#######")
    # get trained stanford model
    stanford_model = StanfordPOSTagger(os.environ.get('STANFORD_BROWN_MODEL'))

    # stanford_tokens_tags = [stanford_model.tag(token_list) for token_list in testing_tokens]
    stanford_tokens_tags = []
    stanford_token_tags = stanford_model.tag_sents(testing_tokens)

    stanford_tags = [[tag for word, tag in stanford_token_tags[sentence_idx]] for sentence_idx in range(split_idx)]
    # save computed tags
    pickle.dump(stanford_tags, open("stanford_brown_20_tags_all.pd", "wb"))
    print (len(stanford_tags))
    print (stanford_tags[0])
    print (stanford_tags[1])

if (True):
    print ("#######")
    print ("Training CRF tagger...")
    crf_tagger = CRFTagger()
    crf_tagger.train(training_sentences, '/tmp/crf_tagger_80.model')
    #crf_tagger.set_model_file('./crf_new')
    print ("Done training CRF tagger...")
Example #21
def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')

    with open(pos_file, "rb") as f:
        revs_text = []
        for line in f:       
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())

        revs_tagged = pos_tagger.tag_sents(revs_text)

        for rev_tagged in revs_tagged:
            text = list(zip(*rev_tagged)[0])
            tag = list(zip(*rev_tagged)[1])
            words = set(text)
            for word in words:
                vocab[word] += 1
            postags = set(tag)
            for postag in postags:
                pos_vocab[postag] += 1
            datum = {"y": 1,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    with open(neg_file, "rb") as f:
        revs_text = []
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())

        revs_tagged = pos_tagger.tag_sents(revs_text)

        for rev_tagged in revs_tagged:
            text = list(zip(*rev_tagged)[0])
            tag = list(zip(*rev_tagged)[1])
            words = set(text)
            for word in words:
                vocab[word] += 1
            postags = set(tag)
            for postag in postags:
                pos_vocab[postag] += 1
            datum = {"y": 0,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    return revs, vocab, pos_vocab
Example #22
for idx, chunk_id in enumerate(chunks):
	if not os.path.exists('evidence/evidence_pos_{0:04d}.json'.format(chunk_id)):
		data_path = 'pubmed20n{0:04d}.json'.format(chunk_id)
		if not os.path.exists(data_path): continue

		evi_output = []
		ctx_output = []
		data = json.load(open(data_path))

		for item in data:
			results = process(item)
			evi_output += results[0]
			ctx_output.append(results[1])

		pos_list = st.tag_sents(o['pos'] for o in evi_output)

		for _idx in range(len(evi_output)):
			evi_output[_idx]['pos'] = pos_list[_idx]

		with open('evidence/evidence_pos_{0:04d}.json'.format(chunk_id), 'w') as f:
			json.dump(evi_output, f)
		with open('evidence/contexts_{0:04d}.json'.format(chunk_id), 'w') as f:
			json.dump(ctx_output, f)

	else:
		evi_output = json.load(open('evidence/evidence_pos_{0:04d}.json'.format(chunk_id)))
	
	total += len(evi_output)
	
	print('%d/%d; Processing %s; Number of evidence: %d; Total: %d' % (idx+1, len(chunks), chunk_id, len(evi_output), total))
Example #23
class FeatureMaker:

    _sentence_data = None
    _split_data = None
    _stf_pos_tagger = None
    _stf_parser = None

    _pos_list = []
    _neg_list = []

    def __init__(self, data):
        self._split_data = data
        self._sentence_data = [" ".join(line) for line in self._split_data]

    def _pos_tag_sent(self, sent):
        # text = word_tokenize("And now for something completely different")
        return nltk.pos_tag(sent)

    def _sf_pos_tag_sent(self, sent):
        return self._stf_pos_tagger.tag(sent)

    def prefix_suffix(self):
        prefix_2 = []
        prefix_3 = []
        suffix_2 = []
        suffix_3 = []
        for line in self._split_data:
            prefix_2.append([w[:2] for w in line])
            prefix_3.append([w[:3] for w in line])
            suffix_2.append([w[-2:] for w in line])
            suffix_3.append([w[-3:] for w in line])

        return [prefix_2, prefix_3, suffix_2, suffix_3]

    def fast_pos_tag(self):
        tag_result = [[token[1] for token in self._pos_tag_sent(line)] for line in self._split_data]
        return tag_result

    def pos_tag(self):
        if self._stf_pos_tagger is None:
            self._stf_pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
        index = 0
        tag_result = []
        while index < len(self._split_data):
            temp = self._stf_pos_tagger.tag_sents(self._split_data[index:index+1000])
            tag_result.extend(temp)
            index += 1000
            print(("pos:" + str(index)), end=' ')
        # tag_result = self._stf_pos_tagger.tag_sents(self._split_data)
        tag_result = [[unidecode(p[1]) for p in line] for line in tag_result]

        # for line in tag_result:
        #     print str(line)
        return tag_result

    def parser(self):
        if self._stf_parser is None:
            self._stf_parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        result = self._stf_parser.parse_sents(self._split_data)
        result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
        for i in result:
            print(i)

    def per_word_length(self):
        wl_result = [[len(w) for w in line] for line in self._split_data]
        return wl_result

    def sentence_avg_word_length(self):
        wl_result = self.per_word_length()
        wl_result = [np.mean(line) for line in wl_result]
        return wl_result

    def sentence_length(self):
        sl_result = [len(line) for line in self._split_data]
        return sl_result

    def sentence_length_mean_sd(self):
        return np.mean(self.sentence_length()), np.std(self.sentence_length())

    def load_sentiment_list(self):
        if not self._pos_list:
            with open("./../pos_neg/positive-words.txt", mode='r') as f:
                file_content = f.readlines()
                for line in file_content:
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._pos_list.append(line)
        if not self._neg_list:
            with open("./../pos_neg/negative-words.txt", mode='r') as f:
                file_content = f.readlines()
                for line in file_content:
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._neg_list.append(line)
        return [self._pos_list, self._neg_list]

    def sentiment_sequence(self):
        sentiment_data = []
        for line in self._split_data:
            sentiment_line = []
            for word in line:
                if word in self._pos_list:
                    sentiment_line.append("POS")
                elif word in self._neg_list:
                    sentiment_line.append("NEG")
                else:
                    sentiment_line.append("NON")
            sentiment_data.append(sentiment_line)
        return sentiment_data

    def get_read_measure(self):
        value_list = []
        for cat, data in list(readability.getmeasures(self._sentence_data, lang='en').items()):
            print(('%s:' % cat))
            for key, val in list(data.items()):
                print((('    %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.')))

            value_list.append(val)
        return value_list
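# Usage sketch (not from the original source): the purely lexical features work without
# the Stanford jars; data is a list of already-tokenized sentences, and numpy is assumed
# to be imported as np in the surrounding module.
fm = FeatureMaker([['The', 'movie', 'was', 'great'], ['It', 'was', 'far', 'too', 'long']])
print(fm.prefix_suffix()[0])        # 2-character prefixes, one list per sentence
print(fm.sentence_length())         # tokens per sentence
print(fm.sentence_avg_word_length())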