def read_sstb_data(fpath='sstb/sstb_condensed_{}.csv'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    dataset_split = ['train', 'test', 'dev']
    for split in dataset_split:
        # csv.reader needs text mode in Python 3; newline='' per the csv docs
        with open(fpath.format(split), newline='') as f:
            rdr = csv.reader(f)
            tokens_list = []
            labels = []
            # read all the lines
            for row in rdr:
                tokens = clean_str(row[0]).split()
                tokens_list.append(tokens)
                labels.append(row[1])
            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)
            for i in range(len(tokens_list_tagged)):
                tokens_tagged = tokens_list_tagged[i]
                label = labels[i]
                # transpose the (word, tag) pairs into two parallel lists
                text_tokens, tag_tokens = map(list, zip(*tokens_tagged))
                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)
                # get max len
                max_len = max(max_len, len(text_tokens))
                # create an entry for the current rev and add to the list
                curr_rev = {'text_tokens': text_tokens,
                            'tag_tokens': tag_tokens,
                            'label': conv_label_to_label_vec(label),
                            'fold_num': get_fold_num(split)}
                revs.append(curr_rev)
    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)
    return revs, vocab, pos_vocab, max_len
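A minimal driver for the loader above; a sketch assuming the SSTB CSVs and the Stanford tagger files sit at the default relative paths hard-coded in the function, and that clean_str, conv_label_to_label_vec, get_fold_num, and PAD_WORD are defined in the surrounding module.

revs, vocab, pos_vocab, max_len = read_sstb_data()
print('{} revs, {} word types, {} POS tags, max sentence length {}'.format(
    len(revs), len(vocab), len(pos_vocab), max_len))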
def postag_sents(sents):
    if not os.environ.get('STANFORD_MODELS'):
        os.environ["STANFORD_MODELS"] = STANFORD_MODELS
    st = StanfordPOSTagger('arabic.tagger',
                           STANFORD_POSTAGGER + '/combined.jar')
    tagged_sents = st.tag_sents(sents)
    tagged_sents = [[tuple(t[1].split('/')) for t in sent]
                    for sent in tagged_sents]
    return tagged_sents
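A quick smoke test for the helper above, assuming STANFORD_POSTAGGER and STANFORD_MODELS are module-level constants pointing at a local Stanford tagger install (the paths below are hypothetical placeholders).

# Hypothetical install locations; adjust to your machine.
STANFORD_POSTAGGER = '/opt/stanford-postagger-full-2018-10-16'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'

tagged = postag_sents([['هذا', 'كتاب', 'جديد']])
print(tagged)  # e.g. [[('هذا', 'DT'), ('كتاب', 'NN'), ('جديد', 'JJ')]]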
def get_tagged_sents(self, sents):
    if self.tagger == 'stanford':
        # java_options='-mx3000m' caps the Java heap at 3GB
        tagger = StanfordPOSTagger(self.__tag_path_to_model,
                                   self.__tag_path_to_jar,
                                   java_options='-mx3000m')
        tagged = tagger.tag_sents(sents)
        # TODO: unfold the vimp vector ... so it becomes
        # 'verb' = True, 'verb & singular' = True
    else:
        tagged = pos_tag(sents, lang='es')
    return tagged
class TMStanfordPOSTagger:
    # Available Stanford POS models. TODO: fill entries for other languages
    models = {
        'EN': 'english-bidirectional-distsim.tagger',
        'ES': 'spanish.tagger',
        'FR': 'french.tagger',
        'DE': 'german-fast.tagger',
        'ZH': 'chinese-distsim.tagger',
        'AR': 'arabic.tagger'
    }

    def __init__(self, language):
        self.language = language
        model = self.models.get(language)
        if not model:
            raise Exception(
                "Unsupported language for POS tagging: {}".format(language))
        # Initialize Stanford POS tagger
        self.st = StanfordPOSTagger(
            os.path.join(stanford_posTagger_home, 'models', model),
            os.path.join(stanford_posTagger_home, 'stanford-postagger.jar'))
        self.preprocessor = TMTokenizer(language)

    def tag_segments(self, texts):
        # The Stanford POS tagger receives a list of words per segment.
        tok_sents = [self.preprocessor.tokenizer.process(s).split(' ')
                     for s in texts]
        target_sents = [[[tag.split('#')[0], tag.split('#')[1]]
                         for word, tag in sentence]
                        for sentence in self.st.tag_sents(tok_sents)]
        return target_sents

    # POS tagger without tokenizer
    def only_tag_segments(self, texts):
        return [[[word, tag] for word, tag in sentence]
                for sentence in self.st.tag_sents(texts)]
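A usage sketch for the wrapper above, assuming stanford_posTagger_home points at an unpacked Stanford tagger distribution and TMTokenizer is importable.

tagger = TMStanfordPOSTagger('EN')
# only_tag_segments expects pre-tokenized input (a list of token lists)
print(tagger.only_tag_segments([['This', 'is', 'a', 'test']]))
# e.g. [[['This', 'DT'], ['is', 'VBZ'], ['a', 'DT'], ['test', 'NN']]]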
def read_mr_data(num_folds, fpath='mr/rt-polarity.{}'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    sentiments = ['pos', 'neg']
    for sentiment in sentiments:
        # the rt-polarity files are not clean UTF-8; latin-1 decodes any byte
        with open(fpath.format(sentiment), encoding='latin-1') as f:
            tokens_list = []
            label_vec = conv_sent_to_vec(sentiment)
            # read all the lines
            for line in f.read().splitlines():
                tokens = clean_str(line).split()
                tokens_list.append(tokens)
            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)
            for tokens_tagged in tokens_list_tagged:
                # transpose the (word, tag) pairs into two parallel lists
                text_tokens, tag_tokens = map(list, zip(*tokens_tagged))
                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)
                # get max len
                max_len = max(max_len, len(text_tokens))
                # create an entry for the current rev and add to the list
                curr_rev = {'text_tokens': text_tokens,
                            'tag_tokens': tag_tokens,
                            'label': label_vec,
                            'fold_num': np.random.randint(0, num_folds)}
                revs.append(curr_rev)
    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)
    return revs, vocab, pos_vocab, max_len
class POSTagger():
    def __init__(self, modelfile, jarfile, max_seq_len):
        self.tagger = StanfordPOSTagger(model_filename=modelfile,
                                        path_to_jar=jarfile)
        self.pos2index = pos2index
        self.num_tags = len(self.pos2index)
        # row 0 is left all-zero for unknown tags; rows 1..num_tags are one-hot
        self.index2vec = np.zeros((self.num_tags + 1, self.num_tags))
        self.max_seq_len = max_seq_len
        self.text_pos_seq = None
        self.aspect_pos_seq = None
        for i in range(self.num_tags):
            self.index2vec[i + 1] = np.zeros(self.num_tags)
            self.index2vec[i + 1, i] = 1

    def get_pos_tags(self, ind, flag='text'):
        if flag == 'text':
            return self.text_pos_seq[ind]
        else:
            return self.aspect_pos_seq[ind]

    def get_pos_tags_list(self, text_list, padding='post',
                          truncating='post', flag='text'):
        tagged_text_list = self.tagger.tag_sents(
            word_tokenize_text(sent).strip().split() for sent in text_list)
        res = []
        for text in tagged_text_list:
            # map each tag to its index, 0 for tags outside pos2index
            ans = [self.pos2index[i[1]] if i[1] in self.pos2index else 0
                   for i in text]
            ans = pad_and_truncate(ans, self.max_seq_len,
                                   padding=padding, truncating=truncating)
            res.append(ans)
        if flag == 'text':
            self.text_pos_seq = res
        else:
            self.aspect_pos_seq = res
        return res
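A hypothetical driver for the class above; pos2index, word_tokenize_text, and pad_and_truncate are assumed to be defined alongside it, and the model/jar paths below are placeholders.

pt = POSTagger('models/english-bidirectional-distsim.tagger',
               'stanford-postagger.jar', max_seq_len=40)
seqs = pt.get_pos_tags_list(['The food was great', 'The service was slow'])
print(seqs[0])  # a tag-index sequence padded/truncated to length 40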
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    split_list = ['train', 'test']
    class_to_label = {}
    for split in split_list:
        with open(data_file.format(split)) as f:
            revs_text = []
            ys = []
            for line in f:
                fields = line.split(':')
                qclass, rev = fields[0], fields[1]
                rev = clean_str(rev)
                if qclass not in class_to_label:
                    class_to_label[qclass] = len(class_to_label)
                y = class_to_label[qclass]
                revs_text.append(rev.split())
                ys.append(y)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                words, tags = zip(*rev_tagged)
                # note: the first token (the fine-grained label) is dropped
                # from the text but kept in the tag sequence
                text = list(words)[1:]
                tag = list(tags)
                y = ys[i]
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                datum = {"y": y,
                         "text": ' '.join(text),
                         "tag": ' '.join(tag),
                         "num_words": len(text),
                         "split": 0 if split == 'train' else 1}
                revs.append(datum)
    return revs, vocab, pos_vocab, len(class_to_label)
class POSTagger():
    def __init__(self):
        jar = '/home/joe32140/stanford/stanford-postagger-2018-02-27/stanford-postagger.jar'
        model = '/home/joe32140/stanford/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger'
        self.tagger = StanfordPOSTagger(model, jar, encoding='utf8')

    def getPOS_sents(self, sents):
        tokenized_sents = [word_tokenize(sent) for sent in sents]
        classified_sents = self.tagger.tag_sents(tokenized_sents)
        return classified_sents

    def get_Noun(self, sents):
        classified_sents = self.getPOS_sents(sents)
        new_sentences = []
        for i, sent in enumerate(classified_sents):
            tmp = []
            for w in sent:
                # keep tokens whose Penn tag starts with 'N' (NN, NNS, NNP, ...)
                if w[1][0] == 'N':
                    tmp.append(w[0])
            new_sentences.append(' '.join(tmp))
        return new_sentences
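A short demonstration of the noun filter above, assuming the hard-coded jar and model paths exist on the machine.

pos = POSTagger()
print(pos.get_Noun(['The cat sat on the mat near the window']))
# e.g. ['cat mat window']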
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx6000m')
    splits = ['train', 'test', 'dev']
    for split in splits:
        with open(data_file.format(split)) as f:
            lines = f.read().splitlines()
            revs_text = []
            ratings = []
            for line in lines:
                line_split = line.split('\t\t')
                rating = int(line_split[2]) - 1
                rev = line_split[3]
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                ratings.append(rating)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                # transpose the (word, tag) pairs into two parallel lists
                text, tag = map(list, zip(*rev_tagged))
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": ratings[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)
    return revs, vocab, pos_vocab
def __init__(self, file_path, tagged_words_path=None):
    '''Creates a Collocations instance with a text

    file_path - string path to .txt input file; used to generate full
        description of results in output file, whether or not tagged_words
        is given
    tagged_words_path - string path to .txt file containing string
        representation of list of tagged words in input file; saves time
        and resources on computation
    '''
    self.file_path = file_path
    if tagged_words_path is None:
        # open input file, extract text, and close file
        document = open(file_path, 'r', encoding='utf-8')
        raw = document.read().lower()
        document.close()
        # tokenize text into words and tag parts of speech using the
        # Stanford part-of-speech tagger
        sentences = nltk.sent_tokenize(raw)
        tokenized_sentences = [nltk.word_tokenize(w) for w in sentences]
        java_path = 'C:/Program Files/Java/jdk-9.0.1/bin/java.exe'
        os.environ['JAVAHOME'] = java_path
        path_to_model = ('stanford-postagger-2017-06-09/models/'
                         'english-left3words-distsim.tagger')
        path_to_jar = ('stanford-postagger-2017-06-09/'
                       'stanford-postagger.jar')
        tagger = StanfordPOSTagger(path_to_model, path_to_jar)
        tagger.java_options = '-mx4096m'
        tagged_sentences = tagger.tag_sents(tokenized_sentences)
        # flatten the per-sentence lists into one list of (word, tag) pairs
        self.tagged_words = sum(tagged_sentences, [])
    else:
        # load pre-tagged words
        import ast
        document = open(tagged_words_path, 'r', encoding='utf-8')
        self.tagged_words = ast.literal_eval(document.read())
        document.close()
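A hypothetical instantiation; Collocations is assumed to be the enclosing class name, since only its __init__ is shown.

# First run: tokenizes and tags from scratch (slow; spawns the Java tagger).
colls = Collocations('corpus.txt')
# Subsequent runs can reuse a saved repr of colls.tagged_words instead:
# colls = Collocations('corpus.txt', tagged_words_path='corpus_tagged.txt')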
class StanfordNLTKWrapper:
    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools']['stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        return self._dep_parser.tagged_parse(input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self, input_tokenized_pos_tagged_sentences):
        return self._dep_parser.tagged_parse_sents(input_tokenized_pos_tagged_sentences)
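A short pipeline sketch over the wrapper above, assuming a CoreNLP server is reachable at the configured URL and the jar paths in xmie.json are valid.

nlp = StanfordNLTKWrapper()
tokens = nlp.tokenizer('The quick brown fox jumps over the lazy dog.')
tagged = nlp.pos_tag(tokens)            # [(word, tag), ...]
graphs = nlp.dependency_parser(tagged)  # iterator of DependencyGraph objects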
class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger,
    Stanford Named Entity Recognizer and Stanford Parser
    """

    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'
        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")
        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")
        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')
        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path + 'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)
        if DEBUG:
            print(t1, t2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]
        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')
        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))
            if DEBUG:
                print(split1[i], split2[i])
        return rlist

    def split_sent_jieba(self, textPair):
        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)
        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])
        if DEBUG:
            print(t1_tag, t2_tag)
        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_tag, t2_tag, textPairs[i].label))
            if DEBUG:
                print(t1_tag, t2_tag)
        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])
        if DEBUG:
            print(t1_ner, t2_ner)
        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_ner, t2_ner, textPairs[i].label))
            if DEBUG:
                print(t1_ner, t2_ner)
        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should be split
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
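A usage sketch for NLPCore, assuming the model files exist under ../Models/stanfordNLP/ and text_pair.TextPair(t1, t2, label) is the helper used throughout the class.

nlp = NLPCore()
pair = text_pair.TextPair('我 喜欢 自然 语言 处理', '我 爱 机器 学习', 1)
tagged_pair = nlp.pos_tag(pair)           # TextPair of tag strings
tagged_batch = nlp.pos_tag_pairs([pair])  # batched variant via tag_sents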
print(user_mentions_with_words)

import re

punctuation = {'/', '"', '(', ')', '%', ';', '?', '¿', '!', '¡', "'", ':',
               '#', '$', '&', '>', '<', '-', '_', '°', '|', '¬', '\\', '*',
               '+', '[', ']', '{', '}', '=', '\n', '@'}

# strip repeated laughter ("jajaja...") before tokenizing
text = re.sub('(ja){2,}', '', text)
print(text)

tokenized_text = nltk.word_tokenize(text, "spanish")
print(tokenized_text)

start_time = time()
tagged_text = sum(spanish_pos_tagger.tag_sents([tokenized_text]), [])
processed_text = []
for s in tagged_text:
    for tag in eagles_standard:
        if s[1] in eagles_standard[tag] and tag != "puntuacion":
            processed_text.append({s[0]: tag})
print(processed_text)
execution_time = time() - start_time
print(str(timedelta(seconds=execution_time)))
print()

snowball_stemmer = SnowballStemmer("spanish")
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmed_list = list()
porter_stemmed_list = list()
from nltk.tag import StanfordPOSTagger
import os

java_path = "C:/Program Files/Java/jdk1.8.0_181/bin/java.exe"
os.environ["JAVAHOME"] = java_path

stanford_dir = "C:/NLP_Programs/stanford-postagger-2018-10-16"
modelfile = stanford_dir + "/models/english-bidirectional-distsim.tagger"
jarfile = stanford_dir + "/stanford-postagger.jar"

tagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)
print(tagger.tag_sents(
    sent.split() for sent in
    ["Yo im your deep learning mama", "Modi is besht", "Please work ma!"]))

x = [['11', '222'], ['33', '444']]
x = x + [['33', '444'], ['11', '222']]
print(x)
def build_data_cv(data_file, all_phrases, binary, min_len=4):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    splits = ['train', 'test', 'dev']
    sentence_set = set()
    for split in splits:
        with open(data_file.format(split), newline='') as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            for row in reader:
                rev, sent = row[0], int(row[1])
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev = clean_str_sst(rev)
                if split == 'train':
                    sentence_set.add(rev)
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                # check for binary case
                sent = sentiment_label_for_binary(sent) if binary else sent
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text, tag = map(list, zip(*rev_tagged))
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)
    if all_phrases:
        with open(data_file.format("train_phrases"), newline='') as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            count = 0
            for row in reader:
                rev, sent = row[0], int(row[1])
                rev = clean_str_sst(rev)
                if rev in sentence_set:
                    count += 1
                    continue
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev_tokens = rev.split()
                if len(rev_tokens) < min_len:
                    continue
                revs_text.append(rev_tokens)
                # check for binary case
                sent = sentiment_label_for_binary(sent) if binary else sent
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text, tag = map(list, zip(*rev_tagged))
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num('train')}
                revs.append(rev_datum)
        print("{} sentences in phrases".format(count))
    return revs, vocab, pos_vocab
class CTBCreator(object):
    '''Data path is assumed to be a directory with pkl files and a corpora
    subdirectory.
    '''

    def __init__(self, wordembed_dim=300, embeddingstd=0.1, data_path=None,
                 tagger_path=None):
        assert data_path is not None
        assert tagger_path is not None
        dict_filepath = os.path.join(data_path, 'dict.pkl')
        data_filepath = os.path.join(data_path, 'parsed.pkl')
        train_filepath = os.path.join(data_path, "train.txt")
        valid_filepath = os.path.join(data_path, "dev.txt")
        test_filepath = os.path.join(data_path, "test.txt")

        self.st = StanfordPOSTagger(
            os.path.join(tagger_path, 'models/chinese-distsim.tagger'),
            os.path.join(tagger_path, 'stanford-postagger.jar'))

        print("building dictionary ...")
        self.dictionary = Dictionary()

        print("loading trees from {}".format(train_filepath))
        train_trees = load_trees(train_filepath)
        print("loading trees from {}".format(valid_filepath))
        valid_trees = load_trees(valid_filepath)
        print("loading trees from {}".format(test_filepath))
        test_trees = load_trees(test_filepath)

        self.add_words(train_trees)
        self.dictionary.rebuild_by_freq()
        self.arc_dictionary = Dictionary()
        self.stag_dictionary = Dictionary()
        self.train = self.preprocess(train_trees, is_train=True)
        self.valid = self.preprocess(valid_trees, is_train=False)
        self.test = self.preprocess(test_trees, is_train=False)

        with open(dict_filepath, "wb") as file_dict:
            pickle.dump(self.dictionary, file_dict)
        with open(data_filepath, "wb") as file_data:
            pickle.dump(
                (self.train, self.arc_dictionary, self.stag_dictionary),
                file_data)
            pickle.dump(self.valid, file_data)
            pickle.dump(self.test, file_data)
        print(len(self.arc_dictionary.idx2word))
        print(self.arc_dictionary.idx2word)

    def add_words(self, trees):
        words, tags = [], []
        for tree in trees:
            tree = process_NONE(tree)
            words, tags = zip(*tree.pos())
            words = ['<s>'] + list(words) + ['</s>']
            for w in words:
                self.dictionary.add_word(w)

    def preprocess(self, parse_trees, is_train=False):
        sens_idx = []
        sens_tag = []
        sens_stag = []
        sens_arc = []
        distances = []
        sens = []
        trees = []
        print('\nConverting trees ...')
        for i, tree in enumerate(parse_trees):
            tree = process_NONE(tree)
            if i % 10 == 0:
                print("Done %d/%d\r" % (i, len(parse_trees)), end='')
            word_lexs, _ = zip(*tree.pos())

            idx = []
            for word in (['<s>'] + list(word_lexs) + ['</s>']):
                idx.append(self.dictionary[word])

            listerized_tree, arcs, tags = tree2list(tree)
            tags = ['<unk>'] + tags + ['<unk>']
            arcs = ['<unk>'] + arcs + ['<unk>']
            if type(listerized_tree) is str:
                listerized_tree = [listerized_tree]
            distances_sent, _ = distance(listerized_tree)
            distances_sent = [0] + distances_sent + [0]

            idx_arcs = []
            for arc in arcs:
                arc = precess_arc(arc)
                arc_id = self.arc_dictionary.add_word(
                    arc) if is_train else self.arc_dictionary[arc]
                idx_arcs.append(arc_id)

            # the "tags" are the collapsed unary chains, i.e. FRAG+DT;
            # at evaluation, we swap the word tag "DT" with the true tag
            # in "stags" (see after)
            idx_tags = []
            for tag in tags:
                tag = precess_arc(tag)
                tag_id = self.arc_dictionary.add_word(
                    tag) if is_train else self.arc_dictionary[tag]
                idx_tags.append(tag_id)

            assert len(distances_sent) == len(idx) - 1
            assert len(arcs) == len(idx) - 1
            assert len(idx) == len(word_lexs) + 2

            sens.append(word_lexs)
            trees.append(tree)
            sens_idx.append(idx)
            sens_tag.append(idx_tags)
            sens_arc.append(idx_arcs)
            distances.append(distances_sent)

        print('\nLabelling POS tags ...')
        st_outputs = self.st.tag_sents(sens)
        for i, word_tags in enumerate(st_outputs):
            if i % 10 == 0:
                print("Done %d/%d\r" % (i, len(parse_trees)), end='')
            word_tags = [t[1].split('#')[1] for t in word_tags]
            stags = ['<s>'] + list(word_tags) + ['</s>']
            # the "stags" are the original word tags included in the data
            # files; we keep track of them so that, during evaluation, we
            # can swap them with the original ones.
            idx_stags = []
            for stag in stags:
                stag_id = self.stag_dictionary.add_word(
                    stag) if is_train else self.stag_dictionary[stag]
                idx_stags.append(stag_id)
            sens_stag.append(idx_stags)

        return sens_idx, sens_tag, sens_stag, \
            sens_arc, distances, sens, trees
30. VBN   Verb, past participle
31. VBP   Verb, non-3rd person singular present
32. VBZ   Verb, 3rd person singular present
33. WDT   Wh-determiner
34. WP    Wh-pronoun
35. WP$   Possessive wh-pronoun
36. WRB   Wh-adverb
'''
ts = [tknzr.tokenize(line_en.strip().strip('"')) for line_en in tqdm(f)]
# p = nltk.pos_tag(t, tagset='universal')
ps = tagger.tag_sents(ts)
for p in tqdm(ps):
    r = []
    t = []
    for pos in p:
        t.append(pos[0].lower().strip('.,!'))
        if pos[1][0] == 'J':
            r.append(wnl.lemmatize(pos[0].lower().strip('.,!-'), pos='a') + u'/A')
        elif pos[1][0] == 'V':
            r.append(wnl.lemmatize(pos[0].lower().strip('.,!-'), pos='v') + u'/V')
        elif pos[1][0] == 'N':
training_sentences = sentences[split_idx:]

# original tags of sentences in the brown corpus
ground_tags = [[tag for word, tag in testing_sentences[sentence_idx]]
               for sentence_idx in range(split_idx)]
testing_tokens = [[word for word, tag in testing_sentences[sentence_idx]]
                  for sentence_idx in range(split_idx)]

if True:
    print("#######")
    # get trained stanford model
    stanford_model = StanfordPOSTagger(os.environ.get('STANFORD_BROWN_MODEL'))
    # stanford_tokens_tags = [stanford_model.tag(token_list) for token_list in testing_tokens]
    stanford_token_tags = stanford_model.tag_sents(testing_tokens)
    stanford_tags = [[tag for word, tag in stanford_token_tags[sentence_idx]]
                     for sentence_idx in range(split_idx)]
    # save computed tags
    pickle.dump(stanford_tags, open("stanford_brown_20_tags_all.pd", "wb"))
    print(len(stanford_tags))
    print(stanford_tags[0])
    print(stanford_tags[1])

if True:
    print("#######")
    print("Training CRF tagger...")
    crf_tagger = CRFTagger()
    crf_tagger.train(training_sentences, '/tmp/crf_tagger_80.model')
    # crf_tagger.set_model_file('./crf_new')
    print("Done training CRF tagger...")
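With the Stanford tags saved, a natural next step is a per-token accuracy against the gold Brown tags; a sketch using ground_tags and stanford_tags as built above.

correct = sum(g == s
              for gold, pred in zip(ground_tags, stanford_tags)
              for g, s in zip(gold, pred))
total = sum(len(gold) for gold in ground_tags)
print("Stanford tagger accuracy: %.4f" % (correct / total))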
def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    # the rt-polarity files are not clean UTF-8; latin-1 decodes any byte
    with open(pos_file, encoding='latin-1') as f:
        revs_text = []
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())
        revs_tagged = pos_tagger.tag_sents(revs_text)
        for rev_tagged in revs_tagged:
            text, tag = map(list, zip(*rev_tagged))
            for word in set(text):
                vocab[word] += 1
            for postag in set(tag):
                pos_vocab[postag] += 1
            datum = {"y": 1,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    with open(neg_file, encoding='latin-1') as f:
        revs_text = []
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())
        revs_tagged = pos_tagger.tag_sents(revs_text)
        for rev_tagged in revs_tagged:
            text, tag = map(list, zip(*rev_tagged))
            for word in set(text):
                vocab[word] += 1
            for postag in set(tag):
                pos_vocab[postag] += 1
            datum = {"y": 0,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    return revs, vocab, pos_vocab
for idx, chunk_id in enumerate(chunks):
    if not os.path.exists('evidence/evidence_pos_{0:04d}.json'.format(chunk_id)):
        data_path = 'pubmed20n{0:04d}.json'.format(chunk_id)
        if not os.path.exists(data_path):
            continue
        evi_output = []
        ctx_output = []
        data = json.load(open(data_path))
        for item in data:
            results = process(item)
            evi_output += results[0]
            ctx_output.append(results[1])
        pos_list = st.tag_sents(o['pos'] for o in evi_output)
        for _idx in range(len(evi_output)):
            evi_output[_idx]['pos'] = pos_list[_idx]
        with open('evidence/evidence_pos_{0:04d}.json'.format(chunk_id), 'w') as f:
            json.dump(evi_output, f)
        with open('evidence/contexts_{0:04d}.json'.format(chunk_id), 'w') as f:
            json.dump(ctx_output, f)
    else:
        evi_output = json.load(
            open('evidence/evidence_pos_{0:04d}.json'.format(chunk_id)))
    total += len(evi_output)
    print('%d/%d; Processing %s; Number of evidence: %d; Total: %d'
          % (idx + 1, len(chunks), chunk_id, len(evi_output), total))
class FeatureMaker:
    _sentence_data = None
    _split_data = None
    _stf_pos_tagger = None
    _stf_parser = None
    _pos_list = []
    _neg_list = []

    def __init__(self, data):
        self._split_data = data
        self._sentence_data = [" ".join(line) for line in self._split_data]

    def _pos_tag_sent(self, sent):
        # text = word_tokenize("And now for something completely different")
        return nltk.pos_tag(sent)

    def _sf_pos_tag_sent(self, sent):
        return self._stf_pos_tagger.tag(sent)

    def prefix_suffix(self):
        prefix_2 = []
        prefix_3 = []
        suffix_2 = []
        suffix_3 = []
        for line in self._split_data:
            prefix_2.append([w[:2] for w in line])
            prefix_3.append([w[:3] for w in line])
            suffix_2.append([w[-2:] for w in line])
            suffix_3.append([w[-3:] for w in line])
        return [prefix_2, prefix_3, suffix_2, suffix_3]

    def fast_pos_tag(self):
        tag_result = [[token[1] for token in self._pos_tag_sent(line)]
                      for line in self._split_data]
        return tag_result

    def pos_tag(self):
        if self._stf_pos_tagger is None:
            self._stf_pos_tagger = StanfordPOSTagger(
                'english-bidirectional-distsim.tagger')
        # tag in batches of 1000 sentences to bound each Java call
        index = 0
        tag_result = []
        while index < len(self._split_data):
            temp = self._stf_pos_tagger.tag_sents(
                self._split_data[index:index + 1000])
            tag_result.extend(temp)
            index += 1000
            print("pos:" + str(index), end=' ')
        tag_result = [[unidecode(p[1]) for p in line] for line in tag_result]
        return tag_result

    def parser(self):
        if self._stf_parser is None:
            self._stf_parser = StanfordParser(
                model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        result = self._stf_parser.parse_sents(self._split_data)
        result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
        for i in result:
            print(i)

    def per_word_length(self):
        wl_result = [[len(w) for w in line] for line in self._split_data]
        return wl_result

    def sentence_avg_word_length(self):
        wl_result = self.per_word_length()
        wl_result = [np.mean(line) for line in wl_result]
        return wl_result

    def sentence_length(self):
        sl_result = [len(line) for line in self._split_data]
        return sl_result

    def sentence_length_mean_sd(self):
        return np.mean(self.sentence_length()), np.std(self.sentence_length())

    def load_sentiment_list(self):
        if not self._pos_list:
            with open("./../pos_neg/positive-words.txt", mode='r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._pos_list.append(line)
        if not self._neg_list:
            with open("./../pos_neg/negative-words.txt", mode='r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._neg_list.append(line)
        return [self._pos_list, self._neg_list]

    def sentiment_sequence(self):
        sentiment_data = []
        for line in self._split_data:
            sentiment_line = []
            for word in line:
                if word in self._pos_list:
                    sentiment_line.append("POS")
                elif word in self._neg_list:
                    sentiment_line.append("NEG")
                else:
                    sentiment_line.append("NON")
            sentiment_data.append(sentiment_line)
        return sentiment_data

    def get_read_measure(self):
        value_list = []
        for cat, data in readability.getmeasures(self._sentence_data, lang='en').items():
            print('%s:' % cat)
            for key, val in data.items():
                print((' %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.'))
                value_list.append(val)
        # return all collected values, not just the last one
        return value_list
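A minimal driver for FeatureMaker, assuming pre-tokenized input and the sentiment word lists at the relative paths hard-coded above.

fm = FeatureMaker([['this', 'movie', 'was', 'great'],
                   ['a', 'dull', 'plot']])
fm.load_sentiment_list()
print(fm.sentiment_sequence())  # e.g. [['NON', 'NON', 'NON', 'POS'], ['NON', 'NEG', 'NON']]
print(fm.sentence_length())     # [4, 3]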