def pos_tag(texts):
    """POS-tag a list of tokenized sentences with the Stanford POS tagger.

    Relies on the module-level globals ``config`` (installation paths) and
    ``language`` (model selection).

    @param texts: List of tokenized sentences (each a list of token strings).
    @return: List of (token, tag) tuple lists, one per input sentence.
    @raise ValueError: If ``language`` is neither "german" nor "english".
    """
    from nltk.tag.stanford import POSTagger
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    # Map supported languages to their model files.  Failing fast here is
    # clearer than the original behavior, where an unsupported language left
    # ``model`` unbound and raised a confusing NameError below.
    models = {
        "german": "analyze/SPOS/models/german-fast.tagger",
        "english": "analyze/SPOS/models/english-bidirectional-distsim.tagger",
    }
    if language not in models:
        raise ValueError("unsupported language: %r" % (language,))
    model = config.mainpath + models[language]
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")
    return tagger.tag_sents(texts)
def add_POS(self, row_file, target):
    """POS-tag the tokens of ``target`` and write them to ``pos_<target>``.

    Output format: one "<original token> <TAG>" pair per line, with a blank
    line between sentences.

    @param row_file: Unused; kept for interface compatibility with callers.
    @param target: Dataset name passed to self.get_token() and used to build
        the output filename.
    """
    all_token = self.get_token(target)
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    # Lowercase words that start with capital letters (len > 1) before
    # tagging so the tagger is less likely to treat them as proper nouns;
    # the original casing is restored from ``all_token`` when writing.
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # (Removed a stray no-op literal ``1`` that followed the append above.)
    tagged_result = stanford_tagger.tag_sents(tag_list)
    # Write the result; close the file even if a write fails (the original
    # never closed it at all).
    w = open('pos_%s' % target, 'wb')
    try:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
    finally:
        w.close()
    return
def add_POS(self, row_file, target):
    """POS-tag the tokens of ``target`` and write them to ``pos_<target>``.

    Output format: one "<original token> <TAG>" pair per line, with a blank
    line between sentences.

    @param row_file: Unused; kept for interface compatibility with callers.
    @param target: Dataset name passed to self.get_token() and used to build
        the output filename.
    """
    all_token = self.get_token(target)
    stanford_tagger = \
        POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    # Lowercase words that start with capital letters (len > 1) before
    # tagging so the tagger is less likely to treat them as proper nouns;
    # the original casing is restored from ``all_token`` when writing.
    tag_list = list()
    for row in all_token:
        temp_list = list()
        for word in row:
            if len(word) > 1 and re.match(r'^[A-Z]+', word):
                temp_list.append(word.lower())
            else:
                temp_list.append(word)
        tag_list.append(temp_list)
    # (Removed a stray no-op literal ``;1`` that followed the append above.)
    tagged_result = stanford_tagger.tag_sents(tag_list)
    # Write the result; close the file even if a write fails (the original
    # never closed it at all).
    w = open('pos_%s' % target, 'wb')
    try:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
    finally:
        w.close()
    return
def generate_pos_set(self): print '正在构建正性集词典....' pos_dict = dict() pos_set = set() sentences = list() for row in self.train_label: for key in row: if ' ' in key: sentences.append(self.tk.word_tokenize(key)) else: pos_dict[key] = pos_dict.setdefault(key, 0) + 1 #pos_set.add(key); #end for st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\ ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar') result = st.tag_sents(sentences) for row in result: for item in row: if item[1].startswith('NN'): pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1 #pos_set.add(item[0]); #end for neg_dict = dict() for num, row in enumerate(self.tagged_train_data): for item in row: if item[1].startswith( 'NN') and item[0] not in self.train_word_label[num]: neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1 for key in pos_dict.keys(): if pos_dict[key] > 1: if neg_dict.has_key(key): if neg_dict[key] / pos_dict[key] < 2: pos_set.add(key) else: pos_set.add(key) self.pos_set = pos_set print '完成!' return
def generate_pos_set(self): print '正在构建正性集词典....'; pos_dict = dict(); pos_set=set(); sentences = list(); for row in self.train_label: for key in row: if ' ' in key: sentences.append(self.tk.word_tokenize(key)); else: pos_dict[key] = pos_dict.setdefault(key,0) + 1; #pos_set.add(key); #end for st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\ ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar'); result = st.tag_sents(sentences); for row in result: for item in row: if item[1].startswith('NN'): pos_dict[item[0]] = pos_dict.setdefault(item[0],0) + 1; #pos_set.add(item[0]); #end for neg_dict = dict(); for num,row in enumerate(self.tagged_train_data): for item in row : if item[1].startswith('NN') and item[0] not in self.train_word_label[num]: neg_dict[item[0]] = neg_dict.setdefault(item[0],0) + 1; for key in pos_dict.keys(): if pos_dict[key] > 1: if neg_dict.has_key(key): if neg_dict[key]/pos_dict[key] < 2: pos_set.add(key); else: pos_set.add(key); self.pos_set=pos_set; print '完成!'; return;
class POSTagSelector:
    """Substitution selector that keeps only candidates whose POS tag
    (assigned by the Stanford POS Tagger) matches the target word's tag."""

    def __init__(self, pos_model, stanford_tagger, java_path):
        """
        Creates a POSTagSelector instance.

        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable. Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
        """
        os.environ['JAVAHOME'] = java_path
        self.tagger = POSTagger(pos_model, stanford_tagger)

    def selectCandidates(self, substitutions, victor_corpus):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

        @param substitutions: Candidate substitutions to be filtered.
        It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions.
        Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector.
        Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format.
        For more information about the file's format, refer to the LEXenstein Manual.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        selected_substitutions = []

        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary or a list!')
            return selected_substitutions

        # Read VICTOR corpus (sentence \t target \t head index per line):
        lexf = open(victor_corpus)
        sents = []
        targets = []
        heads = []
        words = set([])
        c = -1
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip().split(' ')
            target = data[1].strip()
            head = int(data[2].strip())
            sents.append(sent)
            targets.append(target)
            heads.append(head)
            words.update(set(substitution_candidates[c]))
        # Close once here; the original redundantly closed this file a
        # second time after the selection loop below.
        lexf.close()

        # Tag sentences:
        tagged_sents = self.tagger.tag_sents(sents)

        # Tag each candidate word in isolation to get its most likely tag:
        words = list(words)
        words_sents = [[w] for w in words]
        tagged_words = self.tagger.tag_sents(words_sents)
        word_to_tag = {}
        for i in range(0, len(words)):
            word_to_tag[words[i]] = tagged_words[i][0][1]

        # Keep only candidates whose tag matches the target's tag:
        for i in range(0, len(sents)):
            head = heads[i]
            target_pos = str(tagged_sents[i][head][1])
            # (Removed a dead ``candidates = []`` that was immediately
            # overwritten here.)
            candidates = set(substitution_candidates[i])
            candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
            selected_substitutions.append(candidates)

        return selected_substitutions

    def getTargetPOS(self, sent, target, head):
        """Return the POS tag of the head word of ``sent`` via nltk.pos_tag,
        falling back to tagging ``target`` alone, or 'None' on repeated
        UnicodeDecodeError."""
        pos_data = []
        try:
            pos_data = nltk.pos_tag(sent)
            return pos_data[head][1]
        except UnicodeDecodeError:
            try:
                pos_data = nltk.pos_tag(target)
                return pos_data[0][1]
            except UnicodeDecodeError:
                return 'None'

    def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
        """Return the subset of ``candidates`` whose tag in ``word_to_tag``
        equals ``target_pos``; unknown words are dropped."""
        result = set([])
        for candidate in candidates:
            # Membership test on the dict itself, not on a .keys() copy.
            if candidate in word_to_tag:
                if word_to_tag[candidate] == target_pos:
                    result.add(candidate)
        return result

    def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
        """
        Saves a set of selected substitutions in a file in VICTOR format.

        @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
        @param substitutions: The vector of substitutions selected for the VICTOR corpus.
        @param output_path: The path in which to save the resulting VICTOR corpus.
        @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
        """
        o = open(output_path, 'w')
        f = open(victor_corpus)
        for subs in substitutions:
            data = f.readline().strip().split('\t')
            sentence = data[0].strip()
            target = data[1].strip()
            head = data[2].strip()
            newline = sentence + '\t' + target + '\t' + head + '\t'
            # All candidates are written with a placeholder rank of 0.
            for sub in subs:
                newline += '0:' + sub + '\t'
            o.write(newline.strip() + '\n')
        f.close()
        o.close()