def process_single_file(self, file):
    """Extract sentence n-grams (tokens and POS tags) from one KAF/NAF file.

    Parses *file*, groups its tokens into sentences, and writes every
    n-gram of length min_ngram_len..max_ngram_len to the per-length output
    stream returned by self.get_file_desc_for_ngram().  Each output line is
    ``tok1\\ttok2...\\tDELIMITER\\tpos1\\tpos2...``.  Files that fail to parse
    are reported on stderr and skipped.  Also counts the file's language
    in self.langs.
    """
    try:
        xml_obj = KafNafParser(file)
    except Exception:
        # Batch processing: a broken input file is skipped, not fatal.
        # (Was a bare except; narrowed so Ctrl-C still interrupts.)
        sys.stderr.write('Error parsing %s : skipped\n' % file)
        return
    sys.stderr.write('Processing file %s Type: %s\n' % (os.path.basename(file), xml_obj.get_type()))
    self.langs[xml_obj.get_language()] += 1

    # Map each token id (wid) to the POS of the term spanning it.
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        pos = term.get_pos()
        # Single span lookup (original fetched the span ids twice and left
        # an unused local).
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = pos

    # Group the tokens into sentences using the token's sentence id.
    sentences = []
    current_sent = []
    this_sent = None
    for token in xml_obj.get_tokens():
        wid = token.get_id()
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        # Normalize punctuation and the SRILM-reserved '*' symbol.
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'
        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # Sentence id changed: close the previous sentence.
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((wid, value))
        this_sent = sentence
    # Add the last sentence as well.
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            # 'xxx' is a dummy wid; it maps to POS 'X' below.
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))
        for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
            # Hoisted out of the position loop: one lookup per length,
            # not one per (position, length) pair.
            file_desc = self.get_file_desc_for_ngram(ngramlen)
            # range is empty when the sentence is shorter than ngramlen,
            # matching the original `end <= len(sentence)` guard.
            for start in range(0, len(sentence) - ngramlen + 1):
                window = sentence[start:start + ngramlen]
                this_ngram = '\t'.join(value for wid, value in window)
                this_ngram_pos = '\t'.join(pos_for_wid.get(wid, 'X') for wid, value in window)
                file_desc.write(this_ngram.encode('utf-8') + '\t' + DELIMITER + '\t' + this_ngram_pos + '\n')
def extract_all_features():
    """Extract training features for every file listed by load_training_files().

    For each training file a ``file#<n>#<basename>.feat`` file is written
    into the configured feature folder, and a ``#<filename>`` header line is
    appended to the expression-target and expression-holder relation files.
    Finally the feature-label description is written to the feature-desc
    file.  Relies on the module-level ``my_config_manager`` object and logs
    progress through ``logging``.
    """
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')
    feat_folder = my_config_manager.get_feature_folder_name()
    # Filled in by the calls to extract_features_from_kaf_naf_file below;
    # the values from the last iteration are the ones saved at the end.
    label_feats = separator = None
    # Remember the real stdout/stderr so they can be restored at the end.
    my_stdout, my_stderr = sys.stdout,sys.stderr
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename,'w')
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename,'w')
    filename_features_polarity_classifier = my_config_manager.get_filename_features_polarity_classifier()
    fd_filename_features_polarity_classifier = open(filename_features_polarity_classifier,'w')
    ## Configuration for the relational classifier
    use_these_lexicons = []
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    #accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=OPINION_EXPRESSION)
    accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=None)
    mapping_positive_negative = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        kaf_naf_obj = KafNafParser(train_file)
        print>>sys.stderr,'Extracting features from',train_file
        if num_file == 0:
            # The first time we load the lexicons; the language is taken
            # from the first training file.
            lang = kaf_naf_obj.get_language()
            use_these_lexicons = load_lexicons(my_config_manager,lang)
        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file, accepted_opinions=accepted_opinions, lexicons = use_these_lexicons)
        polarities_found_and_skipped.extend(pols_skipped_this)
        # One '#<filename>' header per training file in both relation files.
        print>>exp_tar_rel_fic,'#'+train_file
        print>>exp_hol_rel_fic,'#'+train_file
        # SET valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
        # set it valid_opinions = accepted opinions for filtering
        '''
        create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=None, use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now, use_lemmas=use_toks_lems_now, log=err_file)
        create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=None, use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now, use_lemmas=use_toks_lems_now)
        '''
        ##Extract features for the polarity classifier
        #for mpqa there will be no polarity classifier
        #extract_features_polarity_classifier_from_kaf(kaf_naf_obj,fd_filename_features_polarity_classifier,mapping_positive_negative)
    # NOTE(review): placement of this close() after the loop inferred from
    # the whitespace-mangled source -- confirm against the original file.
    fd_filename_features_polarity_classifier.close()
    ##Show just for information how many instances have been skipped because the polarity of opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)
    ###################################################
    #Re-set the stdout and stderr
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    sys.stdout,sys.stderr = my_stdout, my_stderr
    #Save label_feats and separator in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename,'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)
fic.close() return map if __name__=='__main__': this_folder = os.path.dirname(os.path.realpath(__file__)) if sys.stdin.isatty(): print>>sys.stderr,'Input stream required.' print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0] sys.exit(-1) input_obj = KafNafParser(sys.stdin) my_lang = input_obj.get_language() complete_path_to_treetagger = find_treetagger() if complete_path_to_treetagger is None: print>>sys.stderr,'Treetagger could not be found. You need to specify there treetagger is installed in 2 ways:' print>>sys.stderr,'\t1)Update the TREE_TAGGER_PATH variable in the file lib/__init__.py' print>>sys.stderr,'\t2_Update your TREE_TAGGER_PATH environment variable' sys.exit(0) # In the last version of treetagger all the names of commands have been change from X-utf to just X # /cmd/tree-tagger-english-utf8 ==> /cmd/tree-tagger-english # This could be a problem in case other version of treetagger is being used. if my_lang == 'en': treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english' mapping_file = this_folder +'/mappings/english.map.treetagger.kaf.csv'
def extract_all_features():
    """Extract training features and relation training data for every file
    listed by load_training_files().

    Side effects:
      - writes one ``file#<n>#<basename>.feat`` file per training file into
        the configured feature folder,
      - appends expression-target and expression-holder relation training
        data (one '#<filename>' header per file, then the relations) to the
        two configured relation files,
      - optionally builds/copies domain lexicons when the configuration
        enables training lexicons,
      - writes the feature-label description to the feature-desc file.

    Relies on the module-level ``my_config_manager`` configuration object.
    """
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')
    feat_folder = my_config_manager.get_feature_folder_name()
    # Filled in by the extraction loop; the values of the last iteration
    # are the ones written to the feature-desc file at the end.
    label_feats = separator = None
    # Remember the real stdout/stderr so they can be restored at the end.
    my_stdout, my_stderr = sys.stdout, sys.stderr
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename, 'w')
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename, 'w')

    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        # Guess the language from the first training file.
        first_train_file = train_files[0]
        obj = KafNafParser(first_train_file)
        lang = obj.get_language()
        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()
        if this_exp_lex is None or this_tar_lex is None:
            # No pre-built lexicon supplied: create them from the training data.
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator, training_filename, expression_lexicon_filename, target_lexicon_filename)
        # A supplied lexicon overrides the generated one ($LANG is a
        # placeholder for the detected language).
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG', lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)
        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG', lang)
            shutil.copy(this_tar_lex, target_lexicon_filename)
        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)

    this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
    if this_propagation_lexicon is not None:
        if "$LANG" in this_propagation_lexicon:
            # NOTE(review): `lang` is only bound when training lexicons are
            # enabled above; a "$LANG" placeholder without them would raise
            # NameError -- confirm intended configuration combinations.
            this_propagation_lexicon = this_propagation_lexicon.replace('$LANG', lang)
        sys.stderr.write('Propagated lexicon %s\n' % this_propagation_lexicon)

    ## Configuration for the relational classifier
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()

    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder, 'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        # Creates the output feature file and returns the labels for the
        # features, the separator used, and the skipped polarity labels.
        # (Dead `if True:` / `if False:` debugging scaffolding removed; the
        # disabled branch referenced an undefined name `e`.)
        kaf_naf_obj = KafNafParser(train_file)
        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(
            kaf_naf_obj, out_file, err_file,
            accepted_opinions=accepted_opinions,
            exp_lex=expressions_lexicon,
            tar_lex=targets_lexicon,
            propagation_lex_filename=this_propagation_lexicon)
        polarities_found_and_skipped.extend(pols_skipped_this)
        # One '#<filename>' header per training file in both relation files.
        exp_tar_rel_fic.write('#'+train_file+'\n')
        exp_hol_rel_fic.write('#'+train_file+'\n')
        # valid_opinions=None would use every opinion in the KAF file when
        # extracting relations; here we filter with the accepted mapping.
        create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=accepted_opinions, use_dependencies=use_dependencies_now, use_tokens=use_toks_lems_now, use_lemmas=use_toks_lems_now)
        create_rel_exp_hol_training(kaf_naf_obj, output=exp_hol_rel_fic, valid_opinions=accepted_opinions, use_dependencies=use_dependencies_now, use_tokens=use_toks_lems_now, use_lemmas=use_toks_lems_now)

    ## Show, just for information, how many instances have been skipped
    ## because the polarity of the opinion expression was not allowed.
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info += ' '+label+' :'+str(c)+'\n'
    info += '\n'
    logging.debug(info)

    ###################################################
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    # Re-set the stdout and stderr
    sys.stdout, sys.stderr = my_stdout, my_stderr

    # Save the feature labels in the feature-description file.
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename, 'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)
def extract_all_features():
    """Extract training features and relation training data for all the
    files listed by load_training_files().

    Writes one ``file#<n>#<basename>.feat`` file per training file, appends
    relation training data to the expression-target and expression-holder
    files, optionally builds/copies domain lexicons, and finally stores the
    feature-label description in the feature-desc file.  Configuration is
    read from the module-level ``my_config_manager``.
    """
    train_files = load_training_files()
    logging.debug('Loaded ' + str(len(train_files)) + ' files')
    feat_folder = my_config_manager.get_feature_folder_name()
    # Set by the extraction loop; last iteration's values are persisted.
    label_feats = separator = None
    # Remember the real stdout/stderr so they can be restored at the end.
    my_stdout, my_stderr = sys.stdout, sys.stderr
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename, 'w')
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename, 'w')

    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        # Guess the language from the first training file.
        first_train_file = train_files[0]
        obj = KafNafParser(first_train_file)
        lang = obj.get_language()
        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()
        if this_exp_lex is None or this_tar_lex is None:
            # No pre-built lexicon supplied: create them from training data.
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator,
                                             training_filename,
                                             expression_lexicon_filename,
                                             target_lexicon_filename)
        # A user-supplied lexicon overrides the generated one; "$LANG" is a
        # placeholder for the detected language.
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG', lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)
        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG', lang)
            shutil.copy(this_tar_lex, target_lexicon_filename)
        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)

    this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
    if this_propagation_lexicon is not None:
        if "$LANG" in this_propagation_lexicon:
            # NOTE(review): `lang` is only bound when training lexicons are
            # enabled above; "$LANG" without them raises NameError -- confirm
            # the intended configuration combinations.
            this_propagation_lexicon = this_propagation_lexicon.replace('$LANG', lang)
        sys.stderr.write('Propagated lexicon %s\n' % this_propagation_lexicon)

    ## Configuration for the relational classifier
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()

    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features ' + os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(
            feat_folder, 'file#' + str(num_file) + '#' + base_name + ".feat")
        err_file = out_file + '.log'
        # Creates the output feature file; returns the feature labels, the
        # separator used, and the skipped polarity labels.  The dead
        # `if True:` / `if False:` debugging scaffolding (whose disabled
        # branch referenced an undefined `e`) has been removed.
        kaf_naf_obj = KafNafParser(train_file)
        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(
            kaf_naf_obj,
            out_file,
            err_file,
            accepted_opinions=accepted_opinions,
            exp_lex=expressions_lexicon,
            tar_lex=targets_lexicon,
            propagation_lex_filename=this_propagation_lexicon)
        polarities_found_and_skipped.extend(pols_skipped_this)
        # One '#<filename>' header per training file in both relation files.
        exp_tar_rel_fic.write('#' + train_file + '\n')
        exp_hol_rel_fic.write('#' + train_file + '\n')
        # valid_opinions=None would use every opinion in the KAF file when
        # extracting relations; here we filter with the accepted mapping.
        create_rel_exp_tar_training(kaf_naf_obj,
                                    output=exp_tar_rel_fic,
                                    valid_opinions=accepted_opinions,
                                    use_dependencies=use_dependencies_now,
                                    use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now)
        create_rel_exp_hol_training(kaf_naf_obj,
                                    output=exp_hol_rel_fic,
                                    valid_opinions=accepted_opinions,
                                    use_dependencies=use_dependencies_now,
                                    use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now)

    ## Show, just for information, how many instances were skipped because
    ## the polarity of the opinion expression was not allowed.
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: ' + ' '.join(accepted_opinions.keys()) + '\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info += ' ' + label + ' :' + str(c) + '\n'
    info += '\n'
    logging.debug(info)

    ###################################################
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    # Re-set the stdout and stderr
    sys.stdout, sys.stderr = my_stdout, my_stderr

    # Save the feature labels in the feature-description file.
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename, 'w')
    fic.write(' '.join(label_feats) + '\n')
    fic.close()
    logging.debug('Description of features --> ' + filename)
def process_single_file(self, file):
    """Write the sentence n-grams (token and POS variants) of one KAF/NAF file.

    Parses *file*, splits its token stream into sentences via the token
    sentence ids, and emits every n-gram of length
    min_ngram_len..max_ngram_len as
    ``tok1\\t...\\tDELIMITER\\tpos1\\t...`` to the stream returned by
    self.get_file_desc_for_ngram().  Unparseable files are reported on
    stderr and skipped.  The file's language is tallied in self.langs.
    """
    try:
        xml_obj = KafNafParser(file)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt still propagates;
        # a broken file in a batch run is skipped, not fatal.
        sys.stderr.write('Error parsing %s : skipped\n' % file)
        return
    sys.stderr.write('Processing file %s Type: %s\n' %
                     (os.path.basename(file), xml_obj.get_type()))
    self.langs[xml_obj.get_language()] += 1

    # POS tag for each token id (wid), taken from the covering term.
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        term_pos = term.get_pos()
        # One span-id fetch per term (original called it twice and kept an
        # unused `w_ids` local).
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = term_pos

    # Accumulate (wid, normalized-text) pairs per sentence.
    sentences = []
    current_sent = []
    this_sent = None
    for token in xml_obj.get_tokens():
        wid = token.get_id()
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        # Normalize punctuation and the reserved '*' symbol.
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'
        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # New sentence id: close the one being built.
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((wid, value))
        this_sent = sentence
    # Add the last sentence as well.
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            # 'xxx' is a dummy wid; pos_for_wid.get falls back to 'X' for it.
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))
        for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
            # One stream lookup per n-gram length (hoisted out of the
            # per-position loop).
            file_desc = self.get_file_desc_for_ngram(ngramlen)
            # Empty range when the sentence is shorter than ngramlen --
            # equivalent to the original `end <= len(sentence)` guard.
            for start in range(0, len(sentence) - ngramlen + 1):
                window = sentence[start:start + ngramlen]
                this_ngram = '\t'.join(value for wid, value in window)
                this_ngram_pos = '\t'.join(
                    pos_for_wid.get(wid, 'X') for wid, value in window)
                file_desc.write(this_ngram.encode('utf-8') + '\t' +
                                DELIMITER + '\t' + this_ngram_pos + '\n')