def get_data_from_file(pos_filepath, neg_filepath):
    """Load binary-classification samples from two one-sentence-per-line files.

    :param pos_filepath: path to the file of positive samples
    :param neg_filepath: path to the file of negative samples
    :return: list of (text, label) tuples; label is 0 for negative, 1 for positive,
        with all negative samples first (same ordering as the input files)
    """
    negatives = file_utils.read_all_lines(neg_filepath)
    positives = file_utils.read_all_lines(pos_filepath)
    # Negatives first to preserve the original dataset ordering.
    return [(text, 0) for text in negatives] + [(text, 1) for text in positives]
def _load_train_dev_test_data_by_filepath(self, train_filepath, test_filepath):
    """Parse train/test files written in the 3-lines-per-sample ABSA format.

    Each sample spans three consecutive lines:
        line i:   sentence with the aspect replaced by the placeholder ``$T$``
        line i+1: the aspect term
        line i+2: the polarity label (``-1``/``0``/``1``)

    :param train_filepath: path to the training file
    :param test_filepath: path to the test file
    :return: tuple (train_documents, None, test_documents); there is no dev split
    """
    data_type_and_filepath = {'train': train_filepath, 'test': test_filepath}
    data_type_and_data = {}
    # Hoisted out of the loop: the mapping is constant across files.
    polarity_mapping = {'-1': 'negative', '0': 'neutral', '1': 'positive'}
    for data_type, filepath in data_type_and_filepath.items():
        lines = file_utils.read_all_lines(filepath)
        sentences = []
        for i in range(0, len(lines), 3):
            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
            aspect = lines[i + 1].lower().strip()
            polarity = lines[i + 2].strip()
            # Rebuild the full sentence and compute the aspect's character span.
            if text_left != '':
                text = text_left + " " + aspect
                from_index = len(text_left) + 1
            else:
                text = aspect
                from_index = 0
            if text_right != '':
                text = text + ' ' + text_right
            to_index = from_index + len(aspect)
            if text[from_index: to_index] != aspect:
                # BUG FIX: the original omitted the '%' operator here, which made
                # this expression a *call* on the format string and raised
                # TypeError ('str' object is not callable) instead of logging.
                logger.error('error aspect index: %s != %s'
                             % (text[from_index: to_index], aspect))
            aspect_term = AspectTerm(aspect, polarity_mapping[polarity], from_index, to_index)
            sentence = AbsaSentence(text, None, None, [aspect_term])
            sentences.append(sentence)
        documents = [AbsaDocument(sentence.text, None, None, None, [sentence])
                     for sentence in sentences]
        data_type_and_data[data_type] = documents
    return data_type_and_data['train'], None, data_type_and_data['test']
def _get_bert_word_segmenter(self):
    """Build an English BERT tokenizer from the configured vocabulary file.

    Token ids are assigned by order of appearance in the vocab file
    (one token per line).
    """
    vocab = {}
    for raw_line in file_utils.read_all_lines(self.bert_vocab_file_path):
        vocab[raw_line.strip()] = len(vocab)
    return bert_tokenizer.EnglishTokenizer(vocab)
def generate_line_dicts(filepath: str):
    """Lazily yield one parsed JSON object per line of a JSON-lines file.

    :param filepath: path to a file with one JSON document per line
    :return: generator over the deserialized objects
    """
    for raw_line in file_utils.read_all_lines(filepath):
        yield json.loads(raw_line)
def get_pair_data_from_file(filepath):
    """Load sentence-pair classification samples.

    Each input line has the form ``text1\ttext2\tlabel``.

    :param filepath: path to the tab-separated sample file
    :return: list of (text1, text2, int(label)) tuples
    """
    result = []
    for line in file_utils.read_all_lines(filepath):
        fields = line.split('\t')
        result.append((fields[0], fields[1], int(fields[2])))
    return result
def load_samples(self):
    """Return only the 'hard' samples of the SemEval-2014-Task-4 REST MIL data.

    Loads every sample from the MIL json file, then keeps those whose
    lower-cased text appears in the hard-sentence list.
    """
    base_dir = common_path.get_task_data_dir('absa', is_original=True)
    mil_dir = os.path.join(base_dir, 'SemEval-2014-Task-4-REST',
                           'SemEval-2014-Task-4-REST-mil')
    all_samples = super()._load_samples_by_filepath(
        os.path.join(mil_dir, 'SemEval-2014-Task-4-REST-mil.json'))
    hard_sentences = set(file_utils.read_all_lines(
        os.path.join(mil_dir, 'SemEval-2014-Task-4-REST-hard-mil.txt')))
    return [sample for sample in all_samples
            if sample.text.lower() in hard_sentences]
def predict(self):
    """Run the span-based predictor over the test split(s) and inspect
    sentences listed in an external file for polarity disagreements.

    NOTE(review): this looks like ad-hoc debugging/analysis code — it reads a
    hard-coded Windows path and ends in a bare ``print()`` (presumably a
    breakpoint anchor). Consider parameterizing the path before reuse.
    """
    # Use the configured GPU when CUDA is available, otherwise CPU (-1).
    USE_GPU = torch.cuda.is_available()
    if USE_GPU:
        gpu_id = self.configuration['gpu_id']
    else:
        gpu_id = -1
    predictor = pytorch_models.SpanBasedModelPredictor(
        self.model, self.val_iterator, self.distinct_polarities,
        configuration=self.configuration, cuda_device=gpu_id)
    data_type_and_data = {
        # 'train': self.train_data,
        # 'dev': self.dev_data,
        'test': self.test_data
    }
    if self.hard_test_data:
        data_type_and_data['hard_test'] = self.hard_test_data
    for data_type, data_temp in data_type_and_data.items():
        # for multi
        # NOTE(review): hard-coded local path; re-read on every data_type iteration.
        correct_sentences = file_utils.read_all_lines(
            'd:/correct_sentences.txt')
        for sentence in correct_sentences:
            # Collect all instances whose text contains this sentence fragment.
            data = []
            for instance in data_temp:
                text = instance.fields['sample'].metadata['text']
                # i love the keyboard and the screen. ()
                # The best thing about this laptop is the price along with some of the newer features.
                if sentence in text:
                    data.append(instance)
            result = predictor.predict(data)
            # NOTE(review): assumes at least two predictions with at least one
            # aspect term each; raises IndexError otherwise — confirm intent.
            if result[0]['aspect_terms'][0].polarity == 'neutral' or result[
                    1]['aspect_terms'][0].polarity == 'neutral':
                continue
            for e in result:
                sentiment_outputs_for_aspect_terms = e[
                    'sentiment_outputs_for_aspect_terms']
                aspect_terms = e['aspect_terms']
                # Flag aspect terms whose gold polarity differs from the
                # model's predicted polarity (index 1 of the output pair).
                for i in range(len(aspect_terms)):
                    if aspect_terms[i].polarity != 'neutral' and aspect_terms[
                            i].polarity != sentiment_outputs_for_aspect_terms[
                            i][1]:
                        print()  # breakpoint anchor for interactive debugging
def load_csv_data(filepath, skip_first_line=True):
    """Read a CSV file, parsing each pre-split line independently.

    Because every line is fed to ``csv.reader`` on its own, quoted fields
    containing embedded newlines are not supported. Rows whose column count
    differs from the first row's are printed for inspection but still kept.

    :param filepath: path to the csv file
    :param skip_first_line: drop the header row when True
    :return: list of rows, each a list of string fields
    """
    rows = []
    for line in file_utils.read_all_lines(filepath):
        for parsed_row in csv.reader([line]):
            rows.append(parsed_row)
            # Surface ragged rows early; the first row compares to itself.
            if len(parsed_row) != len(rows[0]):
                print(parsed_row)
    return rows[1:] if skip_first_line else rows
def _load_train_dev_test_data(self):
    """Load the Restaurants ACSA train/dev/test splits from a pickled dump.

    Reads an index-encoded pickle (word/category indices plus a polarity
    index), decodes each sample back to text via ``index_word``, maps the
    decoded text to its canonical form through ``sentence_map.txt``, and
    wraps everything into AbsaDocument/AbsaSentence/AspectCategory objects.

    :return: list of three datasets [train, dev, test], each a list of
        AbsaDocument instances (one single-sentence document per distinct text)
    """
    sentence_map_filepath = os.path.join(base_data_dir, 'ABSA_DevSplits', 'dataset',
                                         'sentence_map.txt')
    # Maps decoded (space-joined) text -> original sentence text.
    sentence_map = {
        line.split('\t')[0]: line.split('\t')[1]
        for line in file_utils.read_all_lines(sentence_map_filepath,
                                              strip_type='line_separator')
    }
    data_filepath = os.path.join(base_data_dir, 'ABSA_DevSplits', 'dataset',
                                 'Restaurants_category.pkl')
    polarity_index_and_text = {0: 'negative', 1: 'positive', 2: 'neutral'}
    datasets = []
    with open(data_filepath, mode='rb') as in_file:
        content = in_file.read()
        # Normalize line endings byte-by-byte before unpickling — presumably
        # the .pkl was corrupted by a text-mode checkout (CRLF); verify.
        # SECURITY NOTE: pickle.loads executes arbitrary code; only load
        # trusted dataset files.
        content_correct = b''
        for line in content.splitlines():
            content_correct += line + str.encode('\n')
        data = pickle.loads(content_correct, encoding='utf-8')
        # data = pickle.load(in_file, encoding='utf-8')
        datasets_indexed = [data['train'], data['dev'], data['test']]
        index2word = data['index_word']
        for dataset_indexed in datasets_indexed:
            dataset = []
            # Group aspect categories by decoded sentence text, since one
            # sentence may carry several (category, polarity) annotations.
            text_and_categories = {}
            for sample in dataset_indexed:
                # sample layout: [0]=word indices, [2]=category indices,
                # [4]=polarity index — TODO confirm against the dump's writer.
                words = [index2word[index] for index in sample[0]]
                text = ' '.join(words)
                category = [index2word[index] for index in sample[2]][0]
                polarity = polarity_index_and_text[sample[4]]
                aspect_category = AspectCategory(category, polarity)
                if text not in text_and_categories:
                    text_and_categories[text] = []
                text_and_categories[text].append(aspect_category)
            for text, categories in text_and_categories.items():
                # Recover the canonical sentence text for the decoded form.
                text = sentence_map[text]
                sentence = AbsaSentence(text, None, categories, None)
                document = AbsaDocument(sentence.text, None, None, None,
                                        [sentence])
                dataset.append(document)
            datasets.append(dataset)
    return datasets
ate_result_filepath = args.ate_result_filepath_template triplet_result_filepath = args.so_result_filepath_template else: ate_result_filepath = args.ate_result_filepath_template % i triplet_result_filepath = args.so_result_filepath_template % i if not os.path.exists(ate_result_filepath): print('not exist: %s' % ate_result_filepath) continue if not os.path.exists(triplet_result_filepath): print('not exist: %s' % triplet_result_filepath) continue ate_pred = read_ate_result(ate_result_filepath) so_pred = file_utils.read_all_lines(triplet_result_filepath) triplets_pred = get_sentence_and_triplets_pred(so_pred, ate_pred) asote_metrics_of_multi_runs.append( evaluate_asote(triplets_true, triplets_pred)) print_precision_recall_f1(asote_metrics_of_multi_runs, 'asote_metrics_of_multi_runs') print('-' * 100) asote_metrics_of_multi_runs = [] for i in range(run_num): if args.debug: ate_result_filepath = args.ate_result_filepath_template triplet_result_filepath = args.so_result_filepath_template
# -*- coding: utf-8 -*-
"""Scan a training log and collect test accuracies and result-file paths."""
import sys
import json
from nlp_tasks.utils import file_utils

input_filepath = sys.argv[1]
accs = []
result_filepath_of_test = []
for log_line in file_utils.read_all_lines(input_filepath):
    # Accuracy lines look like:
    #   ...atsa_train_templates.py-361 ... data_type: test result {'sentiment_acc': ...}
    if 'atsa_train_templates.py-361' in log_line and 'data_type: test result' in log_line:
        json_start = log_line.index('{')
        # The log prints a Python dict; swap quotes so json can parse it.
        performances = json.loads(log_line[json_start:].replace('\'', '"'))
        accs.append(float('%.5f' % performances['sentiment_acc']))
        continue
    if 'result_of_predicting_tes' in log_line:
        result_filepath_of_test.append(log_line[log_line.index(':') + 1:])

print('accs:')
print(','.join([str(e) for e in accs]))
print('filepaths: %s' % str(result_filepath_of_test))
if len(opinion_terms) == 0: opinion_terms = ['-'] for opinion_term in opinion_terms: golden_aspect_opinion_pairs.add('%s-%s-%s' % (text, aspect_term, opinion_term)) golden_aspect_opinion_sentiment_triplets.add( '%s-%s-%s-%s' % (text, aspect_term, opinion_term, sentiment)) aspect_sentiment_pair_metrics_all = defaultdict(list) aspect_opinion_pair_metrics_all = defaultdict(list) aste_metrics_all = defaultdict(list) for i in range(5): ate_result_filepath = configuration['ate_result_filepath_template'] % i # print('ate_result_filepath = "%s"' % ate_result_filepath) ate_lines = file_utils.read_all_lines(ate_result_filepath) pred_text_aspects = set() for line in ate_lines: line_dict = json.loads(line) for aspect_term in line_dict['pred']: text_aspect = '%s-%s' % (line_dict['text'], aspect_term) pred_text_aspects.add(text_aspect) atsa_result_filepath = configuration['atsa_result_filepath_template'] % i # print('atsa_result_filepath = "%s"' % atsa_result_filepath) atsa_lines = file_utils.read_all_lines(atsa_result_filepath) pred_text_aspect_sentiment = {} for line in atsa_lines: line_dict = json.loads(line) text = line_dict['text']