def _load_semeval_by_filepath(self, train_filepath, test_filepath, val_filepath=None):
    """Parse sentence-level XML files into lists of single-sentence documents.

    Every ``sentence`` tag found in a file becomes one ``AbsaSentence``
    holding its aspect terms and aspect categories, and each sentence is
    then wrapped in its own ``AbsaDocument``.

    :param train_filepath: path of the training file, or None to skip it
    :param test_filepath: path of the test file, or None to skip it
    :param val_filepath: path of the dev/validation file, or None to skip it
    :return: tuple ``(train_data, dev_data, test_data)``; each element is a
        list of ``AbsaDocument`` or None when the matching path was None
    """
    data_type_and_datas = {}
    data_type_and_filepath = {
        'train': train_filepath,
        'test': test_filepath,
        'dev': val_filepath,
    }
    for data_type, filepath in data_type_and_filepath.items():
        if filepath is None:
            data_type_and_datas[data_type] = None
            continue

        content = file_utils.read_all_content(filepath)
        soup = BeautifulSoup(content, "lxml")

        sentences = []
        # Tag names are queried in lower case: the "lxml" HTML parser
        # lower-cases tag and attribute names while parsing.
        for sentence_tag in soup.find_all('sentence'):
            text = sentence_tag.text

            aspect_terms = []
            for aspect_term_tag in sentence_tag.find_all('aspectterm'):
                term = aspect_term_tag['term']
                # Only a *missing* polarity attribute falls back to
                # 'positive'; any other error is allowed to propagate
                # instead of being silently swallowed.
                polarity = aspect_term_tag.get('polarity', 'positive')
                from_index = aspect_term_tag['from']
                to_index = aspect_term_tag['to']
                aspect_terms.append(
                    AspectTerm(term, polarity, from_index, to_index))

            aspect_categories = []
            for aspect_category_tag in sentence_tag.find_all('aspectcategory'):
                category = aspect_category_tag['category']
                # Same missing-attribute fallback as for aspect terms.
                polarity = aspect_category_tag.get('polarity', 'positive')
                aspect_categories.append(AspectCategory(category, polarity))

            sentences.append(
                AbsaSentence(text, None, aspect_categories, aspect_terms))

        # One document per sentence.
        data_type_and_datas[data_type] = [
            AbsaDocument(sentence.text, None, None, None, [sentence])
            for sentence in sentences
        ]

    train_data = data_type_and_datas['train']
    dev_data = data_type_and_datas['dev']
    test_data = data_type_and_datas['test']
    return train_data, dev_data, test_data
def _load_train_dev_test_data_by_filepath(self, train_filepath, test_filepath):
    """Parse review-level XML files into multi-sentence documents.

    Each ``review`` tag becomes one ``AbsaDocument`` whose text is the
    concatenation (no separator) of its sentences' texts. Inside a sentence,
    an ``opinion`` tag carrying a ``target`` attribute yields an
    ``AspectTerm``; one without it yields an ``AspectCategory``.

    :param train_filepath: path of the training file, or None to skip it
    :param test_filepath: path of the test file, or None to skip it
    :return: tuple ``(train_data, dev_data, test_data)``; ``dev_data`` is
        always None, the other two are lists of ``AbsaDocument`` or None
        when the matching path was None
    """
    loaded = []
    for path in [train_filepath, test_filepath]:
        if path is None:
            loaded.append(None)
            continue

        markup = BeautifulSoup(file_utils.read_all_content(path), "lxml")

        documents = []
        for review_tag in markup.find_all('review'):
            review_sentences = []
            sentence_texts = []
            for sent_tag in review_tag.find_all('sentence'):
                sent_text = sent_tag.text
                terms = []
                categories = []
                for opinion in sent_tag.find_all('opinion'):
                    category = opinion['category']
                    polarity = opinion['polarity']
                    if 'target' not in opinion.attrs:
                        # No explicit target: category-level opinion only.
                        categories.append(AspectCategory(category, polarity))
                        continue
                    target = opinion['target']
                    start = opinion['from']
                    end = opinion['to']
                    terms.append(
                        AspectTerm(target, polarity, start, end, category))

                absa_sentence = AbsaSentence(sent_text, None, categories, terms)
                review_sentences.append(absa_sentence)
                sentence_texts.append(absa_sentence.text)

            document_text = ''.join(sentence_texts)
            documents.append(
                AbsaDocument(document_text, None, None, None, review_sentences))

        loaded.append(documents)

    # Only train and test files are read here; there is no dev split.
    train_data, test_data = loaded
    dev_data = None
    return train_data, dev_data, test_data