class SemEval2014Task4_Laptops(__SemEval2014Task4):
    """ SemEval 2014 Task 4 Laptop Dataset for Aspect based Sentiment Analysis.
        Download: http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools
    """

    # local path and download url for the train/test xml files
    TRAIN_FILE = FilePath(
        "SemEval2014-Task4/Laptops_Train.xml",
        "https://raw.githubusercontent.com/pedrobalage/SemevalAspectBasedSentimentAnalysis/master/semeval_data/Laptop_Train_v2.xml"
    )
    TEST_FILE = FilePath(
        "SemEval2014-Task4/laptops-trial.xml",
        "https://alt.qcri.org/semeval2014/task4/data/uploads/laptops-trial.xml"
    )

    # number of items per split
    n_train_items = lambda self: 3045
    n_eval_items = lambda self: 100

    def get_aspect_label_pairs(self, sentence):
        """ Collect (term, polarity) pairs from the <aspectTerms> child of a
            sentence element; returns an empty list when the node is absent.
        """
        terms_node = sentence.find('aspectTerms')
        if terms_node is None:
            return []
        return [
            (term.attrib['term'], term.attrib['polarity'])
            for term in terms_node
        ]
class SemEval2014Task4_Category(__SemEval2014Task4):
    """ SemEval 2014 Task 4 Aspect-Category Dataset for Aspect based Sentiment Analysis.
        Only provides examples for aspect-categories (not explicitly mentioned in the text).
        Download: http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools
    """

    # local path and download url for the train/test xml files
    TRAIN_FILE = FilePath(
        "SemEval2014-Task4/Restaurants_Train.xml",
        "https://raw.githubusercontent.com/pedrobalage/SemevalAspectBasedSentimentAnalysis/master/semeval_data/Restaurants_Train_v2.xml"
    )
    TEST_FILE = FilePath(
        "SemEval2014-Task4/restaurants-trial.xml",
        "https://alt.qcri.org/semeval2014/task4/data/uploads/restaurants-trial.xml"
    )

    # number of items per split
    n_train_items = lambda self: 3041
    n_eval_items = lambda self: 100

    def get_aspect_label_pairs(self, sentence):
        """ Collect (category, polarity) pairs from the <aspectCategories>
            child of a sentence element; returns an empty list when absent.
        """
        # only get categories
        aspect_categories = sentence.find('aspectCategories')
        # build aspect label pairs; reuse the node found above instead of
        # re-querying the sentence, and drop the former no-op
        # `if aspect is not None` filter (iterating an ElementTree element
        # never yields None children)
        aspect_label_pairs = []
        if aspect_categories is not None:
            aspect_label_pairs += [
                (aspect.attrib['category'], aspect.attrib['polarity'])
                for aspect in aspect_categories
            ]
        # return
        return aspect_label_pairs
class SemEval2015Task12(AOEx_Dataset):
    """ SemEval 2015 Task 12 dataset for Aspect-Opinion Extraction
        Download: https://github.com/happywwy/Coupled-Multi-layer-Attentions/tree/master/util/data_semEval
    """

    # number of items per split
    n_train_items = lambda self: 1316
    n_eval_items = lambda self: 686

    def yield_items(self, aspect_fname:str, sent_opinion_fname:str):
        """ Read the aspect-term file and the sentence##opinions file in
            lockstep and yield one AOEx_DatasetItem per line pair.
        """
        # resolve both files relative to the dataset base directory
        aspect_file = self.data_base_dir / aspect_fname
        sent_opinion_file = self.data_base_dir / sent_opinion_fname
        # read file contents; 'NULL' marks sentences without aspects
        with open(aspect_file, 'r', encoding='utf-8') as f:
            aspect_lines = f.read().replace('NULL', '').split('\n')
        with open(sent_opinion_file, 'r', encoding='utf-8') as f:
            sent_opinion_lines = f.read().split('\n')
        # both files must align line by line
        assert len(aspect_lines) == len(sent_opinion_lines)

        for sent_opinions, aspect_line in zip(sent_opinion_lines, aspect_lines):
            # the sentence is separated from its opinion list by '##'
            if '##' in sent_opinions:
                sent, opinion_str = sent_opinions.split('##')
            else:
                sent, opinion_str = sent_opinions, ''
            # each opinion entry ends in a 3-character sentiment suffix that is cut off
            opinions = [o.strip()[:-3] for o in opinion_str.split(',')] if len(opinion_str) > 0 else []
            aspects = [a.strip() for a in aspect_line.split(',')] if len(aspect_line) > 0 else []
            # locate each term in the sentence to build character spans
            opinion_spans = []
            for term in opinions:
                begin = sent.find(term)
                opinion_spans.append((begin, begin + len(term)))
            aspect_spans = []
            for term in aspects:
                begin = sent.find(term)
                aspect_spans.append((begin, begin + len(term)))
            # yield dataset item
            yield AOEx_DatasetItem(
                sentence=sent,
                aspect_spans=aspect_spans,
                opinion_spans=opinion_spans
            )

    # train split: FilePath bundles the local path and the download url
    yield_train_items = lambda self: self.yield_items(
        aspect_fname=FilePath(
            "SemEval2015-Task12/aspectTerm_res15",
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/aspectTerm_res15"
        ),
        sent_opinion_fname=FilePath(
            "SemEval2015-Task12/sentence_res15_op",
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_res15_op"
        )
    )
    # eval split
    yield_eval_items = lambda self: self.yield_items(
        aspect_fname=FilePath(
            "SemEval2015-Task12/aspectTerm_restest15",
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/aspectTerm_restest15"
        ),
        sent_opinion_fname=FilePath(
            "SemEval2015-Task12/sentence_restest15_op",
            "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_restest15_op"
        )
    )
class SemEval2015Task12_AspectPolarity(__SemEval2015Task12):
    """ Dataset for the SemEval 2015 Task 12 data for Aspect-based Sentiment Analysis
        Download: http://alt.qcri.org/semeval2015/task12/index.php?id=data-and-tools
    """

    # polarity labels; items carry indices into this list
    LABELS = ['positive', 'neutral', 'negative']

    # local path and download url for the train/eval xml files
    TRAIN_FILE = FilePath(
        "SemEval2015-Task12/ABSA-15_Restaurants_Train_Final.xml",
        "https://raw.githubusercontent.com/peace195/aspect-based-sentiment-analysis/master/data/ABSA_SemEval2015/Restaurants_Train_Final.xml"
    )
    EVAL_FILE = FilePath(
        "SemEval2015-Task12/ABSA15_Restaurants_Test.xml",
        "https://raw.githubusercontent.com/peace195/aspect-based-sentiment-analysis/master/data/ABSA_SemEval2015/Restaurants_Test.xml"
    )

    # number of items per split
    n_train_items = lambda self: 833
    n_eval_items = lambda self: 402

    def yield_items(self, fpath: str) -> iter:
        """ Parse one ABSA-15 xml file and yield a NEC_DatasetItem per
            sentence that has at least one opinion with a valid target span.
        """
        root = ET.parse(fpath).getroot()
        # reviews each hold their sentences under their first child node
        for review in root:
            for sent in review[0].findall('sentence'):
                text = sent.find('text').text
                opinions = sent.find('Opinions')
                # skip sentences without annotated opinions
                if opinions is None:
                    continue
                # keep only opinions with a real aspect target (from < to)
                spans, polarities = [], []
                for opinion in opinions:
                    begin = int(opinion.attrib['from'])
                    end = int(opinion.attrib['to'])
                    if begin < end:
                        spans.append((begin, end))
                        polarities.append(opinion.attrib['polarity'])
                # no valid aspects found
                if len(spans) == 0:
                    continue
                # build dataset item
                yield NEC_DatasetItem(
                    sentence=text,
                    entity_spans=spans,
                    labels=[
                        SemEval2015Task12_AspectPolarity.LABELS.index(p)
                        for p in polarities
                    ])
class SemEval2015Task12_OpinionPolarity(__SemEval2015Task12):
    """ Dataset for the SemEval 2015 Task 12 data for Opinion-based Sentiment Analysis
        Download: https://github.com/happywwy/Coupled-Multi-layer-Attentions/tree/master/util/data_semEval
    """

    # sentiment labels; items carry indices into this list
    LABELS = ['positive', 'negative']

    # local path and download url for the train/eval files
    TRAIN_FILE = FilePath(
        "SemEval2015-Task12/sentence_res15_op",
        "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_res15_op"
    )
    EVAL_FILE = FilePath(
        "SemEval2015-Task12/sentence_restest15_op",
        "https://raw.githubusercontent.com/happywwy/Coupled-Multi-layer-Attentions/master/util/data_semEval/sentence_restest15_op"
    )

    # number of items per split
    n_train_items = lambda self: 760
    n_eval_items = lambda self: 333

    def yield_items(self, fpath: str) -> iter:
        """ Parse a 'sentence##opinion, opinion, ...' file and yield one
            NEC_DatasetItem per sentence, labeling each opinion span.
        """
        # load file content
        with open(fpath, 'r', encoding='utf-8') as f:
            all_sents_opinions = f.read().split('\n')
        # preprocess data
        for sent_opinions in all_sents_opinions:
            # no opinions annotated for this sentence
            if '##' not in sent_opinions:
                continue
            # separate sentence from opinions
            sent, opinions = sent_opinions.split('##')
            opinions = [o.strip() for o in opinions.split(',')] if len(opinions) > 0 else []
            # BUGFIX: lines with an empty opinion part ("sentence##") used to
            # reach zip(*[]) whose unpack raised ValueError; treat them the
            # same as lines without any '##' marker and skip
            if len(opinions) == 0:
                continue
            # each entry is "<term> ±1": split the term from its 2-char sentiment code
            opinions, sentiments = zip(*[(o[:-2].strip(), o[-2:]) for o in opinions])
            # locate opinion terms in the sentence to build character spans
            opinion_pos = [sent.find(o) for o in opinions]
            opinion_spans = [(i, i + len(o)) for i, o in zip(opinion_pos, opinions)]
            # map sentiment code to label index: '+1' -> 0 (positive), '-1' -> 1 (negative)
            sentiments = [(-int(i) + 1) // 2 for i in sentiments]
            # build dataset item
            yield NEC_DatasetItem(
                sentence=sent, entity_spans=opinion_spans, labels=sentiments)
class SemEval2010Task8(RelExDataset):
    """ SemEval2010 Task8 Dataset
        Download: https://github.com/sahitya0000/Relation-Classification/blob/master/corpus/SemEval2010_task8_all_data.zip
    """

    # local path and download url for the train/eval files
    TRAIN_FILE = FilePath(
        "SemEval2010-Task8/SemEval2010_task8_training/TRAIN_FILE.TXT",
        "https://raw.githubusercontent.com/sahitya0000/Relation-Classification/master/corpus/SemEval2010_task8_training/TRAIN_FILE.TXT"
    )
    EVAL_FILE = FilePath(
        "SemEval2010-Task8/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT",
        "https://raw.githubusercontent.com/sahitya0000/Relation-Classification/master/corpus/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT"
    )

    # set of valid relation labels; items carry indices into this list
    LABELS = [
        "Other",
        "Component-Whole(e2,e1)", "Component-Whole(e1,e2)",
        "Instrument-Agency(e2,e1)", "Instrument-Agency(e1,e2)",
        "Member-Collection(e2,e1)", "Member-Collection(e1,e2)",
        "Cause-Effect(e2,e1)", "Cause-Effect(e1,e2)",
        "Entity-Destination(e2,e1)", "Entity-Destination(e1,e2)",
        "Content-Container(e2,e1)", "Content-Container(e1,e2)",
        "Message-Topic(e2,e1)", "Message-Topic(e1,e2)",
        "Product-Producer(e2,e1)", "Product-Producer(e1,e2)",
        "Entity-Origin(e2,e1)", "Entity-Origin(e1,e2)"
    ]

    # number of items per split
    n_train_items = lambda self: 8000
    n_eval_items = lambda self: 2717

    # yield training and evaluation items
    yield_train_items = lambda self: self.yield_item(
        self.data_base_dir / SemEval2010Task8.TRAIN_FILE)
    yield_eval_items = lambda self: self.yield_item(
        self.data_base_dir / SemEval2010Task8.EVAL_FILE)

    def yield_item(self, fpath: str) -> iter:
        """ Parse a SemEval2010 Task8 file and yield one RelExDatasetItem per
            example, with entity spans relative to the de-marked sentence.
        """
        with open(fpath, 'r', encoding='utf-8') as f:
            lines = f.read().strip().split('\n')
        # examples come in 4-line records: sentence, relation, comment, blank
        for sent_line, relation_line in zip(lines[::4], lines[1::4]):
            # the sentence follows a tab and is wrapped in double quotes
            sent = sent_line.split('\t')[1].strip()
            assert sent[0] == sent[-1] == '"'
            sent = sent[1:-1]
            # locate the <e1>...</e1> and <e2>...</e2> marked entities;
            # each tag pair adds len('<eN>') + len('</eN>') = 4 + 5 = 9 chars
            match_a = re.search(r'<e1>(.*)</e1>', sent)
            match_b = re.search(r'<e2>(.*)</e2>', sent)
            span_a = (match_a.start(), match_a.end() - 9)
            span_b = (match_b.start(), match_b.end() - 9)
            # whichever entity occurs later also shifts left by the 9 marker
            # characters of the earlier one
            if span_a[0] < span_b[0]:
                span_b = (span_b[0] - 9, span_b[1] - 9)
            else:
                span_a = (span_a[0] - 9, span_a[1] - 9)
            # strip the markers from the text
            sent = re.sub(r'<(/?)e1>', '', sent)
            sent = re.sub(r'<(/?)e2>', '', sent)
            # the relation label is the whole second record line
            label = relation_line.strip()
            yield RelExDatasetItem(
                sentence=sent,
                source_entity_span=span_a,
                target_entity_span=span_b,
                relation_type=SemEval2010Task8.LABELS.index(label))