def clear_data(data):
    """Clean review instances and recompute aspect offsets on the cleaned text.

    Args:
        data: Iterable of mappings with the keys ``'id'``, ``'text'`` and
            ``'opinions'``. ``instance['opinions']['aspect_term']`` is an
            iterable of mappings with the keys ``'term'``, ``'polarity'``,
            ``'category'``, ``'from'`` and ``'to'``.

    Returns:
        A list of dicts ``{'id', 'text', 'opinion'}`` where ``'text'`` is the
        result of ``clean_str`` and ``'opinion'`` is a list of dicts with the
        keys ``'aspect'``, ``'category'``, ``'polarity'``, ``'from'`` and
        ``'to'``. Aspect terms whose polarity is ``"conflict"`` are dropped,
        and instances left without any aspect term are omitted entirely.
    """
    cleaned_instances = []
    for instance in data:
        instance_id = instance['id']
        text = instance['text']
        aspect_terms = instance['opinions']['aspect_term']
        text_clean = clean_str(text)

        opinion_clean = []
        for term in aspect_terms:
            aspect = term['term']
            polarity = term['polarity']
            category = term['category']
            if polarity == "conflict":
                continue

            from_index = int(term['from'])
            to_index = int(term['to'])
            aspect_clean = clean_str(aspect)
            # Offsets are re-derived by cleaning the text that precedes the
            # aspect and measuring its length.
            # NOTE(review): this assumes clean_str(prefix) has the same length
            # as the prefix's footprint inside clean_str(text); if clean_str
            # strips trailing whitespace the offsets may be short - confirm.
            start_clean = clean_str(text[:from_index])

            if to_index == 0:
                # A zero end offset is passed through as the span 0-0 rather
                # than being recomputed.
                start, end = 0, 0
            else:
                start = len(start_clean)
                end = start + len(aspect_clean)

            opinion_clean.append({
                'aspect': aspect_clean,
                'category': category,
                'polarity': polarity,
                'from': start,
                'to': end,
            })

        if not opinion_clean:
            continue
        cleaned_instances.append(
            {'id': instance_id, 'text': text_clean, 'opinion': opinion_clean})
    return cleaned_instances
def create_rows(xlsxes):
    """Group workbook rows into conversations and split them by movie.

    For every workbook the active sheet is read; its first row is skipped.
    In each remaining row, cell 0 holds the movie, cell 3 the situation and
    cells 1 and 2 hold up to two utterances. Consecutive rows that share the
    same movie and situation form one conversation: a list of ``Comment``
    objects built from the cleaned utterances and passed through ``add_tags``.

    A running movie counter (incremented each time the movie value changes,
    so the first movie is 1) selects the split: counter % 10 == 0 goes to
    test, counter % 10 == 1 goes to valid, everything else goes to train.

    Args:
        xlsxes: Iterable of workbook paths accepted by ``load_workbook``.

    Returns:
        A ``(train, valid, test)`` tuple of lists of conversations.
    """
    train = []
    valid = []
    test = []

    def split_for(index):
        """Return the output list selected by ``index`` modulo 10."""
        remainder = index % 10
        if remainder == 0:
            return test
        if remainder == 1:
            return valid
        return train

    conversation = []
    movie_id = 0
    previous_movie = -1
    previous_situation = -1
    for xlsx in tqdm(xlsxes):
        wb = load_workbook(xlsx)
        ws = wb.active
        for i, row in enumerate(tqdm(ws.rows, total=ws.max_row)):
            if i == 0:
                # First row of every sheet is skipped (presumably a header).
                continue
            movie = row[0].value
            situation = row[3].value
            cells = (row[1], row[2])

            movie_changed = movie != previous_movie
            if movie_changed or situation != previous_situation:
                # Flush BEFORE advancing movie_id so the finished conversation
                # is filed under the movie it belongs to, not the next one.
                if conversation:
                    split_for(movie_id).append(conversation)
                    conversation = []
            if movie_changed:
                movie_id += 1

            for cell in cells:
                sent = cell.value
                if sent is None:
                    continue
                # Integer values and cells formatted as text ('@') are
                # converted to str before cleaning.
                if isinstance(sent, int) or cell.number_format == '@':
                    sent = str(sent)
                # Failures propagate to the caller: a value that cannot be
                # converted must not be replaced by a stale Comment.
                c = Comment(clean_str(sent))
                add_tags(c)
                conversation.append(c)

            previous_movie = movie
            previous_situation = situation

        # Flush what is left at the end of the workbook; the guard avoids
        # emitting an empty conversation when nothing was collected.
        # NOTE(review): this closes a conversation at every workbook boundary
        # even if the next workbook continues the same movie/situation.
        if conversation:
            split_for(movie_id).append(conversation)
            conversation = []
    return train, valid, test
def create_rows(xlsxes):
    """Group workbook rows into conversations delimited by 'S' marker rows.

    For every workbook the active sheet is read. Rows before the first row
    whose cell 0 equals ``'S'`` are ignored. From then on every row
    contributes one ``Comment`` built from the cleaned value of cell 1 and
    passed through ``add_tags``; a row whose cell 0 equals ``'S'`` starts a
    new conversation.

    Conversations are numbered from 0 in the order they are completed, and
    the number selects the split: number % 10 == 0 goes to test,
    number % 10 == 1 goes to valid, everything else goes to train.

    Args:
        xlsxes: Iterable of workbook paths accepted by ``load_workbook``.

    Returns:
        A ``(train, valid, test)`` tuple of lists of conversations. Empty
        conversations are never emitted, so an empty ``xlsxes`` yields three
        empty lists.
    """
    train = []
    valid = []
    test = []

    def split_for(index):
        """Return the output list selected by ``index`` modulo 10."""
        remainder = index % 10
        if remainder == 0:
            return test
        if remainder == 1:
            return valid
        return train

    conversation = []
    conv_id = 0
    for xlsx in xlsxes:
        first_s_seen = False
        wb = load_workbook(xlsx)
        ws = wb.active
        for row in tqdm(ws.rows):
            marker = row[0].value
            if not first_s_seen:
                if marker != 'S':
                    # Nothing is collected until the sheet's first 'S' row.
                    continue
                first_s_seen = True

            if marker == "S" and conversation:
                # An 'S' row closes the conversation collected so far.
                split_for(conv_id).append(conversation)
                conv_id += 1
                conversation = []

            c = Comment(clean_str(row[1].value))
            add_tags(c)
            conversation.append(c)

        # Flush what is left at the end of the workbook; the guard avoids
        # emitting an empty conversation (and consuming a conv_id) when the
        # sheet produced no rows.
        if conversation:
            split_for(conv_id).append(conversation)
            conv_id += 1
            conversation = []
    return train, valid, test