def clear_data(data):
    """Clean every instance's text and re-anchor its aspect spans.

    For each instance the text is passed through ``clean_str`` and each
    aspect term is rewritten so that its ``from``/``to`` offsets refer to
    the cleaned text: ``from`` becomes the length of the cleaned prefix
    preceding the aspect, and ``to`` is ``from`` plus the length of the
    cleaned aspect string.

    Aspect terms whose polarity is ``"conflict"`` are dropped, and an
    instance that ends up with no aspect terms is omitted from the result.

    Args:
        data: iterable of dicts with keys ``'id'``, ``'text'`` and
            ``'opinions'``; ``'opinions'`` holds an ``'aspect_term'`` list
            whose items carry ``'term'``, ``'polarity'``, ``'category'``,
            ``'from'`` and ``'to'``.

    Returns:
        A list of dicts ``{'id', 'text', 'opinion'}`` where ``'opinion'`` is
        the list of cleaned aspect dicts
        (``'aspect'``, ``'category'``, ``'polarity'``, ``'from'``, ``'to'``).
    """
    cleaned_instances = []
    for record in data:
        record_id = record['id']
        raw_text = record['text']
        term_entries = record['opinions']['aspect_term']
        cleaned_text = clean_str(raw_text)

        kept_opinions = []
        for entry in term_entries:
            term = entry['term']
            polarity = entry['polarity']
            category = entry['category']
            if polarity != "conflict":
                span_from = int(entry['from'])
                span_to = int(entry['to'])
                term_clean = clean_str(term)
                prefix_clean = clean_str(raw_text[:span_from])

                if span_to == 0:
                    # An end offset of 0 is passed through as a (0, 0) span
                    # (presumably an aspect with no explicit mention --
                    # confirm against the dataset format).
                    new_from = 0
                    new_to = 0
                else:
                    new_from = len(prefix_clean)
                    new_to = len(prefix_clean) + len(term_clean)

                kept_opinions.append({
                    'aspect': term_clean,
                    'category': category,
                    'polarity': polarity,
                    'from': new_from,
                    'to': new_to,
                })

        if kept_opinions:
            cleaned_instances.append(
                {'id': record_id, 'text': cleaned_text, 'opinion': kept_opinions})
    return cleaned_instances
# Example #2
# 0
def create_rows(xlsxes):
  """Group workbook rows into conversations and split them by movie.

  Each workbook's active sheet is read row by row (the first row is
  skipped, presumably a header). Column 0 holds the movie, columns 1 and 2
  hold up to two sentences, and column 3 holds the situation. Consecutive
  rows sharing the same movie and situation form one conversation, stored
  as a list of tagged ``Comment`` objects.

  Movies are numbered in order of appearance starting at 1. A conversation
  goes to ``test`` when its movie number is a multiple of 10, to ``valid``
  when the number is 1 modulo 10, and to ``train`` otherwise, so every
  conversation of a movie lands in the same split.

  Args:
      xlsxes: iterable of workbook paths accepted by ``load_workbook``.

  Returns:
      A ``(train, valid, test)`` tuple of lists of conversations.

  Raises:
      Any exception raised while converting a cell into a ``Comment`` is
      propagated to the caller.
  """
  train = []
  valid = []
  test = []

  def bucket_for(movie_number):
    # Single place that maps a movie number to its destination split.
    remainder = movie_number % 10
    if remainder == 0:
      return test
    if remainder == 1:
      return valid
    return train

  r = []
  movie_id = 0
  previous_movie = -1
  previous_situation = -1
  for xlsx in tqdm(xlsxes):
    wb = load_workbook(xlsx)
    ws = wb.active
    for i, row in enumerate(tqdm(ws.rows, total=ws.max_row)):
      if i == 0:
        continue

      movie = row[0].value
      situation = row[3].value
      sents = (row[1].value, row[2].value)

      movie_changed = movie != previous_movie
      if movie_changed or situation != previous_situation:
        if r:
          # Flush BEFORE bumping movie_id: the pending conversation belongs
          # to the previous movie and must be bucketed with that movie's
          # number, otherwise it leaks into the next movie's split.
          bucket_for(movie_id).append(r)
          r = []
      if movie_changed:
        movie_id += 1

      for sent_idx, sent in enumerate(sents):
        if sent is None:
          continue
        # Numeric cells and cells formatted as text ('@') are coerced to str
        # before cleaning.
        if isinstance(sent, int) or row[sent_idx + 1].number_format == '@':
          sent = str(sent)
        # Failures propagate instead of dropping into an interactive
        # debugger, which would hang unattended runs.
        c = Comment(clean_str(sent))
        add_tags(c)
        r.append(c)

      previous_movie = movie
      previous_situation = situation

    # A conversation never spans two workbooks; skip the flush when the
    # workbook contributed nothing so no empty conversation is recorded.
    if r:
      bucket_for(movie_id).append(r)
      r = []

  return train, valid, test
# Example #3
# 0
def create_rows(xlsxes):
    """Group workbook rows into conversations and split them round-robin.

    Each workbook's active sheet is scanned row by row. Rows before the
    first row whose column 0 equals ``'S'`` are ignored. From then on, every
    ``'S'`` row starts a new conversation, and the value in column 1 of each
    row is cleaned, wrapped in a ``Comment``, tagged and appended to the
    current conversation.

    Conversations are numbered from 0 in order of completion. A
    conversation goes to ``test`` when its number is a multiple of 10, to
    ``valid`` when the number is 1 modulo 10, and to ``train`` otherwise.

    Args:
        xlsxes: iterable of workbook paths accepted by ``load_workbook``.

    Returns:
        A ``(train, valid, test)`` tuple of lists of conversations.
    """
    train = []
    valid = []
    test = []

    def bucket_for(conv_number):
        # Single place that maps a conversation number to its split.
        remainder = conv_number % 10
        if remainder == 0:
            return test
        if remainder == 1:
            return valid
        return train

    r = []
    conv_id = 0
    for xlsx in xlsxes:
        first_s_seen = False
        wb = load_workbook(xlsx)
        ws = wb.active
        for row in tqdm(ws.rows):
            starts_conversation = row[0].value == 'S'
            if not first_s_seen:
                if not starts_conversation:
                    # Still in the preamble before the first 'S' marker.
                    continue
                first_s_seen = True

            if starts_conversation and r:
                # A new 'S' row closes the conversation collected so far.
                bucket_for(conv_id).append(r)
                conv_id += 1
                r = []

            c = Comment(clean_str(row[1].value))
            add_tags(c)
            r.append(c)

        # Close the last conversation of this workbook. A workbook without
        # any 'S' row leaves r empty; skip it so no empty conversation is
        # stored and conv_id is not advanced for nothing.
        if r:
            bucket_for(conv_id).append(r)
            conv_id += 1
            r = []

    return train, valid, test