Example #1
import random
import sys
from math import ceil

import dl  # project-local data helpers (make_label_map, make_pickle)


def main():

  input_file, output_prefix = sys.argv[1:]

  dataset = []

  labels = ["1.0", "2.0", "3.0", "4.0", "5.0"]
  label_map = dl.make_label_map(labels)

  with open(input_file, 'r') as in_file:
    for line in in_file:
      # Tab-separated sentence pairs with a 0-5 average similarity score.
      (avg_score, _, p1, p2, _) = line.split("\t")
      p1_words = [word.lower() for word in p1.split()]
      p2_words = [word.lower() for word in p2.split()]
      # ceil() returns an int in Python 3, so format it to match the
      # "N.0" label strings; a score of 0.0 is folded into the "1.0" bin.
      label = "%.1f" % ceil(float(avg_score))
      if label == '0.0':
        label = '1.0'
      dataset.append((p1_words, p2_words, label_map[label]))

  # Fixed seed so the split is reproducible across runs.
  random.Random(137).shuffle(dataset)

  # Reserve the first 4000 shuffled examples (presumably for a test
  # split produced elsewhere), then carve 10% of the rest off as dev.
  train_len = len(dataset) - 4000
  dev_len = int(0.1 * float(train_len))

  dev = dataset[4000:(4000 + dev_len)]
  train = dataset[(4000 + dev_len):]

  for split, filename in zip([train, dev], ["train", "dev"]):
    out_file = output_prefix + filename + ".pickle"
    dl.make_pickle(split, label_map, out_file)
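
All five examples lean on a small dl helper module that is never shown. A minimal sketch of what make_label_map and make_pickle would have to do, inferred from the call sites (the exact pickle layout is an assumption):

import pickle


def make_label_map(labels):
    # Assumed behavior: assign each label a stable integer id in the
    # order given, e.g. {"1.0": 0, "2.0": 1, ...} for Example #1.
    return {label: i for i, label in enumerate(labels)}


def make_pickle(dataset, label_map, out_file):
    # Assumed behavior: serialize the examples together with the label
    # map so downstream code can decode the integer labels.
    with open(out_file, 'wb') as f:
        pickle.dump({"dataset": dataset, "label_map": label_map}, f)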
Example #2
import sys

import dl


def main():

    input_prefix, output_prefix = sys.argv[1:]

    # read_dataset_sentences is sketched after this example.
    train = read_dataset_sentences(input_prefix + '/train.label')
    for dataset, filename in zip([train], ["train"]):
        out_file = output_prefix + filename + ".pickle"
        # Derive the label inventory from the data itself.
        labels = sorted(set(label for question, label in dataset))
        label_map = dl.make_label_map(labels)
        new_dataset = [(question, label_map[label])
                       for question, label in dataset]
        dl.make_pickle(new_dataset, label_map, out_file)
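
Example #2's read_dataset_sentences is also not shown. The train.label filename and the (question, label) tuples suggest a TREC-style question-classification file; a plausible sketch under that assumption:

def read_dataset_sentences(path):
    # Assumed format: one example per line, "LABEL word word ...", as in
    # the TREC question-classification distribution. Tokens are lowered
    # here because main() never lowers them later.
    dataset = []
    with open(path, 'r') as f:
        for line in f:
            label, _, question = line.strip().partition(' ')
            dataset.append(([word.lower() for word in question.split()],
                            label))
    return dataset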
Example #3
import sys

import dl


def main():

    input_prefix, output_prefix, window_size = sys.argv[1:]
    window_size = int(window_size)

    # read_dataset_sentences and make_windows are sketched after this
    # example.
    train = read_dataset_sentences(input_prefix + '/train.txt', window_size)
    valid = read_dataset_sentences(input_prefix + '/dev.txt', window_size)
    train_windows = make_windows(train, window_size)
    valid_windows = make_windows(valid, window_size)

    for dataset, filename in zip([train_windows, valid_windows],
                                 ["train", "dev"]):
        out_file = output_prefix + filename + ".pickle"
        # Note: each split derives its own label map, so ids can disagree
        # if a tag is missing from one split; Example #4 works around this.
        labels = sorted(set(label for window, label in dataset))
        label_map = dl.make_label_map(labels)
        new_dataset = [(window, label_map[label]) for window, label in dataset]
        dl.make_pickle(new_dataset, label_map, out_file)
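
Examples #3 and #4 share two helpers that are likewise not shown. One plausible sketch, assuming a CoNLL-style "word<TAB>tag" input with blank lines between sentences, and padding so a window can be centered on every real token:

PAD = "<PAD>"


def read_dataset_sentences(path, window_size):
    # Assumed format: one "word<TAB>tag" pair per line, blank lines
    # between sentences; each sentence is padded on both sides so a
    # window fits around every real token.
    sentences = []
    current = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split('\t')
                current.append((word.lower(), tag))
            elif current:
                pad = [(PAD, None)] * window_size
                sentences.append(pad + current + pad)
                current = []
    if current:
        pad = [(PAD, None)] * window_size
        sentences.append(pad + current + pad)
    return sentences


def make_windows(sentences, window_size):
    # For every real (non-pad) token, emit the surrounding
    # (2 * window_size + 1)-token window plus the center token's tag.
    windows = []
    for sentence in sentences:
        for i in range(window_size, len(sentence) - window_size):
            window = [word for word, _ in
                      sentence[i - window_size:i + window_size + 1]]
            windows.append((window, sentence[i][1]))
    return windows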
Example #4
import sys

import dl


def main():

    input_prefix, output_prefix, window_size = sys.argv[1:]
    window_size = int(window_size)

    train = read_dataset_sentences(input_prefix + '/train.txt', window_size)
    train_windows = make_windows(train, window_size)

    train_labels = sorted(set(label for window, label in train_windows))
    # 'I-LST' never occurs in the training data, so add it by hand to
    # keep the label map complete; re-sort so ids stay deterministic.
    train_labels.append('I-LST')
    labels = sorted(set(train_labels))
    label_map = dl.make_label_map(labels)

    for dataset, filename in zip([train_windows], ["train"]):
        out_file = output_prefix + filename + ".pickle"
        new_dataset = [(window, label_map[label]) for window, label in dataset]
        dl.make_pickle(new_dataset, label_map, out_file)
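
A quick note on the label handling in Example #4: because the sketched make_label_map assigns ids by position, appending the missing tag and re-sorting keeps the ids deterministic across runs (the tag set looks like CoNLL-2000 chunking, where LST list-marker chunks are rare). A toy check, using stand-in tag names:

# Toy stand-ins for real chunk tags; 'I-LST' is added by hand exactly
# as in main() above.
train_labels = ['B-NP', 'I-NP', 'O']
train_labels.append('I-LST')
label_map = dl.make_label_map(sorted(set(train_labels)))
assert 'I-LST' in label_map  # id order: B-NP, I-LST, I-NP, O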
Example #5
import dl


def construct_sentiment_dataset(input_prefix):

    # Map the six sentiment bins to binary labels; the neutral bin (3.0)
    # maps to None and is dropped below.
    binary_label_map = {
        0.0: 0.0,
        1.0: 0.0,
        2.0: 0.0,
        3.0: None,
        4.0: 1.0,
        5.0: 1.0
    }

    dataset_sentences = read_dataset_sentences(input_prefix)
    dictionary = read_dictionary(input_prefix)
    sentiment_labels = read_sentiment_labels(input_prefix)
    train, test, dev = read_dataset_split(input_prefix)
    new_train = []
    new_test = []
    new_dev = []
    for old, new in zip([train, test, dev], [new_train, new_test, new_dev]):
        temp = []
        labels = set()
        for sent_id in old:
            phrase = dataset_sentences[sent_id]
            # Undo the PTB-style bracket escaping used in the corpus files.
            phrase = phrase.replace("-LRB-", "(").replace("-RRB-", ")")
            string_label = sentiment_labels[dictionary[phrase]]
            binary_sentiment_label = get_binary_label(binary_label_map,
                                                      string_label)
            # Skip neutral phrases, which have no binary label.
            if binary_sentiment_label is not None:
                temp.append((phrase.split(), binary_sentiment_label))
                labels.add(binary_sentiment_label)
        # Note: label_map is rebuilt per split; since every split contains
        # both binary labels, the map returned below is the same for all.
        label_map = dl.make_label_map(sorted(labels))
        for phrase, label in temp:
            lowered_phrase = [word.lower() for word in phrase]
            new.append((lowered_phrase, label_map[label]))

    return new_train, new_test, new_dev, label_map
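
Example #5's readers walk what looks like the Stanford Sentiment Treebank distribution (sentences, a phrase dictionary, per-phrase sentiment scores, and a three-way split), and get_binary_label has to turn a raw score into one of the map's 0.0-5.0 keys. A plausible sketch, assuming the raw scores are the corpus's [0, 1] values and the usual five-way bucketing:

from math import ceil


def get_binary_label(binary_label_map, string_label):
    # Assumed behavior: scale the [0, 1] sentiment score by 5 and take
    # the ceiling, giving one of the 0.0 .. 5.0 keys; scores in
    # (0.4, 0.6] fall in the neutral bucket and map to None.
    return binary_label_map[float(ceil(float(string_label) * 5.0))]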