import json

from nltk.corpus import stopwords
from nltk.metrics.agreement import AnnotationTask
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC

# Project helpers (change_some_values, save_to_csv, get_labels,
# get_labelled_tweets, preprocess_tweets, create_directory, save_model,
# save_vectoriser, evaluate) and class_list are assumed to be defined
# elsewhere in this project.


def calculate_kappa(filename):
    # load the annotated labels
    label_list = []
    with open('data/' + filename + '_data_result.json') as json_file:
        tweets = json.load(json_file)
        for row in tweets:
            label_list.append(row['label'])

    # generate two synthetic annotators to calculate kappa
    man_1_label = change_some_values(label_list)
    man_2_label = change_some_values(label_list)

    # save the labels to csv files
    save_to_csv('data/label_1.csv', man_1_label)
    save_to_csv('data/label_2.csv', man_2_label)

    # build (coder, item, label) triples for inter-annotator agreement
    civ_1 = ['c1'] * len(man_1_label)
    civ_2 = ['c2'] * len(man_2_label)
    item_num_list = range(len(man_1_label))
    civ_1 = list(zip(civ_1, item_num_list, man_1_label))
    civ_2 = list(zip(civ_2, item_num_list, man_2_label))
    task_data = civ_1 + civ_2
    task = AnnotationTask(data=task_data)

    # Cohen's kappa between the two annotators
    print('kappa: ' + str(task.kappa()))
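

# A minimal, self-contained sketch (an addition, not part of the original
# pipeline) showing the (coder, item, label) triple format that nltk's
# AnnotationTask expects; the coders, items, and labels are hypothetical.
def kappa_format_example():
    demo = AnnotationTask(data=[
        ('c1', 0, 'pos'), ('c2', 0, 'pos'),  # both coders agree on item 0
        ('c1', 1, 'neg'), ('c2', 1, 'pos'),  # coders disagree on item 1
    ])
    return demo.kappa()  # Cohen's kappa in [-1.0, 1.0]
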
def lin_svc():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    # fit the vectoriser, then transform the tweets into tf-idf vectors
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    # hold out 80% of the data for testing
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')

    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)

    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
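

# A minimal sketch (an addition, not from the original code) showing how the
# vectoriser/classifier pair fitted in lin_svc() could be reused on new raw
# tweets; both arguments are assumed to come from lin_svc() above.
def predict_tweets(fitted_vectoriser, ovr_classifier, raw_tweets):
    """Vectorise raw tweet strings and return the predicted labels."""
    vectors = fitted_vectoriser.transform(raw_tweets)
    return ovr_classifier.predict(vectors)
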
def preprocess(filename):
    # open file
    data = []
    with open('data/' + filename + '_data.json') as json_file:
        tweets = json.load(json_file)
        # load tweet text into a list, dropping non-ascii characters
        for tweet in tweets:
            text = tweet['text'].encode('ascii', 'ignore').decode('ascii')
            data.append(text)

    # preprocess
    stop_words = stopwords.words('english')
    tweet_list = preprocess_tweets(data, stop_words)

    save_to_csv('data/labelled_tweet.csv', tweet_list)

    # save labels
    label_list = []
    with open('data/' + filename + '_data_result.json') as json_file:
        tweets = json.load(json_file)
        for row in tweets:
            label_list.append(row['label'])
    save_to_csv('data/label_api.csv', label_list)
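

# A minimal driver sketch tying the steps together; 'sample' is a
# hypothetical dataset name and must match the files under data/
# (e.g. data/sample_data.json and data/sample_data_result.json).
if __name__ == '__main__':
    dataset = 'sample'  # hypothetical; replace with a real dataset name
    preprocess(dataset)
    calculate_kappa(dataset)
    lin_svc()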