def prepData(filepath, stopfilter, multiword):
    """Read SemEval-format tweets from filepath, lowercase and tokenise them,
    optionally filter stopwords and learn multi-word phrases."""
    print("Preparing data...")

    ret = []  # list of token lists, one per tweet

    print("Reading data...")
    # this reads file in JSON format
    #tweets = readTweets(jsonfilepath)

    # this reads SemEval format tweets
    tweets, _, _, _ = readTweetsOfficial(filepath)
    #tweets = "\n".join(tweets)

    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)

    if multiword:
        return learnMultiword(ret)
    else:
        return ret
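# A minimal usage sketch (the path mirrors those used elsewhere on this page; the flag values are illustrative):
# tokenised = prepData("../data/semeval2016-task6-train+dev.txt", stopfilter=True, multiword=False)
# print(tokenised[0])  # first tweet as a list of lowercased tokens with stopwords removed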
Example no. 2
            else:
                # lines not selected for the "in" file go to the "out" file
                outfOut.write(line)
            cntr += 1

    # close both output files once all lines have been distributed
    outfIn.close()
    outfOut.close()


if __name__ == '__main__':
    testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    devdata = "../data/semEval2016-task6-trialdata_new.txt"
    traindata = "../data/semeval2016-task6-train+dev.txt"

    devbest = "../out/results_all-1e-3-false_conditional-reverse_w2vsmall_hidd60_droptrue_stop-most_pre_cont_accthresh0.98_2.txt"

    tweets_gold, targets_gold, labels_gold, ids_gold = reader.readTweetsOfficial(
        devdata, 'windows-1252', 2)
    # NOTE: this second read duplicates the one above; the *_res variables are not used below
    tweets_res, targets_res, labels_res, ids_res = reader.readTweetsOfficial(
        devdata, 'windows-1252', 2)

    inlist = selectTrainData(tweets_gold, targets_gold)

    printInOutFiles(inlist, devbest, "out_dev_inTwe_cond.txt",
                    "out_dev_outTwe_cond.txt")
    printInOutFiles(inlist, devdata, "_gold_dev_inTwe.txt",
                    "_gold_dev_outTwe.txt")

    print("All")
    writer.eval(devdata, devbest)

    print("Inlist")
    writer.eval("_gold_dev_inTwe.txt", "out_dev_inTwe_cond.txt")
Example no. 3
def transform_labels(labels, dim=3):
    """One-hot encode the stance labels (3-way: NONE/AGAINST/FAVOR, or 2-way: AGAINST/FAVOR).
    Header reconstructed -- the original snippet starts inside the loop; assumes numpy is imported as np."""
    labels_t = []
    for lab in labels:
        v = np.zeros(dim)
        if dim == 3:
            if lab == 'NONE':
                ix = 0
            elif lab == 'AGAINST':
                ix = 1
            elif lab == 'FAVOR':
                ix = 2
        else:
            if lab == 'AGAINST':
                ix = 0
            elif lab == 'FAVOR':
                ix = 1
        v[ix] = 1
        labels_t.append(v)
    return labels_t
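# A minimal usage sketch (hypothetical inputs, not from the original file):
# transform_labels(['FAVOR', 'NONE'], dim=3)
# -> [array([0., 0., 1.]), array([1., 0., 0.])]  -- one one-hot vector per label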


if __name__ == '__main__':
    tweets, targets, labels, ids = reader.readTweetsOfficial("../data/semeval2016-task6-train+dev.txt")
    tweet_tokens = tokenise_tweets(tweets)
    target_tokens = tokenise_tweets(targets)
    count, dictionary, reverse_dictionary = build_dataset(
        [token for senttoks in tweet_tokens + target_tokens for token in senttoks])  # flatten tweets and targets for vocab construction
    transformed_tweets = [transform_tweet_dict(dictionary, senttoks) for senttoks in tweet_tokens]
    transformed_targets = [transform_tweet_dict(dictionary, senttoks) for senttoks in target_tokens]
    transformed_labels = transform_labels(labels)
    print('Longest tweet', len(max(transformed_tweets, key=len)))
    print('Longest target', len(max(transformed_targets, key=len)))
    print('Most common words (+UNK)', count[:5])
    #print('Sample data', data[:10])
Example no. 4
    #     "../data/downloaded_Donald_Trump.txt", "utf-8", 1)
    # tweet_tokens_trump = tokenise_tweets(tweets_trump, stopwords="most")
    #
    # tweets_Gampa, targets_Gampa, labels_Gampa, ids_Gampa = reader.readTweetsOfficial(
    #     "/local/data/haoxu/Rudetect/Gabapentin_0628_0121/final/corpus.csv", "utf-8", 1)
    # tweets_tokens_Gampa = tokenise_tweets(tweets_Gampa, stopwords="most")
    #
    # tweets_unlabelled = reader.readTweets(
    #     "../data/additionalTweetsStanceDetection.json")
    # tweet_tokens_unlabelled = tokenise_tweets(
    #     tweets_unlabelled, stopwords="most")
    #
    # trainWord2VecModel(unk_tokens + tweet_tokens + tweet_tokens_trump + tweet_tokens_unlabelled +
    #                    tweets_tokens_Gampa, "../out/skip_nostop_single_100features_5minwords_5context_big")
    modelname = "/local/data/haoxu/Rudetect_additional/stance-conditional/out/skip_nostop_single_100features_5minwords_5context_big"
    folderPath = "/local/data/haoxu/Rudetect"
    dirnames = [
        d for d in os.listdir(folderPath)
        if os.path.isdir(os.path.join(folderPath, d))
    ]
    final_tweets_tokens = []
    # collect tokenised tweets from every sub-corpus, then train a single word2vec model on all of them
    for dirname in dirnames:
        print(dirname)
        p = os.path.join(folderPath, dirname, 'final', 'corpus.csv')
        tweets, targets, labels, ids = reader.readTweetsOfficial(p, "utf-8")
        tweets_tokens = tokenise_tweets(tweets, stopwords="most")
        final_tweets_tokens += tweets_tokens
    trainWord2VecModel(final_tweets_tokens, modelname)
    # applyWord2VecMostSimilar(
    #     "../out/skip_nostop_single_100features_5minwords_5context_big")
Example no. 5
def readInputAndEval(testSetting,
                     outfile,
                     hidden_size,
                     max_epochs,
                     tanhOrSoftmax,
                     dropout,
                     stopwords="most",
                     testid="test1",
                     modeltype="bicond",
                     word2vecmodel="small",
                     postprocess=True,
                     shortenTargets=False,
                     useAutoTrump=False,
                     useClinton=True,
                     acc_thresh=1.0,
                     pretrain="pre_cont",
                     usePhrases=False):
    """
    Reading input files, calling the trainer for training the model, evaluate with official script
    :param outfile: name for output file
    :param stopwords: how to filter stopwords, see preprocess.filterStopwords()
    :param postprocess: force against/favor for tweets which contain the target
    :param shortenTargets: shorten the target text, see preprocess.transform_targets()
    :param useAutoTrump: use automatically annotated Trump tweets, experimental, not helping at the moment
    :param useClinton: add the Hillary Clinton dev data to train data
    :param testSetting: evaluate on Trump
    """

    if word2vecmodel == "small":
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_single_100features_5minwords_5context")
    else:
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_single_100features_5minwords_5context_big")

    if usePhrases:
        phrasemodel = Phrases.load("../out/phrase_all.model")
        # phrase mode replaces the word2vec model loaded above with the multi-word embeddings
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_multi_100features_5minwords_5context")

    if testSetting == "true":
        trainingdata = "../data/semeval2016-task6-train+dev.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    elif testSetting == "weaklySup":
        trainingdata = "../data/trump_autolabelled.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
        enc = "utf-8"
    else:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"
        testdata = "../data/semEval2016-task6-trialdata_new.txt"
    if useClinton == False:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"

    tweets, targets, labels, ids = reader.readTweetsOfficial(trainingdata,
                                                             encoding=enc)

    # this is for using automatically labelled Donald Trump data in addition to task data
    if useAutoTrump:
        tweets_devaut, targets_devaut, labels_devaut, ids_devaut = reader.readTweetsOfficial(
            "../data/trump_autolabelled.txt", encoding='utf-8')
        # offset the ids so they do not clash with the ids of the official task data
        ids_new = []
        for i in ids_devaut:
            ids_new.append(i + 10000)

        tweets = tweets + tweets_devaut
        targets = targets + targets_devaut
        labels = labels + labels_devaut
        ids = ids + ids_new

    if not usePhrases:
        tweet_tokens = tokenise_tweets(tweets, stopwords)
        if not shortenTargets:
            target_tokens = tokenise_tweets(targets, stopwords)
        else:
            target_tokens = tokenise_tweets(transform_targets(targets),
                                            stopwords)
    else:
        tweet_tokens = phrasemodel[tokenise_tweets(tweets, stopwords)]
        if not shortenTargets:
            target_tokens = phrasemodel[tokenise_tweets(targets, stopwords)]
        else:
            target_tokens = phrasemodel[tokenise_tweets(
                transform_targets(targets), stopwords)]

    transformed_tweets = [
        transform_tweet(w2vmodel, senttoks) for senttoks in tweet_tokens
    ]
    transformed_targets = [
        transform_tweet(w2vmodel, senttoks) for senttoks in target_tokens
    ]
    transformed_labels = transform_labels(labels)

    tweets_test, targets_test, labels_test, ids_test = reader.readTweetsOfficial(
        testdata)

    if not usePhrases:
        tweet_tokens_test = tokenise_tweets(tweets_test, stopwords)
        if not shortenTargets:
            target_tokens_test = tokenise_tweets(targets_test, stopwords)
        else:
            target_tokens_test = tokenise_tweets(
                transform_targets(targets_test), stopwords)
    else:
        tweet_tokens_test = phrasemodel[tokenise_tweets(
            tweets_test, stopwords)]
        if not shortenTargets:
            target_tokens_test = phrasemodel[tokenise_tweets(
                targets_test, stopwords)]
        else:
            target_tokens_test = phrasemodel[tokenise_tweets(
                transform_targets(targets_test), stopwords)]

    transformed_tweets_test = [
        transform_tweet(w2vmodel, senttoks) for senttoks in tweet_tokens_test
    ]
    transformed_targets_test = [
        transform_tweet(w2vmodel, senttoks) for senttoks in target_tokens_test
    ]
    transformed_labels_test = transform_labels(labels_test)

    targetInTweet = {}
    if postprocess:
        # flatten the list of id arrays into one flat list so it can be zipped with the test tweets
        ids_test_list = [
            item for sublist in [l.tolist() for l in ids_test]
            for item in sublist
        ]
        id_tweet_dict = dict(zip(ids_test_list, tweets_test))
        targetInTweet = istargetInTweet(id_tweet_dict, targets_test)

    predictions_all, predictions_detailed_all, ids_all = test_trainer(
        testSetting,
        w2vmodel,
        transformed_tweets,
        transformed_targets,
        transformed_labels,
        ids,
        transformed_tweets_test,
        transformed_targets_test,
        transformed_labels_test,
        ids_test,
        hidden_size,
        max_epochs,
        tanhOrSoftmax,
        dropout,
        modeltype,
        targetInTweet,
        testid,
        acc_thresh=acc_thresh,
        pretrain=pretrain)

    writer.printPredsToFileByID(testdata, outfile, ids_all, predictions_all)
    writer.eval(testdata, outfile, evalscript="eval.pl")
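# A minimal invocation sketch (argument values are illustrative, loosely mirroring the settings
# encoded in the result file name in Example no. 2; they are not taken from the original file):
# readInputAndEval("true", "out_test_bicond.txt", hidden_size=60, max_epochs=5,
#                  tanhOrSoftmax="tanh", dropout="true", stopwords="most",
#                  modeltype="bicond", word2vecmodel="small", acc_thresh=0.98)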
Example no. 6
        word="#donaldtrump",
        top=10):
    model = word2vec.Word2Vec.load(modelname)
    print("Find ", top, " terms most similar to ", word, "...")
    for res in model.most_similar(word, topn=top):
        print(res)
    print("Finding terms containing ", word, "...")
    for v in model.vocab:
        if word in v:
            print(v)


if __name__ == '__main__':
    # ten dummy ["unk"] sentences, presumably so the "unk" token passes word2vec's min-count threshold and gets a vector
    unk_tokens = [["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"],
                  ["unk"], ["unk"], ["unk"], ["unk"]]
    tweets, targets, labels, ids = reader.readTweetsOfficial(
        "../data/semeval2016-task6-train+dev.txt")
    tweet_tokens = tokenise_tweets(tweets, stopwords="most")
    tweets_trump, targets_trump, labels_trump, ids_trump = reader.readTweetsOfficial(
        "../data/downloaded_Donald_Trump.txt", "utf-8", 1)
    tweet_tokens_trump = tokenise_tweets(tweets_trump, stopwords="most")

    tweets_unlabelled = reader.readTweets(
        "../data/additionalTweetsStanceDetection.json")
    tweet_tokens_unlabelled = tokenise_tweets(tweets_unlabelled,
                                              stopwords="most")

    trainWord2VecModel(
        unk_tokens + tweet_tokens + tweet_tokens_trump +
        tweet_tokens_unlabelled,
        "../out/skip_nostop_single_100features_5minwords_5context_big")