def prepData(filepath, stopfilter, multiword):
    print("Preparing data...")
    ret = []  # list of lists
    print("Reading data...")
    # this reads file in JSON format
    #tweets = readTweets(jsonfilepath)
    # this reads SemEval format tweets
    tweets, _, _, _ = readTweetsOfficial(filepath)
    #tweets = "\n".join(tweets)
    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)
    if multiword:
        return learnMultiword(ret)
    else:
        return ret
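
# Usage sketch (hypothetical; the path matches the SemEval files used elsewhere
# in this repo). Tokenise the training tweets, drop stopwords, skip the
# multiword-phrase step:
#
# tokenised_tweets = prepData("../data/semeval2016-task6-train+dev.txt",
#                             stopfilter=True, multiword=False)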
        else:
            outfOut.write(line)
        cntr += 1
    outfIn.close()
    outfOut.close()


if __name__ == '__main__':
    testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    devdata = "../data/semEval2016-task6-trialdata_new.txt"
    traindata = "../data/semeval2016-task6-train+dev.txt"
    devbest = "../out/results_all-1e-3-false_conditional-reverse_w2vsmall_hidd60_droptrue_stop-most_pre_cont_accthresh0.98_2.txt"

    # read the gold dev data and the corresponding system predictions
    tweets_gold, targets_gold, labels_gold, ids_gold = reader.readTweetsOfficial(
        devdata, 'windows-1252', 2)
    tweets_res, targets_res, labels_res, ids_res = reader.readTweetsOfficial(
        devbest, 'windows-1252', 2)

    inlist = selectTrainData(tweets_gold, targets_gold)
    printInOutFiles(inlist, devbest, "out_dev_inTwe_cond.txt",
                    "out_dev_outTwe_cond.txt")
    printInOutFiles(inlist, devdata, "_gold_dev_inTwe.txt",
                    "_gold_dev_outTwe.txt")

    print("All")
    writer.eval(devdata, devbest)
    print("Inlist")
    writer.eval("_gold_dev_inTwe.txt", "out_dev_inTwe_cond.txt")
        v = np.zeros(dim)
        if dim == 3:
            # three-class setup, one-hot ordering: NONE, AGAINST, FAVOR
            if lab == 'NONE':
                ix = 0
            elif lab == 'AGAINST':
                ix = 1
            elif lab == 'FAVOR':
                ix = 2
        else:
            # two-class setup: AGAINST, FAVOR
            if lab == 'AGAINST':
                ix = 0
            elif lab == 'FAVOR':
                ix = 1
        v[ix] = 1
        labels_t.append(v)
    return labels_t


if __name__ == '__main__':
    tweets, targets, labels, ids = reader.readTweetsOfficial(
        "../data/semeval2016-task6-train+dev.txt")
    tweet_tokens = tokenise_tweets(tweets)
    target_tokens = tokenise_tweets(targets)
    # flatten tweets and targets for vocabulary construction
    count, dictionary, reverse_dictionary = build_dataset(
        [token for senttoks in tweet_tokens + target_tokens for token in senttoks])
    transformed_tweets = [transform_tweet_dict(dictionary, senttoks)
                          for senttoks in tweet_tokens]
    transformed_targets = [transform_tweet_dict(dictionary, senttoks)
                           for senttoks in target_tokens]
    transformed_labels = transform_labels(labels)
    print('Longest tweet', len(max(transformed_tweets, key=len)))
    print('Longest target', len(max(transformed_targets, key=len)))
    print('Most common words (+UNK)', count[:5])
    #print('Sample data', data[:10])
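
# For illustration, assuming the default dim == 3, the one-hot ordering above
# is [NONE, AGAINST, FAVOR], so e.g.:
#
# transform_labels(['FAVOR', 'AGAINST', 'NONE'])
# # -> [array([0., 0., 1.]), array([0., 1., 0.]), array([1., 0., 0.])]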
# "../data/downloaded_Donald_Trump.txt", "utf-8", 1) # tweet_tokens_trump = tokenise_tweets(tweets_trump, stopwords="most") # # tweets_Gampa, targets_Gampa, labels_Gampa, ids_Gampa = reader.readTweetsOfficial( # "/local/data/haoxu/Rudetect/Gabapentin_0628_0121/final/corpus.csv", "utf-8", 1) # tweets_tokens_Gampa = tokenise_tweets(tweets_Gampa, stopwords="most") # # tweets_unlabelled = reader.readTweets( # "../data/additionalTweetsStanceDetection.json") # tweet_tokens_unlabelled = tokenise_tweets( # tweets_unlabelled, stopwords="most") # # trainWord2VecModel(unk_tokens + tweet_tokens + tweet_tokens_trump + tweet_tokens_unlabelled + # tweets_tokens_Gampa, "../out/skip_nostop_single_100features_5minwords_5context_big") modelname = "/local/data/haoxu/Rudetect_additional/stance-conditional/out/skip_nostop_single_100features_5minwords_5context_big" folderPath = "/local/data/haoxu/Rudetect" dirnames = [ d for d in os.listdir(folderPath) if os.path.isdir(folderPath + '/' + d) ] final_tweets_tokens = [] for dirname in dirnames: print(dirname) p = os.path.join(folderPath, dirname, 'final', 'corpus.csv') tweets, target, labels, ids = reader.readTweetsOfficial(p, "utf-8") tweets_tokens = tokenise_tweets(tweets, stopwords="most") final_tweets_tokens += tweets_tokens trainWord2VecModel(final_tweets_tokens, modelname) # applyWord2VecMostSimilar( # "../out/skip_nostop_single_100features_5minwords_5context_big")
def readInputAndEval(testSetting, outfile, hidden_size, max_epochs, tanhOrSoftmax,
                     dropout, stopwords="most", testid="test1", modeltype="bicond",
                     word2vecmodel="small", postprocess=True, shortenTargets=False,
                     useAutoTrump=False, useClinton=True, acc_thresh=1.0,
                     pretrain="pre_cont", usePhrases=False):
    """
    Read input files, call the trainer for training the model, evaluate with the official script

    :param outfile: name for output file
    :param stopwords: how to filter stopwords, see preprocess.filterStopwords()
    :param postprocess: force against/favor for tweets which contain the target
    :param shortenTargets: shorten the target text, see preprocess.transform_targets()
    :param useAutoTrump: use automatically annotated Trump tweets; experimental, not helping at the moment
    :param useClinton: add the Hillary Clinton dev data to the training data
    :param testSetting: evaluate on Trump
    """
    if word2vecmodel == "small":
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_single_100features_5minwords_5context")
    else:
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_single_100features_5minwords_5context_big")

    if usePhrases:
        phrasemodel = Phrases.load("../out/phrase_all.model")
        w2vmodel = word2vec.Word2Vec.load(
            "../out/skip_nostop_multi_100features_5minwords_5context")

    enc = "windows-1252"  # default encoding of the SemEval task files
    if testSetting == "true":
        trainingdata = "../data/semeval2016-task6-train+dev.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    elif testSetting == "weaklySup":
        trainingdata = "../data/trump_autolabelled.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
        enc = "utf-8"
    else:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"
        testdata = "../data/semEval2016-task6-trialdata_new.txt"
    if useClinton == False:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"

    tweets, targets, labels, ids = reader.readTweetsOfficial(trainingdata, encoding=enc)

    # this is for using automatically labelled Donald Trump data in addition to task data
    if useAutoTrump:
        tweets_devaut, targets_devaut, labels_devaut, ids_devaut = reader.readTweetsOfficial(
            "../data/trump_autolabelled.txt", encoding='utf-8')
        # shift the ids so they do not clash with the task data
        ids_new = [i + 10000 for i in ids_devaut]
        tweets = tweets + tweets_devaut
        targets = targets + targets_devaut
        labels = labels + labels_devaut
        ids = ids + ids_new

    if usePhrases == False:
        tweet_tokens = tokenise_tweets(tweets, stopwords)
        if shortenTargets == False:
            target_tokens = tokenise_tweets(targets, stopwords)
        else:
            target_tokens = tokenise_tweets(transform_targets(targets), stopwords)
    else:
        tweet_tokens = phrasemodel[tokenise_tweets(tweets, stopwords)]
        if shortenTargets == False:
            target_tokens = phrasemodel[tokenise_tweets(targets, stopwords)]
        else:
            target_tokens = phrasemodel[tokenise_tweets(
                transform_targets(targets), stopwords)]

    transformed_tweets = [
        transform_tweet(w2vmodel, senttoks) for senttoks in tweet_tokens
    ]
    transformed_targets = [
        transform_tweet(w2vmodel, senttoks) for senttoks in target_tokens
    ]
    transformed_labels = transform_labels(labels)

    tweets_test, targets_test, labels_test, ids_test = reader.readTweetsOfficial(testdata)

    if usePhrases == False:
        tweet_tokens_test = tokenise_tweets(tweets_test, stopwords)
        if shortenTargets == False:
            target_tokens_test = tokenise_tweets(targets_test, stopwords)
        else:
            target_tokens_test = tokenise_tweets(
                transform_targets(targets_test), stopwords)
    else:
        tweet_tokens_test = phrasemodel[tokenise_tweets(tweets_test, stopwords)]
        if shortenTargets == False:
            target_tokens_test = phrasemodel[tokenise_tweets(targets_test, stopwords)]
        else:
            target_tokens_test = phrasemodel[tokenise_tweets(
                transform_targets(targets_test), stopwords)]

    transformed_tweets_test = [
        transform_tweet(w2vmodel, senttoks) for senttoks in tweet_tokens_test
    ]
    transformed_targets_test = [
        transform_tweet(w2vmodel, senttoks) for senttoks in target_tokens_test
    ]
    transformed_labels_test = transform_labels(labels_test)

    targetInTweet = {}
    if postprocess:
        ids_test_list = [
            item for sublist in [l.tolist() for l in ids_test] for item in sublist
        ]
        id_tweet_dict = dict(zip(ids_test_list, tweets_test))
        targetInTweet = istargetInTweet(id_tweet_dict, targets_test)

    predictions_all, predictions_detailed_all, ids_all = test_trainer(
        testSetting, w2vmodel, transformed_tweets, transformed_targets,
        transformed_labels, ids, transformed_tweets_test,
        transformed_targets_test, transformed_labels_test, ids_test,
        hidden_size, max_epochs, tanhOrSoftmax, dropout, modeltype,
        targetInTweet, testid, acc_thresh=acc_thresh, pretrain=pretrain)

    writer.printPredsToFileByID(testdata, outfile, ids_all, predictions_all)
    writer.eval(testdata, outfile, evalscript="eval.pl")
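
# A hypothetical invocation on the dev setup (hyperparameter values are
# illustrative only; they mirror the naming conventions of the dev-best
# results file elsewhere in this repo, not reported settings):
#
# readInputAndEval("false", "../out/results_dev_bicond.txt", hidden_size=60,
#                  max_epochs=20, tanhOrSoftmax="tanh", dropout="true",
#                  stopwords="most", modeltype="bicond", word2vecmodel="small",
#                  acc_thresh=0.98, pretrain="pre_cont")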
word="#donaldtrump", top=10): model = word2vec.Word2Vec.load(modelname) print("Find ", top, " terms most similar to ", word, "...") for res in model.most_similar(word, topn=top): print(res) print("Finding terms containing ", word, "...") for v in model.vocab: if word in v: print(v) if __name__ == '__main__': unk_tokens = [["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"], ["unk"]] tweets, targets, labels, ids = reader.readTweetsOfficial( "../data/semeval2016-task6-train+dev.txt") tweet_tokens = tokenise_tweets(tweets, stopwords="most") tweets_trump, targets_trump, labels_trump, ids_trump = reader.readTweetsOfficial( "../data/downloaded_Donald_Trump.txt", "utf-8", 1) tweet_tokens_trump = tokenise_tweets(tweets_trump, stopwords="most") tweets_unlabelled = reader.readTweets( "../data/additionalTweetsStanceDetection.json") tweet_tokens_unlabelled = tokenise_tweets(tweets_unlabelled, stopwords="most") trainWord2VecModel( unk_tokens + tweet_tokens + tweet_tokens_trump + tweet_tokens_unlabelled, "../out/skip_nostop_single_100features_5minwords_5context_big")