def __init__(self):
    self.wnl = WordNetLemmatizer()
    self.lancaster_stemmer = LancasterStemmer()
    self.porter_stemmer = PorterStemmer()
    self.snowball_stemmer = SnowballStemmer('english')
    wn.ensure_loaded()
def main():
    global bow_corpus
    global word_to_idx
    global users
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    users = get_users()

    results = []
    N = 0
    # for chik, svdk in exp:
    #     r = []
    # for N in range(15):
    results.append(truth_prediction_for_users(users, idx_to_word, 10000, 20, N))
    print(np.average(np.asarray(results), axis=1))
def __init__(self):
    self.wnl = WordNetLemmatizer()
    wn.ensure_loaded()
    self.contractions = {
        "isn't": ['is', 'not'],
        "aren't": ['are', 'not'],
        "wasn't": ['was', 'not'],
        "weren't": ['were', 'not'],
        "don't": ['do', 'not'],
        "doesn't": ['does', 'not'],
        "didn't": ['did', 'not'],
        "can't": ['cannot'],
        "we're": ['we', 'are'],
        "i'm": ['I', 'am'],
        "it's": ['it', 'is'],
        "haven't": ['have', 'not'],
        "hasn't": ['has', 'not'],
        "hadn't": ['had', 'not'],
        "couldn't": ['could', 'not'],
        "mightn't": ['might', 'not'],
        "mustn't": ['must', 'not'],
        "shan't": ['shall', 'not'],
        "mayn't": ['may', 'not'],
        "shouldn't": ['should', 'not'],
        "won't": ['will', 'not'],
        "wouldn't": ['would', 'not'],
        "daren't": ['dare', 'not'],
        "needn't": ['need', 'not'],
        "usedn't": ['use', 'not'],
        "let's": ['let', 'us'],
        "you've": ['you', 'have'],
        "i've": ['I', 'have'],
    }
def __init__(self):
    # Ensure that the WordNet corpus is loaded, so we can support multithreading
    wn.ensure_loaded()
    self.lemmatizer = wn_stem.WordNetLemmatizer()
    self.lemmas_dict = {}
    self.synsets_dict = {}
    self.similarity_dict = {}
def init():
    nonlocal cached_stopwords
    try:
        wordnet.ensure_loaded()
        cached_stopwords = set(stopwords.words("english"))
    except LookupError:
        # Download the missing corpora, then retry loading them
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("wordnet")
        wordnet.ensure_loaded()
        cached_stopwords = set(stopwords.words("english"))
def build_set(self): wn.ensure_loaded() # `LazyCorpusLoader` conversion into `WordNetCorpusReader` starts print ("WordNet loaded") swn.ensure_loaded() # `LazyCorpusLoader` conversion into `SentiWordNetCorpusReader` starts print ("SentiWordNet loaded") self.tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False) print ("Tweet tokenizer loaded") self.it_tokenizer = MosesTokenizer(lang='it') print ("Moses tokenizer loaded") self.it_tagger = treetaggerwrapper.TreeTagger(TAGLANG="it", TAGDIR=flags.tagger_path) # self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en", TAGDIR=flags.tagger_path) print ("Tagger loaded") self.stop_words = set(stopwords.words('italian')) print ("Stopwords loaded") self.lexicon = lm.LexiconSent('it') print ("OpeNER lexicon loaded") self.emoji = self.get_emoji_sentiment_lexicon(flags.emoji_sentiment_lexicon) print ("Emoji sentiment lexicon loaded") self.translator = Translator() print ("Setting up support dictionaries") self.translated_lemma_tokens = self.load_obj(flags.translated_lemma_tokens) self.lexeme_sentiment_dict = self.load_obj(flags.lexeme_sentiment_dict) print ("Translator loaded") # Build test annotations print ("Building test annotations..") test_set = self.load_obj(flags.test_annotations) if not test_set: test_set = self.get_annotations(flags.test_set_path) self.save_obj(test_set, flags.test_annotations) print ("Test annotations built") # Build training annotations print ("Building training annotations..") training_set = self.load_obj(flags.training_annotations) if not training_set: training_set = self.get_annotations(flags.training_set_path) self.save_obj(training_set, flags.training_annotations) print ("Training annotations built") print ("Saving support dictionaries") self.save_obj(self.translated_lemma_tokens, flags.translated_lemma_tokens) self.save_obj(self.lexeme_sentiment_dict, flags.lexeme_sentiment_dict) # Build distributional docvec from training and test sets self.doc2vec = self.build_distributional_docvec([test_set, training_set]) print ("Doc2Vec built") self.add_context_to_annotations(test_set) print ("Distr. docvec added to test annotations") self.add_context_to_annotations(training_set) print ("Distr. docvec added to training annotations") self.free_ram() print ("Loading pre-trained model..") self.model = ft.load_model(flags.word2vec_path) print ("Pre-trained model loaded") self.add_wordvecs_to_annotations(test_set) print ("Wordvecs added to test annotations") self.add_wordvecs_to_annotations(training_set) print ("Wordvecs added to training annotations") # Save to npy self.free_ram() self.save_obj({"test_set":test_set, "training_set":training_set}, flags.preprocessed_dict)
def feature_pred(features, chik, ldak):
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())
        tr_hsh = transactions['fact'].values
        # if castillo: comment cond2 out
        cond = facts['hash'].isin(tr_hsh)
        # Parentheses are required here: `|` binds more tightly than `==`
        cond2 = (facts['true'] == 1) | (facts['true'] == 0)
        facts = facts[cond & cond2]
        facts = Parallel(n_jobs=num_jobs)(delayed(get_features)(
            fact,
            transactions[transactions['fact'] == fact['hash']],
            [u for u in users
             if int(u.user_id) in list(transactions[
                 transactions['fact'] == fact['hash']]['user_id'].values)])
            for idx, fact in facts.iterrows())
        facts = pd.DataFrame(facts)
        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    print(facts[list(features)].describe())
    X = facts[list(features)].values
    y = facts['y'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))

    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
    return acc_scores.mean()
def elaborateText(tweet_text):
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet_text)
    # Remove emoticons and mentions
    text = remove_emoji(text)
    text = re.sub("(@[A-Za-z0-9_]+)", "", text)
    text = text.lower()
    for punct_sign in string.punctuation:
        text = text.replace(punct_sign, " ")
    # Other punctuation marks not covered by string.punctuation
    # (which only contains !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
    text = text.replace("’", " ")
    text = text.replace("”", " ")
    text = text.replace("“", " ")
    text = text.replace("\n", " ")
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stopWords = stopwords.words('english')
    filteredTokens = [word for word in tokens if word not in stopWords]
    # Lemmatization
    wn.ensure_loaded()
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    tokensTagging = pos_tag(tokens)
    # POS tagging & lemmatization
    for word, tag in tokensTagging:
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    lemmatized_sentence_set = ' '.join(lemmatized_sentence)
    return lemmatized_sentence_set
def get_clean_content(file: str):
    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
    # Not the best fix, but it works: serialise the first (lazy) WordNet load.
    with nltk_load_lock:
        wordnet.ensure_loaded()

    meta, content = extract(file)
    if content is not None:
        lang = meta["language"]
        content = clean(content, lang)
    return meta, content
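# A lock around every call works, but the lazy load only has to succeed once.
# A minimal alternative sketch (assumptions: plain threading.Thread workers;
# `files` and `process_file` are illustrative names, not from the snippet above):
# load WordNet eagerly in the main thread before fanning out, so the
# LazyCorpusLoader -> WordNetCorpusReader swap can never race.
import threading

from nltk.corpus import wordnet


def preload_then_fan_out(files, process_file, n_workers=4):
    wordnet.ensure_loaded()  # eager load, done exactly once in the main thread

    def worker(chunk):
        for f in chunk:
            process_file(f)

    chunks = [files[i::n_workers] for i in range(n_workers)]
    threads = [threading.Thread(target=worker, args=(chunk,)) for chunk in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()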
def get(self, request, format=None): wordnet.ensure_loaded() seminars = self.request.query_params.get('seminars') guid = self.request.query_params.get('guid') user = get_user(guid) similarities = {} for seminar_id in seminars.split(','): try: # Load seminar and its keywords seminar = Seminar.objects.get(id=seminar_id) keywords = json.loads(seminar.keywords) # User has not set any interests or seminar has no keywords so set similarity to 0 if not user.interests or not keywords: similarities[seminar.id] = 0 continue # Get synsets of words for user interests and keywords interest_syns = set( synset for interest in user.interests for synset in wordnet.synsets(interest) ) keyword_syns = set( synset for keyword in keywords[0:3] for synset in wordnet.synsets(keyword['text']) ) # If no synsets of words, set similiarity to 0 if not interest_syns or not keyword_syns: similarities[seminar.id] = 0 continue # Calculate best similarity between the sets of words best = max( wordnet.wup_similarity(i, j) or 0 for i, j in product(interest_syns, keyword_syns) ) # Convert to percentage and round to 1 decimal place similarities[seminar.id] = round(best * 100, 1) except Seminar.DoesNotExist: similarities[seminar.id] = 0 continue return Response(similarities)
def preload(fill_cache: bool):
    """
    Pre-loads any data so the user experience is better, i.e. there is less delay during use.

    :param fill_cache: if true, will run all parsing tests to fill the cache for the
        semantic distance function.
    """
    # Preload the WordNet dictionary.
    print('Loading WordNet...')
    wn.ensure_loaded()

    if fill_cache:
        print('Filling Cache (Running Tests)...')
        loader = TestLoader()
        suite = loader.discover(start_dir='tests/parsing')
        TextTestRunner(verbosity=1).run(suite)
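# A plausible call site for the function above (assumption: `run_cli` is an
# illustrative placeholder for whatever starts the interactive part of the app):
if __name__ == '__main__':
    preload(fill_cache=True)  # pay the WordNet load (and cache fill) up front
    run_cli()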
def __init__(self):
    # news = pd.read_csv("classifier/training_dataset.csv", names=['id', 'text', 'category'])

    # Load the label encoder to decode category numbers
    # self.encoder = LabelEncoder()
    # self.encoder.fit_transform(news['category'])

    # Load the text classifier
    # self.text_clf = open("classifier/nb_classifier.pkl", "rb")
    # self.text_clf = pickle.load(self.text_clf)

    self.porter = PorterStemmer()

    # Prevents an odd NLTK error when used from multiple threads:
    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
    wn.ensure_loaded()
def main():
    wn.ensure_loaded()
    users = get_users()
    # users = [was_user_correct(user) for user in users]
    # print("Linguistic features..")
    # users = Parallel(n_jobs=num_jobs)(delayed(linguistic_f)(user) for user in users)
    # print("Calculating tweet sentiment for each user")
    users = Parallel(n_jobs=num_jobs)(
        delayed(feature_user_tweet_sentiment)(user) for user in users)
    print("Avg time to retweet")
    users = Parallel(n_jobs=num_jobs)(delayed(time_til_retweet)(user) for user in users)
    print([u.sent_tweets_avg for u in users[:10]])
    print([u.avg_time_to_retweet for u in users[:10]])
    [store_result(user) for user in users]
def main():
    global bow_corpus
    global word_to_idx, idx_to_word
    global bow_corpus_top_n
    global users
    wn.ensure_loaded()
    bow_corpus = get_corpus()
    users = get_users()
    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    # print("Corpus size: {}".format(len(bow_corpus_tmp)))
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    # for n in range(0, 10, 1):
    lstm_pred(-1)
def preprocess_text(docs):
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    wn.ensure_loaded()

    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:
        futures_tokenize = []
        for n in range(0, num_task):
            upper_bound = (n + 1) * len_slices
            if n == num_task - 1:
                upper_bound = (n + 1) * len_slices + remainder_slices
            print(n, upper_bound)
            futures_tokenize.append(executor.submit(
                preprocess_tokenize, docs[n * len_slices:upper_bound], stoplist))

        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()

    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))

    # Add bigrams to docs (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                texts[idx].append(token)
    print("Done bigrams")

    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)
    return texts, dictionary
def IndexWebSite(self, url, urlSet, depth, urlAmount, flag=False):
    print("indexing started")
    keywords = self.FindKeywords(url, 5)
    print(keywords)
    if len(keywords) == 0:
        return []

    threads = []
    self.resultArr = []
    wn.ensure_loaded()
    for i in range(len(urlSet)):
        t = Thread(target=self.IndexSiteWithThread,
                   args=(urlSet[i], depth, urlAmount, keywords))
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    self.bubbleSort(self.resultArr)
    return self.resultArr
def main():
    wn.ensure_loaded()
    # batch
    i = int(sys.argv[1])
    user_files = sorted(glob.glob(DIR + 'user_tweets/' + 'user_*.json'))
    r = 1960
    user_files = user_files[(i - 1) * r:min(r * i, len(user_files))]
    users = get_users(user_files)
    facts, transactions = get_data()
    users = Parallel(n_jobs=num_jobs)(
        delayed(was_user_correct)(user, facts, transactions) for user in users)
    # print("Linguistic features..")
    # users = Parallel(n_jobs=num_jobs)(delayed(linguistic_f)(user) for user in users)
    # print("Calculating tweet sentiment for each user")
    # users = Parallel(n_jobs=num_jobs)(delayed(feature_user_tweet_sentiment)(user) for user in users)
    # print("Avg time to retweet")
    # users = Parallel(n_jobs=num_jobs)(delayed(time_til_retweet)(user) for user in users)
    # print([u.sent_tweets_avg for u in users[:10]])
    # print([u.avg_time_to_retweet for u in users[:10]])
    [store_result(user) for user in users]
def main(): global bow_corpus global word_to_idx wn.ensure_loaded() if NEW_CORPUS: bow_corpus = build_bow_corpus(get_users()) save_corpus(bow_corpus) else: bow_corpus = get_corpus() bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2] word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)} idx_to_word = {idx: k for k, idx in word_to_idx.items()} users = get_users() facts = gt.get_fact_topics() transactions = gt.get_transactions() users_df = pd.DataFrame([vars(u) for u in users]) print(users_df.describe()) print(users_df[users_df['stance'] == 0].describe()) print(users_df[users_df['stance'] == 1].describe()) print(users_df[users_df['stance'] == 2].describe()) print(users_df[users_df['stance'] == 3].describe()) users_df['f_t'] = users_df['fact'].map( lambda x: facts[facts['hash'] == x]['true'].values[0]) c_true = users_df['f_t'] == '1' c_fal = users_df['f_t'] == '0' c_fal1 = users_df['f_t'] == 0 c_den = users_df['stance'] == 0 c_sup = users_df['stance'] == 1 print(users_df[c_true & c_sup].describe()) print(users_df[c_fal | c_fal1][c_den].describe()) print(users_df[c_fal | c_fal1][c_sup].describe()) print(users_df[c_true & c_den].describe()) print(users_df[users_df['was_correct'] == 1].describe()) print(users_df[users_df['was_correct'] == 0].describe()) print(len([t for u in users for t in u.tweets if u.tweets is not None])) corpus_analysis(bow_corpus, word_to_idx, idx_to_word) # temporal_analysis(get_users()) cluster_users_on_tweets(users, word_to_idx, idx_to_word)
def main():
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()
    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())
    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
def __init__(self):
    print('loading the wordnet corpus...')
    wordnet.ensure_loaded()
    print('loading done')

    self.nlp = spacy.load('en')
    self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')

    f = open('sorted_first_names.txt', 'r')
    lines = f.readlines()
    self.first_name_array = []
    for line in lines:
        line = line.rstrip()
        self.first_name_array.append(line)

    f = open('sorted_last_names.txt', 'r')
    lines = f.readlines()
    self.last_name_array = []
    for line in lines:
        line = line.rstrip()
        self.last_name_array.append(line)

    f = open('bad_words.txt', 'r')
    lines = f.readlines()
    self.profane_words_array = []
    for line in lines:
        line = line.rstrip()
        self.profane_words_array.append(line)
def search(self, queryString, search_length=10, return_rank_list=False):
    wn.ensure_loaded()
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    query = word_tokenize(queryString)
    query = [w.lower() for w in query if (w.isalpha() and w not in stop_words)]
    query = [wordnet_lemmatizer.lemmatize(w) for w in query]
    query = [porter_stemmer.stem(w) for w in query]

    self.processQuery(self.vocab, query)

    # Get the page ranking for the above query
    obj = CosineScore(self.queryVector, self.tfidfMatrix)
    rankList = obj.getPages(search_length)
    if return_rank_list:
        return rankList

    # Get the title and URL for each ranked document
    finalList = []
    for docIndex in rankList:
        finalList.append((self.titleList[docIndex], self.urlList[docIndex]))
    return finalList
def process(file_input_name, file_output_name):
    try:
        wordnet.ensure_loaded()
        data = pd.read_csv(file_input_name, encoding='ISO-8859-1')
        input_text = data['text']

        threads = []
        result = []
        for i in range(10):
            t = lemmatizeThread(thread_name='thread' + str(i), the_queue=workQueue)
            t.start()
            threads.append(t)

        threadLock.acquire()
        for i in range(len(input_text) - 1):
            workQueue.put(str(i) + '-' + input_text[i])
        threadLock.release()

        while not workQueue.empty():
            pass

        global exitFlag
        exitFlag = 1

        for t in threads:
            t.join()
            result.append(t.data)

        with open(file_output_name, 'w') as f:
            while not resultQueue.empty():
                result = resultQueue.get()
                f.write(result + '\n')
    except Exception as e:
        print(e)
from nltk.corpus import wordnet as wn

# right = wn.synset('right_whale.n.01')
wn.ensure_loaded()
help(wn)
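# Why the eager load matters is easy to see interactively; a small sketch using
# only the stdlib `timeit` (timings are machine-dependent, so none are quoted):
import timeit
from nltk.corpus import wordnet as wn

# The first attribute access pays the whole corpus-load cost...
print("first lookup :", timeit.timeit(lambda: wn.synsets('dog'), number=1))
# ...later calls are cheap, which is why the snippets here call
# wn.ensure_loaded() once, up front, before threads or request handlers run.
print("second lookup:", timeit.timeit(lambda: wn.synsets('dog'), number=1))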
def main(): global bow_corpus global word_to_idx, idx_to_word, fact_to_words global bow_corpus_top_n wn.ensure_loaded() print('Grabbing Data') bow_corpus = gt.get_corpus() facts = gt.get_fact_topics() facts = facts[facts['true'] != 'unknown'] bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2] word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)} idx_to_word = {idx: k for k, idx in word_to_idx.items()} fact_to_words = { r['hash']: [w for w in r['fact_terms']] for index, r in facts[['hash', 'fact_terms']].iterrows() } if NEW_MODEL: users = gt.get_users() # Prepping lstm model top_words = 50000 X, y, user_order = lstm_cred.get_prebuilt_data() X, y, user_order = lstm_cred.balance_classes(X, y, user_order) #X_train, X_test, y_train, y_test = train_test_split_every_user(X, y, user_order) #X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users) #X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words( X_train, y_train, X_test, y_test, idx_to_word, top_words) max_tweet_length = 12 X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length) X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length) # Training lstm model embedding_vecor_length = 32 model = Sequential() model.add( Embedding(top_words, embedding_vecor_length, input_length=max_tweet_length)) model.add(Dropout(0.2)) model.add(LSTM(100)) model.add(Dropout(0.2)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(model.summary()) model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64) model.save('model_data/cred_model.h5') scores = model.evaluate(X_test, y_test, verbose=0) print("Accuracy: %.2f%%" % (scores[1] * 100)) if NEW_REL_TWEETS: print('Building new relevant tweets') users = Parallel(n_jobs=num_jobs)( delayed(get_relevant_tweets)(user) for user in users) #users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users) user_to_rel_tweet = { user.user_id: user.features['relevant_tweets'] for user in users if 'relevant_tweets' in user.features } with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile: pickle.dump(user_to_rel_tweet, tmpfile) else: with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile: user_to_rel_tweet = pickle.load(tmpfile) for user in users: if 'relevant_tweets' in user.features: user.features['relevant_tweets'] = user_to_rel_tweet[ user.user_id] # Build credibility scores for all users on their topic print('Computing credibility') users = [prebuild_cred(model, u) for u in users] users_df = pd.DataFrame([vars(u) for u in users]) [store_result(u) for u in users] with open('model_data/cred_pred_data', 'wb') as tmpfile: pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile) else: print('Loading users & model') with open('model_data/cred_pred_data', 'rb') as tmpfile: construct = pickle.load(tmpfile) users_df = construct['users'] word_to_idx = construct['map'] print('Making cred*sent predictions') X = [] y = [] for idx, hsh in enumerate(facts['hash'].values): this_users = users_df[users_df['fact'] == hsh] this_x = cred_stance_prediction(this_users) this_y = facts['true'].iloc[idx] X.append((np.average(this_x), np.std(this_x))) y.append(int(this_y)) X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.20) std_clf = make_pipeline(StandardScaler(), LinearSVC) std_clf.fit(X_train, y_train) pred = std_clf.predict(X_test) score = metrics.accuracy_score(y_test, pred) precision, recall, fscore, sup = metrics.precision_recall_fscore_support( y_test, pred, average='macro') print( "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" % (score, precision, recall, fscore)) acc_scores = cross_val_score(std_clf, X, y, cv=3) pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3) re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3) f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3) print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" % (acc_scores.mean(), acc_scores.std() * 2)) print("\t Cross validated Precision: %0.3f (+/- %0.3f)" % (pr_scores.mean(), pr_scores.std() * 2)) print("\t Cross validated Recall: %0.3f (+/- %0.3f)" % (re_scores.mean(), re_scores.std() * 2)) print("\t Cross validated F1: %0.3f (+/- %0.3f)" % (f1_scores.mean(), f1_scores.std() * 2)) print('Making cred*stance predictions') X = [] y = [] all_evidence = [] for idx, hsh in enumerate(facts['hash'].values): this_users = users_df[users_df['fact'] == hsh] this_x, evidence = only_cred_support_deny_pred(this_users) this_y = facts['true'].iloc[idx] evidence = sorted(evidence, reverse=True, key=lambda x: x[0]) # print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1]) # print(evidence if len(evidence) <3 else evidence[:3]) X.append((np.average(this_x), np.std(this_x))) y.append(int(this_y)) print(X[:20]) print(y[:20]) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) std_clf = make_pipeline(StandardScaler(), LinearSVC()) std_clf.fit(X_train, y_train) pred = std_clf.predict(X_test) score = metrics.accuracy_score(y_test, pred) precision, recall, fscore, sup = metrics.precision_recall_fscore_support( y_test, pred, average='macro') print( "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" % (score, precision, recall, fscore)) acc_scores = cross_val_score(std_clf, X, y, cv=3) pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3) re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3) f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3) print(acc_scores) print(pr_scores) print(re_scores) print(f1_scores) print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" % (acc_scores.mean(), acc_scores.std() * 2)) print("\t Cross validated Precision: %0.3f (+/- %0.3f)" % (pr_scores.mean(), pr_scores.std() * 2)) print("\t Cross validated Recall: %0.3f (+/- %0.3f)" % (re_scores.mean(), re_scores.std() * 2)) print("\t Cross validated F1: %0.3f (+/- %0.3f)" % (f1_scores.mean(), f1_scores.std() * 2))
def text_processing(ques1, ques2): # from nltk.corpus.reader.wordnet import WordNetError from nltk.stem.porter import PorterStemmer from nltk.stem import WordNetLemmatizer from nltk.tokenize import word_tokenize from nltk.corpus import wordnet as wn from nltk.corpus import stopwords import num2words as nw import string """Function to remove punctions in the strings""" r_p1 = list( map( lambda ques: ''.join( [word for word in ques1 if word not in string.punctuation]), [ques1])) r_p2 = list( map( lambda ques: ''.join( [word for word in ques2 if word not in string.punctuation]), [ques2])) """Function to create word token from the document""" w_t1 = list( map( lambda r_p: ' '.join([ nw.num2words(word) if word.isdigit() else word for word in word_tokenize(r_p[0].replace("°", "").replace( "²", "")) ]), [r_p1])) w_t2 = list( map( lambda r_p: ' '.join([ nw.num2words(word) if word.isdigit() else word for word in word_tokenize(r_p[0].replace("°", "").replace( "²", "")) ]), [r_p2])) l_w_t1 = len(word_tokenize(r_p1[0])) l_w_t2 = len(word_tokenize(r_p1[0])) """Function to remove stop words from the document""" wn.ensure_loaded() words = stopwords.words('english') r_s_w1 = list( map( lambda w_t: " ".join( [word for word in w_t[0].split() if word not in words]), [w_t1])) r_s_w2 = list( map( lambda w_t: " ".join( [word for word in w_t[0].split() if word not in words]), [w_t2])) l_r_s_w1 = len(word_tokenize(r_s_w1[0])) l_r_s_w2 = len(word_tokenize(r_s_w2[0])) """Function to stem tokens of string""" stemmer = PorterStemmer() stems1 = list( map( lambda r_s_w: " ".join( [stemmer.stem(word) for word in r_s_w[0].split(" ")]), [r_s_w1])) stems2 = list( map( lambda r_s_w: " ".join( [stemmer.stem(word) for word in r_s_w[0].split(" ")]), [r_s_w2])) """Function to lemmatize tokens of string""" lemmatizer = WordNetLemmatizer() lamit1 = list( map( lambda stems: " ".join( [lemmatizer.lemmatize(word) for word in stems[0].split()]), [stems1])) lamit2 = list( map( lambda stems: " ".join( [lemmatizer.lemmatize(word) for word in stems[0].split()]), [stems2])) # print([lamit1[0], lamit2[0]], [int(l_w_t1 / l_w_t2), int(l_r_s_w1 / l_r_s_w2)]) return [lamit1[0], lamit2[0]], [int(l_w_t1 / l_w_t2), int(l_r_s_w1 / l_r_s_w2)]
def get_queryset(self):
    wordnet.ensure_loaded()
    # Wordnet incorporates a lazy corpus model, which starts loading
    # only on first call to itself, which can cause some issues with
    # multithreading
    return
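# The comment above describes a per-request workaround; in a Django project the
# load can instead happen once at start-up. A sketch, assuming a standard app
# layout (the `search` app name and SearchConfig class are illustrative;
# AppConfig.ready() is the stock Django hook):
from django.apps import AppConfig


class SearchConfig(AppConfig):
    name = 'search'

    def ready(self):
        # Force the lazy WordNet load once, before request threads exist.
        from nltk.corpus import wordnet
        wordnet.ensure_loaded()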
outputFile = open("result_trigram.txt", "w+")
corpusLines = inputFile.readlines()
totalLines = len(corpusLines)

print(" Total Process : ", totalProcess)
if totalLines < totalProcess:
    print(" Total needed process : ", totalLines)
    totalProcess = totalLines

allocationForProcess = int(totalLines / totalProcess)
lastAllocation = totalLines - allocationForProcess * totalProcess
print(" Total Lines : ", totalLines)
print(" Allocation for a Process : ", allocationForProcess)
print(" Last Allocation for a Process : ", lastAllocation)

wordnet.ensure_loaded()  # first access to wn transforms it

if allocationForProcess > 0:
    # linePrinter.initCompletedstate(totalProcess+1)
    processes = [
        Thread(target=readByLines,
               args=(processId, corpusLines,
                     processId * allocationForProcess,
                     (processId + 1) * allocationForProcess,
                     processId * allocationForProcess,
                     True, allocationForProcess))
        for processId in range(totalProcess)
    ]
    # Run processes
    for p in processes:
        p.start()
    inputFile = open(basePath + inputFileName, "r")
    corpusLines = inputFile.readlines()
    totalLines = len(corpusLines)
    print(" Started File - ", inputFileName, " | Total Lines : ", totalLines)

    if not os.path.exists(outputPath + inputFileName):
        os.makedirs(outputPath + inputFileName)

    readByLines(corpusLines, totalLines, inputFileName)
    inputFile.close()
    return inputFileName


startTime = datetime.now()
print("Process started : ", startTime)
print("Total Files : ", TOTAL_FILES_FOR_READ)

FileNames = []
for i in range(0, TOTAL_FILES_FOR_READ):
    listToken = []
    if i < 9:
        FileNames.append("news.en-0000" + str(i + 1) + "-of-00100")
    else:
        FileNames.append("news.en-000" + str(i + 1) + "-of-00100")

wordnet.ensure_loaded()

with concurrent.futures.ProcessPoolExecutor() as executor:
    for lemma, result in zip(FileNames, executor.map(doWorker, FileNames)):
        print(f"Finished for input file {lemma} was saved inside the {result}")

endTime = datetime.now()
print("\n\n Process Stopped : ", endTime)
print("\n\n Duration : ", endTime - startTime)
import nltk
from nltk.corpus import wordnet

__all__ = (
    'get_related',
    'fix_determiners'
)

# NLTK seems to check for updates and tries to unzip the corpus even when it’s
# already installed, which slows down the import, so only invoke nltk.download()
# if wordnet isn’t already available.
try:
    wordnet.ensure_loaded()
except LookupError:
    try:
        nltk.download('wordnet')
    except OSError as exc:
        raise ImportError("Could not download WordNet:", exc)

_VOWELS = "aeiou"


def get_related(query, pos='n'):
    """If query is for a noun, return a random hyponym for query.

    If query is for an adjective, return a related adjective.

    Args:
def main(): global bow_corpus global word_to_idx, idx_to_word, fact_to_words global bow_corpus_top_n wn.ensure_loaded() print('Grabbing Data') bow_corpus = gt.get_corpus() facts = gt.get_fact_topics() facts = facts[facts['true'] != 'unknown'] bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2] word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)} idx_to_word = {idx: k for k, idx in word_to_idx.items()} fact_to_words = { r['hash']: [w for w in r['fact_terms']] for index, r in facts[['hash', 'fact_terms']].iterrows() } # Credibility data print('Loading users & model') with open('model_data/cred_pred_data', 'rb') as tmpfile: construct = pickle.load(tmpfile) users_df = construct['users'] word_to_idx = construct['map'] # Feature data with open('model_data/feature_data', 'rb') as tmpfile: fact_features = pickle.load(tmpfile) features = [ 'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size', 'avg_len', 'avg_special_symbol', 'avg_time_retweet', 'avg_count_distinct_words', 'avg_sent_pos', 'cred_pred', 'cred_pred_std' ] print('Making cred*stance +best features predictions') facts['cred_pred'] = facts['hash'].map( lambda x: only_cred_support_deny_pred(users_df[users_df['fact'] == x])) facts['cred_pred_std'] = facts['cred_pred'].map(lambda x: np.std(x)) facts['cred_pred'] = facts['cred_pred'].map(lambda x: x[-1]) facts = facts.set_index('hash').join(fact_features.set_index('hash'), rsuffix='_other') X = facts[features].values y = facts['y'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1)) std_clf.fit(X_train, y_train) pred = std_clf.predict(X_test) score = metrics.accuracy_score(y_test, pred) precision, recall, fscore, sup = metrics.precision_recall_fscore_support( y_test, pred, average='macro') print( "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" % (score, precision, recall, fscore)) acc_scores = cross_val_score(std_clf, X, y, cv=3) pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3) re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3) f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3) print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" % (acc_scores.mean(), acc_scores.std() * 2)) print("\t Cross validated Precision: %0.3f (+/- %0.3f)" % (pr_scores.mean(), pr_scores.std() * 2)) print("\t Cross validated Recall: %0.3f (+/- %0.3f)" % (re_scores.mean(), re_scores.std() * 2)) print("\t Cross validated F1: %0.3f (+/- %0.3f)" % (f1_scores.mean(), f1_scores.std() * 2))
def run(self): out_dir = flags.cgmh_output_dir fout = open(os.path.join(out_dir, "log" + str(self.__idx) + ".log"), "w") res_path = os.path.join(out_dir, "res" + str(self.__idx) + ".res") bb_atk_data = self.__bb_atk_data bb_atk_data_size = len(self.__bb_atk_data['raw']) bb_word2idx = self.__bb_word2idx bb_idx2word = self.__bb_idx2word vocab = self.__vocab bb = self.__bb m = self.__model bb_max_seqlen = self.__bb_max_seqlen sess = self.__sess negations = self.__negs op_prob = [ flags.swp_prob, flags.ins_prob, flags.del_prob, flags.pass_prob ] op_prob = op_prob / numpy.sum(op_prob) n_sample = flags.sample_max_n n_candidate = flags.n_candidate just_acc_rate = flags.just_acc_rate swp_lm_threshold = flags.lm_swp_threshold ins_lm_threshold = flags.lm_ins_threshold del_lm_threshold = flags.lm_del_threshold swp_prob_threshold = flags.swp_threshold ins_prob_threshold = flags.ins_threshold del_prob_threshold = flags.del_threshold swn_obj_threshold = flags.senti_obj_threshold swn_pos_threshold = flags.senti_pos_threshold seq_min_len = flags.seq_min_len mode = flags.index_mode res_log = [] sents = [] idx = 0 op = 3 total_time = 0 n_succ = 0 lemmatzr = WordNetLemmatizer() for i in range(bb_atk_data_size): start_time = time.time() print("===== DATA %d/%d =====" % (i + 1, bb_atk_data_size), file=fout, flush=True) print("DATA %d/%d, id=%d" % (i + 1, bb_atk_data_size, self.__idx)) flush() res_log.append([]) raw = copy.deepcopy(bb_atk_data["raw"][i]) raw = nltk.word_tokenize(raw.lower()) l = len(raw) + 1 if (l > flags.seq_max_len): l = flags.seq_max_len seq = [vocab.get_init_idx()] for ii in range(1, l): seq.append(vocab.get_vocab_idx(raw[ii - 1])) while len(seq) < flags.seq_max_len: seq.append(vocab.get_pad_idx()) mask = [True] for ii in range(1, l): mask.append(False) bb_y = bb_atk_data["y"][i] bb_l = len(raw) if (bb_l > bb_max_seqlen): bb_l = bb_max_seqlen bb_seq = [] for ii in range(bb_l): if raw[ii] in bb_word2idx.keys(): bb_seq.append(bb_word2idx[raw[ii]]) else: bb_seq.append(bb_word2idx["<unk>"]) while len(bb_seq) < bb_max_seqlen: bb_seq.append(bb_word2idx["<pad>"]) sents.append([]) sample_cnt = 0 sample_all = 0 idx = 0 sents[-1].append(copy.deepcopy(raw)) print("%d/%d\tOriginal\tFAIL with %.5f" % (i + 1, bb_atk_data_size, 1 - bb_atk_data["prob"][i]), end="\n\t", file=fout, flush=True) for ii in range(len(raw)): print(raw[ii], end=" ", file=fout, flush=True) if bb_y == 1: print("\t<POS>", file=fout, flush=True) else: print("\t<NEG>", file=fout, flush=True) while sample_all < n_sample: try: wn.ensure_loaded() sample_all += 1 op = random_pick_idx_with_unnormalized_prob(op_prob) succ = False if op == 3: tmp_prob = sess.run(bb.prob, feed_dict={ bb.X: [bb_seq], bb.L: [bb_l] })[0][1 - bb_y] if tmp_prob >= 0.5: res_log[i].append((sample_all, 1)) print( "%d/%d\t%d acc / %d all\tPASS\t SUCC with %.5f" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all + 1, tmp_prob), file=fout, flush=True) succ = True else: res_log[i].append((sample_all, 0)) print( "%d/%d\t%d acc / %d all\tPASS\t FAIL with %.5f" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all + 1, tmp_prob), file=fout, flush=True) sample_cnt += 1 sents[-1].append(copy.deepcopy(raw)) print("", end="\t", file=fout, flush=True) for ii in range(len(raw)): print(raw[ii], end=" ", file=fout, flush=True) if bb_y == 1: print("\t<POS>", file=fout, flush=True) else: print("\t<NEG>", file=fout, flush=True) if succ: print("\tSUCC!") flush() break continue if mode == "random": idx = random.randint(0, l - 1) elif mode == "traverse": idx = (idx + 1) % l elif mode 
== "grad": if op == 1: idx = random.randint(0, l - 1) else: grad_vecs = sess.run(bb.embed_grad, feed_dict={ bb.X: [bb_seq], bb.L: [bb_l], bb.Y: [1 - bb_y] })[0][0] grads = numpy.linalg.norm(grad_vecs, axis=-1) candidate_grads = [] candidate_idxs = [] position_tag = nltk.pos_tag(raw) for pos in range(len(position_tag)): tmp_tag = get_part_of_speech( position_tag[pos][1]) if tmp_tag is None: candidate_grads.append(grads[pos]) candidate_idxs.append(pos + 1) continue tmp_wn = wn.synsets(lemmatzr.lemmatize( raw[pos]), pos=tmp_tag) if len(tmp_wn) <= 0: candidate_grads.append(grads[pos]) candidate_idxs.append(pos + 1) continue tmp_swn = swn.senti_synset(tmp_wn[0].name()) if (tmp_swn.obj_score() > swn_obj_threshold \ or (tmp_swn.obj_score() <= swn_obj_threshold \ and abs(tmp_swn.pos_score()-tmp_swn.neg_score()) <= swn_pos_threshold)): candidate_grads.append(grads[pos]) candidate_idxs.append(pos + 1) continue idx_idx = random_pick_idx_with_unnormalized_prob( candidate_grads) idx = candidate_idxs[idx_idx] else: assert False, "Invalid mode \"" + mode + "\"" old_wrong_prob = sess.run(bb.prob, feed_dict={ bb.X: [bb_seq], bb.L: [bb_l] })[0][1 - bb_y] if op == 0: if mask[idx]: continue proposal = m.op_replace(sess, copy.deepcopy(seq), l, copy.deepcopy(bb_seq), bb_l, 1 - bb_y, idx, n_candidate, op_prob) tmp_bb_seq = copy.deepcopy(bb_seq) tmp_str = vocab.get_vocab(proposal['proposal'][idx]) if tmp_str in bb_word2idx.keys(): tmp_bb_seq[idx - 1] = bb_word2idx[tmp_str] else: tmp_bb_seq[idx - 1] = bb_word2idx["<unk>"] new_wrong_prob = sess.run(bb.prob, feed_dict={ bb.X: [tmp_bb_seq], bb.L: [bb_l] })[0][1 - bb_y] tmp_raw = copy.deepcopy(raw) tmp_raw[idx - 1] = vocab.get_vocab( proposal["proposal"][idx]) new_tag = get_part_of_speech( nltk.pos_tag(tmp_raw)[idx - 1][1]) if new_tag is None: new_obj = 1 new_pos = 0 else: new_wn = wn.synsets(lemmatzr.lemmatize( tmp_raw[idx - 1]), pos=new_tag) if len(new_wn) <= 0: new_obj = 1 new_pos = 0 else: new_swn = swn.senti_synset(new_wn[0].name()) new_obj = new_swn.obj_score() new_pos = new_swn.pos_score( ) - new_swn.neg_score() if (just_acc(just_acc_rate) or (numpy.random.uniform(0,1) <= \ proposal["alpha"] * new_wrong_prob / old_wrong_prob and proposal["old_prob"] * swp_lm_threshold <= proposal["new_prob"] and old_wrong_prob * swp_prob_threshold <= new_wrong_prob and (new_obj > swn_obj_threshold # objective or (new_obj <= swn_obj_threshold # neutral and abs(new_pos) <= swn_pos_threshold)) and (tmp_str not in negations))): if new_wrong_prob >= 0.5: res_log[i].append((sample_all, 1)) print( "%d/%d\t%d acc / %d all\tSWP\t SUCC with %.5f\t[%s](%d) => [%s](%d) (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab(seq[idx]), seq[idx], vocab.get_vocab( proposal["proposal"][idx]), proposal["proposal"][idx], idx), file=fout, flush=True) succ = True else: res_log[i].append((sample_all, 0)) print( "%d/%d\t%d acc / %d all\tSWP\t FAIL with %.5f\t[%s](%d) => [%s](%d) (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab(seq[idx]), seq[idx], vocab.get_vocab( proposal["proposal"][idx]), proposal["proposal"][idx], idx), file=fout, flush=True) sample_cnt += 1 seq = proposal["proposal"] bb_seq = tmp_bb_seq raw = tmp_raw sents[-1].append(copy.deepcopy(raw)) print("", end="\t", file=fout, flush=True) for ii in range(len(raw)): print(raw[ii], end=" ", file=fout, flush=True) if bb_y == 1: print("\t<POS>", file=fout, flush=True) else: print("\t<NEG>", file=fout, flush=True) else: print("%d/%d\t%d acc / %d all\tSWP\talpha 
%.2e" % (i + 1, bb_atk_data_size, sample_cnt, sample_all, proposal["alpha"]), file=fout, flush=True) elif op == 1: if idx == l - 1: continue proposal = m.op_insert(sess, copy.deepcopy(seq), l, copy.deepcopy(bb_seq), bb_l, 1 - bb_y, idx, n_candidate, op_prob) tmp_bb_seq = numpy.asarray( copy.deepcopy(bb_seq)).tolist() tmp_str = vocab.get_vocab(proposal['proposal'][idx + 1]) if tmp_str in bb_word2idx.keys(): tmp_bb_seq = tmp_bb_seq[:idx] + [ bb_word2idx[tmp_str] ] + tmp_bb_seq[idx:] else: tmp_bb_seq = tmp_bb_seq[:idx] + [ bb_word2idx["<unk>"] ] + tmp_bb_seq[idx:] tmp_bb_seq = tmp_bb_seq[:-1] tmp_bb_l = bb_l + 1 if tmp_bb_l > bb_max_seqlen: tmp_bb_l = bb_max_seqlen new_wrong_prob = sess.run(bb.prob, feed_dict={ bb.X: [tmp_bb_seq], bb.L: [tmp_bb_l] })[0][1 - bb_y] tmp_raw = copy.deepcopy(raw) tmp_raw = tmp_raw[:idx] + [tmp_str] + tmp_raw[idx:] new_tag = get_part_of_speech( nltk.pos_tag(tmp_raw)[idx][1]) if new_tag is None: new_obj = 1 new_pos = 0 else: new_wn = wn.synsets(lemmatzr.lemmatize( tmp_raw[idx]), pos=new_tag) if len(new_wn) <= 0: new_obj = 1 new_pos = 0 else: new_swn = swn.senti_synset(new_wn[0].name()) new_obj = new_swn.obj_score() new_pos = new_swn.pos_score( ) - new_swn.neg_score() if (just_acc(just_acc_rate) or (numpy.random.uniform(0,1) <= \ proposal["alpha"] * new_wrong_prob / old_wrong_prob and proposal["old_prob"] * ins_lm_threshold <= proposal["new_prob"] and old_wrong_prob * ins_prob_threshold <= new_wrong_prob and (new_obj > swn_obj_threshold # objective or (new_obj <= swn_obj_threshold # neutral and new_pos <= swn_pos_threshold)) and (tmp_str not in negations))): if new_wrong_prob >= 0.5: res_log[i].append((sample_all, 1)) print( "%d/%d\t%d acc / %d all\tINS\t SUCC with %.5f\t[] => [%s](%d,%.1f,%.1f) (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab( proposal["proposal"][idx + 1]), proposal["proposal"][idx + 1], new_obj, new_pos, idx), file=fout, flush=True) succ = True else: res_log[i].append((sample_all, 0)) print( "%d/%d\t%d acc / %d all\tINS\t FAIL with %.5f\t[] => [%s](%d,%.1f,%.1f) (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab( proposal["proposal"][idx + 1]), proposal["proposal"][idx + 1], new_obj, new_pos, idx), file=fout, flush=True) sample_cnt += 1 seq = proposal["proposal"] bb_seq = tmp_bb_seq l += 1 mask = mask[:idx + 1] + [False] + mask[idx + 1:] if l > flags.seq_max_len: l = flags.seq_max_len mask = mask[:l] bb_l = tmp_bb_l raw = raw[:idx] + [vocab.get_vocab(seq[idx + 1]) ] + raw[idx:] sents[-1].append(copy.deepcopy(raw)) print("", end="\t", file=fout, flush=True) for ii in range(len(raw)): print(raw[ii], end=" ", file=fout, flush=True) if bb_y == 1: print("\t<POS>", file=fout, flush=True) else: print("\t<NEG>", file=fout, flush=True) else: print("%d/%d\t%d acc / %d all\tINS\talpha %.2e" % (i + 1, bb_atk_data_size, sample_cnt, sample_all, proposal["alpha"]), file=fout, flush=True) elif op == 2: if mask[idx] or l - 1 < seq_min_len: continue proposal = m.op_delete(sess, copy.deepcopy(seq), l, copy.deepcopy(bb_seq), bb_l, 1 - bb_y, idx, n_candidate, op_prob) tmp_bb_seq = numpy.asarray( copy.deepcopy(bb_seq)).tolist() tmp_str = vocab.get_vocab(seq[idx]) tmp_bb_seq = tmp_bb_seq[:idx - 1] + tmp_bb_seq[idx:] + [ bb_word2idx['<pad>'] ] tmp_bb_l = bb_l - 1 new_wrong_prob = sess.run(bb.prob, feed_dict={ bb.X: [tmp_bb_seq], bb.L: [tmp_bb_l] })[0][1 - bb_y] if (just_acc(just_acc_rate) or (numpy.random.uniform(0,1) <= \ proposal["alpha"] * new_wrong_prob / old_wrong_prob and 
proposal["old_prob"] * del_lm_threshold <= proposal["new_prob"] and old_wrong_prob * del_prob_threshold <= new_wrong_prob) and (tmp_str not in negations)): if new_wrong_prob >= 0.5: res_log[i].append((sample_all, 1)) print( "%d/%d\t%d acc / %d all\tDEL\t SUCC with %.5f\t[%s](%d) => [] (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab(seq[idx]), seq[idx], idx), file=fout, flush=True) succ = True else: res_log[i].append((sample_all, 0)) print( "%d/%d\t%d acc / %d all\tDEL\t FAIL with %.5f\t[%s](%d) => [] (%d)" % (i + 1, bb_atk_data_size, sample_cnt + 1, sample_all, new_wrong_prob, vocab.get_vocab(seq[idx]), seq[idx], idx), file=fout, flush=True) sample_cnt += 1 seq = proposal["proposal"] bb_seq = tmp_bb_seq l -= 1 mask = mask[:idx] + mask[idx + 1:] bb_l = tmp_bb_l raw = raw[:idx - 1] + raw[idx:] sents[-1].append(copy.deepcopy(raw)) print("", end="\t", file=fout, flush=True) for ii in range(len(raw)): print(raw[ii], end=" ", file=fout, flush=True) if bb_y == 1: print("\t<POS>", file=fout, flush=True) else: print("\t<NEG>", file=fout, flush=True) else: print("%d/%d\t%d acc / %d all\tDEL\talpha %.2e" % (i + 1, bb_atk_data_size, sample_cnt, sample_all, proposal["alpha"]), file=fout, flush=True) if succ: end_time = time.time() total_time += end_time - start_time n_succ += 1 print("\tSUCC!") print("\t\ttime =", total_time, n_succ) flush() assert len(mask) == l except Exception as e: print("Something went wrong... Abort!", file=fout, flush=True) print("Something went wrong... Abort! -- Thread %d" % self.__idx) print("\t", e) sys.stdout.flush() sys.stderr.flush() continue with open(res_path, "wb") as f: pkl.dump((res_log, sents), f)
"""
Unit tests for nltk.corpus.wordnet
See also nltk/test/wordnet.doctest
"""
from __future__ import unicode_literals

from nose import SkipTest
import unittest
import os

from nltk.corpus.reader.wordnet import WordNetCorpusReader
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wnic
from nltk.data import find as find_data

wn.ensure_loaded()
S = wn.synset
L = wn.lemma


class WordnNetDemo(unittest.TestCase):

    def test_retrieve_synset(self):
        move_synset = S('go.v.21')
        self.assertEqual(move_synset.name(), "move.v.15")
        self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
        self.assertEqual(move_synset.definition(),
                         "have a turn; make one's move in a game")
        self.assertEqual(move_synset.examples(), ['Can I go now?'])

    def test_retrieve_synsets(self):
        self.assertEqual(sorted(wn.synsets('zap', pos='n')),