def remove_stopwords(words):
    stopwords = get_stopwords()
    # all_words = [x for x in words]
    all_words = [re.sub(r'[^\w\s]', '', x) for x in words]  # remove punctuation
    all_words = [x for x in all_words if x not in stopwords]
    return all_words
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dialogue_file',
        default='../data/subtitles/subtitlesInTSV/finding_nemo_clean.tsv')
    parser.add_argument('--data_dir',
                        default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    dialogue_file = args.dialogue_file
    data_dir = args.data_dir
    sub_name = os.path.basename(dialogue_file).replace('.tsv', '')
    print(sub_name)
    custom_words = [
        'will', 'don', 've', 're', 'oh', 'hey', 'ha', 'aah', 'll', 'can',
        'dont', 'just'
    ]
    stops = get_stopwords('en') + custom_words
    data = pd.read_csv(dialogue_file, sep='\t')
    all_docs = {}
    for slice_, data_group in data.groupby('slice'):
        clean_dialogue = []
        for d in data_group['dialogue']:
            # print('raw dialogue %s'%(d))
            # cleaned = clean_text(str(d))
            try:
                cleaned = d.decode('utf-8')
                clean_dialogue.append(cleaned)
            except Exception as e:
                print('could not clean text %s because error %s' % (d, e))
        all_dialogue = ' '.join(clean_dialogue)
        all_docs[slice_] = all_dialogue
def tokeniser(desc_text):
    return [
        PorterStemmer().stem(token) for token in wordpunct_tokenize(
            re.sub(r'[%s]|\w*\d\w*' % re.escape(string.punctuation), '',
                   desc_text.lower()))
        if token.lower() not in get_stopwords()
    ]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tf',
        # default='../../data/frequency/2015_2016_tf.tsv')
        default='../../data/frequency/2015_2016_tf_norm.tsv')
    parser.add_argument('--top_k', type=int, default=100000)
    args = parser.parse_args()
    tf_file = args.tf
    top_k = args.top_k
    print(tf_file)
    timeframe = re.findall('201[0-9]_201[0-9]', tf_file)[0]
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    totals = tf.sum(axis=1)
    totals.sort_values(inplace=True, ascending=False)
    stops = set(get_stopwords('en'))
    # only want valid words!!
    valid_words = list(
        filter(lambda w: (type(w) is str and w.isalpha()) and w not in stops,
               totals.index))
    top_vocab = totals.loc[valid_words][:top_k]
    top_vocab = pd.DataFrame(top_vocab, columns=['count'])
    print('got %d vocab' % (len(top_vocab)))
    # renormalize
    top_vocab.loc[:, 'count'] = top_vocab.loc[:, 'count'] / \
        top_vocab.loc[:, 'count'].sum(axis=0)
    out_dir = os.path.dirname(tf_file)
    out_fname = os.path.join(out_dir,
                             '%s_top_%d_vocab.tsv' % (timeframe, top_k))
    top_vocab.to_csv(out_fname, sep='\t')
def remove_stopwords(words):
    translator = str.maketrans('', '', punctuation)
    stopwords = get_stopwords()
    all_words = [
        word.translate(translator) for word in words if word not in stopwords
    ]
    return all_words
def frequence_word(list):
    # Unit test done.
    # Takes as input the list returned by the domain search in the analyse function.
    # Here we compute the frequency of each word in the list.
    nb_frequence = {}
    mots_inutiles = stopwords.get_stopwords('fr') + stopwords.get_stopwords('en')
    for word in list:
        # Only keep words longer than 3 characters so that they carry real
        # meaning, and drop the stopwords.
        if word not in mots_inutiles and len(word) > 3:
            if word in nb_frequence:
                nb_frequence[word] += 1
            else:
                nb_frequence[word] = 1
    return nb_frequence
def do_process(args):
    sc = SparkContext(appName="task")
    sc.addPyFile("generate_key.py")
    stop_words = get_stopwords(args.lang)
    rdd = sc.textFile(args.dump)
    rdd_processed = rdd.flatMap(
        lambda x: map_preprocess_wikidata(x, args.lang, stop_words))
    rdd_processed.saveAsSequenceFile(args.output)
def text_pre_processing(csvFile, columnNumberForText):
    # import data-set
    # colNum becomes an index, which should start at 0, and columns in
    # spreadsheets start at 1, so subtract 1 from columnNumberForText
    documents = importColumnFromCSV(fileName=csvFile,
                                    colNum=int(columnNumberForText) - 1,
                                    header=True)
    print "imported documents..."

    # phrase detection model training
    abstracts = []  # list of abstracts containing a list of words
    for line in documents:
        # tokenize abstract
        tokens = nltk.word_tokenize(remove_non_ascii(line))
        abstracts.append(tokens)

    # create bigram and trigram phrase models
    bigram = models.Phrases(abstracts)
    trigram = models.Phrases(bigram[abstracts])
    print "built bigram and trigram phrase detection models..."

    # text pre-processing tools
    stops = get_stopwords('en')
    # stronger stopwords
    STOPS = list(' '.join(str(e).title() for e in stops).split())  # uppercase stopwords
    noNum = re.compile(r'[^a-zA-Z ]')  # number and punctuation remover

    # function that cleans the text
    def clean(text):
        clean_text = noNum.sub(' ', text)  # remove numbers and punctuations
        tokens = nltk.word_tokenize(clean_text)  # tokenize text
        filtered_words = [w for w in tokens if not w in stops]  # filter out lowercase stopwords
        double_filtered_words = [w for w in filtered_words if not w in STOPS]  # filter out uppercase stopwords
        trigrams = trigram[bigram[double_filtered_words]]  # apply the bigram and trigram models to the filtered words
        trigrams_str = ' '.join(str(x) for x in trigrams)  # stringify clean and filtered tokens
        return trigrams_str

    results = []  # create list for storing clean abstracts

    # figure out path for the text corpus
    rawFilePathBase = os.path.basename(csvFile)
    rawFileName = os.path.splitext(rawFilePathBase)[0]
    corpusPath = "../../data/" + rawFileName + "_textCorpus.txt"

    # write list of clean text documents to text corpus file
    with open(corpusPath, 'w') as f:
        print 'Cleaned up text corpus file has been created at ', corpusPath, ' ...'
        f.truncate()  # if file is not empty, remove everything inside the file
        for abstract in documents:
            text = clean(abstract)  # clean each abstract, one at a time
            f.write(text + '\n')  # write clean abstract to desired text corpus file
            results.append(text)  # append clean abstracts to list

    return results, corpusPath  # return a list of clean abstracts
def preprocess(self, s):
    # extract all tokens built with specific words for tweets
    tokens = self.tokenize(s)
    # gather stop words
    punctuation = list(string.punctuation)
    stop = stopwords.get_stopwords() + punctuation
    # return tokens without stop words
    tokens = [
        token.lower() for token in tokens if token.lower() not in stop
    ]
    return tokens
def __init__(self, name=None, level=None, **kwargs):
    super(SimilarityCache, self).__init__(**kwargs)
    self.name = name
    self.confdLevel = level
    self.docuCommonWords = [
        'principal', 'discharge', 'diagnosis', 'responsible', 'after',
        'study', 'causing', 'admission', 'same', 'other', 'record', 'orders',
        'end', 'conditions', 'infections', 'complications', 'diet', 'service',
        'admission', 'date', 'limited', 'need', 'felt', 'month', 'day',
        'years', 'service', 'full', 'code', 'status', 'medications',
        'entered', 'order', 'summary', 'will', 'none', 'summary:', 'home',
        'year', '~', 'liter', 'status:', 'know', '?'
    ]
    self.stop_words = get_stopwords('en')
def __stem_doc(doc_details):
    # Import nltk tools
    from nltk.tokenize import wordpunct_tokenize as wordpunct_tokenize
    # from nltk.stem.snowball import EnglishStemmer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer

    idx, doc = doc_details
    if idx % 100 == 0:
        print "Processed doc " + str(idx)
    if doc.endswith('.txt'):
        d = open(doc).read()
        stemmer = EnglishStemmer()  # This method only works for english documents.
        # Stem, lowercase, substitute all punctuations, remove stopwords.
        attribute_names = [
            stemmer.stem(token.lower()) for token in wordpunct_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), '',
                       d.decode(encoding='UTF-8', errors='ignore')))
            if token.lower() not in stopwords.get_stopwords()
        ]
        s.dump(attribute_names, open(doc.replace(".txt", ".p"), "wb"))
def do_process(args):
    sc = SparkContext(appName="task")
    stop_words = get_stopwords(args.lang)
    sections_by_title = extract_sections(args.dump)
    dataset = []
    for title, sections in sections_by_title.items():
        for section in sections:
            key = generate_key(stop_words, title, section)
            value = "{}#{}".format(title, section)
            if key != "":
                dataset.append((key, value))
    rdd = sc.parallelize(dataset)
    rdd.saveAsSequenceFile(args.output)
def calculate_similarity(query, text, model_path):
    embeddingSize = 300
    query_embedding = np.zeros((1, embeddingSize))
    stop = stopwords.get_stopwords('english')
    model = word2vec.load(model_path)
    query_embedding = get_embedding(query, model, stop, query_embedding)
    nword = 0
    score = 0.0
    for word in nltk.tokenize.word_tokenize(text.decode('utf8')):
        if word in model and word not in stop:
            nword += 1
            wordNorm = linalg.norm(model[word])
            score += np.dot(query_embedding, model[word]) / wordNorm
    if nword != 0:
        score = score / nword
    print score[0]
    return score[0]
def pos_tag_tweets(name):
    # Remove stopwords
    stop_words = set(stopwords.get_stopwords())
    a = {
        ',', '!', '#', '%', ':', '+', '.', '@', '-', '&', '?', '\"', '\'',
        '(', ')', '\'', '`'
    }
    stop_words.update(a)
    # get all the tweets and tokenize them
    tokenized_tweets = []
    candidate_tweets = candidate.tweets(name)
    temp = word_tokenize(candidate_tweets)
    for t in temp:
        if t not in stop_words:
            tokenized_tweets.append(t)
    tagged_tweets = nltk.pos_tag(tokenized_tweets)
    return tagged_tweets
def liste_mots(tweets):
    # Takes a list of tweet texts as input and joins them into a single string.
    text = ''
    mots_inutiles = stopwords.get_stopwords('en')
    mystopwords = ['I', 'will', '\'', 'The', 'http']
    for tweet in tweets:
        text += str(tweet)
    liste = TextBlob(text)
    # wordlist is the list of words from all the tweets
    wordlist = liste.words
    unique = []
    for word in wordlist:
        w = Word(word)
        # Keep the words that are not stopwords, reduce them to their base
        # form, and keep only those that are at least 3 characters long.
        if word not in mots_inutiles and word not in mystopwords and len(
                word) > 2:
            word_lemmatize = w.lemmatize()
            unique.append(word_lemmatize)
    # returns the words as a LIST
    return unique
def get_tweets(username):
    forbidden_words = ["https", "RT", "en", "lo", "de", "the", "a", ""]
    # http://tweepy.readthedocs.org/en/v3.1.0/getting_started.html#api
    auth = tweepy.OAuthHandler(config.BaseConfig.CONSUMER_KEY,
                               config.BaseConfig.CONSUMER_SECRET)
    auth.set_access_token(config.BaseConfig.ACCESS_KEY,
                          config.BaseConfig.ACCESS_SECRET)
    api = tweepy.API(auth)

    # set count to however many tweets you want
    number_of_tweets = 100

    # get tweets
    tweets_for_csv = []
    words = ''
    for tweet in tweepy.Cursor(api.user_timeline,
                               screen_name=username).items(number_of_tweets):
        words += str(tweet.text.encode("utf-8"), 'utf-8')
    stop_words = get_stopwords()
    word_tokens = words.split(" ")
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words and "@" not in w and "https" not in w
    ]
    words = ' '.join(filtered_sentence)
    return words
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    data_dir = args.data_dir
    dialogue_files = [
        f for f in os.listdir(data_dir) if re.findall('S[1-9]E[0-9]+.tsv', f)
    ]
    dialogue_files = [os.path.join(data_dir, f) for f in dialogue_files]
    stops = get_stopwords('en') + [
        'will',
        'don',
        've',
    ]
    all_docs = {}
    for f in dialogue_files:
        ep_name = re.findall('S[1-9]E[0-9]+', f)[0]
        data = pd.read_csv(f, sep='\t')
        docs = []
        for chunk, data_group in data.groupby('chunk'):
            clean_dialogue = []
            for d in data_group['dialogue']:
                # print('raw dialogue %s'%(d))
                cleaned = clean_text(str(d))
                try:
                    cleaned = cleaned.decode('utf-8')
                    clean_dialogue.append(cleaned)
                except Exception as e:
                    print('could not clean text %s because error %s' %
                          (cleaned, e))
            all_dialogue = ' '.join(clean_dialogue)
            docs.append(all_dialogue)
        episode_text = ' '.join(docs)
        # print('got full text %s'%
        #       (episode_text))
        all_docs[ep_name] = episode_text
def normalisasi2(pos_texts, neg_texts, kamus_hasil):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopwords = get_stopwords()

    pos_texts_normalized = []
    for text in pos_texts:
        pos_text_normalized = []
        for word in text.split():
            # normalize the word via the lookup dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    pos_text_normalized.append(word)
        pos_texts_normalized.append(' '.join(pos_text_normalized))

    neg_texts_normalized = []
    for text in neg_texts:
        neg_text_normalized = []
        for word in text.split():
            # normalize the word via the lookup dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    neg_text_normalized.append(word)
        neg_texts_normalized.append(' '.join(neg_text_normalized))

    return pos_texts_normalized, neg_texts_normalized
def initFeatures(self):
    ### text vectorization--go from strings to lists of numbers
    # words to exclude
    exclusion = stopwords.get_stopwords('english')
    # vectorizer
    vectorizer = TfidfVectorizer(stop_words=exclusion)

    jokes = preprocessJoke(self.DATA_LIMIT, jokeData)
    jokeLabels = labelData(jokes, 1)  # 1 is joke
    print("Joke Labels length:", len(jokeLabels))

    tweets = preprocessNormalTweets(self.DATA_LIMIT, normalTweetData)
    tweetLabels = labelData(tweets, 0)  # 0 is not joke
    print("Tweet labels length:", len(tweetLabels))

    # concat joke_labels and tweet_labels
    training_labels = jokeLabels + tweetLabels
    print("training label length:", len(training_labels))

    # concat features
    training_features = jokes + tweets
    print("training feature length:", len(training_features))

    transformedFeatures = vectorizer.fit_transform(training_features).toarray()
    return transformedFeatures, training_labels, vectorizer
def tokenize(s):
    return tokens_re.findall(s)


def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [
            token if emoticon_re.search(token) else token.lower()
            for token in tokens
        ]
    return tokens


print(preprocess(sentence))

stop = set(stopwords.get_stopwords('english'))
# stop = set(stopwords.words('english'))

tweets_data_path = '/Users/priyamurthy/Documents/PycharmProjects/program1/twitter_data.txt'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet['text'])
    except:
        continue

print 'Im here'
# i=0
# while i<len(tweets_data):
from text_to_data import Doc2Data
from text_to_data import CalculatePair
from model import train_model, get_data
from progress.bar import Bar
import os
from corpy.udpipe import Model
import stopwords
from random import randint
import numpy as np

m = Model("russian-syntagrus-ud-2.5-191206.udpipe")
stop = stopwords.get_stopwords('ru')

# postfix can be _sm for demo corpus and _med for second part, _all for all corpus
postfix = '_all'
text_folder_name = 'texts' + postfix + '/'
data_folder_name = 'data' + postfix + '/'


def make_data_from_texts():
    # goes through folder and process all texts in json
    all_texts = os.listdir(text_folder_name)
    for text in Bar(' text parsing...').iter(all_texts):
        Doc2Data(text_folder_name + text, m, stop, data_folder_name)


def make_pairs(authors):
    all_texts = os.listdir(text_folder_name)
    texts = open('db' + postfix + '.csv', 'r').read().split('\n')[:authors]
    text = []
    for i in texts:
from orderedset._orderedset import OrderedSet
import json
import string
import re
from stopwords import get_stopwords
from tika import language

stopwords = get_stopwords("en")
stopwords = [x.upper() for x in stopwords]

#freqListFile = open("/Users/charanshampur/solr/lucene_solr_4_10/solr/example/solr-webapp/webapp/MyHtml/freqList.json","w")
freqListFile = open("freqList.json", "w")
sweetJsonFile = open(
    "/Users/charanshampur/PycharmProjects/CSCI599/MetaScoreNew.json", "r")
jsonLoad = json.load(sweetJsonFile)
langFile = open("Language.json", "r")
langDictionary = json.load(langFile)

removeWords = [
    "FOR", "LOGIN", "SALE", "NEW", "FREE", "``", "BUY", "SYSTEM", "WANT",
    "REPORT", "WITHIN", "S", "...", "TO", "SAN", "P", "W/", "ALL", "'S", "W",
    "M", "PAGE", "ITEMS"
]

#print "NLTK succesfully loaded<br>"
#print "Json succesfully loaded"

wordCloud = {}
skipList = [
    "NER_DATE", "id", "Geographic_LATITUDE", "content", "title",
    "Measurements", "Meta_Score", "NER_PERCENT", "NER_MONEY", ""
]
from nltk.tokenize import RegexpTokenizer
from stopwords import get_stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import sys

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stopwords('en')

a = [
    '/', 's', 'n', ' ', 'may', '9', '11', '2016', 'brooklyn', 'cruise',
    'terminal', 'brooklyn', 'york', 'thus', 'far', 'disrupt', 'ny', 've',
    'seen', 'handshake', 'stalemate', 'iab', 'randall', 'rothenberger',
    'adblocker', 'till', 'faida', 've', 'seen', 'submit', 'startup',
    'careers', 'contact', 'us', 'privacy', 'policy', 'disclaimer', 'activate',
    'facebook', 'messenger', 'news', 'bot', 'subscribed', 'bot', 'will',
    'send', 'digest', 'trending', 'stories', 'day', 'can', 'also',
    'customize', 'types', 'stories', 'sends', 'click', 'button', 'subscribe',
    'wait', 'new', 'facebook', 'message', 'tc', 'messenger', 'news', 'bot',
    'thanks', 'tc', 'team', 'cost', 'costs', 'text', 'com', 'we', 'users',
    'user', 'people', 'global', 'you', 'city', 'state', 'country'
]

do_not_include = [
def get_default_stopwords():
    return get_stopwords('en')
from stopwords import get_stopwords
from bs4 import BeautifulSoup
from emoji_processing import replace_hidden_emoji
from message_reactions import delete_reaction_end

stopwords = get_stopwords()


def read_file(filepath=None, text=None):
    if filepath:
        with open(filepath, "r") as f:
            lines = f.readlines()
    elif text:
        lines = [text]
    else:
        raise Exception("Neither text or filepath was entered")

    soup = BeautifulSoup(lines[0], "html.parser")
    names = [
        n.text
        for n in soup.findAll("div", {"class": "_3-96 _2pio _2lek _2lel"})
    ]
    messages = [m.text for m in soup.findAll("div", {"class": "_3-96 _2let"})]
    times = [t.text for t in soup.findAll("div", {"class": "_3-94 _2lem"})]
    names.reverse()
    times.reverse()
    messages.reverse()
    return list(zip(names, times, messages)), names, times, messages
        # Appending the articles
        url.append(urls)
        title.append(titles)
        dop.append(dops)
        content.append(texts)
        # Condition for reading in the 25000 articles
        ten_thousand += 1
    except:
        count += 1

# Tokenising
tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
english_stopwords = get_stopwords('en')
english_stopwords.append('reuters')
english_stopwords.append('said')
token_content = []
processed_content = []
for article in content:
    tokens = tokenizer.tokenize(article.lower())
    token_content.append(tokens)
    stopped_tokens = [i for i in tokens if i not in english_stopwords]
    processed_content.append(stopped_tokens)

# Creating a bigram model
bigram = models.Phrases(token_content, min_count=5, threshold=100)
bigram_mod = models.phrases.Phraser(bigram)
bigram_content = [bigram_mod[i] for i in processed_content]
    HAS_NLTK = True
except Exception, exception:
    HAS_NLTK = False
    print "word stemmer is turned off; "\
        "download nltk at http://www.nltk.org/ for feature"

from stopwords import get_stopwords

HAS_NLTK = False
wn_lemmatizer = None
if HAS_NLTK:
    # create stemmer
    wn_lemmatizer = WordNetLemmatizer()

# get list of stopwords
stop_words = get_stopwords()

# punctuation
EndPunctuationSet = set(".,?!()[]`%'\"")
StartPunctuationSet = set("\"([&^*<@`")


def reversed_dict(index):
    """ reverse keys and values """
    return dict((v, k) for k, v in index.iteritems())


def open_text(filename):
    """opens single file with multiple documents, separated by <TEXT> tag"""
    wlist = []
    groups = []
    with open(filename, 'r') as tfile:
N_SLICES = 60

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--sub_file',
        default='../data/subtitles/subtitlesInTSV/finding_nemo_clean.tsv')
    parser.add_argument(
        '--LIWC_dir',
        default='/hg191/corpora/LIWC/resources/liwc_lexicons/')
    args = parser.parse_args()
    sub_file = args.sub_file
    LIWC_dir = args.LIWC_dir
    LIWC_categories = [
        'positive_affect', 'negative_affect', 'anger', 'death', 'family',
        'home', 'humans', 'social', 'percept', 'insight'
    ]
    stopwords = get_stopwords('en')
    LIWC_category_wordlists = {
        c: [
            re.compile('^' + l.strip() + '$')
            for l in open(os.path.join(LIWC_dir, '%s' % (c)), 'r')
            if l.strip() not in stopwords
        ]
        for c in LIWC_categories
    }
    # replace positive/negative affect
    LIWC_categories += ['positive', 'negative']
    LIWC_categories.remove('positive_affect')
    LIWC_categories.remove('negative_affect')
    LIWC_category_wordlists['positive'] = LIWC_category_wordlists.pop(
        'positive_affect')
    LIWC_category_wordlists['negative'] = LIWC_category_wordlists.pop(
def main():
    # if GPU is available, use GPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Load the training dataset, and create a dataloader to generate a batch.
    # Lowercasing and sequence-length computation are handled automatically.
    textField = data.Field(
        lower=True,
        include_lengths=True,
        batch_first=True,
        preprocessing=preprocessing,  # word-level preprocessing, e.g. stripping tense variants
        postprocessing=postprocessing,
        stop_words=get_stopwords())  # drop every word in the stopword list
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.csv', 'csv', {
        'text': ('text', textField),
        'target': ('target', labelField)
    })
    # convert the data to vectors, using the textField defined above
    textField.build_vocab(dataset, vectors=config.wordVectors)

    # split the dataset into a training set and a validation set
    train_dataset, validate_dataset = dataset.split(
        split_ratio=config.proportion_of_val_dataset,
        stratified=True,
        strata_field='target')
    train_loader, val_loader = data.BucketIterator.splits(
        (train_dataset, validate_dataset),
        shuffle=True,
        batch_size=config.batchSize,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    net = get_model(config.dim, config.from_old_model,
                    config.model_path).to(device)
    criterion = config.criterion
    params = net.parameters()

    # create optimizer
    if config.optimizer_name == "SGD":
        optimizer = toptim.SGD(params, lr=config.learning_rate)
    elif config.optimizer_name == "Adam":
        optimizer = toptim.Adam(params, lr=config.learning_rate)
    elif config.optimizer_name == "AdamW":
        optimizer = AdamW(params, lr=config.learning_rate, weight_decay=1e-6)

    # mixed-precision speed-up
    if config.use_apex:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O1")

    train_start = time.time()

    for epoch in range(config.epochs):
        '''
        # change lr by epoch
        adjust_learning_rate(optimizer, epoch)
        '''
        # start train
        train(net, train_loader, config.criterion, optimizer, epoch, device,
              log, textField)
        # start val
        val(net, val_loader, config.criterion, optimizer, epoch, device, log,
            train_start, textField)

    print("Final saved model is epoch " + str(best_val_acc[0]) + ", acc: " +
          str(best_val_acc[1]) + ".")
    log.write("Final saved model is epoch " + str(best_val_acc[0]) +
              ", acc: " + str(best_val_acc[1]) + "\n")
    print("Done.")
    log.write("Done.\n")
import pickle
import warnings
warnings.filterwarnings('ignore')
import os
import sys
import numpy as np
import pandas as pd
np.set_printoptions(threshold=sys.maxsize)
from time import time
from utils import preprocess
from stopwords import get_stopwords

STOPWORDS = get_stopwords()

t = time()

# READ IN REVIEWS
print('Loading Dataset...')
reviews = pd.read_csv('data/amazon_reviews_us_Electronics_v1_00.tsv',
                      sep='\t',
                      error_bad_lines=False)
reviews = reviews.iloc[:1000]
print('Dataset Loaded: ', round(time() - t, 2), 's')
print("Full Size:", reviews.shape[0], ' reviews')

# DROP USELESS ROWS
print('Cleaning dataframe...')
E_simple = reviews[[
def stopper(testo):
    stop_words = set(stopwords.get_stopwords('english'))
    result = [i.lower() for i in testo if i.lower() not in stop_words]
    return result