# `tp` (text preprocessing helpers), `config` (tokenizer, device, class names)
# and `md` (the trained sentiment model) are project modules assumed to be
# imported at module level.
import torch


def GetSentimentForText(_text):
    """Clean the given text, classify it with the trained model and return the class name."""
    review = str(_text)
    review = tp.expand_contractions(review)
    review = tp.scrub_words(review)
    review = tp.remove_accented_chars(review)
    encoding = config.TOKENIZER.encode_plus(
        review,
        max_length=config.MAX_LEN,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_token_type_ids=False,
        pad_to_max_length=True,  # deprecated in newer transformers; padding='max_length' is the modern equivalent
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )
    input_ids = encoding['input_ids'].to(config.device)
    attention_mask = encoding['attention_mask'].to(config.device)
    output = md.model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    print(f'Review text: {review}')
    print(f'Sentiment : {config.class_names[prediction]}')
    return config.class_names[prediction]
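# Hedged usage sketch for GetSentimentForText, assuming `config`, `tp` and `md`
# are set up as above with the model and tokenizer already loaded; the sample
# text and the helper name `_demo_sentiment` are made up for illustration.
def _demo_sentiment():
    sample = "The movie wasn't great, but I didn't hate it either."
    label = GetSentimentForText(sample)
    # `label` is one of config.class_names, e.g. 'negative' / 'neutral' / 'positive'
    print(f"Predicted label: {label}")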
def getWordCounts(articles, useAbstract=True):
    """
    Get for each calendar week the word frequencies of all occurring words in
    the headlines (and optionally also abstracts) in the given dataset of
    ``articles``.

    Parameters
    ----------
    articles : dict
        Dict of news articles in JSON format.
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    dict
        Dict of dicts for each calendar week with word frequencies.
    """
    result = {}
    articles = get_articles_as_list(articles)
    for a in articles:
        key = (getYear(a['pub_date']), getCalendarWeek(a['pub_date']))
        if key not in result:
            result[key] = {}
        for w in txt.parseSentence(a['headline']):
            if w not in result[key]:
                result[key][w] = 1
            else:
                result[key][w] += 1
        if useAbstract and "abstract" in a:
            for w in txt.parseSentence(a['abstract']):
                if w not in result[key]:
                    result[key][w] = 1
                else:
                    result[key][w] += 1
    return result
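# Hedged usage sketch for getWordCounts; the file name and the printed keys are
# illustrative assumptions, not part of the original project, and the helper
# name `_demo_word_counts` is hypothetical.
def _demo_word_counts():
    import json
    with open("articles_2020.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    counts = getWordCounts(articles, useAbstract=True)
    # Keys are (year, calendar_week) tuples; values map word -> frequency,
    # e.g. counts[(2020, 12)] might look like {'virus': 37, 'election': 12, ...}
    for (year, week), freqs in sorted(counts.items())[:3]:
        top = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:5]
        print(year, week, top)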
import re

from sklearn import preprocessing

# TextPreprocessing is a project-level helper (tokenizer etc.) assumed to be importable.


class ClassificationPreprocessing:

    def __init__(self):
        self.tp = TextPreprocessing()
        self.encoder = preprocessing.MultiLabelBinarizer()
        self.word_classes_indexes = {}

    def fit_from_text(self, ref_file):
        sentences = []
        cl = []
        origin = []
        with open(ref_file, 'r', encoding="utf-8") as f:
            next(f)  # skip the header line
            for line in f:
                line_ = line.split(';')
                if line_[1] == 'T':
                    c = line_[7]
                    c = re.sub(r'[\+\.\-†*!\s]', '', c)
                    cl.append(c)
        self.encoder.fit(cl)
        self.n_classes = len(self.encoder.classes_)
        self.class_to_index = dict(
            zip(self.encoder.classes_, range(self.n_classes)))

    def fit(self, y):
        self.encoder.fit(y)
        self.n_classes = len(self.encoder.classes_)
        self.class_to_index = dict(
            zip(self.encoder.classes_, range(self.n_classes)))

    def fit_word_indexes(self, index, c_index, class_to_exclude=[]):
        for c, s in zip(c_index, index):
            if c not in class_to_exclude:
                s = self.tp.tokenizer(s)
                for w in s:
                    if w not in self.word_classes_indexes:
                        self.word_classes_indexes[w] = set()
                    self.word_classes_indexes[w].update(
                        [self.class_to_index[c]])
        # np.save('word_codes_indexes', self.word_codes_indexes)
        print('Index word size =', len(self.class_to_index))
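# Minimal usage sketch for ClassificationPreprocessing. The labels and sentences
# are invented and `_demo_classification_preprocessing` is a hypothetical helper;
# fit_word_indexes relies on the project's TextPreprocessing.tokenizer.
def _demo_classification_preprocessing():
    cp = ClassificationPreprocessing()
    labels = [('A01',), ('B02',), ('A01', 'B02')]  # hypothetical multi-label targets
    cp.fit(labels)
    print(cp.n_classes, cp.class_to_index)
    sentences = ["fever and cough", "broken arm", "fever after fall"]
    classes = ['A01', 'B02', 'A01']
    cp.fit_word_indexes(sentences, classes)
    print(cp.word_classes_indexes.get('fever'))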
import TextPreprocessing
import pymongo
import os

if __name__ == "__main__":
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client['epilepsy_qa_system_demo_data']
    # collection_names() is the legacy pymongo call; it was removed in pymongo 4
    # in favour of list_collection_names().
    collectionNames = db.collection_names()
    SEED = 1000

    # get epilepsy dataset from demo_data
    illnessSet_epilepsy = TextPreprocessing.processingData(
        isEpilepsy=True, collectionNames=collectionNames, db=db)
    # get no-epilepsy dataset from demo_data
    illnessSet_noEpilepsy = TextPreprocessing.processingData(
        isEpilepsy=False, collectionNames=collectionNames, db=db)
    # build negative data
    illnessSet_epilepsy_negative = TextPreprocessing.buildNegative(
        illnessSet_epilepsy,
        illnessSet_noEpilepsy,
        rate=1,
        seed=SEED,
    )
    dataset_trainAndDev, dataset_test = TextPreprocessing.deal_concat_random_cut(
        illnessSet_epilepsy, illnessSet_noEpilepsy)

    # save dataset
    dataPath = '../data'
import pickle

import streamlit as st
import pandas as pd
import numpy as np
import TextPreprocessing as tp

# Load the trained SVC classifier and the fitted TF-IDF vectorizer
with open('sentiment_svc.pickle', 'rb') as f:
    loaded_classifier = pickle.load(f)
with open('sentiment_tfidf.pickle', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

st.title("Twitter Sentiment Analysis App")
text = st.text_input("Tweet to analyze:")
st.write("Tweet:")
st.write(text)

# Preprocess text, vectorize it and predict the sentiment class
text_cleaned = tp.clean_tweet(text)
transformed_data = loaded_vectorizer.transform([text_cleaned])
prediction = loaded_classifier.predict(transformed_data)

if prediction[0] == 0:
    prediction_name = "Negative"
elif prediction[0] == 1:
    prediction_name = "Neutral"
else:
    prediction_name = "Positive"

st.write(f"Sentiment: {prediction_name}")
import re

# TextPreprocessing, build_vocabulary and vectorize_corpus are project helpers
# assumed to be available in this module's namespace.


class FeatureExtractor:

    def __init__(self):
        self.token_index = {}
        self.dictionary = {}
        self.text_preprocessing = TextPreprocessing()

    def fit(self, sentences, START_VOCAB=[]):
        tokenized_sent = [self.text_preprocessing.tokenizer(s) for s in sentences]
        self.dictionary, self.rev_dictionary = build_vocabulary(tokenized_sent, START_VOCAB)

    def barket_removal(self, s):
        # "barket" is kept as spelled in the original API; generates string
        # variants with the round-bracketed content inlined or dropped.
        all_ = re.findall(r'\((.*?)\)', s)
        if len(all_) != 0:
            all_ = [w for w in all_] + ['']
            _s_ = []
            s_ = re.split(r'\(.*?\)', s)
            if ' ' in s_:
                s_.remove(' ')
            for i, sp in enumerate(all_):
                s__ = ''
                for j, w in enumerate(s_):
                    if j == 0:
                        s__ = w + sp
                    else:
                        s__ = s__ + w
                s__ = re.sub(r'\s{2,}', ' ', s__)
                _s_.append(s__)
            return _s_
        else:
            return [s]

    def square_barket_removal(self, s):
        all_ = re.findall(r'\[(.*?)\]', s)
        if len(all_) != 0:
            _s_ = []
            all_ = [w for w in all_]
            for i, sp in enumerate(all_):
                s_ = re.split(r'\[.*?\]', s)
                if ' ' in s_:
                    s_.remove(' ')
                del s_[-0]  # note: -0 == 0, so this drops the first split segment
                s___ = sp + ' '.join(s_)
                s___ = re.sub(r'\s{2,}', ' ', s___)
                _s_.append(s___)
                s_ = re.split(r'\[.*?\]', s)
                if ' ' in s_ and len(s_) > 2:
                    s_.remove(' ')
                    s___ = ' '.join(s_)
                    s___ = re.sub(r'\s{2,}', ' ', s___)
                    _s_.append(s___)
                s___ = s_[0] + sp + s_[1]
                s___ = re.sub(r'\s{2,}', ' ', s___)
                _s_.append(s___)
            return _s_ + all_
        else:
            return [s]

    def features_from_tokens(self, tokenize_sent, max_length):
        return vectorize_corpus(tokenize_sent, max_length, self.dictionary)
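# Quick illustrative calls for the bracket-handling helpers ("barket" is kept as
# spelled in the original identifiers). The input strings and the helper name
# `_demo_bracket_variants` are invented; FeatureExtractor() needs the project's
# TextPreprocessing class to be importable.
def _demo_bracket_variants():
    fe = FeatureExtractor()
    # Each call returns a list of string variants with the bracketed part
    # inlined, dropped, or kept on its own.
    print(fe.barket_removal("chest pain (left side) after exercise"))
    print(fe.square_barket_removal("fracture [closed] of the wrist"))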
def restore_keyword(keyword, articles, start=None, end=None, searchrange=None,
                    minLength=2, minCount=5, useAbstract=True):
    """
    Get subsequences of words which contain a single ``keyword``.

    Searches all subsequences of words which contain the ``keyword`` in
    headlines and abstracts for the given dataset of ``articles``.

    Notes
    -----
    1.) In contrast to ``get_cooccurrences``, here we consider the positional
        distance of a word to the keyword in the headline/abstract.
    2.) Using ``searchrange`` improves the performance.

    Parameters
    ----------
    keyword : str
        Keyword in ``articles``.
    articles : dict
        Dict of news articles in JSON format.
    start : datetime.date
        Search is limited to articles which were published on or after this
        day (default is None).
    end : datetime.date
        Search is limited to articles which were published on or before this
        day (default is None).
    searchrange : int
        Only consider words within this maximum (symmetric) positional
        distance to the keyword in the headline/abstract (default is None).
    minLength : int
        Minimum length (number of words) of the subsequence (default is 2).
    minCount : int
        Minimum amount of occurrences of the subsequence (default is 5).
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    list
        Sorted (descending count) list of tuples with format:
        (word_sequence, count)
    """
    data = []
    articles = get_articles_as_list(articles)
    for a in articles:
        match = True
        if "abstract" in a and useAbstract:
            content = txt.parseSentence(a["headline"] + " " + a["abstract"])
        else:
            content = txt.parseSentence(a["headline"])
        if keyword not in content:
            match = False
        if match and (start is not None or end is not None):
            date = parse_pubdate(a["pub_date"])
            if start is not None and date < start:
                match = False
            if end is not None and date > end:
                match = False
        if match:
            if searchrange is not None:
                for index in [i for i, v in enumerate(content) if v == keyword]:
                    data.append(content[max(index - searchrange - 1, 0):
                                        min(index + searchrange, len(content))])
            else:
                data.append(content)
    substring_counts = subsequence_counts(data, minLength=minLength, minCount=minCount)
    result = {}
    for el in substring_counts:
        if keyword in el[0]:
            result[" ".join(el[0])] = el[1]
    return [(k, result[k]) for k in sorted(result, key=result.get, reverse=True)]
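# Hedged usage sketch for restore_keyword; the file name, keyword and date range
# are placeholders, and `_demo_restore_keyword` is a hypothetical helper. The
# project helpers (get_articles_as_list, txt, parse_pubdate, subsequence_counts)
# are assumed to be available.
def _demo_restore_keyword():
    import json
    from datetime import date
    with open("nyt_articles.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    phrases = restore_keyword("vaccine", articles,
                              start=date(2020, 3, 1), end=date(2020, 6, 30),
                              searchrange=4, minLength=2, minCount=5)
    # Each entry is (word_sequence, count), sorted by descending count
    for sequence, count in phrases[:10]:
        print(count, sequence)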
def get_group_cooccurrences(keywords, articles, starts=None, ends=None, useAbstract=True):
    """
    Get co-occurring words for multiple ``keywords``.

    Searches all words which occur together with words from ``keywords`` in
    headlines and abstracts for the given dataset of ``articles``.

    Notes
    -----
    In contrast to ``restore_keyword``, here we do not consider the positional
    distance of a word to the keyword in the headline/abstract.

    Parameters
    ----------
    keywords : list
        List of keywords in ``articles``.
    articles : dict
        Dict of news articles in JSON format.
    starts : dict of datetime.date
        Per-keyword dict; search is limited to articles which were published
        on or after this day for the respective keyword (default is None).
    ends : dict of datetime.date
        Per-keyword dict; search is limited to articles which were published
        on or before this day for the respective keyword (default is None).
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    dict
        Dict containing for each keyword a sorted (descending count) list of
        tuples with format: (co-occurring_keyword, count)
    """
    result = {}
    for keyword in keywords:
        result[keyword] = {}
    articles = get_articles_as_list(articles)
    for a in articles:
        if "abstract" in a and useAbstract:
            content = txt.parseSentence(a["headline"] + " " + a["abstract"])
        else:
            content = txt.parseSentence(a["headline"])
        for keyword in keywords:
            match = True
            if keyword not in content:
                match = False
            if match and (starts is not None or ends is not None):
                date = parse_pubdate(a["pub_date"])
                if starts is not None and starts[keyword] is not None:
                    if date < starts[keyword]:
                        match = False
                if ends is not None and ends[keyword] is not None:
                    if date > ends[keyword]:
                        match = False
            if match:
                for cooccurrence in content:
                    if cooccurrence != keyword:
                        if cooccurrence not in result[keyword]:
                            result[keyword][cooccurrence] = 1
                        else:
                            result[keyword][cooccurrence] += 1
    for keyword in keywords:
        result[keyword] = [(k, result[keyword][k]) for k in sorted(
            result[keyword], key=result[keyword].get, reverse=True)]
    return result
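# Hedged usage sketch for get_group_cooccurrences; keywords, dates and the input
# file are invented, and `_demo_group_cooccurrences` is a hypothetical helper.
# Note that `starts` and `ends` are dicts keyed by keyword.
def _demo_group_cooccurrences():
    import json
    from datetime import date
    with open("nyt_articles.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    keywords = ["vaccine", "lockdown"]
    starts = {"vaccine": date(2020, 3, 1), "lockdown": None}
    ends = {"vaccine": None, "lockdown": date(2020, 12, 31)}
    cooc = get_group_cooccurrences(keywords, articles, starts=starts, ends=ends)
    # cooc["vaccine"] is a list of (co-occurring_word, count) tuples, most frequent first
    for word, count in cooc["vaccine"][:10]:
        print(count, word)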
        # (closing statement of a preceding method that is not part of this snippet)
        return result

    def get_identified_species_count(self, columns):
        # Count how many distinct value combinations over `columns` occur in the CSV
        res = []
        for _, row in self.csv.iterrows():
            current_comb = ""
            for col in columns:
                current_comb += str(row[col])
            res.append(current_comb)
        c = Counter(res)
        return len(c)

    def search_combinations(self):
        print("Searching combinations...")
        result = self.random_search_column_combinations(100, 20)[:5]
        amount, speciescount, score, count, combinations = result[0]
        print(str(len(combinations)), "channels can distinguish",
              str(round(score * 100, 2)) + "%", "of all species")
        with open(data_paths.most_common_values_best_features, "w") as text_file:
            text_file.write("Amount: " + str(amount) + "\n")
            text_file.write("Species count: " + str(speciescount) + "\n")
            text_file.write("Score: " + str(score) + "\n")
            text_file.write("Features count: " + str(count) + "\n")
            text_file.write("Features: '" + "', '".join(x for x in combinations) + "'")


if __name__ == "__main__":
    ImageToCSVConverter.extract_occurences_train()
    TextPreprocessing.extract_train()
    MostCommonValueExtractor.extract()
    MostCommonValuesUniqueCountDiagram().search_combinations()
    # MostCommonValuesUniqueCountDiagram().plot()
def __init__(self):
    # Load the preprocessed training CSV together with the species list and count
    csv, species, species_c = TextPreprocessing.load_train()
    self.csv = csv
    self.species = species
    self.species_count = species_c