def main(data_dir, out_dir):
    docid2path = dict()
    # iterable of (doctext, docpath) tuple
    reader = TextsStreamReader(data_dir, as_lines=False)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_docs_as_lines.txt'),
        'w', 'utf-8', 'ignore')
    docid = 0
    opts = dict(sents=False, lower=True, stem=False, min_token_len=3,
                min_sent_len=4, remove_stops=True,
                filters=['strip_multiple_whitespaces', 'strip_tags',
                         'strip_punctuation', 'split_alphanum',
                         'strip_numeric'])
    for doctext, docpath in reader:
        doctext = preprocess_text(doctext, **opts)
        # generator to list
        doctext = list(doctext)
        if doctext:
            # when sents=False, each document is returned as a single sentence
            # (first element), where every element is a list of tokens
            doctext = doctext[0]
        if doctext:
            docid2path[docid] = docpath
            outfile.write(" ".join(doctext) + '\n')
            docid += 1
    outfile.close()
    utils.pickle(docid2path, os.path.join(out_dir, 'docid2path.pkl'))

    # create another file to hold sentences (useful for word2vec)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_sents_as_lines.txt'),
        'w', 'utf-8', 'ignore')
    opts['sents'] = True
    for doctext, _ in reader:
        docsents = preprocess_text(doctext, **opts)
        docsents = list(docsents)
        if docsents:
            for sent in docsents:
                if sent:
                    outfile.write(" ".join(sent) + '\n')
    outfile.close()
def _cooccurrence_preprocessing(doc, context, already_preprocessed):
    """Preprocess document as needed for co-occurrence network creation"""
    if context == 'window':
        if already_preprocessed:
            doc = doc.split(' ')
        else:
            doc = preprocess.preprocess_text(doc)
    elif context == 'sentence':
        doc = preprocess.tokenize_sentences(doc)
        for i, sentence in enumerate(doc):
            sentence = preprocess.preprocess_text(sentence)
            doc[i] = sentence
    return doc
def create_model(d, f):
    model = Counter()
    for file in f:
        content = preprocess_text(d + file)
        c = ngrams(content, 2)
        model.update(c)
    return model
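# A minimal, self-contained sketch of the same idea as create_model above:
# counting bigrams from tokenized text with collections.Counter. Here a plain
# whitespace tokenizer stands in for preprocess_text, and the helper/variable
# names are illustrative only.
from collections import Counter

def bigrams(tokens):
    """Yield adjacent token pairs, e.g. ('the', 'cat')."""
    return zip(tokens, tokens[1:])

toy_docs = ["the cat sat on the mat", "the cat ran"]
model = Counter()
for doc in toy_docs:
    tokens = doc.lower().split()  # stand-in for preprocess_text()
    model.update(bigrams(tokens))

print(model.most_common(3))  # e.g. [(('the', 'cat'), 2), ...]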
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)

    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print ' dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
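# A small standalone sketch of the TF / TF-IDF weighting used by text_to_vector
# and text_to_dict above, i.e. tf(w) * log(N / df(w)), written with plain dicts
# and collections.Counter instead of nltk.FreqDist. The toy documents and names
# here are illustrative, not part of the original module.
import math
from collections import Counter

docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "dog", "ran"]]
num_docs = len(docs)

tf_dists = [Counter(doc) for doc in docs]
doc_freqs = Counter()
for fd in tf_dists:
    doc_freqs.update(fd.keys())  # how many documents each word appears in

tfidf = []
for fd in tf_dists:
    total = sum(fd.values())
    tfidf.append({w: (c / total) * math.log(num_docs / doc_freqs[w])
                  for w, c in fd.items()})

print(tfidf[0])  # 'the' gets weight 0.0 since it appears in every document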
def get_models_predictions(text):
    preprocessed_text = preprocess_text(text)
    labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    with graph.as_default():
        raw_probabilities = [
            *map(
                lambda model: numpy.squeeze(model.predict(preprocessed_text),
                                            axis=0).tolist(),
                models)
        ]
        probabilities_with_labels = [
            *map(
                lambda probability: [
                    *map(
                        lambda i: {
                            'label': labels[i],
                            'probability': probability[i]
                        },
                        range(0, 6))
                ],
                raw_probabilities)
        ]
        averaged_probabilities = numpy.average(raw_probabilities, axis=0).tolist()
        return {
            'probabilities_of_models': raw_probabilities,
            'probabilities_of_models_with_labels': probabilities_with_labels,
            'models_averaged_probabilities': averaged_probabilities,
            'most_probable_category': {
                'label': labels[numpy.argmax(averaged_probabilities)],
                'probability': numpy.max(averaged_probabilities)
            }
        }
def index_markdown(markdown_filepath, ix_writer):
    file = path.basename(markdown_filepath)
    with open(markdown_filepath) as f:
        # Do any preprocessing here, but the QA model may also read from the filepath.
        content = preprocess_text(f.read())
    ix_writer.add_document(title=file, content=content, filepath=markdown_filepath)
def generate_perplexity(self, n, sentences, r=[0.4, 0.6, 1]):
    for z in xrange(n):
        x = z + 1
        self.nprob_dic[x] = self.nprob_dic[x] if x in self.nprob_dic else self.generate_ngram(x)
    tokens = preprocess.preprocess_text(sentences).split()
    # Prepare sentences for each ngram
    # token_list = [[], [], []]
    # token_list[0] = tokens.replace('<s>', '').split()
    # token_list[1] = tokens.split()
    # token_list[2] = tokens.replace('<s>', '<s1> <s2>').split()
    # tokens = tokens.split()

    # use unk_1 to replace words not in ncounter_dic[1]
    self.ncounter_dic[1] = self.ncounter_dic[1] if 1 in self.ncounter_dic else self.ntoken_count(1)
    for i, token in enumerate(tokens):
        key = tuple([token])
        if key not in self.ncounter_dic[1]:
            tokens[i] = '<unk_1>'

    # calculate perplexity
    perp = 0
    _len = len(tokens)
    # iters = [0, 0, 0]
    for i in xrange(_len):
        prob_tup = []
        for j in xrange(n):
            key = tuple(tokens[i - j:i + 1])
            if j > 0:
                unk = '<unk_{}>'.format(j)
                if key != ():
                    if key not in self.nprob_dic[j + 1]:
                        key = tuple([unk, tokens[-1]])
            if key == () or (('<s>' in key) and key[-1] != '<s>' and j > 0):
                prob_element = 0
            else:
                prob_element = self.nprob_dic[j + 1][key]
            prob_tup.append(prob_element)
        ntemp = n - 1
        while prob_tup[ntemp] == 0 or prob_tup[ntemp] == 1:
            ntemp -= 1
        prob = prob_tup[ntemp] * r[ntemp]
        perp -= log(prob)
    perp = exp(1.0 * perp / len(tokens))
    return perp
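# A compact sketch of the quantity generate_perplexity computes:
# PP = exp(-(1/N) * sum_i log p(w_i | history)), where p is chosen by a simple
# weighted back-off (use the highest-order n-gram probability available, scaled
# by the matching r weight). The probabilities below are hard-coded toy values;
# in the class above they come from self.nprob_dic.
from math import exp, log

tokens = ['<s>', 'the', 'cat', 'sat']
unigram = {'<s>': 0.25, 'the': 0.25, 'cat': 0.25, 'sat': 0.25}
bigram = {('<s>', 'the'): 0.9, ('the', 'cat'): 0.5, ('cat', 'sat'): 0.4}
r = [0.4, 0.6]  # weights for unigram and bigram terms

log_prob = 0.0
for i, w in enumerate(tokens):
    p_bi = bigram.get((tokens[i - 1], w), 0.0) if i > 0 else 0.0
    # back off to the unigram probability when no bigram is available
    p = p_bi * r[1] if p_bi > 0 else unigram[w] * r[0]
    log_prob += log(p)

print(exp(-log_prob / len(tokens)))  # perplexity of the toy sentence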
def word_prob_solver(text):
    orig_text = text
    nlp, spacy_parser, dep_parser, tree_parser, verb_cats_json = init_parsers()
    text = preprocess_text(text)
    # print(text)
    document = nlp(text)
    non_lem_sents = [str(sent) for sent in document]
    h = get_num_dep_nouns(document, dep_parser)
    h_lem = set([])
    is_h_lemmatized = False
    for h_noun in h:
        dh = nlp(h_noun)
        h_noun_lem = dh[0][0].lemma
        if h_noun_lem != h_noun:
            text = text.replace(h_noun, h_noun_lem)
            is_h_lemmatized = True
        h_lem.update([h_noun_lem])
    all_h = deepcopy(h)
    h = list(h_lem)
    if is_h_lemmatized:
        document = nlp(text)
    # print(text)
    sentences, numbers = get_numbers(document)
    NPs, et = get_noun_phrases_entities(h, sentences, tree_parser)
    et = filter_et(et, sentences, numbers)
    document2 = spacy_parser(text)
    ex = get_ex(document2, sentences, h)
    numt = get_numt(et, numbers)
    process_bare_num(numbers, sentences, numt, et, h)
    vt = get_verbs(et + [ex], non_lem_sents, dep_parser, nlp)
    vx = vt[-1]
    del vt[-1]
    at, ax = get_attributes(et, ex, dep_parser)
    fragments = get_fragments(et, numt, vt, at, ex, vx, ax, sentences, non_lem_sents)
    assert len(fragments) == len(sentences)
    ct = get_containers(fragments, all_h, dep_parser, nlp)
    print("final fragments :", "\n")
    for fragment in fragments:
        print(fragment, "\n")
    fragx = fragments[-1]
    del fragments[-1]
    verb_cats = []
    for fragment in fragments:
        verb_cats.append(verb_category(fragment[4], nlp, verb_cats_json))
    states = get_states(fragments, verb_cats, ex, ax)
    equations = build_equations(states)
    solutions = solve_equations(equations)
    answer = get_answer(solutions, states, fragments, fragx, orig_text, nlp, verb_cats_json)
    return answer
def create_occurrences_dict(string):
    """Given a string, count occurrences of preprocessed tokens."""
    # Keeps track of preprocessed token counts
    occurrences = {}
    tokens = preprocess_text(string)
    # Count occurrences of tokens in string
    for token in tokens:
        if token not in occurrences:
            occurrences[token] = 0
        occurrences[token] += 1
    return occurrences
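# For reference, the counting loop in create_occurrences_dict is equivalent to
# collections.Counter over the preprocessed tokens; a toy whitespace tokenizer
# stands in for preprocess_text here.
from collections import Counter

tokens = "the cat sat on the mat".split()  # stand-in for preprocess_text(string)
print(dict(Counter(tokens)))  # {'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1}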
def get_test_loader(batch_size):
    #df = pd.read_csv(os.path.join(settings.DATA_DIR, 'test_clean.csv'))
    df = pd.read_csv(os.path.join(settings.DATA_DIR, 'test.csv'))
    #print(df.head())
    df.comment_text = preprocess_text(df.comment_text)
    #print(df.head())
    ds_test = ToxicDataset(df, train_mode=False, labeled=False)
    loader = data.DataLoader(ds_test,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=4,
                             collate_fn=ds_test.collate_fn,
                             drop_last=False)
    loader.num = len(df)
    return loader
def collect_data(root_dir, do_lemmatize=True, from_file='', encoding='cp1251'):
    data = OrderedDict()
    if from_file != '' and do_lemmatize:
        logging.info("loading data from file")
        with open(from_file, mode='rb') as art_pkl:
            data = pickle.load(art_pkl)
    else:
        for cur_root, dirs, files in os.walk(root_dir):
            for name in files:
                with open(os.path.join(cur_root, name), encoding=encoding) as tf:
                    text = get_title(tf.name) if conf.only_title else tf.read()
                    data[tf.name] = preprocess_text(text, do_lemmatize)
        logging.info("saving collected data")
        with open('./%s/articles.%spkl' % (SAVED_DIR, 'lemmatized.' if do_lemmatize else ''),
                  mode='wb') as art_pkl:
            pickle.dump(data, art_pkl)
    return data
def predict_old(self, X_test):
    predicted = []
    print('Testing..')
    for test_case in tqdm(X_test):
        target_sums = [self.target_data[t]['prior'] for t in self.targets]
        test_words = preprocess_text(test_case)
        for t_idx, t in enumerate(self.targets):
            t_data = self.target_data[t]
            for word in test_words:
                if word in t_data['likelihood']:
                    target_sums[t_idx] += t_data['likelihood'][word]
        # Get biggest result
        predicted.append(self.targets[np.argmax(target_sums)])
    return predicted
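# A toy, self-contained version of the scoring loop in predict_old: start from
# each class's log prior, add per-word log likelihoods for the words seen, and
# take the argmax. The priors and likelihoods below are made-up numbers purely
# for illustration.
import numpy as np

targets = ['spam', 'ham']
target_data = {
    'spam': {'prior': np.log(0.4),
             'likelihood': {'win': np.log(0.3), 'money': np.log(0.2)}},
    'ham': {'prior': np.log(0.6),
            'likelihood': {'meeting': np.log(0.3), 'money': np.log(0.01)}},
}

test_words = ['win', 'money']  # stand-in for preprocess_text(test_case)
scores = [target_data[t]['prior'] for t in targets]
for t_idx, t in enumerate(targets):
    for word in test_words:
        if word in target_data[t]['likelihood']:
            scores[t_idx] += target_data[t]['likelihood'][word]

print(targets[int(np.argmax(scores))])  # -> 'spam'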
def get_train_val_loaders(batch_size=64, val_batch_size=256, val_percent=0.95, val_num=10000):
    #df = shuffle(pd.read_csv(os.path.join(settings.DATA_DIR, 'train_clean.csv')), random_state=1234)
    df = shuffle(pd.read_csv(os.path.join(settings.DATA_DIR, 'train.csv')), random_state=1234)
    #print(df.head())
    df.comment_text = preprocess_text(df.comment_text)
    add_loss_weight(df)
    print(df.shape)
    split_index = int(len(df) * val_percent)
    df_train = df[:split_index]
    df_val = df[split_index:]
    if val_num is not None:
        df_val = df_val[:val_num]
    print(df_train.head())
    print(df_val.head())

    ds_train = ToxicDataset(df_train)
    train_loader = data.DataLoader(ds_train,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=ds_train.collate_fn,
                                   drop_last=True)
    train_loader.num = len(df_train)

    ds_val = ToxicDataset(df_val)
    val_loader = data.DataLoader(ds_val,
                                 batch_size=val_batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=ds_val.collate_fn,
                                 drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val
    return train_loader, val_loader
def tokenize_cases(document):
    tokenized_document = []
    for line in document:
        output = line.strip(' ')
        things = re.findall('\([\w]+\)', output)
        for thing in things:
            # makes the job of the section tokenizer easier
            while thing in output:
                output = output.replace(thing, '')
        while '  ' in output:
            output = output.replace('  ', ' ')
        # somehow the tokenizer sometimes fails for these
        output = output.lower().replace('section 7703', 'sec_7703')
        output = output.lower().replace('section 68', 'sec_68')
        output = preprocess_text(output.lower())
        tokenized_document.append(output.strip('\n'))
    return tokenized_document
def detect_fake(text, device, model, tokenizer):
    text_parts = preprocess_text(text, device, tokenizer)
    overall_output = torch.zeros((1, 2)).to(device)
    try:
        for part in text_parts:
            if len(part) > 0:
                overall_output += model(part.reshape(1, -1))[0]
    except RuntimeError:
        print("GPU out of memory, skipping this entry.")
    overall_output = F.softmax(overall_output[0], dim=-1)
    value, result = overall_output.max(0)
    term = False
    if result.item() == 0:
        term = True
    print("Is real - {} at {}%".format(term, value.item() * 100))
    return term, value.item() * 100
def test_preprocess_flow(self):
    words_dict, text = genertate_text()
    result_words_dict = preprocess.preprocess_text(text)
    for k, v in words_dict.items():
        self.assertEqual(result_words_dict[k], v)
    cr = Crypto()
    encrypted = preprocess.words_dict_encrypt_and_hashed(result_words_dict, cr)
    decrypted = preprocess.words_dict_decrypt(encrypted, cr)
    for item in decrypted:
        word = item['word']
        self.assertIn(word, words_dict.keys())
        self.assertEqual(item['count'], words_dict[word])
    with open(TEST_HTML) as f:
        text = f.read()
    # Next we expect to find more than 5 unique words in the text
    self.assertGreater(len(preprocess.main_preprocess(text)), 5)
def predict(config, text, code, model=None, embedding_input=None):
    if model is None:
        model = load_model(config, code)
    preprocessed = preprocess_text(text)
    if embedding_input is None:
        embedding = []
        word_model = load_word2vec(config.embeddings_model)
        for word in preprocessed.split(' '):
            if word in word_model.wv.index2word:
                vec = word_model.wv[word]
                embedding.append(vec)
        embedding_input = Variable(torch.Tensor(np_sentence_to_list(embedding)))
    pred = model(embedding_input)
    pred_label = pred.data.max(1)[1].numpy()[0]
    pred_char = get_char_for_binary(code, pred_label)
    return pred_char
def construct_random_network(doc, p=0.2):
    """Construct random network for use as baseline.

    Create a random network based on *doc*, with words used for nodes.
    Edges are created between any given pair of nodes (a, b) with
    probability *p*. All edges will have weight = 1.0.
    """
    doc = preprocess.preprocess_text(doc)
    words = list(set(doc))  # list of unique words

    # create graph
    graph = nx.DiGraph()
    graph.add_nodes_from(words)

    # add edges
    for word_a in graph.nodes():
        for word_b in graph.nodes():
            if word_a != word_b and rand() < p:
                _update_edge_weight(graph, word_a, word_b)
    return graph
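# A standalone sketch of the same baseline idea as construct_random_network:
# nodes are unique words, and each ordered pair (a, b) gets an edge with
# probability p and weight 1.0. It uses networkx and random.random directly, so
# it runs without the preprocess module or the _update_edge_weight helper; the
# sample sentence is illustrative only.
import random
import networkx as nx

words = list(set("the cat sat on the mat".split()))
p = 0.2

graph = nx.DiGraph()
graph.add_nodes_from(words)
for a in words:
    for b in words:
        if a != b and random.random() < p:
            graph.add_edge(a, b, weight=1.0)

print(graph.number_of_nodes(), graph.number_of_edges())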
def predict(self, X_test):
    print('Testing {}...'.format(self.name))
    predicted = []
    lap_predicted = []
    smooth_probs = [
        math.log(1 / (t_data['doc_count'] + len(self.X_train)))
        for t, t_data in self.target_data.items()
    ]
    # TODO: multiprocessing
    for test_case in tqdm(X_test, unit='test'):
        target_sums = [self.target_data[t]['prior'] for t in self.targets]
        lap_target_sums = [self.target_data[t]['prior'] for t in self.targets]
        test_words = preprocess_text(test_case)
        for t_idx, t in enumerate(self.target_data):
            t_data = self.target_data[t]
            for word in test_words:
                if word in t_data['likelihood']:
                    target_sums[t_idx] += t_data['likelihood'][word]
                    lap_target_sums[t_idx] += t_data['lap_likelihood'][word]
                else:
                    for t2_idx, t2 in enumerate(self.targets):
                        t2_data = self.target_data[t2]
                        if t2 != t and word in t2_data['likelihood']:
                            target_sums[t_idx] += smooth_probs[t_idx]
                            lap_target_sums[t_idx] += smooth_probs[t_idx]
                            break
        # Get biggest result
        predicted.append(self.targets[np.argmax(target_sums)])
        lap_predicted.append(self.targets[np.argmax(lap_target_sums)])
    return predicted, lap_predicted
def tokenize_statutes(document):
    current_section = ''
    tokenized_document = []
    for line in document:
        # 1. normalize names of sections
        output = line
        if line.startswith('Section '):
            # remove leading 'Section XYZ'
            section_name = output.split('.')[0]
            rest = '.'.join(output.split('.')[1:])
            new_section_name = 'sec_' + section_name.split(' ')[1]
            current_section = new_section_name
            output = rest
        things = re.findall('\([\w]+\)', output)
        for thing in things:
            while thing in output:
                output = output.replace(thing, '')
        while '  ' in output:
            output = output.replace('  ', ' ')
        output = current_section + ' ' + output
        # 2. tokenize
        output = preprocess_text(output.strip(' '))
        tokenized_document.append(output.strip('\n'))
    return tokenized_document
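# Quick illustration of the cleanup pattern shared by tokenize_cases and
# tokenize_statutes above: strip parenthesized subsection markers like "(a)" or
# "(1)" with a regex, then collapse the double spaces left behind. The sample
# line is made up.
import re

line = "Section 68. (a) In general (1) the term applies to..."
for thing in re.findall(r'\([\w]+\)', line):
    line = line.replace(thing, '')
while '  ' in line:
    line = line.replace('  ', ' ')
print(line)  # markers removed, whitespace collapsed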
def run(self):
    self.timerThread.start(time.time())
    lines = preprocess_text(self.text)
    output = []
    for count, line in enumerate(lines):
        _mutex1.lock()
        if _running1 == False:
            _mutex1.unlock()
            self.interruptSignal.emit()
            return
        else:
            _mutex1.unlock()
        self.iterSignal.emit((count, len(lines)))
        sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
        device = torch.device('cuda' if self.use_cuda else 'cpu')
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
        # Decode text input
        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
        with torch.no_grad():
            audio = self.waveglow.infer(
                mel_outputs_postnet,
                sigma=0.666,
                progress_callback=self.progress,
                elapsed_callback=self.elapsed,
                get_interruptflag=self.get_interruptflag)
        if type(audio) != torch.Tensor:
            # Catches when waveglow is interrupted and returns None
            self.interruptSignal.emit()
            return
        self.iterSignal.emit((count + 1, len(lines)))
        wav = audio[0].data.cpu().numpy()
        output.append(wav)
    outwav = np.concatenate(output)
    self.audioSignal.emit(outwav)
def tokenize_sentences(sentences):
    """Tokenizes sentences using Preprocessor"""
    for sen in sentences:
        sen.tokens = preprocess.preprocess_text(sen.original)
import pickle

from preprocess import preprocess_text

text = open('./data/alice/alice-in-wonderland.txt', 'r').read()
preprocess_text(text, './data/alice/alice-processed.pickle')
from preprocess import preprocess_text

text = open('./data/shakespeare/sonnets.txt', 'r').read()
preprocess_text(text, './data/shakespeare/processed.pickle')
def main(csvfile):
    # Read data
    reviews_df = pd.read_csv(csvfile)
    # Remove NA's
    reviews_df = reviews_df.dropna()
    reviews_df = reviews_df.reset_index(drop=True)

    # Create labels
    # Divide Reviewer_Score into four classes: 3 with score > 7.5, 2 with score > 5,
    # 1 with score > 2.5, and 0 with score <= 2.5
    reviews_df["Label"] = reviews_df["Reviewer_Score"].apply(
        lambda x: 3 if x > 7.5 else (2 if x > 5 else (1 if x > 2.5 else 0)))
    reviews_df = reviews_df[[
        "Additional_Number_of_Scoring", "Average_Score",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given", "Negative_Review",
        "Positive_Review", "Label"
    ]]
    # The whole dataset is too large, so only take 30% of it here
    reviews_df = reviews_df.sample(frac=0.3, replace=False, random_state=42)

    # PART 1: Prediction without NLP features
    print("Without NLP features:")
    # Feature selection
    features = [
        "Additional_Number_of_Scoring", "Average_Score",
        "Review_Total_Negative_Word_Counts",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given"
    ]
    X_train, X_test, y_train, y_test = train_test_split(reviews_df[features],
                                                         reviews_df["Label"],
                                                         test_size=0.30,
                                                         random_state=20)
    # Logistic Regression
    logistic_regression(X_train, y_train, X_test, y_test)
    # Random Forest
    random_forest(X_train, y_train, X_test, y_test)
    # XGBoost
    xgboost(X_train, y_train, X_test, y_test)

    # PART 2: Prediction with added NLP features
    print("With NLP features:")
    # Append the positive and negative text reviews
    reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
    # Remove 'No Negative' or 'No Positive' from text
    reviews_df["review"] = reviews_df["review"].apply(
        lambda x: x.replace("No Negative", "").replace("No Positive", ""))
    # Clean text data
    print("Start preprocessing textual columns...")
    reviews_df["review_clean"] = reviews_df["review"].apply(
        lambda x: preprocess_text(x))
    # Train a Doc2Vec model with text data
    print("Adding Doc2Vec...")
    reviews_df = doc2vec(reviews_df)
    # Add tf-idf columns
    print("Adding TF-IDF...")
    reviews_df = tf_idf(reviews_df)

    # Feature selection
    label = "Label"
    ignore_cols = [
        label, "review", "review_clean", "Negative_Review", "Positive_Review"
    ]
    features_2 = [c for c in reviews_df.columns if c not in ignore_cols]
    X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
        reviews_df[features_2],
        reviews_df["Label"],
        test_size=0.30,
        random_state=20)
    # Logistic Regression
    logistic_regression(X_train_2, y_train_2, X_test_2, y_test_2)
    # Random Forest
    random_forest(X_train_2, y_train_2, X_test_2, y_test_2)
    # XGBoost with NLP features
    xgboost(X_train_2, y_train_2, X_test_2, y_test_2)
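# The tf_idf(reviews_df) call above is defined elsewhere in that project; as a
# rough sketch only (not the project's actual code), a minimal version could
# append one TF-IDF column per vocabulary term using scikit-learn's
# TfidfVectorizer. The function name, column prefix, and parameters here are
# assumptions; get_feature_names_out requires scikit-learn >= 1.0.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_sketch(df, text_col="review_clean", max_features=100):
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf = vectorizer.fit_transform(df[text_col])
    tfidf_df = pd.DataFrame(
        tfidf.toarray(),
        columns=["tfidf_" + w for w in vectorizer.get_feature_names_out()],
        index=df.index)
    return pd.concat([df, tfidf_df], axis=1)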
def map_function(dynamicRecord):
    tweet = dynamicRecord["tweet"]
    features = preprocess.preprocess_text(tweet, 140)
    dynamicRecord["features"] = features
    return dynamicRecord
def _text_to_preprocessed_text(text):
    """Convert text to preprocessed text"""
    prep = preprocess.preprocess_text(text)
    return ' '.join(prep)
def execute_this_fn(self, TOKEN, min_donation, channel, se_opts, use_cuda,
                    model, waveglow, offset, prev_time, startup_time,
                    progress_callback, elapsed_callback, text_ready,
                    fn_callback):
    # TODO: refactor this messy block
    fn_callback.emit(('GUI: start of polling loop', None))
    text_ready.emit("Sta2:Connecting to StreamElements")
    url = "https://api.streamelements.com/kappa/v2/tips/" + self.channel_id
    headers = {
        'accept': 'application/json',
        "Authorization": "Bearer " + TOKEN
    }
    text_ready.emit('Log2:Initializing')
    text_ready.emit('Log2:Minimum amount for TTS: ' + str(min_donation))
    while True:
        _mutex2.lock()
        if _running2 == False:
            _mutex2.unlock()
            break
        else:
            _mutex2.unlock()
        if not channel.get_busy():
            #print('Polling', datetime.datetime.utcnow().isoformat())
            text_ready.emit("Sta2:Waiting for incoming donations . . .")
            current_time = datetime.datetime.utcnow().isoformat()
            # TODO: possible bug: missed donations once time passes midnight
            querystring = {
                "offset": offset,
                "limit": "1",
                "sort": "createdAt",
                "after": startup_time,
                "before": current_time
            }
            response = requests.request("GET", url, headers=headers, params=querystring)
            data = json.loads(response.text)
            for dono in data['docs']:
                text_ready.emit("Sta2:Processing donations")
                dono_time = dono['createdAt']
                offset += 1
                if dono_time > prev_time:  # Str comparison
                    amount = dono['donation']['amount']  # Int
                    if float(amount) >= min_donation and dono['approved'] == 'allowed':
                        name = dono['donation']['user']['username']
                        msg = dono['donation']['message']
                        if msg.isspace():
                            break  # Check for empty line
                        ## TODO Allow multiple speakers in msg
                        currency = dono['donation']['currency']
                        dono_id = dono['_id']
                        text_ready.emit("Log2:\n###########################")
                        text_ready.emit("Log2:" + name + ' donated ' + currency + str(amount))
                        text_ready.emit("Log2:" + msg)
                        lines = preprocess_text(msg)
                        if se_opts['read dono amount'] == 1:
                            # reads dono name and amount
                            msg = '{} donated {} {}.'.format(
                                name, str(amount), cleaners.expand_currency(currency))
                            lines.insert(0, msg)  # Add to head of list
                        output = []
                        for count, line in enumerate(lines):
                            fn_callback.emit(('GUI: progress bar 2 text', (count, len(lines))))
                            sequence = np.array(
                                text_to_sequence(line, ['english_cleaners']))[None, :]
                            # Inference
                            device = torch.device('cuda' if use_cuda else 'cpu')
                            sequence = torch.autograd.Variable(
                                torch.from_numpy(sequence)).to(device).long()
                            # Decode text input
                            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
                            with torch.no_grad():
                                audio = waveglow.infer(
                                    mel_outputs_postnet,
                                    sigma=0.666,
                                    progress_callback=progress_callback,
                                    elapsed_callback=None,
                                    get_interruptflag=self.get_interruptflag2)
                            if type(audio) != torch.Tensor:
                                # Catches when waveglow is interrupted and returns None
                                break
                            fn_callback.emit(('GUI: progress bar 2 text', (count + 1, len(lines))))
                            wav = audio[0].data.cpu().numpy()
                            output.append(wav)
                        _mutex3.lock()
                        if _running3 == True:
                            _mutex3.unlock()
                            outwav = np.concatenate(output)
                            # Playback
                            fn_callback.emit(('Wav: playback', outwav))
                        else:
                            _mutex3.unlock()
                    prev_time = dono_time  # Increment time
        time.sleep(0.5)
    fn_callback.emit(('GUI: end of polling loop', None))
    text_ready.emit('Log2:\nDisconnected')
    text_ready.emit('Sta2:Ready')
    fn_callback.emit(('Var: offset', offset))
    fn_callback.emit(('Var: prev_time', prev_time))
    return  #'Return value of execute_this_fn'
from preprocess import preprocess_text

book_paths = [
    './data/fitzgerald/beautiful-and-damned.txt',
    './data/fitzgerald/flappers-and-philosophers.txt',
    './data/fitzgerald/tales-of-the-jazz-age.txt',
    './data/fitzgerald/this-side-of-paradise.txt',
]

# Combine books into 1 big book
combined_text = ''
combined_len = 0
for path in book_paths:
    txt = open(path, 'r').read()
    combined_len = combined_len + len(txt)
    combined_text = combined_text + ' ' + txt

preprocess_text(combined_text, './data/fitzgerald/processed-all-books.pickle')
        ents = nlp(text).ents
        glose_ents = []
        for ent in ents:
            if ent.label_ == "PERSON":
                cat = "PER"
            elif ent.label_ in ["ORG", "LOC"]:
                cat = ent.label_
            else:
                cat = "MISC"
            glose_ent = GloseEntity(ent.text, ent.start_char, ent.end_char, cat)
            glose_ents.append(glose_ent)
    elif args.model != None:
        from keras.models import load_model
        lword_id_sents, casing_id_sents, pos_sents = preprocess_text(text)
        model_path = get_model_path(args.model)
        model = load_model(model_path)
        pred_label_id_sents = model.predict(
            [lword_id_sents, casing_id_sents]).argmax(axis=2)
        # Create entities from model predictions
        glose_ents = []
        for pred_label_id_sent, pos_sent in zip(pred_label_id_sents, pos_sents):
            # Remove padding
            pred_label_id_sent = pred_label_id_sent[-len(pos_sent):]
            ent = None