def predict_class(text):
    logging.info(f"Input text: {text}")
    logging.info("cleaning input text")
    text = clean_text(text)
    sentence = [text]

    tokenizer = load_tokenizer()
    logging.info("trained tokenizer loaded")
    word_index = tokenizer.word_index
    vocab_size = len(word_index)

    text_sequences = tokenizer.texts_to_sequences(sentence)
    text_padded = pad_sequences(text_sequences,
                                padding=PADDING_TYPE,
                                truncating=TRUNC_TYPE,
                                maxlen=MAX_LENGTH)

    logging.info("creating embedding matrix using Glove Embeddings")
    embedding_matrix = embedding_matrix_glove(word_index)
    logging.info(f"Embeddings Weights created {embedding_matrix.shape}")

    logging.info("getting pre-trained model")
    model = create_model(vocab_size, EMBEDDING_DIM, MAX_LENGTH, embedding_matrix)
    model.summary()  # summary() already prints its report

    logging.info("loading model weights")
    model.load_weights(MODEL)

    predict = model.predict(text_padded)
    predict = np.argmax(predict)
    predicted_main_product = get_main_product(predict)
    logging.info(f'Predicted Main Product: {predicted_main_product}')
    return predicted_main_product
def clean(emoticons, src="data/tweets/original", dest="data/tweets/clean/"):
    """
    Clean Tweet:
        - remove @username mentions
        - remove http
        - expand basic acronyms like we'll -> we will
        - remove emoticons
    """
    print("Cleaning tweets...")
    src = os.path.join(src, '*.js')
    files = sorted(glob.glob(src))
    for filename in files:
        print("Loading from {}".format(filename))
        fn = os.path.splitext(os.path.basename(filename))[0]
        with open(filename) as f:
            f.readline()  # skip the first line so the remainder parses as JSON
            data = []
            for d in json.load(f):
                text = d['text']
                text = add_sentence_boundary(text)
                text = clean_text(text)
                text = remove_emoticons(text, emoticons)
                data.append({'text': text, 'created_at': d['created_at']})
        dumpname = os.path.join(dest, fn) + '.json'
        print("Dumping to {}".format(dumpname))
        with open(dumpname, 'w') as f:
            json.dump(data, f)
def parse_doc(self, input):
    '''
    Assume input is a sequence of sentences.
    split multiple sentences and apply nlp parse
    '''
    doc = self.nlp(clean_text(input))
    return [self.parse_sentence(sent.text) for sent in doc.sents]
def main():
    global hidden_size, data, English, vectors, words, word2idx, glove
    hidden_size = 100

    vectors = bcolz.open('Embedding/6B.300d.dat')[:]
    words = pickle.load(open('Embedding/6B.300_words.pkl', 'rb'))
    word2idx = pickle.load(open('Embedding/6B.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}
    print("Imported Embedding Data")

    fullData = pd.read_csv("dataset.csv")
    print("Imported Full Data")

    data = []
    English = Lang('English')
    print("Counting words...")
    for article, toxicity in zip(fullData['article'], fullData['toxicity']):
        English.addSentence(article)
        data.append([article, toxicity])
    print("Counted Words:")
    print(English.name, English.n_words)

    train, test = train_test_split(data, random_state=42, test_size=0.4, shuffle=True)
    print("Data Split")

    target_vocab = English.word2index.keys()
    weights_matrix = get_weights_matrix(target_vocab)

    encoder1 = EncoderRNN(weights_matrix, English.n_words, hidden_size).to(device)
    fnn1 = FullyConnectedNN(hidden_size).to(device)
    encoder1.load_state_dict(torch.load('cpu_encoder.pt'))
    fnn1.load_state_dict(torch.load('cpu_fcn.pt'))

    sentence = sys.argv[1]
    sentence = clean_text(sentence)
    output = evaluate(encoder1, fnn1, sentence, English)
    output = torch.sigmoid(output)  # F.sigmoid is deprecated in recent PyTorch
    output = 1 if output >= 0.7 else 0
    print(output)
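# A sketch (assumption, not part of this file) of how the bcolz/pickle artifacts
# loaded above are commonly produced from a raw glove.6B.300d.txt download; the
# input path is assumed.
import pickle
import bcolz
import numpy as np

words, word2idx, idx = [], {}, 0
vectors = bcolz.carray(np.zeros(1), rootdir='Embedding/6B.300d.dat', mode='w')

with open('Embedding/glove.6B.300d.txt', 'rb') as f:
    for line in f:
        parts = line.decode().split()
        word = parts[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vectors.append(np.array(parts[1:]).astype(np.float64))

# drop the initial zero placeholder and persist everything to disk
vectors = bcolz.carray(vectors[1:].reshape((idx, 300)),
                       rootdir='Embedding/6B.300d.dat', mode='w')
vectors.flush()
pickle.dump(words, open('Embedding/6B.300_words.pkl', 'wb'))
pickle.dump(word2idx, open('Embedding/6B.300_idx.pkl', 'wb'))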
def pre_news_raw_text(news_id):
    title_text = df_analysis.loc[df_analysis['id'] == news_id, 'title'].values[0]
    src_text = df_analysis.loc[df_analysis['id'] == news_id, 'source'].values[0]
    src_text = src_text + ".json"
    text_dir = st.d_news_text
    text_path = text_dir + st.d_delimiter + src_text

    # Escape regex metacharacters in the title, then allow any run of non-word
    # characters or unicode escapes where the title has spaces.
    special = r"(\W|\\u[\w][\w][\w][\w])*"
    title_text = title_text.replace('[', r'\[')
    title_text = title_text.replace(']', r'\]')
    title_text = title_text.replace('?', r'\?')
    title_text = title_text.replace('(', r'\(')
    title_text = title_text.replace(')', r'\)')
    title_text = title_text.replace('.', r'\.')
    title_text = title_text.replace(',', r'\,')
    title_text = title_text.replace('"', r'\"')
    title_text = title_text.replace(' ', special)
    re_title = r"title\":.*" + title_text + r".*(date|content)\""

    l_text = []
    with open(text_path) as f:
        l_text = [l for l in f if re.search(re_title, l)]

    for index, text in enumerate(l_text):
        tmp_text = text
        tmp_text = pre.clean_convertion(tmp_text)
        tmp_text = pre.clean_links_text(tmp_text)
        tmp_text = pre.clean_text(tmp_text)
        tmp_text = pre.clean_escape_char(tmp_text)

        # Clean up literal "\n" sequences. Note that re.sub's fourth positional
        # argument is `count`, not `flags`, so the flags belong in re.compile;
        # a single re.sub call already replaces every occurrence.
        escape_char = re.compile(r'\\n', re.MULTILINE | re.IGNORECASE)
        tmp_text = re.sub(escape_char, ' ', tmp_text).strip()

        l_text[index] = tmp_text

    return list(OrderedDict.fromkeys(l_text))
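# A possible simplification (not from the original source): re.escape from the
# standard library escapes every regex metacharacter in one call, so the chain
# of replace() calls above could be collapsed into a small helper. Sketch only;
# build_title_pattern is a hypothetical name and `special` is the same
# gap-matching pattern defined above.
import re

def build_title_pattern(title_text, special=r"(\W|\\u[\w][\w][\w][\w])*"):
    escaped = re.escape(title_text)            # escapes [, ], ?, (, ), ., " ...
    escaped = escaped.replace(r'\ ', special)  # Python < 3.7 escapes spaces
    escaped = escaped.replace(' ', special)    # Python >= 3.7 leaves them as-is
    return r"title\":.*" + escaped + r".*(date|content)\""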
def words_pos(self, input):
    """
    add pos or entity info to each word in the input
    """
    doc = self.nlp(clean_text(input))
    self.join_entity(doc)
    strings = []
    for sent in doc.sents:
        if sent.text.strip():
            strings.append(' '.join(
                self.represent_word(w) for w in sent if not w.is_space))
    if strings:
        return '\n'.join(strings) + '\n'
    else:
        return ''
def fetch(page):
    result = newsapi.get_everything(sources=sources, language='en', page=page)
    articles = result['articles']
    for article in articles:
        if (not article['title']) or (not article['description']):
            continue
        values = [
            article['source']['name'], article['author'],
            dumps(article['title']),
            dumps(article['description']), article['url'],
            article['urlToImage'], article['publishedAt']
        ]
        text = clean_text(crawl(values[4], article['source']['id']))
        category = model.predict(vect.transform([text]))[0]
        insert(values, category)
        print(category)
        print('=' * len(category))
        print(text)
        print('')
def parse_sentence(self, input):
    '''
    parse each sentence and return the spacy's nlp properties
    Pipeline:
    sentence |> nlp.tokenizer |> nlp.tagger |> nlp.parser |> nlp.entity
    '''
    line = self.nlp(clean_text(input))
    keyphrases = self.find_keyphrases(line)
    output = {
        'text': line.text,
        'entities': [(entity.text, entity.label_) for entity in line.ents],
        # Given we will be dealing with small text, I am assuming we do not
        # need to compute word frequency to find keywords
        'keyphrases': list(keyphrases),
        'words': [(w.text, w.tag_, w.pos_, w.ent_type_, w.dep_) for w in line],
        'persons': [(entity.text, extract_entity(entity.text)["data"])
                    for entity in line.ents if entity.label_ == 'PERSON'],
        'products': [
            entity.text for entity in line.ents if entity.label_ == 'PRODUCT'
        ],
        'organizations': [(entity.text, extract_entity(entity.text)["data"])
                          for entity in line.ents if entity.label_ == 'ORG'],
        'medias': [
            entity.text for entity in line.ents
            if entity.label_ == 'WORK_OF_ART'
        ],
        'locations': [entity.text for entity in line.ents if entity.label_ == 'GPE'],
        'info': extract_info(line.text),
        'nouns': [w.text for w in line if w.tag_ == 'NN'],
        'action_verbs': [w.text for w in line if w.tag_ == 'VB'],
        'subject': [w.text for w in line if w.dep_ == 'nsubj'],
        # spaCy's English models label direct objects 'dobj', not 'nobj'
        'object': [w.text for w in line if w.dep_ == 'dobj']
    }
    return output
def evaluate(sentence, samp_type=1):
    sentence = clean_text(sentence)

    # split the sentence and replace unknown words by the <unk> token.
    inputs = []
    for i in sentence.split(' '):
        try:
            inputs.append(inp_lang.word_index[i])
        except KeyError:
            inputs.append(inp_lang.word_index['<unk>'])

    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_sentence_length, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    enc_output, enc_hidden = encoder(inputs)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_sentence_length):
        predictions, dec_hidden = decoder([enc_output, dec_hidden, dec_input])

        if samp_type == 1:
            # simple greedy sampling
            predicted_id = tf.argmax(predictions[0]).numpy()
        elif samp_type == 2:
            # sample from the full output distribution
            predicted_id = np.random.choice(vocab_tar_size, p=predictions[0].numpy())
        elif samp_type == 3:
            # sample from the top-3 most likely tokens
            _, indices = tf.math.top_k(predictions[0], k=3)
            predicted_id = np.random.choice(indices.numpy())

        if predicted_id != 0:
            if targ_lang.index_word[predicted_id] == '<end>':
                return result, sentence
            else:
                result += targ_lang.index_word[predicted_id] + ' '

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence
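# Hypothetical usage of the function above (input text is made up): compare the
# three decoding strategies on one prompt; evaluate returns
# (generated_text, cleaned_input).
greedy_reply, _ = evaluate("how are you today", samp_type=1)   # greedy argmax
sampled_reply, _ = evaluate("how are you today", samp_type=2)  # full-distribution sampling
topk_reply, _ = evaluate("how are you today", samp_type=3)     # top-3 sampling
print(greedy_reply, sampled_reply, topk_reply, sep='\n')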
def find_keyphrases(self, line):
    if not hasattr(line, 'text'):
        # accept raw strings as well as already-parsed Docs
        line = self.nlp(clean_text(line))
    keyphrases = set([word.text for word in line.ents])
    # add nouns that have word_vectors
    nouns = [w.text for w in line if w.tag_ in ['NN', 'NNP']]
    # add noun chunks that have word vectors
    candidates = [word.text for word in line.noun_chunks]
    # add noun_chunks if noun in noun chunks else add noun
    if nouns and candidates:
        for noun in nouns:
            add_noun = True
            for candidate in candidates:
                if noun in candidate:
                    add_noun = False
                    keyphrases.add(candidate)
            if add_noun:
                keyphrases.add(noun)
    return keyphrases
index = 1
for category in categories:
    path = './dataset/%s' % (category, )
    files = listdir(path)
    if (len(files) < 81):
        continue
    count = 0
    for file in files:
        with open(path + '/' + file, 'r') as fp:
            data = json.loads(fp.read())
        for article in data['posts']:
            if (count >= limit):
                break
            text = ' '.join([
                Word(word).lemmatize()
                for word in clean_text(article['text']).split()
            ])
            if (not len(text)):
                continue
            x.append(text)
            y.append(category)
            count += 1
        else:
            # article loop finished without hitting the limit: try the next file
            continue
        # limit reached for this category: stop reading its files
        break
    print(colored([index, category, count], 'green'))
    index += 1

dataset = [x, y]
with open('dataset.json', 'w') as fp:
    json.dump(dataset, fp)
from preprocess import apply_lemmatization, apply_stemming, extract_n_grams
from vectorizer import create_TF_IDF_matrix
from cluster import k_mean

test_file_name = '../data/Test Corpus/Test Corpus.txt'
stopwords_file = '../data/Stopwords/Basic Stopwords List.txt'

# 1. a
lines = get_file_lines(test_file_name)
print("***********Test-1***********")
print(lines)
print("****************************")

# 1. b
for i in range(len(lines)):
    lines[i] = clean_text(lines[i])
print("***********Test-2***********")
print(lines)
print("****************************")

# 1. c
for i in range(len(lines)):
    lines[i] = remove_stopwords(stopwords_file, lines[i], do_clean=True)
print("***********Test-3***********")
print(lines)
print("****************************")

# 1. d
for i in range(len(lines)):
    lines[i] = apply_stemming(lines[i])
    lines[i] = apply_lemmatization(lines[i])
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # either of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True
    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
        and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # either of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (
        SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError(
            "You should choose one of bert, joint, granu and mgn in options")

    dct = {
        'NUM_TASK': NUM_TASK,
        'MASKING': MASKING,
        'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION,
        'HIER': HIER
    }
    model = load_model(model_type, **dct)

    if not id_:
        ids = get_existent_ids()
        id_ = random_module.randint(0, N)
        while id_ in ids:
            id_ = random_module.randint(0, N)

    with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'), 'w',
              encoding='utf-8') as f:
        f.write(full_text)

    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)

    tmp_file = 'tmp.txt'
    eval(model, my_predict_iter, tmp_file, criterion, binary_criterion,
         NUM_TASK=NUM_TASK)

    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    t_texts = clean_text(texts, ids)
    flat_texts = [sentence for article in t_texts for sentence in article]
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    prop_sents = prop_sents[id_]
    prop_sents = ['1' if elem else '' for elem in prop_sents]
    results = remove_duplicates(fi)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))

    lst = [set() for _ in range(len(full_text))]
    source_lst = [set() for _ in range(len(full_text))]
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])

    extracts_s_e = []
    extracts = []
    categories = []
    for elem in fi:
        if elem[0] != str(id_):
            continue
        _, category, start, end = elem
        extracts_s_e.append((start, end))
        extracts.append(text[start:end])
        categories.append(category)
    extracts = [
        ' '.join(normalize(extract.strip())) for extract in extracts if extract
    ]
    print(f'extracts: {extracts}')
    # CHECK
    # extracts = [word for sent in extracts for word in sent.split()]

    test_x, test_maxlen = get_data(extracts, vocab_size=args.vocab_size,
                                   maxlen=args.maxlen)
    test_x = sequence.pad_sequences(test_x, maxlen=max(train_maxlen, test_maxlen))
    test_length = test_x.shape[0]
    splits = []
    for i in range(1, test_length // args.batch_size):
        splits.append(args.batch_size * i)
    if test_length % args.batch_size:
        splits += [(test_length // args.batch_size) * args.batch_size]
    test_x = np.split(test_x, splits)

    with graph.as_default():
        aspect_model = keras_load_model(
            os.path.join('flask_app', 'output', 'reviews', 'model_param'),
            custom_objects={
                "Attention": Attention,
                "Average": Average,
                "WeightedSum": WeightedSum,
                "MaxMargin": MaxMargin,
                "WeightedAspectEmb": WeightedAspectEmb,
                "max_margin_loss": U.max_margin_loss
            },
            compile=True)
        test_fn = K.function(
            [aspect_model.get_layer('sentence_input').input,
             K.learning_phase()],
            [aspect_model.get_layer('att_weights').output,
             aspect_model.get_layer('p_t').output])
        aspect_probs = []
        for batch in tqdm(test_x):
            _, cur_aspect_probs = test_fn([batch, 0])
            aspect_probs.append(cur_aspect_probs)
        aspect_probs = np.concatenate(aspect_probs)
        label_ids = np.argsort(aspect_probs, axis=1)[:, -5:]
        for i, labels in enumerate(label_ids):
            print(f'{extracts[i]}: {[aspects[label] for label in labels][::-1]}')

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    commands = {
        extract: ([aspects[label] for label in label_ids[i]][::-1], [])
        for i, extract in enumerate(extracts)
    }
    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)

    for f in glob.glob(f'{DIRECTORY_PREDICT}/*'):
        os.remove(f)

    return jsonify(result={
        'id': id_,
        'list': correct_lst,
        'text': text,
        'prop_sents': prop_sents,
        'commands': commands
    })
if classifier is None:
    print("Load training first")
else:
    path = command.split(" ")
    if (len(path) < 2):
        print("Please enter filepath")
    else:
        path = path[1]
        path = Path('.').joinpath(path)
        text = ""
        try:
            text = path.open('r', encoding='utf-8').read()
        except OSError as e:
            print("File doesn't exist/Invalid path")
        print(text)
        text = pp.clean_text(text)
        pos, neg = classifier.test(text)
        print("CLASS: ", end='')
        if pos == classifier.pos_prior or neg == classifier.neg_prior:
            print("SOMETHING WENT WRONG")
        elif pos >= neg:
            print("POSITIVE")
        else:
            print("NEGATIVE")

# Display stats clause
elif command.startswith('d'):
    scores = pp.load_stats()
    pp.print_stats(scores)

# Refresh menu clause
elif command.startswith('m'):
def launch_model():
    full_text = request.form['full_text']
    id_ = request.form['id']
    model_type = request.form['model_type']

    global BERT, JOINT, GRANU, MGN, NUM_TASK, MASKING, HIER
    BERT = model_type == BERT_PATH
    JOINT = model_type == JOINT_BERT_PATH
    GRANU = model_type == GRANU_BERT_PATH
    MGN = model_type == MGN_SIGM_BERT_PATH

    # either of the four variants:
    # BERT = False
    # JOINT = False
    # GRANU = False
    # MGN = True
    assert BERT or JOINT or GRANU or MGN
    assert not (BERT and JOINT) and not (BERT and GRANU) and not (BERT and MGN) \
        and not (JOINT and GRANU) and not (JOINT and MGN) and not (GRANU and MGN)

    # either of the two variants
    SIGMOID_ACTIVATION = True
    RELU_ACTIVATION = False
    assert not (SIGMOID_ACTIVATION and RELU_ACTIVATION) and (
        SIGMOID_ACTIVATION or RELU_ACTIVATION)

    if BERT:
        NUM_TASK = 1
        MASKING = 0
        HIER = 0
    elif JOINT:
        NUM_TASK = 2
        MASKING = 0
        HIER = 0
    elif GRANU:
        NUM_TASK = 2
        MASKING = 0
        HIER = 1
    elif MGN:
        NUM_TASK = 2
        MASKING = 1
        HIER = 0
    else:
        raise ValueError(
            "You should choose one of bert, joint, granu and mgn in options")

    dct = {
        'NUM_TASK': NUM_TASK,
        'MASKING': MASKING,
        'SIGMOID_ACTIVATION': SIGMOID_ACTIVATION,
        'HIER': HIER
    }
    model = load_model(model_type, **dct)

    # the numbered prints below are debug traces marking each pipeline stage
    print(1)
    if not id_:
        print(2)
        ids = get_existent_ids()
        print(3)
        id_ = random_module.randint(0, N)
        print(4)
        while id_ in ids:
            id_ = random_module.randint(0, N)
        print(5)

    with open(DIRECTORY_PREDICT.joinpath(f'article{id_}.txt'), 'w',
              encoding='utf-8') as f:
        f.write(full_text)
        print(6)
    print(7)

    text = overwrite_one_article(id_, directory=DIRECTORY_PREDICT)
    print(8)

    my_predict_dataset = PropDataset(DIRECTORY_PREDICT, is_test=True)
    print(9)
    my_predict_iter = data.DataLoader(dataset=my_predict_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=False,
                                      num_workers=1,
                                      collate_fn=pad)
    print(10)

    tmp_file = 'tmp.txt'
    print(11)
    eval(model, my_predict_iter, tmp_file, criterion, binary_criterion,
         NUM_TASK=NUM_TASK)
    print(12)

    ids, texts = read_data(DIRECTORY_PREDICT, is_test=True)
    print(13)
    t_texts = clean_text(texts, ids)
    print(14)
    flat_texts = [sentence for article in t_texts for sentence in article]
    print(15)
    fi, prop_sents = convert(NUM_TASK - 1, flat_texts, tmp_file)
    print(16)
    prop_sents = prop_sents[id_]
    print(17)
    prop_sents = ['1' if elem else '' for elem in prop_sents]
    print(18)
    results = remove_duplicates(fi)
    print(19)

    DIRECTORY_PREDICT.joinpath(f'article{id_}.txt').rename(
        DIRECTORY_MARKUP.joinpath(f'article{id_}.txt'))
    print(20)

    lst = [set() for _ in range(len(full_text))]
    print(21)
    source_lst = [set() for _ in range(len(full_text))]
    print(22)
    for inner_lst in results:
        for i in range(inner_lst[-2], inner_lst[-1]):
            lst[i].add(HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(inner_lst[-3])])
            source_lst[i].add(inner_lst[-3])
    print(23)

    correct_lst = ['; '.join(list(elem)) for elem in lst]
    print(24)
    write_existent_dict(id_, source_lst, directory=DIRECTORY_MARKUP)
    print(25)

    return jsonify(result={
        'id': id_,
        'list': correct_lst,
        'text': text,
        'prop_sents': prop_sents
    })
import pickle
import time

import pandas as pd
from keras.layers import Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from preprocess import clean_text

pd.set_option('display.max_columns', 500)

data_train = pd.read_csv("resources/train.csv", low_memory=False)
data_test = pd.read_csv("resources/test.csv", low_memory=False)
df = pd.concat([data_train, data_test], sort=False)
df = df.reset_index(drop=True)
df.comment_text = df.comment_text.map(lambda x: clean_text(x))

start = time.time()
comments = df.comment_text
pickle.dump(comments, open("test.pkl", "wb"))
corpus_comments = pickle.load(open("test.pkl", "rb"))
end = time.time()
# number of seconds taken
step = end - start
print(step)

train_cl = df[:data_train.shape[0]]
test_cl = df[data_train.shape[0]:]
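# A minimal sketch (not in the original fragment) of how the imported Tokenizer
# and pad_sequences are typically applied next; MAX_FEATURES and MAX_LEN are
# assumed hyperparameters, not values from the source.
MAX_FEATURES = 20000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(corpus_comments)
sequences = tokenizer.texts_to_sequences(corpus_comments)
padded = pad_sequences(sequences, maxlen=MAX_LEN)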
    pred_test_y2 = model.predict_proba(test_X2.multiply(r))[:, 1]
    return pred_test_y, pred_test_y2


print('~~~~~~~~~~~~~~~~~~~')
print_step('Importing Data')
train, test = get_data()

train['non_toxic'] = train[[
    'toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate'
]].sum(axis=1).apply(lambda x: 0 if x > 1 else 1)
save_in_cache('extra_label', train, test)

if not is_in_cache('cleaned'):
    print('~~~~~~~~~~~~~')
    print_step('Cleaning')
    train_cleaned, test_cleaned = clean_text(train, test)
    save_in_cache('cleaned', train_cleaned, test_cleaned)
else:
    train_cleaned, test_cleaned = load_cache('cleaned')

print('~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Making KFold for CV')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2017)

if not is_in_cache('tfidf_word'):
    print('~~~~~~~~~~~~~~~~~~~')
    print_step('Run TFIDF WORD')
    TFIDF_PARAMS_WORD.update({'train': train, 'test': test})
    post_train, post_test = run_tfidf(**TFIDF_PARAMS_WORD)
    save_in_cache('tfidf_word', post_train, post_test)
    del post_train
def tokenizer(doc):
    doc = preprocess.clean_text(doc)
    return preprocess.lemmatize(doc)
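# Hypothetical usage (not from the source): a callable like the tokenizer above
# can be passed to scikit-learn's TfidfVectorizer, assuming preprocess.lemmatize
# returns a list of tokens; `documents` is an assumed iterable of raw texts.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=tokenizer, lowercase=False)
X = vectorizer.fit_transform(documents)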