def make_w2vec_matrix(question, paragraph, model=word2vec):
    train_question = preprocess_sentence(question)
    train_answers = preprocess_sentence(paragraph)
    tokens_question = ViTokenizer.tokenize(train_question).split()
    tokens_answer = ViTokenizer.tokenize(train_answers).split()
    question_embs = []
    answer_embs = []
    # Look up each token's embedding, falling back to the 'unknown' vector.
    for token in tokens_question:
        if token in model:
            question_embs.append(model[token])
        else:
            question_embs.append(model['unknown'])
    for token in tokens_answer:
        if token in model:
            answer_embs.append(model[token])
        else:
            answer_embs.append(model['unknown'])
    question_embs = np.array(question_embs)
    answer_embs = np.array(answer_embs)
    """
    if question_embs.shape[0] < MIN_LENGTH_QUESTION:
        question_embs = np.pad(question_embs, ((4, 4), (0, 0)))
    """
    # Tile the answer matrix until it has at least MIN_LENGTH_ANSWER rows.
    if answer_embs.shape[0] < MIN_LENGTH_ANSWER:
        paddings = np.ceil(MIN_LENGTH_ANSWER / answer_embs.shape[0])
        d = np.copy(answer_embs)
        for _ in range(int(paddings)):
            answer_embs = np.concatenate((answer_embs, d))
    return question_embs, answer_embs
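# A minimal usage sketch (an assumption, not from the original source): the
# sentence pair below is hypothetical and presumes the module-level `word2vec`
# model (with an 'unknown' entry) and MIN_LENGTH_ANSWER are already loaded.
q_matrix, a_matrix = make_w2vec_matrix(
    'Ai là tác giả của Truyện Kiều ?',
    'Truyện Kiều là tác phẩm của Nguyễn Du .')
print(q_matrix.shape, a_matrix.shape)  # (n_tokens, embedding_dim) each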
def query(self, query_string):
    query_string = self.process_spell_errors(query_string)
    query_terms = preprocess_sentence(query_string)
    query_docs = []
    for i in range(len(query_terms)):
        newdocdict = copy.deepcopy(self.index[query_terms[i]])
        for doc in newdocdict:  # doc is an id whose value is a list of positions
            # Shift each position left by the term's offset in the query, so a
            # phrase match means all terms share the same shifted position.
            newdocdict[doc] = set(map(lambda x: x - i, newdocdict[doc]))
        query_docs.append(newdocdict)
    # query_docs is a list of dictionaries, where each dict corresponds to the
    # posting list of one query term
    answer = []
    for doc in query_docs[0]:
        # check each position of doc against every other term's shifted postings
        docflag = True
        for position in query_docs[0][doc]:
            posflag = True
            for other_doclist in query_docs[1:]:
                if doc not in other_doclist:
                    docflag = False
                    break
                if position not in other_doclist[doc]:
                    posflag = False
                    break
            if docflag and posflag:
                answer.append(doc)
                break
            if not docflag:
                break
    return answer
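# A self-contained sketch of the positional-intersection idea used above
# (the toy index is made up; real posting lists come from self.index):
index = {'new': {1: {0, 7}, 2: {3}}, 'york': {1: {1}, 2: {9}}}
terms = ['new', 'york']
# shift every position left by the term's offset within the query
shifted = [{d: {p - i for p in ps} for d, ps in index[t].items()}
           for i, t in enumerate(terms)]
phrase_docs = [d for d in shifted[0]
               if any(all(d in s and p in s[d] for s in shifted[1:])
                      for p in shifted[0][d])]
print(phrase_docs)  # [1]: only doc 1 contains the phrase 'new york'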
def index():
    if request.method == 'POST':
        review = request.form['review']
        data = preprocess_sentence(review)
        # Extract aspect terms and their BIO tags from the review.
        aspect_extractor = AspectExtractor()
        bio, aspect_terms = aspect_extractor.extract_aspect(data, review)
        bio = convert_bio(bio)
        aspects = get_aspects(data, bio)
        aspect_map = {}
        for i in range(len(aspects)):
            aspect_map[aspect_terms[i]] = aspects[i]
        # Run one sentiment prediction per aspect category.
        sentiment_food = predictData([review], "food")
        sentiment_price = predictData([review], "price")
        sentiment_place = predictData([review], "place")
        sentiment_service = predictData([review], "service")
        return render_template('index.html',
                               review=review,
                               bio=bio,
                               aspect_terms=aspect_terms,
                               aspects=aspect_map,
                               food=sentiment_food,
                               price=sentiment_price,
                               place=sentiment_place,
                               service=sentiment_service)
    else:
        return render_template('index.html')
def generate_actions(path):
    """Reads csv through csv.DictReader() and yields a single document
    for each record. This function is passed into the bulk() helper to
    create many documents in sequence.
    """
    uid = 0
    for _csv in tqdm(sorted(os.listdir(path))):
        file = os.path.join(path, _csv)
        with open(file, mode="r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                doc = {
                    "id": uid,
                    "document_name": _csv,
                    # "URL": row["\ufeffURL"],
                    # "MatchDateTime": row["MatchDateTime"],
                    "Station": row["Station"],
                    "Show": row["Show"],
                    # "IAShowID": row["IAShowID"],
                    # "IAPreviewThumb": row["IAPreviewThumb"],
                    "Snippet": " ".join(preprocess_sentence(row["Snippet"]))
                    if config_params["es_preprocess"] else row["Snippet"],
                }
                uid += 1
                yield doc
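# A hedged sketch of feeding generate_actions() to the Elasticsearch bulk
# helper (the client setup, path, and index name here are assumptions):
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")
success, _ = helpers.bulk(es, generate_actions("data/csv"), index="snippets")
print("Indexed {} documents".format(success))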
def main():
    tweetfile = "data/tweets/clean/clean.csv"
    # df = load_df(tweetfile)

    # a minimal CLI
    args = sys.argv[1:]
    if len(args) < 2:
        print("Please input in the format <lookback value> <word1> <word2> ...")
        print("Example: tweetgen2.py 2 my life")
        return
    n = int(args[0])
    initial_words = args[1:]

    mc = MarkovChain(lookback=n)
    mc.train(load_df(tweetfile)['text'].values.tolist())

    # initial_words = ['we', 'tend', 'to']
    # initial_words = ['life', 'is']
    tweet = mc.generate(initial_words)
    print("Generated tweet::\n{}".format(tweet))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(tweet)))
def compare_scores(snippets):
    scores_dict = dict()
    F1avg = 0
    valid_F1 = 0
    happ_avg_time = 0
    es_avg_time = 0
    for query in snippets:
        print("iteration number:", valid_F1)
        print(query)
        # preprocess the query
        if config_params["es_preprocess"]:
            query = " ".join(preprocess_sentence(query))
        # skip queries that are too short to score meaningfully
        if len(query.split()) < 4:
            continue
        scores = metrics.metrics(query)
        precision = scores[0] / (scores[0] + scores[1] + 1e-9)
        recall = scores[0] / (scores[0] + scores[2] + 1e-9)
        F1 = 2 * precision * recall / (precision + recall + 1e-9)
        scores_dict[query] = [F1, precision, recall, scores[4], scores[5]]
        valid_F1 += 1
        happ_avg_time += scores[4]
        es_avg_time += scores[5]
        F1avg += F1
        print('scores:', scores)
        print('F1-score:', F1, 'precision:', precision, 'recall:', recall)
        print()
    print(F1avg / valid_F1, happ_avg_time / valid_F1, es_avg_time / valid_F1)
    return scores_dict
def detect_topic(inp):
    sentence = preprocess_sentence(inp)
    subject_freq = defaultdict(int)
    words = set(sentence.split())
    subjects_found = False
    for w in words:
        # skip words that begin with any of the ignore prefixes
        if any(w.startswith(i) for i in ignore_words):
            continue
        try:
            print(w, word_subject_map[w])
            for k, v in word_subject_map[w].items():
                subject_freq[k] += v
            subjects_found = True
        except KeyError:
            pass
    if not subjects_found:
        return "Could not detect subject"
    # rank subjects by accumulated frequency
    subject_scores = [list(pair) for pair in subject_freq.items()]
    subject_scores.sort(key=lambda x: x[1], reverse=True)
    print(subject_scores)
    # keep every subject scoring at least remove_bottom * the top score
    up_to_index = 1
    threshold_score = remove_bottom * subject_scores[0][1]
    for i in range(1, len(subject_scores)):
        if subject_scores[i][1] >= threshold_score:
            up_to_index += 1
    return subject_scores[:up_to_index]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding="UTF-8").read().strip().split("\n")
    word_pairs = [
        [preprocess_sentence(w) for w in l.split("\t")]
        for l in lines[:num_examples]
    ]
    return zip(*word_pairs)
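# Hedged usage sketch (the path and sample count are assumptions): each line
# of the file is expected to hold a tab-separated sentence pair, as in the
# Anki/manythings translation files.
en, spa = create_dataset("spa-eng/spa.txt", num_examples=30000)
print(en[-1])   # e.g. '<start> go away . <end>'
print(spa[-1])  # e.g. '<start> vete . <end>'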
def inline(model, model_config, src_vocab, tgt_vocab, src_word_ind,
           tgt_word_ind, sentence):
    sentence = preprocess_sentence(sentence)
    # Map tokens to indices (0 for OOV) and pad the sequence to the model's
    # fixed input length of 8.
    tokens = sentence.split(' ')
    src_tensor = [[int(src_word_ind[i]) if i in src_word_ind else 0
                   for i in tokens] + [0] * (8 - len(tokens))]
    src_tensor = np.array(src_tensor)
    enc_hidden = tf.zeros((len(src_tensor), model_config['reccurent_hidden']))
    dec_input = tf.expand_dims(
        [int(tgt_word_ind['<start>'])] * len(src_tensor), 1)
    preds = model.predict([src_tensor, enc_hidden, dec_input])
    # Greedy decoding: take the argmax over the vocabulary at each time step.
    preds = np.array([preds[i].argmax(axis=1) for i in range(len(preds))])
    preds = preds.swapaxes(0, 1)
    # The output does not contain the initial <start> token.
    print("Predicted Sentence is: ", *[tgt_vocab[str(i)] for i in preds[0]])
def main():
    tweetfile = "data/tweets/clean/clean.csv"
    df = load_df(tweetfile)
    text = "\n".join(df['text'].values.tolist()).strip()

    pairs = create_pairs(text)
    trie = build_trie(pairs)
    generated_words = generate1(trie, initial_word='i', max_len=15,
                                verbose=False)
    generated_text = ' '.join(generated_words)

    print("Generated tweet::\n{}".format(generated_text))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(generated_text)))
def query(self, query_string):
    """Query the tf-idf index and return a list of matching doc IDs,
    sorted by decreasing cosine similarity.

    :param query_string: A query string
    """
    query_string = self.process_spell_errors(query_string)
    query_terms = preprocess_sentence(query_string)
    # query_term -> frequency of query_term in the query
    query_frequencies = Counter(query_terms)
    # sum of dot-product elements of the tf-idf of the query and the document
    dotproducts = defaultdict(int)
    # squared magnitude of each document's tf-idf vector
    magnitude = defaultdict(int)
    query_magnitude = 0
    # accumulate the cosine-similarity terms for each document
    for term in query_frequencies.keys():
        if term not in self.index:
            continue
        query_tfidf = self.tfidf_score(query_frequencies[term], self.idf[term])
        query_magnitude += query_tfidf**2
        for doc in self.index[term]:
            doc_tfidf = self.tfidf_score(self.index[term][doc], self.idf[term])
            dotproducts[doc] += query_tfidf * doc_tfidf
            magnitude[doc] += doc_tfidf**2
    query_magnitude = sqrt(query_magnitude)
    cosine_similarity = {}
    for doc in magnitude:
        cosine_similarity[doc] = dotproducts[doc] / (
            query_magnitude * sqrt(magnitude[doc]) + 1e-10)
    ranked_docs = sorted(cosine_similarity.items(), key=lambda x: x[1],
                         reverse=True)
    threshold_docs = list(
        filter(lambda x: x[1] > config_params["threshold_score"], ranked_docs))
    # return docs with score > threshold, or the top 10% of docs otherwise
    return (threshold_docs if len(threshold_docs)
            else ranked_docs[:len(ranked_docs) // 10 + 1])
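# For reference, the score accumulated above is standard cosine similarity,
# restated here from the code itself (not from external documentation):
#   cos(q, d) = sum_t tfidf(t, q) * tfidf(t, d) / (|tfidf(q)| * |tfidf(d)|)
# with 1e-10 added to the denominator to guard against division by zero.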
def evaluate(sentence):
    # Stores the attention weights at each step for plotting later.
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    # Preprocess the sentence (steps 2, 3 and 4)
    sentence = preprocess.preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')  # step 4
    inputs = tf.convert_to_tensor(inputs)

    # String to accumulate the translated sentence (step 5)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    # Step 6: start decoding from the <start> token
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    # Step 7: decode one target token per iteration
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        # store the attention weights to plot later
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()

        # Step 8: greedy decoding, pick the most likely token
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    # Return the translated sentence, the original sentence and the history
    # of attention weights
    return result, sentence, attention_plot
def load_data(path, src_vocab, src_vocab_len, sen_len, padding='post'):
    # Build a word -> index mapping from the vocabulary, keeping only the
    # src_vocab_len most frequent entries.
    word_ind = {}
    for i in src_vocab:
        if i != 'num_words' and int(i) < src_vocab_len:
            word_ind[src_vocab[i]] = i
    if path:
        lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
        cleaned = [preprocess_sentence(i) for i in lines]
        tokenizer = Tokenizer(oov_token='<OOV>')
        tokenizer.word_index = word_ind
        tensor = tokenizer.texts_to_sequences(cleaned)
        padded_tensor = pad_sequences(tensor, padding=padding,
                                      truncating='post', maxlen=sen_len)
        return padded_tensor, word_ind
    return None, word_ind
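# A minimal sketch under assumed conventions (src_vocab is taken to map index
# strings to words; the 'num_words' key mirrors the function's check above):
toy_vocab = {'num_words': 4, '1': '<OOV>', '2': 'hello', '3': 'world'}
_, word_ind = load_data(None, toy_vocab, src_vocab_len=4, sen_len=10)
print(word_ind)  # {'<OOV>': '1', 'hello': '2', 'world': '3'}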
def translate(x_test=x_test, y_test=y_test, target_vocab=trgt_vocab,
              source_vocab=src_vocab):
    # Initialize the model once and load the trained weights.
    net = nmt.seq2seq(len(source_vocab), len(target_vocab), 1024, 1)
    net = net.cuda()
    net.load_state_dict(torch.load('./model/mdl_weights.pth'))
    # Translate sentences until a keyboard interrupt is given.
    print("Enter Sentences for translation : ")
    while True:
        sentence = input("Source : ")
        # preprocess and tokenize the sentence
        sentence1 = pp.preprocess_sentence(sentence)
        sentence1 = pp.preprocess_corpus_translate(sentence1, tokenizers['en'])
        # convert it to a tensor and send it to the GPU
        sentence_tensor = pp.tensors_from_pair_translate(
            source_vocab, sentence1[0], max_seq_length)
        sentence_tensor = torch.transpose(sentence_tensor, 1, 0)
        sentence_tensor = sentence_tensor.cuda()
        y = net(sentence_tensor)
        # strip the start/end tokens and map indices back to words
        translation = ' '.join(target_vocab.unidex_words(y[1:-1]))
        print('Translation: "{}"\n'.format(translation))
def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp,
             max_length_targ):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, encoder.enc_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    # Greedy decoding loop: emit one target token per step.
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot
    targets = np.ndarray.flatten(np.array(np.argmax(outputs, axis=2) * mask))
    f_score_cal = calculate_f_score(predictions, targets)
    return loss, accuracy, f_score_cal


if __name__ == '__main__':
    input_file = os.path.join(
        '/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle')
    with open(input_file, 'rb') as f:
        lang_data = pickle.load(f)
    new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
                for l in lang_data[:NUM_DATA]]
    label_holder = []
    input_sentences = []
    for line in new_data:
        labels = postprocess.sentence_labeller(line[0], line[1])
        label_holder.append(labels)
        input_sentences.append(line[1])

    # Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
    _, target_dataset, _, output_table, _, max_length_tar, _, _, _, \
        output_index2word, target_lengths = data_holder.finalise_dataset()
def break_query(self, query_string):
    """break_query. A function to split a query based on wildcard operators

    :param query_string: A query string
    """
    star_flag = 0
    query_string = self.process_spell_errors(query_string)
    query_terms = preprocess_sentence(query_string)
    result_docs = set()
    new_query_terms = []
    for term in query_terms:
        if '*' in term:
            star_flag = 1
            if term[-1] == '*':  # prefix query
                term = term[:-1]
                temp_terms = self.get_words_from_tree(self.tree, term)
                result_docs = self.update_doclist(result_docs, temp_terms)
            elif term[0] == '*':  # suffix query
                term = term[1:][::-1]
                temp_terms = self.get_words_from_tree(self.reverse_tree, term)
                result_docs = self.update_doclist(result_docs, temp_terms)
            else:  # prefix+suffix query
                star_index = term.index('*')
                prefix_term = term[:star_index]
                suffix_term = term[star_index + 1:][::-1]
                pref_terms = self.get_words_from_tree(self.tree, prefix_term)
                suff_terms = [i[::-1] for i in self.get_words_from_tree(
                    self.reverse_tree, suffix_term)]
                result_docs = self.update_doclist(
                    result_docs,
                    list(set(pref_terms).intersection(set(suff_terms))))
        else:
            new_query_terms.append(term)
    # process the shortest posting lists first
    query_terms = new_query_terms
    query_terms.sort(key=lambda x: len(self.index[x]))
    # if it is a wildcard query, intersect the wildcard matches with the
    # postings of the remaining plain terms
    if star_flag == 1:
        if len(query_terms) != 0:
            result_docs = set(
                reduce(lambda x, y: x.intersection(y),
                       map(lambda x: self.index[x],
                           query_terms))).intersection(result_docs)
        return list(result_docs)
    if len(query_terms) == 0:
        return list()
    return list(
        set(reduce(lambda x, y: x.intersection(y),
                   map(lambda x: self.index[x], query_terms))))
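# Hedged illustration of the prefix+suffix case above (the vocabulary is made
# up): a query like "co*er" takes words under prefix "co" from the forward
# trie and words whose reversal starts with "re" from the reverse trie, then
# intersects the two candidate sets.
prefix_matches = {'cover', 'copper', 'coder', 'cost'}   # tree walk for "co"
suffix_matches = {'cover', 'copper', 'coder', 'super'}  # reverse-tree walk for "re", reversed back
print(prefix_matches & suffix_matches)  # {'cover', 'copper', 'coder'}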
# Loading all the data from the URLs
print("Loading Data................\n")
url_en = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en'
url_vi = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.vi'
text_en = pd.read_csv(url_en, sep='\n', header=None)
text_vi = pd.read_csv(url_vi, sep='\n', header=None)
data = pd.concat([text_en, text_vi], axis=1)
data.columns = ["source", "target"]

# Drop rows with missing values caused by string errors
data = data[data["source"].notnull()]
data = data[data["target"].notnull()]

# preprocess_sentence lower-cases the words and strips unwanted characters
data["source"] = data.source.apply(lambda w: pp.preprocess_sentence(w))
data["target"] = data.target.apply(lambda w: pp.preprocess_sentence(w))

# Keep only pairs in which both sides have at most max_len = 25 words
data = data[(data['source'].str.split(" ").str.len() <= max_len)
            & (data['target'].str.split(" ").str.len() <= max_len)]
data = data.reset_index(drop=True)

# Loading the test data
url_en_test = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.en'
url_vi_test = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.vi'
text_en_test = pd.read_csv(url_en_test, sep='\n', header=None)
text_vi_test = pd.read_csv(url_vi_test, sep='\n', header=None)
data_test = pd.concat([text_en_test, text_vi_test], axis=1)