def test_batch_text_features(self):
    test_data = ["Queen of England", "Prime Minister of Canada"]
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 2)
    self.assertEqual(len(response[0]), 300)
    self.assertEqual(len(response[1]), 300)
def test_batch_text_features(self):
    test_data = ['Queen of England', 'Prime Minister of Canada']
    response = text_features(test_data)
    self.assertTrue(isinstance(response, list))
    self.assertEqual(len(response), 2)
    self.assertEqual(len(response[0]), 300)
    self.assertEqual(len(response[1]), 300)
def finish_recording(images):
    result = stitch_images(images, get_candidate_image_positions(images))
    cv2.imwrite("ocr.png", result)

    # Perform OCR.
    with open("ocr.png", "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode()
    payload = {
        "requests": [{
            "image": {"content": encoded_image},
            "features": [{"type": "TEXT_DETECTION"}]
        }]
    }
    r = requests.post(
        'https://vision.googleapis.com/v1/images:annotate?key=' + config.google_key(),
        json=payload)
    text = r.json()['responses'][0]['textAnnotations'][0]['description']
    print("OCR Result:", text)

    sentence = re.sub('[^0-9a-zA-Z]+', ' ', text).strip()
    features = indicoio.text_features(sentence)
    category = int(classifier.predict([features])[0])
    print("Category:", category)
    firebase_upload(config.google_key(), text, category)
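# For reference, the Vision API TEXT_DETECTION response indexed above has
# roughly this shape (abbreviated and illustrative only):
#
#   {"responses": [{"textAnnotations": [
#       {"description": "<full detected text>", ...},  # element 0: all text
#       ...                                            # then per-word entries
#   ]}]}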
def input_question(data, feats, user_response):
    if user_response is not None:
        data.insert(0, user_response)
        new_feats = indicoio.text_features(user_response)
        feats.insert(0, new_feats)
    return data, feats
def get_batch_embeddings(comment_ids, nested_comments_dict):
    """Call the indico text_features API.

    comment_ids: list of comment_ids to fetch in this batch
    nested_comments_dict: dictionary keyed by comment_id whose values are
        dicts containing 'body'
    """
    strings = [nested_comments_dict[cid]['body'] for cid in comment_ids]
    embeddings = indicoio.text_features(strings)
    return {comment_ids[x]: embeddings[x] for x in range(len(comment_ids))}
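# A minimal usage sketch for get_batch_embeddings above; the comment ids and
# bodies are hypothetical, and an indicoio API key is assumed to be configured.
nested = {
    'c1': {'body': 'First comment text'},
    'c2': {'body': 'Second comment text'},
}
embeddings_by_id = get_batch_embeddings(['c1', 'c2'], nested)
# Each value is the 300-dimensional feature vector for its comment id.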
def get_vectors():
    '''
    Iterate through the collection of movie reviews, sending them in batches
    of 100 to Indico to create 300-feature vectors for each document. Add a
    field for the doc_vec to the movie review entries, as well as a field
    with the word_count of the review.

    NOTE: Because the Mongo cursor will time out before the entire database
    can be iterated through, this is set to grab 1000 items at a time from
    the database.
    '''
    movie_reviews = reviews_coll.find({'word_count': {'$exists': False}},
                                      {'_id': 1, 'review': 1}, limit=1000)
    i = 0
    while movie_reviews.count() > 0:
        review_list = []
        id_list = []
        for movie_dict in movie_reviews:
            movie_review = movie_dict['review']
            movie_id = movie_dict['_id']
            review_list.append(movie_review)
            id_list.append(movie_id)
            # batch reviews in groups of 100 to send to Indico to create a document vector
            if len(id_list) > 99:
                print "Sending reviews {0} to {1} to Indico for vectorizing".format(i, i + len(id_list) - 1)
                results = indicoio.text_features(review_list)
                for j in xrange(len(id_list)):
                    this_id = id_list[j]
                    doc_vec = results[j]
                    reviews_coll.update_one({'_id': this_id}, {'$push': {'doc_vec': doc_vec}})
                    # while we're at it, determine the length of the review
                    reviews_coll.update_one({'_id': this_id}, {'$push': {'word_count': len(review_list[j].split())}})
                # after the vectors have been posted to the database, update the counter and batch lists
                i += len(id_list)
                id_list = []
                review_list = []
        # when the reviews have been iterated through, if it wasn't an even 100, batch the remaining reviews
        if len(id_list) > 0:
            print "Sending reviews {0} to {1} to Indico for vectorizing".format(i, i + len(id_list) - 1)
            results = indicoio.text_features(review_list)
            for j in xrange(len(id_list)):
                this_id = id_list[j]
                doc_vec = results[j]
                reviews_coll.update_one({'_id': this_id}, {'$push': {'doc_vec': doc_vec}})
                # while we're at it, determine the length of the review
                reviews_coll.update_one({'_id': this_id}, {'$push': {'word_count': len(review_list[j].split())}})
        movie_reviews = reviews_coll.find({'word_count': {'$exists': False}},
                                          {'_id': 1, 'review': 1}, limit=1000)
def make_feats(data):
    '''
    Using Indicoio API for text vectorization
    '''
    batches = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    # Using a progress bar
    for chunk in tqdm(batches):
        feats.extend(indicoio.text_features(chunk))
    return feats
def make_feats(data):
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    for chunk in chunks:
        feats.extend(indicoio.text_features(chunk))
    print(len(feats))
    return feats
def extractive_encode(list_of_sentences, savePath=None):
    """Featurizes a list of sentences using the indico API"""
    print("Featurizing " + str(len(list_of_sentences)) +
          " sentences - remember to be careful with API credits!")
    API_KEY = "Private - contact if you need it!"
    indicoio.config.api_key = API_KEY
    sentencefeatures = indicoio.text_features(list_of_sentences, version=2)
    if savePath is not None:
        pickle.dump(sentencefeatures, open(savePath, "wb"))
    return sentencefeatures
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's
    text vector representation
    """
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    # just a progress bar to show us how much we have left
    for chunk in tqdm(chunks):
        feats.extend(indicoio.text_features(chunk))
    return feats
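# A minimal usage sketch for make_feats above, assuming indicoio.config.api_key
# is already set; the sample corpus is purely illustrative.
corpus = ["the plot was gripping", "the acting fell flat"]
feats = make_feats(corpus)
# feats is a list with one 300-dimensional feature vector per input string.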
def input_question(question, data, feats):
    # TODO
    # Pass a question
    # add the user question and its vector representations to the
    # corresponding lists, `data` and `feats`
    # insert them at index 0 so you know exactly where they are for later
    # distance calculations
    if question is not None:
        data.insert(0, question)
        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)
    return data, feats
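# A minimal sketch combining input_question with make_feats above; the corpus
# and question are placeholders, and an indicoio API key is assumed.
data = ["Paris is the capital of France", "Water boils at 100 degrees Celsius"]
feats = make_feats(data)
data, feats = input_question("What is the capital of France?", data, feats)
# data[0] and feats[0] now hold the question and its feature vector, ready for
# distance calculations against the rest of the corpus.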
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's
    text vector representation
    """
    # TODO
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    # working with chunks of the data at a time
    for chunk in chunks:
        feats.extend(indicoio.text_features(chunk))
    return feats
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's
    text vector representation
    """
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    # just a progress bar to show us how much we have left
    for chunk in tqdm(chunks):
        # text_features converts text into meaningful feature vectors:
        # abstract text features for use as inputs to learning algorithms.
        feats.extend(indicoio.text_features(chunk))
    return feats
def input_question(self, data, feats, question=None):
    """
    Receive the input question from the user
    """
    # input a question
    # question = input("What is your question? ")
    # add the user question and its vector representations to the
    # corresponding lists, `data` and `feats`
    # insert them at index 0 so you know exactly where they are for later
    # distance calculations
    if question is not None:
        data.insert(0, question)
        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)
    return data, feats
import re

import numpy as np
import indicoio
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator  # StackingEstimator ships with TPOT

import config

indicoio.config.api_key = config.indico_key()

dataset = np.load('datasets/dataset.htn.npz')
data, target = dataset['data'], dataset['target']
training_features, testing_features, training_target, testing_target = \
    train_test_split(data, target, random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LogisticRegression(C=20.0)),
    LogisticRegression(C=15.0, dual=True))
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn.externals import joblib
joblib.dump(exported_pipeline, 'classifier.pkl')

while True:
    sentence = input("Enter a sentence: ")
    sentence = re.sub('[^0-9a-zA-Z]+', ' ', sentence).strip()
    features = indicoio.text_features(sentence)
    print(exported_pipeline.predict([features]))
def get_embedding(text):
    """API call to get vector representing user input"""
    return np.array(indicoio.text_features(text))
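# A minimal sketch of calling get_embedding; the question string is a
# placeholder and an indicoio API key is assumed to be configured.
vec = get_embedding("What time does the store open?")
print(vec.shape)  # (300,) -- text_features returns 300-dimensional vectors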
dataset = {"education": [], "legal": [], "nutrition": []} for category in categories: category_sentences = [] for index in range(5): with open('datasets/' + category + 'Samples' + str(index) + '.txt') as file: text = file.read() sentences = [ re.sub('[^0-9a-zA-Z]+', ' ', sentence).strip() for sentence in tokenizer.tokenize(text) ] category_sentences.extend(sentences) features = indicoio.text_features(category_sentences, batch=True) dataset[category].extend(features) data = [] target = [] for key in categories: category_sentences = dataset[key] data.extend(category_sentences) target.extend([categories.index(key)] * len(category_sentences)) data = np.asarray(data, np.float32) target = np.asarray(target, np.float32) np.savez('datasets/dataset.htn', data=data, target=target)
def string_corr_coefs(list_of_strings):
    vecs = np.array([vec for vec in indicoio.text_features(list_of_strings)])
    return np.corrcoef(vecs)
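# A minimal sketch of calling string_corr_coefs; the strings are placeholders.
corr = string_corr_coefs(["cats purr", "dogs bark", "tax law is dense"])
print(corr.shape)  # (3, 3): pairwise correlations between the feature vectors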
def test_text_features(self):
    test_data = 'Queen of England'
    response = text_features(test_data)
    self.assertTrue(isinstance(response, list))
    self.assertEqual(len(response), 300)
def test_text_features(self):
    test_data = 'Queen of England'
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 300)
def input_question(data, feats, question):
    if question is not None:
        data.insert(0, question)
        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)
    return data, feats
def text_vector(s):
    return indicoio.text_features(s)
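# text_vector is a thin wrapper around indicoio.text_features: a single string
# yields one 300-dimensional vector, a list yields one vector per item. The
# sample input is a placeholder.
v = text_vector("Queen of England")
print(len(v))  # 300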