Example No. 1
def test_batch_text_features(self):
    test_data = ["Queen of England", "Prime Minister of Canada"]
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 2)
    self.assertEqual(len(response[0]), 300)
    self.assertEqual(len(response[1]), 300)
Example No. 2
def test_batch_text_features(self):
    test_data = ['Queen of England', 'Prime Minister of Canada']
    response = text_features(test_data)
    self.assertTrue(isinstance(response, list))
    self.assertEqual(len(response), 2)
    self.assertEqual(len(response[0]), 300)
    self.assertEqual(len(response[1]), 300)
Example No. 3
def test_batch_text_features(self):
    test_data = ['Queen of England', 'Prime Minister of Canada']
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 2)
    self.assertEqual(len(response[0]), 300)
    self.assertEqual(len(response[1]), 300)
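A minimal harness for running the batch tests above (a hedged sketch: the class name is hypothetical, and an indico API key is assumed to be configured via indicoio.config.api_key):

import unittest

import numpy as np
from indicoio import text_features


class TextFeaturesBatchTestCase(unittest.TestCase):

    def test_batch_text_features(self):
        test_data = ["Queen of England", "Prime Minister of Canada"]
        response = text_features(test_data)
        self.assertTrue(isinstance(response, np.ndarray))
        self.assertEqual(len(response), 2)
        self.assertEqual(len(response[0]), 300)
        self.assertEqual(len(response[1]), 300)


if __name__ == "__main__":
    unittest.main()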
Example No. 4
def finish_recording(images):
    result = stitch_images(images, get_candidate_image_positions(images))
    cv2.imwrite("ocr.png", result)

    # Perform OCR.

    with open("ocr.png", "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode()
        payload = {
            "requests": [{
                "image": {
                    "content": encoded_image
                },
                "features": [{
                    "type": "TEXT_DETECTION"
                }]
            }]
        }

    r = requests.post('https://vision.googleapis.com/v1/images:annotate?key=' +
                      config.google_key(),
                      json=payload)
    text = r.json()['responses'][0]['textAnnotations'][0]['description']

    print("OCR Result:", text)

    sentence = re.sub('[^0-9a-zA-Z]+', ' ', text).strip()
    features = indicoio.text_features(sentence)

    category = int(classifier.predict([features])[0])

    print("Category:", category)

    firebase_upload(config.google_key(), text, category)
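Note that the Vision API omits the textAnnotations key from its response when no text is detected, so the blind indexing above can raise a KeyError. A hedged variant of the extraction step:

response = r.json()['responses'][0]
if 'textAnnotations' in response:
    text = response['textAnnotations'][0]['description']
else:
    text = ''  # no text detected in the stitched image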
Example No. 5
def input_question(data, feats, user_response):

    # Only featurize when a response was actually given; calling
    # text_features(None) would fail.
    if user_response is not None:
        data.insert(0, user_response)
        new_feats = indicoio.text_features(user_response)
        feats.insert(0, new_feats)
    return data, feats
Example No. 6
def get_batch_embeddings(comment_ids, nested_comments_dict):
    """call to indico text_features api
       comment_ids: List of comment_ids to fetch in this batch
       nested_comments_dict: Dictionary with comment_id as key, and
                             value is a dict containing 'body'
    """
    strings = [nested_comments_dict[cid]['body'] for cid in comment_ids]
    embeddings = indicoio.text_features(strings)
    return dict(zip(comment_ids, embeddings))
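Hypothetical usage, assuming indicoio.config.api_key has already been set:

comments = {
    'c1': {'body': 'Great movie, would watch again.'},
    'c2': {'body': 'The plot made no sense.'},
}
vectors = get_batch_embeddings(['c1', 'c2'], comments)
print(len(vectors['c1']))  # each embedding is a 300-dimensional vector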
Example No. 7
def get_vectors():
    '''
    Iterate through the collection of movie reviews, sending them in batches of
    100 to Indico to create 300-feature vectors for each document. Add a field
    for the doc_vec to the movie review entries, as well as a field with the
    word_count of the review.
    NOTE: Because the Mongo cursor will time out before the entire database can
    be iterated through, this is set to grab 1000 items at a time from the database.
    '''
    movie_reviews = reviews_coll.find({'word_count':{'$exists':False}}, {'_id': 1, 'review': 1}, limit=1000)
    i = 0
    while movie_reviews.count() > 0:
        review_list = []
        id_list = []
        for movie_dict in movie_reviews:
            movie_review = movie_dict['review']
            movie_id = movie_dict['_id']
            review_list.append(movie_review)
            id_list.append(movie_id)
            # batch reviews in groups of 100 to send to Indico to create a document vector
            if len(id_list)>99:
                print "Sending reviews {0} to {1} to Indico for vectorizing".format(i, i+len(id_list)-1)
                results = indicoio.text_features(review_list)
                for j in range(len(id_list)):
                    this_id = id_list[j]
                    doc_vec = results[j]
                    reviews_coll.update_one({'_id':this_id}, {'$push': {'doc_vec': doc_vec}})
                    # while we're at it, determine the length of the review
                    reviews_coll.update_one({'_id':this_id}, {'$push': {'word_count': len(review_list[j].split())}})
                # after the vectors have been posted to the database, update the counter and batch lists
                i += len(id_list)
                id_list = []
                review_list = []
        # when the reviews have been iterated through, if it wasn't an even 100, batch the remaining reviews
        if len(id_list)>0:
            print "Sending reviews {0} to {1} to Indico for vectorizing".format(i, i+len(id_list)-1)
            results = indicoio.text_features(review_list)
            for j in range(len(id_list)):
                this_id = id_list[j]
                doc_vec = results[j]
                reviews_coll.update_one({'_id':this_id}, {'$push': {'doc_vec': doc_vec}})
                # while we're at it, determine the length of the review
                reviews_coll.update_one({'_id':this_id}, {'$push': {'word_count': len(review_list[j].split())}})
        movie_reviews = reviews_coll.find({'word_count':{'$exists':False}}, {'_id': 1, 'review': 1}, limit=1000)
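get_vectors relies on a module-level reviews_coll handle; a minimal setup sketch (the connection details and collection names are assumptions, not from the source):

from pymongo import MongoClient

reviews_coll = MongoClient('localhost', 27017)['movies']['reviews']
get_vectors()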
Example No. 8
def make_feats(data):
    '''
    Use the Indicoio API for text vectorization
    '''
    batches = [data[x:x+100] for x in range(0, len(data), 100)]
    feats = []
    # Using a progress bar
    for chunk in tqdm(batches):
        feats.extend(indicoio.text_features(chunk))
    return feats
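A hypothetical call (API key assumed to be configured); the function chunks the input into batches of 100 internally:

texts = ['first document', 'second document', 'third document']
vectors = make_feats(texts)
print(len(vectors), len(vectors[0]))  # one 300-dimensional vector per document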
Example No. 9
def make_feats(data):
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []

    for chunk in chunks:
        feats.extend(indicoio.text_features(chunk))
        print(len(feats))

    return feats
Example No. 10
def extractive_encode(list_of_sentences, savePath=None):
    """Featurizes a list of sentences using the indico API"""
    print("Featurizing " + str(len(list_of_sentences)) +
          " sentences - remember to be careful with API credits!")
    API_KEY = "Private - contact if you need it!"
    indicoio.config.api_key = API_KEY
    sentencefeatures = indicoio.text_features(list_of_sentences, version=2)
    if savePath is not None:
        pickle.dump(sentencefeatures, open(savePath, "wb"))
    return sentencefeatures
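A hypothetical round trip through the optional pickle cache (the file name is an assumption):

feats = extractive_encode(["First sentence.", "Second sentence."],
                          savePath="features.pkl")
with open("features.pkl", "rb") as f:
    reloaded = pickle.load(f)
assert len(reloaded) == 2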
Example No. 11
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's text vector representation
    """
    chunks = [data[x:x+100] for x in range(0, len(data), 100)]
    feats = []

    # just a progress bar to show us how much we have left
    for chunk in tqdm(chunks):
        feats.extend(indicoio.text_features(chunk))

    return feats
Example No. 12
def input_question(question, data, feats):
    # TODO
    # Pass a question

    # add the user question and its vector representations to the corresponding lists, `data` and `feats`
    # insert them at index 0 so you know exactly where they are for later distance calculations
    if question is not None:
        data.insert(0, question)

        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)

    return data, feats
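The comments above mention later distance calculations; a hedged sketch of how the index-0 convention can be used to find the stored entry closest to the question (scipy is an addition, not part of the original):

from scipy.spatial.distance import cosine

data, feats = input_question("Who lives in Buckingham Palace?", data, feats)
distances = [cosine(feats[0], f) for f in feats[1:]]
closest = data[1:][distances.index(min(distances))]
print(closest)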
Example No. 13
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's text vector representation
    """
    # TODO
    chunks = [data[x:x+100] for x in range(0, len(data), 100)]
    feats = []

    # working with chunks of the data at a time
    for chunk in chunks:
        feats.extend(indicoio.text_features(chunk))

    return feats
Example No. 14
def make_feats(data):
    """
    Send our text data through the indico API and return each text example's text vector representation
    """
    chunks = [data[x:x + 100] for x in range(0, len(data), 100)]
    feats = []
    # just a progress bar to show us how much we have left
    for chunk in tqdm(chunks):
        # text_features converts text into meaningful feature vectors,
        # extracting abstract features for use as inputs to learning algorithms.
        feats.extend(indicoio.text_features(chunk))
    return feats
Example No. 15
def input_question(self, data, feats, question=None):
    """
    Receive the input question from the user
    """
    # input a question
    # question = input("What is your question? ")
    # add the user question and its vector representations to the corresponding lists, `data` and `feats`
    # insert them at index 0 so you know exactly where they are for later distance calculations
    if question is not None:
        data.insert(0, question)
        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)
    return data, feats
Example No. 16
import re

import numpy as np
import indicoio
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# StackingEstimator ships with TPOT; this looks like a TPOT-exported pipeline
from tpot.builtins import StackingEstimator

import config

indicoio.config.api_key = config.indico_key()

dataset = np.load('datasets/dataset.htn.npz')
data, target = dataset['data'], dataset['target']

training_features, testing_features, training_target, testing_target = \
    train_test_split(data, target, random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LogisticRegression(C=20.0)),
    LogisticRegression(C=15.0, dual=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn.externals import joblib
joblib.dump(exported_pipeline, 'classifier.pkl')

while True:
    sentence = input("Enter a sentence: ")
    sentence = re.sub('[^0-9a-zA-Z]+', ' ', sentence).strip()

    features = indicoio.text_features(sentence)

    print(exported_pipeline.predict([features]))
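To reload the persisted pipeline elsewhere, note that joblib moved out of sklearn.externals in newer scikit-learn releases; the standalone joblib package is assumed here:

import joblib

clf = joblib.load('classifier.pkl')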
Example No. 17
def get_embedding(text):
    """API call to get vector representing user input"""
    return np.array(indicoio.text_features(text))
Example No. 18
# category order defines the integer class labels used for `target` below
categories = ["education", "legal", "nutrition"]
dataset = {"education": [], "legal": [], "nutrition": []}

for category in categories:
    category_sentences = []
    for index in range(5):
        with open('datasets/' + category + 'Samples' + str(index) +
                  '.txt') as file:
            text = file.read()

            sentences = [
                re.sub('[^0-9a-zA-Z]+', ' ', sentence).strip()
                for sentence in tokenizer.tokenize(text)
            ]
            category_sentences.extend(sentences)

    features = indicoio.text_features(category_sentences, batch=True)
    dataset[category].extend(features)

data = []
target = []

for key in categories:
    category_sentences = dataset[key]
    data.extend(category_sentences)
    target.extend([categories.index(key)] * len(category_sentences))

data = np.asarray(data, np.float32)
target = np.asarray(target, np.float32)

np.savez('datasets/dataset.htn', data=data, target=target)
Example No. 19
def string_corr_coefs(list_of_strings):
    vecs = np.array(indicoio.text_features(list_of_strings))
    return np.corrcoef(vecs)
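Hypothetical usage: the result is a symmetric matrix of pairwise Pearson correlations between the feature vectors (np.corrcoef treats each row as one variable):

corr = string_corr_coefs(['dogs are friendly', 'cats are friendly', 'tax law'])
print(corr.shape)  # (3, 3); corr[i, j] correlates strings i and j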
Example No. 20
def get_vectors():
    '''
    Iterate through the collection of movie reviews, sending them in batches of
    100 to Indico to create 300-feature vectors for each document. Add a field
    for the doc_vec to the movie review entries, as well as a field with the
    word_count of the review.
    NOTE: Because the Mongo cursor will time out before the entire database can
    be iterated through, this is set to grab 1000 items at a time from the database.
    '''
    movie_reviews = reviews_coll.find({'word_count': {
        '$exists': False
    }}, {
        '_id': 1,
        'review': 1
    },
                                      limit=1000)
    i = 0
    while movie_reviews.count() > 0:
        review_list = []
        id_list = []
        for movie_dict in movie_reviews:
            movie_review = movie_dict['review']
            movie_id = movie_dict['_id']
            review_list.append(movie_review)
            id_list.append(movie_id)
            # batch reviews in groups of 100 to send to Indico to create a document vector
            if len(id_list) > 99:
                print "Sending reviews {0} to {1} to Indico for vectorizing".format(
                    i, i + len(id_list) - 1)
                results = indicoio.text_features(review_list)
                for j in range(len(id_list)):
                    this_id = id_list[j]
                    doc_vec = results[j]
                    reviews_coll.update_one({'_id': this_id},
                                            {'$push': {
                                                'doc_vec': doc_vec
                                            }})
                    # while we're at it, determine the length of the review
                    reviews_coll.update_one(
                        {'_id': this_id},
                        {'$push': {
                            'word_count': len(review_list[j].split())
                        }})
                # after the vectors have been posted to the database, update the counter and batch lists
                i += len(id_list)
                id_list = []
                review_list = []
        # when the reviews have been iterated through, if it wasn't an even 100, batch the remaining reviews
        if len(id_list) > 0:
            print "Sending reviews {0} to {1} to Indico for vectorizing".format(
                i, i + len(id_list) - 1)
            results = indicoio.text_features(review_list)
            for j in range(len(id_list)):
                this_id = id_list[j]
                doc_vec = results[j]
                reviews_coll.update_one({'_id': this_id},
                                        {'$push': {
                                            'doc_vec': doc_vec
                                        }})
                # while we're at it, determine the length of the review
                reviews_coll.update_one(
                    {'_id': this_id},
                    {'$push': {
                        'word_count': len(review_list[j].split())
                    }})
        movie_reviews = reviews_coll.find({'word_count': {
            '$exists': False
        }}, {
            '_id': 1,
            'review': 1
        },
                                          limit=1000)
Example No. 21
def test_text_features(self):
    test_data = 'Queen of England'
    response = text_features(test_data)
    self.assertTrue(isinstance(response, list))
    self.assertEqual(len(response), 300)
Example No. 22
def test_text_features(self):
    test_data = 'Queen of England'
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 300)
Example No. 23
def input_question(question, data, feats):
    if question is not None:
        data.insert(0, question)
        new_feats = indicoio.text_features(question)
        feats.insert(0, new_feats)
    return data, feats
Example No. 24
def text_vector(s):
    return indicoio.text_features(s)
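Consistent with the tests in the other examples, a single string yields one 300-dimensional vector while a list of strings yields one vector per string:

vec = text_vector('hello world')        # one vector of length 300
vecs = text_vector(['hello', 'world'])  # two vectors, each of length 300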
Example No. 25
def string_corr_coefs(list_of_strings):
    vecs = np.array(indicoio.text_features(list_of_strings))
    return np.corrcoef(vecs)
Example No. 26
def test_text_features(self):
    test_data = "Queen of England"
    response = text_features(test_data)
    self.assertTrue(isinstance(response, np.ndarray))
    self.assertEqual(len(response), 300)
Example No. 27
def get_embedding(text):
    """API call to get vector representing user input"""
    return np.array(indicoio.text_features(text))