Example No. 1
    def test_corpus(self):

        with open("../data/pt_BR/nnp") as f:
            nnp = [line.rstrip() for line in f.readlines()]
        with open("../data/pt_BR/terms") as f:
            terms = [line.rstrip() for line in f.readlines()]
        with open("../data/pt_BR/patterns") as f:
            patterns = [line.rstrip() for line in f.readlines()]

        data = LoadData(['../corpus/sel1.csv', '../corpus/sel2.csv']).load()
        p = PreProcessing(nnp, terms, patterns)

        tokens = []
        for d in data.values():
            tokens += p.clean_and_stem(d)

        bow, bow_features_names = p.build_bow(tokens)
        dist = np.sum(bow.toarray(), axis=0)
        tbow = {}
        for term, count in zip(bow_features_names, dist):
            tbow[term] = count

        import operator
        with open("bow", "w") as f:
            f.write(str(len(tbow)))
            f.write(
                str(
                    sorted(tbow.items(),
                           key=operator.itemgetter(1),
                           reverse=True)))

        terms = p.compute_tfidf(data.values(), eliminate_zeros=True)
        with open("terms", "w") as f:
            f.write(str(terms))
Example No. 2
    def test_should_remove_accents_and_special_chars(self):
        c = PreProcessing()
        expected = ['oi', 'qual', 'e', 'o', 'email', 'do', 'sr', 'joao', 'e', 'joaogmailcom', 'ah', 'eu', 'ja', 'sabia']
        self.assertEqual(expected, c.__normalize__("Oi, qual é o e-mail do Sr. João? "
                                                   "É joao@gmail.com! Ah eu já sabia!"))
        expected = ['o', 'cpf', 'do', 'joao', 'e', '12345678900']
        self.assertEqual(expected, c.__normalize__("o cpf do joao é 123.456.789-00"))
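For reference, a minimal self-contained sketch of the kind of normalization these assertions imply (strip accents, drop punctuation and special characters, lowercase, split on whitespace). This is an assumption about the behaviour of __normalize__, not the project's implementation:

import re
import unicodedata


def normalize(text):
    # decompose accented characters and drop the combining marks (é -> e)
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    # lowercase and keep only letters, digits and spaces
    text = re.sub(r"[^a-z0-9 ]", "", text.lower())
    return text.split()


# normalize("o cpf do joao é 123.456.789-00")
# -> ['o', 'cpf', 'do', 'joao', 'e', '12345678900']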
Example No. 3
    def __init__(self, questions: set, answers: set, word_vectors=None):

        self.bow = CountVectorizer()
        self.questions = questions
        self.answers = answers
        self.word_vectors = word_vectors
        self.pp = PreProcessing()
Example No. 4
def load_gof_data():
    from pre_processing import Loader, PreProcessing
    loader = Loader()
    word_lst = loader.get_list('data/testdatatxt.txt')
    pre_process = PreProcessing()
    lst = pre_process.process2(word_lst)
    return lst
Example No. 5
    def test(self):
        p = PreProcessing([], [], [])
        cts = machado.fileids()[:5]

        tokens = []
        for c in cts:
            text = machado.raw(c)
            tokens += p.clean_and_stem(text)

        bow, bow_features_names = p.build_bow(tokens)
        dist = np.sum(bow.toarray(), axis=0)
        tbow = {}
        for term, count in zip(bow_features_names, dist):
            tbow[term] = count

        import operator
        print(sorted(tbow.items(), key=operator.itemgetter(1), reverse=True))

        texts = {}
        for c in cts:
            text = machado.raw(c)
            texts[c] = text

        terms = p.compute_tfidf(texts.values(), top_n=10, eliminate_zeros=True)
        print(terms)
Example No. 6
def load_gof_data():
    from pre_processing import Loader, PreProcessing

    loader = Loader()
    word_lst = loader.get_list("data/testdatatxt.txt")
    pre_process = PreProcessing()
    lst = pre_process.process2(word_lst)
    return lst
Example No. 7
    def test_should_remove_digits(self):
        c = PreProcessing(["joao"], [], [r"\d+"])
        self.assertEqual(["tem", "anos"],
                         c.__obfuscate__(["joao", "tem", "12", "anos"]))
        self.assertEqual(["anos", "e", "amigos", "no", "facebook"],
                         c.__obfuscate__(["joao", "12", "anos", "e", "1765546587", "amigos", "no", "facebook"]))
        self.assertEqual(["o", "cpf", "do", "e"],
                         c.__obfuscate__(["o", "cpf", "do", "joao", "e", "123.456.789-00"]))
Example No. 8
    def test_should_build_bag_of_words(self):
        p = PreProcessing(["joao", "maria"], [], ["\d+", "nomeemp*"])

        text = "O técnico João foi até a casa da Maria (NOMEEMPRESA) e solucionou o problema. " \
               "Ele não foi solucionado? NomeempProd"
        tokens = p.clean(text)
        tokens = p.stem(tokens)

        bow, bfn = p.build_bow(tokens)
        self.assertEquals("(7, 6)", bow.shape.__str__())
Example No. 9
    def pipeline(img, lanes_fit, camera_matrix, dist_coef):
        # debug flag
        is_debug_enabled = True

        # chessboard dimensions for camera calibration
        nx, ny, channels = 9, 6, 3

        # calibrate camera and undistort the image
        undistorted_image = PreProcessing.get_undistorted_image(
            nx, ny, img, camera_matrix, dist_coef)

        # get the color and gradient threshold image
        binary_image = PreProcessing.get_binary_image(undistorted_image)

        # get source and destination points
        src, dst = PerspectiveTransform.get_perspective_points(img)

        # get image with source and destination points drawn
        img_src, img_dst = PerspectiveTransform.get_sample_wrapped_images(
            img, src, dst)

        # perspective transform to bird eye view
        warped_image = PerspectiveTransform.get_wrapped_image(
            binary_image, src, dst)

        # find the lanes lines and polynomial fit
        if len(lanes_fit) == 0:
            lane_lines, lanes_fit, left_xy, right_xy = LanesFitting.get_lanes_fit(
                warped_image)
        else:
            lane_lines, lanes_fit, left_xy, right_xy = LanesFitting.update_lanes_fit(
                warped_image, lanes_fit)

        # find the radius of curvature
        radius = Metrics.get_curvature_radius(lane_lines, left_xy, right_xy)

        # find the car distance from center lane
        center_distance, lane_width = Metrics.get_distance_from_center(
            lane_lines, lanes_fit)

        # unwrap the image
        resultant_img = PerspectiveTransform.get_unwrapped_image(
            undistorted_image, warped_image, src, dst, lanes_fit)

        # visualize the pipeline
        if is_debug_enabled is True:
            resultant_img = Visualization.visualize_pipeline(
                resultant_img, img_dst, binary_image, lane_lines, radius,
                center_distance, lane_width)

        return lanes_fit, resultant_img
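The undistortion step above presumably relies on OpenCV camera calibration; a stand-alone sketch of that idea follows (not the project's PreProcessing.get_undistorted_image):

import cv2
import numpy as np


def calibrate(chessboard_images, nx=9, ny=6):
    # ideal chessboard corner grid on the z = 0 plane
    objp = np.zeros((nx * ny, 3), np.float32)
    objp[:, :2] = np.mgrid[0:nx, 0:ny].T.reshape(-1, 2)

    obj_points, img_points, image_size = [], [], None
    for img in chessboard_images:
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        image_size = gray.shape[::-1]
        found, corners = cv2.findChessboardCorners(gray, (nx, ny), None)
        if found:
            obj_points.append(objp)
            img_points.append(corners)

    _, camera_matrix, dist_coef, _, _ = cv2.calibrateCamera(
        obj_points, img_points, image_size, None, None)
    return camera_matrix, dist_coef


def undistort(img, camera_matrix, dist_coef):
    # remove lens distortion using the calibration parameters
    return cv2.undistort(img, camera_matrix, dist_coef)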
Example No. 10
	def __init__(self, messages, model, questions: set, answers: set,
				 pc_questions: dict, pc_answers: dict, tokenizer):

		self.questions = questions
		self.answers = answers
		self.pc_questions = pc_questions
		self.pc_answers = pc_answers	
		self.tokenizer = tokenizer	
		self.model = model
		self.messages = messages
		self.pp = PreProcessing()		
		self.s = Similarity(questions=self.questions,
							answers=self.answers
							)
Example No. 11
class Prediction:

	def __init__(self, messages, model, questions: set, answers: set,
				 pc_questions: dict, pc_answers: dict, tokenizer):

		self.questions = questions
		self.answers = answers
		self.pc_questions = pc_questions
		self.pc_answers = pc_answers	
		self.tokenizer = tokenizer	
		self.model = model
		self.messages = messages
		self.pp = PreProcessing()		
		self.s = Similarity(questions=self.questions,
							answers=self.answers
							)


	def predict(self, msg):
		if msg == '' or msg is None:
			return emergency_message()
			
		try:
			msg = self.pp.pre_processing_text_for_similarity(msg)
			msg_nn = self.pp.pre_processing_text_for_neural_network(msg)
		except Exception as e:
			save_content_to_log(e)
			return BOT_PREFIX + emergency_message() + '\n' + str(e)

		if msg == '' or msg is None:
			return emergency_message()

		p = self.tokenizer.texts_to_matrix([msg_nn])

		res = self.model.predict(p)

		if res >= 0.5:
			pc = self.pc_questions
		else:
			pc = self.pc_answers

		conversations = self.s.return_conversation_by_cossine(msg, res)
		
		conversations = self.s.return_conversation_by_page_rank(msg, conversations,
																page_compute=pc,
																reverse=True)		
		
		return self.s.get_the_next_conversation(conversations, self.messages)
Example No. 12
def apply_gazetteer(extract_from,
                    tweets_file,
                    gazetteer_file,
                    final_file,
                    lang_tweets=None):
    with open(tweets_file) as tweets_f:
        gazetteer_list = load_to_list_gazetteer_file(gazetteer_file)
        prepro = PreProcessing()
        i = 0
        t = "{0}\t{1}\t{2}\n"
        for line in tweets_f:
            if i > 0:
                if extract_from == "mixed":
                    write_for_mixed_signal(line, gazetteer_list, final_file,
                                           prepro, lang_tweets)
                elif extract_from == "gps":
                    write_for_gps_signal(line, gazetteer_list, final_file,
                                         prepro)
                else:
                    write_for_tweet_or_location_signal(line, extract_from,
                                                       gazetteer_list,
                                                       final_file, prepro,
                                                       lang_tweets)
            else:
                with open(final_file, "w") as final_f:
                    final_f.write(
                        t.format(line.strip(), 'hierarchy',
                                 'location_detected'))
            i += 1
Example No. 13
    def test_should_compute_tfidf(self):
        p = PreProcessing(["joao", "maria"], [], [r"\d+", "nomeemp*"])

        text_1 = "O técnico João foi até a casa da cliente Maria (NOMEEMPRESA) e solucionou o problema. " \
                 "Ele não foi solucionado? NomeempProd"
        text_2 = "A cliente Maria disse que continua sem sinal de Internet e " \
                 "reclamou que o problema não foi resolvido, ela continua sem sinal"
        text_3 = "Maria solicitou reparo, cliente reclama que esta sem sinal de Internet e Telefone após chuva"

        texts = [text_1, text_2, text_3]

        terms = p.compute_tfidf(texts)
        print(terms)

        import operator
        print(sorted(terms.items(), key=operator.itemgetter(1), reverse=True))
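For comparison, a self-contained sketch of a TF-IDF term ranking with scikit-learn, in the spirit of compute_tfidf (the project's own method and return format may differ):

from sklearn.feature_extraction.text import TfidfVectorizer


def compute_tfidf_sketch(texts, top_n=None):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)
    # aggregate each term's tf-idf weight over all documents
    scores = matrix.sum(axis=0).A1
    ranked = sorted(zip(vectorizer.get_feature_names_out(), scores),
                    key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:top_n] if top_n else ranked)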
Example No. 14
def retrain():
    ds = process(PreProcessing('./data/starwars.txt'))

    word_embedding = WordEmbedding(source='./embedding/FT/fasttext_cbow_300d.bin')

    word_embedding.train(ds.pairs)
    word_embedding.save('./embedding/starwars', 'starwars.bin')
Example No. 15
def process(reader: PreProcessing, storage: DatasetStorage = DatasetStorage()):

    if not storage.exist(reader.idx):
        pairs = reader.process()
        dataset = Dataset(pairs, reader.idx)
        storage.save(dataset)

    return storage.load(reader.idx)
Example No. 16
def train():
    ds = process(PreProcessing(open('./data/starwars.txt', 'r')))

    word_embedding = WordEmbedding(source=ds.pairs)

    word_embedding.train(ds.pairs)

    word_embedding.save(target_folder='./embedding/starwars', filename='starwars.bin')
Example No. 17
    def test_should_save_load_dataset(self):
        storage = ds.DatasetStorage()
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)
        expected = storage.save(dataset)

        result = storage.load(expected.idx)

        self.assertEqual('{"idx": "' + expected.idx + '", "pairs": 3}',
                         result.__str__())
Example No. 18
    def setUpClass(cls):
        cls.pre_processing = PreProcessing(sentences)
        cls.dataset = ds.process(cls.pre_processing)
        cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

        encoder = EncoderRNN(cls.word_embedding, 300, 1).to(settings.device)
        decoder = DecoderRNN(300, cls.word_embedding, 0.0,
                             1).to(settings.device)
        cls.model = Model(encoder, decoder)
        cls.model.train(cls.dataset)
Example No. 19
class Dataset:
    def __init__(self):
        self.pp = PreProcessing()

    def import_dataset(self):
        messages = pd.read_csv(DATA_FILE,
                               delimiter="\t",
                               quoting=3,
                               encoding="ISO-8859-2")
        messages.columns = [
            'msg_line', 'user_id', 'movie_id', 'msg', 'msg_pre_processed',
            'msg_2', 'target'
        ]
        return messages

    def get_questions(self, messages):
        return set(
            messages[messages["target"] == 1]["msg_pre_processed"].astype(str))

    def get_answers(self, messages):
        return set(
            messages[messages["target"] == 0]["msg_pre_processed"].astype(str))

    def get_page_compute(self, qea=0):

        pc = None
        file = None

        if qea == 0:
            file = PAGE_RANK_ANSWERS
        else:
            file = PAGE_RANK_QUESTIONS

        pc = self.pp.pre_processing_page_rank_file(file)

        return self.pp.normalize_dictionary(pc)

    def load_tokenizer(self):
        with open(TOKENIZER_FILE, "rb") as handle:
            tokenizer = pickle.load(handle)
        return tokenizer
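A short usage sketch wiring the class above together, assuming the module-level constants (DATA_FILE, PAGE_RANK_ANSWERS, PAGE_RANK_QUESTIONS, TOKENIZER_FILE) are configured:

# load the corpus, split it into questions/answers, and fetch page-rank scores
dataset = Dataset()
messages = dataset.import_dataset()

questions = dataset.get_questions(messages)
answers = dataset.get_answers(messages)

pc_answers = dataset.get_page_compute(qea=0)
pc_questions = dataset.get_page_compute(qea=1)

tokenizer = dataset.load_tokenizer()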
Example No. 20
def __main__():
    # get video stream
    video_cap = imageio.get_reader(config["project_video"])
    # polynomial lane fit
    lanes_fit = []

    # history of heatmaps to reject false positives
    history = deque(maxlen=config["history_limit"])

    # classifier and scaler
    classifier = Classifier.get_trained_classifier(use_pre_trained=True)

    # load calibration parameters:
    camera_matrix, dist_coef = PreProcessing.load_calibration_params()
    for index, img in enumerate(video_cap):
        if index % config["skip_frames"] == 0:
            # get lanes
            lanes_fit, img = LaneDetection.pipeline(img, lanes_fit,
                                                    camera_matrix, dist_coef)
            # resize image to improve speed of vehicle detection using classifier

            # jpg to png
            if config["is_training_png"]:
                img = Helper.scale_to_png(img)

            # 3 channel without alpha
            img = img[:, :, :config["channels"]]

            bounding_boxes = []
            # get bounding boxes for left side
            x_start_stop_left, y_start_stop_left = config["xy_start_stop_left"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_left, y_start_stop_left)
            # get bounding boxes for top side
            x_start_stop_top, y_start_stop_top = config["xy_start_stop_top"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_top, y_start_stop_top)
            # get bounding boxes for right side
            x_start_stop_right, y_start_stop_right = config[
                "xy_start_stop_right"]
            bounding_boxes += WindowSearch.get_bounding_boxes(
                img, classifier, x_start_stop_right, y_start_stop_right)

            # remove false positives and duplicates from detection
            detected_cars = Helper.remove_false_positives(
                img, bounding_boxes, history)
            # visualization
            plt.imshow(detected_cars)
            plt.pause(0.0001)
Example No. 21
    def test_pre_processing(self):
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)

        expected = [
            ('ontem à noite e anteontem à noite . . .',
             'tommyknockers, tommyknockers batendo na porta .'),
            ('tommyknockers, tommyknockers batendo na porta .',
             'eu quero sair, não sei se posso . . . tenho medo do tommyknockers'
             ),
            ('eu quero sair, não sei se posso . . . tenho medo do tommyknockers',
             'bobbi .')
        ]

        self.assertEqual(dataset.pairs, expected)
Example No. 22
def run(hidden,
        layer,
        dropout,
        learning_rate,
        iteration,
        save,
        train=None,
        test=None):
    if train:
        dataset_id = train.split('/')[-1].split('.')[0]

        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)

        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)

        encoder = EncoderRNN(encoder_embeddings, hidden,
                             layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout,
                             layer).to(settings.device)

        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)

    if test:

        dataset = load(test)

        model = Model.load(test)

        while True:
            decoded_words = model.evaluate(str(input("> ")), dataset)
            print(' '.join(decoded_words))
Example No. 23
            links = self.get_links(visiting_now)
            if (links == -1):
                visit_quantity -= 1
                continue

            self.print_debug(links)

            links = self.validate_links(links)

            self.evaluate_links(links, method)

            time.sleep(0.5)
        if save_results:
            self.save_visited_csv(method)
        print("Done")

        if (self.debug):
            self.out.close()


if (__name__ == "__main__"):
    p = PreProcessing("../site.txt")
    sites = p.get_sites_info()
    for m in ['ml']:
        print('Initializing (' + m + '):')
        for s in sites:
            c = Crawler(s, dbg=False)
            c.visit(method=m, save_results=True)
        print('Finishing (' + m + ')\n')
Example No. 24
	passwd="legend", # your password
	#db="data_flood"
	db="data_earthquake_09_2015"
	) # name of the data base


if __name__ == '__main__':
	cur = db.cursor() 
	punctuation = list(string.punctuation)
	stop = stopwords.words('spanish') + punctuation + ['rt', 'via']
	# Use all the SQL you like
	
	cur.execute("select text,date_sub(created_at, INTERVAL 3 HOUR) from no_retweet;")
	dates = []

	prepro = PreProcessing()
	for row in cur.fetchall():
		terms_stop = [term for term in prepro.preprocess(row[0]) if len(term) >= 3 and term not in stop]  # keep terms with at least 3 characters that are not stop words
		# if 'terremoto' in terms_stop:
		dates.append(row[1])

	# a list of "1" to count the hashtags
	ones = [1]*len(dates)
	# the index of the series
	idx = pandas.DatetimeIndex(dates)
	# the actual series (at series of 1s for the moment)
	date_serie = pandas.Series(ones, index=idx)	 
	# Resampling / bucketing
	per_minute = date_serie.resample('1Min').sum().fillna(0)
	time_chart = Line(per_minute)
	time_chart.axis_titles(x='Time', y='Freq')
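A self-contained sketch of the per-minute bucketing used above, with synthetic timestamps and the current pandas resample API (resample(...).sum() replaces the removed how= argument):

import pandas as pd

# illustrative timestamps only
dates = pd.to_datetime(["2015-09-16 22:54:10",
                        "2015-09-16 22:54:40",
                        "2015-09-16 22:55:05"])
ones = pd.Series(1, index=pd.DatetimeIndex(dates))

# count tweets per minute; empty buckets become zero
per_minute = ones.resample("1Min").sum().fillna(0)
print(per_minute)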
Example No. 25
class Similarity:
    def __init__(self, questions: set, answers: set, word_vectors=None):

        self.bow = CountVectorizer()
        self.questions = questions
        self.answers = answers
        self.word_vectors = word_vectors
        self.pp = PreProcessing()

    def get_the_next_conversation(self, conversations, df):
        """
		Get the first item in the dict
		"""

        keys_view = conversations.keys()
        keys_iterator = iter(keys_view)
        try:
            conversation = next(keys_iterator)
        except Exception as e:
            save_content_to_log(e)
            return naive_massage()

        return list(df[df['msg_pre_processed'] == conversation]['msg_2'])[0]

    def return_conversation_by_page_rank(self,
                                         msg,
                                         conversations,
                                         page_compute,
                                         reverse=True):
        """
		Return a dictionary of message and similarity sorted by highter similarity
		"""

        conversations = self.pp.normalize_dictionary(conversations)

        conversations = {
            k: page_compute[k] + v
            for k, v in conversations.items()
        }

        return {
            k: v
            for k, v in sorted(conversations.items(),
                               key=lambda item: item[1],
                               reverse=reverse)
        }

    def return_conversation_by_cossine(self, msg, res):
        """
		Return a dictionary of message and similarity sorted by highter similarity
		"""
        if res >= 0.5:
            msg_list = self.questions
        else:
            msg_list = self.answers

        similarity = []

        for m in msg_list:
            m = str(m)
            new_msg_list = [msg, m]
            vector_bow = self.bow.fit_transform(new_msg_list)
            msg_bow = vector_bow.todense()[0]
            m_bow = vector_bow.todense()[1]

            d1_array = (1, 1)

            if m_bow.shape == d1_array and msg_bow.shape == d1_array:
                d = 1 - distance.euclidean(msg_bow, m_bow)
            else:
                d = 1 - distance.cosine(msg_bow, m_bow)

            if math.isnan(float(d)):
                similarity.append(0.0)
            else:
                similarity.append(d)
        """

		vector_bow = [self.bow.fit_transform([msg, m]) for m in msg_list]
		msg_bow = [vect.todense()[0] for vect in vector_bow]
		m_bow = [vect.todense()[1] for vect in vector_bow]

		similarity = [1 - distance.cosine(msg_vect, m_vect) for msg_vect, m_vect in zip(msg_bow, m_bow)]
		
		"""
        result = dict(zip(msg_list, similarity))

        return result
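A minimal, self-contained sketch of the per-candidate bag-of-words cosine similarity computed above (same idea as return_conversation_by_cossine, without the project classes):

import math

from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer


def cosine_similarity(msg, candidate):
    # vectorize both messages over a shared vocabulary
    vectors = CountVectorizer().fit_transform([msg, candidate]).toarray()
    d = 1 - distance.cosine(vectors[0], vectors[1])
    # an all-zero vector yields NaN; treat it as "no similarity"
    return 0.0 if math.isnan(d) else d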
Example No. 26
    parser.add_argument("review_limit",
                        help="the number of reviews to be processed")
    args = parser.parse_args()

    try:
        review_limit = int(args.review_limit)
    except ValueError:
        raise Exception("Review limit must be a number")

    if review_limit < 100:
        raise Exception("Review limit must be over 100")

    # step 1 - pre-processing the training data:
    # convert to a combined pandas dataframe,
    # removing stop words and stemming the review text
    pre_processing = PreProcessing(limit_reviews=review_limit)

    df_reviews = pre_processing.get_df_reviews()
    df_meta = pre_processing.get_df_meta()

    combined = pre_processing.filter_and_combine(df_reviews, df_meta)
    reviews_clean = pre_processing.preprocess_reviews(
        combined['reviewTextProcessed'].tolist())
    no_stop_words = pre_processing.remove_stop_words(reviews_clean)
    stemmed_reviews = pre_processing.get_stemmed_text(no_stop_words)

    combined['reviewTextProcessed'] = stemmed_reviews
    combined = pre_processing.change_categories_column(combined)

    combined.to_csv(args.output_file, sep='\t', encoding='utf-8')
    parser.add_argument("review_limit",
                        help="the number of reviews to be processed")
    args = parser.parse_args()

    try:
        review_limit = int(args.review_limit)
    except ValueError:
        raise Exception("Review limit must be a number")

    if review_limit < 100:
        raise Exception("Review limit must be over 100")

    # step 1 - pre-processing the training data:
    # convert to a combined pandas dataframe,
    # removing stop words and stemming the review text
    pre_processing = PreProcessing(limit_reviews=review_limit)

    df_reviews = pre_processing.get_df_reviews()
    df_meta = pre_processing.get_df_meta()

    combined = pre_processing.filter_and_combine(df_reviews, df_meta)
    combined['reviewTextProcessed'] = pre_processing.preprocess_reviews(
        combined['reviewTextProcessed'])
    combined['reviewTextProcessed'] = pre_processing.remove_stop_words(
        combined['reviewTextProcessed'])
    combined['reviewTextProcessed'] = pre_processing.get_stemmed_text(
        combined['reviewTextProcessed'])

    reviews_and_sentiment = combined[['reviewTextProcessed', 'overall']]

    # convert string rating values to numerical values
Example No. 28
    def __init__(self):
        self.pp = PreProcessing()
Example No. 29
    def setUpClass(cls):
        cls.dataset = ds.process(PreProcessing(sentences))
Example No. 30
def main():
    configs = json.load(open('config.json', 'r'))

    # download and process all the datasets involved
    # includes AMZN
    GetData(configs['data']['symbol'], configs['data']['start'],
            configs['data']['end'], configs).get_stock_data()
    amzn_dataloader = DataLoader(
        os.path.join(configs['data']['save_dir'],
                     configs['data']['symbol'] + '.csv'),
        configs['data']['columns'])

    preprocessing = PreProcessing()
    preprocessing.denoise(amzn_dataloader.data, configs)

    all_data = {configs['data']['symbol']: preprocessing.denoised}

    # and the correlated ones
    for correlate in configs['data']['correlates_to']:
        GetData(correlate, configs['data']['start'], configs['data']['end'],
                configs).get_stock_data()
        dataloader = DataLoader(
            os.path.join(configs['data']['save_dir'], correlate + '.csv'),
            configs['data']['columns'])
        preprocessing = PreProcessing()
        preprocessing.denoise(dataloader.data, configs)
        all_data.update({correlate: preprocessing.denoised})

    # save all data preprocessed
    dataframe = pd.DataFrame(all_data)
    filename = os.path.join(configs['preprocessing']['save_dir'],
                            configs['preprocessing']['filename'])
    dataframe.to_csv(filename, index=False)

    dataloader = DataLoader(filename, configs['data']['correlates_to'])
    dataloader.train_test_split(configs['data']['days'],
                                configs['data']['train_test_split'])

    model = Model()
    # build and train model
    model.build(configs, dataloader)

    from keras.utils import plot_model
    plot_model(model.model,
               show_shapes=True,
               to_file="autoencoder-lstm-multivariable-for-prediction.png")

    yhat = model.predict(dataloader.train, dataloader.test,
                         configs['data']['inputs'])

    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 8))
    plt.plot(dataloader.test[0, :, 0].flatten(), label='Real')
    plt.plot(yhat.flatten(), label='Predicted')
    plt.legend()
    plt.show()

    yhat = model.predict(dataloader.train, dataloader.train,
                         configs['data']['inputs'])
    plt.figure(figsize=(10, 8))
    plt.plot(dataloader.train[0, :, 0].flatten(), label='Real')
    plt.plot(yhat.flatten(), label='Predicted')
    plt.legend()
    plt.show()
Example No. 31
    def test_should_create_dataset_dir(self):
        storage = ds.DatasetStorage()
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)

        self.assertTrue(storage.exist(dataset.idx))
Example No. 32
    def test_should_generate_training_pairs(self):
        pre_processing = PreProcessing(sentences)
        dataset = ds.process(pre_processing)
        word_embedding = WordEmbedding(freeze=False, source=dataset.pairs)
        word_embedding.train()
        self.assertEqual(len(dataset.training_pairs(2, word_embedding)), 2)