Example No. 1
    def get_tweets(self):

        api = Api.twitter_api()
        tweets_ = []
        created = []
        hashtag = []

        for search_word in self.search_words:

            new_search = search_word + " -filter:retweets"  # Filter retweets
            tweets = tw.Cursor(api.search, q=new_search,
                               lang='en').items(self.n_tweets)

            for tweet in tweets:
                tweets_.append(tweet.text)  # Get tweets
                created.append(tweet.created_at)  # Get timestamp
                hashtag.append(search_word)

        dataset = pd.DataFrame({
            "hashtag": hashtag,
            "created_at": created,
            "tweet": tweets_
        })
        store_data = DataHandler('twitter', self.search_words)
        store_data.store_network_dataset(dataset)
Example No. 2
    def vader_sentiment(self):

        handler = DataHandler(self.social_network, self.search_word)
        df_network = handler.read_network_dataset()
        df = df_network[df_network.tweet != '']

        prepross = Processing(self.social_network, self.search_word)
        analyzer = SentimentIntensityAnalyzer()

        predict_df = pd.DataFrame(
            None,
            columns=['date', 'hashtag', 'tweet', 'clean_tweet', 'sentiment'])

        for i, row in df.iterrows():

            clean_tweet = prepross.clean_text(row['tweet'])
            sentiment = analyzer.polarity_scores(clean_tweet)['compound']
            predict_df.loc[i] = [
                row['created_at'], row['hashtag'], row['tweet'], clean_tweet,
                sentiment
            ]

        predict_df.to_csv(r'data/output/dataset_predict.csv',
                          sep=';',
                          index=False)
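For context, the 'compound' field used above is VADER's normalized summary score; a minimal standalone check, assuming the vaderSentiment package (one common source of SentimentIntensityAnalyzer):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# 'compound' is a normalized score in [-1, 1]; the sign gives the polarity
print(analyzer.polarity_scores("I love this library")['compound'])
print(analyzer.polarity_scores("This is awful")['compound'])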
Example No. 3
 def load_order_csv(self, path, columns=None):
     # Load order csv data
     data = self.load_file(path, columns)
     # Parse columns
     dh = DataHandler()
     data = dh.format_csv_actions(data)
     dh.format_csv_dates(data)
     return data
Example No. 4
 def train_sift(self, X_list):
     bow_list = []
     for X in X_list:
         bow_list.append(self.compute(X))
     # Row-stack the per-image bag-of-words vectors into one matrix
     # (np.vstack takes the whole sequence as a single argument)
     self.bow_matrix = np.vstack(bow_list)
     dh = DataHandler()
     dh.load()
     sample_y = np.empty((len(X_list), 1))
     for i in range(len(sample_y)):
         sample_y[i][0] = dh.get_lables(id=i)
     # np.hstack expects a single tuple of arrays
     sample_data = np.hstack((sample_y, self.bow_matrix))
     # save sample data: label column followed by the bag-of-words features
     np.savetxt(os.path.join(self.bow_path, 'bow_sift.txt'), sample_data)
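A small sanity check of the stacking pattern above on synthetic data; note that np.vstack and np.hstack each take a single sequence of arrays:

import numpy as np

bow_list = [np.random.rand(1, 5) for _ in range(3)]
bow_matrix = np.vstack(bow_list)                      # row-stack: shape (3, 5)
sample_y = np.arange(3, dtype=float).reshape(-1, 1)   # label column: shape (3, 1)
sample_data = np.hstack((sample_y, bow_matrix))       # labels followed by features
print(sample_data.shape)                              # (3, 6)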
Example No. 5
    def bug_count(self):

        data_handler = DataHandler(data_path=root.joinpath('data'))
        files = data_handler.get_data()
        all_results = dict()
        for proj, data in files.items():
            col_name = ['Date', 'Actual', 'ARIMA', 'NAIVE']
            results = []
            actual = 0
            for train, test in self.moving_window(data, frame=24):
                try:
                    p, d, q = 4, 1, 4

                    # if not self.is_stationary(train):
                    #     train = self.detrend_series(train)

                    arima = ARIMA(train, order=(p, d, q), freq='W-MON')
                    arima_fit = arima.fit(disp=0)

                    # Find start and end time stamps
                    start, end = test.index[0], test.index[-1]

                    # Save date, actual, and forecast
                    prev_actual = actual
                    actual = test.values.ravel()[0]

                    forecast_arima = int(abs(arima_fit.forecast()[0]))
                    forecast_naive = prev_actual
                    date = test.index.strftime("%Y-%m-%d").values[0]
                    results.append(
                        [date, actual, forecast_arima, forecast_naive])
                except Exception:
                    X = np.arange(len(train.values) + 1)
                    X = np.reshape(X, (len(X), 1))
                    y = train.values
                    model = LinearRegression()
                    model.fit(X[:-1], y)
                    prev_actual = actual
                    actual = test.values.ravel()[0]
                    forecast_arima = int(
                        abs(model.predict(X[-1].reshape(1, -1))[0]))
                    forecast_naive = prev_actual
                    date = test.index.strftime("%Y-%m-%d").values[0]
                    results.append(
                        [date, actual, forecast_arima, forecast_naive])

            results = pd.DataFrame(results, columns=col_name).set_index('Date')
            results.to_csv(root.joinpath('results', proj + ".csv"))
            all_results[proj] = results  # collect per-project results for the caller

        return all_results
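The except branch above falls back to a linear trend when the ARIMA fit raises; a self-contained sketch of that fallback on synthetic weekly counts:

import numpy as np
from sklearn.linear_model import LinearRegression

y = np.array([3, 4, 6, 7, 9, 12], dtype=float)   # synthetic bug counts
X = np.arange(len(y) + 1).reshape(-1, 1)         # time index, one step past the data
model = LinearRegression().fit(X[:-1], y)
next_step = model.predict(X[-1].reshape(1, -1))[0]
print(int(abs(next_step)))                       # one-step-ahead trend forecast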
Example No. 6
def get_character_mentions(name):
    """
    Perform a lookup for all the sentences in which
    a character has been mentioned, if we don't have
    a match on the name we return all the sentences
    for the main characters
    Parameters:
        name str: The name of a character, could be None
    Returns:
    A list of all the sentences for the given character
    or the list for every main character
    """
    logger.info(
        f'CALLED FROM: {GET_CHARACTER_MENTIONS} ENDPOINT: Displaying the sentences for a character mentions or the sentences for all main characters.'
    )

    data = DataHandler().get_data()

    if not data:
        abort(404, description=MISSING_DATA_STR)
    elif not name:
        abort(404, description='No name provided.')
    elif name in data:
        res = list(data[name][SENT].keys())
    else:
        main_characters_names = list(
            filter(lambda x: data[x].get(RANK) == 1, data))
        res = {}
        for name in main_characters_names:
            res[name] = list(data[name][SENT].keys())

    return jsonify(res)
Example No. 7
def get_characters_co_mentions(name_a, name_b):
    """
    Display all the sentences in which we have both entities
    else an error
    Parameters:
        name_a str: The name of the first character from the book
        name_b str: The name of the second character from the book
    Returns:
    The list of sentences or 404
    """
    logger.info(
        f'CALLED FROM: {GET_CHARACTER_CO_MENTIONS} ENDPOINT: Displaying the sentences in which we have both character_a and character_b.'
    )

    data = DataHandler().get_data()

    if not data:
        logger.info(
            f'CALLED FROM: {GET_CHARACTER_CO_MENTIONS} ENDPOINT: No data in the collection.'
        )
        abort(404, description=MISSING_DATA_STR)
    if not all((name_a, name_b)):
        abort(404, description='Missing name argument.')

    sents = set()
    if name_a in data and name_b in data:
        for sent in data[name_a][SENT]:
            if name_b in sent:
                sents.add(sent)
        for sent in data[name_b][SENT]:
            if name_a in sent:
                sents.add(sent)

    return jsonify(list(sents))
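The data layout behind these endpoints is not shown here; a minimal, self-contained illustration of the co-mention filtering, with the schema assumed (the real key comes from the SENT constant):

SENT = 'sentences'  # assumed key name
data = {
    'Alice': {SENT: {'Alice met Bob.': 1, 'Alice left early.': 1}},
    'Bob': {SENT: {'Bob saw Alice at the market.': 1}},
}
name_a, name_b = 'Alice', 'Bob'
sents = set()
for sent in data[name_a][SENT]:
    if name_b in sent:
        sents.add(sent)
for sent in data[name_b][SENT]:
    if name_a in sent:
        sents.add(sent)
print(sorted(sents))  # sentences mentioning both characters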
Example No. 8
def get_character_info(name):
    """
    Performs a lookup in the data collection for a
    specific name and if we have a match returns is schema,
    else it return 404 with the error
    Parametes:
        name str: The name of the named entity
    Returns:
        the generate schema
    """
    logger.info(
        f'CALLED FROM: {GET_CHARACTER_INFO} ENDPOINT: Performing a lookup on an entity.'
    )

    data = DataHandler().get_data()

    if not data:
        abort(404, description=MISSING_DATA_STR)
    elif not name:
        abort(404, description='No name provided.')
    elif name not in data:
        abort(404, description=f"The name: {name} is not in the collection")

    res_entity = extract_entity(data[name])
    return jsonify(res_entity)
Example No. 9
    def pre_processing(self):

        handler = DataHandler(self.social_network, self.search_word)
        df_network = handler.read_network_dataset()
        df = df_network[df_network.tweets != '']

        nlp = spacy.load('pt_core_news_sm')
        #nlp = spacy.load('en_core_web_sm')
        nltk.download("stopwords")
        nltk.download('punkt')

        stop_words_ = STOP_WORDS.union(stopwords.words('english'))
        stop_words = [unidecode(stop).lower() for stop in stop_words_]

        nltk.download('rslp')

        all_words, all_words_n_gram = Processing.words_dataset(
            df['tweets'], stop_words, nlp)  # Get all dataset words

        bag_words = []
        bag_words_n_gram = []
        n_gram = []
        clean_tweets = []

        for sentence in df['tweets']:
            clean = Processing.clean_text(sentence, stop_words)
            token = Processing.lemma(clean.split(), nlp)
            concat = ' '.join(token)
            ngram = Processing.n_gram(concat)
            n_gram.append(ngram)
            bag_words_n_gram.append(
                Processing.bag_of_words(ngram, all_words_n_gram))
            bag_words.append(Processing.bag_of_words(concat.split(),
                                                     all_words))
            clean_tweets.append(concat)

        Processing.word_cloud(clean_tweets)

        dataset = pd.DataFrame({
            "Posts": clean_tweets,
            "BOW": bag_words,
            "N-gram": n_gram,
            "BOW-N": bag_words_n_gram
        })
        handler.store_processed_dataset(dataset)
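Processing.bag_of_words is project code; a generic illustration of the idea it implements, using a presence vector over a fixed vocabulary (toy data, and a simple variant that may differ from the project's implementation):

def bag_of_words(tokens, vocabulary):
    # 1/0 presence vector over a fixed vocabulary
    return [1 if word in tokens else 0 for word in vocabulary]

vocab = ['game', 'great', 'boring']
print(bag_of_words('the game was great'.split(), vocab))  # [1, 1, 0]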
Example No. 10
def run_evaluation(cfg):
    """Test a tensorflow model"""
    logger = logging.getLogger(run_evaluation.__name__)

    data, ground_truth = DataHandler(cfg.data,
                                     cfg.eval_n_elements,
                                     shuffle=False).next_batch()

    prediction = np.zeros(ground_truth.shape)
    samples = prediction.shape[0]
    tictoc = np.zeros(samples)
    print_every = 2500

    logger.info('Load Tensorflow model')
    with TensorflowWrapper(os.path.dirname(cfg.model),
                           os.path.basename(cfg.model),
                           cfg.use_snapshots) as model:

        logger.info('Start model evaluation')
        for i in range(samples):

            start_time = time.time()
            prediction[i, :] = model.inference(data[i, :])
            tictoc[i % print_every] = time.time() - start_time

            next_i = i + 1
            if next_i > 0 and next_i % print_every == 0 or next_i == samples:
                logger.info(
                    'Processed: {:6.2f}% ({:5d}/{}) ({:.{width}f} ms/sample)'.
                    format(next_i / samples * 100.0,
                           next_i,
                           samples,
                           np.mean(tictoc) * 1e3,
                           width=math.ceil(math.log10(samples))))

    # Compute the loss
    error = np.mean(np.abs(ground_truth - prediction), axis=0)
    combined_error = np.mean(error)

    logger.info('Evaluation error: {:.5f} ({:.5f}/{:.5f})'.format(
        combined_error, error[0], error[1]))

    # Write the results into a .csv file
    if cfg.write_result:
        add_header_line = not os.path.isfile(cfg.results)

        with open(cfg.results, 'a') as capture:
            if add_header_line:
                capture.write(
                    'date_time,model,error_combined,error_linear,error_angular,execution_time\n'
                )

            capture.write('{},{},{},{},{},{}\n'.format(
                time.strftime('%Y-%m-%d %H:%M:%S'), cfg.model, combined_error,
                error[0], error[1], np.mean(tictoc)))
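The evaluation error above is a per-output mean absolute error followed by an overall mean; a standalone check of that reduction on toy values:

import numpy as np

ground_truth = np.array([[0.1, 0.0], [0.2, 0.1]])
prediction = np.array([[0.1, 0.1], [0.3, 0.1]])
error = np.mean(np.abs(ground_truth - prediction), axis=0)  # per-output MAE, ~[0.05, 0.05]
print(error)
print(np.mean(error))  # combined error, ~0.05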
Example No. 11
class TestDataHandler(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestDataHandler, self).__init__(*args, **kwargs)
        self.dh = DataHandler(data_path=root.joinpath("data"))

    def test_get_data(self):
        all_data = self.dh.get_data()
        self.assertIsInstance(all_data, dict)
        for proj, datasets in all_data.items():
            self.assertIsInstance(proj, str)
            self.assertIsInstance(datasets, dict)
            for key, value in datasets.items():
                self.assertIsInstance(key, str)
                self.assertIsInstance(value, pd.core.frame.DataFrame)
Example No. 12
def get_main_characters():
    """
    Get all the entities from the data collection
    that have the rank = 1
    Returns:
    The generated schema for all the main entities
    """
    logger.info(
        f'CALLED FROM: {GET_MAIN_CHARACTERS} ENDPOINT: Displaying all the main characters.'
    )

    data = DataHandler().get_data()

    if not data:
        abort(404, description=MISSING_DATA_STR)
    main_characters_names = list(
        filter(lambda x: data[x].get(RANK) == 1, data))
    main_characters = get_entities(main_characters_names, data)

    return jsonify(create_response_schema(main_characters))
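The rank filter above iterates over the dict's keys; a toy reproduction, assuming RANK is the string 'rank':

RANK = 'rank'  # assumed value of the constant
data = {'Alice': {RANK: 1}, 'Bob': {RANK: 2}, 'Cook': {RANK: 1}}
main_characters_names = list(filter(lambda x: data[x].get(RANK) == 1, data))
print(main_characters_names)  # ['Alice', 'Cook']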
Example No. 13
def get_support_characters():
    """
    Get all the entities from the data collection
    that have the rank = 2
    Returns:
    The generated schema for all the support entities
    """
    logger.info(
        f'CALLED FROM: {GET_SUPPORT_CHARACTES} ENDPOINT: Displaying all the support characters.'
    )

    data = DataHandler().get_data()

    if not data:
        abort(404, description=MISSING_DATA_STR)

    secondary = list(filter(lambda x: data[x].get(RANK) == 2, data))
    secondary_characters = get_entities(secondary, data)

    return jsonify(create_response_schema(secondary_characters))
Example No. 14
def get_episode_characters():
    """
    Get 10 random entities from the data collection
    that have the rank = 3
    Returns:
    The generated schema for all the episode entities
    """
    logger.info(
        f'CALLED FROM: {GET_EPISODE_CHARACTERS} ENDPOINT: Displaying all the episode characters.'
    )

    data = DataHandler().get_data()

    if not data:
        abort(404, description=MISSING_DATA_STR)

    episode_names = list(filter(lambda x: data[x].get(RANK) == 3, data))
    random.shuffle(episode_names)
    episode_characters = get_entities(episode_names[:10], data)

    return jsonify(create_response_schema(episode_characters))
Example No. 15
    def run(self):
        logger = logging.getLogger(__name__)

        # Folder where to store snapshots, meta data and the final model
        storage_path = os.path.join(
            self.args.train_dir,
            (time.strftime('%Y-%m-%d_%H-%M_') + model.NAME))

        logger.info('Build Tensorflow Graph')
        with tf.Graph().as_default():

            # Define the used machine learning model
            global_step, learning_rate = model.learning_rate(
                self.args.learning_rate)

            self.sess = tf.Session(config=tf.ConfigProto(
                intra_op_parallelism_threads=8))

            logger.info('Create the data runner for the input data')
            self.custom_data_runner = CustomDataRunner(
                self.args.datafile_train, self.args.batch_size, 2**18)
            data_batch, cmd_batch = self.custom_data_runner.get_inputs()

            logger.info('Add operations to the computation graph')
            keep_prob_placeholder = tf.placeholder(
                tf.float32, name='keep_prob_placeholder')
            prediction = model.inference(data_batch,
                                         keep_prob_placeholder,
                                         self.args.batch_size,
                                         output_name='prediction')

            loss, loss_split = model.loss(prediction, cmd_batch)

            train_op = model.training(loss, loss_split, learning_rate,
                                      global_step)

            eval_data_placeholder, eval_cmd_placeholder = self.placeholder_inputs(
                1083, 2, 'eval_data_input')
            eval_prediction = model.inference(eval_data_placeholder,
                                              keep_prob_placeholder,
                                              self.eval_batch_size,
                                              training=False,
                                              reuse=True,
                                              output_name='eval_prediction')
            eval_predictions_placeholder = tf.placeholder(
                tf.float32, shape=[self.eval_n_elements, 2])
            evaluation, evaluation_split = model.evaluation(
                eval_predictions_placeholder, eval_cmd_placeholder)

            # This model is saved with the trained weights and can directly be executed
            exe_data_placeholder, exe_cmd_placeholder = self.placeholder_inputs(
                1083, 2)
            model_inference = model.inference(exe_data_placeholder,
                                              keep_prob_placeholder,
                                              1,
                                              training=False,
                                              reuse=True,
                                              output_name='model_inference')

            # Variables to use in the summary (shown in tensorboard)
            train_loss = tf.scalar_summary('loss', loss)
            train_loss_lin = tf.scalar_summary('loss_linear_x', loss_split[0])
            train_loss_ang = tf.scalar_summary('loss_angular_yaw',
                                               loss_split[1])
            train_learning_rate = tf.scalar_summary('learning_rate',
                                                    learning_rate)

            eval_loss = tf.scalar_summary('loss', evaluation)
            eval_loss_lin = tf.scalar_summary('loss_linear_x',
                                              evaluation_split[0])
            eval_loss_ang = tf.scalar_summary('loss_angular_yaw',
                                              evaluation_split[1])

            summary_op = tf.merge_summary([
                train_loss, train_loss_lin, train_loss_ang, train_learning_rate
            ])
            eval_summary_op = tf.merge_summary(
                [eval_loss, eval_loss_lin, eval_loss_ang])

            # Saver for model snapshots
            saver = tf.train.Saver()

            self.sess.run(tf.initialize_all_variables())

            # start the tensorflow QueueRunners
            self.coord = tf.train.Coordinator()
            self.runners = tf.train.start_queue_runners(sess=self.sess,
                                                        coord=self.coord)
            # start our custom queue runner's threads
            self.custom_data_runner.start_threads(self.sess, self.coord)

            # Save summaries for training and evaluation in separate folders
            summary_writer = tf.train.SummaryWriter(
                os.path.join(storage_path, 'train'), self.sess.graph)
            eval_summary_writer = tf.train.SummaryWriter(
                os.path.join(storage_path, 'eval'), self.sess.graph)

            # Save the tensorflow graph definition as protobuf file (does not include weights)
            tf.train.write_graph(self.sess.graph_def, storage_path,
                                 'graph.pb', False)

            # Vector to average the duration over the last report steps
            duration_vector = [0.0] * (self.args.eval_steps // 100)

            if self.args.weight_initialize:

                logger.info('Initialize with weights from another model')

                if os.path.exists(self.args.weight_initialize):
                    saver.restore(self.sess, self.args.weight_initialize)
                    logger.info('Model restored: {}'.format(
                        self.args.weight_initialize))
                else:
                    logger.warning('No weights are loaded!')
                    logger.warning('File does not exist: {}'.format(
                        self.args.weight_initialize))

            logger.info('Load the evaluation data')
            (X_eval, Y_eval) = DataHandler(self.args.datafile_eval,
                                           self.eval_n_elements,
                                           shuffle=False).next_batch()
            X_eval = self.check_extend(
                X_eval,
                np.ceil(self.eval_n_elements / self.eval_batch_size) *
                self.eval_batch_size)

            loss_train = 0.0
            # Perform all training steps

            logger.info('Training begins')
            for step in range(self.args.max_steps):
                start_time = time.time()

                feed_dict = {keep_prob_placeholder: 0.5}
                _, loss_value, loss_split_value, summary_str = self.sess.run(
                    [train_op, loss, loss_split, summary_op],
                    feed_dict=feed_dict)

                duration = time.time() - start_time

                # Report every 100 steps
                if step > 0 and step % 100 == 0:
                    # Print status to stdout.
                    logger.info(
                        'Step {}: loss = ({:.4f},{:.4f}) {:.3f} msec'.format(
                            step, loss_split_value[0], loss_split_value[1],
                            duration / 1e-3))
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()
                    loss_train = loss_value

                    # Replace the durations in fifo fashion
                    duration_vector[((step % self.args.eval_steps) //
                                     100)] = duration

                # Evaluate the model
                if step > 0 and step % self.args.eval_steps == 0 or step == (
                        self.args.max_steps - 1):
                    start_eval = time.time()

                    # Create an empty array that has the correct size to hold all predictions
                    eval_predictions = np.zeros([X_eval.shape[0], 2],
                                                dtype=np.float)

                    # Evaluate the data in batches and capture the predictions
                    for index in range(X_eval.shape[0] //
                                       self.eval_batch_size):
                        start_index = index * self.eval_batch_size
                        end_index = (index + 1) * self.eval_batch_size
                        feed_dict = {
                            eval_data_placeholder:
                            X_eval[start_index:end_index, :],
                            keep_prob_placeholder: 1.0
                        }
                        eval_predictions[
                            start_index:end_index, :] = self.sess.run(
                                [eval_prediction], feed_dict=feed_dict)[0]

                    # Finally evaluate the predictions and compute the scores
                    feed_dict = {
                        eval_predictions_placeholder:
                        eval_predictions[:self.eval_n_elements, :],
                        eval_cmd_placeholder:
                        Y_eval,
                        keep_prob_placeholder:
                        1.0
                    }

                    loss_value, loss_split_value, summary_str = self.sess.run(
                        [evaluation, evaluation_split, eval_summary_op],
                        feed_dict=feed_dict)

                    duration_eval = time.time() - start_eval

                    logger.info(
                        'Evaluation: loss = ({:.4f},{:.4f}) {:.3f} msec'.
                        format(loss_split_value[0], loss_split_value[1],
                               duration_eval / 1e-3))

                    eval_summary_writer.add_summary(summary_str, step)
                    eval_summary_writer.flush()

                if step > 0 and step % 1000 == 0:
                    # Save a checkpoint
                    logger.info('Save model snapshot')
                    filename = os.path.join(storage_path, 'snap')
                    saver.save(self.sess, filename, global_step=step)

            logger.info('Save final model snapshot')
            filename = os.path.join(storage_path, 'final')
            saver.save(self.sess, filename)

            # Save the model with weights in one file
            # This will only capture the operations used to generate the prediction. It also
            # replaces the variables with the weights from training as constant values
            # See:
            # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py
            logger.info('Save final model with weights')
            output_node_names = 'model_inference'
            output_graph_def = tf.python.client.graph_util.convert_variables_to_constants(
                self.sess, self.sess.graph_def, output_node_names.split(","))
            with tf.gfile.GFile(os.path.join(storage_path, 'model.pb'),
                                "wb") as f:
                f.write(output_graph_def.SerializeToString())
                logger.info("{} ops in the final graph.".format(
                    len(output_graph_def.node)))

        if self.args.mail:
            self.send_notification(loss_train, loss_value)
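The duration_vector bookkeeping above acts as a small ring buffer indexed by the report step; a toy check of the indexing, with the step number stored as the value just to make the overwrites visible (eval_steps is a stand-in for args.eval_steps):

eval_steps = 500
duration_vector = [0.0] * (eval_steps // 100)   # one slot per 100-step report
for step in range(100, 1100, 100):
    duration_vector[(step % eval_steps) // 100] = float(step)
print(duration_vector)  # older entries are overwritten once step wraps past eval_steps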
Example No. 16
 def test_something(self):
     dh = DataHandler()
     dh.parse_data("design.json")
Example No. 17
from data.data_loader import DataLoader
from data.data_handler import DataHandler
from data.order_parser import OrderParser

# Module-level singleton instances; note that these rebind (shadow) the imported names
DataLoader = DataLoader()
DataHandler = DataHandler()
OrderParser = OrderParser
Example No. 18
    parser.add_argument('-i', action='store', dest='id_upper', type=int)
    parser.add_argument('-c', action='store_true', help='train classifier')
    parser.add_argument('-f', action='store', dest='file', type=str, help='parse a clothes image')
    args = parser.parse_args()


    if args.cmd == 'train':
        if args.b:
            if args.s:
                train_bow_sift(args.id_upper)
            elif args.p:
                train_bow_pixel()
        if args.c:
            train_clf('pixel')

    if args.cmd == 'test':
        kmeans = KmeansModel()
        kmeans.load('kmeans_pixel')
        clf = RandomForest()
        clf.load()
        data = DataHandler()
        data.load()
        if args.file:
            for res in clf.predict(kmeans, args.file):
                print(int(res), data.tell_label(int(res)))

    if args.cmd == 'data':
        data = DataHandler()
        data.parse_data('design.json')
        data.save()
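The snippet starts after the parser is created; a self-contained sketch of the argument setup it implies (the positional cmd argument and the -b/-s/-p flags are inferred from the branches, so their names and help texts are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('cmd', choices=['train', 'test', 'data'])   # assumed positional command
parser.add_argument('-b', action='store_true', help='train a bag-of-words model (assumed)')
parser.add_argument('-s', action='store_true', help='use SIFT features (assumed)')
parser.add_argument('-p', action='store_true', help='use raw pixels (assumed)')
parser.add_argument('-i', action='store', dest='id_upper', type=int)
parser.add_argument('-c', action='store_true', help='train classifier')
parser.add_argument('-f', action='store', dest='file', type=str, help='parse a clothes image')
args = parser.parse_args(['data'])
print(args.cmd, args.c, args.file)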
Example No. 19
    # Climb up the directory tree until you reach the project root
    root = root.parent

if str(root) not in sys.path:
    sys.path.append(str(root))

from metrics.abcd import ABCD
from data.data_handler import DataHandler
from prediction.model import PredictionModel

import warnings

warnings.filterwarnings("ignore")

if __name__ == "__main__":
    dh = DataHandler()
    data = dh.get_data(top_k=1)
    for _, val in data.items():
        data = val

    X = data[data.columns[:-1]]
    y = data[data.columns[-1]]
    # lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
    # model = SelectFromModel(lsvc, prefit=True)
    # X = model.transform(X)
    pca = PCA(n_components=3)
    pca.fit(X)
    X = pca.transform(X)
    colors = ['navy', 'darkorange']

    for X_transformed, title in [(X, "PCA")]:
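The plotting loop is cut off above; a standalone sketch of the same PCA reduction on synthetic features, which also shows how much variance the three components retain:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(100, 10)              # synthetic feature matrix
pca = PCA(n_components=3).fit(X)
X_3d = pca.transform(X)                  # same fit/transform pattern as above
print(X_3d.shape)                        # (100, 3)
print(pca.explained_variance_ratio_)     # variance share kept by each component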
Example No. 20
from config import cifar_configs
from train.trainer import Trainer
from data.data_handler import DataHandler

if __name__ == "__main__":
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--config-file", default = "./config/train_config.yaml", metavar = "FILE", type = str)
    # args = parser.parse_args()

    # #extract config
    # config_file = open(args.config_file, 'r')
    # configs = yaml.load(config_file)
    datahandler = DataHandler(cifar_configs)
    trainer = Trainer(cifar_configs, datahandler)
    trainer.train()
Example No. 21
import os
import sys

from pdb import set_trace
from prettytable import PrettyTable

from pathlib import Path

root = Path(os.path.abspath(os.path.join(os.getcwd().split("src")[0], 'src')))

if root not in sys.path:
    sys.path.append(str(root))

from metrics.abcd import ABCD
from data.data_handler import DataHandler
from prediction.model import PredictionModel

if __name__ == "__main__":
    dh = DataHandler()
    mdl = PredictionModel()
    data = dh.get_data()

    "Create a Table than can pretty printed"
    results = PrettyTable()
    results.field_names = ["Train", "Test ", "   Pd", "   Pf", "   F1"]

    "Align Data"
    results.align["Train"] = "l"
    results.align["Test "] = "l"
    results.align["   Pd"] = "r"
    results.align["   Pf"] = "r"
    results.align["   F1"] = "r"

    for proj, dataset in data.items():
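The loop body that fills the table is cut off above; a minimal, self-contained sketch of how such a PrettyTable is typically populated and printed (the row values are placeholders):

from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Train", "Test ", "   Pd", "   Pf", "   F1"]
table.align["Train"] = "l"
table.add_row(["projA", "projB", 0.72, 0.18, 0.66])  # placeholder metrics
print(table)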
Example No. 22
    def get_tweets(self):

        api = Api.twitter_api()
        tweets = api.user_timeline(screen_name=self.tt_user,
                                   count=200,
                                   tweet_mode='extended',
                                   include_rts=False,
                                   exclude_replies=True)
        last_id = tweets[-1].id
        while True:
            more_tweets = api.user_timeline(screen_name=self.tt_user,
                                            count=200,
                                            include_rts=False,
                                            exclude_replies=True,
                                            max_id=last_id - 1)
            if len(more_tweets) == 0:
                break
            else:
                last_id = more_tweets[-1].id - 1
                tweets = tweets + more_tweets

        created = []
        tweet_id = []
        text = []
        hashtags = []
        symbols = []
        image_url = []
        user_mentions = []
        user_id = []
        user_name = []
        user_screen_name = []
        user_location = []
        user_description = []
        user_protected = []
        user_followers_count = []
        user_friends_count = []
        user_listed_count = []
        user_created_at = []
        user_favourites_count = []
        user_utc_offset = []
        user_timezone = []
        user_geo_enabled = []
        user_verified = []
        user_statuses_count = []
        user_lang = []
        user_contributors_enabled = []
        user_is_translator = []
        user_is_translation_enabled = []
        quoted_status = []
        quoted_text = []
        quoted_media = []
        quoted_user_id = []

        for tweet in tweets:

            created.append(tweet.created_at)
            tweet_id.append(tweet.id)
            try:
                text.append(tweet.full_text)
            except AttributeError:
                text.append(tweet.text)
            hashtags.append(tweet.entities['hashtags'])
            symbols.append(tweet.entities['symbols'])
            user_mentions.append(tweet.entities['user_mentions'])
            user_id.append(tweet.user.id)
            user_name.append(tweet.user.name)
            user_screen_name.append(tweet.user.screen_name)
            user_location.append(tweet.user.location)
            user_description.append(tweet.user.description)
            user_protected.append(tweet.user.protected)
            user_followers_count.append(tweet.user.followers_count)
            user_friends_count.append(tweet.user.friends_count)
            user_listed_count.append(tweet.user.listed_count)
            user_created_at.append(tweet.user.created_at.strftime("%Y-%m-%d"))
            user_favourites_count.append(tweet.user.favourites_count)
            user_utc_offset.append(tweet.user.utc_offset)
            user_timezone.append(tweet.user.time_zone)
            user_geo_enabled.append(tweet.user.geo_enabled)
            user_verified.append(tweet.user.verified)
            user_statuses_count.append(tweet.user.statuses_count)
            user_lang.append(tweet.user.lang)
            user_contributors_enabled.append(tweet.user.contributors_enabled)
            user_is_translator.append(tweet.user.is_translator)
            user_is_translation_enabled.append(
                tweet.user.is_translation_enabled)

            if tweet.is_quote_status:

                try:
                    quoted_text.append(tweet.quoted_status.text)
                except AttributeError:
                    quoted_text.append(np.nan)
                try:
                    quoted_user_id.append(tweet.quoted_status.user.id)
                except AttributeError:
                    quoted_user_id.append(np.nan)
                try:
                    quoted_media.append(
                        tweet.quoted_status.entities['media'][0]['media_url'])
                except Exception:
                    quoted_media.append(np.nan)
            else:
                quoted_text.append(np.nan)
                quoted_user_id.append(np.nan)
                quoted_media.append(np.nan)

            try:
                image_url.append(tweet.entities['media'][0]['media_url'])
            except (KeyError, IndexError):
                image_url.append(np.nan)

        dataset = pd.DataFrame({
            "created_at": created,
            "tweet_id": tweet_id,
            "text": text,
            "hashtags": hashtags,
            "symbols": symbols,
            "image_url": image_url,
            "user_mentions": user_mentions,
            "user_id": user_id,
            "user_name": user_name,
            "user_screen_name": user_screen_name,
            "user_location": user_location,
            "user_description": user_description,
            "user_protected": user_protected,
            "user_followers_count": user_followers_count,
            "user_friends_count": user_friends_count,
            "user_listed_count": user_listed_count,
            "user_created_at": user_created_at,
            "user_favourites_count": user_favourites_count,
            "user_utc_offset": user_utc_offset,
            "user_timezone": user_timezone,
            "user_geo_enabled": user_geo_enabled,
            "user_verified": user_verified,
            "user_statuses_count": user_statuses_count,
            "user_lang": user_lang,
            "user_contributors_enabled": user_contributors_enabled,
            "user_is_translator": user_is_translator,
            "user_is_translation_enabled": user_is_translation_enabled,
            "quoted_text": quoted_text,
            "quoted_media": quoted_media,
            "quoted_user_id": quoted_user_id,
        })

        store_data = DataHandler('twitter', self.tt_user)
        store_data.store_network_dataset(dataset)
Example No. 23
 def __init__(self, *args, **kwargs):
     super(TestDataHandler, self).__init__(*args, **kwargs)
     self.dh = DataHandler(data_path=root.joinpath("data"))
Example No. 24
    #     transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)),
    # ])
    transform_test = transforms.Compose([
        transforms.ToPILImage(),
        # transforms.ToTensor(),
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # transforms=[transform_train,transform_test]

    dataset_configs = (train_dataset_config, test_dataset_config)
    datahandler = DataHandler(ds_class=(MenWomenDataset, MenWomenDataset),
                              transforms=None,
                              dataset_configs=dataset_configs,
                              configs=cfgs)
    # datahandler.show_batch(1,6,mode='train')

    # trainer building
    trainer_configs = {
        'model_path': '',
        'validate': 0.7,
        'lr': 0.001,
        'num_epochs': 10,
        'steps_save_loss': 2,
        'output_folder':
        'C:\\Users\\thanhdh6\\Documents\\projects\\vinbrain_internship\\image_classifier\\train\\logs',
        'device': 'cpu',
        'gpu_id': 0,
        'lr_schedule': None,