Example #1
import numpy as np
from matplotlib.dates import date2num


def get_timeline_count(tweets_file, keywords=None, timezones=None):
    # Load the tweets into a dataframe (Loader comes from the surrounding project)
    l = Loader(tweets_file)
    data = l.get_dataframe()

    def valid_keyword(x):
        return any(keyword in x for keyword in keywords)

    def valid_timezone(t):
        return any(timezone in t for timezone in timezones)

    # Keep only tweets matching at least one keyword / time zone
    if keywords is not None:
        data = data[data["text"].apply(valid_keyword)]

    if timezones is not None:
        data = data[data["user_time_zone"].apply(valid_timezone)]

    print(len(data.index))

    # Count tweets per calendar day
    data["created_at"] = data["created_at"].astype("datetime64[ns]")
    a = data["created_at"].groupby(data["created_at"].dt.date).count()

    print(a)
    a = a.to_frame()
    return np.array(date2num(a.index)), np.array(a.values)
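A minimal usage sketch for the function above (the file name and keyword are illustrative, and plotting assumes matplotlib is installed; none of this appears in the original code):

import matplotlib.pyplot as plt

# Daily counts of tweets whose text mentions "brexit" (hypothetical inputs)
days, counts = get_timeline_count("./data/tweets.json", keywords=["brexit"])
plt.plot_date(days, counts.flatten(), linestyle="solid")
plt.show()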
Example #2
import numpy as np


def train(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()
    # Use the first 700 samples for training
    data = data[:700]
    labels = np.array(label_str[:700], dtype=np.int32)
    classifier = BiLstm(2, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    trainer = Trainer(classifier, word2vec.embeddings)
    trainer.train(data, labels)
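A hypothetical conf for the call above, with keys inferred from how conf is indexed in train (the values are placeholders, not from the original project):

conf = {
    'embedding': {'sequence_length': 100},  # also passed whole to Loader
    'text': './data/tweets.txt',
}
train(conf)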
Example #3
import pickle


def predict(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()
    classifier = BiLstm(7, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    predictor = Predictor(classifier)
    res = predictor.predict(data)
    print(res[0])
    # Persist the predictions for later inspection
    with open('./data/breakdown_predict.pik', 'wb') as f:
        pickle.dump(res, f)
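Reading the dump back is symmetric (a sketch; assumes the path and pickle module used above):

import pickle

with open('./data/breakdown_predict.pik', 'rb') as f:
    res = pickle.load(f)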
Example #4
import numpy as np


def train(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()

    # Map raw labels to integer class ids; unknown labels fall back to 0.
    # np.zeros_like(label_str) would inherit label_str's string dtype, so
    # build an int array explicitly instead.
    labels = np.array([gender_mapping.get(val, 0) for val in label_str],
                      dtype=np.int32)

    classifier = BiLstm(4, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    trainer = Trainer(classifier, word2vec.embeddings)
    trainer.train(data, labels)
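gender_mapping is defined elsewhere in the project. Given that the classifier is built with 4 classes and unmapped labels fall back to 0, a hypothetical shape would be:

# Hypothetical mapping; the real one lives in the project, not in this snippet
gender_mapping = {'unknown': 0, 'male': 1, 'female': 2, 'brand': 3}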
Example #5
import logging
import unittest
from unittest.mock import MagicMock, call

class TestLoader(unittest.TestCase):
    def setUp(self) -> None:
        logging.basicConfig(level=logging.CRITICAL)
        self._parser = MagicMock()
        self._downloader = MagicMock()
        self._loader = Loader(self._parser, self._downloader)

    def test_downloads_all_urls_returned_from_parser(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls

        self._loader.start()

        self.check_if_there_is_a_call_for_each_url(urls)

    def test_IF_url_parser_throws_error_THEN_return_false(self):
        self._parser.get_urls.side_effect = Exception("foo")

        success = self._loader.start()

        self.assertFalse(success)

    def check_if_there_is_a_call_for_each_url(self, urls):
        expected_calls = [call(url) for url in urls]
        self._downloader.download.assert_has_calls(expected_calls,
                                                   any_order=True)
        self.assertEqual(len(urls), self._downloader.download.call_count)

    def test_respects_continue_on_error_equals_false(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls
        self._downloader.download.side_effect = Exception("some error")

        self._loader.start()

        self.assertEqual(1, self._downloader.download.call_count)

    def test_respects_continue_on_error_equals_true(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls
        self._downloader.download.side_effect = Exception("some error")

        self._loader.start(continue_on_error=True)

        self.check_if_there_is_a_call_for_each_url(urls)
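To run the suite directly as a script (standard unittest boilerplate, not part of the original snippet):

if __name__ == '__main__':
    unittest.main()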
Example #6
File: test.py  Project: OwenGY/twexit
def test3():
    l = Loader("./data/weight.json")
    print(l.get_dataframe())
Example #7
File: test.py  Project: OwenGY/twexit
def test2():
    l = Loader(data_dir + "brexit_data.json")
    print("Total number of tweets:", len(l.get_tweets()))
    l.remove_retweets()
    print("Without retweets:", len(l.get_tweets()))
Example #8
from loader.loader import Loader
yamlsettings = Loader.loadSettings()
Example #9
def main():
    loader = Loader()
Example #10
import numpy as np
import pandas
from matplotlib.dates import date2num


def count_for_set(df, dates_set, keys):
    # Function header reconstructed from the call sites below; the original
    # snippet started mid-loop.
    counts = {}
    for i, idx in enumerate(df.index):
        tweet = df.loc[idx, "text"]
        day = dates_set[i]
        for key in keys:
            if key in tweet:
                # Count the tweet for its day, then drop it so later
                # keyword sets do not see it again
                counts[day] = counts.get(day, 0) + 1
                df = df.drop(idx)
                break

    return df, counts


l = Loader("./data/May_16.csv")
df = l.get_dataframe()
df["created_at"] = pandas.to_datetime(df["created_at"])
dates = df["created_at"].dt.date.to_frame()
dates = np.array(date2num(dates)).flatten()

print "==== Size : {} ====".format(len(df.index))

# Remove tweets containing keywords mapped to a fixed sentiment
df, counts_leave = count_for_set(df, dates, leave_keys)
df, counts_other = count_for_set(df, dates, other_keys)
df, counts_stay = count_for_set(df, dates, stay_keys)

print "==== Size : {} ====".format(len(df.index))

print "Days   : ", dates