예제 #1
0
    def test_base_36_conversion(self):
        # Exercise both base-36 helpers: known round-trip values first,
        # then the errors each helper raises for missing or bad arguments.
        to_id36 = helpers.convert_numeric_id_to_id36
        to_numeric = helpers.convert_id36_to_numeric_id

        # Each (numeric, base-36) pair must convert correctly both ways.
        for numeric_id, id36 in ((295, '87'), (275492, '5wkk')):
            self.assertEqual(to_id36(numeric_id), id36)
            self.assertEqual(to_numeric(id36), numeric_id)

        # numeric -> id36: no argument is a TypeError; a string or a
        # negative number is a ValueError.
        with self.assertRaises(TypeError):
            to_id36()
        for bad_numeric in ('1', -1):
            with self.assertRaises(ValueError):
                to_id36(bad_numeric)

        # id36 -> numeric: no argument is a TypeError; a prefixed name or
        # a non-string value is a ValueError.
        with self.assertRaises(TypeError):
            to_numeric()
        for bad_id36 in ('t3_87', 87):
            with self.assertRaises(ValueError):
                to_numeric(bad_id36)
예제 #2
0
    def test_base_36_conversion(self):
        # Exercise both base-36 helpers: known round-trip values first,
        # then the errors each helper raises for missing or bad arguments.
        to_id36 = helpers.convert_numeric_id_to_id36
        to_numeric = helpers.convert_id36_to_numeric_id

        # Each (numeric, base-36) pair must convert correctly both ways.
        for numeric_id, id36 in ((295, '87'), (275492, '5wkk')):
            self.assertEqual(to_id36(numeric_id), id36)
            self.assertEqual(to_numeric(id36), numeric_id)

        # numeric -> id36: no argument is a TypeError; a string or a
        # negative number is a ValueError.
        with self.assertRaises(TypeError):
            to_id36()
        for bad_numeric in ('1', -1):
            with self.assertRaises(ValueError):
                to_id36(bad_numeric)

        # id36 -> numeric: no argument is a TypeError; a prefixed name or
        # a non-string value is a ValueError.
        with self.assertRaises(TypeError):
            to_numeric()
        for bad_id36 in ('t3_87', 87):
            with self.assertRaises(ValueError):
                to_numeric(bad_id36)
예제 #3
0
    # NOTE(review): this is the body of a function whose header is not
    # visible here; `df` is presumably a pandas DataFrame with `id`,
    # `created` and `crawled` columns -- confirm against the caller.
    # Python 2 syntax (print statements).

    # Discard incomplete rows, modifying `df` in place.
    df.dropna(inplace=True)
    # `created` and `crawled` are fed to utcfromtimestamp, i.e. treated as
    # POSIX timestamps; keep datetime versions alongside the raw numbers.
    df["created_utc"] = df.created.apply(
        lambda x: datetime.utcfromtimestamp(x))
    df["crawled_utc"] = df.crawled.apply(
        lambda x: datetime.utcfromtimestamp(x))
    # Summary of the crawl window and throughput.
    print "crawl started:", df.crawled_utc.min()
    print "crawl ended:", df.crawled_utc.max()
    print "crawl lasted for:", df.crawled_utc.max() - df.crawled_utc.min()
    print "comment ids collected:", len(df)
    # Rate = number of rows divided by the crawl duration in seconds.
    print "avg comment rate: %2.4f comments per second" % (
        len(df) /
        (df.crawled_utc.max() - df.crawled_utc.min()).total_seconds())
    #     plt.show()
    #     print df.id.apply(lambda x: convert_id36_to_numeric_id(str(x[3:])))
    # Drop the first three characters of each id (presumably a "t1_"-style
    # type prefix -- confirm) and convert the remainder to a number.
    df["num_ids"] = df.id.apply(
        lambda x: convert_id36_to_numeric_id(str(x[3:])))
    # Keep a handle on the column before it is turned into the index below.
    num_ids = df["num_ids"]
    #     print df.describe()
    df.set_index("num_ids", inplace=True)
    # Difference between crawl time and creation time, in the same units as
    # the raw timestamps.
    df["crawl_lag"] = df.crawled - df.created
    # Area plot of the lag against numeric id; the high zorder keeps it
    # above the marker lines added below.
    ax = df.crawl_lag.plot(kind="area", zorder=300, alpha=.8, lw=0)
    #     plt.savefig("crawl lag in seconds.pdf")
    #     print "missing ids:", num_ids.max()-num_ids.min() - len(num_ids), "/", len(num_ids)
    # Every integer between the smallest and largest numeric id that does
    # not appear in the data, in ascending order.
    missing_ids = pd.Series(
        sorted(set(range(num_ids.min(),
                         num_ids.max() + 1)) - set(num_ids))
    )  #.apply(lambda x: u"t1_"+convert_numeric_id_to_id36(int(x)))
    #     print df.index.min()
    # Mark each missing id with a faint vertical line drawn behind the plot.
    for i in missing_ids:
        plt.axvline(i, color='k', alpha=.2, zorder=0)
#     pd.DataFrame(data = zip(missing_ids.values, np.ones_like(missing_ids.values))).plot(ls="^")