def start(self):
    """Run the interactive command prompt until the user types ``exit``.

    Recognised commands:
      * ``show``          - ask for 1 (products) or 2 (categories) and print
                            the matching listing from the 'crawler' table.
      * ``linear <name>`` - hand ``<name>`` to ``self.lineargraph``.
      * ``coef``          - call ``self.coef``.
      * ``exit``          - leave the loop.

    Any other input is ignored and the prompt is shown again.
    """
    menu = 'Commands: "show": display all category, "linear <category name>": display linear graph, "exit": exit programme, "coef": Show trening again product stats'
    # One pattern both recognises the command and captures the category name.
    linear_command = re.compile(r'^linear\s(.+)$')

    while True:
        print(menu)
        command = raw_input('Enter input:')

        if command == 'exit':
            return

        if command == 'show':
            choice = raw_input('Enter 1 for products, 2 for categories:')
            crawler_store = Myhbase('crawler')
            if choice == '1':
                print(crawler_store.getAllproducts())
            elif choice == '2':
                print(crawler_store.getAllcategories())
            continue

        linear_match = linear_command.match(command)
        if linear_match is not None:
            self.lineargraph(linear_match.group(1))
            continue

        if command == 'coef':
            self.coef()
def start(self):
    """Run the interactive command prompt until the user types ``exit``.

    Recognised commands:
      * ``show``          - ask for 1 (products) or 2 (categories) and print
                            the matching listing from the 'crawler' table.
      * ``linear <name>`` - hand ``<name>`` to ``self.lineargraph``.
      * ``coef``          - call ``self.coef``.
      * ``exit``          - leave the loop.

    Any other input is ignored and the prompt is shown again.
    """
    menu = 'Commands: "show": display all category, "linear <category name>": display linear graph, "exit": exit programme, "coef": Show trening again product stats'
    # One pattern both recognises the command and captures the category name.
    linear_command = re.compile(r'^linear\s(.+)$')

    while True:
        print(menu)
        command = raw_input('Enter input:')

        if command == 'exit':
            return

        if command == 'show':
            choice = raw_input('Enter 1 for products, 2 for categories:')
            crawler_store = Myhbase('crawler')
            if choice == '1':
                print(crawler_store.getAllproducts())
            elif choice == '2':
                print(crawler_store.getAllcategories())
            continue

        linear_match = linear_command.match(command)
        if linear_match is not None:
            self.lineargraph(linear_match.group(1))
            continue

        if command == 'coef':
            self.coef()
class Sparksteam(Myredis, Myhbase):
    """Spark Streaming consumer that copies Kafka records into HBase.

    The handler attached to the stream depends on the topic name:
    'NOTHS-crawler-topic' records go through ``save_crawler_hbase`` and
    'NOTHS-trends-topic' records through ``save_trends_hbase``.

    NOTE(review): the class inherits from Myredis and Myhbase but never calls
    their initialisers; all work goes through the composed ``self.redis`` and
    ``self.hbase`` objects. Confirm whether the inheritance is still needed.
    """

    def __init__(self, zkQuorum, topic, hbtable):
        """Store the Kafka settings and create the HBase/Redis helpers.

        Args:
            zkQuorum: value forwarded to ``KafkaUtils.createStream``.
            topic: Kafka topic to consume; also selects the save handler.
            hbtable: table name handed to ``Myhbase``.
        """
        self.zkQuorum = zkQuorum
        self.topic = topic
        self.hbase = Myhbase(hbtable)
        self.redis = Myredis()

    def start(self):
        """Build the streaming pipeline and block until it terminates."""
        sc = SparkContext(appName="PythonStreamingNOTHS")
        ssc = StreamingContext(sc, 10)
        kvs = KafkaUtils.createStream(ssc, self.zkQuorum,
                                      "spark-streaming-consumer",
                                      {self.topic: 1})
        # pprint() is called for its effect on the stream; whatever it
        # returns is printed alongside the banner, exactly as before.
        print('******* Event received in window: ', kvs.pprint())
        # Use the instance's topic: the bare name `topic` is neither a local
        # nor a parameter here and only resolved if a global happened to exist.
        if self.topic == 'NOTHS-crawler-topic':
            kvs.foreachRDD(self.save_crawler_hbase)
        elif self.topic == 'NOTHS-trends-topic':
            kvs.foreachRDD(self.save_trends_hbase)
        ssc.start()
        ssc.awaitTermination()

    def save_trends_hbase(self, time, rdd):
        """Save every record of *rdd* via ``self.hbase.save_trend``.

        Best effort: the first failure aborts the rest of the batch and is
        reported on stdout instead of being raised.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_trend(rec)
        except Exception:
            # Narrower than a bare except so KeyboardInterrupt/SystemExit
            # still stop the job.
            print('HBase update Err.')

    def save_crawler_hbase(self, time, rdd):
        """Save every record of *rdd* and register newly seen categories.

        Each record is stored via ``self.hbase.save_crawler``. ``rec[1]`` is
        then split on commas; when the first field is 'category_link', the
        third field is added to Redis if ``isNewCategory`` reports it as new.

        Best effort: the first failure aborts the rest of the batch and is
        reported on stdout instead of being raised.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_crawler(rec)
                fields = rec[1].split(',')
                if str(fields[0]) == 'category_link':
                    category = str(fields[2])
                    if self.redis.isNewCategory(category):
                        self.redis.addcategory(category)
        except Exception:
            # Narrower than a bare except so KeyboardInterrupt/SystemExit
            # still stop the job.
            print('HBase update Err.')
def lineargraph(self, category):
    """Plot the stored trend values of *category* with a fitted line.

    Rows of the 'trend' table whose key starts with ``category + 'interests'``
    are read in scan order; the n-th row becomes the point
    (n, int(row['stats:value'])). A linear regression is fitted to those
    points and drawn over a scatter plot of them.

    Args:
        category: category name; an empty value makes the call a no-op.

    Any error while reading, fitting or plotting is reported on stdout
    rather than raised.
    """
    # Compare by value: the previous identity test against the '' literal
    # only worked when the interpreter happened to intern the string.
    if not category:
        return

    try:
        hb = Myhbase('trend')
        rowkey = category + 'interests'
        print(rowkey)

        X = []
        Y = []
        for position, (_key, data) in enumerate(
                hb.table.scan(row_prefix=rowkey), 1):
            X.append([position])
            Y.append([int(data['stats:value'])])

        if X:
            mdl = LinearRegression().fit(X, Y)
            m = mdl.coef_[0]
            b = mdl.intercept_
            plt.scatter(X, Y, color='blue')
            x_len = len(X)
            # Regression line drawn from x=0 to the last sample index.
            plt.plot([0, x_len], [b, m * x_len + b], 'r')
            plt.title(category, fontsize=20)
            plt.xlabel('Time (Weeks)', fontsize=15)
            plt.ylabel('Trending', fontsize=15)
            plt.show()
        else:
            print('not enough data for category', category)
    except Exception:
        # Narrower than a bare except so Ctrl-C still interrupts the program.
        print('opss something is wrong.')
def categorytrend(self, category):
    """Return the trend series of *category* as ``(X, Y)``.

    Rows of the 'trend' table whose key starts with ``category + 'interests'``
    are read in scan order; the n-th row contributes ``[n]`` to ``X`` and
    ``[int(row['stats:value'])]`` to ``Y``.

    Args:
        category: category name; an empty value yields two empty lists.

    Returns:
        Tuple ``(X, Y)`` of equally long lists of one-element lists. If the
        scan fails part-way, 'err' is printed and the points gathered so far
        are returned.
    """
    print('Get trend data..')
    X = []
    Y = []
    # Compare by value: the previous identity test against the '' literal
    # only worked when the interpreter happened to intern the string.
    if not category:
        return (X, Y)

    # Created once, outside the try, so connection problems still surface.
    hb = Myhbase('trend')
    try:
        rowkey = category + 'interests'
        for position, (_key, data) in enumerate(
                hb.table.scan(row_prefix=rowkey), 1):
            # Convert before appending so a bad value cannot leave X one
            # element longer than Y.
            value = int(data['stats:value'])
            X.append([position])
            Y.append([value])
    except Exception:
        # Narrower than a bare except so Ctrl-C still interrupts the program.
        print('err')
    return (X, Y)
def coef(self):
    """Plot mean price against trend coefficient for every category.

    For each category returned by the 'crawler' table:
      1. its rows in the 'trend' table (key prefix ``category + 'interests'``)
         are turned into points (n, value) and a linear regression is fitted;
      2. the prices of its products are parsed and summarised
         (min, max, total, mean, range, item count).

    Categories without any trend rows are skipped, since no coefficient can
    be fitted for them. The summary is printed and shown as a scatter plot
    whose marker area grows with the number of products.
    """
    hb_crawler = Myhbase('crawler')
    hb_trend = Myhbase('trend')
    data_hash = collections.defaultdict(dict)
    # Compiled once; removes everything except digits and dots from a price.
    non_numeric = re.compile(r'[^0-9\.]')

    for category in hb_crawler.getAllcategories():
        print(category)
        rowkey = category + 'interests'
        print(rowkey)

        # Trend series: sample index -> stored value.
        X = []
        Y = []
        for position, (_key, data) in enumerate(
                hb_trend.table.scan(row_prefix=rowkey), 1):
            X.append([position])
            Y.append([int(data['stats:value'])])
        if not X:
            # Nothing to fit; plotting later needs a 'coef' for every entry.
            continue

        stats = data_hash[category]
        stats['coef'] = LinearRegression().fit(X, Y).coef_[0]

        products_in_category = hb_crawler.getCategoryProducts(category)
        prices = []
        for item in products_in_category:
            raw_price = item['product:price']
            if not raw_price:
                continue
            try:
                prices.append(float(non_numeric.sub('', raw_price)))
            except ValueError:
                # Price text without a usable number (e.g. 'N/A'); ignore it.
                continue

        total_item = len(products_in_category)
        total_price = sum(prices)
        # min()/max() avoid ordering a float against None, which raises
        # TypeError on Python 3.
        min_price = min(prices) if prices else None
        max_price = max(prices) if prices else None

        stats['min_price'] = min_price
        stats['max_price'] = max_price
        stats['total_price'] = total_price
        # NOTE(review): the mean is taken over all products, including those
        # without a price, as before - confirm this is intended.
        stats['mean_price'] = total_price / total_item if total_item else 0
        stats['range_price'] = max_price - min_price if prices else None
        stats['total_item'] = total_item

    print(data_hash)

    # values() works on both Python 2 and 3, unlike iteritems().
    summaries = list(data_hash.values())
    x = np.array([entry['mean_price'] for entry in summaries])
    y = np.array([entry['coef'][0] for entry in summaries])
    z = np.array([entry['total_item'] for entry in summaries])

    colors = np.random.rand(len(x))
    area = np.pi * z
    plt.scatter(x, y, s=area, c=colors, alpha=0.5)
    plt.title('Product range, price and trending coefficient', fontsize=20)
    plt.xlabel('Averge Price', fontsize=15)
    plt.ylabel('Trend Coefficient', fontsize=15)
    plt.show()
def __init__(self, zkQuorum, topic, hbtable):
    """Keep the Kafka settings and create the Myhbase / Myredis helpers.

    Args:
        zkQuorum: stored unchanged on ``self.zkQuorum``.
        topic: stored unchanged on ``self.topic``.
        hbtable: passed to ``Myhbase`` to build ``self.hbase``.
    """
    self.zkQuorum, self.topic = zkQuorum, topic
    # Helper construction order is kept: HBase first, then Redis.
    hbase_helper = Myhbase(hbtable)
    redis_helper = Myredis()
    self.hbase = hbase_helper
    self.redis = redis_helper
'category:name': 'mycategory', 'stats:from': '2016-03-01', 'stats:to': '2016-03-31', 'stats:value': '10' }) trend_test_conn.put( 'mycategoryinterests2016-04-012016-05-01', { 'group:name': 'interests', 'category:name': 'mycategory', 'stats:from': '2016-04-01', 'stats:to': '2016-05-01', 'stats:value': '20' }) hb_cralwer_test = Myhbase('crawler_test') def test_save_cawler_rec(): new_data = ",".join([ 'product_link', 'http://newadd.com', 'product1', '$10', 'Im a new product desc from test', 'testcategory' ]) hb_cralwer_test.save_crawler([0, new_data]) row = crawler_test_conn.row('testcategoryproduct1') assert row['product:name'] == 'product1' assert row['product:desc'] == 'Im a new product desc from test' assert row['category:name'] == 'testcategory'
def coef(self):
    """Plot mean price against trend coefficient for every category.

    For each category returned by the 'crawler' table:
      1. its rows in the 'trend' table (key prefix ``category + 'interests'``)
         are turned into points (n, value) and a linear regression is fitted;
      2. the prices of its products are parsed and summarised
         (min, max, total, mean, range, item count).

    Categories without any trend rows are skipped, since no coefficient can
    be fitted for them. The summary is printed and shown as a scatter plot
    whose marker area grows with the number of products.
    """
    hb_crawler = Myhbase('crawler')
    hb_trend = Myhbase('trend')
    data_hash = collections.defaultdict(dict)
    # Compiled once; removes everything except digits and dots from a price.
    non_numeric = re.compile(r'[^0-9\.]')

    for category in hb_crawler.getAllcategories():
        print(category)
        rowkey = category + 'interests'
        print(rowkey)

        # Trend series: sample index -> stored value.
        X = []
        Y = []
        for position, (_key, data) in enumerate(
                hb_trend.table.scan(row_prefix=rowkey), 1):
            X.append([position])
            Y.append([int(data['stats:value'])])
        if not X:
            # Nothing to fit; plotting later needs a 'coef' for every entry.
            continue

        stats = data_hash[category]
        stats['coef'] = LinearRegression().fit(X, Y).coef_[0]

        products_in_category = hb_crawler.getCategoryProducts(category)
        prices = []
        for item in products_in_category:
            raw_price = item['product:price']
            if not raw_price:
                continue
            try:
                prices.append(float(non_numeric.sub('', raw_price)))
            except ValueError:
                # Price text without a usable number (e.g. 'N/A'); ignore it.
                continue

        total_item = len(products_in_category)
        total_price = sum(prices)
        # min()/max() avoid ordering a float against None, which raises
        # TypeError on Python 3.
        min_price = min(prices) if prices else None
        max_price = max(prices) if prices else None

        stats['min_price'] = min_price
        stats['max_price'] = max_price
        stats['total_price'] = total_price
        # NOTE(review): the mean is taken over all products, including those
        # without a price, as before - confirm this is intended.
        stats['mean_price'] = total_price / total_item if total_item else 0
        stats['range_price'] = max_price - min_price if prices else None
        stats['total_item'] = total_item

    print(data_hash)

    # values() works on both Python 2 and 3, unlike iteritems().
    summaries = list(data_hash.values())
    x = np.array([entry['mean_price'] for entry in summaries])
    y = np.array([entry['coef'][0] for entry in summaries])
    z = np.array([entry['total_item'] for entry in summaries])

    colors = np.random.rand(len(x))
    area = np.pi * z
    plt.scatter(x, y, s=area, c=colors, alpha=0.5)
    plt.title('Product range, price and trending coefficient', fontsize=20)
    plt.xlabel('Averge Price', fontsize=15)
    plt.ylabel('Trend Coefficient', fontsize=15)
    plt.show()