def generate_samples(output_fname): """""" mg_conn = MongoSource() conn = mg_conn.get_connection("stock", "au99") stock_value_dic = dict( [ (i, (item["date"], get_price(item))) for i, item in enumerate(conn.find({"date": {"$gt": 20130101}}).sort("date")) ] ) date_now = int(datetime.now().strftime("%Y%m%d")) print "Samples Count: ", len(stock_value_dic) with open(output_fname, "w") as writer: for idx, c_price in sorted(stock_value_dic.iteritems(), key=lambda x: x[0]): date, price = c_price after_idx = idx + AFTER_DAYS if after_idx in stock_value_dic: after_price = max([stock_value_dic[idx + i][1] for i in xrange(1, AFTER_DAYS + 1)]) if after_price > c_price[1]: label = 1 else: label = 0 writer.write("%s %s %s %s %.2f\n" % (idx, label, date, price, after_price)) else: label = -1 writer.write("%s %s %s %s %.2f\n" % (idx, label, date, price, 0.0))
def run_seg(pid, core_cnt, output_fname): '''''' mg_conn = MongoSource() conn_lst = [mg_conn.get_connection('finance', 'golden_pages'),\ mg_conn.get_connection('finance', 'usa_pages')] cnt = 0 with open(output_fname, 'w') as fp: for conn in conn_lst: for i, page in enumerate(conn.find()): if i % core_cnt != pid: continue if cnt % 20000 == pid: print 'Process %s dealed %s pages' % (pid, cnt) cnt += 1 page_id = page['_id'] if type(page['article_date']) == type(int): article_date = str(page['article_date']) else: article_date = page['article_date'].encode('utf8') article_date = article_date[:-4] title = strip_words(page['title']) content = strip_words(page['content']) cut_words = [word for word in jieba.cut(title)] cut_words.extend([word for word in jieba.cut(content)]) output_str = '%s\t%s\t%s\n' \ % (page_id.encode('utf8'), article_date, ' '.join(cut_words).encode('utf8')) fp.write(output_str)
def __load_source_words(self):
    """Collect the distinct ``source`` values of the golden pages.

    Reads every document returned by ``find()`` on the connection obtained
    with ``get_connection("finance", "golden_pages")``.

    Returns:
        set: The UTF-8 encoded ``source`` value of each document, with
        duplicates removed. A document without a ``source`` key contributes
        the empty string.
    """
    golden_pages = MongoSource().get_connection("finance", "golden_pages")
    sources = set()
    for doc in golden_pages.find():
        sources.add(doc.get("source", "").encode("utf8"))
    return sources