예제 #1
0
def do_many_days_s3_APPS(argv):
    """Load the APPS query logs of a day range from S3 and write their stats.

    For every day returned by ``handolUtil.get_day_list(argv[1], argv[2])``
    each known query-log file is fetched from the ``sch-emr`` bucket and fed
    to a ``LogAnalyzer("APPS", ...)``; afterwards ``get_stats()`` and
    ``write_info()`` are called on the analyzer. Elapsed times are printed
    after every load and after the statistics step.

    Args:
        argv: Command-line style list; ``argv[1]`` is the start day and
            ``argv[2]`` the end day (the usage text shows ``2011-10-7`` and
            ``2011-10-13`` as an example).

    Raises:
        SystemExit: With status 1 when fewer than two day arguments are given.
    """
    if len(argv) < 3:
        print("usage: start_day end_day")
        print("usage: 2011-10-7 2011-10-13")
        # A usage error must not be reported to the shell as success.
        sys.exit(1)

    query_files = (
        'euospcomp07.osp_query.log',
        'euospcomp08.osp_query.log',
        'euospsch01.osp_query.log',
        'euospsch03.osp_query.log',
        'euospsch01.2.osp_query.log',
        'euospsch03.2.osp_query.log',
    )

    daylist = handolUtil.get_day_list(argv[1], argv[2])
    analyzer = LogAnalyzer("APPS", "%s-%s" % (daylist[0], daylist[-1]))
    stopwatch = handolUtil.StopWatch()
    for daystr in daylist:
        for query_file in query_files:
            # Key layout: <prefix>/<day>/<file name>.<day>
            s3_file = "DEVELOPING/app/7nmc1m75ij/apps-log/query_log/%s/%s.%s" % (
                daystr, query_file, daystr)
            analyzer.querylog_from_s3("sch-emr", s3_file)
            print("Loading & ETL: %f sec" % stopwatch.laptime())

    analyzer.get_stats()
    print("Calc Stats: %f sec" % stopwatch.laptime())
    analyzer.write_info()
예제 #2
0
    def get_stats_from_qcmatched(self, fname):
        """Fold a query/click matched log into the per-dimension accumulators.

        The records come from ``QcLogAnalyzer.loadfile_qc_matched(fname)``.
        Fields 0-8 of each record are read as: im, number of clicks, click
        position, result total, device type, OS, dm, shop and keyword.
        Every record contributes one counter row to each of ``langD``,
        ``imD``, ``keywordD``, ``shopkeywordD``, ``shopD``, ``osshopD``,
        ``devtypeD``, ``dosD`` and ``dmD``; records with a result total of
        zero are also added to ``zeroD``. Load and calculation times are
        printed.
        """
        timer = handolUtil.StopWatch()
        timer.start()
        records = QcLogAnalyzer.loadfile_qc_matched(fname)
        print("Loading : %f sec" % timer.laptime())
        timer.start()

        for rec in records:
            im = rec[0]
            num_clicks = int(rec[1])
            click_pos = int(rec[2])
            tot = int(rec[3])
            devtype, dos, dm = rec[4], rec[5], rec[6]
            shop, keyword = rec[7], rec[8]

            # 0/1 flags so they can be summed by the accumulators.
            nores = 1 if tot == 0 else 0
            anyclick = 1 if num_clicks > 0 else 0
            english = 0 if is_non_ascii(keyword) else 1

            # query count, any-click flag, no-result flag, position, clicks
            base = (1, anyclick, nores, click_pos, num_clicks)
            with_lang = base + (english,)

            # Every add() receives its own freshly built list.
            self.langD.add(english, list(base))
            self.imD.add(im, list(with_lang))
            self.keywordD.add(keyword, list(with_lang))
            self.shopkeywordD.add((shop, keyword), list(base))
            self.shopD.add(shop, list(with_lang))
            self.osshopD.add("%s_%s" % (dos, shop), list(with_lang))
            self.devtypeD.add(devtype, list(with_lang))
            self.dosD.add(dos, list(with_lang))
            self.dmD.add(dm, list(with_lang))

            if nores:
                self.zeroD.add(keyword, [1, english])

        print("Calculating : %f sec" % timer.laptime())
예제 #3
0
def test_w_local_LHRHVH():
    """Run the LHRHVH analyzer on the local file ``osp_query.log.20130306``.

    Loads the file with ``querylog_from_file``, calls ``get_stats()`` and
    ``write_info()``, and prints the lap time after the load and after the
    statistics step.
    """
    log_analyzer = LogAnalyzer("LHRHVH", "any")
    timer = handolUtil.StopWatch()

    log_analyzer.querylog_from_file("osp_query.log.20130306")
    load_secs = timer.laptime()
    print("Loading & ETL: %f sec" % load_secs)

    log_analyzer.get_stats()
    stats_secs = timer.laptime()
    print("Calc Stats: %f sec" % stats_secs)

    log_analyzer.write_info()
예제 #4
0
def test_w_local_APPS():
    """Run the APPS analyzer on ``euospsch03.2.osp_query.log.20130215``.

    Loads the local file with ``querylog_from_file``, calls ``get_stats()``
    and ``write_info()``, and prints the lap time after the load and after
    the statistics step.
    """
    log_analyzer = LogAnalyzer("APPS", "any")
    timer = handolUtil.StopWatch()

    log_analyzer.querylog_from_file("euospsch03.2.osp_query.log.20130215")
    load_secs = timer.laptime()
    print("Loading & ETL: %f sec" % load_secs)

    log_analyzer.get_stats()
    stats_secs = timer.laptime()
    print("Calc Stats: %f sec" % stats_secs)

    log_analyzer.write_info()
예제 #5
0
    def main(self, bucket, cfiles, qfiles, outfile):
        """Process click files, then query files, and save the combined result.

        Each entry of ``cfiles`` is passed to ``add_click_file`` and each
        entry of ``qfiles`` to ``add_query_file`` (both together with
        ``bucket``). The lists returned for the query files are concatenated
        in order and handed to ``QueryClickMatcher.save_qc_match`` along with
        ``outfile``. A lap time is printed after every file.
        """
        timer = handolUtil.StopWatch()

        for click_path in cfiles:
            self.add_click_file(click_path, bucket)
            print("Loading Click - %s: %f sec" % (click_path, timer.laptime()))

        matched = []
        for query_path in qfiles:
            matched.extend(self.add_query_file(query_path, bucket))
            print("Load Query & Match - %s: %f sec" % (query_path,
                                                       timer.laptime()))

        QueryClickMatcher.save_qc_match(matched, outfile)
예제 #6
0
def load_qcmatched(org_log_dir, daystr):
    os.chdir(org_log_dir)
    print "DIR:", os.getcwd()
    print "DAY:", daystr

    stopwatch = handolUtil.StopWatch()
    stopwatch.start()
    analyzer = QcLogAnalyzer(daystr)
    QCLOG = "qc_matched.all.%s.log" % daystr
    analyzer.get_stats_from_qcmatched(QCLOG)
    print "Loading & Statistics: %f sec" % (stopwatch.laptime())

    stopwatch.start()
    analyzer.print_stats('%s.all.csv' % (daystr))
    print "Saving output: %f sec" % (stopwatch.laptime())
예제 #7
0
def do_many_days_s3_LHRHVH(argv):
    """Load the LHRHVH query logs of a day range from S3 and write their stats.

    For every day returned by ``handolUtil.get_day_list(argv[1], argv[2])``
    the ``osp_query.log.<day>`` object is fetched from both the
    ``hubs1-log`` and the ``hubs2-log`` prefix of the ``sch-emr`` bucket and
    fed to a ``LogAnalyzer("LHRHVH", ...)``; afterwards ``get_stats()`` and
    ``write_info()`` are called on the analyzer. Elapsed times are printed
    after every load and after the statistics step.

    Args:
        argv: Command-line style list; ``argv[1]`` is the start day and
            ``argv[2]`` the end day (the usage text shows ``2011-10-7`` and
            ``2011-10-13`` as an example).

    Raises:
        SystemExit: With status 1 when fewer than two day arguments are given.
    """
    if len(argv) < 3:
        print("usage: start_day end_day")
        print("usage: 2011-10-7 2011-10-13")
        # A usage error must not be reported to the shell as success.
        sys.exit(1)

    log_dirs = ("hubs1-log", "hubs2-log")

    daylist = handolUtil.get_day_list(argv[1], argv[2])
    analyzer = LogAnalyzer("LHRHVH", "%s-%s" % (daylist[0], daylist[-1]))
    stopwatch = handolUtil.StopWatch()
    for daystr in daylist:
        # hubs1 is loaded before hubs2 for each day.
        for log_dir in log_dirs:
            s3_file = "DEVELOPING/app/7nmc1m75ij/%s/query_log/osp_query.log.%s" % (
                log_dir, daystr)
            analyzer.querylog_from_s3("sch-emr", s3_file)
            print("Loading & ETL: %f sec" % stopwatch.laptime())

    analyzer.get_stats()
    print("Calc Stats: %f sec" % stopwatch.laptime())
    analyzer.write_info()
예제 #8
0
def load_keywords_stat(fname, resfname, wikif):
    """
	"""
    watch = handolUtil.StopWatch()
    print "Loading ...", wikif
    wikiStat = {}
    fp = codecs.open(wikif, 'rb', encoding='utf-8')
    for line in fp:
        flds = line.split()
        w = flds[0].lower().replace('_', ' ')
        n = int(flds[1])
        if not wikiStat.has_key(w):
            wikiStat[w] = n

    fp.close()
    print "Loaded: %f" % (watch.laptime())
    """
	== keywords.stat
	3997563 270333  6410    facebook
	2399787 186241  45449   whatsapp
	1433213 71764   1458    whats app
	1315718 130860  2040    temple run
	1004372 163723  272     games
	947351  87185   19846   skype
	829801  316731  834     angry birds
	529552  64385   1416    fruit ninja
	"""
    print "Loading ...", fname
    highKeywords = DictDict()
    noresKeywords = []
    fp = codecs.open(fname, 'rb', encoding='utf-8')
    for line in fp:
        flds = line.split('\t')
        q = int(flds[0])
        c = int(flds[1])
        nores = int(flds[2])
        keyword = flds[3].strip()
        kflds = keyword.split()
        if len(kflds) > 2: continue
        noresratio = (nores * 100) / q

        if q > 10 and noresratio > 50:
            if not wikiStat.has_key(keyword):
                noresKeywords.append([keyword, q, noresratio])
        if q > 150 and noresratio < 4:
            highKeywords.insert(keyword, [q, noresratio])
    fp.close()
    print "Loaded: %f" % (watch.laptime())
    print "noresKeywords: %d" % (len(noresKeywords))
    print "highKeywords: %d" % (len(highKeywords.D))

    fp = codecs.open(resfname, 'wb', encoding='utf-8')
    for noresK in noresKeywords:
        nearest = highKeywords.searchNearest(noresK[0], noresK[1])
        if nearest:
            prnformat = u'\t'.join(
                map(lambda (d, x, y, z): "%d\t%s\t%d\t%d" % (d, x, y, z),
                    nearest))
            fp.write("%s\t%d\t%d\t%s\n" %
                     (noresK[0], noresK[1], noresK[2], prnformat))
    fp.close()
    print "Processed: %f" % (watch.laptime())
예제 #9
0
    stopwatch = handolUtil.StopWatch()
    stopwatch.start()
    analyzer = QcLogAnalyzer(daystr)
    QCLOG = "qc_matched.all.%s.log" % daystr
    analyzer.get_stats_from_qcmatched(QCLOG)
    print "Loading & Statistics: %f sec" % (stopwatch.laptime())

    stopwatch.start()
    analyzer.print_stats('%s.all.csv' % (daystr))
    print "Saving output: %f sec" % (stopwatch.laptime())

    #analyzer.print_info('%s.csv' % (daystr))
    #analyzer.save_to_mongo()
    #analyzer.save_keywords()


if __name__ == "__main__":
    # Script entry point: analyse the matched log of one hard-coded day.
    import os
    import sys
    #os.chdir(sys.argv[1])
    #daystr = sys.argv[2]
    # Directory and day are fixed here; the commented-out lines above would
    # take them from the command line instead.
    org_log_dir = u"D:/0001/9 org logs/20130215"
    daystr = '20130215'
    # NOTE(review): this stopwatch is started but never read in this block.
    stopwatch = handolUtil.StopWatch()
    stopwatch.start()
    #save_qcmatched(org_log_dir, daystr)
    load_qcmatched(org_log_dir, daystr)
    # Print is_non_ascii() for an ASCII-only and a non-ASCII sample string.
    print is_non_ascii('avcd')
    print is_non_ascii('微信')