예제 #1
0
def generateAd2UsersGivenAdSet(data_training, adSet):
    """Map every ad in ``adSet`` to the set of users recorded with it.

    Args:
        data_training: Path of the training-data file; it is read line by
            line and each line is parsed with ``dataParser.parseTrainData``.
        adSet: Collection of ad IDs to track. Every ID gets an entry in the
            result, even when no record mentions it.

    Returns:
        A dict ``{AdID: set(UserID, ...)}``, or ``None`` when a line is
        reached for which the parser returns ``None`` or an empty result.
    """
    ad2Users = dict((adid, set()) for adid in adSet)

    # `with open(...)` replaces the bare `file(...)` call so the handle is
    # closed on every exit path, including the early return below.
    with open(data_training) as training_file:
        for line in training_file:
            fields = dataParser.parseTrainData(line)
            # NOTE(review): an unparsable record aborts the whole scan and
            # the function returns None instead of the partial mapping.
            # Kept as-is for caller compatibility -- confirm this is wanted.
            if fields is None or len(fields) == 0:
                return

            # Full 12-way unpack is kept on purpose: it raises if the parser
            # returns a record with an unexpected number of columns.
            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            # Records with UserID '0' and ads outside adSet are ignored.
            if UserID == '0' or AdID not in adSet:
                continue
            ad2Users[AdID].add(UserID)

    return ad2Users
예제 #2
0
def generateAd2UsersGivenAdSet(data_training, adSet):
    """Build an ``AdID -> set of UserIDs`` index restricted to ``adSet``.

    Args:
        data_training: Path of the training-data file. Each line is handed
            to ``dataParser.parseTrainData``.
        adSet: Ad IDs of interest. Each one is present as a key in the
            result; ads never seen in the data map to an empty set.

    Returns:
        The dict described above. ``None`` is returned instead if the
        parser yields ``None`` or an empty result for any line.
    """
    ad2Users = dict((adid, set()) for adid in adSet)

    # The context manager closes the file even on the early return; the
    # previous `file(...)` call never closed it and only exists in Python 2.
    with open(data_training) as src:
        for record in src:
            fields = dataParser.parseTrainData(record)
            if fields is None or len(fields) == 0:
                # NOTE(review): stops at the first unparsable line and drops
                # everything gathered so far (returns None). Behaviour kept
                # unchanged -- verify against callers.
                return

            # Unpacking all 12 columns doubles as a record-width check.
            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            # Only keep records for tracked ads whose UserID is not '0'.
            if UserID != '0' and AdID in adSet:
                ad2Users[AdID].add(UserID)

    return ad2Users
예제 #3
0
def generateTopAdsUsersByClick(data_training, top=200):
    """Return the ``top`` ads with the highest total click count.

    Args:
        data_training: Path of the training-data file; each line is parsed
            with ``dataParser.parseTrainData``.
        top: Maximum number of ads to return (default 200).

    Returns:
        A list of ``(total_clicks, AdID)`` tuples, largest first, as
        produced by ``heapq.nlargest``; ties on the click total are ordered
        by AdID. ``None`` is returned if the parser yields ``None`` or an
        empty result for any line.
    """
    AdClickCnt = {}

    # `with open(...)` guarantees the handle is released; the previous
    # `file(...)` call leaked it and is not available in Python 3.
    with open(data_training) as training_file:
        for line in training_file:
            fields = dataParser.parseTrainData(line)
            # NOTE(review): the first unparsable line ends the function with
            # None rather than a (partial) ranking. Kept for compatibility
            # -- confirm this is intended.
            if fields is None or len(fields) == 0:
                return

            # Full unpack is kept so a record of the wrong width still raises.
            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            # Click is added to an int total, so the parser must supply it
            # as a number.
            AdClickCnt[AdID] = AdClickCnt.get(AdID, 0) + Click

    # Put the count first so tuples are ranked by clicks.
    adClickCntList = [(clickCnt, adid) for adid, clickCnt in AdClickCnt.items()]
    return heapq.nlargest(top, adClickCntList)
예제 #4
0
def dumpAd2UserStatus(data_training, adSet, userSet, fn_out):
    """Write click/impression rows for the selected ads and users.

    Every training record whose AdID is in ``adSet`` and whose UserID is in
    ``userSet`` is written to ``fn_out`` as one tab-separated line:
    ``AdID, UserID, Click, Impression``.

    Args:
        data_training: Path of the training-data file; each line is parsed
            with ``dataParser.parseTrainData``.
        adSet: Ad IDs to keep.
        userSet: User IDs to keep.
        fn_out: Path of the output file; it is created or truncated.

    Returns:
        None. Writing stops at the first line for which the parser yields
        ``None`` or an empty result; rows written before it stay in the file.
    """
    # Renamed from `format`, which shadowed the builtin of the same name.
    row_format = '%s\t%s\t%d\t%d\n'

    # The output is opened first, as before, so it is created even when the
    # training file cannot be opened. Both handles are now closed on every
    # exit path; previously the early return skipped `output.close()`.
    with open(fn_out, 'w') as output:
        with open(data_training) as training_file:
            for line in training_file:
                fields = dataParser.parseTrainData(line)
                # NOTE(review): an unparsable line silently truncates the
                # dump instead of being skipped -- confirm this is intended.
                if fields is None or len(fields) == 0:
                    return

                (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
                 Position, QueryID, KeywordID, TitleID, DescriptionID,
                 UserID) = fields

                if AdID not in adSet or UserID not in userSet:
                    continue
                output.write(row_format % (AdID, UserID, Click, Impression))
예제 #5
0
def generateTopAdsUsersByClick(data_training, top=200):
    """Rank ads by accumulated clicks and return the best ``top`` of them.

    Args:
        data_training: Path of the training-data file. Each line is handed
            to ``dataParser.parseTrainData``.
        top: How many ads to return at most (default 200).

    Returns:
        ``heapq.nlargest(top, ...)`` over ``(click_total, AdID)`` tuples,
        i.e. a list sorted from most to fewest clicks. If the parser yields
        ``None`` or an empty result for some line, ``None`` is returned.
    """
    AdClickCnt = {}

    # Context manager instead of a bare `file(...)`: the handle is closed
    # deterministically and the call also exists in Python 3.
    with open(data_training) as src:
        for record in src:
            fields = dataParser.parseTrainData(record)
            if fields is None or len(fields) == 0:
                # NOTE(review): aborts with None on the first unparsable
                # line; preserved from the previous implementation --
                # verify that callers expect it.
                return

            # Unpacking all 12 columns doubles as a record-width check.
            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            # Running total per ad; Click has to be numeric for the addition.
            AdClickCnt[AdID] = AdClickCnt.get(AdID, 0) + Click

    # Count goes first in each tuple so nlargest orders by clicks, then AdID.
    adClickCntList = [(clickCnt, adid)
                      for adid, clickCnt in AdClickCnt.items()]
    return heapq.nlargest(top, adClickCntList)
예제 #6
0
def dumpAd2UserStatus(data_training, adSet, userSet, fn_out):
    """Dump ``AdID, UserID, Click, Impression`` rows for chosen ads/users.

    Args:
        data_training: Path of the training-data file. Each line is handed
            to ``dataParser.parseTrainData``.
        adSet: Only records whose AdID is in this collection are written.
        userSet: Only records whose UserID is in this collection are written.
        fn_out: Destination path, opened in write mode (truncates).

    Returns:
        None. One tab-separated line is written per matching record. The
        dump ends early, keeping the rows already written, when the parser
        yields ``None`` or an empty result for a line.
    """
    # Local was called `format`, hiding the builtin; renamed.
    line_template = '%s\t%s\t%d\t%d\n'

    # Output is opened before the input (same order as before). Using
    # `with` closes both files even on the early return, which previously
    # bypassed `output.close()`.
    with open(fn_out, 'w') as output:
        with open(data_training) as src:
            for record in src:
                fields = dataParser.parseTrainData(record)
                if fields is None or len(fields) == 0:
                    # NOTE(review): the first unparsable line stops the dump
                    # without any signal to the caller -- confirm.
                    return

                (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
                 Position, QueryID, KeywordID, TitleID, DescriptionID,
                 UserID) = fields

                if AdID in adSet and UserID in userSet:
                    output.write(
                        line_template % (AdID, UserID, Click, Impression))
예제 #7
0
def dumpUserRawFeatureGivenUserSet(data_training, userSet, fn):
    """Aggregate per-user query/title/description IDs and dump them.

    For every user in ``userSet`` this collects all QueryIDs seen in the
    training data, plus the TitleIDs and DescriptionIDs of records with
    ``Click > 0``. The result is written to ``fn``; the distinct query,
    title and description IDs seen for those users are written to
    ``queryID.set``, ``titleID.set`` and ``descID.set`` under
    ``TMP_DATA_DIR_PATH``.

    Line layout in ``fn``: user ID, a 0x01 character, then the query, title
    and description ID lists separated by 0x02 characters; IDs inside a
    list are tab-separated.

    Args:
        data_training: Path of the training-data file; each line is parsed
            with ``dataParser.parseTrainData``.
        userSet: User IDs to aggregate. Each one gets a line in ``fn``, even
            if no record mentions it.
        fn: Path of the aggregated output file.

    Returns:
        None. If the parser yields ``None`` or an empty result for any line,
        the function returns before writing any file.
    """
    userDict = dict(
        (userid, {'queryIDlist': [], 'titleIDlist': [], 'descIDList': []})
        for userid in userSet)
    queryIDset = set()
    titleIDset = set()
    descIDset = set()

    # `with open(...)` replaces `file(...)`, which never closed the handle
    # and is a Python-2-only builtin.
    with open(data_training) as training_file:
        for line in training_file:
            fields = dataParser.parseTrainData(line)
            # NOTE(review): one unparsable line makes the function return
            # without producing any output -- kept as-is, confirm intended.
            if fields is None or len(fields) == 0:
                return

            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            if UserID == '0' or UserID not in userSet:
                continue

            queryIDset.add(QueryID)
            titleIDset.add(TitleID)
            descIDset.add(DescriptionID)

            features = userDict[UserID]
            features['queryIDlist'].append(QueryID)
            # Only track clicked ads' information.
            if Click > 0:
                features['titleIDlist'].append(TitleID)
                features['descIDList'].append(DescriptionID)

    # Dump the aggregation result to file.
    dump_format = '%s\x01%s\x02%s\x02%s\n'
    with open(fn, 'w') as aggregateUserResult:
        for user in userDict:
            features = userDict[user]
            aggregateUserResult.write(dump_format % (
                user,
                '\t'.join(features['queryIDlist']),
                '\t'.join(features['titleIDlist']),
                '\t'.join(features['descIDList'])))

    # Dump all ID sets to files which are used to filter additional data.
    dumpFilesName = {
        TMP_DATA_DIR_PATH + 'queryID.set': queryIDset,
        TMP_DATA_DIR_PATH + 'titleID.set': titleIDset,
        TMP_DATA_DIR_PATH + 'descID.set': descIDset
    }
    for filename, s in dumpFilesName.items():
        with open(filename, 'w') as dumpfile:
            for item in s:
                dumpfile.write('%s\n' % (item))
예제 #8
0
def dumpUserRawFeatureGivenUserSet(data_training, userSet, fn):
    """Write raw per-user ID features and the ID sets they draw from.

    Scans the training data once. For each user in ``userSet`` it gathers
    every QueryID, and the TitleID / DescriptionID of records whose
    ``Click`` is greater than zero. It then writes:

    * ``fn``: one line per user -- user ID, a 0x01 character, then the
      query, title and description ID lists, separated from each other by a
      0x02 character and tab-separated internally;
    * ``queryID.set``, ``titleID.set``, ``descID.set`` under
      ``TMP_DATA_DIR_PATH``: the distinct IDs seen for those users, one per
      line.

    Args:
        data_training: Path of the training-data file. Each line is handed
            to ``dataParser.parseTrainData``.
        userSet: Users to aggregate; each one appears in ``fn`` even without
            any matching record.
        fn: Path of the per-user output file.

    Returns:
        None. Nothing is written at all if the parser yields ``None`` or an
        empty result for some line, because the function returns right there.
    """
    userDict = {}
    for userid in userSet:
        userDict[userid] = {
            'queryIDlist': [], 'titleIDlist': [], 'descIDList': []}
    queryIDset, titleIDset, descIDset = set(), set(), set()

    # Context manager instead of a bare `file(...)` so the input handle is
    # always closed; `open` also works on Python 3.
    with open(data_training) as src:
        for record in src:
            fields = dataParser.parseTrainData(record)
            if fields is None or len(fields) == 0:
                # NOTE(review): aborts before any output is produced;
                # behaviour preserved -- verify that this is wanted.
                return

            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields

            if UserID == '0' or UserID not in userSet:
                continue

            queryIDset.add(QueryID)
            titleIDset.add(TitleID)
            descIDset.add(DescriptionID)

            per_user = userDict[UserID]
            per_user['queryIDlist'].append(QueryID)
            # Only track clicked ads' information.
            if Click > 0:
                per_user['titleIDlist'].append(TitleID)
                per_user['descIDList'].append(DescriptionID)

    # Dump the aggregation result to file.
    dump_format = '%s\x01%s\x02%s\x02%s\n'
    with open(fn, 'w') as aggregateUserResult:
        for user in userDict:
            per_user = userDict[user]
            aggregateUserResult.write(dump_format % (
                user,
                '\t'.join(per_user['queryIDlist']),
                '\t'.join(per_user['titleIDlist']),
                '\t'.join(per_user['descIDList'])))

    # Dump all ID sets to files which are used to filter additional data.
    dumpFilesName = {
        TMP_DATA_DIR_PATH + 'queryID.set': queryIDset,
        TMP_DATA_DIR_PATH + 'titleID.set': titleIDset,
        TMP_DATA_DIR_PATH + 'descID.set': descIDset,
    }
    for filename, s in dumpFilesName.items():
        with open(filename, 'w') as dumpfile:
            for item in s:
                dumpfile.write('%s\n' % (item))
예제 #9
0
def genQueryToken(input_file, ADID):
    """Write each pre-filtered user's queries and their token strings.

    Users are taken from the second whitespace-separated column of
    ``status/<ADID>.ad2userStatus.dat`` under ``TMP_DATA_DIR_PATH``. For
    those users, the QueryIDs found in ``input_file`` are collected and two
    files are written under ``TMP_DATA_DIR_PATH + 'userQuery/'``:

    * ``<ADID>.user.query``: user ID, a 0x01 character, then the user's
      QueryIDs separated by tabs;
    * ``<ADID>.user.queryTokens``: user ID, a 0x01 character, then the
      ``DATA_QUERY`` entry of each of those queries, joined with ``|``.

    ``DATA_QUERY`` is read as two whitespace-separated columns per line:
    a query ID and the value stored for it.

    Args:
        input_file: Path of the training-data file; each line is parsed with
            ``dataParser.parseTrainData``.
        ADID: Ad identifier used to build the input and output file names.

    Returns:
        None.

    Raises:
        KeyError: If a collected QueryID has no entry in ``DATA_QUERY``.
        ValueError: If a matching ``DATA_QUERY`` line does not have exactly
            two columns.
    """
    # Delimiter between the user ID and the payload. The previous code wrote
    # no delimiter at all ('%s%s\n') and then called split(''), which always
    # raises "ValueError: empty separator".
    # NOTE(review): 0x01 mirrors the user/payload delimiter used by
    # dumpUserRawFeatureGivenUserSet -- confirm that the readers of these
    # files expect it.
    separator = '\x01'

    status_path = TMP_DATA_DIR_PATH + 'status/%s.ad2userStatus.dat' % ADID
    with open(status_path) as status_file:
        preFilterUserSet = set(line.split()[1] for line in status_file)

    user_query = {}
    with open(input_file) as training_file:
        for num, line in enumerate(training_file, 1):
            # Progress report every 100000 lines; a single pre-formatted
            # string prints the same text on Python 2 and Python 3.
            if num % 100000 == 0:
                print('%s %s' % (ADID, num))
            # NOTE(review): unlike the sibling functions there is no guard
            # for a None/empty parse result here; such a line raises on
            # unpacking. Left unchanged.
            fields = dataParser.parseTrainData(line)
            (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
             Position, QueryID, KeywordID, TitleID, DescriptionID,
             UserID) = fields
            if UserID not in preFilterUserSet:
                continue
            user_query.setdefault(UserID, []).append(QueryID)

    query_path = TMP_DATA_DIR_PATH + 'userQuery/%s.user.query' % ADID
    with open(query_path, 'w') as writer:
        for user in user_query:
            writer.write('%s%s%s\n' % (
                user, separator, '\t'.join(user_query[user])))

    # The IDs are still in memory, so the query set is built directly from
    # user_query instead of re-reading the file written above.
    querySet = set()
    for queries in user_query.values():
        querySet.update(queries)

    # Keep only the DATA_QUERY rows that are actually referenced.
    queryMap = {}
    with open(DATA_QUERY) as query_file:
        for line in query_file:
            parts = line.split()
            if parts and parts[0] in querySet:
                query_id, tokens = parts
                queryMap[query_id] = tokens

    tokens_path = TMP_DATA_DIR_PATH + 'userQuery/%s.user.queryTokens' % ADID
    # `with` closes the token file; previously this writer was never closed.
    with open(tokens_path, 'w') as writer:
        for user in user_query:
            writer.write('%s%s%s\n' % (
                user, separator,
                '|'.join(queryMap[q] for q in user_query[user])))