def main(argv=None):
    """Generate parameter bindings for queries 1-14 and write them as CSV files.

    Factor files are collected from the input directory, loaded through
    ``readfactors.load`` and turned into person, country, tag, tag-type, time
    and name parameters.  One ``query_<n>_param.txt`` file per query is then
    written into the output directory through ``CSVSerializer``.

    In this variant queries 13 and 14 are emitted with a single Person column
    plus a ``Date0`` column (no person pairs are generated).

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.

    Raises:
        ValueError: If a parameter list that must be padded is empty, or if
            the country list for query 3 holds no two distinct values (the
            pairing loop could never terminate in that case).
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    def pad_with_repeats(params, target_len, what):
        # Grow params in place to target_len entries by repeating randomly
        # chosen entries from the part that was generated originally.
        oldlen = len(params)
        missing = target_len - oldlen
        if missing > 0 and oldlen == 0:
            raise ValueError(
                "no %s parameters were generated; cannot pad to %d entries"
                % (what, target_len))
        params.extend([params[random.randint(0, oldlen - 1)]
                       for _ in range(missing)])

    def pick_distinct_partners(params, what):
        # For every entry pick a random entry of the same list that differs
        # from it.  Without two distinct values the retry loop below would
        # never end, so fail loudly up front instead.
        if len(params) > 0 and not any(p != params[0] for p in params):
            raise ValueError(
                "need at least two distinct %s parameters to build pairs"
                % what)
        partners = []
        for current in params:
            while True:
                candidate = params[random.randint(0, len(params) - 1)]
                if candidate != current:
                    break
            partners.append(candidate)
        return partners

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match more than one pattern.
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    print("find parameter bindings for Persons")
    selectedPersonParams = {}
    for i in range(1, 15):
        factors = readfactors.getFactorsForQuery(i, personFactors)
        selectedPersonParams[i] = discoverparams.generate(factors)

    # find country parameters for Query 3 and 11
    print("find parameter bindings for Countries")
    selectedCountryParams = {}
    for i in [3, 11]:
        factors = readfactors.getCountryFactorsForQuery(i, countryFactors)
        selectedCountryParams[i] = discoverparams.generate(factors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[i],
                         len(selectedPersonParams[i]), "country")

    # Query 3 needs two countries as parameters. Generate the second one.
    secondCountry = pick_distinct_partners(selectedCountryParams[3], "country")

    # find tag parameters for Query 6: sample both parts of the split factors
    # so that together they make up about 10% of all tags
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    selectedTagParams = {}
    for i in [6]:
        selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion)
        selectedTagParams[i].extend(
            discoverparams.generate(rightTagFactors, portion=rightPortion))
        pad_with_repeats(selectedTagParams[i],
                         len(selectedPersonParams[i]), "tag")

    # generate tag type parameters for Query 12
    selectedTagTypeParams = {}
    for i in [12]:
        selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1)
        # make sure there are as many tag type parameters as person parameters
        pad_with_repeats(selectedTagTypeParams[i],
                         len(selectedPersonParams[i]), "tag type")

    # find time parameters for Queries 2,3,4,5,9,13,14
    # (Query 11 gets its own timestamp below)
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
        13: (selectedPersonParams[13], "f", getTimeParamsAfterMedian),
        14: (selectedPersonParams[14], "f", getTimeParamsAfterMedian),
    }

    print("find parameter bindings for Timestamps")
    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])

    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the HS parameter: a value in 1..12 paired
    # with its successor, wrapping from 12 back to 1
    HS = []
    for _ in selectedPersonParams[10]:
        HS0 = random.randint(1, 12)
        HS1 = 1 if HS0 == 12 else HS0 + 1
        HS.append((HS0, HS1))

    # Query 1 takes first name as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    # all the queries have Person as parameter
    for i in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "query_%d_param.txt" % (i))
        csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "Person")
        csvWriters[i] = csvWriter

    # add output for Time parameter
    for i in timeSelectionInput:
        if i == 3 or i == 4:
            csvWriters[i].registerHandler(
                handleTimeDurationParam, selectedTimeParams[i], "Date0|Duration")
        else:
            csvWriters[i].registerHandler(
                handleTimeParam, selectedTimeParams[i], "Date0")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name")
    csvWriters[3].registerHandler(
        handlePairCountryParam,
        zip(selectedCountryParams[3], secondCountry), "Country1|Country2")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "Tag")
    csvWriters[10].registerHandler(handleHSParam, HS, "HS0")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "Country")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "Year")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "TagType")

    for j in csvWriters:
        csvWriters[j].writeCSV()
def main(argv=None):
    """Generate BI query parameters from precomputed factor files.

    Factor files are collected from the input directory and loaded through
    ``readfactors.load``.  Country, tag and tag-class counts are sorted in
    descending order, post-count thresholds are derived from the total number
    of posts, and the resulting parameter candidates are handed to the
    ``serialize_q*`` helpers together with the output directory.

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output dir>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match more than one pattern.
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    # Person ids in a reproducible pseudo-random order.
    persons = [key for key, _ in personFactors.values.iteritems()]
    random.seed(1988)
    random.shuffle(persons)

    country_sample = [[key, value.getValue("p")]
                      for key, value in countryFactors.values.iteritems()]
    country_sample.sort(key=lambda x: x[1], reverse=True)

    # NOTE: these are aliases, so the loaded factor lists are sorted in place.
    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Day ranges holding roughly 10% (+/- 10%) of all posts.
    day_lower_threshold = 0.1 * total_posts * 0.9
    day_upper_threshold = 0.1 * total_posts * 1.1
    post_day_ranges = post_date_range_params(
        week_posts, day_lower_threshold, day_upper_threshold)

    bad_words = [
        'Augustine', 'William', 'James', 'with', 'Henry', 'Robert', 'from',
        'Pope', 'Hippo', 'album', 'David', 'has', 'one', 'also', 'Green',
        'which', 'that'
    ]

    # Months are approximated as four non-empty weeks each.
    # NOTE: fewer than four non-empty weeks makes the divisor zero.
    empty_weeks = sum(1 for week in week_posts if week[1] == 0)
    non_empty_weeks = len(week_posts) - empty_weeks
    posts_per_month = total_posts / (non_empty_weeks / 4)
    month_lower_threshold = posts_per_month * 0.8
    month_upper_threshold = posts_per_month * 1.2
    post_months = post_month_params(
        week_posts, month_lower_threshold, month_upper_threshold)

    # the lower bound is inclusive and the upper bound is exclusive
    path_bounds = enumerate_path_bounds(3, 6, 2)
    language_codes = prob_language_codes()
    post_lengths = prob_post_lengths()

    serialize_q2(
        outdir,
        key_params(country_sample, total_posts / 200, total_posts / 100),
        post_day_ranges)  # TODO determine constants
    serialize_q3(outdir, post_months)
    serialize_q14(outdir, post_months)
    serialize_q1(
        outdir,
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q12(
        outdir,
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q18(
        outdir,
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts),
        post_lengths,
        language_codes)
    serialize_q10(
        outdir,
        key_params(tag_posts, total_posts / 900, total_posts / 600),
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q4(
        outdir,
        key_params(tagclass_posts, total_posts / 20, total_posts / 10),
        key_params(country_sample, total_posts / 150, total_posts / 50))
    serialize_q5(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q6(outdir, key_params(tag_posts, total_posts / 1300, total_posts / 900))
    serialize_q7(outdir, key_params(tag_posts, total_posts / 900, total_posts / 600))
    serialize_q8(outdir, key_params(tag_posts, total_posts / 600, total_posts / 300))
    serialize_q9(outdir, key_params(tagclass_posts, 6000, 25000))
    serialize_q13(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q15(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q16(
        outdir,
        persons,
        key_params(tagclass_posts, total_posts / 30, total_posts / 10),
        key_params(country_sample, total_posts / 80, total_posts / 20),
        path_bounds)
    serialize_q17(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q19(outdir, key_params(tagclass_posts, total_posts / 60, total_posts / 10))
    serialize_q21(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q22(outdir, key_params(country_sample, total_posts / 120, total_posts / 40))
    serialize_q23(outdir, key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q24(outdir, key_params(tagclass_posts, total_posts / 140, total_posts / 5))
    serialize_q25(outdir, persons, post_months)
    # TODO: Refine
    serialize_q20(outdir, key_params(tagclass_posts, total_posts / 20, total_posts / 2))
    serialize_q11(
        outdir,
        key_params(country_sample, total_posts / 80, total_posts / 20),
        bad_words)
def main(argv=None):
    """Generate parameter bindings for interactive queries 1-14 as CSV files.

    Factor files are collected from the input directory, loaded through
    ``readfactors.load`` and turned into person, country, tag, tag-type, time
    and name parameters.  One ``interactive_<n>_param.txt`` file per query is
    then written into the output directory through ``CSVSerializer``.

    Queries 13 and 14 are emitted as pairs of distinct persons.

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.

    Raises:
        ValueError: If a parameter list that must be padded is empty, or if a
            list that is paired with itself holds no two distinct values (the
            pairing loop could never terminate in that case).
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    def pad_with_repeats(params, target_len, what):
        # Grow params in place to target_len entries by repeating randomly
        # chosen entries from the part that was generated originally.
        oldlen = len(params)
        missing = target_len - oldlen
        if missing > 0 and oldlen == 0:
            raise ValueError(
                "no %s parameters were generated; cannot pad to %d entries"
                % (what, target_len))
        params.extend([params[random.randint(0, oldlen - 1)]
                       for _ in range(missing)])

    def pick_distinct_partners(params, what):
        # For every entry pick a random entry of the same list that differs
        # from it.  Without two distinct values the retry loop below would
        # never end, so fail loudly up front instead.
        if len(params) > 0 and not any(p != params[0] for p in params):
            raise ValueError(
                "need at least two distinct %s parameters to build pairs"
                % what)
        partners = []
        for current in params:
            while True:
                candidate = params[random.randint(0, len(params) - 1)]
                if candidate != current:
                    break
            partners.append(candidate)
        return partners

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match more than one pattern.
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    print("find parameter bindings for Persons")
    selectedPersonParams = {}
    for i in range(1, 15):
        factors = readfactors.getFactorsForQuery(i, personFactors)
        selectedPersonParams[i] = discoverparams.generate(factors)

    # Queries 13 and 14 take two person parameters each. Generate pairs
    secondPerson = {}
    for i in [13, 14]:
        secondPerson[i] = pick_distinct_partners(selectedPersonParams[i], "person")

    # find country parameters for Query 3 and 11
    print("find parameter bindings for Countries")
    selectedCountryParams = {}
    for i in [3, 11]:
        factors = readfactors.getCountryFactorsForQuery(i, countryFactors)
        selectedCountryParams[i] = discoverparams.generate(factors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[i],
                         len(selectedPersonParams[i]), "country")

    # Query 3 needs two countries as parameters. Generate the second one.
    secondCountry = pick_distinct_partners(selectedCountryParams[3], "country")

    # find tag parameters for Query 6: sample both parts of the split factors
    # so that together they make up about 10% of all tags
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    selectedTagParams = {}
    for i in [6]:
        selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion)
        selectedTagParams[i].extend(
            discoverparams.generate(rightTagFactors, portion=rightPortion))
        pad_with_repeats(selectedTagParams[i],
                         len(selectedPersonParams[i]), "tag")

    # generate tag type parameters for Query 12
    selectedTagTypeParams = {}
    for i in [12]:
        selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1)
        # make sure there are as many tag type parameters as person parameters
        pad_with_repeats(selectedTagTypeParams[i],
                         len(selectedPersonParams[i]), "tag type")

    # find time parameters for Queries 2,3,4,5,9
    # (Query 11 gets its own timestamp below)
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
    }

    print("find parameter bindings for Timestamps")
    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])

    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the HS parameter: a value in 1..12 paired
    # with its successor, wrapping from 12 back to 1
    HS = []
    for _ in selectedPersonParams[10]:
        HS0 = random.randint(1, 12)
        HS1 = 1 if HS0 == 12 else HS0 + 1
        HS.append((HS0, HS1))

    # Query 1 takes first name as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    for i in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "interactive_%d_param.txt" % (i))
        if i != 13 and i != 14:  # these two queries take two Persons as parameters
            csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "Person")
        csvWriters[i] = csvWriter

    # add output for Time parameter
    for i in timeSelectionInput:
        if i == 3 or i == 4:
            csvWriters[i].registerHandler(
                handleTimeDurationParam, selectedTimeParams[i], "Date0|Duration")
        else:
            csvWriters[i].registerHandler(
                handleTimeParam, selectedTimeParams[i], "Date0")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name")
    csvWriters[3].registerHandler(
        handlePairCountryParam,
        zip(selectedCountryParams[3], secondCountry), "Country1|Country2")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "Tag")
    csvWriters[10].registerHandler(handleHSParam, HS, "HS0")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "Country")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "Year")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "TagType")
    csvWriters[13].registerHandler(
        handlePairPersonParam,
        zip(selectedPersonParams[13], secondPerson[13]), "Person1|Person2")
    csvWriters[14].registerHandler(
        handlePairPersonParam,
        zip(selectedPersonParams[14], secondPerson[14]), "Person1|Person2")

    for j in csvWriters:
        csvWriters[j].writeCSV()
def main(argv=None):
    """Generate BI query parameters from precomputed factor files.

    Factor files are collected from the input directory and loaded through
    ``readfactors.load``.  Country, tag and tag-class counts are sorted in
    descending order, monthly post-count thresholds are derived from the
    total number of posts, and the resulting parameter candidates are handed
    to the ``serialize_q*`` helpers together with the output directory.

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output dir>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match more than one pattern.
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    # Person ids in a reproducible pseudo-random order.
    persons = [key for key, _ in personFactors.values.items()]
    random.seed(1988)
    random.shuffle(persons)

    # NOTE: these are aliases, so the loaded factor lists are sorted in place.
    country_sample = countryFactors
    country_sample.sort(key=lambda x: x[1], reverse=True)

    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Months are approximated as four non-empty weeks each.
    # NOTE: fewer than four non-empty weeks makes the divisor zero.
    empty_weeks = sum(1 for week in week_posts if week[1] == 0)
    non_empty_weeks = len(week_posts) - empty_weeks
    posts_per_month = total_posts // (non_empty_weeks // 4)
    post_lower_threshold = posts_per_month * 0.8
    post_upper_threshold = posts_per_month * 1.2
    post_months = post_month_params(week_posts, post_lower_threshold, post_upper_threshold)

    # the lower bound is inclusive and the upper bound is exclusive
    path_bounds = enumerate_path_bounds(3, 6, 2)
    language_codes = prob_language_codes()
    post_lengths = prob_post_lengths()

    serialize_q3(outdir, post_months)  # new: 2
    serialize_q14(outdir, post_months)  # new: 9
    serialize_q1(
        outdir,
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q18(
        outdir,
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts),
        post_lengths,
        language_codes)  # new: 12
    serialize_q10(
        outdir,
        key_params(tag_posts, total_posts // 900, total_posts // 600),
        post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))  # new: 8
    serialize_q4(
        outdir,
        key_params(tagclass_posts, total_posts // 20, total_posts // 10),
        key_params(country_sample, total_posts // 150, total_posts // 50))  # new: 3
    serialize_q5(outdir, key_params(country_sample, total_posts // 200, total_posts // 100))  # new: 4
    serialize_q6(outdir, key_params(tag_posts, total_posts // 1300, total_posts // 900))  # new: 5
    serialize_q7(outdir, key_params(tag_posts, total_posts // 900, total_posts // 600))  # new: 6
    serialize_q8(outdir, key_params(tag_posts, total_posts // 600, total_posts // 300))  # new: 7
    serialize_q16(
        outdir,
        persons,
        key_params(tagclass_posts, total_posts // 30, total_posts // 10),
        key_params(country_sample, total_posts // 80, total_posts // 20),
        path_bounds)  # new: 10
    serialize_q17(outdir, key_params(country_sample, total_posts // 200, total_posts // 100))  # new: 11
    serialize_q21(outdir, key_params(country_sample, total_posts // 200, total_posts // 100))  # new: 13
    serialize_q22(outdir, key_params(country_sample, total_posts // 120, total_posts // 40))  # new: 14
    serialize_q25(outdir, persons, post_months)  # new: 15
def main(argv=None):
    """Generate BI query parameters from precomputed factor files.

    Factor files (``*factors.txt``) and friend lists are collected from the
    input directory and loaded through ``readfactors.load``.  Country, tag and
    tag-class counts are sorted in descending order, post-count thresholds
    are derived from the total number of posts, and the resulting parameter
    candidates are handed to the ``serialize_q*`` helpers.

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    # NOTE: computed but not passed to any serializer in this variant.
    outdir = argv[2] + "/"

    factorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match both patterns.
        if name.endswith("factors.txt"):
            factorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(factorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    country_sample = [[key, value.getValue("p")]
                      for key, value in countryFactors.values.iteritems()]
    country_sample.sort(key=lambda x: x[1], reverse=True)

    # NOTE: these are aliases, so the loaded factor lists are sorted in place.
    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Country sets around 10% (+/- 10%) of all posts.
    country_lower_threshold = 0.1 * total_posts * 0.9
    country_upper_threshold = 0.1 * total_posts * 1.1
    country_sets = country_sets_params(
        country_sample, country_lower_threshold, country_upper_threshold, 4)

    # Day ranges holding roughly 10% (+/- 10%) of all posts.
    day_lower_threshold = 0.1 * total_posts * 0.9
    day_upper_threshold = 0.1 * total_posts * 1.1
    post_day_ranges = post_date_range_params(
        week_posts, day_lower_threshold, day_upper_threshold)

    # Monthly thresholds: the first field of the last histogram entry is
    # divided by 7 and then by 4 to obtain the divisor for total_posts.
    posts_per_month = total_posts / (week_posts[-1][0] / 7 / 4)
    month_lower_threshold = posts_per_month * 0.8
    month_upper_threshold = posts_per_month * 1.2
    post_months = post_month_params(
        week_posts, month_lower_threshold, month_upper_threshold)

    serialize_q2(country_sets, post_day_ranges)
    serialize_q3(post_months)
    # Query 14 uses twice the monthly thresholds.
    serialize_q14(post_month_params(
        week_posts, month_lower_threshold * 2, month_upper_threshold * 2))
    serialize_q1(post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q12(post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q18(post_date_right_open_range_params(week_posts, 0.3 * total_posts, 0.6 * total_posts))
    serialize_q4(
        key_params(tagclass_posts, total_posts / 20, total_posts / 10),
        key_params(country_sample, total_posts / 120, total_posts / 70))
    serialize_q5(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q6(key_params(tag_posts, total_posts / 1300, total_posts / 900))
    serialize_q7(key_params(tag_posts, total_posts / 900, total_posts / 600))
    serialize_q8(key_params(tag_posts, total_posts / 600, total_posts / 300))
    serialize_q9(key_params(tagclass_posts, 6000, 25000))
    serialize_q10(key_params(tag_posts, total_posts / 900, total_posts / 600))
    serialize_q13(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q15(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q16(
        key_params(tagclass_posts, total_posts / 30, total_posts / 10),
        key_params(country_sample, total_posts / 110, total_posts / 70))
    serialize_q17(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q19(key_params(tagclass_posts, total_posts / 60, total_posts / 10))
    serialize_q21(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q22(key_params(country_sample, total_posts / 120, total_posts / 40))
    serialize_q23(key_params(country_sample, total_posts / 200, total_posts / 100))
    serialize_q24(key_params(tagclass_posts, total_posts / 140, total_posts / 5))
def main(argv=None):
    """Generate parameter bindings for interactive queries 1-14 as CSV files.

    Factor files are collected from the input directory, loaded through
    ``readfactors.load`` and turned into person, country, tag, tag-class,
    time, month and first-name parameters.  One ``interactive_<n>_param.txt``
    file per query is then written into the output directory through
    ``CSVSerializer``.

    Queries 13 and 14 are emitted as pairs of distinct persons.

    Args:
        argv: Command line as a list ``[prog, input_dir, output_dir]``.
            Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments are supplied, otherwise None.

    Raises:
        ValueError: If a parameter list that must be padded is empty, or if a
            list that is paired with itself holds no two distinct values (the
            pairing loop could never terminate in that case).
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    def pad_with_repeats(params, target_len, what):
        # Grow params in place to target_len entries by repeating randomly
        # chosen entries from the part that was generated originally.
        oldlen = len(params)
        missing = target_len - oldlen
        if missing > 0 and oldlen == 0:
            raise ValueError(
                "no %s parameters were generated; cannot pad to %d entries"
                % (what, target_len))
        params.extend([params[random.randint(0, oldlen - 1)]
                       for _ in range(missing)])

    def pick_distinct_partners(params, what):
        # For every entry pick a random entry of the same list that differs
        # from it.  Without two distinct values the retry loop below would
        # never end, so fail loudly up front instead.
        if len(params) > 0 and not any(p != params[0] for p in params):
            raise ValueError(
                "need at least two distinct %s parameters to build pairs"
                % what)
        partners = []
        for current in params:
            while True:
                candidate = params[random.randint(0, len(params) - 1)]
                if candidate != current:
                    break
            partners.append(candidate)
        return partners

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        # Independent checks: a file name may match more than one pattern.
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    selectedPersonParams = {}
    for i in range(1, 15):
        factors = readfactors.getFactorsForQuery(i, personFactors)
        selectedPersonParams[i] = discoverparams.generate(factors)

    # Queries 13 and 14 take two person parameters each. Generate pairs
    secondPerson = {}
    for i in [13, 14]:
        secondPerson[i] = pick_distinct_partners(selectedPersonParams[i], "person")

    # find country parameters for Query 3 and 11
    selectedCountryParams = {}
    for i in [3, 11]:
        selectedCountryParams[i] = discoverparams.generate(countryFactors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[i],
                         len(selectedPersonParams[i]), "country")

    # Query 3 needs two countries as parameters. Generate the second one.
    secondCountry = pick_distinct_partners(selectedCountryParams[3], "country")

    # find tag parameters for Query 6: sample both parts of the split factors
    # so that together they make up about 10% of all tags
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    selectedTagParams = {}
    for i in [6]:
        selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion)
        selectedTagParams[i].extend(
            discoverparams.generate(rightTagFactors, portion=rightPortion))
        pad_with_repeats(selectedTagParams[i],
                         len(selectedPersonParams[i]), "tag")

    # generate tag type parameters for Query 12
    selectedTagTypeParams = {}
    for i in [12]:
        selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1)
        # make sure there are as many tag type parameters as person parameters
        pad_with_repeats(selectedTagTypeParams[i],
                         len(selectedPersonParams[i]), "tag type")

    # find time parameters for Queries 2,3,4,5,9
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
    }

    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])

    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the month parameter
    months = [random.randint(1, 12) for _ in selectedPersonParams[10]]

    # Query 1 takes first name as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    for i in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "interactive_%d_param.txt" % (i))
        if i != 13 and i != 14:  # these two queries take two Persons as parameters
            csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "personId")
        csvWriters[i] = csvWriter

    # add output for Time parameter
    for i in timeSelectionInput:
        if i == 3 or i == 4:
            csvWriters[i].registerHandler(
                handleTimeDurationParam, selectedTimeParams[i], "startDate|durationDays")
        elif i == 2 or i == 9:
            csvWriters[i].registerHandler(
                handleMaxTimeParam, selectedTimeParams[i], "maxDate")
        elif i == 5:
            csvWriters[i].registerHandler(
                handleMinTimeParam, selectedTimeParams[i], "minDate")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "firstName")
    csvWriters[3].registerHandler(
        handlePairCountryParam,
        list(zip(selectedCountryParams[3], secondCountry)),
        "countryXName|countryYName")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "tagName")
    csvWriters[10].registerHandler(handleMonthParam, months, "month")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "countryName")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "workFromYear")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "tagClassName")
    csvWriters[13].registerHandler(
        handlePairPersonParam,
        list(zip(selectedPersonParams[13], secondPerson[13])),
        "person1Id|person2Id")
    csvWriters[14].registerHandler(
        handlePairPersonParam,
        list(zip(selectedPersonParams[14], secondPerson[14])),
        "person1Id|person2Id")

    for j in csvWriters:
        csvWriters[j].writeCSV()
def main(argv=None):
    """Load precomputed factor files and hand parameter candidates to the serializers.

    ``argv`` defaults to ``sys.argv``.  ``argv[1]`` is the input directory
    that is scanned for factor files and ``argv[2]`` is the output directory
    passed to every ``serialize_q*`` call.

    Returns 1 after printing a usage line when fewer than two arguments are
    given; otherwise the function falls off the end and returns None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output dir>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"

    # Bucket the directory listing by file-name pattern.  The three tests are
    # independent, so a single name may end up in more than one bucket.
    entries = os.listdir(indir)
    activityFactorFiles = [indir + name for name in entries
                           if name.endswith("activityFactors.txt")]
    personFactorFiles = [indir + name for name in entries
                         if name.endswith("personFactors.txt")]
    friendsFiles = [indir + name for name in entries
                    if name.startswith("m0friendList")]

    # read precomputed counts from files
    loaded = readfactors.load(personFactorFiles, activityFactorFiles, friendsFiles)
    (personFactors, countryFactors, tagFactors, tagClassFactors,
     _nameFactors, _givenNames, _ts, postsHisto) = loaded
    week_posts = convert_posts_histo(postsHisto)

    # Person keys, shuffled with a fixed seed so repeated runs give the same order.
    persons = [person_id for person_id, _ in personFactors.values.iteritems()]
    random.seed(1988)
    random.shuffle(persons)

    # [key, "p" value] pairs per country, largest value first.
    country_sample = [[country_key, factor.getValue("p")]
                      for country_key, factor in countryFactors.values.iteritems()]
    country_sample.sort(key=lambda entry: entry[1], reverse=True)

    # The loaded lists are sorted in place; the *_posts names are aliases.
    tagClassFactors.sort(key=lambda entry: entry[1], reverse=True)
    tagclass_posts = tagClassFactors

    tagFactors.sort(key=lambda entry: entry[1], reverse=True)
    tag_posts = tagFactors

    total_posts = sum(count for _, count in tag_posts)
    # NOTE(review): person_sum is computed but never read further down.
    person_sum = sum(count for _, count in country_sample)

    # Day ranges are accepted when they fall within +/-10% of a tenth of all posts.
    tenth_of_posts = 0.1 * total_posts
    post_day_ranges = post_date_range_params(week_posts,
                                             tenth_of_posts * 0.9,
                                             tenth_of_posts * 1.1)

    bad_words = ['Augustine', 'William', 'James', 'with', 'Henry', 'Robert',
                 'from', 'Pope', 'Hippo', 'album', 'David', 'has', 'one',
                 'also', 'Green', 'which', 'that']

    # Weeks whose second field is zero do not count towards the month estimate.
    empty_weeks = sum(1 for week in week_posts if week[1] == 0)
    non_empty_weeks = len(week_posts) - empty_weeks

    # NOTE(review): with integer operands under Python 2 both divisions
    # truncate, and fewer than 4 non-empty weeks would then divide by zero --
    # confirm this is acceptable for the expected inputs.
    posts_per_four_weeks = total_posts / (non_empty_weeks / 4)
    post_lower_threshold = posts_per_four_weeks * 0.8
    post_upper_threshold = posts_per_four_weeks * 1.2
    post_months = post_month_params(week_posts, post_lower_threshold, post_upper_threshold)

    path_bounds = enumerate_path_bounds(3, 9, 2)
    language_codes = prob_language_codes()
    post_lengths = prob_post_lengths()

    # Shorthands for the repeated key_params(...) calls.  Each invocation still
    # performs a fresh call, exactly where the inline expression used to be.
    def country_keys(lower_divisor, upper_divisor):
        return key_params(country_sample,
                          total_posts / lower_divisor,
                          total_posts / upper_divisor)

    def tag_keys(lower_divisor, upper_divisor):
        return key_params(tag_posts,
                          total_posts / lower_divisor,
                          total_posts / upper_divisor)

    def tagclass_keys(lower_divisor, upper_divisor):
        return key_params(tagclass_posts,
                          total_posts / lower_divisor,
                          total_posts / upper_divisor)

    def open_date_ranges():
        return post_date_right_open_range_params(week_posts,
                                                 0.3 * total_posts,
                                                 0.6 * total_posts)

    serialize_q2(outdir, country_keys(200, 100), post_day_ranges)  # TODO determine constants
    serialize_q3(outdir, post_months)
    serialize_q14(outdir, post_month_params(week_posts,
                                            post_lower_threshold * 2,
                                            post_upper_threshold * 2))

    serialize_q1(outdir, open_date_ranges())
    serialize_q12(outdir, open_date_ranges())
    serialize_q18(outdir, open_date_ranges(), post_lengths, language_codes)
    serialize_q10(outdir, tag_keys(900, 600), open_date_ranges())

    serialize_q4(outdir, tagclass_keys(20, 10), country_keys(120, 70))
    serialize_q5(outdir, country_keys(200, 100))
    serialize_q6(outdir, tag_keys(1300, 900))
    serialize_q7(outdir, tag_keys(900, 600))
    serialize_q8(outdir, tag_keys(600, 300))
    # Query 9 uses absolute bounds rather than fractions of total_posts.
    serialize_q9(outdir, key_params(tagclass_posts, 6000, 25000))
    serialize_q13(outdir, country_keys(200, 100))
    serialize_q15(outdir, country_keys(200, 100))
    serialize_q16(outdir, persons, tagclass_keys(30, 10), country_keys(80, 20), path_bounds)
    serialize_q17(outdir, country_keys(200, 100))
    serialize_q19(outdir, tagclass_keys(60, 10))
    serialize_q21(outdir, country_keys(200, 100))
    serialize_q22(outdir, country_keys(120, 40))
    serialize_q23(outdir, country_keys(200, 100))
    serialize_q24(outdir, tagclass_keys(140, 5))
    serialize_q25(outdir, persons, post_months)  # TODO: Refine
    serialize_q20(outdir, tagclass_keys(20, 2))
    serialize_q11(outdir, country_keys(80, 20), bad_words)