DIR = 'city_pickles'
NUM_TOPICS = 30  # TODO: double check this value
DEST = 'meta.csv'

# Maps a city key (the first seven characters of a file name found in DIR)
# to that city's feature vector.
city_features = {}


def CreateMetaVector(data):
    """Return the per-post averages of title length, body length and image count.

    Args:
        data: A sized iterable of posts. Each post is indexed with the keys
            'images', 'heading' and 'body'; the 'images' value must support
            len(), while heading and body are measured after str().

    Returns:
        A list of three floats:
        [mean title length, mean body length, mean number of images].
        When ``data`` is empty the result is [0.0, 0.0, 0.0] instead of a
        ZeroDivisionError, since there is nothing to average.
    """
    num_posts = len(data)
    if num_posts == 0:
        return [0.0, 0.0, 0.0]

    # Float accumulators keep the divisions below true divisions on Python 2.
    image_total = 0.0
    title_total = 0.0
    body_total = 0.0
    for post in data:
        image_total += len(post['images'])
        title_total += len(str(post['heading']))
        body_total += len(str(post['body']))

    return [
        title_total / num_posts,
        body_total / num_posts,
        image_total / num_posts,
    ]


for filename in os.listdir(DIR):
    # The first seven characters of the file name are used as the city key.
    city = filename[0:7]
    # NOTE(review): loadp is defined outside this section; presumably a
    # pickle loader -- confirm before changing the file mode used here.
    with open(os.path.join(DIR, filename)) as source:
        city_features[city] = CreateMetaVector(loadp(source))

# Write feature vectors to csv file: one row per city, city key first.
# Binary mode is what the Python 2 csv module expects for its file object.
with open(DEST, 'wb') as csvdata:
    writer = csv.writer(csvdata, delimiter=',')
    for city, vec in city_features.iteritems():
        writer.writerow([city] + vec)
# Command line: <destination name> <option>. Both are required.
if len(sys.argv) < 3:
    print('input destination name and option')
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(1)

dest = sys.argv[1]
opt = int(sys.argv[2])
field = 'categoryClassName'

# One category name per line of '<field>.txt'; the with-block makes sure the
# file handle is closed once the names have been read.
with open(field + '.txt') as names_file:
    catNames = [line.strip() for line in names_file]

# Build the category feature vector for each city. This assigns (and so
# replaces) the city's entry in city_features.
for filename in os.listdir(DIR):
    # The first seven characters of the file name are used as the city key.
    city = filename[0:7]
    with open(DIR + '/' + filename) as source:
        city_features[city] = CreateCatVector(loadp(source))

# Topic Modeling features: for every category, average the topic values of
# all rows belonging to the same city. Each row of 'csvs/<category>.csv' is
# read as: city key in the first column, topic values in the remaining ones.
# NOTE(review): citiesTopicModels is reset for every category and nothing in
# this part of the script reads it afterwards -- confirm it is consumed
# further down.
for cat in catNames:
    citiesTopicModels = {}
    source = 'csvs/%s.csv' % cat
    with open(source, 'rb') as postTopics:
        reader = csv.reader(postTopics, delimiter=',')
        for line in reader:
            if not line:
                # csv.reader yields [] for a blank line; there is no city
                # key to index, so skip it instead of raising IndexError.
                continue
            city = line[0]
            if city not in citiesTopicModels:
                citiesTopicModels[city] = {
                    'count': 0,
                    'topics': [0.0] * NUM_TOPICS,
                }
            entry = citiesTopicModels[city]
            entry['count'] += 1
            # zip() stops at the shorter input, so a row carrying fewer than
            # NUM_TOPICS values shortens the running sums for that city.
            entry['topics'] = [
                total + float(value)
                for total, value in zip(entry['topics'], line[1:])
            ]
    # Turn the per-city sums into per-city means.
    for city in citiesTopicModels:
        entry = citiesTopicModels[city]
        entry['topics'] = [x / entry['count'] for x in entry['topics']]