# Example #1
0
# Directory that is listed below; every file in it is loaded and reduced to
# one feature vector.
DIR = 'city_pickles'
# Length of the per-city topic accumulator list created further down.
# The original author flagged this value to be double-checked.
NUM_TOPICS = 30 #double check
# Path of the CSV file the meta feature rows are written to.
DEST = 'meta.csv'

# Maps a city key (the first seven characters of a file name in DIR) to the
# feature vector computed for that file.
city_features = {}


def CreateMetaVector(data):
  """Average the title length, body length and image count over posts.

  Args:
    data: a sized collection of posts; each post is indexed with the keys
      'heading', 'body' and 'images'.

  Returns:
    A three-element list of floats:
    [mean title length, mean body length, mean image count].
    An empty collection yields [0.0, 0.0, 0.0] instead of raising
    ZeroDivisionError.
  """
  count = len(data)
  if count == 0:
    # Nothing to average; avoid dividing by zero below.
    return [0.0, 0.0, 0.0]

  # Float accumulators keep the divisions below true divisions on Python 2.
  titleLength = 0.0
  bodyLength = 0.0
  numImages = 0.0
  for post in data:
    # str() lets non-string headings/bodies (e.g. None) be measured as well.
    titleLength += len(str(post['heading']))
    bodyLength += len(str(post['body']))
    numImages += len(post['images'])
  return [titleLength / count, bodyLength / count, numImages / count]

# Pass 1: compute the meta feature vector for every file found under DIR.
for filename in os.listdir(DIR):
  # The leading seven characters of the file name are used as the city key.
  city = filename[:7]
  with open('/'.join((DIR, filename))) as source:
    city_features[city] = CreateMetaVector(loadp(source))

# Pass 2: write one CSV row per city -- the city key followed by its features.
with open(DEST, 'wb') as csvdata:
  writer = csv.writer(csvdata, delimiter=',')
  for city in city_features:
    vec = city_features[city]
    fvec = [city]
    fvec.extend(vec)
    writer.writerow(fvec)
# Two positional arguments are required: a destination name and an integer
# option.
if len(sys.argv) < 3:
  # sys.exit(message) writes the message to stderr and exits with status 1,
  # so a usage error is no longer reported as success on stdout.
  sys.exit('input destination name and option')

dest = sys.argv[1]
opt = int(sys.argv[2])

field = 'categoryClassName'

# One category name per line of <field>.txt, surrounding whitespace stripped.
# The with-block closes the file instead of leaving the handle dangling.
with open(field+'.txt') as catFile:
  catNames = [line.strip() for line in catFile]

# Compute the category feature vector for every file found under DIR.
# NOTE(review): the original comment said these features are "appended" to
# each city's vector, but the assignment below replaces whatever
# city_features already holds for that city -- confirm which is intended.
for filename in os.listdir(DIR):
  # The leading seven characters of the file name are used as the city key.
  city = filename[:7]
  with open('/'.join((DIR, filename))) as source:
    city_features[city] = CreateCatVector(loadp(source))

#append the Topic Modeling features to each city's vector
# For each category name, read csvs/<category>.csv and average the per-row
# topic values by city, where the city is the first CSV column.
# NOTE(review): nothing in the lines shown here copies the averages into
# city_features; presumably that happens in code that follows -- confirm.
for cat in catNames:
  # Reset per category: city -> {'count': rows seen, 'topics': running sums}.
  citiesTopicModels = {}
  source = 'csvs/%s.csv'%cat
  with open(source, 'rb') as postTopics:
    reader = csv.reader(postTopics, delimiter=',')
    for line in reader:
      city = line[0]
      if city not in citiesTopicModels:
        citiesTopicModels[city] = {'count':0, 'topics': [0.0]*NUM_TOPICS}
      citiesTopicModels[city]['count'] +=1
      # Element-wise add this row's values (columns after the city) to the
      # running sums. zip() stops at the shorter sequence, so a row with fewer
      # than NUM_TOPICS values shortens the stored list.
      citiesTopicModels[city]['topics'] = [float(a)+float(b) for a,b in zip(citiesTopicModels[city]['topics'],line[1:])]
  # Turn each city's sums into means by dividing by its row count.
  for city in citiesTopicModels:
    citiesTopicModels[city]['topics'] = [x/citiesTopicModels[city]['count'] for x in citiesTopicModels[city]['topics']]