def setattr(self, attr, v):
    """Assign ``attr`` on the instance, normalising ``created_at`` first.

    For ``created_at`` the stored value is whatever ``util.time2epoch``
    yields (or the int that was passed in), and a companion attribute
    ``c_at_or`` receives the matching string form. Every other attribute
    is stored unchanged.
    """
    if attr != 'created_at':
        self.__setattr__(attr, v)
        return

    if isinstance(v, int):
        # Already an epoch value: only the string form has to be derived.
        text_form = util.time2str(v)
    elif isinstance(v, datetime):
        v = util.time2epoch(v)
        text_form = util.time2str(v)
    else:
        # Anything else is treated as the original string representation.
        # Original author's note: the creation date is converted to an
        # integer epoch based on UTC+8.
        text_form = v
        v = util.time2epoch(v)

    self.__setattr__('c_at_or', text_form)
    self.__setattr__(attr, v)
def dig_user(fpath):
    """Group one status dump by ISO week and write a per-week analysis file.

    Reads the JSON collection of statuses stored at ``fpath``, joins the
    ``text`` of all statuses that share an ISO year/week (newline separated,
    in input order), feeds each weekly text to
    ``wenxin.TextMind().process_paragraph`` and writes one tab-separated
    line per week -- the ``YYYYWW`` key followed by ``repr`` of the result --
    in ascending key order to ``<output_dir>/<uid>.txt``.

    ``uid`` is the ``user.id`` of the last status read, so the file is
    presumably expected to hold a single user's statuses (not verified here).

    An empty status list produces no output file and returns ``None``;
    previously it failed with a NameError because ``uid`` was never bound.
    """
    with codecs.open(fpath, 'r', encoding='utf-8-sig') as fp:
        # The codecs stream already yields decoded text, so json needs no
        # ``encoding`` argument (and Python 3.9+ no longer accepts one).
        statuses = json.load(fp)

    if not statuses:
        return

    weekly = {}
    uid = None
    for s in statuses:
        uid = s['user']['id']
        created = datetime.fromtimestamp(util.time2epoch(s['created_at']))
        year, week = created.isocalendar()[:2]
        key = "%04d%02d" % (year, week)
        # Collect the pieces and join once per week instead of repeatedly
        # concatenating ever-growing strings.
        weekly.setdefault(key, []).append(s['text'])

    output_file = '%s/%s.txt' % (output_dir, uid)
    with codecs.open(output_file, 'w', encoding='ascii') as out:
        for key in sorted(weekly):
            # A fresh analyser per week, as before; whether TextMind keeps
            # state between calls is not visible from here.
            t = wenxin.TextMind()
            paragraph = '\n'.join(weekly[key]).encode('utf-8')
            r = t.process_paragraph(paragraph, encoding='utf-8')
            out.write("%s\t%s\n" % (key, repr(r)))
def setattr(self, attr, v):
    """Store ``v`` under ``attr``, with special handling for ``created_at``.

    ``created_at`` accepts an int, a ``datetime`` or any other value (treated
    as the original string). The attribute itself ends up holding the epoch
    value, while ``c_at_or`` holds the string counterpart. All other
    attributes are passed straight through to ``__setattr__``.
    """
    if attr == 'created_at':
        if isinstance(v, int):
            # Epoch given directly; derive its string form.
            original_text = util.time2str(v)
        elif isinstance(v, datetime):
            v = util.time2epoch(v)
            original_text = util.time2str(v)
        else:
            # Keep the incoming string and convert it to an epoch value
            # (per the original note: an integer epoch based on UTC+8).
            original_text, v = v, util.time2epoch(v)
        self.__setattr__('c_at_or', original_text)

    self.__setattr__(attr, v)
def setattr(self, attr, v):
    """Assign ``attr`` on the instance after normalising a few known fields.

    * ``province``, ``city`` and ``verified_type`` are stored as ``str``
      (verification type and province/city IDs are kept as strings).
    * ``idstr`` given as ``None`` falls back to ``str(self.id)`` when the
      instance already has a non-None ``id``; otherwise it stays ``None``.
    * ``created_at`` is stored as the epoch value, and the matching string
      form is stored alongside it in ``created_at_or``.

    Any other attribute is stored unchanged.
    """
    if attr in ['province', 'city', 'verified_type'] and not isinstance(v, str):
        v = str(v)
    elif attr == 'idstr' and v is None:
        # The previous code called str(id) on the bare name ``id``, which
        # resolves to the builtin function and stored
        # "<built-in function id>". Use the instance's own id instead.
        own_id = getattr(self, 'id', None)
        if own_id is not None:
            v = str(own_id)
    elif attr == 'created_at':
        str_original = None
        if isinstance(v, int):
            # Epoch given directly; only the string form must be derived.
            str_original = util.time2str(v)
        elif isinstance(v, datetime):
            v = util.time2epoch(v)
            str_original = util.time2str(v)
        else:
            # Treated as the original string; per the original note it is
            # converted to an integer epoch based on UTC+8.
            str_original = v
            v = util.time2epoch(v)
        self.__setattr__('created_at_or', str_original)
    self.__setattr__(attr, v)
def setattr(self, attr, v):
    """Store ``v`` under ``attr``, normalising selected fields first.

    ``province``, ``city`` and ``verified_type`` are coerced to ``str``
    (verification type and province/city IDs are kept as strings). A ``None``
    ``idstr`` is replaced by ``str(self.id)`` when the instance has a
    non-None ``id`` and left as ``None`` otherwise. ``created_at`` is stored
    as the epoch value with its string form mirrored into ``created_at_or``.
    All other attributes are stored as given.
    """
    string_fields = ['province', 'city', 'verified_type']
    if attr in string_fields and not isinstance(v, str):
        v = str(v)
    elif attr == 'idstr' and v is None:
        # Bug fix: the bare name ``id`` is the builtin function, so the old
        # str(id) produced "<built-in function id>" rather than this
        # object's id.
        own_id = getattr(self, 'id', None)
        if own_id is not None:
            v = str(own_id)
    elif attr == 'created_at':
        if isinstance(v, int):
            # Already an epoch value.
            str_original = util.time2str(v)
        elif isinstance(v, datetime):
            v = util.time2epoch(v)
            str_original = util.time2str(v)
        else:
            # Original string form; the original note says the date is
            # converted to an integer epoch based on UTC+8.
            str_original = v
            v = util.time2epoch(v)
        self.__setattr__('created_at_or', str_original)
    self.__setattr__(attr, v)
def get_statuses_groupby_week(uid):
    """Return the user's status texts grouped by ISO year and week.

    Fetches the ``created_at`` and ``text`` fields through
    ``dSource.get_statuses`` and builds a ``defaultdict(str)`` whose keys are
    ``"%04d%02d"`` renderings of the ISO year and week. Texts that land in
    the same week are joined with a newline in the order they are yielded.
    """
    grouped = defaultdict(str)
    for status in dSource.get_statuses(uid, ['created_at', 'text']):
        moment = datetime.fromtimestamp(util.time2epoch(status.get('created_at')))
        iso_year, iso_week = moment.isocalendar()[:2]
        bucket = "%04d%02d" % (iso_year, iso_week)
        body = status.get('text')
        # .get() is used on purpose: it does not create an empty entry the
        # way indexing a defaultdict would.
        previous = grouped.get(bucket)
        grouped[bucket] = body if previous is None else previous + '\n' + body
    return grouped
def get_statuses_groupby_week(uid):
    """Group a user's status texts into per-week strings.

    Each status obtained from ``dSource.get_statuses(uid, fields)`` is
    assigned to a key made of its ISO year (4 digits) and ISO week
    (2 digits). The returned ``defaultdict(str)`` maps every key to the texts
    of that week, newline separated, in iteration order.
    """
    wanted = ['created_at', 'text']
    by_week = defaultdict(str)
    for record in dSource.get_statuses(uid, wanted):
        epoch = util.time2epoch(record.get('created_at'))
        calendar = datetime.fromtimestamp(epoch).isocalendar()
        week_key = "%04d%02d" % (calendar[0], calendar[1])
        content = record.get('text')
        so_far = by_week.get(week_key, None)
        if so_far is None:
            # First text seen for this week.
            by_week[week_key] = content
        else:
            by_week[week_key] = so_far + '\n' + content
    return by_week
def get_status(uid):
    """Load one user's statuses via ``cur`` and group the texts by ISO week.

    Executes a query on the module-level cursor ``cur`` for the
    ``created_at`` and ``text`` columns of ``sina_statuses`` rows matching
    ``uid``, then returns a ``defaultdict(str)`` keyed by ``"%04d%02d"`` of
    ISO year and week, with the texts of each week joined by newlines.
    Rows are read with ``.get``, so the cursor presumably yields dict-like
    rows (not verified here).
    """
    # NOTE(review): the query is assembled with string formatting; if ``uid``
    # can come from untrusted input, switch to the driver's parameter binding.
    cur.execute('select created_at, text from sina_statuses where user_id=%s' % uid)
    weekly = defaultdict(str)
    for row in cur:
        when = datetime.fromtimestamp(util.time2epoch(row.get('created_at')))
        iso_year, iso_week = when.isocalendar()[:2]
        slot = "%04d%02d" % (iso_year, iso_week)
        message = row.get('text')
        existing = weekly.get(slot)
        weekly[slot] = message if existing is None else existing + '\n' + message
    return weekly
if __name__ == '__main__':
    # For every user returned by load_user(), write a group to
    # compare_user_path: first the user's own row, then one row per user
    # returned by pick_user() for the same gender and a creation-date window
    # of +/- 2 days, followed by a blank line.
    users = load_user()
    max_users = float('inf')  # lower this to cap the number of groups written
    # The context manager closes (and flushes) the report even if a lookup
    # fails part-way; previously the file handle was never closed.
    with codecs.open(compare_user_path, mode='w', encoding='utf-8') as fp:
        for index, (u, (name, gender, created_at)) in enumerate(users.iteritems(), 1):
            if index > max_users:
                break
            t0 = datetime.strptime(created_at, '%Y/%m/%d')
            t_m = t0 - timedelta(days=2)  # start of the window
            t_p = t0 + timedelta(days=2)  # end of the window
            fp.write('GROUP%s\t%s\t%s\t%s\t%s\n' % (index, u, name, gender, created_at))
            results = pick_user(gender, t_m.strftime('%Y/%m/%d'), t_p.strftime('%Y/%m/%d'))
            for r in results:
                # Re-render the candidate's stored creation string as a
                # local-time Y/m/d date.
                u_created_at = util.time2epoch(r.created_at_or)
                u_created_at = time.strftime("%Y/%m/%d", time.localtime(u_created_at))
                fp.write('GROUP%s\t%s\t%s\t%s\t%s\n' % (index, r.key, r.name, r.gender, u_created_at))
            fp.write('\n')