def save(self, *args, **kwargs):
    # Sanitize the post body.
    self.html = html_util.parse_html(self.content)

    # Tags must be added with the instance method; this is just for safety.
    self.tag_val = html_util.strip_tags(self.tag_val)

    # Top-level posts other than a question also carry their type as a tag.
    if self.is_toplevel and self.type != Post.QUESTION:
        required_tag = self.get_type_display()
        if required_tag not in self.tag_val:
            self.tag_val += "," + required_tag

    if not self.id:
        # This branch runs only once, upon object creation.

        # Inherit the title from the parent if none was given.
        if self.parent and not self.title:
            self.title = self.parent.title

        # Only comments may be added to a parent that is an answer or a comment.
        if self.parent and self.parent.type in (Post.ANSWER, Post.COMMENT):
            self.type = Post.COMMENT

        # Set the post type if it was left empty.
        if self.type is None:
            self.type = Post.COMMENT if self.parent else Post.FORUM

        self.title = self.parent.title if self.parent else self.title
        self.lastedit_user = self.author
        self.status = self.status or Post.PENDING
        self.creation_date = self.creation_date or general_util.now()
        self.lastedit_date = self.creation_date

        # A new answer also updates the timestamps on its parent.
        if self.type == Post.ANSWER and self.parent:
            self.parent.lastedit_date = self.lastedit_date
            self.parent.lastedit_user = self.lastedit_user
            self.parent.save()

    # Recompute the post reply count.
    self.update_reply_count()

    super(Post, self).save(*args, **kwargs)
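# A hedged usage sketch, placed outside the model for illustration and not
# taken from the source: the model and field names (Post, Post.QUESTION,
# Post.ANSWER, parent, author, content) are those used in save() above;
# the `user` object and the concrete values are illustrative.
question = Post.objects.create(title="How should reads be aligned?",
                               type=Post.QUESTION, author=user,
                               content="<p>Which aligner is recommended?</p>")
answer = Post(parent=question, type=Post.ANSWER, author=user,
              content="<p>Try bwa-mem.</p>")
# save() inherits the title from the parent, stamps lastedit_user and the
# dates, and touches the parent question's lastedit_* fields.
answer.save()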
# Imports reconstructed from usage; chinese() is assumed to come from a
# project-local stop-word helper.
from collections import defaultdict

from nltk.corpus import stopwords

from common.html_util import strip_tags
from common.persistence import from_pickle

job_data_file = '../crawlers/unified.pkl'
jobs = from_pickle(job_data_file)
# print(type(jobs))

# Skill tags of interest: a mix of technologies and Chinese role names
# (e.g. 自然语言处理 = NLP, 数据挖掘 = data mining, 产品经理 = product manager).
skills = [
    u'Python', u'自然语言处理', u'数据挖掘', u'搜索算法', u'精准推荐',
    u'用户研究员', u'交互设计师', u'.NET', u'Java', u'C', u'PHP', u'Ruby',
    u'Node.js', u'iOS', u'Android', u'Javascript', u'MongoDB',
    u'产品经理', u'APP设计师', u'UI设计师', u'数据分析师',
]

# Group the stripped job descriptions by skill tag.
jlist = jobs.values()
loaded = defaultdict(list)
for j in jlist:
    if j['skill_tag'] in skills:
        loaded[j['skill_tag']].append(strip_tags(j['desc']))
# print(len(loaded))
# print(len(loaded[u'Python']))
# for desc in loaded[u'Python'][:2]:
#     print(desc)

## start to analyze
# Combine Chinese and English stop words into a single list.
cn_stop_words = chinese()
cn_stop_words.append(u'一门')  # "a (discipline/language)"
cn_stop_words.append(u'任一')  # "any one"
en_stop_words = stopwords.words('english')
for esw in en_stop_words:
    if esw not in cn_stop_words:
        cn_stop_words.append(esw)
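# A sketch of how the combined stop list might be applied next; using
# jieba plus nltk's FreqDist here is an assumption, not from the source.
import jieba
from nltk import FreqDist

tokens = []
for desc in loaded[u'Python']:
    # Segment the description and drop stop words and whitespace tokens.
    tokens += [w for w in jieba.cut(desc)
               if w.strip() and w not in cn_stop_words]
freq = FreqDist(tokens)
for word, count in freq.most_common(20):
    print(u'{0}: {1}'.format(word, count))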
# Imports and the connection reconstructed from usage; the database file
# name follows the other scripts in this project.
import sqlite3 as sqlite
from collections import Counter

import jieba

from common.html_util import strip_tags
from common.persistence import to_pickle, from_pickle

con = sqlite.connect('lagou.db')
con.row_factory = sqlite.Row
cur = con.cursor()
cur.execute("select * from position")
# cur.execute("select * from position where name like '%Python%' "
#             "or name like '%机器学习%' or name like '%数据挖掘%' or name like '%自然语言处理%' "
#             "or name like '%C#%' or name like '%搜索算法%' or name like '%Hadoop%' "
#             "or name like '%交互设计师%' or name like '%数据分析师%' or name like '%Java%'")
rows = cur.fetchall()
print(len(rows))

n = 1000000  # cap on the number of postings to process
cnt_word_doc = Counter()  # document frequency: postings containing the word
cnt_words = Counter()     # term frequency: total occurrences of the word
for pos in rows[:n]:
    desc = strip_tags(pos['desc'])
    tokens = jieba.tokenize(desc, mode="search")
    words = [t[0] for t in tokens]
    word_counts = Counter(words)  # renamed from `cur`, which shadowed the cursor
    for k in word_counts:
        cnt_word_doc[k] += 1
        cnt_words[k] += word_counts[k]

to_pickle(cnt_words, 'cnt_words.pkl')
to_pickle(cnt_word_doc, 'cnt_word_doc.pkl')
# rows = from_pickle('pos_list.pkl')

for word, count in cnt_words.most_common(200):
    print(u'{0}: {1}'.format(word, count))
print('')
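# A hedged follow-up sketch, not from the source: rank words by a simple
# tf-idf style score computed from the two counters saved above.
import math

N = min(n, len(rows))  # number of postings actually processed
scores = {}
for w in cnt_words:
    if cnt_word_doc[w] > 1:  # ignore words seen in only one posting
        scores[w] = cnt_words[w] * math.log(float(N) / cnt_word_doc[w])
for w, s in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(u'{0}: {1:.1f}'.format(w, s))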
#encoding=utf-8
from nltk import FreqDist

from common.html_util import strip_tags
from common.persistence import from_pickle

# nlp_jobs = from_pickle('nlp.pkl')
jobs = from_pickle('ux.pkl')
print(len(jobs))
for j in jobs[:3]:
    print(j['desc'])
    print('')
    # print(strip_tags(j['desc']))
    # print('')

job_desc = [strip_tags(j['desc']) for j in jobs]

## start to parse with jieba
import jieba

# segs = jieba.cut(job_desc[0])
# print(', '.join(segs))
# print('')
# segs_all = jieba.cut(job_desc[0], cut_all=True)
# print(', '.join(segs_all))

# tokens = []
# for jd in job_desc:
#     tokens += jieba.cut(jd)
# print(len(tokens))
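# A sketch completing the commented-out token loop above (an assumption,
# not from the source): segment every description and report the most
# frequent terms with the FreqDist imported at the top.
tokens = []
for jd in job_desc:
    tokens += list(jieba.cut(jd))
freq = FreqDist(tokens)
for word, count in freq.most_common(30):
    print(u'{0}: {1}'.format(word, count))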
import sqlite3 as sqlite

from common.html_util import strip_tags
from common.persistence import to_pickle, from_pickle

con = sqlite.connect('lagou.db')
with con:
    con.row_factory = sqlite.Row
    cur = con.cursor()
    # Subcategories: 后端开发 = back-end dev, 前端开发 = front-end dev,
    # 用户研究 = user research.
    cur.execute("select * from position "
                "where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()

# Keep only postings that actually have a description.
positions = []
for row in rows:
    if row['desc'] != 'n/a':
        positions.append((row['pos_id'], row['name'], row['industry'], row['desc']))

print(len(positions))
for p in positions[0]:
    print(p)
to_pickle(positions, 'positions.pkl')

#####
# Normalize into a dict keyed by position id, with HTML stripped from
# the description.
raw_positions = positions
pos_dict = {}
for rp in raw_positions:
    pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))
to_pickle(pos_dict, 'pos_norm.pkl')
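# An illustrative sanity check, not from the source: reload the
# normalized dict and inspect one entry.
pos_norm = from_pickle('pos_norm.pkl')
pos_id = next(iter(pos_norm))
name, industry, desc = pos_norm[pos_id]
print(u'{0}: {1} / {2}'.format(pos_id, name, industry))
print(desc[:200])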
# Imports reconstructed from usage; `analyzer` is assumed to be a Chinese
# text analyzer (e.g. jieba's ChineseAnalyzer) and `jobs` the dict loaded
# from the crawler pickle, as in the other scripts.
import os

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                desc=TEXT(stored=True, analyzer=analyzer),
                city=TEXT(stored=True))

idx_dir = 'lagou'
if not os.path.exists(idx_dir):
    os.mkdir(idx_dir)

ix = create_in(idx_dir, schema)  # create a new index
# ix = open_dir(idx_dir)         # open an existing index read-only

# Index every tenth job posting.
writer = ix.writer()
for j in jobs.values()[::10]:
    desc = j['desc'] if j['desc'] else u'无'    # "none"
    city = j['city'] if j['city'] else u'未知'  # "unknown"
    print(j['title'])
    print(j['job_id'])
    print(city)
    print(strip_tags(desc))
    writer.add_document(title=unicode(j['title']),
                        path=unicode('/' + str(j['job_id'])),
                        desc=unicode(strip_tags(desc)),
                        city=city)
    print('')
writer.commit()
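# A hedged sketch of querying the index built above; the directory and
# field names come from the script, the query string is illustrative.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('lagou')
with ix.searcher() as searcher:
    query = QueryParser('desc', ix.schema).parse(u'Python')
    for hit in searcher.search(query, limit=10):
        print(u'{0} {1}'.format(hit['title'], hit['path']))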