Example #1
    def save(self, *args, **kwargs):
        # Sanitize the post body.
        self.html = html_util.parse_html(self.content)

        # Tags must be added via the instance method; stripping HTML here is just a safety net.
        self.tag_val = html_util.strip_tags(self.tag_val)

        # Top-level posts other than questions also carry their post type as a tag.
        if self.is_toplevel and self.type != Post.QUESTION:
            required_tag = self.get_type_display()
            if required_tag not in self.tag_val:
                self.tag_val += "," + required_tag

        if not self.id:
            # Set the titles
            if self.parent and not self.title:
                self.title = self.parent.title

            if self.parent and self.parent.type in (Post.ANSWER, Post.COMMENT):
                # Only comments may be added to a parent that is answer or comment.
                self.type = Post.COMMENT

            if self.type is None:
                # Set post type if it was left empty.
                self.type = self.COMMENT if self.parent else self.FORUM

            # This runs only once upon object creation.
            self.title = self.parent.title if self.parent else self.title
            self.lastedit_user = self.author
            self.status = self.status or Post.PENDING
            self.creation_date = self.creation_date or general_util.now()
            self.lastedit_date = self.creation_date

            # Set the timestamps on the parent
            if self.type == Post.ANSWER and self.parent:
                self.parent.lastedit_date = self.lastedit_date
                self.parent.lastedit_user = self.lastedit_user
                self.parent.save()

        # Recompute post reply count
        self.update_reply_count()

        super(Post, self).save(*args, **kwargs)
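# html_util.strip_tags is not shown in this snippet. A minimal sketch of such a
# helper, built on the standard-library HTMLParser (an assumption, not the
# project's actual implementation):
from HTMLParser import HTMLParser  # Python 2; use html.parser on Python 3


class _TagStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.chunks = []

    def handle_data(self, data):
        # Keep only the text found between tags.
        self.chunks.append(data)


def strip_tags(html):
    parser = _TagStripper()
    parser.feed(html)
    return u''.join(parser.chunks)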
Example #2
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                desc=TEXT(stored=True, analyzer=analyzer),
                city=TEXT(stored=True))

idx_dir = 'lagou'
if not os.path.exists(idx_dir):
    os.mkdir(idx_dir)

ix = create_in(idx_dir, schema)  # create a new index
# ix = open_dir(idx_dir)  # or open the existing index instead
writer = ix.writer()

for j in jobs.values()[::10]:  # index every 10th job (dict .values() slicing works on Python 2)

    desc = j['desc'] if j['desc'] else u'无'    # u'无' = "none"
    city = j['city'] if j['city'] else u'未知'  # u'未知' = "unknown"

    print(j['title'])
    print(j['job_id'])
    print(city)
    print(strip_tags(desc))

    writer.add_document(
        title=unicode(j['title']),
        path=unicode('/' + str(j['job_id'])),
        desc=unicode(strip_tags(desc)),
        city=city
    )

    print('')

writer.commit()
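# Once the index is committed it can be searched. A sketch of a query against
# the desc field (QueryParser and searcher are standard Whoosh APIs; the search
# term here is only an illustration):
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir(idx_dir)
with ix.searcher() as searcher:
    query = QueryParser('desc', ix.schema).parse(u'Python')
    for hit in searcher.search(query, limit=10):
        print(hit['title'])
        print(hit['path'])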
Example #3
job_data_file = '../crawlers/unified.pkl'
jobs = from_pickle(job_data_file)
# print(type(jobs))

# Skill tags as they appear in the crawled data; the Chinese entries cover
# NLP, data mining, search algorithms, recommendation, UX researcher,
# interaction designer, product manager, APP/UI designer and data analyst.
skills = [
    u'Python', u'自然语言处理', u'数据挖掘', u'搜索算法', u'精准推荐', u'用户研究员', u'交互设计师',
    u'.NET', u'Java', u'C', u'PHP', u'Ruby', u'Node.js', u'iOS', u'Android',
    u'Javascript', u'MongoDB', u'产品经理', u'APP设计师', u'UI设计师', u'数据分析师'
]

jlist = jobs.values()
loaded = defaultdict(list)

for j in jlist:
    if j['skill_tag'] in skills:
        loaded[j['skill_tag']].append(strip_tags(j['desc']))

# print(len(loaded))
# print(len(loaded[u'Python']))
# for desc in loaded[u'Python'][:2]:
#     print(desc)

## Start the analysis: build a combined Chinese/English stop word list
cn_stop_words = chinese()
cn_stop_words.append(u'一门')
cn_stop_words.append(u'任一')

en_stop_words = stopwords.words('english')
for esw in en_stop_words:
    if esw not in cn_stop_words:
        cn_stop_words.append(esw)
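# A sketch of where this could go next: count the most frequent terms per skill
# with jieba, filtering against the combined stop word list built above (jieba
# and Counter imports are assumed, as in the other snippets).
import jieba
from collections import Counter

for skill, descs in loaded.items():
    counts = Counter()
    for desc in descs:
        for word in jieba.cut(desc):
            w = word.strip().lower()
            if w and w not in cn_stop_words:
                counts[w] += 1
    print(skill)
    for word, count in counts.most_common(20):
        print(u'  {0}: {1}'.format(word, count))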
Example #4
con = sqlite.connect('lagou.db')

with con:

    con.row_factory = sqlite.Row
    cur = con.cursor()
    cur.execute("select * from position")
    # cur.execute("select * from position where name like '%Python%' "
    #             "or name like '%机器学习%' or name like '%数据挖掘%' or name like '%自然语言处理%' "
    #             "or name like '%C#%' or name like '%搜索算法%' or name like '%Hadoop%' "
    #             "or name like '%交互设计师%' or name like '%数据分析师%' or name like '%Java%'")
rows = cur.fetchall()

print(len(rows))

n = 1000000
cnt_word_doc = Counter()
cnt_words = Counter()
for pos in rows[:n]:
    desc = strip_tags(pos['desc'])
    tokens = jieba.tokenize(desc, mode="search")
    words = [t[0] for t in tokens]
    word_counts = Counter(words)
    for k in word_counts:
        cnt_word_doc[k] += 1
        cnt_words[k] += word_counts[k]

to_pickle(cnt_words, 'cnt_words.pkl')
to_pickle(cnt_word_doc, 'cnt_word_doc.pkl')
# rows = from_pickle('pos_list.pkl')

for word, count in cnt_words.most_common(200):
    print(u'{0}: {1}'.format(word, count))

print('')
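# The two counters separate term frequency (cnt_words) from document frequency
# (cnt_word_doc), which is enough for a rough IDF weighting. A sketch, assuming
# every fetched row counts as one document:
import math

num_docs = len(rows)
for word, tf in cnt_words.most_common(200):
    idf = math.log(float(num_docs) / (1 + cnt_word_doc[word]))
    print(u'{0}\ttf={1}\tidf={2:.2f}'.format(word, tf, idf))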
Example #5
con = sqlite.connect('lagou.db')

with con:

    con.row_factory = sqlite.Row
    cur = con.cursor()
    # Select back-end development, front-end development and UX research positions.
    cur.execute(
        "select * from position where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()

    positions = []
    for row in rows:
        if row['desc'] != 'n/a':
            positions.append(
                (row['pos_id'], row['name'], row['industry'], row['desc']))

    print(len(positions))
    # Print the fields of the first extracted position.
    for p in positions[0]:
        print(p)

    to_pickle(positions, 'positions.pkl')

#####
raw_positions = positions
pos_dict = {}
for rp in raw_positions:
    pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))

to_pickle(pos_dict, 'pos_norm.pkl')
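# A quick sanity check on the pickled output; from_pickle is assumed to mirror
# to_pickle from common.persistence, as in the other snippets.
pos_dict = from_pickle('pos_norm.pkl')
print(len(pos_dict))
pos_id = list(pos_dict.keys())[0]
name, industry, desc = pos_dict[pos_id]
print(u'{0}: {1} ({2})'.format(pos_id, name, industry))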
Example #6
#encoding=utf-8
from nltk import FreqDist
from common.html_util import strip_tags
from common.persistence import from_pickle

# nlp_jobs = from_pickle('nlp.pkl')
jobs = from_pickle('ux.pkl')
print(len(jobs))

for j in jobs[:3]:
    print(j['desc'])
    print('')
    # print(strip_tags(j['desc']))
    # print('')

job_desc = [strip_tags(j['desc']) for j in jobs]

## Start tokenizing with jieba
import jieba
# segs = jieba.cut(job_desc[0])
# print(', '.join(segs))

# print('')
# segs_all = jieba.cut(job_desc[0], cut_all=True)
# print(', '.join(segs_all))

# tokens = []
# for jd in job_desc:
#     tokens += jieba.cut(jd)
#
# print(len(tokens))
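# The FreqDist imported above is never used in the snippet. A sketch of how it
# could rank the most frequent terms across all stripped descriptions, assuming
# NLTK 3's Counter-based FreqDist:
tokens = []
for jd in job_desc:
    tokens += [w for w in jieba.cut(jd) if w.strip()]

fdist = FreqDist(tokens)
for word, count in fdist.most_common(30):
    print(u'{0}: {1}'.format(word, count))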
Example #7
job_data_file = '../crawlers/unified.pkl'
jobs = from_pickle(job_data_file)
# print(type(jobs))


skills = [u'Python', u'自然语言处理', u'数据挖掘', u'搜索算法', u'精准推荐', u'用户研究员', u'交互设计师', u'.NET',
          u'Java', u'C', u'PHP', u'Ruby', u'Node.js', u'iOS', u'Android', u'Javascript',
          u'MongoDB', u'产品经理', u'APP设计师', u'UI设计师', u'数据分析师']

jlist = jobs.values()
loaded = defaultdict(list)

for j in jlist:
    if j['skill_tag'] in skills:
        loaded[j['skill_tag']].append(strip_tags(j['desc']))

# print(len(loaded))
# print(len(loaded[u'Python']))
# for desc in loaded[u'Python'][:2]:
#     print(desc)


## Start the analysis: build a combined Chinese/English stop word list
cn_stop_words = chinese()
cn_stop_words.append(u'一门')
cn_stop_words.append(u'任一')

en_stop_words = stopwords.words('english')
for esw in en_stop_words:
    if esw not in cn_stop_words:
        cn_stop_words.append(esw)
Example #8
from common.html_util import strip_tags
from common.persistence import to_pickle, from_pickle

con = sqlite.connect('lagou.db')

with con:

    con.row_factory = sqlite.Row
    cur = con.cursor()
    cur.execute("select * from position where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()

    positions = []
    for row in rows:
        if row['desc'] != 'n/a':
            positions.append((row['pos_id'], row['name'], row['industry'], row['desc']))

    print(len(positions))
    # Print the fields of the first extracted position.
    for p in positions[0]:
        print(p)

    to_pickle(positions, 'positions.pkl')

#####
raw_positions = positions
pos_dict = {}
for rp in raw_positions:
    pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))

to_pickle(pos_dict, 'pos_norm.pkl')
Example #9
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                desc=TEXT(stored=True, analyzer=analyzer),
                city=TEXT(stored=True))

idx_dir = 'lagou'
if not os.path.exists(idx_dir):
    os.mkdir(idx_dir)

ix = create_in(idx_dir, schema)  # create a new index
# ix = open_dir(idx_dir)  # or open the existing index instead
writer = ix.writer()

for j in jobs.values()[::10]:  # index every 10th job (dict .values() slicing works on Python 2)

    desc = j['desc'] if j['desc'] else u'无'    # u'无' = "none"
    city = j['city'] if j['city'] else u'未知'  # u'未知' = "unknown"

    print(j['title'])
    print(j['job_id'])
    print(city)
    print(strip_tags(desc))

    writer.add_document(title=unicode(j['title']),
                        path=unicode('/' + str(j['job_id'])),
                        desc=unicode(strip_tags(desc)),
                        city=city)

    print('')

writer.commit()
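# The analyzer passed to TEXT(..., analyzer=analyzer) is never defined in this
# snippet. For Chinese job descriptions a common (assumed) choice is jieba's
# Whoosh-compatible ChineseAnalyzer:
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()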