def process_item(self, item, spider):
        """Insert a scraped article into the ``article`` table, skipping known links.

        Items whose ``title``, ``posttime`` or ``content`` is ``None`` are not
        stored.  For the others the table is looked up by ``linkmd5id``; when
        no row matches, ``contentText`` is segmented with ``fenci``, the words
        are joined with spaces, and a new row is inserted and committed.

        Args:
            item: Mapping providing ``title``, ``posttime``, ``content`` and
                ``linkmd5id``; an insert additionally reads ``sourceUrl``,
                ``sourceName``, ``contentText``, ``link`` and ``crawlSite``.
            spider: Not used by this method.

        Returns:
            The ``item`` that was passed in, so that later pipeline stages
            still receive it.

        Raises:
            Exception: Any error raised while querying or inserting is
                re-raised after the transaction has been rolled back.
        """
        if item['title'] is None or item['posttime'] is None or item['content'] is None:
            return item

        cursor = self.db.cursor()
        try:
            # Values are handed to the driver as parameters instead of being
            # interpolated into the SQL text, so quotes inside scraped content
            # can neither break the statement nor inject SQL.
            # NOTE(review): the "%s" placeholder assumes a MySQLdb/pymysql-style
            # driver (paramstyle "format") -- confirm against self.db.
            cursor.execute(
                "SELECT id FROM article WHERE linkmd5id = %s",
                (item['linkmd5id'],),
            )
            if cursor.rowcount == 0:
                stopwords = fenci.getStopWords()
                content_words = ' '.join(fenci.fenci(item['contentText'], stopwords))

                insert_sql = (
                    "INSERT INTO article(title, posttime, source_url, source_name, "
                    "content, content_text, link, linkmd5id, crawl_site, content_words) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                )
                cursor.execute(insert_sql, (
                    item['title'], item['posttime'], item['sourceUrl'],
                    item['sourceName'], item['content'], item['contentText'],
                    item['link'], item['linkmd5id'], item['crawlSite'],
                    content_words,
                ))
                self.db.commit()
                print("-------insert success--------")
            else:
                print("-------duplicate link--------")
        except Exception:
            # Leave no half-finished transaction behind, then let the caller
            # see the failure instead of hiding it.
            self.db.rollback()
            raise
        finally:
            cursor.close()
        return item
def index():
    """Render ``template.html`` with data from each source model and ``fenci()``.

    Each model's ``select()`` result is sliced to at most ten entries; the
    return value of ``fenci()`` is passed to the template as ``hotword``.
    """
    context = {
        'weibo_data': BaseModel().select()[:10],
        'baidu_data': BaiduModal().select()[:10],
        'zhihu_data': ZhihuModal().select()[:10],
        'weixin_data': WeixinModal().select()[:10],
        'hotword': fenci(),
    }
    return render_template('template.html', **context)
Пример #3
0
def shaomiao(request):
    """Scan a fixed id range of event paragraphs and store the words found in each.

    For every id from 935285 up to (but not including) 1192337 the matching
    ``NewEventParagraph`` is loaded, its ``txt`` is segmented with
    ``fenci.fenci`` and one ``jiabinc`` row is created per returned word.
    Ids that cannot be processed are skipped.

    Args:
        request: The incoming request; not inspected.

    Returns:
        ``HttpResponse('成功')`` once the range has been scanned, or
        ``HttpResponse('没有更新')`` when the range is empty.
    """
    first_id = 935285
    end_id = 1192337  # exclusive upper bound of the scan

    if end_id - first_id <= 0:
        return HttpResponse('没有更新')

    # A bounded range replaces the former ``while t < count + t`` loop, whose
    # condition was always true and whose counter only advanced after a
    # failure, so a successful iteration repeated the same id forever.
    for paragraph_id in range(first_id, end_id):
        try:
            paragraph = models.NewEventParagraph.objects.get(id=paragraph_id)
            event_id = paragraph.cat_name_id
            for username in list(fenci.fenci(str(paragraph.txt))):
                models.jiabinc.objects.create(
                    username=username,
                    cat_event_id=event_id,
                    baikeURL='http://baike.baidu.com/search/word?word=' + str(username),
                    homeurl='http://www.huodongjia.com/event-' + str(event_id) + '.html',
                )
        except Exception:
            # Best-effort scan: an id with no paragraph, or one that fails
            # while being processed, is skipped so the rest still runs.
            continue

    return HttpResponse('成功')
Пример #4
0
def update_event(request):
    """Process every paragraph id in ``new_add_event`` and store the words found.

    For each id the ``NewEventParagraph`` is loaded, its ``txt`` is segmented
    with ``fenci.fenci`` and one ``jiabin_m`` row is created per returned
    word.  After each paragraph, the ``jiabin_event`` row with ``id=1`` has
    its ``all_event`` field set to the id that was just processed.

    Args:
        request: The incoming request; not inspected.

    Returns:
        ``HttpResponse('没有更新')`` when ``new_add_event`` is empty,
        ``HttpResponse('更新错误')`` as soon as any paragraph fails, otherwise
        ``HttpResponse('成功')``.
    """
    total = int(len(new_add_event))
    if not total > 0:
        return HttpResponse('没有更新')

    for position in range(total):
        try:
            paragraph_pk = new_add_event[position]
            paragraph = models.NewEventParagraph.objects.get(id=paragraph_pk)
            event_id = paragraph.cat_name_id
            for guest in list(fenci.fenci(str(paragraph.txt))):
                models.jiabin_m.objects.create(
                    username=guest,
                    cat_event_id=event_id,
                    baikeURL='http://baike.baidu.com/search/word?word=' + str(guest),
                    homeurl='http://www.huodongjia.com/event-' + str(event_id) + '.html',
                )
            # Record the id that has just been handled.
            models.jiabin_event.objects.filter(id=1).update(all_event=paragraph_pk)
        except:
            # Catch-all kept as is: any failure ends the run with the error
            # response instead of propagating.
            return HttpResponse('更新错误')

    return HttpResponse('成功')
Пример #5
0
    def searchBoxGetPoiInfo(self, mName):
        """Look up POIs for a keyword through the search box and save them as CSV lines.

        ``mName`` has the form ``'<city name>,<poi name>'``.  The POI name is
        segmented with ``fenci.fenci`` and used as the first search keyword;
        the city name is the second keyword.  Each POI returned by
        ``searchBoxGetPoiList`` is flattened into one comma-joined line, which
        is printed and collected.  When the search returned anything, the
        collected lines are appended to ``self.poisFile`` and ``mName`` is
        written to ``self.currFile`` as the progress marker.

        A POI whose processing raises is reported with ``print`` and skipped.
        """
        parts = mName.split(',')
        keyword = fenci.fenci(parts[1])  # segmented POI name: first search keyword
        city = parts[0]                  # city name: second search keyword

        collected = []  # CSV lines for every POI found for this keyword

        candidates = self.searchBoxGetPoiList(keyword, city)
        if not candidates:
            return

        # Text fields, in the order they are passed through stripStr.
        text_keys = ('name', 'uid', 'alias', 'addr', 'address_norm', 'area',
                     'area_name', 'catalogID', 'di_tag', 'primary_uid',
                     'std_tag', 'std_tag_id', 'tel')

        for poi in candidates:
            try:
                info = {}
                for key in text_keys:
                    info[key] = self.stripStr(poi.get(key, ''))
                info['uid'] = info['uid'] or ''

                # Coordinates stay empty unless 'x' is an integer.
                x = y = lon = lat = ''
                if isinstance(poi.get('x', ''), int):
                    x = poi.get('x', '') / 100
                    y = poi.get('y', '') / 100

                    gps = self.miToGPS(x, y)
                    lon = str(gps.get('lon', ''))
                    lat = str(gps.get('lat', ''))
                    x = str(x)
                    y = str(y)

                geo = self.uuidGetGeo(info['uid'])
                if geo == []:
                    geo = ''

                line = ",".join([
                    parts[1], info['name'], info['uid'], info['primary_uid'],
                    info['alias'], info['addr'], info['address_norm'],
                    info['area'], info['area_name'], info['catalogID'],
                    info['di_tag'], info['std_tag'], info['std_tag_id'],
                    info['tel'], x, y, lon, lat, geo
                ])
                print(line)
                collected.append(line + '\n')
            except Exception as e:
                print(e)

        # Append the collected POI lines to the results file.
        with open(self.poisFile, mode='a+', encoding='gbk',
                  errors=None) as out:
            out.writelines(collected)
        # Record the entry just handled as the current progress.
        with open(self.currFile, mode='w', encoding='gbk',
                  errors=None) as out:
            out.writelines(mName)
Пример #6
0
def tochart(path):
    """Render an HTML report (stacked bar, word clouds, pies) from an Excel sheet.

    The first sheet of ``path`` is loaded.  Its first column supplies the
    categories; every column strictly between the first and the last becomes
    one stacked series of the bar chart.  For the first-column entries
    indexed 0, 1 and 2, ``fenci.fenci`` is called with two fresh lists, which
    are then used as the words and weights of a word cloud and, truncated to
    ten entries, of a pie chart.  The page is written to
    ``7月事件单分析TOP10+关键词.html``.

    Args:
        path: Location of the Excel workbook to read.

    Returns:
        Always ``0``.
    """
    frame = pd.read_excel(path, sheet_name=0, encoding='ANSI')
    frame.reset_index()  # result is discarded; the call itself is retained

    report = Page(page_title='7月事件单分析TOP10')

    # Bar chart
    stacked_bar = Bar(width=1000, height=700)
    column_names = frame.columns.values.tolist()
    categories = frame[column_names[0]]
    for series_name in column_names[1:-1]:
        stacked_bar.add(series_name,
                        categories,
                        frame[series_name],
                        is_stack=True,
                        bar_category_gap='40%',
                        xaxis_interval=0,
                        xaxis_rotate=15,
                        yaxis_rotate=30)
    report.add_chart(stacked_bar, name="bar")

    # Word cloud + pie chart per category
    word_limit = 30
    for rank in range(3):
        words = []
        weights = []
        category = categories[rank]
        # presumably fills `words` and `weights` in place -- verify in fenci
        fenci.fenci(category, word_limit, words, weights)
        print(words, weights)

        # Word cloud
        cloud = WordCloud(title='↑关键词分析(TOP30):' + str(category),
                          title_text_size=14,
                          title_top='bottom',
                          width=500,
                          height=500)
        cloud.add(category,
                  words,
                  weights,
                  word_size_range=[20, 60],
                  shape='diamond')
        report.add_chart(cloud, name='wordcloud' + str(rank))

        # Pie chart
        ring = Pie(title='↑关键词分析(TOP10):' + str(category),
                   title_text_size=14,
                   title_top='bottom',
                   width=600,
                   height=500)
        ring.add(category,
                 words[0:10],
                 weights[0:10],
                 radius=[30, 60],
                 label_text_color=None,
                 is_label_show=True,
                 legend_orient="vertical",
                 legend_pos="left")
        report.add_chart(ring, name='pie' + str(rank))
        print('-' * 10)

    report.render('7月事件单分析TOP10+关键词.html')
    return 0
Пример #7
0
#coding:utf8
import sys
from fenci import fenci

# Segment a sample sentence and print the first element of every result entry.
fc = fenci()
fc.init_fenci()
ret = fc.get_text_fc('王晓明是个大坏蛋,十点二十分天马行空般的去打篮球')
for i in ret:
    # Parenthesised single-argument print gives the same output on
    # Python 2 and is valid syntax on Python 3.
    print(i[0])
    print('-----')
Пример #8
0
#coding:utf8
import sys
from fenci import fenci

# Segment a sample sentence and print the first element of every result entry.
fc = fenci()
fc.init_fenci()
ret = fc.get_text_fc('王晓明是个大坏蛋,十点二十分天马行空般的去打篮球')
for i in ret:
    # Parenthesised single-argument print gives the same output on
    # Python 2 and is valid syntax on Python 3.
    print(i[0])
    print('-----')