예제 #1
0
    def parse_sentence(self, response):
        """Parse an example-sentence service page and persist the sentences.

        Expects ``response.meta`` to carry ``word`` (the Chinese meaning
        used in the query) and ``en_word`` (the English headword).  Each
        ``se_li`` node contributes one English/Chinese fragment list.
        """
        word = response.meta['word']
        en_word = response.meta['en_word']
        get_log(settings.LOG_NAME_BINGWORD).info(
            'get the sentence ,word is %s,meaning is %s' % (en_word, word))

        en_list = []
        cn_list = []
        for sen_node in response.xpath('//*[@class="se_li"]'):
            en_list.append(sen_node.xpath(
                './/*[@class="se_li1"]//*[@class="sen_en"]//text()').extract())
            cn_list.append(sen_node.xpath(
                './/*[@class="se_li1"]//*[@class="sen_cn"]//text()').extract())

        item = BingEgSenItems()
        item['eg_sentence'] = {
            'word': word,
            'cn_list': cn_list,
            'en_list': en_list,
        }
        item['en_word'] = en_word
        database_handler.save_sentence(item)
        yield item
예제 #2
0
 def inner_fun(*args, **kwargs):
     """Call ``fun``, logging any exception instead of propagating it.

     Returns ``fun``'s result, or None when an exception was caught.
     """
     try:
         return fun(*args, **kwargs)
     except Exception:
         t, b, tb = sys.exc_info()
         # format_tb returns the traceback as text; the original used
         # print_tb, which writes to stderr and returns None, so the log
         # line always ended with the literal string 'None'.
         get_log(settings.LOG_NAME_BINGWORD).error(
             '%s:%s,%s' % (t, b, ''.join(traceback.format_tb(tb))))
예제 #3
0
def save_data(ob, try_time=1):
    """Persist *ob* in its own session, retrying up to settings.TRY_TIME times.

    Returns the database id on success (including a successful retry —
    the original dropped the retry's return value), or None when every
    attempt failed; the unsaved row is dumped to the error log then.
    """
    session = None
    try:
        # A fresh session per attempt keeps a failed transaction from
        # poisoning later saves.
        session = DBSession()
        session.add(ob)
        session.commit()
        return ob.id
    except Exception:
        t, b, tb = sys.exc_info()
        # format_tb (not print_tb, which returns None) so the traceback
        # actually lands in the log message.
        get_log(settings.LOG_NAME_BINGWORD).error(
            'save data appear error,try time is %s, %s:%s,%s' %
            (try_time, t, b, ''.join(traceback.format_tb(tb))))
        if try_time >= settings.TRY_TIME:
            # Out of retries: dump the row's fields so it can be replayed
            # by hand.  Deep-copy first so the live object is untouched.
            ob_data = copy.deepcopy(ob).__dict__
            ob_data.pop('_sa_instance_state', None)
            get_log(settings.LOG_NAME_BINGWORD).error(
                'save fail,want to save data is :%s' %
                (json.dumps(ob_data, encoding='utf-8', ensure_ascii=False)))
            return None
        return save_data(ob, try_time + 1)
    finally:
        # Close unconditionally; the original raised NameError here when
        # DBSession() itself failed before `session` was bound.
        if session is not None:
            session.close()
예제 #4
0
def insert_basic_word_transform(item):
    """Save one BasicWordTranceform row per recognized tense of the word.

    *item* must carry the parallel lists ``tense_names`` and
    ``tense_words``.  Unrecognized tense labels are logged and skipped —
    the original fell through its else branch and still called
    save_data() on a half-initialized object.
    """
    # Bing tense label -> transform ``type`` code stored in the DB.
    type_by_label = {
        'Plural Form:': 0,         # plural
        'Simple Present:': 1,      # third-person singular
        'Present Participle:': 2,  # -ing form
        'Past Tense:': 3,          # past tense
        'Comparative Degree:': 4,  # comparative
        'Superlative:': 5,         # superlative
    }
    for name, spell in zip(item['tense_names'], item['tense_words']):
        type_code = type_by_label.get(name.strip())
        if type_code is None:
            get_log(settings.LOG_NAME_BINGWORD).error(
                'the nonsupport transform type. '
                'type is %s,spell is %s' % (name, spell))
            continue
        ob = BasicWordTranceform()
        ob.prop_ext = item['en_word'].strip()
        ob.type = type_code
        ob.prop_id = 0
        ob.spell = spell
        ob.status = 0
        save_data(ob)
예제 #5
0
def save_sentence(item):
    """Store every example-sentence pair carried by *item*.

    Each cn/en entry is a list of text fragments that is joined into a
    single sentence before being saved as a BasicWordSentence row.
    Always returns True.
    """
    en_word = item['en_word']
    word = item['eg_sentence']['word']
    en_lists = item['eg_sentence']['en_list']
    for idx, cn_fragments in enumerate(item['eg_sentence']['cn_list']):
        get_log(settings.LOG_NAME_BINGWORD).info(
            'handle one sentence, word is %s,meaning is %s' % (en_word, word))
        record = BasicWordSentence()
        record.prop_id = 0
        record.prop_ext = '%s&%s' % (en_word, word)
        record.index = idx
        record.english = ''.join(en_lists[idx])
        record.chinese = ''.join(cn_fragments)
        record.status = 0
        save_data(record)
    return True
예제 #6
0
 def process_item(self, item, spider):
     """Log every scraped item at warning level, then pass it through."""
     payload = json.dumps(dict(item), ensure_ascii=False, encoding='utf-8')
     get_log(settings.LOG_NAME_BINGWORD).warning(payload)
     return item
예제 #7
0
    def parse(self, response):
        """Parse one Bing-dictionary result page for an English word.

        Extracts pronunciation, part-of-speech meanings, tense forms,
        synonyms, collocations, antonyms, EN-EN definitions and the list
        of Chinese meanings; persists everything via database_handler;
        then yields one request per meaning for its example sentences and
        one request for the next word popped from the redis queue.
        """
        item = BingItems()
        # The crawled word is embedded in the request URL as "...&q=<word>&...".
        regext = re.compile('&q=(.*?)&')
        en_word = regext.findall(response.url)[0]

        # Words already present in the database are skipped entirely.
        is_exist = database_handler.query_basiec_word_base(en_word)
        if is_exist:
            get_log(settings.LOG_NAME_BINGWORD).info(
                "the word : %s ,database have existed" % en_word)
            # NOTE(review): the same message is logged at both info and
            # error level — presumably so duplicates stand out in the
            # error log; confirm this is intentional.
            get_log(settings.LOG_NAME_BINGWORD).error(
                "the word : %s ,database have existed" % en_word)
            return
        # NOTE(review): landing on baidu.com appears to mark a dead/empty
        # query — confirm where that redirect comes from.
        if response.url == "http://www.baidu.com":
            print 'continue'
        else:
            get_log(settings.LOG_NAME_BINGWORD).info(
                "start to get the word: %s" % (en_word, ))
            # Pronunciation: hd_prUS / hd_pr carry the phonetic text, the
            # hd_tf anchors carry the audio-player mouseover JS.
            # NOTE(review): extract()[0]/[1] raise IndexError when the
            # page lacks these nodes — there is no guard here.
            pr_us = response.xpath('//*[@class="hd_prUS"]/text()').extract()[0]
            gr = response.xpath('//*[@class="hd_pr"]/text()').extract()[0]
            audio_us = response.xpath(
                '//*[@class="hd_tf"]/a/@onmouseover').extract()[0]
            audio = response.xpath(
                '//*[@class="hd_tf"]/a/@onmouseover').extract()[1]
            item['en_word'] = en_word
            item['audio_us'] = audio_us
            item['audio_us_href'] = pr_us
            item['audio'] = audio
            item['audio_href'] = gr

            # Part-of-speech tags ("pos") and their meanings ("def");
            # entries missing either piece are skipped.
            detail_list = response.xpath('//*[@class="qdef"]/ul/li')
            natures = []
            natures_meaning = []
            desc = {}
            for d in detail_list:
                nature = d.xpath('.//span[@class="pos"]/text()').extract()
                meaning = d.xpath(
                    './/span[@class="def"]/span/text()').extract()
                if not nature or not meaning:
                    continue
                natures.append(nature[0])
                natures_meaning.append(meaning[0])
                desc[nature[0]] = meaning[0]

            item['natures'] = natures
            item['natures_meaning'] = natures_meaning
            item['desc'] = desc

            # Tense/inflection labels ("Past Tense:", ...) with the
            # corresponding word forms and their links.
            tense_name_list = response.xpath(
                './/div[@class="qdef"]/div[@class="hd_div1"]//div[@class="hd_if"]/span/text()'
            ).extract()
            tense_word_list = response.xpath(
                './/div[@class="qdef"]/div[@class="hd_div1"]//div[@class="hd_if"]/a/text()'
            ).extract()
            # NOTE(review): tense_href_list is extracted but never used.
            tense_href_list = response.xpath(
                './/div[@class="qdef"]/div[@class="hd_div1"]//div[@class="hd_if"]/a/@href'
            ).extract()
            item['tense_names'] = tense_name_list
            item['tense_words'] = tense_word_list

            #detail_list = response.xpath('//div[@class="qdef"]/div[@class="wd_div"]/div[@id="thesaurusesid"]/div[@id="synoid"]/div[@class="df_div2"]/div[@class="de_title1"]/text()')
            #response.xpath('//div[@id="synoid"]//div[@class="col_fl"]/a/span/text()')

            # Synonyms ("tongyi"): one part-of-speech heading per
            # df_div2 group, plus the word list of each group.
            synoid_natures = response.xpath(
                '//div[@id="synoid"]//div[@class="de_title1"]/text()').extract(
                )
            synoid_list = response.xpath(
                '//div[@id="synoid"]//div[@class="df_div2"]')
            synoid_words_list = []
            for d in synoid_list:
                synoid_words = d.xpath(
                    './/div[@class="col_fl"]/a/span/text()').extract()
                synoid_words_list.append(synoid_words)

            item['synonymous_nature'] = synoid_natures
            item['synonymous_words'] = synoid_words_list

            # Collocations ("dapei"), same group structure as synonyms.
            colid_natures = response.xpath(
                '//div[@id="colid"]//div[@class="de_title2"]/text()').extract(
                )
            colid_list = response.xpath(
                '//div[@id="colid"]//div[@class="df_div2"]')
            colid_words_list = []
            for d in colid_list:
                colid_words = d.xpath(
                    './/div[@class="col_fl"]/a/span/text()').extract()
                colid_words_list.append(colid_words)

            item['phrase_nature'] = colid_natures
            item['phrase_words'] = colid_words_list

            # Antonyms — the original comment said "FANYI" (translation),
            # but these selectors target the antonym block ("antoid") and
            # feed the antonym_* fields.
            antoid_natures = response.xpath(
                '//div[@id="antoid"]//div[@class="de_title1"]/text()').extract(
                )
            antoid_list = response.xpath(
                '//div[@id="antoid"]//div[@class="df_div2"]')
            antoid_words_list = []
            for d in antoid_list:
                antoid_words = d.xpath(
                    './/div[@class="col_fl"]/a/span/text()').extract()
                antoid_words_list.append(antoid_words)
            item['antonym_nature'] = antoid_natures
            item['antonym_words'] = antoid_words_list

            # EN-EN dictionary section ("homoid").
            #response.xpath('.//div[@id="homoid"]/table//tr[@class="def_row df_div1"][1]/td/div[@class="def_fl"]/div[@class="de_li1 de_li3"]/div/text()').extract()
            homoid_list = response.xpath(
                './/div[@id="homoid"]/table//tr[@class="def_row df_div1"]/td/div[@class="pos pos1"]/text()'
            ).extract()
            detail_en_list = response.xpath(
                './/div[@id="homoid"]/table//tr[@class="def_row df_div1"]')
            # NOTE(review): detail_en_list is rebound inside its own loop,
            # so after the loop it only holds the LAST row's definition
            # texts — earlier rows are discarded. Looks unintended;
            # confirm before changing, since the stored field shape would
            # change.
            for detail_en in detail_en_list:
                detail_en_list = detail_en.xpath(
                    './/td/div[@class="def_fl"]/div[@class="de_li1 de_li3"]/div/text()'
                ).extract()
            item['en2en_explain_nature'] = homoid_list
            item['en2en_explain_sentence'] = detail_en_list

            #response.xpath('//*[@id="sentenceSeg"]/div[1]/div[2]/div[1]/a[3]')

            # Meaning tabs ("senDefLink"): the Chinese meanings; each one
            # gets its own example-sentence request below.
            explain_list = response.xpath(
                '//*[@class="senDefLink"]/a/text()').extract()
            item['cn_meaning'] = explain_list
            print "*" * 100
            bing_item_output(item)
            # Persist the word plus all its satellite tables.
            base_id = database_handler.insert_basic_word_base(item)
            material_ids = database_handler.insert_basic_material(item)
            database_handler.insert_basic_word_properties(
                base_id, material_ids, item)
            database_handler.save_basic_word_association(base_id, item)
            database_handler.insert_basic_word_transform(item)

            print "*" * 100
            yield item
            # One sentence-service request per meaning; 'All' is the
            # aggregate tab and is skipped.
            for t in explain_list:
                if t == 'All':
                    continue
                surl = 'http://www.bing.com/dict/service?q=' + en_word + '%20' + t + '&dtype=sen'
                #print surl
                yield scrapy.Request(url=surl,
                                     meta={
                                          'en_word': en_word,
                                          'word': t
                                     },
                                     callback=self.parse_sentence)

        # Keep the crawl going: block up to 10s for the next word from
        # the 'words' redis list and schedule it; an empty pop ends the
        # spider.
        word_tuple = r.brpop('words', 10)
        if word_tuple:
            en_word = word_tuple[1]
            yield scrapy.Request(url=prefix + en_word,
                                 meta={'en_word': en_word},
                                 callback=self.parse)
        else:
            print 'spider over'
            return