def parse_sentence(self, response):
    """Collect the example sentences for one (word, meaning) pair.

    Reads ``word`` / ``en_word`` from the request meta, gathers the
    English and Chinese text fragments of every ``se_li`` sentence node,
    persists them through ``database_handler.save_sentence`` and yields
    the filled ``BingEgSenItems``.
    """
    word = response.meta['word']
    en_word = response.meta['en_word']
    get_log(settings.LOG_NAME_BINGWORD).info(
        'get the sentence ,word is %s,meaning is %s' % (en_word, word))

    sentence_nodes = response.xpath('//*[@class="se_li"]')
    en_list = [
        node.xpath(
            './/*[@class="se_li1"]//*[@class="sen_en"]//text()').extract()
        for node in sentence_nodes
    ]
    cn_list = [
        node.xpath(
            './/*[@class="se_li1"]//*[@class="sen_cn"]//text()').extract()
        for node in sentence_nodes
    ]

    item = BingEgSenItems()
    item['eg_sentence'] = {
        'word': word,
        'cn_list': cn_list,
        'en_list': en_list,
    }
    item['en_word'] = en_word
    database_handler.save_sentence(item)
    yield item
def inner_fun(*args, **kwargs):
    """Invoke the wrapped ``fun`` and log (instead of propagate) errors.

    Returns whatever ``fun`` returns, or None when it raised.
    """
    try:
        return fun(*args, **kwargs)
    except Exception:
        exc_type, exc_value, exc_tb = sys.exc_info()
        # Fix: traceback.print_tb() writes to stderr and returns None, so
        # the previous log line always recorded "None" for the traceback;
        # format_tb() yields the actual traceback text.
        get_log(settings.LOG_NAME_BINGWORD).error(
            '%s:%s,%s' % (exc_type, exc_value,
                          ''.join(traceback.format_tb(exc_tb))))
def save_data(ob, try_time=1):
    """Persist *ob* in a fresh session, retrying on failure.

    Commits the object and returns its primary key.  On error the
    failure is logged and the save is retried until
    ``settings.TRY_TIME`` attempts have been made; after that the data
    that could not be stored is dumped to the log and None is returned.
    (The collapsed original recursed even after logging the final
    failure, so a persistent DB error caused unbounded recursion.)
    """
    session = DBSession()
    try:
        session.add(ob)
        session.commit()
        return ob.id
    except Exception:
        t, b, tb = sys.exc_info()
        # format_tb (not print_tb, which returns None) so the traceback
        # text actually lands in the log message.
        get_log(settings.LOG_NAME_BINGWORD).error(
            'save data appear error,try time is %s, %s:%s,%s'
            % (try_time, t, b, ''.join(traceback.format_tb(tb))))
        if try_time >= settings.TRY_TIME:
            # Give up: log the row we failed to store and stop retrying.
            ob_data = dict(ob.__dict__)
            ob_data.pop('_sa_instance_state', None)
            get_log(settings.LOG_NAME_BINGWORD).error(
                'save fail,want to save data is :%s'
                % (json.dumps(ob_data, ensure_ascii=False)))
            return None
        return save_data(ob, try_time + 1)
    finally:
        # Always release the session, on both the success and error paths.
        session.close()
def insert_basic_word_transform(item):
    """Store every word transform (plural, tense, ...) carried by *item*.

    ``item['tense_names'][i]`` is the Bing label of the transform and
    ``item['tense_words'][i]`` the transformed spelling.  Unrecognised
    labels are logged and skipped — the original code fell through and
    still saved a half-initialised row (prop_ext only) for them.
    """
    # Bing label -> numeric ``type`` code of BasicWordTranceform.
    type_codes = {
        'Plural Form:': 0,         # plural
        'Simple Present:': 1,      # third person singular
        'Present Participle:': 2,  # -ing form
        'Past Tense:': 3,          # past tense
        'Comparative Degree:': 4,  # comparative
        'Superlative:': 5,         # superlative
    }
    for name, spell in zip(item['tense_names'], item['tense_words']):
        code = type_codes.get(name.strip())
        if code is None:
            print(name)
            print(spell)
            get_log(settings.LOG_NAME_BINGWORD).error(
                'the nonsupport transform type. '
                'type is %s,spell is %s' % (name, spell))
            continue
        ob = BasicWordTranceform()
        ob.prop_ext = item['en_word'].strip()
        ob.type = code
        ob.prop_id = 0
        ob.spell = spell
        ob.status = 0
        save_data(ob)
def save_sentence(item):
    """Persist every example sentence carried by *item*.

    Each (Chinese, English) fragment-list pair becomes one
    BasicWordSentence row keyed by "<en_word>&<meaning>" in ``prop_ext``
    and ordered by its position in the list.  Always returns True.
    """
    en_word = item['en_word']
    word = item['eg_sentence']['word']
    pairs = zip(item['eg_sentence']['cn_list'],
                item['eg_sentence']['en_list'])
    for position, (cn_parts, en_parts) in enumerate(pairs):
        get_log(settings.LOG_NAME_BINGWORD).info(
            'handle one sentence, word is %s,meaning is %s'
            % (en_word, word))
        ob = BasicWordSentence()
        ob.prop_id = 0
        ob.prop_ext = '%s&%s' % (en_word, word)
        ob.index = position
        ob.english = ''.join(en_parts)
        ob.chinese = ''.join(cn_parts)
        ob.status = 0
        save_data(ob)
    return True
def process_item(self, item, spider):
    """Pipeline hook: log the scraped item as JSON and pass it through."""
    payload = json.dumps(dict(item), ensure_ascii=False, encoding='utf-8')
    get_log(settings.LOG_NAME_BINGWORD).warning(payload)
    return item
def _extract_word_group(self, response, div_id, title_class):
    """Return (natures, words-per-nature) for one related-words panel.

    ``div_id`` is the panel container id (synoid / colid / antoid) and
    ``title_class`` the css class of its heading nodes.  The three
    panels share identical markup, so this replaces three copies of the
    same extraction code in ``parse``.
    """
    natures = response.xpath(
        '//div[@id="%s"]//div[@class="%s"]/text()'
        % (div_id, title_class)).extract()
    words = [
        block.xpath('.//div[@class="col_fl"]/a/span/text()').extract()
        for block in response.xpath(
            '//div[@id="%s"]//div[@class="df_div2"]' % div_id)
    ]
    return natures, words

def parse(self, response):
    """Parse one Bing dictionary page for the word embedded in the URL.

    Fills a BingItems with pronunciation, meanings, tense forms,
    synonyms / collocations / antonyms, EN-EN explanations and the list
    of Chinese meanings, persists everything through database_handler,
    yields the item plus one example-sentence request per meaning, and
    finally schedules the next word popped from redis.
    """
    item = BingItems()
    en_word = re.compile('&q=(.*?)&').findall(response.url)[0]

    if database_handler.query_basiec_word_base(en_word):
        # Already crawled — logged at two levels so it shows up in both
        # the info and error logs (kept from the original).
        get_log(settings.LOG_NAME_BINGWORD).info(
            "the word : %s ,database have existed" % en_word)
        get_log(settings.LOG_NAME_BINGWORD).error(
            "the word : %s ,database have existed" % en_word)
        return

    if response.url == "http://www.baidu.com":
        print('continue')
    else:
        get_log(settings.LOG_NAME_BINGWORD).info(
            "start to get the word: %s" % (en_word, ))

        # Pronunciation text plus the onmouseover handlers that carry the
        # audio urls ([0] = US voice, [1] = the other voice).
        item['en_word'] = en_word
        item['audio_us'] = response.xpath(
            '//*[@class="hd_tf"]/a/@onmouseover').extract()[0]
        item['audio_us_href'] = response.xpath(
            '//*[@class="hd_prUS"]/text()').extract()[0]
        item['audio'] = response.xpath(
            '//*[@class="hd_tf"]/a/@onmouseover').extract()[1]
        item['audio_href'] = response.xpath(
            '//*[@class="hd_pr"]/text()').extract()[0]

        # Part-of-speech / meaning pairs.
        natures = []
        natures_meaning = []
        desc = {}
        for node in response.xpath('//*[@class="qdef"]/ul/li'):
            nature = node.xpath('.//span[@class="pos"]/text()').extract()
            meaning = node.xpath(
                './/span[@class="def"]/span/text()').extract()
            if not nature or not meaning:
                continue
            natures.append(nature[0])
            natures_meaning.append(meaning[0])
            desc[nature[0]] = meaning[0]
        item['natures'] = natures
        item['natures_meaning'] = natures_meaning
        item['desc'] = desc

        # Tense / word-form table ("Plural Form:" etc.).  The original
        # also extracted the hrefs into an unused local; dropped.
        item['tense_names'] = response.xpath(
            './/div[@class="qdef"]/div[@class="hd_div1"]//div[@class="hd_if"]/span/text()'
        ).extract()
        item['tense_words'] = response.xpath(
            './/div[@class="qdef"]/div[@class="hd_div1"]//div[@class="hd_if"]/a/text()'
        ).extract()

        # Synonyms, collocations and antonyms share the same markup; only
        # the container id and the title class differ.
        item['synonymous_nature'], item['synonymous_words'] = \
            self._extract_word_group(response, 'synoid', 'de_title1')
        item['phrase_nature'], item['phrase_words'] = \
            self._extract_word_group(response, 'colid', 'de_title2')
        item['antonym_nature'], item['antonym_words'] = \
            self._extract_word_group(response, 'antoid', 'de_title1')

        # EN-EN explanations.
        item['en2en_explain_nature'] = response.xpath(
            './/div[@id="homoid"]/table//tr[@class="def_row df_div1"]/td/div[@class="pos pos1"]/text()'
        ).extract()
        en2en_sentences = []
        for row in response.xpath(
                './/div[@id="homoid"]/table//tr[@class="def_row df_div1"]'):
            # Fix: the original reassigned its own loop source variable
            # here, so only a single row's text survived; collect the
            # text fragments of every row instead.
            en2en_sentences.extend(row.xpath(
                './/td/div[@class="def_fl"]/div[@class="de_li1 de_li3"]/div/text()'
            ).extract())
        item['en2en_explain_sentence'] = en2en_sentences

        explain_list = response.xpath(
            '//*[@class="senDefLink"]/a/text()').extract()
        item['cn_meaning'] = explain_list

        print("*" * 100)
        bing_item_output(item)
        base_id = database_handler.insert_basic_word_base(item)
        material_ids = database_handler.insert_basic_material(item)
        database_handler.insert_basic_word_properties(
            base_id, material_ids, item)
        database_handler.save_basic_word_association(base_id, item)
        database_handler.insert_basic_word_transform(item)
        print("*" * 100)
        yield item

        # One sentence-page request per concrete Chinese meaning.
        for meaning in explain_list:
            if meaning == 'All':
                continue
            surl = ('http://www.bing.com/dict/service?q=' + en_word
                    + '%20' + meaning + '&dtype=sen')
            yield scrapy.Request(url=surl,
                                 meta={
                                     'en_word': en_word,
                                     'word': meaning
                                 },
                                 callback=self.parse_sentence)

        # Pull the next word from redis and keep the crawl going.
        # NOTE(review): in the collapsed original this continuation
        # appears to sit inside the else-branch, so a baidu.com response
        # ends the chain — confirm that is intended.
        word_tuple = r.brpop('words', 10)
        if word_tuple:
            en_word = word_tuple[1]
            yield scrapy.Request(url=prefix + en_word,
                                 meta={'en_word': en_word},
                                 callback=self.parse)
        else:
            print('spider over')
            return