import csv


def make_index(index, url, content_soup):
    # Strip <script> and <style> tags so only rendered text gets indexed.
    try:
        for script in content_soup.find_all('script'):
            script.decompose()
        for style in content_soup.find_all('style'):
            style.decompose()
        content = content_soup.body.get_text()
    except AttributeError:  # page has no <body> to index
        return
    words = content.split()
    stopwords = ['']
    unwanted_punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~>>'
    try:
        with open('google.csv', 'rb') as sw:  # stopword list from Google
            read = csv.reader(sw)
            for stopword in read:
                stopwords.append(''.join(stopword))
    except IOError:
        pass
    for word in words:
        word = word.lstrip(unwanted_punctuations)
        word = word.rstrip(unwanted_punctuations)
        word = word.lower()
        if word not in stopwords:
            add_to_search_index(index, word, url)

# make_index({}, 'http://www.google.com', soup(''))
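# add_to_search_index() is defined elsewhere in the project; the sketch below
# only illustrates what the call above implies -- an inverted index mapping
# each word to the list of urls it appears on. The signature comes from the
# call site; the body is an assumption, not the project's implementation.
def add_to_search_index(index, word, url):
    if word in index:
        if url not in index[word]:  # avoid duplicate postings for one page
            index[word].append(url)
    else:
        index[word] = [url]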
import re
import sys
import traceback
from datetime import datetime
from hashlib import sha224

from bs4 import BeautifulSoup as soup


def make_index(url, content_soup):
    '''
    (str, BeautifulSoup) -> ()

    Preprocess a page's content down to keywords only. Each keyword is
    inserted into the search index together with its location and frequency
    in the document, which later lets us compute a relevance score for the page.
    '''
    if content_soup == soup('', 'lxml'):
        print 'Cannot be indexed #CoNTeNT EmptY....'
        return ''
    # fingerprint the page body so duplicate documents can be detected
    docdigest = sha224(content_soup.body.encode('utf-8')).hexdigest()
    texts = soup(content_soup.get_text(), 'lxml').findAll(text=True)
    # collect keywords from the page's <meta> tags
    meta_info = []
    for tag in content_soup.findAll('meta'):
        try:
            if tag['content']:
                for term in tag['content'].split():
                    meta_info.append(term)
        except KeyError:  # <meta> tag without a content attribute
            pass
    try:
        if content_soup.title is None:
            title = url
        else:
            title = content_soup.title.string
        # keep only the text that actually renders on the page
        content = ''.join([visible(elem) for elem in texts])
        if meta_info:
            page_body = ' '.join(meta_info).lower()
        else:  # no meta keywords: fall back to the first 50 words of the page
            page_body = ' '.join(content.split()[:50]).lower()
    except:
        date = datetime.today()
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()),
                                     'date': date,
                                     'from_module': str(__file__)})
        print 'problem with obtaining index from the make_index module', '.......', traceback.format_exc()
        return
    splitter = re.compile('\\W+')
    words = [s.lower() for s in splitter.split(content) if s != '']
    stopwords = stopword([])  # stopword list loaded from Google's stopwords.csv
    unwanted_punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~>>"

    def isStringLike(word):
        '''
        (object) -> bool

        Check whether a word is string-like, duck-typing style: if it walks
        like a duck and quacks like a duck, it is duck-like enough for our
        purpose.
        '''
        try:
            word + ''
        except TypeError:
            return False
        else:
            return True

    for i in xrange(len(words)):
        word = words[i]
        if word not in stopwords and not word.isdigit() \
                and word not in unwanted_punctuations and word.isalpha():
            if isStringLike(word):
                location, count = i, words.count(word)
                word = word.lstrip(unwanted_punctuations)
                word = word.rstrip(unwanted_punctuations)
                # stem every word, then index it with its url, title,
                # page summary, location, frequency and document digest
                add_to_search_index(stem(word), url, title, page_body,
                                    location, count, docdigest)
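# visible() and stopword() are imported from elsewhere in the project (as are
# stem(), db and this revision's add_to_search_index()). The sketches below
# are hedged reconstructions of the two helpers, inferred from their call
# sites and from the commented-out stopwords.csv code in an earlier revision;
# they are assumptions, not the project's actual implementations.
from bs4 import Comment


def visible(elem):
    # drop text nodes that never render: anything inside <style>, <script>,
    # <head>, <title> or <meta>, plus HTML comments
    if elem.parent.name in ('style', 'script', 'head', 'title', 'meta'):
        return ''
    if isinstance(elem, Comment):
        return ''
    return elem


def stopword(words):
    # extend the given list with Google's stopword list from stopwords.csv
    try:
        with open('stopwords.csv', 'rb') as sw:
            for row in csv.reader(sw):
                words.append(''.join(row))
    except IOError:
        print 'problem from stopwords.csv'
    return words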