import csv


def make_index(index, url, content_soup):
    # Strip <script> and <style> tags so only rendered text gets indexed.
    try:
        for script in content_soup.find_all('script'):
            script.decompose()
        for style in content_soup.find_all('style'):
            style.decompose()
        content = content_soup.body.get_text()
    except AttributeError:  # page has no <body> to index
        return
    words = content.split()
    stopwords = ['']
    unwanted_punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~>>'
    try:
        with open('google.csv', 'rb') as sw:  # stopword list from Google
            read = csv.reader(sw)
            for stopword in read:
                stopwords.append(''.join(stopword))
    except IOError:
        pass
    for word in words:
        word = word.lstrip(unwanted_punctuations)
        word = word.rstrip(unwanted_punctuations)
        word = word.lower()
        if word not in stopwords:
            add_to_search_index(index, word, url)

# make_index({}, 'http://www.google.com', soup(''))
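# add_to_search_index() is defined elsewhere in the project; the sketch below
# only illustrates what the call above implies -- an inverted index mapping
# each word to the list of urls it appears on. The signature comes from the
# call site; the body is an assumption, not the project's implementation.
def add_to_search_index(index, word, url):
    if word in index:
        if url not in index[word]:  # avoid duplicate postings for one page
            index[word].append(url)
    else:
        index[word] = [url]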
import re
import sys
import traceback
from datetime import datetime
from hashlib import sha224

from bs4 import BeautifulSoup as soup


def make_index(url, content_soup):
    '''
    (str, BeautifulSoup) -> ()

    Preprocess a page's content down to keywords only. Each keyword is
    inserted into the search index together with its location and frequency
    in the document, which later lets us compute a relevance score for the page.
    '''
    if content_soup == soup('', 'lxml'):
        print 'Cannot be indexed #CoNTeNT EmptY....'
        return ''
    # fingerprint the page body so duplicate documents can be detected
    docdigest = sha224(content_soup.body.encode('utf-8')).hexdigest()
    texts = soup(content_soup.get_text(), 'lxml').findAll(text=True)
    # collect keywords from the page's <meta> tags
    meta_info = []
    for tag in content_soup.findAll('meta'):
        try:
            if tag['content']:
                for term in tag['content'].split():
                    meta_info.append(term)
        except KeyError:  # <meta> tag without a content attribute
            pass
    try:
        if content_soup.title is None:
            title = url
        else:
            title = content_soup.title.string
        # keep only the text that actually renders on the page
        content = ''.join([visible(elem) for elem in texts])
        if meta_info:
            page_body = ' '.join(meta_info).lower()
        else:  # no meta keywords: fall back to the first 50 words of the page
            page_body = ' '.join(content.split()[:50]).lower()
    except:
        date = datetime.today()
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()),
                                     'date': date,
                                     'from_module': str(__file__)})
        print 'problem with obtaining index from the make_index module', '.......', traceback.format_exc()
        return
    splitter = re.compile('\\W+')
    words = [s.lower() for s in splitter.split(content) if s != '']
    stopwords = stopword([])  # stopword list loaded from Google's stopwords.csv
    unwanted_punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~>>"

    def isStringLike(word):
        '''
        (object) -> bool

        Check whether a word is string-like, duck-typing style: if it walks
        like a duck and quacks like a duck, it is duck-like enough for our
        purpose.
        '''
        try:
            word + ''
        except TypeError:
            return False
        else:
            return True

    for i in xrange(len(words)):
        word = words[i]
        if word not in stopwords and not word.isdigit() \
                and word not in unwanted_punctuations and word.isalpha():
            if isStringLike(word):
                location, count = i, words.count(word)
                word = word.lstrip(unwanted_punctuations)
                word = word.rstrip(unwanted_punctuations)
                # stem every word, then index it with its url, title,
                # page summary, location, frequency and document digest
                add_to_search_index(stem(word), url, title, page_body,
                                    location, count, docdigest)
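# visible() and stopword() are imported from elsewhere in the project (as are
# stem(), db and this revision's add_to_search_index()). The sketches below
# are hedged reconstructions of the two helpers, inferred from their call
# sites and from the commented-out stopwords.csv code in an earlier revision;
# they are assumptions, not the project's actual implementations.
from bs4 import Comment


def visible(elem):
    # drop text nodes that never render: anything inside <style>, <script>,
    # <head>, <title> or <meta>, plus HTML comments
    if elem.parent.name in ('style', 'script', 'head', 'title', 'meta'):
        return ''
    if isinstance(elem, Comment):
        return ''
    return elem


def stopword(words):
    # extend the given list with Google's stopword list from stopwords.csv
    try:
        with open('stopwords.csv', 'rb') as sw:
            for row in csv.reader(sw):
                words.append(''.join(row))
    except IOError:
        print 'problem from stopwords.csv'
    return words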