def spider_page(self, page_url):
    # Crawl a single gallery page
    driver_page = get_headless_chrome()
    driver_page.get(page_url)
    html_page = driver_page.page_source
    html_parse = BeautifulSoup(html_page, 'lxml')
    page_title = html_parse.find(class_='article').h2.string
    page_path = self.folder_path + '/' + page_title
    self.mkdir(page_path)
    os.chdir(page_path)
    file_names = self.get_files(page_path)  # all file names already in the folder, as a list
    page_total = int(html_parse.find(id='opic').previous_sibling.string)
    print(page_total)
    for i in range(page_total):
        driver_pic_index = get_headless_chrome()
        driver_pic_index.get(page_url + '/' + str(i + 1))
        pic_index = driver_pic_index.page_source
        pic_src = BeautifulSoup(pic_index, 'lxml').find(id='content').img.attrs['src']
        print(pic_src)
        pic_name = pic_src[pic_src.rfind('/') + 1:]  # derive the image file name from pic_src
        if pic_name in file_names:  # compare the file name (not the full URL) against the folder contents
            print('Image already exists, skipping download')
        else:
            self.save_img(pic_src, pic_name)
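# A minimal, standalone sketch of the file-name extraction used above. The
# rfind('/') slice keeps everything after the last slash, which breaks if the
# image src ever carries a query string; a urlsplit-based version (standard
# library only) is a more defensive alternative. The sample URL below is
# illustrative, not from the crawled site.
import os
from urllib.parse import urlsplit

def pic_name_from_src(pic_src):
    # Drop any query/fragment, then keep only the last path component.
    return os.path.basename(urlsplit(pic_src).path)

assert pic_name_from_src('https://example.com/img/001.jpg?w=640') == '001.jpg'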
def validate(self, s):
    s_id = '_____dp_s_xss_____'
    s = '<div id="%s">%s</div>' % (s_id, s)
    s = BeautifulSoup(s, 'lxml')
    s = str(s.find(id=s_id))
    return s[s.find('>') + 1:s.rfind('<')]
def main():
    raw = download_url('https://www.eduro.com/')
    pretty = BeautifulSoup(raw, 'html.parser').get_text()
    start = pretty.find("Today's Quote of the Day")
    end = pretty.rfind("Also: Today's ")
    print(pretty[start + 40:end].strip())
def _crawl_page(page_id, domain, page):
    try_ = 3
    while get_current_user():
        try:
            url = 'http://weibo.com/p/%s?page=%u' % (page_id, page)
            url2 = 'http://weibo.com/p/aj/v6/mblog/mbloglist?domain=%s&page=%u&pre_page=%u&pagebar=0&pl_name=Pl_Third_App__9&id=%s&feed_type=1' % (domain, page, page, page_id)
            url3 = 'http://weibo.com/p/aj/v6/mblog/mbloglist?domain=%s&page=%u&pre_page=%u&pagebar=1&pl_name=Pl_Third_App__9&id=%s&feed_type=1' % (domain, page, page, page_id)
            html = urlopen(url).read().decode(_CHARSET, errors='ignore')
            if page == 1:
                for temp in Soup(html).find_all('script', text=re.compile(r'{"ns":"pl\.content\.homeFeed\.index"')):
                    temp = temp.get_text()
                    if temp.find('主持人推荐') == -1 and temp.find('热门讨论') != -1:
                        html = temp
                        break
            else:
                html = Soup(html).find('script', text=re.compile(r'{"ns":"pl\.content\.homeFeed\.index"')).get_text()
            html = eval(html[html.find('(') + 1:html.rfind(')')])['html'].replace(r'\/', '/')
            html2 = urlopen(url2).read().decode(_CHARSET, errors='ignore')
            html2 = eval(html2)['data'].replace(r'\/', '/')
            html3 = urlopen(url3).read().decode(_CHARSET, errors='ignore')
            html3 = eval(html3)['data'].replace(r'\/', '/')
            change_user()
            time.sleep(INTERVAL)
            try_ = 3
            return ignore_emoji(html + html2 + html3)
        except:
            current_user = get_current_user()
            tag = test_cookie(current_user)
            if tag:
                if not try_:
                    raise
                try_ -= 1
                change_user()
            if not tag:
                delete_cookie(current_user)
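# The crawler above parses the mbloglist responses with eval(). Assuming those
# responses are plain JSON objects with a 'data' field (as the code implies),
# json.loads does the same job without executing the response as Python, and it
# also unescapes the \/ sequences that the eval() path cleans up by hand:
import json

def extract_data_field(raw_response):
    # raw_response is the decoded body of one mbloglist AJAX call.
    return json.loads(raw_response)['data']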
def get_newest_rss(self, url):
    ## Retrieve an RSS feed and get the newest item.
    ## Then, nicely format the title and description, and add a shortened URL.
    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    updated = dom.getElementsByTagName('pubDate')[0].childNodes[0].data
    updated = datetime.datetime.fromtimestamp(time.mktime(parsedate(updated)))
    ago = round((datetime.datetime.utcnow() - updated).seconds / 60)
    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    title = title.strip()
    description = str(description)
    description = description.replace("\n", "")
    description = self.tools['remove_html_tags'](description)
    # description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    description = "%s - %s [ %s ]" % (title, description, link)
    return description, updated, ago
def google_news(self, e):
    query = urllib.parse.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query
    dom = xml.dom.minidom.parse(urllib.request.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    title = title.strip()
    description = str(description)
    description = description.replace("\n", "")
    description = self.tools['remove_html_tags'](description)
    # description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    description = description.strip()
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = self.tools['shorten_url'](newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    e.output = "%s - %s [ %s ]" % (title, description, link)
    return e
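# Both RSS helpers above cut the description back to its last full stop with
# rfind. A tiny standalone illustration of that truncation pattern:
def trim_to_last_sentence(text):
    cut = text.rfind(".")
    return text[:cut + 1] if cut != -1 else text

assert trim_to_last_sentence("First sentence. Second sent") == "First sentence."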
def _auth_connexion(self, url):
    logging.debug('thread_functions.py - CMEMSDataDownloadThread - _auth_connexion')
    first_res = requests.get(url, headers=self.headers)
    cas_url = re.search('(.*)/login.*', first_res.url).group(1) + '/v1/tickets'
    # credential values are redacted in the source
    payload = 'username='******'') + '&password='******'')
    auth_res = requests.post(cas_url, headers=self.headers, data=payload)
    if auth_res.status_code >= 400:
        if 'error.authentication.credentials.bad' in auth_res.text:
            logging.exception('thread_functions.py - ECMWFDataDownloadThread - _auth_connexion - Exception occurred during authentication: '
                              + 'wrong username and/or password.')
            self.download_failed.emit('Exception occurred during authentication: wrong username and/or password.')
        else:
            logging.exception('thread_functions.py - ECMWFDataDownloadThread - _auth_connexion - Exception occurred during authentication.')
            self.download_failed.emit('Exception occurred during authentication.')
        raise Exception
    tgt_url = BeautifulSoup(auth_res.text, "html.parser").form['action']
    ticket_url = cas_url + '/' + tgt_url[tgt_url.rfind('/') + 1:]
    redirect_service_url = urllib.parse.parse_qs(urllib.parse.urlparse(first_res.url).query, keep_blank_values=False)['service'][0]
    payload = 'service=' + urllib.parse.quote(redirect_service_url, safe='')
    ticket_res = requests.post(ticket_url, headers=self.headers, data=payload)
    return redirect_service_url + '&ticket=' + ticket_res.text
def strip_xss(self, s, whitelist=None):
    if whitelist is None:
        whitelist = (
            'a', 'abbr', 'aside', 'audio', 'bdi', 'bdo', 'blockquote', 'canvas',
            'caption', 'code', 'col', 'colgroup', 'data', 'dd', 'del', 'details',
            'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'h1', 'h2', 'h3',
            'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'ol',
            'p', 'pre', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'small', 'source',
            'span', 'strong', 'sub', 'summary', 'sup', 'table', 'td', 'th',
            'time', 'tr', 'track', 'u', 'ul', 'var', 'video', 'wbr', 'b', 'br',
            'site', 'font')
    elif not whitelist:
        whitelist = None
    s_id = '_____dp_s_xss_____'
    s = '<div id="%s">%s</div>' % (s_id, s)
    s = BeautifulSoup(s, 'lxml')
    self._strip_xss(s.find(id=s_id), whitelist)
    s = str(s.find(id=s_id))
    return s[s.find('>') + 1:s.rfind('<')]
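# The recursive helper _strip_xss called by strip_xss is not part of this
# snippet. A minimal sketch of what such a helper could look like, assuming the
# intent is to unwrap any tag not on the whitelist while keeping its text (the
# body below is an assumption, not the original implementation):
from bs4.element import Tag

def _strip_xss(node, whitelist):
    for child in list(node.children):
        if isinstance(child, Tag):
            _strip_xss(child, whitelist)
            if whitelist is not None and child.name not in whitelist:
                child.unwrap()  # drop the tag itself but keep its children/text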
## Accessing online text from a web page
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk

url = urlopen('https://www.bbc.co.uk/news')
html = url.read()
print(type(html))  ## check what type the raw download is
print(html[:75])
print(len(html))
raw = BeautifulSoup(html).get_text()
tokens = nltk.word_tokenize(raw)
print(len(tokens))
print(raw.find('last'))
print(raw.rfind('disappear'))
tokens = tokens[96:399]
print(tokens)
text = nltk.Text(tokens)
text.concordance('gene')  # concordance() prints its matches directly
def split_transcripts(folder='../data', file='../clean_data.csv'):
    """
    Splits the data file into namedtuples, each with a different transcript.
    The tuples are stored in a dictionary.

    Format of namedtuple:
    Transcript(company=<string>, ticker=<string>, date=<timestamp>,
               prepared=<dataframe>, QandA=<dataframe>)
    """
    filepath = os.path.join(folder, file)
    df = pd.read_csv(filepath, verbose=True)
    end_of_transcripts = df[df.text == 'END OF TRANSCRIPT'].index
    print 'Total Number of Transcripts: {}'.format(len(end_of_transcripts))
    transcripts = {}
    last_end = -1
    n_transcript = 1
    for i, end in enumerate(end_of_transcripts):
        sys.stdout.write("Transcripts Completed: %d%% \r" % (100 * float(i) / len(end_of_transcripts)))
        sys.stdout.flush()
        transcript = df[(last_end + 1):end].reset_index(drop=True)
        last_end = end
        # Remove transcripts without required text (e.g. ones that reference an audio call only)
        if len(transcript) <= 4:
            continue
        # Extract company name from first line of transcript
        company = BeautifulSoup(transcript.iloc[0].values[0]).get_text()
        # Remove transcripts where the first line does not end with a closing parenthesis (indicates end of ticker)
        if len(company) < 1:
            continue
        if company[-1] != ')':
            continue
        # Extract the ticker from the company name
        open_paren = company.rfind('(')
        close_paren = company.find(')', open_paren)
        ticker = company[open_paren + 1:close_paren]
        # Extract the date of the call from the third line of the transcript
        date = BeautifulSoup(transcript.iloc[2].values[0]).get_text()
        # Remove transcripts with dates that cannot easily be converted into timestamps (for whatever reason)
        try:
            date = pd.to_datetime(date)
        except ValueError:
            # Uncomment the line below for examples of dates that are in an incorrect format
            # print date
            continue
        # Remove transcripts with improperly tagged Q&A sections
        try:
            begin_q_and_a = transcript[transcript.text.str.contains('id=question-answer-session')].index[0]
        except IndexError:
            continue
        # Split the remaining text into prepared remarks and the Q&A session
        q_and_a = transcript[begin_q_and_a:].text.map(lambda x: BeautifulSoup(x).get_text())
        prepared = transcript[3:begin_q_and_a].text.map(lambda x: BeautifulSoup(x).get_text())
        # Store namedtuple in a dictionary, keys range from 1 to the number of transcripts
        transcripts[n_transcript] = Transcript(company=company, ticker=ticker, date=date,
                                               return_3days=None, return_30days=None,
                                               return_60days=None, return_90days=None,
                                               prepared=prepared, QandA=q_and_a)
        n_transcript += 1
    print 'Transcripts Remaining after Filtering: {}'.format(len(transcripts.keys()))
    return transcripts
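# A quick standalone check of the ticker-slicing logic used above, on a made-up
# first line (the company name and ticker are purely illustrative):
company = "Example Corp. (EXMP)"
open_paren = company.rfind('(')
close_paren = company.find(')', open_paren)
assert company[open_paren + 1:close_paren] == "EXMP"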
def __scraping__(self, keyword=None, location=None, minSalary=None, maxSalary=None,
                 minExperience=None, maxExperience=None):
    # login
    browser = self.__authenticate__(self)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    }
    # construct filter criteria
    filter_criteria = {}
    if keyword is not None:
        filter_criteria.update({'key': keyword})
    if location is not None:
        filter_criteria.update({'location': location.value})
    if minSalary is not None:
        filter_criteria.update({'salary': minSalary})
    if maxSalary is not None:
        filter_criteria.update({'salary-max': maxSalary})
    if minExperience is not None:
        filter_criteria.update({'experience-min': minExperience})
    if maxExperience is not None:
        filter_criteria.update({'experience-max': maxExperience})
    # filter_criteria = {
    #     'key': 'Software',
    #     'area': '2',
    #     'location': '51200',
    #     'position': '3,4',
    #     'job-type': '5',
    #     'salary': '6000',
    #     'salary-max': '7000',
    #     'classified': '1',
    #     'salary-option': 'on',
    #     'job-posted': '0',
    #     'src': '1',
    #     'ojs': '4',
    # }
    page_url = self.__base_url__
    url_parts = list(urlparse.urlparse(page_url))
    final_df = pd.DataFrame()

    # test to get number of pages
    page_criteria = {'pg': str(1)}
    filter_criteria.update(page_criteria)
    url_parts[4] = urlencode(filter_criteria)
    page_url = urlparse.urlunparse(url_parts)
    response = browser.open(page_url)

    # get total lists
    total_list = BeautifulSoup(response.content, "html.parser").find(
        "span", class_="pagination-result-count").string
    pages = 1
    if total_list is not None:
        logger.info(str(total_list))
        total_list = total_list[total_list.find("of") + len("of"):total_list.rfind("jobs")]
        total_list = total_list.strip().replace(',', '')
        logger.info("Attempt to parse " + str(total_list) + " jobs at most")
        pages = math.ceil(int(total_list) / 40)  # 40 is item per page
    # To prevent over-scraping
    if General.PAGE_THRESHOLD.value != -1 and General.PAGE_THRESHOLD.value < pages:
        pages = General.PAGE_THRESHOLD.value

    for page in range(1, pages + 1):
        job_titles = []
        job_urls = []
        com_names = []
        com_urls = []
        locations = []
        salaries = []
        descriptions = []

        page_criteria = {'pg': str(page)}
        filter_criteria.update(page_criteria)
        url_parts[4] = urlencode(filter_criteria)
        page_url = urlparse.urlunparse(url_parts)
        logger.info("Processing Page " + str(page) + " : " + page_url)
        response = browser.open(page_url)
        if response.status_code != 200:
            raise ConnectionError("Cannot connect to " + page_url)

        # Get each job card
        raw_listing = BeautifulSoup(response.content, "html.parser").find_all(
            "div", {'id': lambda value: value and value.startswith("job_ad")})

        # For each job card, get job informations
        for element in raw_listing:
            # Get job general information
            job_el = element.find(
                "a", {'class': lambda value: value and value.startswith("position-title-link")})
            job_titles.append(job_el.get('data-job-title'))
            job_urls.append(job_el.get('href'))

            # Get company information
            com_el = element.find(
                "a", {'id': lambda value: value and value.startswith("company_name")})
            if com_el is None:
                com_el = element.find(
                    "span", {'id': lambda value: value and value.startswith("company_name")})
                com_names.append(com_el.string)
                com_urls.append(None)
            else:
                com_names.append(com_el.find('span').string)
                com_urls.append(com_el.get('href'))

            # Get location information
            loc_el = element.find("li", {'class': 'job-location'})
            locations.append(loc_el.get('title'))

            # Get salary information (append None so the lists stay aligned)
            sal_el = element.find("li", {'id': 'job_salary'})
            if sal_el:
                font = sal_el.find("font")
                if font:
                    salaries.append(font.string)
                else:
                    salaries.append(None)
            else:
                salaries.append(None)

            # Get job description
            des_el = element.find(
                "ul", {'id': lambda value: value and value.startswith("job_desc_detail")}).find("li", recursive=False)
            if des_el:
                descriptions.append(des_el.string)
            else:
                descriptions.append(None)

        df = pd.concat([
            pd.Series(job_titles),
            pd.Series(job_urls),
            pd.Series(com_names),
            pd.Series(com_urls),
            pd.Series(locations),
            pd.Series(salaries),
            pd.Series(descriptions),
        ], axis=1)
        df.columns = [[
            "Job Titles", "Job URLS", "Company Name", "Company URLS",
            "Location", "Salaries", "Descriptions"
        ]]
        final_df = final_df.append(df, ignore_index=True)

    final_df.columns = final_df.columns.get_level_values(0)
    logger.info("Parsing has ended...")
    return final_df
import nltk
import requests
from bs4 import BeautifulSoup

nltk.download("punkt")

################################################################################
# Extracts the raw text from The Waste Land by T. S. Eliot from an html format #
################################################################################

# Read HTML page
url = "http://www.gutenberg.org/files/1321/1321-h/1321-h.htm"
html = requests.get(url).text
raw = BeautifulSoup(html, "lxml").get_text()  # Without tags

# Cut down text
start = "il miglior fabbro"
start_pos = raw.find(start) + len(start)
end_pos = raw.rfind("Line 415 aetherial] aethereal")
raw = raw[start_pos:end_pos]

# Tokenising
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
len(text)  # Number of words

# 20 most common words
fdist = nltk.FreqDist(text)
print(fdist.most_common(20))
fdist.plot(50, cumulative=True)

# Most common word lengths
fdist = nltk.FreqDist(len(w) for w in text)
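# str.find and str.rfind return -1 when a marker is missing, which would make
# the slices above silently wrong instead of failing. A small guard for that
# case, written against the same Gutenberg-style markers:
def slice_between(raw, start_marker, end_marker):
    start = raw.find(start_marker)
    end = raw.rfind(end_marker)
    if start == -1 or end == -1:
        raise ValueError("marker not found in text")
    return raw[start + len(start_marker):end]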
def sem1(i):
    flag = 0
    cflag = 0
    global count
    global count2
    global countp
    global countf
    global coname
    # browser part starts
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)
    browser.open("http://14.139.185.88/cbcsshrCamp/index.php?module=public&page=result")
    browser.select_form(nr=0)
    control = browser.find_control('exam_id')
    print control
    control.value = ['201']  # select semester value
    browser.form["prn"] = str(i)
    browser.submit()
    # browser part ends
    html = browser.response().readlines()
    for j in range(0, len(html)):
        if 'Not Registered' in html[j]:
            flag = 1
            count = count + 1
        elif 'Invalid PRN !!' in html[j]:
            flag = 1
            count2 = count2 + 1
    # counting for failed candidates
    for j in range(0, len(html)):
        if 'Failed' in html[j]:
            cflag = 1
            break
    if cflag == 1:
        countf = countf + 1
    if flag == 1:
        print "Not registered %d" % count
    else:
        html = str(browser.response().readlines())
        raw = BeautifulSoup(html).get_text()
        raw = raw.replace("\\r\\n", " ")
        raw = raw.replace("\\t", " ")
        raw = raw.replace("\'", " ")
        raw = raw.replace(",", " ")
        # slice out the exam centre, permanent address and mark-list sections
        p = raw.find("Exam Centre")
        q = raw.rfind("P.O")
        a = raw.find("Permanent")
        b = raw.rfind("P.O")
        raw1 = raw[a:b]
        x = raw.find("Course Code")
        y = raw.rfind("intended")
        raw2 = raw[x:y]
        raw1 = raw1.encode('ascii', 'ignore')
        raw2 = raw2.encode('ascii', 'ignore')
        raw3 = raw[p:q]
        raw3 = raw3.encode('ascii', 'ignore')
        raw3 = raw3.replace("m ", " ")
        tokens3 = word_tokenize(raw3)
        tokens1 = word_tokenize(raw1)
        tokens2 = word_tokenize(raw2)
        tokens2 = filter(lambda a: a != "-", tokens2)
        text1 = nltk.Text(tokens1)
        text2 = nltk.Text(tokens2)
        regno = tokens1[4]
        name1 = ' '.join(tokens1[9:11])
        coname = '_'.join(tokens3[2:6])
        coname = coname.replace(":", "")
        print coname
        # hard-coded token offsets for the six course rows of the mark list
        code1 = tokens2[16]
        code2 = tokens2[33]
        code3 = tokens2[48]
        code4 = tokens2[63]
        code5 = tokens2[82]
        code6 = tokens2[100]
        subject1 = ' '.join(tokens2[17:22])
        subject2 = ' '.join(tokens2[34:37])
        subject3 = ' '.join(tokens2[49:52])
        subject4 = ' '.join(tokens2[64:71])
        subject5 = ' '.join(tokens2[83:89])
        subject6 = ' '.join(tokens2[101:104])
        credit1 = tokens2[22]
        credit2 = tokens2[37]
        credit3 = tokens2[52]
        credit4 = tokens2[71]
        credit5 = tokens2[89]
        credit6 = tokens2[104]
        credittot = tokens2[117]
        esa1 = tokens2[23]
        esa2 = tokens2[38]
        esa3 = tokens2[53]
        esa4 = tokens2[72]
        esa5 = tokens2[90]
        esa6 = tokens2[105]
        emax = 80
        isa1 = tokens2[25]
        isa2 = tokens2[40]
        isa3 = tokens2[55]
        isa4 = tokens2[74]
        isa5 = tokens2[92]
        isa6 = tokens2[107]
        imax = 20
        t1 = tokens2[27]
        t2 = tokens2[42]
        t3 = tokens2[57]
        t4 = tokens2[76]
        t5 = tokens2[94]
        t6 = tokens2[109]
        tmax = 100
        gtmax = 600
        grade1 = tokens2[29]
        grade2 = tokens2[44]
        grade3 = tokens2[59]
        grade4 = tokens2[78]
        grade5 = tokens2[96]
        grade6 = tokens2[111]
        tgrade = tokens2[123]
        gp1 = tokens2[30]
        gp2 = tokens2[45]
        gp3 = tokens2[60]
        gp4 = tokens2[79]
        gp5 = tokens2[97]
        gp6 = tokens2[112]
        cp1 = tokens2[31]
        cp2 = tokens2[46]
        cp3 = tokens2[61]
        cp4 = tokens2[80]
        cp5 = tokens2[98]
        cp6 = tokens2[113]
        tcp = tokens2[124]
        r1 = tokens2[32]
        r2 = tokens2[47]
        r3 = tokens2[62]
        r4 = tokens2[81]
        r5 = tokens2[99]
        r6 = tokens2[114]
        tr = tokens2[125]
        tot_marks = tokens2[121]
        scpa = tokens2[120]
        semester = 1
        dbupdater(coname, semester, regno, name1, code1, subject1, esa1, isa1, t1, grade1, gp1, cp1, r1, scpa, tot_marks, tgrade, tcp, tr)
        dbupdater(coname, semester, regno, name1, code2, subject2, esa2, isa2, t2, grade2, gp2, cp2, r2, scpa, tot_marks, tgrade, tcp, tr)
        dbupdater(coname, semester, regno, name1, code3, subject3, esa3, isa3, t3, grade3, gp3, cp3, r3, scpa, tot_marks, tgrade, tcp, tr)
        dbupdater(coname, semester, regno, name1, code4, subject4, esa4, isa4, t4, grade4, gp4, cp4, r4, scpa, tot_marks, tgrade, tcp, tr)
        dbupdater(coname, semester, regno, name1, code5, subject5, esa5, isa5, t5, grade5, gp5, cp5, r5, scpa, tot_marks, tgrade, tcp, tr)
        dbupdater(coname, semester, regno, name1, code6, subject6, esa6, isa6, t6, grade6, gp6, cp6, r6, scpa, tot_marks, tgrade, tcp, tr)
        print name1 + ':' + regno + '\r\n\r\n' + code1 + ' ' + subject1 + ' ' + credit1 + ' ' + esa1 + ' ' + str(emax) + ' ' + isa1 + ' ' + str(imax) + ' ' + t1 + ' ' + str(tmax) + ' ' + grade1 + ' ' + gp1 + ' ' + cp1 + ' ' + r1
        print '\r\n' + code2 + ' ' + subject2 + ' ' + credit2 + ' ' + esa2 + ' ' + str(emax) + ' ' + isa2 + ' ' + str(imax) + ' ' + t2 + ' ' + str(tmax) + ' ' + grade2 + ' ' + gp2 + ' ' + cp2 + ' ' + r2
        print '\r\n' + code3 + ' ' + subject3 + ' ' + credit3 + ' ' + esa3 + ' ' + str(emax) + ' ' + isa3 + ' ' + str(imax) + ' ' + t3 + ' ' + str(tmax) + ' ' + grade3 + ' ' + gp3 + ' ' + cp3 + ' ' + r3
        print '\r\n' + code4 + ' ' + subject4 + ' ' + credit4 + ' ' + esa4 + ' ' + str(emax) + ' ' + isa4 + ' ' + str(imax) + ' ' + t4 + ' ' + str(tmax) + ' ' + grade4 + ' ' + gp4 + ' ' + cp4 + ' ' + r4
        print '\r\n' + code5 + ' ' + subject5 + ' ' + credit5 + ' ' + esa5 + ' ' + str(emax) + ' ' + isa5 + ' ' + str(imax) + ' ' + t5 + ' ' + str(tmax) + ' ' + grade5 + ' ' + gp5 + ' ' + cp5 + ' ' + r5
        print '\r\n' + code6 + ' ' + subject6 + ' ' + credit6 + ' ' + esa6 + ' ' + str(emax) + ' ' + isa6 + ' ' + str(imax) + ' ' + t6 + ' ' + str(tmax) + ' ' + grade6 + ' ' + gp6 + ' ' + cp6 + ' ' + r6
        print '\r\n' + 'Total Credit : ' + credittot + '\r\nSCPA : ' + scpa + '\r\nTotal Marks : ' + tot_marks + '/600'
        print '\r\n' + 'Overall Grade : ' + tgrade + '\r\nTotal CP : ' + tcp + '\r\nOverall Status : ' + tr